Merge "Rewrite memset for cortexa15 to use strd."

Christopher Ferris authored 2013-04-09 22:58:25 +00:00, committed by Android (Google) Code Review
commit e27483c788

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 The Android Open Source Project
+ * Copyright (C) 2013 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,11 +35,12 @@
  * memset() returns its first argument.
  */
 
         .fpu        neon
+        .syntax     unified
 
 ENTRY(bzero)
         mov         r2, r1
         mov         r1, #0
         // Fall through to memset...
 END(bzero)
 
@@ -47,60 +48,117 @@ ENTRY(memset)
         .save       {r0}
         stmfd       sp!, {r0}
 
-        vdup.8      q0, r1
-
-        /* do we have at least 16-bytes to write (needed for alignment below) */
+        // The new algorithm is slower for copies < 16 so use the old
+        // neon code in that case.
         cmp         r2, #16
-        blo         3f
-
-        /* align destination to 16 bytes for the write-buffer */
-        rsb         r3, r0, #0
-        ands        r3, r3, #0xF
-        beq         2f
-
-        /* write up to 15-bytes (count in r3) */
-        sub         r2, r2, r3
-        movs        ip, r3, lsl #31
-        strmib      r1, [r0], #1
-        strcsb      r1, [r0], #1
-        strcsb      r1, [r0], #1
-        movs        ip, r3, lsl #29
-        bge         1f
-
-        // writes 4 bytes, 32-bits aligned
-        vst1.32     {d0[0]}, [r0, :32]!
-1:      bcc         2f
-
-        // writes 8 bytes, 64-bits aligned
-        vst1.8      {d0}, [r0, :64]!
-2:
-        /* make sure we have at least 32 bytes to write */
-        subs        r2, r2, #32
-        blo         2f
-        vmov        q1, q0
-
-1:      /* The main loop writes 32 bytes at a time */
-        subs        r2, r2, #32
-        vst1.8      {d0 - d3}, [r0, :128]!
-        bhs         1b
-
-2:      /* less than 32 left */
-        add         r2, r2, #32
-        tst         r2, #0x10
-        beq         3f
-
-        // writes 16 bytes, 128-bits aligned
-        vst1.8      {d0, d1}, [r0, :128]!
-
-3:      /* write up to 15-bytes (count in r2) */
+        blo         set_less_than_16_unknown_align
+
+        // Use strd which requires an even and odd register so move the
+        // values so that:
+        //   r0 and r1 contain the memset value
+        //   r2 is the number of bytes to set
+        //   r3 is the destination pointer
+        mov         r3, r0
+
+        // Copy the byte value in every byte of r1.
+        mov         r1, r1, lsl #24
+        orr         r1, r1, r1, lsr #8
+        orr         r1, r1, r1, lsr #16
+
+check_alignment:
+        // Align destination to a double word to avoid the strd crossing
+        // a cache line boundary.
+        ands        ip, r3, #7
+        bne         do_double_word_align
+
+double_word_aligned:
+        mov         r0, r1
+
+        subs        r2, #64
+        blo         set_less_than_64
+
+1:      // Main loop sets 64 bytes at a time.
+        .irp        offset, #0, #8, #16, #24, #32, #40, #48, #56
+        strd        r0, r1, [r3, \offset]
+        .endr
+
+        add         r3, #64
+        subs        r2, #64
+        bge         1b
+
+set_less_than_64:
+        // Restore r2 to the count of bytes left to set.
+        add         r2, #64
+        lsls        ip, r2, #27
+        bcc         set_less_than_32
+
+        // Set 32 bytes.
+        .irp        offset, #0, #8, #16, #24
+        strd        r0, r1, [r3, \offset]
+        .endr
+        add         r3, #32
+
+set_less_than_32:
+        bpl         set_less_than_16
+
+        // Set 16 bytes.
+        .irp        offset, #0, #8
+        strd        r0, r1, [r3, \offset]
+        .endr
+        add         r3, #16
+
+set_less_than_16:
+        // Less than 16 bytes to set.
+        lsls        ip, r2, #29
+        bcc         set_less_than_8
+
+        // Set 8 bytes.
+        strd        r0, r1, [r3], #8
+
+set_less_than_8:
+        bpl         set_less_than_4
+
+        // Set 4 bytes
+        str         r1, [r3], #4
+
+set_less_than_4:
+        lsls        ip, r2, #31
+        it          ne
+        strbne      r1, [r3], #1
+        itt         cs
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3]
+
+        ldmfd       sp!, {r0}
+        bx          lr
+
+do_double_word_align:
+        rsb         ip, ip, #8
+        sub         r2, r2, ip
+        movs        r0, ip, lsl #31
+        it          mi
+        strbmi      r1, [r3], #1
+        itt         cs
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3], #1
+
+        // Dst is at least word aligned by this point.
+        cmp         ip, #4
+        blo         double_word_aligned
+        str         r1, [r3], #4
+        b           double_word_aligned
+
+set_less_than_16_unknown_align:
+        // Set up to 15 bytes.
+        vdup.8      d0, r1
         movs        ip, r2, lsl #29
         bcc         1f
         vst1.8      {d0}, [r0]!
 1:      bge         2f
         vst1.32     {d0[0]}, [r0]!
 2:      movs        ip, r2, lsl #31
-        strmib      r1, [r0], #1
-        strcsb      r1, [r0], #1
-        strcsb      r1, [r0], #1
+        it          mi
+        strbmi      r1, [r0], #1
+        itt         cs
+        strbcs      r1, [r0], #1
+        strbcs      r1, [r0], #1
         ldmfd       sp!, {r0}
         bx          lr
 END(memset)
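
As a reading aid, the shape of the new routine can be sketched in C: splat the byte into a 32-bit word (the lsl #24 / orr ... lsr steps above), byte-store until the destination is 8-byte aligned so the paired stores never straddle a cache line, set 64 bytes per loop iteration, then finish with 32/16/8/4-byte tails and up to three trailing bytes. The sketch below is only illustrative and is not the commit's code: the name memset_sketch is made up, and plain 32-bit stores stand in for the strd register pairs.

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative sketch of the strd-based memset above. */
    static void *memset_sketch(void *dst, int c, size_t n) {
        unsigned char *p = (unsigned char *)dst;
        unsigned char byte = (unsigned char)c;

        /* Copy the byte value into every byte of a word, the same effect
         * as the assembly's lsl #24 followed by two orr ... lsr steps. */
        uint32_t v = (uint32_t)byte * 0x01010101u;

        /* Byte stores until the destination is 8-byte aligned. */
        while (n > 0 && ((uintptr_t)p & 7u) != 0) {
            *p++ = byte;
            n--;
        }

        /* Main loop: 64 bytes per iteration (the unrolled strd loop). */
        while (n >= 64) {
            uint32_t *w = (uint32_t *)p;
            for (int i = 0; i < 16; i++)  /* 16 word stores == 64 bytes */
                w[i] = v;
            p += 64;
            n -= 64;
        }

        /* Tails of 32, 16, 8 and 4 bytes, mirroring set_less_than_*. */
        for (size_t chunk = 32; chunk >= 4; chunk >>= 1) {
            if (n >= chunk) {
                uint32_t *w = (uint32_t *)p;
                for (size_t i = 0; i < chunk / 4; i++)
                    w[i] = v;
                p += chunk;
                n -= chunk;
            }
        }

        /* At most 3 bytes remain. */
        while (n--)
            *p++ = byte;

        /* memset() returns its first argument. */
        return dst;
    }

Where the C version spells the tail decisions out as size checks, the assembly uses lsls to shift the low bits of the remaining count into the carry and sign flags, so each set_less_than_* block is entered or skipped on the flags alone without extra compare instructions.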