Merge "Add optimized version of memcpy for Cortex A9"

Elliott Hughes, 2012-11-08 18:04:27 -08:00, committed by Gerrit Code Review
commit 49677deca2

@@ -37,28 +37,35 @@
#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE 32
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*2)
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE 64
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
#endif
ENTRY(memcpy)
.save {r0, lr}
/* start preloading as early as possible */
pld [r1, #(CACHE_LINE_SIZE*0)]
pld [r1, #(CACHE_LINE_SIZE * 0)]
stmfd sp!, {r0, lr}
pld [r1, #(CACHE_LINE_SIZE*1)]
pld [r1, #(CACHE_LINE_SIZE * 1)]
/* If Neon supports unaligned access then remove the align code,
* unless a size limit has been specified.
*/
#ifndef NEON_UNALIGNED_ACCESS
/* do we have at least 16-bytes to copy (needed for alignment below) */
cmp r2, #16
blo 5f
/* check if buffers are aligned. If so, run arm-only version */
eor r3, r0, r1
ands r3, r3, #0x3
beq 11f
/* align destination to cache-line for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 0f
beq 2f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
@@ -79,10 +86,9 @@ ENTRY(memcpy)
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2:
0: /* preload immediately the next cache line, which we may need */
pld [r1, #(CACHE_LINE_SIZE*0)]
pld [r1, #(CACHE_LINE_SIZE*1)]
/* preload immediately the next cache line, which we may need */
pld [r1, #(CACHE_LINE_SIZE * 0)]
pld [r1, #(CACHE_LINE_SIZE * 1)]
#ifdef HAVE_32_BYTE_CACHE_LINE
/* make sure we have at least 32 bytes to copy */
@@ -108,23 +114,22 @@ ENTRY(memcpy)
subs r2, r2, #64
blo 2f
/* preload all the cache lines we need.
* NOTE: the number of pld below depends on PREFETCH_DISTANCE,
* ideally we would increase the distance in the main loop to
* avoid the goofy code below. In practice this doesn't seem to make
* a big difference.
*/
pld [r1, #(CACHE_LINE_SIZE*2)]
pld [r1, #(CACHE_LINE_SIZE*3)]
pld [r1, #(PREFETCH_DISTANCE)]
/* preload all the cache lines we need. */
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
pld [r1, #(PREFETCH_DISTANCE)]
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
#else
pld [r1, #(CACHE_LINE_SIZE * 3)]
#endif
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d4 - d7}, [r0, :128]!
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d4 - d7}, [r0, :128]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
@@ -133,9 +138,9 @@ ENTRY(memcpy)
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d0 - d3}, [r1]!
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d0 - d3}, [r0, :128]!
bhs 3b
#endif
4: /* less than 32 left */
@@ -145,7 +150,6 @@ ENTRY(memcpy)
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0, :128]!
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
@@ -164,6 +168,164 @@ ENTRY(memcpy)
ldmfd sp!, {r0, lr}
bx lr
#else /* NEON_UNALIGNED_ACCESS */
// Check that the copy size is at least 16 bytes, needed for the alignment code.
cmp r2, #16
blo 5f
#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
/* Check the upper size limit for Neon unaligned memory access in memcpy */
#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
cmp r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
blo 3f
#endif
/* check if buffers are aligned. If so, run arm-only version */
eor r3, r0, r1
ands r3, r3, #0x3
beq 11f
/* align destination to 16 bytes for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 3f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
movs ip, r3, lsl #31
ldrmib lr, [r1], #1
strmib lr, [r0], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// copies 4 bytes, destination 32-bits aligned
vld1.32 {d0[0]}, [r1]!
vst1.32 {d0[0]}, [r0, :32]!
1: bcc 2f
// copies 8 bytes, destination 64-bits aligned
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2:
/* preload immediately the next cache line, which we may need */
pld [r1, #(CACHE_LINE_SIZE * 0)]
pld [r1, #(CACHE_LINE_SIZE * 1)]
3:
#endif
/* make sure we have at least 64 bytes to copy */
subs r2, r2, #64
blo 2f
/* preload all the cache lines we need */
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
pld [r1, #(CACHE_LINE_SIZE * 2)]
pld [r1, #(CACHE_LINE_SIZE * 3)]
#else
pld [r1, #(CACHE_LINE_SIZE * 3)]
#endif
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0]!
vst1.8 {d4 - d7}, [r0]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
add r2, r2, #64
subs r2, r2, #32
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0]!
bhs 3b
4: /* less than 32 left */
add r2, r2, #32
tst r2, #0x10
beq 5f
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0]!
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
vld1.32 {d0[0]}, [r1]!
vst1.32 {d0[0]}, [r0]!
2: movs ip, r2, lsl #31
ldrmib r3, [r1], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strmib r3, [r0], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
ldmfd sp!, {r0, lr}
bx lr
#endif /* NEON_UNALIGNED_ACCESS */
11:
/* Simple arm-only copy loop to handle aligned copy operations */
stmfd sp!, {r4, r5, r6, r7, r8}
pld [r1, #(CACHE_LINE_SIZE * 2)]
/* Check alignment */
rsb r3, r1, #0
ands r3, #3
beq 2f
/* align source to 32 bits. We need to insert 2 instructions between
* a ldr[b|h] and str[b|h] because byte and half-word instructions
* stall 2 cycles.
*/
movs r12, r3, lsl #31
sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
ldrmib r3, [r1], #1
ldrcsb r4, [r1], #1
ldrcsb r5, [r1], #1
strmib r3, [r0], #1
strcsb r4, [r0], #1
strcsb r5, [r0], #1
2:
subs r2, #32
blt 5f
pld [r1, #(CACHE_LINE_SIZE * 3)]
3: /* Main copy loop, copying 32 bytes at a time */
pld [r1, #(CACHE_LINE_SIZE * 4)]
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
subs r2, r2, #32
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
bge 3b
5: /* Handle any remaining bytes */
adds r2, #32
beq 6f
movs r12, r2, lsl #28
ldmcsia r1!, {r3, r4, r5, r6} /* 16 bytes */
ldmmiia r1!, {r7, r8} /* 8 bytes */
stmcsia r0!, {r3, r4, r5, r6}
stmmiia r0!, {r7, r8}
movs r12, r2, lsl #30
ldrcs r3, [r1], #4 /* 4 bytes */
ldrmih r4, [r1], #2 /* 2 bytes */
strcs r3, [r0], #4
strmih r4, [r0], #2
tst r2, #0x1
ldrneb r3, [r1] /* last byte */
strneb r3, [r0]
6:
ldmfd sp!, {r4, r5, r6, r7, r8}
ldmfd sp!, {r0, pc}
END(memcpy)
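
For readers who don't want to trace the assembly, the NEON path added here follows a staged strategy: align the destination to 16 bytes, stream 64-byte blocks while issuing pld prefetches a few cache lines ahead of the loads, then drain 32-byte, 16-byte, and sub-16-byte tails. The C sketch below only illustrates that strategy and is not code from the commit; memcpy_sketch is an invented name, and __builtin_prefetch merely stands in for the pld instructions.

#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE_SIZE 64   /* the non-HAVE_32_BYTE_CACHE_LINE case above */

/* Hypothetical sketch of the staged copy; not part of bionic or this change. */
static void *memcpy_sketch(void *dst, const void *src, size_t n) {
    uint8_t *d = (uint8_t *)dst;
    const uint8_t *s = (const uint8_t *)src;

    if (n >= 16) {
        /* Prologue: copy up to 15 bytes so the destination is 16-byte aligned
         * for the wide stores (the "copy up to 15-bytes" code above). */
        while (((uintptr_t)d & 0xF) != 0) { *d++ = *s++; n--; }

        /* Main loop: 64 bytes per iteration, prefetching a few cache lines
         * ahead of the current source position. */
        while (n >= 64) {
            __builtin_prefetch(s + CACHE_LINE_SIZE * 3);
            for (int i = 0; i < 64; i++) d[i] = s[i];
            d += 64; s += 64; n -= 64;
        }

        /* Drain 32-byte and 16-byte blocks from lines already prefetched. */
        while (n >= 32) {
            for (int i = 0; i < 32; i++) d[i] = s[i];
            d += 32; s += 32; n -= 32;
        }
        if (n >= 16) {
            for (int i = 0; i < 16; i++) d[i] = s[i];
            d += 16; s += 16; n -= 16;
        }
    }

    /* Epilogue: up to 15 remaining bytes (the 8/4/2/1 conditional copies). */
    while (n--) { *d++ = *s++; }
    return dst;
}

When source and destination share the same word alignment (the eor/ands #0x3 check added by this change), the assembly skips the NEON path entirely and branches to the ARM-only ldmia/stmia loop at label 11, which moves 32 bytes per iteration through core registers instead.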