From 185ce72d003def80022a48fe56ce65e547170cd2 Mon Sep 17 00:00:00 2001
From: Christopher Ferris
Date: Fri, 15 Mar 2013 16:01:17 -0700
Subject: [PATCH] Update to latest cortexa15 memcpy code.

This uses the new code originally submitted as memcpy.a15.S as the
base. However, the old code handled unaligned src/dst better, so that
was spliced in. I optimized the original unaligned code by removing a
few unnecessary instructions. I optimized the a15 code by rewriting
the pre and post code. I also modified the main loop to add a pld so
that larger copies would not stall waiting for memory.

Test cases for the new memcpy:

- Copy all sized values from 0 to 1024 bytes, using whatever alignment
  is returned by malloc.

For each alignment case described below, the test copied from 0 to 128
bytes.

- Src and dst pointers are both aligned to the same value, starting at
  one going through every power of two up to and including 128.
- Src aligned to double word boundary, dst aligned to word boundary.
- Src aligned to word boundary, dst aligned to double word boundary.
- Src aligned to 16 bit boundary, dst aligned to word boundary.
- Src aligned to word boundary, dst aligned to 16 byte boundary.
- Src aligned to word boundary, dst aligned to 1 byte from a word boundary.
- Src aligned to word boundary, dst aligned to 2 bytes from a word boundary.
- Src aligned to word boundary, dst aligned to 3 bytes from a word boundary.
- Src aligned to 1 byte from a word boundary, dst aligned to a word boundary.
- Src aligned to 2 bytes from a word boundary, dst aligned to a word boundary.
- Src aligned to 3 bytes from a word boundary, dst aligned to a word boundary.

Cases to verify the unaligned source code properly aligns to a 16 bit
boundary:

- Src aligned to 1 byte from a 128 bit boundary, dst aligned to 4 + 128 bit boundary.
- Src aligned to 1 byte from a 128 bit boundary, dst aligned to 8 + 128 bit boundary.
- Src aligned to 1 byte from a 128 bit boundary, dst aligned to 12 + 128 bit boundary.
- Src aligned to 1 byte from a 128 bit boundary, dst aligned to 16 + 128 bit boundary.

In all cases, a two byte fencepost was placed at the end of the
destination to verify that only the requested number of bytes were
copied.

Bug: 8005082

Merge from internal master.

(cherry-picked from commit 21ede92d794969f22cacbdb9f557818f1c5712b5)

Change-Id: Ief70c9e6dc8c6473ae245b6570b2c266fed9618c
---
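Note (not part of the original patch): the test harness described in the
commit message is not included here. The sketch below only illustrates the
kind of fencepost check it describes; the function names, the 512-byte
allocation slack, and the 0xde/0xad sentinel values are assumptions made
for the example, not the actual AOSP test code.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Copy 'len' bytes with the requested misalignments (byte offsets from a
 * 128-byte boundary) and verify that the two fencepost bytes placed just
 * past the destination are untouched. */
static void check_memcpy(size_t len, size_t dst_offset, size_t src_offset) {
  uint8_t *dst_buf = malloc(len + 512);
  uint8_t *src_buf = malloc(len + 512);
  assert(dst_buf != NULL && src_buf != NULL);

  /* Round each base pointer up to a 128-byte boundary, then add the
   * requested misalignment (e.g. dst_offset = 5 puts dst one byte past a
   * word boundary). */
  uint8_t *dst = (uint8_t *)(((uintptr_t)dst_buf + 127) & ~(uintptr_t)127) + dst_offset;
  uint8_t *src = (uint8_t *)(((uintptr_t)src_buf + 127) & ~(uintptr_t)127) + src_offset;

  for (size_t i = 0; i < len; i++) src[i] = (uint8_t)i;
  dst[len] = 0xde;      /* two byte fencepost just past the copy */
  dst[len + 1] = 0xad;

  memcpy(dst, src, len);

  assert(memcmp(dst, src, len) == 0);
  assert(dst[len] == 0xde && dst[len + 1] == 0xad);

  free(dst_buf);
  free(src_buf);
}

int main(void) {
  /* Example sweep: 0..128 bytes, src on a word boundary, dst one byte past
   * a word boundary, matching one of the cases listed above. */
  for (size_t n = 0; n <= 128; n++) check_memcpy(n, 5, 4);
  return 0;
}
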
 libc/arch-arm/cortex-a15/bionic/memcpy.S | 311 ++++++++++++++++++----
 1 file changed, 245 insertions(+), 66 deletions(-)

diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy.S b/libc/arch-arm/cortex-a15/bionic/memcpy.S
index 16187b562..9985e7f42 100644
--- a/libc/arch-arm/cortex-a15/bionic/memcpy.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy.S
@@ -24,81 +24,110 @@
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-/* Assumes neon instructions and a cache line size of 64 bytes. */
+        /* Prototype: void *memcpy (void *dst, const void *src, size_t count). */
+
+        // This version is tuned for the Cortex-A15 processor.
 
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
 
-/*
- * This code assumes it is running on a processor that supports all arm v7
- * instructions, that supports neon instructions, and that has a 64 byte
- * cache line.
- */
-
         .text
+        .syntax unified
         .fpu    neon
 
-#define CACHE_LINE_SIZE 64
+#define CACHE_LINE_SIZE 64
 
 ENTRY(memcpy)
-        .save   {r0, lr}
-        /* start preloading as early as possible */
-        pld     [r1, #(CACHE_LINE_SIZE*0)]
-        stmfd   sp!, {r0, lr}
-        pld     [r1, #(CACHE_LINE_SIZE*1)]
+        // Assumes that n >= 0, and dst, src are valid pointers.
+        // For any sizes less than 832 use the neon code that doesn't
+        // care about the src alignment. This avoids any checks
+        // for src alignment, and offers the best improvement since
+        // smaller sized copies are dominated by the overhead of
+        // the pre and post main loop.
+        // For larger copies, if src and dst cannot both be aligned to
+        // word boundaries, use the neon code.
+        // For all other copies, align dst to a double word boundary
+        // and copy using LDRD/STRD instructions.
 
-        /* do we have at least 16-bytes to copy (needed for alignment below) */
-        cmp     r2, #16
-        blo     5f
+        // Save registers (r0 holds the return value):
+        // optimized push {r0, lr}.
+        .save   {r0, lr}
+        pld     [r1, #(CACHE_LINE_SIZE*16)]
+        push    {r0, lr}
 
-        /* align destination to cache-line for the write-buffer */
+        cmp     r2, #16
+        blo     copy_less_than_16_unknown_align
+
+        cmp     r2, #832
+        bge     check_alignment
+
+copy_unknown_alignment:
+        // Unknown alignment of src and dst.
+        // Assumes that the first few bytes have already been prefetched.
+
+        // Align destination to 128 bits. The mainloop store instructions
+        // require this alignment or they will throw an exception.
         rsb     r3, r0, #0
         ands    r3, r3, #0xF
-        beq     0f
+        beq     2f
 
-        /* copy up to 15-bytes (count in r3) */
+        // Copy up to 15 bytes (count in r3).
         sub     r2, r2, r3
         movs    ip, r3, lsl #31
-        ldrmib  lr, [r1], #1
-        strmib  lr, [r0], #1
-        ldrcsb  ip, [r1], #1
-        ldrcsb  lr, [r1], #1
-        strcsb  ip, [r0], #1
-        strcsb  lr, [r0], #1
+
+        itt     mi
+        ldrbmi  lr, [r1], #1
+        strbmi  lr, [r0], #1
+        itttt   cs
+        ldrbcs  ip, [r1], #1
+        ldrbcs  lr, [r1], #1
+        strbcs  ip, [r0], #1
+        strbcs  lr, [r0], #1
+
         movs    ip, r3, lsl #29
         bge     1f
-        // copies 4 bytes, destination 32-bits aligned
+        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
         vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
         vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
 1:      bcc     2f
-        // copies 8 bytes, destination 64-bits aligned
+        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
         vld1.8  {d0}, [r1]!
         vst1.8  {d0}, [r0, :64]!
-2:
-0:      /* preload immediately the next cache line, which we may need */
-        pld     [r1, #(CACHE_LINE_SIZE*0)]
-        pld     [r1, #(CACHE_LINE_SIZE*1)]
-
-        /* make sure we have at least 64 bytes to copy */
+2:      // Make sure we have at least 64 bytes to copy.
         subs    r2, r2, #64
         blo     2f
 
-        /* Preload all the cache lines we need.
-         * NOTE: The number of pld below depends on CACHE_LINE_SIZE,
-         * ideally we would increase the distance in the main loop to
-         * avoid the goofy code below. In practice this doesn't seem to make
-         * a big difference.
-         * NOTE: The value CACHE_LINE_SIZE * 4 was chosen through
-         * experimentation.
-         */
-        pld     [r1, #(CACHE_LINE_SIZE*2)]
-        pld     [r1, #(CACHE_LINE_SIZE*3)]
-        pld     [r1, #(CACHE_LINE_SIZE*4)]
-
-1:      /* The main loop copies 64 bytes at a time */
+1:      // The main loop copies 64 bytes at a time.
         vld1.8  {d0 - d3}, [r1]!
         vld1.8  {d4 - d7}, [r1]!
         pld     [r1, #(CACHE_LINE_SIZE*4)]
@@ -107,25 +136,24 @@ ENTRY(memcpy)
         vst1.8  {d4 - d7}, [r0, :128]!
         bhs     1b
 
-2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
-        add     r2, r2, #64
-        subs    r2, r2, #32
-        blo     4f
+2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
+        adds    r2, r2, #32
+        blo     3f
 
-3:      /* 32 bytes at a time. These cache lines were already preloaded */
+        // 32 bytes. These cache lines were already preloaded.
         vld1.8  {d0 - d3}, [r1]!
-        subs    r2, r2, #32
+        sub     r2, r2, #32
         vst1.8  {d0 - d3}, [r0, :128]!
-        bhs     3b
 
-4:      /* less than 32 left */
+3:      // Less than 32 left.
         add     r2, r2, #32
         tst     r2, #0x10
-        beq     5f
-        // copies 16 bytes, 128-bits aligned
+        beq     copy_less_than_16_unknown_align
+        // Copies 16 bytes, destination 128 bits aligned.
         vld1.8  {d0, d1}, [r1]!
         vst1.8  {d0, d1}, [r0, :128]!
 
-5:      /* copy up to 15-bytes (count in r2) */
+copy_less_than_16_unknown_align:
+        // Copy up to 15 bytes (count in r2).
         movs    ip, r2, lsl #29
         bcc     1f
         vld1.8  {d0}, [r1]!
@@ -133,14 +161,165 @@
 1:      bge     2f
         vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
         vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-2:      movs    ip, r2, lsl #31
-        ldrmib  r3, [r1], #1
-        ldrcsb  ip, [r1], #1
-        ldrcsb  lr, [r1], #1
-        strmib  r3, [r0], #1
-        strcsb  ip, [r0], #1
-        strcsb  lr, [r0], #1
-        ldmfd   sp!, {r0, lr}
-        bx      lr
+2:      // Copy 0 to 4 bytes.
+        lsls    r2, r2, #31
+        itt     ne
+        ldrbne  lr, [r1], #1
+        strbne  lr, [r0], #1
+        itttt   cs
+        ldrbcs  ip, [r1], #1
+        ldrbcs  lr, [r1]
+        strbcs  ip, [r0], #1
+        strbcs  lr, [r0]
+
+        pop     {r0, pc}
+
+check_alignment:
+        // If src and dst cannot both be aligned to a word boundary,
+        // use the unaligned copy version.
+        eor     r3, r0, r1
+        ands    r3, r3, #0x3
+        bne     copy_unknown_alignment
+
+        // To try and improve performance, stack layout changed,
+        // i.e., not keeping the stack looking like users expect
+        // (highest numbered register at highest address).
+        // TODO: Add debug frame directives.
+        // We don't need exception unwind directives, because the code below
+        // does not throw any exceptions and does not call any other functions.
+        // Generally, newlib functions like this lack debug information for
+        // assembler source.
+        .save   {r4, r5}
+        strd    r4, r5, [sp, #-8]!
+        .save   {r6, r7}
+        strd    r6, r7, [sp, #-8]!
+        .save   {r8, r9}
+        strd    r8, r9, [sp, #-8]!
+
+        // Optimized for already aligned dst code.
+        ands    ip, r0, #3
+        bne     dst_not_word_aligned
+
+word_aligned:
+        // Align the destination buffer to 8 bytes, to make sure double
+        // loads and stores don't cross a cache line boundary,
+        // as they are then more expensive even if the data is in the cache
+        // (require two load/store issue cycles instead of one).
+        // If only one of the buffers is not 8 bytes aligned,
+        // then it's more important to align dst than src,
+        // because there is more penalty for stores
+        // than loads that cross a cacheline boundary.
+        // This check and realignment are only done if there is >= 832
+        // bytes to copy.
+
+        // Dst is word aligned, but check if it is already double word aligned.
+        ands    r3, r0, #4
+        beq     1f
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+        sub     r2, #4
+
+1:      // Can only get here if > 64 bytes to copy, so don't do check r2.
+        sub     r2, #64
+
+2:      // Every loop iteration copies 64 bytes.
+        .irp    offset, #0, #8, #16, #24, #32
+        ldrd    r4, r5, [r1, \offset]
+        strd    r4, r5, [r0, \offset]
+        .endr
+
+        ldrd    r4, r5, [r1, #40]
+        ldrd    r6, r7, [r1, #48]
+        ldrd    r8, r9, [r1, #56]
+
+        // Keep the pld as far from the next load as possible.
+        // The amount to prefetch was determined experimentally using
+        // large sizes, and verifying the prefetch size does not affect
+        // the smaller copies too much.
+        // WARNING: If the ldrd and strd instructions get too far away
+        //          from each other, performance suffers. Three loads
+        //          in a row is the best tradeoff.
+        pld     [r1, #(CACHE_LINE_SIZE*16)]
+        strd    r4, r5, [r0, #40]
+        strd    r6, r7, [r0, #48]
+        strd    r8, r9, [r0, #56]
+
+        add     r0, r0, #64
+        add     r1, r1, #64
+        subs    r2, r2, #64
+        bge     2b
+
+        // Fix-up the remaining count and make sure we have >= 32 bytes left.
+        adds    r2, r2, #32
+        blo     4f
+
+        // Copy 32 bytes. These cache lines were already preloaded.
+        .irp    offset, #0, #8, #16, #24
+        ldrd    r4, r5, [r1, \offset]
+        strd    r4, r5, [r0, \offset]
+        .endr
+        add     r1, r1, #32
+        add     r0, r0, #32
+        sub     r2, r2, #32
+4:      // Less than 32 left.
+        add     r2, r2, #32
+        tst     r2, #0x10
+        beq     5f
+        // Copy 16 bytes.
+        .irp    offset, #0, #8
+        ldrd    r4, r5, [r1, \offset]
+        strd    r4, r5, [r0, \offset]
+        .endr
+        add     r1, r1, #16
+        add     r0, r0, #16
+
+5:      // Copy up to 15 bytes (count in r2).
+        movs    ip, r2, lsl #29
+        bcc     1f
+        // Copy 8 bytes.
+        ldrd    r4, r5, [r1], #8
+        strd    r4, r5, [r0], #8
+1:      bge     2f
+        // Copy 4 bytes.
+        ldr     r4, [r1], #4
+        str     r4, [r0], #4
+2:      // Copy 0 to 4 bytes.
+        lsls    r2, r2, #31
+        itt     ne
+        ldrbne  lr, [r1], #1
+        strbne  lr, [r0], #1
+        itttt   cs
+        ldrbcs  ip, [r1], #1
+        ldrbcs  lr, [r1]
+        strbcs  ip, [r0], #1
+        strbcs  lr, [r0]
+
+        // Restore registers: optimized pop {r0, pc}
+        ldrd    r8, r9, [sp], #8
+        ldrd    r6, r7, [sp], #8
+        ldrd    r4, r5, [sp], #8
+        pop     {r0, pc}
+
+dst_not_word_aligned:
+        // Align dst to word.
+        rsb     ip, ip, #4
+        cmp     ip, #2
+
+        itt     gt
+        ldrbgt  lr, [r1], #1
+        strbgt  lr, [r0], #1
+
+        itt     ge
+        ldrbge  lr, [r1], #1
+        strbge  lr, [r0], #1
+
+        ldrb    lr, [r1], #1
+        strb    lr, [r0], #1
+
+        sub     r2, r2, ip
+
+        // If src is not word aligned, jump to the unaligned code.
+        ands    ip, r1, #0x3
+        beq     word_aligned
 END(memcpy)
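
Note (not part of the original patch): for orientation, the following is a
rough C rendering of the dispatch logic the assembly above implements,
following the comments at ENTRY(memcpy) and check_alignment. The helper
names neon_copy_unknown_alignment and aligned_ldrd_strd_copy are invented
for this sketch; the real code uses NEON and LDRD/STRD code paths inside a
single routine rather than separate functions.

#include <stddef.h>
#include <stdint.h>

/* Stand-ins for the two assembly paths; in the real file these are the
 * NEON path (copy_unknown_alignment) and the LDRD/STRD path (word_aligned),
 * not plain byte loops. */
static void *neon_copy_unknown_alignment(void *dst, const void *src, size_t n) {
  uint8_t *d = dst;
  const uint8_t *s = src;
  while (n--) *d++ = *s++;
  return dst;
}

static void *aligned_ldrd_strd_copy(void *dst, const void *src, size_t n) {
  uint8_t *d = dst;
  const uint8_t *s = src;
  while (n--) *d++ = *s++;
  return dst;
}

void *memcpy_a15_sketch(void *dst, const void *src, size_t n) {
  /* Sizes below 832 bytes: take the NEON path that ignores src alignment;
   * the pre/post loop overhead dominates small copies. */
  if (n < 832) return neon_copy_unknown_alignment(dst, src, n);

  /* If src and dst cannot both be word aligned (their low two bits
   * differ), stay on the NEON path. */
  if ((((uintptr_t)dst ^ (uintptr_t)src) & 0x3) != 0)
    return neon_copy_unknown_alignment(dst, src, n);

  /* Otherwise align dst to a double word boundary and copy 64 bytes per
   * iteration with LDRD/STRD pairs, prefetching well ahead. */
  return aligned_ldrd_strd_copy(dst, src, n);
}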