From 6c80ccdeed9d9b30e961f68229fe8171d79c5d14 Mon Sep 17 00:00:00 2001
From: Shu Zhang
Date: Mon, 12 May 2014 18:12:15 +0800
Subject: [PATCH] denver: optimize memmove

Optimize the 32-bit Denver memmove with a reversed memcpy: overlapping
copies whose destination starts inside the source region are copied
backwards, so every source byte is read before it is overwritten.

Change-Id: Iaad0a9475248cdd7e4f50d58bea9db1b767abc88
---
 libc/arch-arm/arm.mk                   |   1 -
 libc/arch-arm/cortex-a15/cortex-a15.mk |   1 +
 libc/arch-arm/cortex-a9/cortex-a9.mk   |   1 +
 libc/arch-arm/denver/bionic/memmove.S  | 281 +++++++++++++++++++++++++
 libc/arch-arm/denver/denver.mk         |   5 +-
 libc/arch-arm/generic/generic.mk       |   1 +
 libc/arch-arm/krait/krait.mk           |   3 +-
 tests/string_test.cpp                  |  66 ++++++
 8 files changed, 355 insertions(+), 4 deletions(-)
 create mode 100644 libc/arch-arm/denver/bionic/memmove.S
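Note on the approach: the entry dispatch of the new assembly reduces to
the C logic below. This is a minimal illustrative sketch of the same
decision, not the shipped code (memmove_sketch is a hypothetical name);
everything past this dispatch in memmove.S is about making the reversed
loop fast: cache-line alignment of the destination, 128-byte NEON
blocks, and size-tiered prefetching.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void* memmove_sketch(void* dst, const void* src, size_t n) {
      uintptr_t d = (uintptr_t) dst;
      uintptr_t s = (uintptr_t) src;

      // cmp r2, #0 / cmpne r0, r1 / bxeq lr: nothing to move.
      if (n == 0 || d == s) return dst;

      // subs r3, r0, r1 / bls and cmp r2, r3 / bhi: unless dst starts
      // strictly inside [src, src + n), a forward copy never clobbers a
      // byte it still has to read, so the assembly tail-calls the
      // optimized memcpy. (ISO C forbids memcpy on overlapping blocks,
      // so this sketch spells the forward loop out.)
      if (d <= s || d - s >= n) {
        unsigned char* dp = (unsigned char*) dst;
        const unsigned char* sp = (const unsigned char*) src;
        while (n--) *dp++ = *sp++;
        return dst;
      }

      // dst above src with overlap: byte-level stand-in for the NEON
      // reversed copy at .L_reversed_memcpy.
      unsigned char* dp = (unsigned char*) dst + n;
      const unsigned char* sp = (const unsigned char*) src + n;
      while (n--) *--dp = *--sp;
      return dst;
    }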
diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk
index 06b167575..3821854ad 100644
--- a/libc/arch-arm/arm.mk
+++ b/libc/arch-arm/arm.mk
@@ -13,7 +13,6 @@ libc_bionic_src_files_arm := \
 libc_common_src_files_arm += \
     bionic/index.cpp \
     bionic/memchr.c \
-    bionic/memmove.c.arm \
     bionic/memrchr.c \
     bionic/strchr.cpp \
     bionic/strnlen.c \
diff --git a/libc/arch-arm/cortex-a15/cortex-a15.mk b/libc/arch-arm/cortex-a15/cortex-a15.mk
index d0896afaf..552811ebc 100644
--- a/libc/arch-arm/cortex-a15/cortex-a15.mk
+++ b/libc/arch-arm/cortex-a15/cortex-a15.mk
@@ -7,3 +7,4 @@ libc_bionic_src_files_arm += \
     arch-arm/cortex-a15/bionic/strlen.S \
     arch-arm/cortex-a15/bionic/__strcat_chk.S \
     arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+    bionic/memmove.c \
diff --git a/libc/arch-arm/cortex-a9/cortex-a9.mk b/libc/arch-arm/cortex-a9/cortex-a9.mk
index e15602b78..9b99387b1 100644
--- a/libc/arch-arm/cortex-a9/cortex-a9.mk
+++ b/libc/arch-arm/cortex-a9/cortex-a9.mk
@@ -7,3 +7,4 @@ libc_bionic_src_files_arm += \
     arch-arm/cortex-a9/bionic/strlen.S \
     arch-arm/cortex-a9/bionic/__strcat_chk.S \
     arch-arm/cortex-a9/bionic/__strcpy_chk.S \
+    bionic/memmove.c \
diff --git a/libc/arch-arm/denver/bionic/memmove.S b/libc/arch-arm/denver/bionic/memmove.S
new file mode 100644
index 000000000..132190bb8
--- /dev/null
+++ b/libc/arch-arm/denver/bionic/memmove.S
@@ -0,0 +1,281 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ * Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+        .text
+        .syntax unified
+        .fpu    neon
+
+#define CACHE_LINE_SIZE         (64)
+#define MEMCPY_BLOCK_SIZE_SMALL (32768)
+#define MEMCPY_BLOCK_SIZE_MID   (1048576)
+#define PREFETCH_DISTANCE_NEAR  (CACHE_LINE_SIZE*4)
+#define PREFETCH_DISTANCE_MID   (CACHE_LINE_SIZE*4)
+#define PREFETCH_DISTANCE_FAR   (CACHE_LINE_SIZE*16)
+
+ENTRY(memmove)
+        cmp         r2, #0
+        cmpne       r0, r1
+        bxeq        lr
+        subs        r3, r0, r1
+        bls         .L_jump_to_memcpy
+        cmp         r2, r3
+        bhi         .L_reversed_memcpy
+
+.L_jump_to_memcpy:
+        b           memcpy
+
+.L_reversed_memcpy:
+        push        {r0, lr}
+        .cfi_def_cfa_offset 8
+        .cfi_rel_offset r0, 0
+        .cfi_rel_offset lr, 4
+
+        add         r0, r0, r2
+        add         r1, r1, r2
+
+        /* preload next cache line */
+        pld         [r1, #-CACHE_LINE_SIZE]
+        pld         [r1, #-CACHE_LINE_SIZE*2]
+
+.L_reversed_memcpy_align_dest:
+        /* Deal with very small blocks (< 32 bytes) asap */
+        cmp         r2, #32
+        blo         .L_reversed_memcpy_lt_32bytes
+        /* no need to align if len < 128 bytes */
+        cmp         r2, #128
+        blo         .L_reversed_memcpy_lt_128bytes
+        /* align destination to 64 bytes (1 cache line) */
+        ands        r3, r0, #0x3f
+        beq         .L_reversed_memcpy_dispatch
+        sub         r2, r2, r3
+0:      /* copy 1 byte */
+        movs        ip, r3, lsl #31
+        ldrbmi      ip, [r1, #-1]!
+        strbmi      ip, [r0, #-1]!
+1:      /* copy 2 bytes */
+        ldrbcs      ip, [r1, #-1]!
+        strbcs      ip, [r0, #-1]!
+        ldrbcs      ip, [r1, #-1]!
+        strbcs      ip, [r0, #-1]!
+2:      /* copy 4 bytes */
+        movs        ip, r3, lsl #29
+        bpl         3f
+        sub         r1, r1, #4
+        sub         r0, r0, #4
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
+3:      /* copy 8 bytes */
+        bcc         4f
+        sub         r1, r1, #8
+        sub         r0, r0, #8
+        vld1.8      {d0}, [r1]
+        vst1.8      {d0}, [r0, :64]
+4:      /* copy 16 bytes */
+        movs        ip, r3, lsl #27
+        bpl         5f
+        sub         r1, r1, #16
+        sub         r0, r0, #16
+        vld1.8      {q0}, [r1]
+        vst1.8      {q0}, [r0, :128]
+5:      /* copy 32 bytes */
+        bcc         .L_reversed_memcpy_dispatch
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        vld1.8      {q0, q1}, [r1]
+        vst1.8      {q0, q1}, [r0, :256]
+
+.L_reversed_memcpy_dispatch:
+        /* preload more cache lines */
+        pld         [r1, #-CACHE_LINE_SIZE*3]
+        pld         [r1, #-CACHE_LINE_SIZE*4]
+
+        cmp         r2, #MEMCPY_BLOCK_SIZE_SMALL
+        blo         .L_reversed_memcpy_neon_pld_near
+        cmp         r2, #MEMCPY_BLOCK_SIZE_MID
+        blo         .L_reversed_memcpy_neon_pld_mid
+        b           .L_reversed_memcpy_neon_pld_far
+
+.L_reversed_memcpy_neon_pld_near:
+        /* less than 128 bytes? */
+        subs        r2, r2, #128
+        blo         1f
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        mov         r3, #-32
+        .align      4
+0:
+        /* copy 128 bytes in each loop */
+        subs        r2, r2, #128
+
+        /* preload to cache */
+        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
+        /* copy a cache line */
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+
+        /* preload to cache */
+        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
+        /* copy a cache line */
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+
+        bhs         0b
+        add         r1, r1, #32
+        add         r0, r0, #32
+1:
+        adds        r2, r2, #128
+        bne         .L_reversed_memcpy_lt_128bytes
+        pop         {r0, pc}
+
+.L_reversed_memcpy_neon_pld_mid:
+        subs        r2, r2, #128
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        mov         r3, #-32
+        .align      4
+0:
+        /* copy 128 bytes in each loop */
+        subs        r2, r2, #128
+
+        /* preload to cache */
+        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
+        /* copy a cache line */
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+
+        /* preload to cache */
+        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
+        /* copy a cache line */
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+        vld1.8      {q0, q1}, [r1], r3
+        vst1.8      {q0, q1}, [r0, :256], r3
+
+        bhs         0b
+        add         r1, r1, #32
+        add         r0, r0, #32
+1:
+        adds        r2, r2, #128
+        bne         .L_reversed_memcpy_lt_128bytes
+        pop         {r0, pc}
+
+.L_reversed_memcpy_neon_pld_far:
+        sub         r2, r2, #128
+        sub         r0, r0, #128
+        sub         r1, r1, #128
+        .align      4
+0:
+        /* copy 128 bytes in each loop */
+        subs        r2, r2, #128
+
+        /* preload to cache */
+        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
+        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
+        /* read */
+        vld1.8      {q0, q1}, [r1]!
+        vld1.8      {q2, q3}, [r1]!
+        vld1.8      {q8, q9}, [r1]!
+        vld1.8      {q10, q11}, [r1]!
+        /* write */
+        vst1.8      {q0, q1}, [r0, :256]!
+        vst1.8      {q2, q3}, [r0, :256]!
+        vst1.8      {q8, q9}, [r0, :256]!
+        vst1.8      {q10, q11}, [r0, :256]!
+
+        sub         r0, r0, #256
+        sub         r1, r1, #256
+        bhs         0b
+        add         r0, r0, #128
+        add         r1, r1, #128
+1:
+        adds        r2, r2, #128
+        bne         .L_reversed_memcpy_lt_128bytes
+        pop         {r0, pc}
+
+.L_reversed_memcpy_lt_128bytes:
+6:      /* copy 64 bytes */
+        movs        ip, r2, lsl #26
+        bcc         5f
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        vld1.8      {q0, q1}, [r1]
+        vst1.8      {q0, q1}, [r0]
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        vld1.8      {q0, q1}, [r1]
+        vst1.8      {q0, q1}, [r0]
+5:      /* copy 32 bytes */
+        bpl         4f
+        sub         r1, r1, #32
+        sub         r0, r0, #32
+        vld1.8      {q0, q1}, [r1]
+        vst1.8      {q0, q1}, [r0]
+.L_reversed_memcpy_lt_32bytes:
+4:      /* copy 16 bytes */
+        movs        ip, r2, lsl #28
+        bcc         3f
+        sub         r1, r1, #16
+        sub         r0, r0, #16
+        vld1.8      {q0}, [r1]
+        vst1.8      {q0}, [r0]
+3:      /* copy 8 bytes */
+        bpl         2f
+        sub         r1, r1, #8
+        sub         r0, r0, #8
+        vld1.8      {d0}, [r1]
+        vst1.8      {d0}, [r0]
+2:      /* copy 4 bytes */
+        ands        ip, r2, #0x4
+        beq         1f
+        sub         r1, r1, #4
+        sub         r0, r0, #4
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]
+1:      /* copy 2 bytes */
+        movs        ip, r2, lsl #31
+        ldrbcs      ip, [r1, #-1]!
+        strbcs      ip, [r0, #-1]!
+        ldrbcs      ip, [r1, #-1]!
+        strbcs      ip, [r0, #-1]!
+0:      /* copy 1 byte */
+        ldrbmi      ip, [r1, #-1]!
+        strbmi      ip, [r0, #-1]!
+
+        pop         {r0, pc}
+
+END(memmove)
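One detail of memmove.S above worth spelling out: both the
destination-alignment prologue and the sub-128-byte tails copy one
power-of-two piece per set bit of the remaining count, and they test
those bits by shifting them into the flags. For example,
"movs ip, r2, lsl #26" parks bit 6 of the count in C and bit 5 in N, so
the 64- and 32-byte pieces are guarded by bcc/bpl with no extra compare
per size. A rough C equivalent of one such tail, assuming n < 128 and a
backwards (dst above src) copy; tail_copy_backwards is a hypothetical
helper, not part of the patch:

    #include <stddef.h>
    #include <string.h>

    static void tail_copy_backwards(unsigned char* dst,
                                    const unsigned char* src, size_t n) {
      unsigned char* dp = dst + n;
      const unsigned char* sp = src + n;
      unsigned char tmp[64];

      // Highest piece first, one piece per set bit of n. Each piece is
      // fully loaded before it is stored, mirroring the NEON register
      // load/store pairs, which keeps overlapping dst > src regions
      // correct.
      for (size_t piece = 64; piece != 0; piece >>= 1) {
        if (n & piece) {
          dp -= piece;
          sp -= piece;
          memcpy(tmp, sp, piece);  // load, like vld1.8 {q0, q1}, [r1]
          memcpy(dp, tmp, piece);  // store, like vst1.8 {q0, q1}, [r0]
        }
      }
    }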
diff --git a/libc/arch-arm/denver/denver.mk b/libc/arch-arm/denver/denver.mk
index 3fcc4572c..6989187bf 100644
--- a/libc/arch-arm/denver/denver.mk
+++ b/libc/arch-arm/denver/denver.mk
@@ -1,12 +1,13 @@
 libc_bionic_src_files_arm += \
     arch-arm/denver/bionic/memcpy.S \
+    arch-arm/denver/bionic/memmove.S \
     arch-arm/denver/bionic/memset.S \
     arch-arm/denver/bionic/__strcat_chk.S \
-    arch-arm/denver/bionic/__strcpy_chk.S
+    arch-arm/denver/bionic/__strcpy_chk.S \
 
 # Use cortex-a15 versions of strcat/strcpy/strlen.
 libc_bionic_src_files_arm += \
     arch-arm/cortex-a15/bionic/strcat.S \
     arch-arm/cortex-a15/bionic/strcpy.S \
     arch-arm/cortex-a15/bionic/strlen.S \
-    arch-arm/cortex-a15/bionic/strcmp.S
+    arch-arm/cortex-a15/bionic/strcmp.S \
diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk
index 2bc84e02d..2456e6e4c 100644
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -4,6 +4,7 @@ libc_bionic_src_files_arm += \
     arch-arm/generic/bionic/strcmp.S \
     arch-arm/generic/bionic/strcpy.S \
     arch-arm/generic/bionic/strlen.c \
+    bionic/memmove.c \
     bionic/__strcat_chk.cpp \
     bionic/__strcpy_chk.cpp \
     upstream-openbsd/lib/libc/string/strcat.c \
diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk
index 08342d657..631ab6879 100644
--- a/libc/arch-arm/krait/krait.mk
+++ b/libc/arch-arm/krait/krait.mk
@@ -5,8 +5,9 @@ libc_bionic_src_files_arm += \
     arch-arm/krait/bionic/__strcat_chk.S \
     arch-arm/krait/bionic/__strcpy_chk.S \
 
-# Use cortex-a15 versions of strcat/strcpy/strlen.
+# Use cortex-a15 versions of strcat/strcpy/strlen and the standard memmove.
 libc_bionic_src_files_arm += \
     arch-arm/cortex-a15/bionic/strcat.S \
     arch-arm/cortex-a15/bionic/strcpy.S \
     arch-arm/cortex-a15/bionic/strlen.S \
+    bionic/memmove.c \
diff --git a/tests/string_test.cpp b/tests/string_test.cpp
index 5ccc63d3a..f17e575dd 100644
--- a/tests/string_test.cpp
+++ b/tests/string_test.cpp
@@ -909,6 +909,56 @@ TEST(string, memmove) {
   }
 }
 
+static void verify_memmove(char* src_copy, char* dst, char* src, size_t size) {
+  memset(dst, 0, size);
+  memcpy(src, src_copy, size);
+  ASSERT_EQ(dst, memmove(dst, src, size));
+  ASSERT_EQ(0, memcmp(dst, src_copy, size));
+}
+
+#define MEMMOVE_DATA_SIZE (1024*1024*3)
+
+TEST(string, memmove_check) {
+  char* buffer = reinterpret_cast<char*>(malloc(MEMMOVE_DATA_SIZE));
+  ASSERT_TRUE(buffer != NULL);
+
+  char* src_data = reinterpret_cast<char*>(malloc(MEMMOVE_DATA_SIZE));
+  ASSERT_TRUE(src_data != NULL);
+  // Initialize to a known pattern to copy into src for each test and
+  // to compare dst against.
+  for (size_t i = 0; i < MEMMOVE_DATA_SIZE; i++) {
+    src_data[i] = (i + 1) % 255;
+  }
+
+  // Check all dst offsets between 0 and 126 inclusive.
+  char* src = buffer;
+  for (size_t i = 0; i < 127; i++) {
+    char* dst = buffer + 256 + i;
+    // Small copy.
+    verify_memmove(src_data, dst, src, 1024);
+
+    // Medium copy.
+    verify_memmove(src_data, dst, src, 64 * 1024);
+
+    // Large copy.
+    verify_memmove(src_data, dst, src, 1024 * 1024 + 128 * 1024);
+  }
+
+  // Check all leftover sizes between 1 and 126 inclusive.
+  char* dst = buffer + 256;
+  src = buffer;
+  for (size_t size = 1; size < 127; size++) {
+    // Small copy.
+    verify_memmove(src_data, dst, src, 1024 - size);
+
+    // Medium copy.
+    verify_memmove(src_data, dst, src, 64 * 1024 - size);
+
+    // Large copy.
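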
+    verify_memmove(src_data, dst, src, 1024 * 1024 + 128 * 1024 - size);
+  }
+}
+
 
 TEST(string, bcopy) {
   StringTestState<char> state(LARGE);
   for (size_t i = 0; i < state.n; i++) {
@@ -964,6 +1014,22 @@ TEST(string, memcpy_overread) {
   RunSrcDstBufferOverreadTest(DoMemcpyTest);
 }
 
+static void DoMemmoveTest(uint8_t* src, uint8_t* dst, size_t len) {
+  memset(src, (len % 255) + 1, len);
+  memset(dst, 0, len);
+
+  ASSERT_EQ(dst, memmove(dst, src, len));
+  ASSERT_TRUE(memcmp(src, dst, len) == 0);
+}
+
+TEST(string, memmove_align) {
+  RunSrcDstBufferAlignTest(LARGE, DoMemmoveTest);
+}
+
+TEST(string, memmove_overread) {
+  RunSrcDstBufferOverreadTest(DoMemmoveTest);
+}
+
 static void DoMemsetTest(uint8_t* buf, size_t len) {
   for (size_t i = 0; i < len; i++) {
     buf[i] = 0;
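For anyone wanting to poke at the dispatch boundaries outside the bionic
gtest suite, a standalone harness along these lines works against any
libc. This is an illustrative sketch, not part of the patch; check() and
its offsets are made up for the example:

    #include <assert.h>
    #include <stdlib.h>
    #include <string.h>

    // Fill a reference pattern at src_off, memmove it to dst_off, compare.
    static void check(size_t src_off, size_t dst_off, size_t n) {
      unsigned char* buf = malloc(src_off + dst_off + 2 * n + 64);
      unsigned char* ref = malloc(n);
      assert(buf != NULL && ref != NULL);

      for (size_t i = 0; i < n; i++) ref[i] = (unsigned char)((i + 1) % 255);
      memcpy(buf + src_off, ref, n);
      memmove(buf + dst_off, buf + src_off, n);
      assert(memcmp(buf + dst_off, ref, n) == 0);

      free(ref);
      free(buf);
    }

    int main(void) {
      check(0, 1, 4096);     // dst just above src, heavy overlap: reversed path
      check(1, 0, 4096);     // dst below src: forward path
      check(0, 4096, 4096);  // n == dst - src: regions touch, still forward
      check(0, 64, 200000);  // large overlapping copy, bigger size tier
      return 0;
    }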