From d119b7b6f48fe507088cfb98bcafa99b320fd884 Mon Sep 17 00:00:00 2001
From: Christopher Ferris
Date: Mon, 15 Jul 2013 12:49:26 -0700
Subject: [PATCH] Optimize strcat/strcpy, small tweaks to strlen.

Create one version of strcat/strcpy/strlen for cortex-a15/krait and
another version for cortex-a9.

Tested with the libc_test strcat/strcpy/strlen tests, including new
tests that verify that the src for strcat/strcpy is not overread
across a page boundary.

NOTE: The handling of unaligned strcpy (same code in strcat) could
probably be optimized further so that the src is read 64 bits at a
time instead of the partial reads that occur now.

strlen improves only slightly since it was recently optimized.

Performance improvements for strcpy and strcat (using an empty dest
string):

cortex-a9
- Small copies improve by about 5% to 20% as the size grows above
  10 bytes.
- Copies >= 1024 bytes, about a 60% improvement.
- Unaligned copies, about a 40% improvement.

cortex-a15
- Most small copies exhibit a 100% improvement; a few copies only
  improve by 20%.
- Copies >= 1024 bytes, about a 150% improvement.
- Unaligned copies, about a 100% improvement.

krait
- Most small copies vary widely but improve by about 20% on average;
  performance then climbs, hitting about a 100% improvement when
  copying 64 bytes of data.
- Copies >= 1024 bytes, about a 100% improvement.
- When copying MBs of data, about a 50% improvement.
- Unaligned copies, about a 90% improvement.

As strcat destination strings get larger in size:

cortex-a9
- about a 40% improvement for small dst strings (>= 32 bytes).
- about a 250% improvement for dst strings >= 1024 bytes.

cortex-a15
- about a 200% improvement for small dst strings (>= 32 bytes).
- about a 250% improvement for dst strings >= 1024 bytes.

krait
- about a 25% improvement for small dst strings (>= 32 bytes).
- about a 100% improvement for dst strings >= 1024 bytes.
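Note on the zero detection: the word-at-a-time scans in these routines
find a NUL byte with the classic three-instruction sequence

    sub   ip, rX, #0x01010101
    bic   ip, ip, rX
    ands  ip, ip, #0x80808080

which leaves ip nonzero if and only if some byte of rX is zero. A
minimal C sketch of the same test (the helper name is illustrative
only, not part of this patch):

    #include <stdint.h>

    /* Returns nonzero iff some byte of v is 0x00. The subtraction
       borrows through a zero byte, setting its top bit; AND-ing with
       ~v discards bytes whose top bit was already set in v, so there
       are no false positives. */
    static inline int has_zero_byte(uint32_t v) {
        return ((v - 0x01010101u) & ~v & 0x80808080u) != 0;
    }

The lsls/bcs sequences that follow each check then decode which of the
four per-byte 0x80 flags in ip fired, by shifting them into the N, Z,
and C condition flags.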
Change-Id: Ifd091ebdbce70fe35a7c5d8f71d5914255f3af35 --- libc/Android.mk | 1 - libc/arch-arm/arm.mk | 1 - libc/arch-arm/cortex-a15/bionic/strcat.S | 568 ++++++++++++++++++++ libc/arch-arm/cortex-a15/bionic/strcpy.S | 451 ++++++++++++++++ libc/arch-arm/cortex-a15/bionic/strlen.S | 80 +-- libc/arch-arm/cortex-a15/cortex-a15.mk | 2 + libc/arch-arm/cortex-a9/bionic/strcat.S | 548 +++++++++++++++++++ libc/arch-arm/cortex-a9/bionic/strcpy.S | 456 ++++++++++++++++ libc/arch-arm/cortex-a9/bionic/strlen.S | 167 ++++++ libc/arch-arm/cortex-a9/cortex-a9.mk | 5 +- libc/arch-arm/{ => generic}/bionic/strcpy.S | 0 libc/arch-arm/generic/generic.mk | 2 + libc/arch-arm/krait/krait.mk | 4 +- 13 files changed, 2247 insertions(+), 38 deletions(-) create mode 100644 libc/arch-arm/cortex-a15/bionic/strcat.S create mode 100644 libc/arch-arm/cortex-a15/bionic/strcpy.S create mode 100644 libc/arch-arm/cortex-a9/bionic/strcat.S create mode 100644 libc/arch-arm/cortex-a9/bionic/strcpy.S create mode 100644 libc/arch-arm/cortex-a9/bionic/strlen.S rename libc/arch-arm/{ => generic}/bionic/strcpy.S (100%) diff --git a/libc/Android.mk b/libc/Android.mk index db668d379..f353c416f 100644 --- a/libc/Android.mk +++ b/libc/Android.mk @@ -357,7 +357,6 @@ libc_common_src_files += \ bionic/memmove.c.arm \ string/bcopy.c \ string/strncmp.c \ - string/strcat.c \ string/strncat.c \ string/strncpy.c \ bionic/strchr.cpp \ diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk index 1a2185f8f..7fb15a1c7 100644 --- a/libc/arch-arm/arm.mk +++ b/libc/arch-arm/arm.mk @@ -14,7 +14,6 @@ _LIBC_ARCH_COMMON_SRC_FILES := \ arch-arm/bionic/_setjmp.S \ arch-arm/bionic/setjmp.S \ arch-arm/bionic/sigsetjmp.S \ - arch-arm/bionic/strcpy.S \ arch-arm/bionic/syscall.S \ arch-arm/bionic/tgkill.S \ arch-arm/bionic/tkill.S \ diff --git a/libc/arch-arm/cortex-a15/bionic/strcat.S b/libc/arch-arm/cortex-a15/bionic/strcat.S new file mode 100644 index 000000000..72d4e9eb0 --- /dev/null +++ b/libc/arch-arm/cortex-a15/bionic/strcat.S @@ -0,0 +1,568 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 2013 ARM Ltd + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + + .syntax unified + + .thumb + .thumb_func + + .macro m_push + push {r0, r4, r5, lr} + .endm // m_push + + .macro m_pop + pop {r0, r4, r5, pc} + .endm // m_pop + + .macro m_scan_byte + ldrb r3, [r0] + cbz r3, strcat_r0_scan_done + add r0, #1 + .endm // m_scan_byte + + .macro m_copy_byte reg, cmd, label + ldrb \reg, [r1], #1 + strb \reg, [r0], #1 + \cmd \reg, \label + .endm // m_copy_byte + +ENTRY(strcat) + // Quick check to see if src is empty. + ldrb r2, [r1] + pld [r1, #0] + cbnz r2, strcat_continue + bx lr + +strcat_continue: + // To speed up really small dst strings, unroll checking the first 4 bytes. + m_push + m_scan_byte + m_scan_byte + m_scan_byte + m_scan_byte + + ands r3, r0, #7 + beq strcat_mainloop + + // Align to a double word (64 bits). + rsb r3, r3, #8 + lsls ip, r3, #31 + beq strcat_align_to_32 + + ldrb r5, [r0] + cbz r5, strcat_r0_scan_done + add r0, r0, #1 + +strcat_align_to_32: + bcc strcat_align_to_64 + + ldrb r2, [r0] + cbz r2, strcat_r0_scan_done + add r0, r0, #1 + ldrb r4, [r0] + cbz r4, strcat_r0_scan_done + add r0, r0, #1 + +strcat_align_to_64: + tst r3, #4 + beq strcat_mainloop + ldr r3, [r0], #4 + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcat_zero_in_second_register + b strcat_mainloop + +strcat_r0_scan_done: + // For short copies, hard-code checking the first 8 bytes since this + // new code doesn't win until after about 8 bytes. + m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue + +strcpy_finish: + m_pop + +strcpy_continue: + ands r3, r0, #7 + beq strcpy_check_src_align + + // Align to a double word (64 bits). 
+ rsb r3, r3, #8 + lsls ip, r3, #31 + beq strcpy_align_to_32 + + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, strcpy_complete + +strcpy_align_to_32: + bcc strcpy_align_to_64 + + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, strcpy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, strcpy_complete + +strcpy_align_to_64: + tst r3, #4 + beq strcpy_check_src_align + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + str r2, [r0], #4 + +strcpy_check_src_align: + // At this point dst is aligned to a double word, check if src + // is also aligned to a double word. + ands r3, r1, #7 + bne strcpy_unaligned_copy + + .p2align 2 +strcpy_mainloop: + ldrd r2, r3, [r1], #8 + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_mainloop + +strcpy_complete: + m_pop + +strcpy_zero_in_first_register: + lsls lr, ip, #17 + bne strcpy_copy1byte + bcs strcpy_copy2bytes + lsls ip, ip, #1 + bne strcpy_copy3bytes + +strcpy_copy4bytes: + // Copy 4 bytes to the destiniation. + str r2, [r0] + m_pop + +strcpy_copy1byte: + strb r2, [r0] + m_pop + +strcpy_copy2bytes: + strh r2, [r0] + m_pop + +strcpy_copy3bytes: + strh r2, [r0], #2 + lsr r2, #16 + strb r2, [r0] + m_pop + +strcpy_zero_in_second_register: + lsls lr, ip, #17 + bne strcpy_copy5bytes + bcs strcpy_copy6bytes + lsls ip, ip, #1 + bne strcpy_copy7bytes + + // Copy 8 bytes to the destination. + strd r2, r3, [r0] + m_pop + +strcpy_copy5bytes: + str r2, [r0], #4 + strb r3, [r0] + m_pop + +strcpy_copy6bytes: + str r2, [r0], #4 + strh r3, [r0] + m_pop + +strcpy_copy7bytes: + str r2, [r0], #4 + strh r3, [r0], #2 + lsr r3, #16 + strb r3, [r0] + m_pop + +strcpy_unaligned_copy: + // Dst is aligned to a double word, while src is at an unknown alignment. + // There are 7 different versions of the unaligned copy code + // to prevent overreading the src. The mainloop of every single version + // will store 64 bits per loop. The difference is how much of src can + // be read without potentially crossing a page boundary. + tbb [pc, r3] +strcpy_unaligned_branchtable: + .byte 0 + .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2) + + .p2align 2 + // Can read 7 bytes before possibly crossing a page. 
+strcpy_unalign7: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r3, [r1] + cbz r3, strcpy_unalign7_copy5bytes + ldrb r4, [r1, #1] + cbz r4, strcpy_unalign7_copy6bytes + ldrb r5, [r1, #2] + cbz r5, strcpy_unalign7_copy7bytes + + ldr r3, [r1], #4 + pld [r1, #64] + + lsrs ip, r3, #24 + strd r2, r3, [r0], #8 + beq strcpy_unalign_return + b strcpy_unalign7 + +strcpy_unalign7_copy5bytes: + str r2, [r0], #4 + strb r3, [r0] +strcpy_unalign_return: + m_pop + +strcpy_unalign7_copy6bytes: + str r2, [r0], #4 + strb r3, [r0], #1 + strb r4, [r0], #1 + m_pop + +strcpy_unalign7_copy7bytes: + str r2, [r0], #4 + strb r3, [r0], #1 + strb r4, [r0], #1 + strb r5, [r0], #1 + m_pop + + .p2align 2 + // Can read 6 bytes before possibly crossing a page. +strcpy_unalign6: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r4, [r1] + cbz r4, strcpy_unalign_copy5bytes + ldrb r5, [r1, #1] + cbz r5, strcpy_unalign_copy6bytes + + ldr r3, [r1], #4 + pld [r1, #64] + + tst r3, #0xff0000 + beq strcpy_copy7bytes + lsrs ip, r3, #24 + strd r2, r3, [r0], #8 + beq strcpy_unalign_return + b strcpy_unalign6 + + .p2align 2 + // Can read 5 bytes before possibly crossing a page. +strcpy_unalign5: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r4, [r1] + cbz r4, strcpy_unalign_copy5bytes + + ldr r3, [r1], #4 + + pld [r1, #64] + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_unalign5 + +strcpy_unalign_copy5bytes: + str r2, [r0], #4 + strb r4, [r0] + m_pop + +strcpy_unalign_copy6bytes: + str r2, [r0], #4 + strb r4, [r0], #1 + strb r5, [r0] + m_pop + + .p2align 2 + // Can read 4 bytes before possibly crossing a page. +strcpy_unalign4: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldr r3, [r1], #4 + pld [r1, #64] + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_unalign4 + + .p2align 2 + // Can read 3 bytes before possibly crossing a page. +strcpy_unalign3: + ldrb r2, [r1] + cbz r2, strcpy_unalign3_copy1byte + ldrb r3, [r1, #1] + cbz r3, strcpy_unalign3_copy2bytes + ldrb r4, [r1, #2] + cbz r4, strcpy_unalign3_copy3bytes + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + + pld [r1, #64] + + lsrs lr, r2, #24 + beq strcpy_copy4bytes + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_unalign3 + +strcpy_unalign3_copy1byte: + strb r2, [r0] + m_pop + +strcpy_unalign3_copy2bytes: + strb r2, [r0], #1 + strb r3, [r0] + m_pop + +strcpy_unalign3_copy3bytes: + strb r2, [r0], #1 + strb r3, [r0], #1 + strb r4, [r0] + m_pop + + .p2align 2 + // Can read 2 bytes before possibly crossing a page. 
+strcpy_unalign2: + ldrb r2, [r1] + cbz r2, strcpy_unalign_copy1byte + ldrb r4, [r1, #1] + cbz r4, strcpy_unalign_copy2bytes + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + pld [r1, #64] + + tst r2, #0xff0000 + beq strcpy_copy3bytes + lsrs ip, r2, #24 + beq strcpy_copy4bytes + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_unalign2 + + .p2align 2 + // Can read 1 byte before possibly crossing a page. +strcpy_unalign1: + ldrb r2, [r1] + cbz r2, strcpy_unalign_copy1byte + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_unalign1 + +strcpy_unalign_copy1byte: + strb r2, [r0] + m_pop + +strcpy_unalign_copy2bytes: + strb r2, [r0], #1 + strb r4, [r0] + m_pop + + .p2align 2 +strcat_mainloop: + ldrd r2, r3, [r0], #8 + + pld [r0, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcat_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcat_zero_in_second_register + b strcat_mainloop + +strcat_zero_in_first_register: + // Prefetch the src now, it's going to be used soon. + pld [r1, #0] + lsls lr, ip, #17 + bne strcat_sub8 + bcs strcat_sub7 + lsls ip, ip, #1 + bne strcat_sub6 + + sub r0, r0, #5 + b strcat_r0_scan_done + +strcat_sub8: + sub r0, r0, #8 + b strcat_r0_scan_done + +strcat_sub7: + sub r0, r0, #7 + b strcat_r0_scan_done + +strcat_sub6: + sub r0, r0, #6 + b strcat_r0_scan_done + +strcat_zero_in_second_register: + // Prefetch the src now, it's going to be used soon. + pld [r1, #0] + lsls lr, ip, #17 + bne strcat_sub4 + bcs strcat_sub3 + lsls ip, ip, #1 + bne strcat_sub2 + + sub r0, r0, #1 + b strcat_r0_scan_done + +strcat_sub4: + sub r0, r0, #4 + b strcat_r0_scan_done + +strcat_sub3: + sub r0, r0, #3 + b strcat_r0_scan_done + +strcat_sub2: + sub r0, r0, #2 + b strcat_r0_scan_done +END(strcat) diff --git a/libc/arch-arm/cortex-a15/bionic/strcpy.S b/libc/arch-arm/cortex-a15/bionic/strcpy.S new file mode 100644 index 000000000..577354034 --- /dev/null +++ b/libc/arch-arm/cortex-a15/bionic/strcpy.S @@ -0,0 +1,451 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 2013 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + + .syntax unified + + .thumb + .thumb_func + + .macro m_push + push {r0, r4, r5, lr} + .endm // m_push + + .macro m_pop + pop {r0, r4, r5, pc} + .endm // m_pop + + .macro m_copy_byte reg, cmd, label + ldrb \reg, [r1], #1 + strb \reg, [r0], #1 + \cmd \reg, \label + .endm // m_copy_byte + +ENTRY(strcpy) + // For short copies, hard-code checking the first 8 bytes since this + // new code doesn't win until after about 8 bytes. + m_push + m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue + +strcpy_finish: + m_pop + +strcpy_continue: + pld [r1, #0] + ands r3, r0, #7 + beq strcpy_check_src_align + + // Align to a double word (64 bits). 
+ rsb r3, r3, #8 + lsls ip, r3, #31 + beq strcpy_align_to_32 + + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, strcpy_complete + +strcpy_align_to_32: + bcc strcpy_align_to_64 + + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, strcpy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, strcpy_complete + +strcpy_align_to_64: + tst r3, #4 + beq strcpy_check_src_align + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + str r2, [r0], #4 + +strcpy_check_src_align: + // At this point dst is aligned to a double word, check if src + // is also aligned to a double word. + ands r3, r1, #7 + bne strcpy_unaligned_copy + + .p2align 2 +strcpy_mainloop: + ldrd r2, r3, [r1], #8 + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_mainloop + +strcpy_complete: + m_pop + +strcpy_zero_in_first_register: + lsls lr, ip, #17 + bne strcpy_copy1byte + bcs strcpy_copy2bytes + lsls ip, ip, #1 + bne strcpy_copy3bytes + +strcpy_copy4bytes: + // Copy 4 bytes to the destiniation. + str r2, [r0] + m_pop + +strcpy_copy1byte: + strb r2, [r0] + m_pop + +strcpy_copy2bytes: + strh r2, [r0] + m_pop + +strcpy_copy3bytes: + strh r2, [r0], #2 + lsr r2, #16 + strb r2, [r0] + m_pop + +strcpy_zero_in_second_register: + lsls lr, ip, #17 + bne strcpy_copy5bytes + bcs strcpy_copy6bytes + lsls ip, ip, #1 + bne strcpy_copy7bytes + + // Copy 8 bytes to the destination. + strd r2, r3, [r0] + m_pop + +strcpy_copy5bytes: + str r2, [r0], #4 + strb r3, [r0] + m_pop + +strcpy_copy6bytes: + str r2, [r0], #4 + strh r3, [r0] + m_pop + +strcpy_copy7bytes: + str r2, [r0], #4 + strh r3, [r0], #2 + lsr r3, #16 + strb r3, [r0] + m_pop + +strcpy_unaligned_copy: + // Dst is aligned to a double word, while src is at an unknown alignment. + // There are 7 different versions of the unaligned copy code + // to prevent overreading the src. The mainloop of every single version + // will store 64 bits per loop. The difference is how much of src can + // be read without potentially crossing a page boundary. + tbb [pc, r3] +strcpy_unaligned_branchtable: + .byte 0 + .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2) + + .p2align 2 + // Can read 7 bytes before possibly crossing a page. 
+strcpy_unalign7: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r3, [r1] + cbz r3, strcpy_unalign7_copy5bytes + ldrb r4, [r1, #1] + cbz r4, strcpy_unalign7_copy6bytes + ldrb r5, [r1, #2] + cbz r5, strcpy_unalign7_copy7bytes + + ldr r3, [r1], #4 + pld [r1, #64] + + lsrs ip, r3, #24 + strd r2, r3, [r0], #8 + beq strcpy_unalign_return + b strcpy_unalign7 + +strcpy_unalign7_copy5bytes: + str r2, [r0], #4 + strb r3, [r0] +strcpy_unalign_return: + m_pop + +strcpy_unalign7_copy6bytes: + str r2, [r0], #4 + strb r3, [r0], #1 + strb r4, [r0], #1 + m_pop + +strcpy_unalign7_copy7bytes: + str r2, [r0], #4 + strb r3, [r0], #1 + strb r4, [r0], #1 + strb r5, [r0], #1 + m_pop + + .p2align 2 + // Can read 6 bytes before possibly crossing a page. +strcpy_unalign6: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r4, [r1] + cbz r4, strcpy_unalign_copy5bytes + ldrb r5, [r1, #1] + cbz r5, strcpy_unalign_copy6bytes + + ldr r3, [r1], #4 + pld [r1, #64] + + tst r3, #0xff0000 + beq strcpy_copy7bytes + lsrs ip, r3, #24 + strd r2, r3, [r0], #8 + beq strcpy_unalign_return + b strcpy_unalign6 + + .p2align 2 + // Can read 5 bytes before possibly crossing a page. +strcpy_unalign5: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r4, [r1] + cbz r4, strcpy_unalign_copy5bytes + + ldr r3, [r1], #4 + + pld [r1, #64] + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_unalign5 + +strcpy_unalign_copy5bytes: + str r2, [r0], #4 + strb r4, [r0] + m_pop + +strcpy_unalign_copy6bytes: + str r2, [r0], #4 + strb r4, [r0], #1 + strb r5, [r0] + m_pop + + .p2align 2 + // Can read 4 bytes before possibly crossing a page. +strcpy_unalign4: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldr r3, [r1], #4 + pld [r1, #64] + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_unalign4 + + .p2align 2 + // Can read 3 bytes before possibly crossing a page. +strcpy_unalign3: + ldrb r2, [r1] + cbz r2, strcpy_unalign3_copy1byte + ldrb r3, [r1, #1] + cbz r3, strcpy_unalign3_copy2bytes + ldrb r4, [r1, #2] + cbz r4, strcpy_unalign3_copy3bytes + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + + pld [r1, #64] + + lsrs lr, r2, #24 + beq strcpy_copy4bytes + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_unalign3 + +strcpy_unalign3_copy1byte: + strb r2, [r0] + m_pop + +strcpy_unalign3_copy2bytes: + strb r2, [r0], #1 + strb r3, [r0] + m_pop + +strcpy_unalign3_copy3bytes: + strb r2, [r0], #1 + strb r3, [r0], #1 + strb r4, [r0] + m_pop + + .p2align 2 + // Can read 2 bytes before possibly crossing a page. 
+strcpy_unalign2: + ldrb r2, [r1] + cbz r2, strcpy_unalign_copy1byte + ldrb r4, [r1, #1] + cbz r4, strcpy_unalign_copy2bytes + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + pld [r1, #64] + + tst r2, #0xff0000 + beq strcpy_copy3bytes + lsrs ip, r2, #24 + beq strcpy_copy4bytes + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_unalign2 + + .p2align 2 + // Can read 1 byte before possibly crossing a page. +strcpy_unalign1: + ldrb r2, [r1] + cbz r2, strcpy_unalign_copy1byte + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + strd r2, r3, [r0], #8 + b strcpy_unalign1 + +strcpy_unalign_copy1byte: + strb r2, [r0] + m_pop + +strcpy_unalign_copy2bytes: + strb r2, [r0], #1 + strb r4, [r0] + m_pop +END(strcpy) diff --git a/libc/arch-arm/cortex-a15/bionic/strlen.S b/libc/arch-arm/cortex-a15/bionic/strlen.S index d5b8ba422..08f6d193b 100644 --- a/libc/arch-arm/cortex-a15/bionic/strlen.S +++ b/libc/arch-arm/cortex-a15/bionic/strlen.S @@ -61,34 +61,32 @@ .thumb_func ENTRY(strlen) - pld [r1, #128] - mov r1, r0 + pld [r0, #0] + mov r1, r0 - rsb r3, r0, #0 - ands r3, r3, #7 + ands r3, r0, #7 beq mainloop // Align to a double word (64 bits). - ands ip, r3, #1 + rsb r3, r3, #8 + lsls ip, r3, #31 beq align_to_32 ldrb r2, [r1], #1 - cmp r2, #0 - beq update_count_and_return + cbz r2, update_count_and_return align_to_32: + bcc align_to_64 ands ip, r3, #2 beq align_to_64 ldrb r2, [r1], #1 - cmp r2, #0 - beq update_count_and_return + cbz r2, update_count_and_return ldrb r2, [r1], #1 - cmp r2, #0 - beq update_count_and_return + cbz r2, update_count_and_return align_to_64: - ands ip, r3, #4 + tst r3, #4 beq mainloop ldr r3, [r1], #4 @@ -97,6 +95,7 @@ align_to_64: ands ip, ip, #0x80808080 bne zero_in_second_register + .p2align 2 mainloop: ldrd r2, r3, [r1], #8 @@ -113,39 +112,54 @@ mainloop: bne zero_in_second_register b mainloop +update_count_and_return: + sub r0, r1, r0 + sub r0, r0, #1 + bx lr + zero_in_first_register: - sub r1, r1, #4 + sub r0, r1, r0 + lsls r3, ip, #17 + bne sub8_and_return + bcs sub7_and_return + lsls ip, ip, #1 + bne sub6_and_return + + sub r0, r0, #5 + bx lr + +sub8_and_return: + sub r0, r0, #8 + bx lr + +sub7_and_return: + sub r0, r0, #7 + bx lr + +sub6_and_return: + sub r0, r0, #6 + bx lr zero_in_second_register: sub r0, r1, r0 + lsls r3, ip, #17 + bne sub4_and_return + bcs sub3_and_return + lsls ip, ip, #1 + bne sub2_and_return - // Check for zero in byte 0. - ands r1, ip, #0x80 - beq check_byte1 + sub r0, r0, #1 + bx lr +sub4_and_return: sub r0, r0, #4 bx lr -check_byte1: - // Check for zero in byte 1. - ands r1, ip, #0x8000 - beq check_byte2 - +sub3_and_return: sub r0, r0, #3 bx lr -check_byte2: - // Check for zero in byte 2. 
- ands r1, ip, #0x800000 - beq return - +sub2_and_return: sub r0, r0, #2 bx lr - -update_count_and_return: - sub r0, r1, r0 - -return: - sub r0, r0, #1 - bx lr END(strlen) diff --git a/libc/arch-arm/cortex-a15/cortex-a15.mk b/libc/arch-arm/cortex-a15/cortex-a15.mk index 0904e6bca..281e424ba 100644 --- a/libc/arch-arm/cortex-a15/cortex-a15.mk +++ b/libc/arch-arm/cortex-a15/cortex-a15.mk @@ -1,6 +1,8 @@ $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/cortex-a15/bionic/memcpy.S) $(call libc-add-cpu-variant-src,MEMSET,arch-arm/cortex-a15/bionic/memset.S) +$(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a15/bionic/strcat.S) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a15/bionic/strcmp.S) +$(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a15/bionic/strcpy.S) $(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a15/bionic/strlen.S) include bionic/libc/arch-arm/generic/generic.mk diff --git a/libc/arch-arm/cortex-a9/bionic/strcat.S b/libc/arch-arm/cortex-a9/bionic/strcat.S new file mode 100644 index 000000000..0f5baef4c --- /dev/null +++ b/libc/arch-arm/cortex-a9/bionic/strcat.S @@ -0,0 +1,548 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 2013 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + + .syntax unified + + .thumb + .thumb_func + + .macro m_push + push {r0, r4, r5, lr} + .endm // m_push + + .macro m_ret inst + \inst {r0, r4, r5, pc} + .endm // m_ret + + .macro m_scan_byte + ldrb r3, [r0] + cbz r3, strcat_r0_scan_done + add r0, #1 + .endm // m_scan_byte + + .macro m_copy_byte reg, cmd, label + ldrb \reg, [r1], #1 + strb \reg, [r0], #1 + \cmd \reg, \label + .endm // m_copy_byte + +ENTRY(strcat) + // Quick check to see if src is empty. + ldrb r2, [r1] + pld [r1, #0] + cbnz r2, strcat_continue + bx lr + +strcat_continue: + // To speed up really small dst strings, unroll checking the first 4 bytes. + m_push + m_scan_byte + m_scan_byte + m_scan_byte + m_scan_byte + + ands r3, r0, #7 + bne strcat_align_src + + .p2align 2 +strcat_mainloop: + ldmia r0!, {r2, r3} + + pld [r0, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcat_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcat_zero_in_second_register + b strcat_mainloop + +strcat_zero_in_first_register: + sub r0, r0, #4 + +strcat_zero_in_second_register: + // Check for zero in byte 0. + tst ip, #0x80 + it ne + subne r0, r0, #4 + bne strcat_r0_scan_done + // Check for zero in byte 1. + tst ip, #0x8000 + it ne + subne r0, r0, #3 + bne strcat_r0_scan_done + // Check for zero in byte 2. + tst ip, #0x800000 + it ne + subne r0, r0, #2 + it eq + // Zero is in byte 3. + subeq r0, r0, #1 + +strcat_r0_scan_done: + // Unroll the first 8 bytes that will be copied. + m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue + +strcpy_finish: + m_ret inst=pop + +strcpy_continue: + pld [r1, #0] + ands r3, r0, #7 + bne strcpy_align_dst + +strcpy_check_src_align: + // At this point dst is aligned to a double word, check if src + // is also aligned to a double word. 
+ ands r3, r1, #7 + bne strcpy_unaligned_copy + + .p2align 2 +strcpy_mainloop: + ldmia r1!, {r2, r3} + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_mainloop + +strcpy_zero_in_first_register: + lsls lr, ip, #17 + itt ne + strbne r2, [r0] + m_ret inst=popne + itt cs + strhcs r2, [r0] + m_ret inst=popcs + lsls ip, ip, #1 + itt eq + streq r2, [r0] + m_ret inst=popeq + strh r2, [r0], #2 + lsr r3, r2, #16 + strb r3, [r0] + m_ret inst=pop + +strcpy_zero_in_second_register: + lsls lr, ip, #17 + ittt ne + stmiane r0!, {r2} + strbne r3, [r0] + m_ret inst=popne + ittt cs + strcs r2, [r0], #4 + strhcs r3, [r0] + m_ret inst=popcs + lsls ip, ip, #1 + itt eq + stmiaeq r0, {r2, r3} + m_ret inst=popeq + stmia r0!, {r2} + strh r3, [r0], #2 + lsr r4, r3, #16 + strb r4, [r0] + m_ret inst=pop + +strcpy_align_dst: + // Align to a double word (64 bits). + rsb r3, r3, #8 + lsls ip, r3, #31 + beq strcpy_align_to_32 + + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, strcpy_complete + +strcpy_align_to_32: + bcc strcpy_align_to_64 + + ldrb r4, [r1], #1 + strb r4, [r0], #1 + cmp r4, #0 + it eq + m_ret inst=popeq + ldrb r5, [r1], #1 + strb r5, [r0], #1 + cmp r5, #0 + it eq + m_ret inst=popeq + +strcpy_align_to_64: + tst r3, #4 + beq strcpy_check_src_align + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + stmia r0!, {r2} + b strcpy_check_src_align + +strcpy_complete: + m_ret inst=pop + +strcpy_unaligned_copy: + // Dst is aligned to a double word, while src is at an unknown alignment. + // There are 7 different versions of the unaligned copy code + // to prevent overreading the src. The mainloop of every single version + // will store 64 bits per loop. The difference is how much of src can + // be read without potentially crossing a page boundary. + tbb [pc, r3] +strcpy_unaligned_branchtable: + .byte 0 + .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2) + + .p2align 2 + // Can read 7 bytes before possibly crossing a page. +strcpy_unalign7: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r3, [r1] + cbz r3, strcpy_unalign7_copy5bytes + ldrb r4, [r1, #1] + cbz r4, strcpy_unalign7_copy6bytes + ldrb r5, [r1, #2] + cbz r5, strcpy_unalign7_copy7bytes + + ldr r3, [r1], #4 + pld [r1, #64] + + lsrs ip, r3, #24 + stmia r0!, {r2, r3} + beq strcpy_unalign_return + b strcpy_unalign7 + +strcpy_unalign7_copy5bytes: + stmia r0!, {r2} + strb r3, [r0] +strcpy_unalign_return: + m_ret inst=pop + +strcpy_unalign7_copy6bytes: + stmia r0!, {r2} + strb r3, [r0], #1 + strb r4, [r0], #1 + m_ret inst=pop + +strcpy_unalign7_copy7bytes: + stmia r0!, {r2} + strb r3, [r0], #1 + strb r4, [r0], #1 + strb r5, [r0], #1 + m_ret inst=pop + + .p2align 2 + // Can read 6 bytes before possibly crossing a page. 
+strcpy_unalign6: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r4, [r1] + cbz r4, strcpy_unalign_copy5bytes + ldrb r5, [r1, #1] + cbz r5, strcpy_unalign_copy6bytes + + ldr r3, [r1], #4 + pld [r1, #64] + + tst r3, #0xff0000 + beq strcpy_unalign6_copy7bytes + lsrs ip, r3, #24 + stmia r0!, {r2, r3} + beq strcpy_unalign_return + b strcpy_unalign6 + +strcpy_unalign6_copy7bytes: + stmia r0!, {r2} + strh r3, [r0], #2 + lsr r3, #16 + strb r3, [r0] + m_ret inst=pop + + .p2align 2 + // Can read 5 bytes before possibly crossing a page. +strcpy_unalign5: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r4, [r1] + cbz r4, strcpy_unalign_copy5bytes + + ldr r3, [r1], #4 + + pld [r1, #64] + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_unalign5 + +strcpy_unalign_copy5bytes: + stmia r0!, {r2} + strb r4, [r0] + m_ret inst=pop + +strcpy_unalign_copy6bytes: + stmia r0!, {r2} + strb r4, [r0], #1 + strb r5, [r0] + m_ret inst=pop + + .p2align 2 + // Can read 4 bytes before possibly crossing a page. +strcpy_unalign4: + ldmia r1!, {r2} + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldmia r1!, {r3} + pld [r1, #64] + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_unalign4 + + .p2align 2 + // Can read 3 bytes before possibly crossing a page. +strcpy_unalign3: + ldrb r2, [r1] + cbz r2, strcpy_unalign3_copy1byte + ldrb r3, [r1, #1] + cbz r3, strcpy_unalign3_copy2bytes + ldrb r4, [r1, #2] + cbz r4, strcpy_unalign3_copy3bytes + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + + pld [r1, #64] + + lsrs lr, r2, #24 + beq strcpy_unalign_copy4bytes + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_unalign3 + +strcpy_unalign3_copy1byte: + strb r2, [r0] + m_ret inst=pop + +strcpy_unalign3_copy2bytes: + strb r2, [r0], #1 + strb r3, [r0] + m_ret inst=pop + +strcpy_unalign3_copy3bytes: + strb r2, [r0], #1 + strb r3, [r0], #1 + strb r4, [r0] + m_ret inst=pop + + .p2align 2 + // Can read 2 bytes before possibly crossing a page. +strcpy_unalign2: + ldrb r2, [r1] + cbz r2, strcpy_unalign_copy1byte + ldrb r3, [r1, #1] + cbz r3, strcpy_unalign_copy2bytes + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + pld [r1, #64] + + tst r2, #0xff0000 + beq strcpy_unalign_copy3bytes + lsrs ip, r2, #24 + beq strcpy_unalign_copy4bytes + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_unalign2 + + .p2align 2 + // Can read 1 byte before possibly crossing a page. 
+strcpy_unalign1: + ldrb r2, [r1] + cbz r2, strcpy_unalign_copy1byte + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_unalign1 + +strcpy_unalign_copy1byte: + strb r2, [r0] + m_ret inst=pop + +strcpy_unalign_copy2bytes: + strb r2, [r0], #1 + strb r3, [r0] + m_ret inst=pop + +strcpy_unalign_copy3bytes: + strh r2, [r0], #2 + lsr r2, #16 + strb r2, [r0] + m_ret inst=pop + +strcpy_unalign_copy4bytes: + stmia r0, {r2} + m_ret inst=pop + +strcat_align_src: + // Align to a double word (64 bits). + rsb r3, r3, #8 + lsls ip, r3, #31 + beq strcat_align_to_32 + ldrb r2, [r0], #1 + cbz r2, strcat_r0_update + +strcat_align_to_32: + bcc strcat_align_to_64 + ldrb r2, [r0], #1 + cbz r2, strcat_r0_update + ldrb r2, [r0], #1 + cbz r2, strcat_r0_update + +strcat_align_to_64: + tst r3, #4 + beq strcat_mainloop + ldr r3, [r0], #4 + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcat_zero_in_second_register + b strcat_mainloop + +strcat_r0_update: + sub r0, r0, #1 + b strcat_r0_scan_done +END(strcat) diff --git a/libc/arch-arm/cortex-a9/bionic/strcpy.S b/libc/arch-arm/cortex-a9/bionic/strcpy.S new file mode 100644 index 000000000..9aa4f883d --- /dev/null +++ b/libc/arch-arm/cortex-a9/bionic/strcpy.S @@ -0,0 +1,456 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 2013 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + + .syntax unified + + .thumb + .thumb_func + + .macro m_push + push {r0, r4, r5, lr} + .endm // m_push + + .macro m_ret inst + \inst {r0, r4, r5, pc} + .endm // m_ret + + .macro m_copy_byte reg, cmd, label + ldrb \reg, [r1], #1 + strb \reg, [r0], #1 + \cmd \reg, \label + .endm // m_copy_byte + +ENTRY(strcpy) + // Unroll the first 8 bytes that will be copied. + m_push + m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish + m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue + +strcpy_finish: + m_ret inst=pop + +strcpy_continue: + pld [r1, #0] + ands r3, r0, #7 + bne strcpy_align_dst + +strcpy_check_src_align: + // At this point dst is aligned to a double word, check if src + // is also aligned to a double word. + ands r3, r1, #7 + bne strcpy_unaligned_copy + + .p2align 2 +strcpy_mainloop: + ldmia r1!, {r2, r3} + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_mainloop + +strcpy_zero_in_first_register: + lsls lr, ip, #17 + itt ne + strbne r2, [r0] + m_ret inst=popne + itt cs + strhcs r2, [r0] + m_ret inst=popcs + lsls ip, ip, #1 + itt eq + streq r2, [r0] + m_ret inst=popeq + strh r2, [r0], #2 + lsr r3, r2, #16 + strb r3, [r0] + m_ret inst=pop + +strcpy_zero_in_second_register: + lsls lr, ip, #17 + ittt ne + stmiane r0!, {r2} + strbne r3, [r0] + m_ret inst=popne + ittt cs + strcs r2, [r0], #4 + strhcs r3, [r0] + m_ret inst=popcs + lsls ip, ip, #1 + itt eq + stmiaeq r0, {r2, r3} + m_ret inst=popeq + stmia r0!, {r2} + strh r3, [r0], #2 + lsr r4, r3, #16 + strb r4, [r0] + m_ret inst=pop + +strcpy_align_dst: + // Align to a double word (64 bits). 
+ rsb r3, r3, #8 + lsls ip, r3, #31 + beq strcpy_align_to_32 + + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, strcpy_complete + +strcpy_align_to_32: + bcc strcpy_align_to_64 + + ldrb r4, [r1], #1 + strb r4, [r0], #1 + cmp r4, #0 + it eq + m_ret inst=popeq + ldrb r5, [r1], #1 + strb r5, [r0], #1 + cmp r5, #0 + it eq + m_ret inst=popeq + +strcpy_align_to_64: + tst r3, #4 + beq strcpy_check_src_align + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + stmia r0!, {r2} + b strcpy_check_src_align + +strcpy_complete: + m_ret inst=pop + +strcpy_unaligned_copy: + // Dst is aligned to a double word, while src is at an unknown alignment. + // There are 7 different versions of the unaligned copy code + // to prevent overreading the src. The mainloop of every single version + // will store 64 bits per loop. The difference is how much of src can + // be read without potentially crossing a page boundary. + tbb [pc, r3] +strcpy_unaligned_branchtable: + .byte 0 + .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2) + .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2) + + .p2align 2 + // Can read 7 bytes before possibly crossing a page. +strcpy_unalign7: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r3, [r1] + cbz r3, strcpy_unalign7_copy5bytes + ldrb r4, [r1, #1] + cbz r4, strcpy_unalign7_copy6bytes + ldrb r5, [r1, #2] + cbz r5, strcpy_unalign7_copy7bytes + + ldr r3, [r1], #4 + pld [r1, #64] + + lsrs ip, r3, #24 + stmia r0!, {r2, r3} + beq strcpy_unalign_return + b strcpy_unalign7 + +strcpy_unalign7_copy5bytes: + stmia r0!, {r2} + strb r3, [r0] +strcpy_unalign_return: + m_ret inst=pop + +strcpy_unalign7_copy6bytes: + stmia r0!, {r2} + strb r3, [r0], #1 + strb r4, [r0], #1 + m_ret inst=pop + +strcpy_unalign7_copy7bytes: + stmia r0!, {r2} + strb r3, [r0], #1 + strb r4, [r0], #1 + strb r5, [r0], #1 + m_ret inst=pop + + .p2align 2 + // Can read 6 bytes before possibly crossing a page. +strcpy_unalign6: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r4, [r1] + cbz r4, strcpy_unalign_copy5bytes + ldrb r5, [r1, #1] + cbz r5, strcpy_unalign_copy6bytes + + ldr r3, [r1], #4 + pld [r1, #64] + + tst r3, #0xff0000 + beq strcpy_unalign6_copy7bytes + lsrs ip, r3, #24 + stmia r0!, {r2, r3} + beq strcpy_unalign_return + b strcpy_unalign6 + +strcpy_unalign6_copy7bytes: + stmia r0!, {r2} + strh r3, [r0], #2 + lsr r3, #16 + strb r3, [r0] + m_ret inst=pop + + .p2align 2 + // Can read 5 bytes before possibly crossing a page. 
+strcpy_unalign5: + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldrb r4, [r1] + cbz r4, strcpy_unalign_copy5bytes + + ldr r3, [r1], #4 + + pld [r1, #64] + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_unalign5 + +strcpy_unalign_copy5bytes: + stmia r0!, {r2} + strb r4, [r0] + m_ret inst=pop + +strcpy_unalign_copy6bytes: + stmia r0!, {r2} + strb r4, [r0], #1 + strb r5, [r0] + m_ret inst=pop + + .p2align 2 + // Can read 4 bytes before possibly crossing a page. +strcpy_unalign4: + ldmia r1!, {r2} + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + ldmia r1!, {r3} + pld [r1, #64] + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_unalign4 + + .p2align 2 + // Can read 3 bytes before possibly crossing a page. +strcpy_unalign3: + ldrb r2, [r1] + cbz r2, strcpy_unalign3_copy1byte + ldrb r3, [r1, #1] + cbz r3, strcpy_unalign3_copy2bytes + ldrb r4, [r1, #2] + cbz r4, strcpy_unalign3_copy3bytes + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + + pld [r1, #64] + + lsrs lr, r2, #24 + beq strcpy_unalign_copy4bytes + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_unalign3 + +strcpy_unalign3_copy1byte: + strb r2, [r0] + m_ret inst=pop + +strcpy_unalign3_copy2bytes: + strb r2, [r0], #1 + strb r3, [r0] + m_ret inst=pop + +strcpy_unalign3_copy3bytes: + strb r2, [r0], #1 + strb r3, [r0], #1 + strb r4, [r0] + m_ret inst=pop + + .p2align 2 + // Can read 2 bytes before possibly crossing a page. +strcpy_unalign2: + ldrb r2, [r1] + cbz r2, strcpy_unalign_copy1byte + ldrb r3, [r1, #1] + cbz r3, strcpy_unalign_copy2bytes + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + pld [r1, #64] + + tst r2, #0xff0000 + beq strcpy_unalign_copy3bytes + lsrs ip, r2, #24 + beq strcpy_unalign_copy4bytes + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_unalign2 + + .p2align 2 + // Can read 1 byte before possibly crossing a page. +strcpy_unalign1: + ldrb r2, [r1] + cbz r2, strcpy_unalign_copy1byte + + ldr r2, [r1], #4 + ldr r3, [r1], #4 + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne strcpy_zero_in_second_register + + stmia r0!, {r2, r3} + b strcpy_unalign1 + +strcpy_unalign_copy1byte: + strb r2, [r0] + m_ret inst=pop + +strcpy_unalign_copy2bytes: + strb r2, [r0], #1 + strb r3, [r0] + m_ret inst=pop + +strcpy_unalign_copy3bytes: + strh r2, [r0], #2 + lsr r2, #16 + strb r2, [r0] + m_ret inst=pop + +strcpy_unalign_copy4bytes: + stmia r0, {r2} + m_ret inst=pop +END(strcpy) diff --git a/libc/arch-arm/cortex-a9/bionic/strlen.S b/libc/arch-arm/cortex-a9/bionic/strlen.S new file mode 100644 index 000000000..259eda0c4 --- /dev/null +++ b/libc/arch-arm/cortex-a9/bionic/strlen.S @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 2013 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + + .syntax unified + + .thumb + .thumb_func + +ENTRY(strlen) + pld [r0, #0] + mov r1, r0 + + ands r3, r0, #7 + bne align_src + + .p2align 2 +mainloop: + ldmia r1!, {r2, r3} + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne zero_in_second_register + b mainloop + +zero_in_first_register: + sub r0, r1, r0 + // Check for zero in byte 0. + lsls r2, ip, #17 + beq check_byte1_reg1 + + sub r0, r0, #8 + bx lr + +check_byte1_reg1: + bcc check_byte2_reg1 + + sub r0, r0, #7 + bx lr + +check_byte2_reg1: + // Check for zero in byte 2. 
+ tst ip, #0x800000 + itt ne + subne r0, r0, #6 + bxne lr + sub r0, r0, #5 + bx lr + +zero_in_second_register: + sub r0, r1, r0 + // Check for zero in byte 0. + lsls r2, ip, #17 + beq check_byte1_reg2 + + sub r0, r0, #4 + bx lr + +check_byte1_reg2: + bcc check_byte2_reg2 + + sub r0, r0, #3 + bx lr + +check_byte2_reg2: + // Check for zero in byte 2. + tst ip, #0x800000 + itt ne + subne r0, r0, #2 + bxne lr + sub r0, r0, #1 + bx lr + +align_src: + // Align to a double word (64 bits). + rsb r3, r3, #8 + lsls ip, r3, #31 + beq align_to_32 + + ldrb r2, [r1], #1 + cbz r2, done + +align_to_32: + bcc align_to_64 + + ldrb r2, [r1], #1 + cbz r2, done + ldrb r2, [r1], #1 + cbz r2, done + +align_to_64: + tst r3, #4 + beq mainloop + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne zero_in_second_register + b mainloop + +done: + sub r0, r1, r0 + sub r0, r0, #1 + bx lr +END(strlen) diff --git a/libc/arch-arm/cortex-a9/cortex-a9.mk b/libc/arch-arm/cortex-a9/cortex-a9.mk index 5c684ed49..61a52c2ac 100644 --- a/libc/arch-arm/cortex-a9/cortex-a9.mk +++ b/libc/arch-arm/cortex-a9/cortex-a9.mk @@ -1,7 +1,8 @@ $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/cortex-a9/bionic/memcpy.S) $(call libc-add-cpu-variant-src,MEMSET,arch-arm/cortex-a9/bionic/memset.S) +$(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a9/bionic/strcat.S) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a9/bionic/strcmp.S) -# Use cortex-a15 version of strlen. -$(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a15/bionic/strlen.S) +$(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a9/bionic/strcpy.S) +$(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a9/bionic/strlen.S) include bionic/libc/arch-arm/generic/generic.mk diff --git a/libc/arch-arm/bionic/strcpy.S b/libc/arch-arm/generic/bionic/strcpy.S similarity index 100% rename from libc/arch-arm/bionic/strcpy.S rename to libc/arch-arm/generic/bionic/strcpy.S diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk index 18cad9da6..0b3f64494 100644 --- a/libc/arch-arm/generic/generic.mk +++ b/libc/arch-arm/generic/generic.mk @@ -1,4 +1,6 @@ $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/generic/bionic/memcpy.S) $(call libc-add-cpu-variant-src,MEMSET,arch-arm/generic/bionic/memset.S) +$(call libc-add-cpu-variant-src,STRCAT,string/strcat.c) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/generic/bionic/strcmp.S) +$(call libc-add-cpu-variant-src,STRCPY,arch-arm/generic/bionic/strcpy.c) $(call libc-add-cpu-variant-src,STRLEN,arch-arm/generic/bionic/strlen.c) diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk index 288afbb61..1ff18e9bb 100644 --- a/libc/arch-arm/krait/krait.mk +++ b/libc/arch-arm/krait/krait.mk @@ -1,7 +1,9 @@ $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/krait/bionic/memcpy.S) $(call libc-add-cpu-variant-src,MEMSET,arch-arm/krait/bionic/memset.S) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/krait/bionic/strcmp.S) -# Use cortex-a15 version of strlen. +# Use cortex-a15 versions of strcat/strcpy/strlen. +$(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a15/bionic/strcat.S) +$(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a15/bionic/strcpy.S) $(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a15/bionic/strlen.S) include bionic/libc/arch-arm/generic/generic.mk
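Background on the unaligned-copy variants above (strcpy_unalign1
through strcpy_unalign7): an aligned load never crosses a page
boundary, so once the reads from src are word-aligned the scan can
safely load 32 bits at a time without faulting on the page after the
terminating NUL. The seven variants, selected by the tbb branch table
on the low bits of src, differ only in how many bytes may safely be
read before realignment. A minimal C sketch of the read-safety idea
(hypothetical function; real implementations use assembly partly to
sidestep C aliasing and alignment rules):

    #include <stddef.h>
    #include <stdint.h>

    /* Word-at-a-time strlen: align first with byte reads, then scan
       aligned 32-bit words. An aligned 4-byte load lies entirely
       inside one page, so this never reads past the page containing
       the NUL. */
    size_t strlen_sketch(const char *s) {
        const char *p = s;
        while (((uintptr_t)p & 3u) != 0) {  /* byte reads are page-safe */
            if (*p == '\0') return (size_t)(p - s);
            p++;
        }
        const uint32_t *w = (const uint32_t *)(const void *)p;
        for (;;) {
            uint32_t v = *w++;
            if ((v - 0x01010101u) & ~v & 0x80808080u) { /* zero byte? */
                const char *b = (const char *)(w - 1);
                while (*b != '\0') b++;       /* locate the exact byte */
                return (size_t)(b - s);
            }
        }
    }

The assembly versions additionally store 64 bits per loop iteration
(strd on cortex-a15, stmia on cortex-a9); the sketch shows only why
the over-wide reads cannot fault.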