diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk index 1d9863c32..e87ef3803 100644 --- a/libc/arch-arm/arm.mk +++ b/libc/arch-arm/arm.mk @@ -31,6 +31,8 @@ _LIBC_ARCH_DYNAMIC_SRC_FILES := \ _LIBC_FORTIFY_FILES_TO_REMOVE := \ bionic/__memcpy_chk.cpp \ bionic/__memset_chk.cpp \ + bionic/__strcpy_chk.cpp \ + bionic/__strcat_chk.cpp \ libc_common_src_files := \ $(filter-out $(_LIBC_FORTIFY_FILES_TO_REMOVE),$(libc_common_src_files)) diff --git a/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S new file mode 100644 index 000000000..08dc78a9a --- /dev/null +++ b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include "libc_events.h" + + .syntax unified + + .thumb + .thumb_func + +// Get the length of src string, then get the source of the dst string. +// Check that the two lengths together don't exceed the threshold, then +// do a memcpy of the data. +ENTRY(__strcat_chk) + .cfi_startproc + pld [r0, #0] + push {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + push {r4, r5} + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r4, 0 + .cfi_rel_offset r5, 0 + + mov lr, r2 + + // Save the dst register to r5 + mov r5, r0 + + // Zero out r4 + eor r4, r4, r4 + + // r1 contains the address of the string to count. +.L_strlen_start: + mov r0, r1 + ands r3, r1, #7 + beq .L_mainloop + + // Align to a double word (64 bits). 
+ rsb r3, r3, #8 + lsls ip, r3, #31 + beq .L_align_to_32 + + ldrb r2, [r1], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_32: + bcc .L_align_to_64 + ands ip, r3, #2 + beq .L_align_to_64 + + ldrb r2, [r1], #1 + cbz r2, .L_update_count_and_finish + ldrb r2, [r1], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_64: + tst r3, #4 + beq .L_mainloop + ldr r3, [r1], #4 + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + + .p2align 2 +.L_mainloop: + ldrd r2, r3, [r1], #8 + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne .L_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + b .L_mainloop + +.L_update_count_and_finish: + sub r3, r1, r0 + sub r3, r3, #1 + b .L_finish + +.L_zero_in_first_register: + sub r3, r1, r0 + lsls r2, ip, #17 + bne .L_sub8_and_finish + bcs .L_sub7_and_finish + lsls ip, ip, #1 + bne .L_sub6_and_finish + + sub r3, r3, #5 + b .L_finish + +.L_sub8_and_finish: + sub r3, r3, #8 + b .L_finish + +.L_sub7_and_finish: + sub r3, r3, #7 + b .L_finish + +.L_sub6_and_finish: + sub r3, r3, #6 + b .L_finish + +.L_zero_in_second_register: + sub r3, r1, r0 + lsls r2, ip, #17 + bne .L_sub4_and_finish + bcs .L_sub3_and_finish + lsls ip, ip, #1 + bne .L_sub2_and_finish + + sub r3, r3, #1 + b .L_finish + +.L_sub4_and_finish: + sub r3, r3, #4 + b .L_finish + +.L_sub3_and_finish: + sub r3, r3, #3 + b .L_finish + +.L_sub2_and_finish: + sub r3, r3, #2 + +.L_finish: + cmp r4, #0 + bne .L_strlen_done + + // Time to get the dst string length. + mov r1, r5 + + // Save the original source address to r5. + mov r5, r0 + + // Save the current length (adding 1 for the terminator). + add r4, r3, #1 + b .L_strlen_start + + // r0 holds the pointer to the dst string. + // r3 holds the dst string length. + // r4 holds the src string length + 1. +.L_strlen_done: + add r2, r3, r4 + cmp r2, lr + bgt .L_fortify_check_failed + + // Set up the registers for the memcpy code. + mov r1, r5 + pld [r1, #64] + mov r2, r4 + add r0, r0, r3 + pop {r4, r5} + .cfi_adjust_cfa_offset -8 + .cfi_restore r4 + .cfi_restore r5 + + #include "memcpy_base.S" + +.L_fortify_check_failed: + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r4, 0 + .cfi_rel_offset r5, 4 + + ldr r0, error_message + ldr r1, error_code +1: + add r0, pc + bl __fortify_chk_fail +error_code: + .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW +error_message: + .word error_string-(1b+4) + + .cfi_endproc +END(__strcat_chk) + + .data +error_string: + .string "strcat buffer overflow" diff --git a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S new file mode 100644 index 000000000..9fde59015 --- /dev/null +++ b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S @@ -0,0 +1,176 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include "libc_events.h" + + .syntax unified + + .thumb + .thumb_func + +// Get the length of the source string first, then do a memcpy of the data +// instead of a strcpy. +ENTRY(__strcpy_chk) + .cfi_startproc + pld [r0, #0] + push {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + + mov lr, r2 + mov r0, r1 + + ands r3, r1, #7 + beq .L_mainloop + + // Align to a double word (64 bits). + rsb r3, r3, #8 + lsls ip, r3, #31 + beq .L_align_to_32 + + ldrb r2, [r0], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_32: + bcc .L_align_to_64 + ands ip, r3, #2 + beq .L_align_to_64 + + ldrb r2, [r0], #1 + cbz r2, .L_update_count_and_finish + ldrb r2, [r0], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_64: + tst r3, #4 + beq .L_mainloop + ldr r3, [r0], #4 + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + + .p2align 2 +.L_mainloop: + ldrd r2, r3, [r0], #8 + + pld [r0, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne .L_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + b .L_mainloop + +.L_update_count_and_finish: + sub r3, r0, r1 + sub r3, r3, #1 + b .L_check_size + +.L_zero_in_first_register: + sub r3, r0, r1 + lsls r2, ip, #17 + bne .L_sub8_and_finish + bcs .L_sub7_and_finish + lsls ip, ip, #1 + bne .L_sub6_and_finish + + sub r3, r3, #5 + b .L_check_size + +.L_sub8_and_finish: + sub r3, r3, #8 + b .L_check_size + +.L_sub7_and_finish: + sub r3, r3, #7 + b .L_check_size + +.L_sub6_and_finish: + sub r3, r3, #6 + b .L_check_size + +.L_zero_in_second_register: + sub r3, r0, r1 + lsls r2, ip, #17 + bne .L_sub4_and_finish + bcs .L_sub3_and_finish + lsls ip, ip, #1 + bne .L_sub2_and_finish + + sub r3, r3, #1 + b .L_check_size + +.L_sub4_and_finish: + sub r3, r3, #4 + b .L_check_size + +.L_sub3_and_finish: + sub r3, r3, #3 + b .L_check_size + +.L_sub2_and_finish: + sub r3, r3, #2 + +.L_check_size: + pld [r1, #0] + pld [r1, #64] + ldr r0, [sp] + cmp r3, lr + bge .L_fortify_check_failed + + // Add 1 for copy length to get the string terminator. 
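
The size check right above (cmp r3, lr; bge .L_fortify_check_failed) enforces the usual FORTIFY contract for strcpy: the source string plus its terminator must fit in the destination buffer, whose size arrives in r2 and is kept in lr. A rough C sketch of that contract, purely for illustration (the function name is invented; the real entry point is the assembly above):

    #include <stdlib.h>
    #include <string.h>

    // Illustrative only: mirrors the assembly's check, where r3 = strlen(src)
    // and lr holds dst_buf_size.
    char* strcpy_chk_sketch(char* dst, const char* src, size_t dst_buf_size) {
        size_t src_len = strlen(src);            // the assembly computes this inline
        if (src_len >= dst_buf_size) {
            abort();                             // assembly: branch to __fortify_chk_fail
        }
        memcpy(dst, src, src_len + 1);           // +1 copies the terminating NUL
        return dst;
    }
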
+ add r2, r3, #1 + + #include "memcpy_base.S" + +.L_fortify_check_failed: + ldr r0, error_message + ldr r1, error_code +1: + add r0, pc + bl __fortify_chk_fail +error_code: + .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW +error_message: + .word error_string-(1b+4) + + .cfi_endproc +END(__strcpy_chk) + + .data +error_string: + .string "strcpy buffer overflow" diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy.S b/libc/arch-arm/cortex-a15/bionic/memcpy.S index 239402455..8052d62ef 100644 --- a/libc/arch-arm/cortex-a15/bionic/memcpy.S +++ b/libc/arch-arm/cortex-a15/bionic/memcpy.S @@ -53,11 +53,8 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - /* Prototype: void *memcpy (void *dst, const void *src, size_t count). */ +// Prototype: void *memcpy (void *dst, const void *src, size_t count). - // This version is tuned for the Cortex-A15 processor. - -#include #include #include "libc_events.h" @@ -65,274 +62,34 @@ .syntax unified .fpu neon -#define CACHE_LINE_SIZE 64 - ENTRY(__memcpy_chk) + .cfi_startproc cmp r2, r3 - bgt fortify_check_failed + bgt __memcpy_chk_fail // Fall through to memcpy... + .cfi_endproc END(__memcpy_chk) ENTRY(memcpy) - // Assumes that n >= 0, and dst, src are valid pointers. - // For any sizes less than 832 use the neon code that doesn't - // care about the src alignment. This avoids any checks - // for src alignment, and offers the best improvement since - // smaller sized copies are dominated by the overhead of - // the pre and post main loop. - // For larger copies, if src and dst cannot both be aligned to - // word boundaries, use the neon code. - // For all other copies, align dst to a double word boundary - // and copy using LDRD/STRD instructions. - - // Save registers (r0 holds the return value): - // optimized push {r0, lr}. - .save {r0, lr} - pld [r1, #(CACHE_LINE_SIZE*16)] + .cfi_startproc + pld [r1, #64] push {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 - cmp r2, #16 - blo copy_less_than_16_unknown_align + #include "memcpy_base.S" + .cfi_endproc +END(memcpy) - cmp r2, #832 - bge check_alignment + .cfi_startproc +__memcpy_chk_fail: + // Preserve lr for backtrace. + push {lr} + .cfi_def_cfa_offset 4 + .cfi_rel_offset lr, 0 -copy_unknown_alignment: - // Unknown alignment of src and dst. - // Assumes that the first few bytes have already been prefetched. - - // Align destination to 128 bits. The mainloop store instructions - // require this alignment or they will throw an exception. - rsb r3, r0, #0 - ands r3, r3, #0xF - beq 2f - - // Copy up to 15 bytes (count in r3). - sub r2, r2, r3 - movs ip, r3, lsl #31 - - itt mi - ldrbmi lr, [r1], #1 - strbmi lr, [r0], #1 - itttt cs - ldrbcs ip, [r1], #1 - ldrbcs lr, [r1], #1 - strbcs ip, [r0], #1 - strbcs lr, [r0], #1 - - movs ip, r3, lsl #29 - bge 1f - // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after. - vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]! -1: bcc 2f - // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after. - vld1.8 {d0}, [r1]! - vst1.8 {d0}, [r0, :64]! - -2: // Make sure we have at least 64 bytes to copy. - subs r2, r2, #64 - blo 2f - -1: // The main loop copies 64 bytes at a time. - vld1.8 {d0 - d3}, [r1]! - vld1.8 {d4 - d7}, [r1]! - pld [r1, #(CACHE_LINE_SIZE*4)] - subs r2, r2, #64 - vst1.8 {d0 - d3}, [r0, :128]! - vst1.8 {d4 - d7}, [r0, :128]! - bhs 1b - -2: // Fix-up the remaining count and make sure we have >= 32 bytes left. - adds r2, r2, #32 - blo 3f - - // 32 bytes. 
These cache lines were already preloaded. - vld1.8 {d0 - d3}, [r1]! - sub r2, r2, #32 - vst1.8 {d0 - d3}, [r0, :128]! -3: // Less than 32 left. - add r2, r2, #32 - tst r2, #0x10 - beq copy_less_than_16_unknown_align - // Copies 16 bytes, destination 128 bits aligned. - vld1.8 {d0, d1}, [r1]! - vst1.8 {d0, d1}, [r0, :128]! - -copy_less_than_16_unknown_align: - // Copy up to 15 bytes (count in r2). - movs ip, r2, lsl #29 - bcc 1f - vld1.8 {d0}, [r1]! - vst1.8 {d0}, [r0]! -1: bge 2f - vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! - -2: // Copy 0 to 4 bytes. - lsls r2, r2, #31 - itt ne - ldrbne lr, [r1], #1 - strbne lr, [r0], #1 - itttt cs - ldrbcs ip, [r1], #1 - ldrbcs lr, [r1] - strbcs ip, [r0], #1 - strbcs lr, [r0] - - pop {r0, pc} - -check_alignment: - // If src and dst cannot both be aligned to a word boundary, - // use the unaligned copy version. - eor r3, r0, r1 - ands r3, r3, #0x3 - bne copy_unknown_alignment - - // To try and improve performance, stack layout changed, - // i.e., not keeping the stack looking like users expect - // (highest numbered register at highest address). - // TODO: Add debug frame directives. - // We don't need exception unwind directives, because the code below - // does not throw any exceptions and does not call any other functions. - // Generally, newlib functions like this lack debug information for - // assembler source. - .save {r4, r5} - strd r4, r5, [sp, #-8]! - .save {r6, r7} - strd r6, r7, [sp, #-8]! - .save {r8, r9} - strd r8, r9, [sp, #-8]! - - // Optimized for already aligned dst code. - ands ip, r0, #3 - bne dst_not_word_aligned - -word_aligned: - // Align the destination buffer to 8 bytes, to make sure double - // loads and stores don't cross a cache line boundary, - // as they are then more expensive even if the data is in the cache - // (require two load/store issue cycles instead of one). - // If only one of the buffers is not 8 bytes aligned, - // then it's more important to align dst than src, - // because there is more penalty for stores - // than loads that cross a cacheline boundary. - // This check and realignment are only done if there is >= 832 - // bytes to copy. - - // Dst is word aligned, but check if it is already double word aligned. - ands r3, r0, #4 - beq 1f - ldr r3, [r1], #4 - str r3, [r0], #4 - sub r2, #4 - -1: // Can only get here if > 64 bytes to copy, so don't do check r2. - sub r2, #64 - -2: // Every loop iteration copies 64 bytes. - .irp offset, #0, #8, #16, #24, #32 - ldrd r4, r5, [r1, \offset] - strd r4, r5, [r0, \offset] - .endr - - ldrd r4, r5, [r1, #40] - ldrd r6, r7, [r1, #48] - ldrd r8, r9, [r1, #56] - - // Keep the pld as far from the next load as possible. - // The amount to prefetch was determined experimentally using - // large sizes, and verifying the prefetch size does not affect - // the smaller copies too much. - // WARNING: If the ldrd and strd instructions get too far away - // from each other, performance suffers. Three loads - // in a row is the best tradeoff. - pld [r1, #(CACHE_LINE_SIZE*16)] - strd r4, r5, [r0, #40] - strd r6, r7, [r0, #48] - strd r8, r9, [r0, #56] - - add r0, r0, #64 - add r1, r1, #64 - subs r2, r2, #64 - bge 2b - - // Fix-up the remaining count and make sure we have >= 32 bytes left. - adds r2, r2, #32 - blo 4f - - // Copy 32 bytes. These cache lines were already preloaded. 
- .irp offset, #0, #8, #16, #24 - ldrd r4, r5, [r1, \offset] - strd r4, r5, [r0, \offset] - .endr - add r1, r1, #32 - add r0, r0, #32 - sub r2, r2, #32 -4: // Less than 32 left. - add r2, r2, #32 - tst r2, #0x10 - beq 5f - // Copy 16 bytes. - .irp offset, #0, #8 - ldrd r4, r5, [r1, \offset] - strd r4, r5, [r0, \offset] - .endr - add r1, r1, #16 - add r0, r0, #16 - -5: // Copy up to 15 bytes (count in r2). - movs ip, r2, lsl #29 - bcc 1f - // Copy 8 bytes. - ldrd r4, r5, [r1], #8 - strd r4, r5, [r0], #8 -1: bge 2f - // Copy 4 bytes. - ldr r4, [r1], #4 - str r4, [r0], #4 -2: // Copy 0 to 4 bytes. - lsls r2, r2, #31 - itt ne - ldrbne lr, [r1], #1 - strbne lr, [r0], #1 - itttt cs - ldrbcs ip, [r1], #1 - ldrbcs lr, [r1] - strbcs ip, [r0], #1 - strbcs lr, [r0] - - // Restore registers: optimized pop {r0, pc} - ldrd r8, r9, [sp], #8 - ldrd r6, r7, [sp], #8 - ldrd r4, r5, [sp], #8 - pop {r0, pc} - -dst_not_word_aligned: - // Align dst to word. - rsb ip, ip, #4 - cmp ip, #2 - - itt gt - ldrbgt lr, [r1], #1 - strbgt lr, [r0], #1 - - itt ge - ldrbge lr, [r1], #1 - strbge lr, [r0], #1 - - ldrb lr, [r1], #1 - strb lr, [r0], #1 - - sub r2, r2, ip - - // Src is guaranteed to be at least word aligned by this point. - b word_aligned - - - // Only reached when the __memcpy_chk check fails. -fortify_check_failed: ldr r0, error_message ldr r1, error_code 1: @@ -342,7 +99,7 @@ error_code: .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW error_message: .word error_string-(1b+8) -END(memcpy) + .cfi_endproc .data error_string: diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy_base.S b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S new file mode 100644 index 000000000..647e0653f --- /dev/null +++ b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S @@ -0,0 +1,303 @@ +/* + * Copyright (C) 2008 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 2013 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + // Assumes that n >= 0, and dst, src are valid pointers. + // For any sizes less than 832 use the neon code that doesn't + // care about the src alignment. This avoids any checks + // for src alignment, and offers the best improvement since + // smaller sized copies are dominated by the overhead of + // the pre and post main loop. + // For larger copies, if src and dst cannot both be aligned to + // word boundaries, use the neon code. + // For all other copies, align dst to a double word boundary + // and copy using LDRD/STRD instructions. + + cmp r2, #16 + blo .L_copy_less_than_16_unknown_align + + cmp r2, #832 + bge .L_check_alignment + +.L_copy_unknown_alignment: + // Unknown alignment of src and dst. + // Assumes that the first few bytes have already been prefetched. + + // Align destination to 128 bits. The mainloop store instructions + // require this alignment or they will throw an exception. + rsb r3, r0, #0 + ands r3, r3, #0xF + beq 2f + + // Copy up to 15 bytes (count in r3). + sub r2, r2, r3 + movs ip, r3, lsl #31 + + itt mi + ldrbmi lr, [r1], #1 + strbmi lr, [r0], #1 + itttt cs + ldrbcs ip, [r1], #1 + ldrbcs lr, [r1], #1 + strbcs ip, [r0], #1 + strbcs lr, [r0], #1 + + movs ip, r3, lsl #29 + bge 1f + // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after. + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]! +1: bcc 2f + // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after. + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0, :64]! + +2: // Make sure we have at least 64 bytes to copy. + subs r2, r2, #64 + blo 2f + +1: // The main loop copies 64 bytes at a time. + vld1.8 {d0 - d3}, [r1]! + vld1.8 {d4 - d7}, [r1]! + pld [r1, #(64*4)] + subs r2, r2, #64 + vst1.8 {d0 - d3}, [r0, :128]! + vst1.8 {d4 - d7}, [r0, :128]! + bhs 1b + +2: // Fix-up the remaining count and make sure we have >= 32 bytes left. + adds r2, r2, #32 + blo 3f + + // 32 bytes. These cache lines were already preloaded. + vld1.8 {d0 - d3}, [r1]! + sub r2, r2, #32 + vst1.8 {d0 - d3}, [r0, :128]! +3: // Less than 32 left. + add r2, r2, #32 + tst r2, #0x10 + beq .L_copy_less_than_16_unknown_align + // Copies 16 bytes, destination 128 bits aligned. + vld1.8 {d0, d1}, [r1]! + vst1.8 {d0, d1}, [r0, :128]! + +.L_copy_less_than_16_unknown_align: + // Copy up to 15 bytes (count in r2). + movs ip, r2, lsl #29 + bcc 1f + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! 
+1: bge 2f + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! + +2: // Copy 0 to 4 bytes. + lsls r2, r2, #31 + itt ne + ldrbne lr, [r1], #1 + strbne lr, [r0], #1 + itttt cs + ldrbcs ip, [r1], #1 + ldrbcs lr, [r1] + strbcs ip, [r0], #1 + strbcs lr, [r0] + + pop {r0, pc} + +.L_check_alignment: + // If src and dst cannot both be aligned to a word boundary, + // use the unaligned copy version. + eor r3, r0, r1 + ands r3, r3, #0x3 + bne .L_copy_unknown_alignment + + // To try and improve performance, stack layout changed, + // i.e., not keeping the stack looking like users expect + // (highest numbered register at highest address). + // TODO: Add debug frame directives. + // We don't need exception unwind directives, because the code below + // does not throw any exceptions and does not call any other functions. + // Generally, newlib functions like this lack debug information for + // assembler source. + .save {r4, r5} + strd r4, r5, [sp, #-8]! + .save {r6, r7} + strd r6, r7, [sp, #-8]! + .save {r8, r9} + strd r8, r9, [sp, #-8]! + + // Optimized for already aligned dst code. + ands ip, r0, #3 + bne .L_dst_not_word_aligned + +.L_word_aligned: + // Align the destination buffer to 8 bytes, to make sure double + // loads and stores don't cross a cache line boundary, + // as they are then more expensive even if the data is in the cache + // (require two load/store issue cycles instead of one). + // If only one of the buffers is not 8 bytes aligned, + // then it's more important to align dst than src, + // because there is more penalty for stores + // than loads that cross a cacheline boundary. + // This check and realignment are only done if there is >= 832 + // bytes to copy. + + // Dst is word aligned, but check if it is already double word aligned. + ands r3, r0, #4 + beq 1f + ldr r3, [r1], #4 + str r3, [r0], #4 + sub r2, #4 + +1: // Can only get here if > 64 bytes to copy, so don't do check r2. + sub r2, #64 + +2: // Every loop iteration copies 64 bytes. + .irp offset, #0, #8, #16, #24, #32 + ldrd r4, r5, [r1, \offset] + strd r4, r5, [r0, \offset] + .endr + + ldrd r4, r5, [r1, #40] + ldrd r6, r7, [r1, #48] + ldrd r8, r9, [r1, #56] + + // Keep the pld as far from the next load as possible. + // The amount to prefetch was determined experimentally using + // large sizes, and verifying the prefetch size does not affect + // the smaller copies too much. + // WARNING: If the ldrd and strd instructions get too far away + // from each other, performance suffers. Three loads + // in a row is the best tradeoff. + pld [r1, #(64*16)] + strd r4, r5, [r0, #40] + strd r6, r7, [r0, #48] + strd r8, r9, [r0, #56] + + add r0, r0, #64 + add r1, r1, #64 + subs r2, r2, #64 + bge 2b + + // Fix-up the remaining count and make sure we have >= 32 bytes left. + adds r2, r2, #32 + blo 4f + + // Copy 32 bytes. These cache lines were already preloaded. + .irp offset, #0, #8, #16, #24 + ldrd r4, r5, [r1, \offset] + strd r4, r5, [r0, \offset] + .endr + add r1, r1, #32 + add r0, r0, #32 + sub r2, r2, #32 +4: // Less than 32 left. + add r2, r2, #32 + tst r2, #0x10 + beq 5f + // Copy 16 bytes. + .irp offset, #0, #8 + ldrd r4, r5, [r1, \offset] + strd r4, r5, [r0, \offset] + .endr + add r1, r1, #16 + add r0, r0, #16 + +5: // Copy up to 15 bytes (count in r2). + movs ip, r2, lsl #29 + bcc 1f + // Copy 8 bytes. + ldrd r4, r5, [r1], #8 + strd r4, r5, [r0], #8 +1: bge 2f + // Copy 4 bytes. + ldr r4, [r1], #4 + str r4, [r0], #4 +2: // Copy 0 to 4 bytes. 
+ lsls r2, r2, #31 + itt ne + ldrbne lr, [r1], #1 + strbne lr, [r0], #1 + itttt cs + ldrbcs ip, [r1], #1 + ldrbcs lr, [r1] + strbcs ip, [r0], #1 + strbcs lr, [r0] + + // Restore registers: optimized pop {r0, pc} + ldrd r8, r9, [sp], #8 + ldrd r6, r7, [sp], #8 + ldrd r4, r5, [sp], #8 + pop {r0, pc} + +.L_dst_not_word_aligned: + // Align dst to word. + rsb ip, ip, #4 + cmp ip, #2 + + itt gt + ldrbgt lr, [r1], #1 + strbgt lr, [r0], #1 + + itt ge + ldrbge lr, [r1], #1 + strbge lr, [r0], #1 + + ldrb lr, [r1], #1 + strb lr, [r0], #1 + + sub r2, r2, ip + + // Src is guaranteed to be at least word aligned by this point. + b .L_word_aligned diff --git a/libc/arch-arm/cortex-a15/bionic/memset.S b/libc/arch-arm/cortex-a15/bionic/memset.S index 6c143ad81..5593be689 100644 --- a/libc/arch-arm/cortex-a15/bionic/memset.S +++ b/libc/arch-arm/cortex-a15/bionic/memset.S @@ -40,8 +40,14 @@ .syntax unified ENTRY(__memset_chk) + .cfi_startproc cmp r2, r3 - bls done + bls .L_done + + // Preserve lr for backtrace. + push {lr} + .cfi_def_cfa_offset 4 + .cfi_rel_offset lr, 0 ldr r0, error_message ldr r1, error_code @@ -53,24 +59,28 @@ error_code: error_message: .word error_string-(1b+8) + .cfi_endproc END(__memset_chk) ENTRY(bzero) + .cfi_startproc mov r2, r1 mov r1, #0 - -done: +.L_done: // Fall through to memset... + .cfi_endproc END(bzero) ENTRY(memset) - .save {r0} + .cfi_startproc stmfd sp!, {r0} + .cfi_def_cfa_offset 4 + .cfi_rel_offset r0, 0 // The new algorithm is slower for copies < 16 so use the old // neon code in that case. cmp r2, #16 - blo set_less_than_16_unknown_align + blo .L_set_less_than_16_unknown_align // Use strd which requires an even and odd register so move the // values so that: @@ -84,17 +94,17 @@ ENTRY(memset) orr r1, r1, r1, lsr #8 orr r1, r1, r1, lsr #16 -check_alignment: +.L_check_alignment: // Align destination to a double word to avoid the strd crossing // a cache line boundary. ands ip, r3, #7 - bne do_double_word_align + bne .L_do_double_word_align -double_word_aligned: +.L_double_word_aligned: mov r0, r1 subs r2, #64 - blo set_less_than_64 + blo .L_set_less_than_64 1: // Main loop sets 64 bytes at a time. .irp offset, #0, #8, #16, #24, #32, #40, #48, #56 @@ -105,39 +115,39 @@ double_word_aligned: subs r2, #64 bge 1b -set_less_than_64: +.L_set_less_than_64: // Restore r2 to the count of bytes left to set. add r2, #64 lsls ip, r2, #27 - bcc set_less_than_32 + bcc .L_set_less_than_32 // Set 32 bytes. .irp offset, #0, #8, #16, #24 strd r0, r1, [r3, \offset] .endr add r3, #32 -set_less_than_32: - bpl set_less_than_16 +.L_set_less_than_32: + bpl .L_set_less_than_16 // Set 16 bytes. .irp offset, #0, #8 strd r0, r1, [r3, \offset] .endr add r3, #16 -set_less_than_16: +.L_set_less_than_16: // Less than 16 bytes to set. lsls ip, r2, #29 - bcc set_less_than_8 + bcc .L_set_less_than_8 // Set 8 bytes. strd r0, r1, [r3], #8 -set_less_than_8: - bpl set_less_than_4 +.L_set_less_than_8: + bpl .L_set_less_than_4 // Set 4 bytes str r1, [r3], #4 -set_less_than_4: +.L_set_less_than_4: lsls ip, r2, #31 it ne strbne r1, [r3], #1 @@ -148,7 +158,7 @@ set_less_than_4: ldmfd sp!, {r0} bx lr -do_double_word_align: +.L_do_double_word_align: rsb ip, ip, #8 sub r2, r2, ip movs r0, ip, lsl #31 @@ -160,11 +170,11 @@ do_double_word_align: // Dst is at least word aligned by this point. cmp ip, #4 - blo double_word_aligned + blo .L_double_word_aligned str r1, [r3], #4 - b double_word_aligned + b .L_double_word_aligned -set_less_than_16_unknown_align: +.L_set_less_than_16_unknown_align: // Set up to 15 bytes. 
vdup.8 d0, r1 movs ip, r2, lsl #29 @@ -180,6 +190,7 @@ set_less_than_16_unknown_align: strbcs r1, [r0], #1 ldmfd sp!, {r0} bx lr + .cfi_endproc END(memset) .data diff --git a/libc/arch-arm/cortex-a15/cortex-a15.mk b/libc/arch-arm/cortex-a15/cortex-a15.mk index 281e424ba..c62e7e72a 100644 --- a/libc/arch-arm/cortex-a15/cortex-a15.mk +++ b/libc/arch-arm/cortex-a15/cortex-a15.mk @@ -4,5 +4,7 @@ $(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a15/bionic/strcat.S) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a15/bionic/strcmp.S) $(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a15/bionic/strcpy.S) $(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a15/bionic/strlen.S) +$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/cortex-a15/bionic/__strcat_chk.S) +$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/cortex-a15/bionic/__strcpy_chk.S) include bionic/libc/arch-arm/generic/generic.mk diff --git a/libc/arch-arm/cortex-a9/bionic/__strcat_chk.S b/libc/arch-arm/cortex-a9/bionic/__strcat_chk.S new file mode 100644 index 000000000..3f866361d --- /dev/null +++ b/libc/arch-arm/cortex-a9/bionic/__strcat_chk.S @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include "libc_events.h" + + .syntax unified + .fpu neon + .thumb + .thumb_func + +// Get the length of src string, then get the source of the dst string. +// Check that the two lengths together don't exceed the threshold, then +// do a memcpy of the data. +ENTRY(__strcat_chk) + .cfi_startproc + pld [r0, #0] + push {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + push {r4, r5} + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r4, 0 + .cfi_rel_offset r5, 4 + + mov lr, r2 + + // Save the dst register to r5 + mov r5, r0 + + // Zero out r4 + eor r4, r4, r4 + + // r1 contains the address of the string to count. 
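
In C terms, the flow described by the comments above amounts to: measure the source string, measure the destination string, fail if the concatenation would not fit in the destination buffer, and otherwise append with a memcpy. A sketch with an invented name of what the assembly below implements:

    #include <stdlib.h>
    #include <string.h>

    // Illustrative only: in the assembly, lr holds dst_buf_size, r4 ends up as
    // strlen(src) + 1, and r3 as strlen(dst).
    char* strcat_chk_sketch(char* dst, const char* src, size_t dst_buf_size) {
        size_t src_len = strlen(src);
        size_t dst_len = strlen(dst);
        if (dst_len + src_len + 1 > dst_buf_size) {
            abort();                              // assembly: __fortify_chk_fail
        }
        memcpy(dst + dst_len, src, src_len + 1);  // append, including the terminator
        return dst;
    }
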
+.L_strlen_start: + mov r0, r1 + + ands r3, r0, #7 + bne .L_align_src + + .p2align 2 +.L_mainloop: + ldmia r1!, {r2, r3} + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne .L_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + b .L_mainloop + +.L_zero_in_first_register: + sub r3, r1, r0 + // Check for zero in byte 0. + lsls r2, ip, #17 + beq .L_check_byte1_reg1 + + sub r3, r3, #8 + b .L_finish + +.L_check_byte1_reg1: + bcc .L_check_byte2_reg1 + + sub r3, r3, #7 + b .L_finish + +.L_check_byte2_reg1: + // Check for zero in byte 2. + tst ip, #0x800000 + it ne + subne r3, r3, #6 + bne .L_finish + sub r3, r3, #5 + b .L_finish + +.L_zero_in_second_register: + sub r3, r1, r0 + // Check for zero in byte 0. + lsls r2, ip, #17 + beq .L_check_byte1_reg2 + + sub r3, r3, #4 + b .L_finish + +.L_check_byte1_reg2: + bcc .L_check_byte2_reg2 + + sub r3, r3, #3 + b .L_finish + +.L_check_byte2_reg2: + // Check for zero in byte 2. + tst ip, #0x800000 + it ne + subne r3, r3, #2 + bne .L_finish + sub r3, r3, #1 + b .L_finish + +.L_align_src: + // Align to a double word (64 bits). + rsb r3, r3, #8 + lsls ip, r3, #31 + beq .L_align_to_32 + + ldrb r2, [r1], #1 + cbz r2, .L_done + +.L_align_to_32: + bcc .L_align_to_64 + + ldrb r2, [r1], #1 + cbz r2, .L_done + ldrb r2, [r1], #1 + cbz r2, .L_done + +.L_align_to_64: + tst r3, #4 + beq .L_mainloop + ldr r2, [r1], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + b .L_mainloop + +.L_done: + sub r3, r1, r0 + sub r3, r3, #1 + +.L_finish: + cmp r4, #0 + bne .L_strlen_done + + // Time to get the dst string length. + mov r1, r5 + + // Save the original source address to r5. + mov r5, r0 + + // Save the current length (adding 1 for the terminator). + add r4, r3, #1 + b .L_strlen_start + + // r0 holds the pointer to the dst string. + // r3 holds the dst string length. + // r4 holds the src string length + 1. +.L_strlen_done: + add r2, r3, r4 + cmp r2, lr + bgt .L_fortify_check_failed + + // Set up the registers for the memcpy code. + mov r1, r5 + pld [r1, #64] + mov r2, r4 + add r0, r0, r3 + pop {r4, r5} + .cfi_adjust_cfa_offset -8 + .cfi_restore r4 + .cfi_restore r5 + + #include "memcpy_base.S" + +.L_fortify_check_failed: + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r4, 0 + .cfi_rel_offset r5, 4 + + ldr r0, error_message + ldr r1, error_code +1: + add r0, pc + bl __fortify_chk_fail +error_code: + .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW +error_message: + .word error_string-(1b+4) + + .cfi_endproc +END(__strcat_chk) + + .data +error_string: + .string "strcat buffer overflow" diff --git a/libc/arch-arm/cortex-a9/bionic/__strcpy_chk.S b/libc/arch-arm/cortex-a9/bionic/__strcpy_chk.S new file mode 100644 index 000000000..787b0578a --- /dev/null +++ b/libc/arch-arm/cortex-a9/bionic/__strcpy_chk.S @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include "libc_events.h" + + .syntax unified + .fpu neon + .thumb + .thumb_func + +// Get the length of the source string first, then do a memcpy of the data +// instead of a strcpy. +ENTRY(__strcpy_chk) + .cfi_startproc + pld [r0, #0] + push {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + + mov lr, r2 + mov r0, r1 + + ands r3, r0, #7 + bne .L_align_src + + .p2align 2 +.L_mainloop: + ldmia r0!, {r2, r3} + + pld [r0, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne .L_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + b .L_mainloop + +.L_zero_in_first_register: + sub r3, r0, r1 + // Check for zero in byte 0. + lsls r2, ip, #17 + beq .L_check_byte1_reg1 + + sub r3, r3, #8 + b .L_check_size + +.L_check_byte1_reg1: + bcc .L_check_byte2_reg1 + + sub r3, r3, #7 + b .L_check_size + +.L_check_byte2_reg1: + // Check for zero in byte 2. + tst ip, #0x800000 + it ne + subne r3, r3, #6 + bne .L_check_size + sub r3, r3, #5 + b .L_check_size + +.L_zero_in_second_register: + sub r3, r0, r1 + // Check for zero in byte 0. + lsls r2, ip, #17 + beq .L_check_byte1_reg2 + + sub r3, r3, #4 + b .L_check_size + +.L_check_byte1_reg2: + bcc .L_check_byte2_reg2 + + sub r3, r3, #3 + b .L_check_size + +.L_check_byte2_reg2: + // Check for zero in byte 2. + tst ip, #0x800000 + it ne + subne r3, r3, #2 + bne .L_check_size + sub r3, r3, #1 + b .L_check_size + +.L_align_src: + // Align to a double word (64 bits). + rsb r3, r3, #8 + lsls ip, r3, #31 + beq .L_align_to_32 + + ldrb r2, [r0], #1 + cbz r2, .L_done + +.L_align_to_32: + bcc .L_align_to_64 + + ldrb r2, [r0], #1 + cbz r2, .L_done + ldrb r2, [r0], #1 + cbz r2, .L_done + +.L_align_to_64: + tst r3, #4 + beq .L_mainloop + ldr r2, [r0], #4 + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + b .L_mainloop + +.L_done: + sub r3, r0, r1 + sub r3, r3, #1 + +.L_check_size: + pld [r1, #0] + pld [r1, #64] + ldr r0, [sp] + cmp r3, lr + bge .L_fortify_check_failed + + // Add 1 for copy length to get the string terminator. 
+ add r2, r3, #1 + + #include "memcpy_base.S" + +.L_fortify_check_failed: + ldr r0, error_message + ldr r1, error_code +1: + add r0, pc + bl __fortify_chk_fail +error_code: + .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW +error_message: + .word error_string-(1b+4) + + .cfi_endproc +END(__strcpy_chk) + + .data +error_string: + .string "strcpy buffer overflow" diff --git a/libc/arch-arm/cortex-a9/bionic/memcpy.S b/libc/arch-arm/cortex-a9/bionic/memcpy.S index 4e624d43a..e7beb25b3 100644 --- a/libc/arch-arm/cortex-a9/bionic/memcpy.S +++ b/libc/arch-arm/cortex-a9/bionic/memcpy.S @@ -26,7 +26,6 @@ * SUCH DAMAGE. */ -#include #include #include "libc_events.h" @@ -36,190 +35,40 @@ * cache line. */ - .text + .syntax unified .fpu neon - -#define CACHE_LINE_SIZE 32 + .thumb + .thumb_func ENTRY(__memcpy_chk) + .cfi_startproc cmp r2, r3 - bgt fortify_check_failed + bgt __memcpy_chk_fail // Fall through to memcpy... + .cfi_endproc END(__memcpy_chk) ENTRY(memcpy) - .save {r0, lr} - /* start preloading as early as possible */ - pld [r1, #(CACHE_LINE_SIZE * 0)] - stmfd sp!, {r0, lr} - pld [r1, #(CACHE_LINE_SIZE * 2)] + .cfi_startproc + pld [r1, #0] + stmfd sp!, {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + pld [r1, #64] - // Check so divider is at least 16 bytes, needed for alignment code. - cmp r2, #16 - blo 5f + #include "memcpy_base.S" + .cfi_endproc +END(memcpy) + .cfi_startproc +__memcpy_chk_fail: + // Preserve lr for backtrace. + push {lr} + .cfi_def_cfa_offset 4 + .cfi_rel_offset lr, 0 - /* check if buffers are aligned. If so, run arm-only version */ - eor r3, r0, r1 - ands r3, r3, #0x3 - beq 11f - - /* Check the upper size limit for Neon unaligned memory access in memcpy */ - cmp r2, #224 - blo 3f - - /* align destination to 16 bytes for the write-buffer */ - rsb r3, r0, #0 - ands r3, r3, #0xF - beq 3f - - /* copy up to 15-bytes (count in r3) */ - sub r2, r2, r3 - movs ip, r3, lsl #31 - ldrmib lr, [r1], #1 - strmib lr, [r0], #1 - ldrcsb ip, [r1], #1 - ldrcsb lr, [r1], #1 - strcsb ip, [r0], #1 - strcsb lr, [r0], #1 - movs ip, r3, lsl #29 - bge 1f - // copies 4 bytes, destination 32-bits aligned - vld1.32 {d0[0]}, [r1]! - vst1.32 {d0[0]}, [r0, :32]! -1: bcc 2f - // copies 8 bytes, destination 64-bits aligned - vld1.8 {d0}, [r1]! - vst1.8 {d0}, [r0, :64]! -2: - /* preload immediately the next cache line, which we may need */ - pld [r1, #(CACHE_LINE_SIZE * 0)] - pld [r1, #(CACHE_LINE_SIZE * 2)] -3: - /* make sure we have at least 64 bytes to copy */ - subs r2, r2, #64 - blo 2f - - /* preload all the cache lines we need */ - pld [r1, #(CACHE_LINE_SIZE * 4)] - pld [r1, #(CACHE_LINE_SIZE * 6)] - -1: /* The main loop copies 64 bytes at a time */ - vld1.8 {d0 - d3}, [r1]! - vld1.8 {d4 - d7}, [r1]! - pld [r1, #(CACHE_LINE_SIZE * 6)] - subs r2, r2, #64 - vst1.8 {d0 - d3}, [r0]! - vst1.8 {d4 - d7}, [r0]! - bhs 1b - -2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ - add r2, r2, #64 - subs r2, r2, #32 - blo 4f - -3: /* 32 bytes at a time. These cache lines were already preloaded */ - vld1.8 {d0 - d3}, [r1]! - subs r2, r2, #32 - vst1.8 {d0 - d3}, [r0]! - bhs 3b - -4: /* less than 32 left */ - add r2, r2, #32 - tst r2, #0x10 - beq 5f - // copies 16 bytes, 128-bits aligned - vld1.8 {d0, d1}, [r1]! - vst1.8 {d0, d1}, [r0]! -5: /* copy up to 15-bytes (count in r2) */ - movs ip, r2, lsl #29 - bcc 1f - vld1.8 {d0}, [r1]! - vst1.8 {d0}, [r0]! -1: bge 2f - vld1.32 {d0[0]}, [r1]! - vst1.32 {d0[0]}, [r0]! 
-2: movs ip, r2, lsl #31 - ldrmib r3, [r1], #1 - ldrcsb ip, [r1], #1 - ldrcsb lr, [r1], #1 - strmib r3, [r0], #1 - strcsb ip, [r0], #1 - strcsb lr, [r0], #1 - - ldmfd sp!, {r0, lr} - bx lr -11: - /* Simple arm-only copy loop to handle aligned copy operations */ - stmfd sp!, {r4, r5, r6, r7, r8} - pld [r1, #(CACHE_LINE_SIZE * 4)] - - /* Check alignment */ - rsb r3, r1, #0 - ands r3, #3 - beq 2f - - /* align source to 32 bits. We need to insert 2 instructions between - * a ldr[b|h] and str[b|h] because byte and half-word instructions - * stall 2 cycles. - */ - movs r12, r3, lsl #31 - sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */ - ldrmib r3, [r1], #1 - ldrcsb r4, [r1], #1 - ldrcsb r5, [r1], #1 - strmib r3, [r0], #1 - strcsb r4, [r0], #1 - strcsb r5, [r0], #1 - -2: - subs r2, r2, #64 - blt 4f - -3: /* Main copy loop, copying 64 bytes at a time */ - pld [r1, #(CACHE_LINE_SIZE * 8)] - ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr} - stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr} - ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr} - stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr} - subs r2, r2, #64 - bge 3b - -4: /* Check if there are > 32 bytes left */ - adds r2, r2, #64 - subs r2, r2, #32 - blt 5f - - /* Copy 32 bytes */ - ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr} - stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr} - subs r2, #32 - -5: /* Handle any remaining bytes */ - adds r2, #32 - beq 6f - - movs r12, r2, lsl #28 - ldmcsia r1!, {r3, r4, r5, r6} /* 16 bytes */ - ldmmiia r1!, {r7, r8} /* 8 bytes */ - stmcsia r0!, {r3, r4, r5, r6} - stmmiia r0!, {r7, r8} - movs r12, r2, lsl #30 - ldrcs r3, [r1], #4 /* 4 bytes */ - ldrmih r4, [r1], #2 /* 2 bytes */ - strcs r3, [r0], #4 - strmih r4, [r0], #2 - tst r2, #0x1 - ldrneb r3, [r1] /* last byte */ - strneb r3, [r0] -6: - ldmfd sp!, {r4, r5, r6, r7, r8} - ldmfd sp!, {r0, pc} - - - // Only reached when the __memcpy_chk check fails. -fortify_check_failed: ldr r0, error_message ldr r1, error_code 1: @@ -228,8 +77,8 @@ fortify_check_failed: error_code: .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW error_message: - .word error_string-(1b+8) -END(memcpy) + .word error_string-(1b+4) + .cfi_endproc .data error_string: diff --git a/libc/arch-arm/cortex-a9/bionic/memcpy_base.S b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S new file mode 100644 index 000000000..46b5a9361 --- /dev/null +++ b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2008 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This code assumes it is running on a processor that supports all arm v7 + * instructions, that supports neon instructions, and that has a 32 byte + * cache line. + */ + + // Check so divider is at least 16 bytes, needed for alignment code. + cmp r2, #16 + blo 5f + + + /* check if buffers are aligned. If so, run arm-only version */ + eor r3, r0, r1 + ands r3, r3, #0x3 + beq 11f + + /* Check the upper size limit for Neon unaligned memory access in memcpy */ + cmp r2, #224 + blo 3f + + /* align destination to 16 bytes for the write-buffer */ + rsb r3, r0, #0 + ands r3, r3, #0xF + beq 3f + + /* copy up to 15-bytes (count in r3) */ + sub r2, r2, r3 + movs ip, r3, lsl #31 + itt mi + ldrbmi lr, [r1], #1 + strbmi lr, [r0], #1 + itttt cs + ldrbcs ip, [r1], #1 + ldrbcs lr, [r1], #1 + strbcs ip, [r0], #1 + strbcs lr, [r0], #1 + movs ip, r3, lsl #29 + bge 1f + // copies 4 bytes, destination 32-bits aligned + vld1.32 {d0[0]}, [r1]! + vst1.32 {d0[0]}, [r0, :32]! +1: bcc 2f + // copies 8 bytes, destination 64-bits aligned + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0, :64]! +2: + /* preload immediately the next cache line, which we may need */ + pld [r1, #0] + pld [r1, #(32 * 2)] +3: + /* make sure we have at least 64 bytes to copy */ + subs r2, r2, #64 + blo 2f + + /* preload all the cache lines we need */ + pld [r1, #(32 * 4)] + pld [r1, #(32 * 6)] + +1: /* The main loop copies 64 bytes at a time */ + vld1.8 {d0 - d3}, [r1]! + vld1.8 {d4 - d7}, [r1]! + pld [r1, #(32 * 6)] + subs r2, r2, #64 + vst1.8 {d0 - d3}, [r0]! + vst1.8 {d4 - d7}, [r0]! + bhs 1b + +2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ + add r2, r2, #64 + subs r2, r2, #32 + blo 4f + +3: /* 32 bytes at a time. These cache lines were already preloaded */ + vld1.8 {d0 - d3}, [r1]! + subs r2, r2, #32 + vst1.8 {d0 - d3}, [r0]! + bhs 3b + +4: /* less than 32 left */ + add r2, r2, #32 + tst r2, #0x10 + beq 5f + // copies 16 bytes, 128-bits aligned + vld1.8 {d0, d1}, [r1]! + vst1.8 {d0, d1}, [r0]! +5: /* copy up to 15-bytes (count in r2) */ + movs ip, r2, lsl #29 + bcc 1f + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! +1: bge 2f + vld1.32 {d0[0]}, [r1]! + vst1.32 {d0[0]}, [r0]! +2: movs ip, r2, lsl #31 + itt mi + ldrbmi r3, [r1], #1 + strbmi r3, [r0], #1 + itttt cs + ldrbcs ip, [r1], #1 + ldrbcs lr, [r1], #1 + strbcs ip, [r0], #1 + strbcs lr, [r0], #1 + + ldmfd sp!, {r0, lr} + bx lr +11: + /* Simple arm-only copy loop to handle aligned copy operations */ + stmfd sp!, {r4, r5, r6, r7, r8} + pld [r1, #(32 * 4)] + + /* Check alignment */ + rsb r3, r1, #0 + ands r3, #3 + beq 2f + + /* align source to 32 bits. We need to insert 2 instructions between + * a ldr[b|h] and str[b|h] because byte and half-word instructions + * stall 2 cycles. 
+ */ + movs r12, r3, lsl #31 + sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */ + itt mi + ldrbmi r3, [r1], #1 + strbmi r3, [r0], #1 + itttt cs + ldrbcs r4, [r1], #1 + ldrbcs r5, [r1], #1 + strbcs r4, [r0], #1 + strbcs r5, [r0], #1 + +2: + subs r2, r2, #64 + blt 4f + +3: /* Main copy loop, copying 64 bytes at a time */ + pld [r1, #(32 * 8)] + ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr} + stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr} + ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr} + stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr} + subs r2, r2, #64 + bge 3b + +4: /* Check if there are > 32 bytes left */ + adds r2, r2, #64 + subs r2, r2, #32 + blt 5f + + /* Copy 32 bytes */ + ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr} + stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr} + subs r2, #32 + +5: /* Handle any remaining bytes */ + adds r2, #32 + beq 6f + + movs r12, r2, lsl #28 + itt cs + ldmiacs r1!, {r3, r4, r5, r6} /* 16 bytes */ + stmiacs r0!, {r3, r4, r5, r6} + itt mi + ldmiami r1!, {r7, r8} /* 8 bytes */ + stmiami r0!, {r7, r8} + movs r12, r2, lsl #30 + itt cs + ldrcs r3, [r1], #4 /* 4 bytes */ + strcs r3, [r0], #4 + itt mi + ldrhmi r4, [r1], #2 /* 2 bytes */ + strhmi r4, [r0], #2 + tst r2, #0x1 + itt ne + ldrbne r3, [r1] /* last byte */ + strbne r3, [r0] +6: + ldmfd sp!, {r4, r5, r6, r7, r8} + ldmfd sp!, {r0, pc} diff --git a/libc/arch-arm/cortex-a9/bionic/memset.S b/libc/arch-arm/cortex-a9/bionic/memset.S index d01143039..bc25a3e2e 100644 --- a/libc/arch-arm/cortex-a9/bionic/memset.S +++ b/libc/arch-arm/cortex-a9/bionic/memset.S @@ -38,8 +38,14 @@ .fpu neon ENTRY(__memset_chk) + .cfi_startproc cmp r2, r3 - bls done + bls .L_done + + // Preserve lr for backtrace. + push {lr} + .cfi_def_cfa_offset 4 + .cfi_rel_offset lr, 0 ldr r0, error_message ldr r1, error_code @@ -51,24 +57,29 @@ error_code: error_message: .word error_string-(1b+8) + .cfi_endproc END(__memset_chk) ENTRY(bzero) + .cfi_startproc mov r2, r1 mov r1, #0 -done: +.L_done: // Fall through to memset... + .cfi_endproc END(bzero) /* memset() returns its first argument. */ ENTRY(memset) + .cfi_startproc # The neon memset only wins for less than 132. 
cmp r2, #132 bhi 11f - .save {r0} stmfd sp!, {r0} + .cfi_def_cfa_offset 4 + .cfi_rel_offset r0, 0 vdup.8 q0, r1 @@ -106,8 +117,15 @@ ENTRY(memset) * offset = (4-(src&3))&3 = -src & 3 */ - .save {r0, r4-r7, lr} stmfd sp!, {r0, r4-r7, lr} + .cfi_def_cfa_offset 24 + .cfi_rel_offset r0, 0 + .cfi_rel_offset r4, 4 + .cfi_rel_offset r5, 8 + .cfi_rel_offset r6, 12 + .cfi_rel_offset r7, 16 + .cfi_rel_offset lr, 20 + rsb r3, r0, #0 ands r3, r3, #3 cmp r3, r2 @@ -169,6 +187,7 @@ ENTRY(memset) strcsb r1, [r0] ldmfd sp!, {r0, r4-r7, lr} bx lr + .cfi_endproc END(memset) .data diff --git a/libc/arch-arm/cortex-a9/cortex-a9.mk b/libc/arch-arm/cortex-a9/cortex-a9.mk index 61a52c2ac..eee1b3646 100644 --- a/libc/arch-arm/cortex-a9/cortex-a9.mk +++ b/libc/arch-arm/cortex-a9/cortex-a9.mk @@ -4,5 +4,7 @@ $(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a9/bionic/strcat.S) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a9/bionic/strcmp.S) $(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a9/bionic/strcpy.S) $(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a9/bionic/strlen.S) +$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/cortex-a9/bionic/__strcat_chk.S) +$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/cortex-a9/bionic/__strcpy_chk.S) include bionic/libc/arch-arm/generic/generic.mk diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk index c3a5aa53b..e2300039f 100644 --- a/libc/arch-arm/generic/generic.mk +++ b/libc/arch-arm/generic/generic.mk @@ -4,3 +4,5 @@ $(call libc-add-cpu-variant-src,STRCAT,string/strcat.c) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/generic/bionic/strcmp.S) $(call libc-add-cpu-variant-src,STRCPY,arch-arm/generic/bionic/strcpy.S) $(call libc-add-cpu-variant-src,STRLEN,arch-arm/generic/bionic/strlen.c) +$(call libc-add-cpu-variant-src,__STRCAT_CHK,bionic/__strcat_chk.cpp) +$(call libc-add-cpu-variant-src,__STRCPY_CHK,bionic/__strcpy_chk.cpp) diff --git a/libc/arch-arm/krait/bionic/__strcat_chk.S b/libc/arch-arm/krait/bionic/__strcat_chk.S new file mode 100644 index 000000000..4516d3005 --- /dev/null +++ b/libc/arch-arm/krait/bionic/__strcat_chk.S @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include "libc_events.h" + + .syntax unified + + .thumb + .thumb_func + +// Get the length of src string, then get the source of the dst string. +// Check that the two lengths together don't exceed the threshold, then +// do a memcpy of the data. +ENTRY(__strcat_chk) + .cfi_startproc + pld [r0, #0] + push {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + push {r4, r5} + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r4, 0 + .cfi_rel_offset r5, 4 + + mov lr, r2 + + // Save the dst register to r5 + mov r5, r0 + + // Zero out r4 + eor r4, r4, r4 + + // r1 contains the address of the string to count. +.L_strlen_start: + mov r0, r1 + ands r3, r1, #7 + beq .L_mainloop + + // Align to a double word (64 bits). + rsb r3, r3, #8 + lsls ip, r3, #31 + beq .L_align_to_32 + + ldrb r2, [r1], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_32: + bcc .L_align_to_64 + ands ip, r3, #2 + beq .L_align_to_64 + + ldrb r2, [r1], #1 + cbz r2, .L_update_count_and_finish + ldrb r2, [r1], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_64: + tst r3, #4 + beq .L_mainloop + ldr r3, [r1], #4 + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + + .p2align 2 +.L_mainloop: + ldrd r2, r3, [r1], #8 + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne .L_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + b .L_mainloop + +.L_update_count_and_finish: + sub r3, r1, r0 + sub r3, r3, #1 + b .L_finish + +.L_zero_in_first_register: + sub r3, r1, r0 + lsls r2, ip, #17 + bne .L_sub8_and_finish + bcs .L_sub7_and_finish + lsls ip, ip, #1 + bne .L_sub6_and_finish + + sub r3, r3, #5 + b .L_finish + +.L_sub8_and_finish: + sub r3, r3, #8 + b .L_finish + +.L_sub7_and_finish: + sub r3, r3, #7 + b .L_finish + +.L_sub6_and_finish: + sub r3, r3, #6 + b .L_finish + +.L_zero_in_second_register: + sub r3, r1, r0 + lsls r2, ip, #17 + bne .L_sub4_and_finish + bcs .L_sub3_and_finish + lsls ip, ip, #1 + bne .L_sub2_and_finish + + sub r3, r3, #1 + b .L_finish + +.L_sub4_and_finish: + sub r3, r3, #4 + b .L_finish + +.L_sub3_and_finish: + sub r3, r3, #3 + b .L_finish + +.L_sub2_and_finish: + sub r3, r3, #2 + +.L_finish: + cmp r4, #0 + bne .L_strlen_done + + // Time to get the dst string length. + mov r1, r5 + + // Save the original source address to r5. + mov r5, r0 + + // Save the current length (adding 1 for the terminator). + add r4, r3, #1 + b .L_strlen_start + + // r0 holds the pointer to the dst string. + // r3 holds the dst string length. + // r4 holds the src string length + 1. +.L_strlen_done: + add r2, r3, r4 + cmp r2, lr + bgt .L_fortify_check_failed + + // Set up the registers for the memcpy code. 
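
The strlen loops in these routines scan eight bytes per iteration and detect the terminating NUL with a carry/borrow bit trick (sub ... #0x01010101; bic ...; ands ... #0x80808080). The test is exact: the expression is nonzero precisely when some byte of the word is zero, and the surviving high bits mark which bytes were zero, which is what the .L_zero_in_*_register paths decode to recover the exact length. One word of that test in C, for illustration:

    #include <stdint.h>

    // Nonzero iff some byte of w is 0x00 -- the same condition the assembly
    // checks with: sub ip, w, #0x01010101; bic ip, ip, w; ands ip, ip, #0x80808080.
    static inline uint32_t has_zero_byte(uint32_t w) {
        return (w - 0x01010101u) & ~w & 0x80808080u;
    }
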
+ mov r1, r5 + pld [r1, #64] + mov r2, r4 + add r0, r0, r3 + pop {r4, r5} + .cfi_adjust_cfa_offset -8 + .cfi_restore r4 + .cfi_restore r5 + + #include "memcpy_base.S" + +.L_fortify_check_failed: + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r4, 0 + .cfi_rel_offset r5, 4 + + ldr r0, error_message + ldr r1, error_code +1: + add r0, pc + bl __fortify_chk_fail +error_code: + .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW +error_message: + .word error_string-(1b+4) + + .cfi_endproc +END(__strcat_chk) + + .data +error_string: + .string "strcat buffer overflow" diff --git a/libc/arch-arm/krait/bionic/__strcpy_chk.S b/libc/arch-arm/krait/bionic/__strcpy_chk.S new file mode 100644 index 000000000..c57268c46 --- /dev/null +++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S @@ -0,0 +1,175 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include "libc_events.h" + + .syntax unified + + .thumb + .thumb_func + +// Get the length of the source string first, then do a memcpy of the data +// instead of a strcpy. +ENTRY(__strcpy_chk) + .cfi_startproc + pld [r0, #0] + push {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + + mov lr, r2 + mov r0, r1 + + ands r3, r1, #7 + beq .L_mainloop + + // Align to a double word (64 bits). 
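The __strcpy_chk that begins above follows the same plan as __strcat_chk but with only one length to compute: fail unless strlen(src) is strictly smaller than the destination size, otherwise copy strlen(src)+1 bytes through memcpy_base.S. As a sketch, with the includes and __fortify_chk_fail declaration as in the __strcat_chk sketch earlier:

    char* __strcpy_chk(char* dst, const char* src, size_t dst_buf_size) {
        size_t src_len = strlen(src);
        if (src_len >= dst_buf_size) {      /* "cmp r3, lr; bge .L_fortify_check_failed" */
            __fortify_chk_fail("strcpy buffer overflow", BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW);
        }
        memcpy(dst, src, src_len + 1);      /* "add r2, r3, #1", then memcpy_base.S */
        return dst;
    }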
+ rsb r3, r3, #8 + lsls ip, r3, #31 + beq .L_align_to_32 + + ldrb r2, [r0], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_32: + bcc .L_align_to_64 + ands ip, r3, #2 + beq .L_align_to_64 + + ldrb r2, [r0], #1 + cbz r2, .L_update_count_and_finish + ldrb r2, [r0], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_64: + tst r3, #4 + beq .L_mainloop + ldr r3, [r0], #4 + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + + .p2align 2 +.L_mainloop: + ldrd r2, r3, [r0], #8 + + pld [r0, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne .L_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + b .L_mainloop + +.L_update_count_and_finish: + sub r3, r0, r1 + sub r3, r3, #1 + b .L_check_size + +.L_zero_in_first_register: + sub r3, r0, r1 + lsls r2, ip, #17 + bne .L_sub8_and_finish + bcs .L_sub7_and_finish + lsls ip, ip, #1 + bne .L_sub6_and_finish + + sub r3, r3, #5 + b .L_check_size + +.L_sub8_and_finish: + sub r3, r3, #8 + b .L_check_size + +.L_sub7_and_finish: + sub r3, r3, #7 + b .L_check_size + +.L_sub6_and_finish: + sub r3, r3, #6 + b .L_check_size + +.L_zero_in_second_register: + sub r3, r0, r1 + lsls r2, ip, #17 + bne .L_sub4_and_finish + bcs .L_sub3_and_finish + lsls ip, ip, #1 + bne .L_sub2_and_finish + + sub r3, r3, #1 + b .L_check_size + +.L_sub4_and_finish: + sub r3, r3, #4 + b .L_check_size + +.L_sub3_and_finish: + sub r3, r3, #3 + b .L_check_size + +.L_sub2_and_finish: + sub r3, r3, #2 + +.L_check_size: + pld [r1, #0] + pld [r1, #64] + ldr r0, [sp] + cmp r3, lr + bge .L_fortify_check_failed + + // Add 1 for copy length to get the string terminator. + add r2, r3, #1 + + #include "memcpy_base.S" + +.L_fortify_check_failed: + ldr r0, error_message + ldr r1, error_code +1: + add r0, pc + bl __fortify_chk_fail +error_code: + .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW +error_message: + .word error_string-(1b+4) + .cfi_endproc +END(__strcpy_chk) + + .data +error_string: + .string "strcpy buffer overflow" diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S index 3afe18ca8..dde231ae9 100644 --- a/libc/arch-arm/krait/bionic/memcpy.S +++ b/libc/arch-arm/krait/bionic/memcpy.S @@ -28,7 +28,6 @@ /* Assumes neon instructions and a cache line size of 32 bytes. */ -#include #include #include "libc_events.h" @@ -38,103 +37,40 @@ * cache line. */ -#define CACHE_LINE_SIZE 32 - .text + .syntax unified .fpu neon + .thumb + .thumb_func ENTRY(__memcpy_chk) + .cfi_startproc cmp r2, r3 - bgt fortify_check_failed + bgt __memcpy_chk_fail // Fall through to memcpy... + .cfi_endproc END(__memcpy_chk) ENTRY(memcpy) - .save {r0, lr} - /* start preloading as early as possible */ - pld [r1, #(CACHE_LINE_SIZE*4)] - stmfd sp!, {r0, lr} + .cfi_startproc + pld [r1, #64] + stmfd sp!, {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 - /* do we have at least 16-bytes to copy (needed for alignment below) */ - cmp r2, #16 - blo 5f + #include "memcpy_base.S" + .cfi_endproc +END(memcpy) - /* align destination to cache-line for the write-buffer */ - rsb r3, r0, #0 - ands r3, r3, #0xF - beq 2f + .cfi_startproc +__memcpy_chk_fail: + // Preserve lr for backtrace. 
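The rewritten __memcpy_chk above keeps the fast path down to a single compare: the requested count is checked against the destination object size supplied by the compiler, and on success execution simply falls through into memcpy. Roughly, with the same caveats as the earlier sketches (and noting the assembly uses a signed bgt, so this C states the intent rather than a bit-exact model):

    void* __memcpy_chk(void* dst, const void* src, size_t count, size_t dst_buf_size) {
        if (count > dst_buf_size) {         /* "cmp r2, r3; bgt __memcpy_chk_fail" */
            __fortify_chk_fail("memcpy buffer overflow", BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW);
        }
        return memcpy(dst, src, count);     /* the fall-through into ENTRY(memcpy) */
    }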
+ push {lr} + .cfi_def_cfa_offset 4 + .cfi_rel_offset lr, 0 - /* copy up to 15-bytes (count in r3) */ - sub r2, r2, r3 - movs ip, r3, lsl #31 - ldrmib lr, [r1], #1 - strmib lr, [r0], #1 - ldrcsb ip, [r1], #1 - ldrcsb lr, [r1], #1 - strcsb ip, [r0], #1 - strcsb lr, [r0], #1 - movs ip, r3, lsl #29 - bge 1f - // copies 4 bytes, destination 32-bits aligned - vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]! -1: bcc 2f - // copies 8 bytes, destination 64-bits aligned - vld1.8 {d0}, [r1]! - vst1.8 {d0}, [r0, :64]! - -2: /* make sure we have at least 64 bytes to copy */ - subs r2, r2, #64 - blo 2f - -1: /* The main loop copies 64 bytes at a time */ - vld1.8 {d0 - d3}, [r1]! - vld1.8 {d4 - d7}, [r1]! - pld [r1, #(CACHE_LINE_SIZE*2)] - subs r2, r2, #64 - vst1.8 {d0 - d3}, [r0, :128]! - vst1.8 {d4 - d7}, [r0, :128]! - bhs 1b - -2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ - adds r2, r2, #32 - blo 4f - - /* Copy 32 bytes. These cache lines were already preloaded */ - vld1.8 {d0 - d3}, [r1]! - sub r2, r2, #32 - vst1.8 {d0 - d3}, [r0, :128]! - -4: /* less than 32 left */ - add r2, r2, #32 - tst r2, #0x10 - beq 5f - // copies 16 bytes, 128-bits aligned - vld1.8 {d0, d1}, [r1]! - vst1.8 {d0, d1}, [r0, :128]! - -5: /* copy up to 15-bytes (count in r2) */ - movs ip, r2, lsl #29 - bcc 1f - vld1.8 {d0}, [r1]! - vst1.8 {d0}, [r0]! -1: bge 2f - vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! -2: movs ip, r2, lsl #31 - ldrmib r3, [r1], #1 - ldrcsb ip, [r1], #1 - ldrcsb lr, [r1], #1 - strmib r3, [r0], #1 - strcsb ip, [r0], #1 - strcsb lr, [r0], #1 - - ldmfd sp!, {r0, lr} - bx lr - - // Only reached when the __memcpy_chk check fails. -fortify_check_failed: ldr r0, error_message ldr r1, error_code 1: @@ -143,8 +79,8 @@ fortify_check_failed: error_code: .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW error_message: - .word error_string-(1b+8) -END(memcpy) + .word error_string-(1b+4) + .cfi_endproc .data error_string: diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S new file mode 100644 index 000000000..48ce477f0 --- /dev/null +++ b/libc/arch-arm/krait/bionic/memcpy_base.S @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
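The one subtle change in the failure path above is the literal: the file now assembles as Thumb, and a PC read by "add r0, pc" yields the address of that instruction plus 4 in Thumb state versus plus 8 in ARM state, so the stored displacement shrinks by the same amount for r0 to still end up pointing at error_string. A toy model of the arithmetic, with illustrative addresses and ignoring word-alignment details:

    #include <stdint.h>

    /* stored_word is what ".word error_string-(1b+N)" assembles to;
     * pc_bias is 4 for Thumb and 8 for ARM. */
    static uintptr_t resolved_address(uintptr_t addr_of_label_1,
                                      uintptr_t error_string_addr,
                                      unsigned pc_bias) {
        uintptr_t stored_word = error_string_addr - (addr_of_label_1 + pc_bias);
        uintptr_t pc_as_read  = addr_of_label_1 + pc_bias;   /* what "add r0, pc" adds */
        return stored_word + pc_as_read;                      /* always error_string_addr */
    }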
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +/* + * This code assumes it is running on a processor that supports all arm v7 + * instructions, that supports neon instructions, and that has a 32 byte + * cache line. + */ + +// Assumes neon instructions and a cache line size of 32 bytes. + + /* do we have at least 16-bytes to copy (needed for alignment below) */ + cmp r2, #16 + blo 5f + + /* align destination to cache-line for the write-buffer */ + rsb r3, r0, #0 + ands r3, r3, #0xF + beq 2f + + /* copy up to 15-bytes (count in r3) */ + sub r2, r2, r3 + movs ip, r3, lsl #31 + itt mi + ldrbmi lr, [r1], #1 + strbmi lr, [r0], #1 + itttt cs + ldrbcs ip, [r1], #1 + ldrbcs lr, [r1], #1 + strbcs ip, [r0], #1 + strbcs lr, [r0], #1 + movs ip, r3, lsl #29 + bge 1f + // copies 4 bytes, destination 32-bits aligned + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]! +1: bcc 2f + // copies 8 bytes, destination 64-bits aligned + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0, :64]! + +2: /* make sure we have at least 64 bytes to copy */ + subs r2, r2, #64 + blo 2f + +1: /* The main loop copies 64 bytes at a time */ + vld1.8 {d0 - d3}, [r1]! + vld1.8 {d4 - d7}, [r1]! + pld [r1, #(32*2)] + subs r2, r2, #64 + vst1.8 {d0 - d3}, [r0, :128]! + vst1.8 {d4 - d7}, [r0, :128]! + bhs 1b + +2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ + adds r2, r2, #32 + blo 4f + + /* Copy 32 bytes. These cache lines were already preloaded */ + vld1.8 {d0 - d3}, [r1]! + sub r2, r2, #32 + vst1.8 {d0 - d3}, [r0, :128]! + +4: /* less than 32 left */ + add r2, r2, #32 + tst r2, #0x10 + beq 5f + // copies 16 bytes, 128-bits aligned + vld1.8 {d0, d1}, [r1]! + vst1.8 {d0, d1}, [r0, :128]! + +5: /* copy up to 15-bytes (count in r2) */ + movs ip, r2, lsl #29 + bcc 1f + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! +1: bge 2f + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! +2: movs ip, r2, lsl #31 + itt mi + ldrbmi r3, [r1], #1 + strbmi r3, [r0], #1 + itttt cs + ldrbcs ip, [r1], #1 + ldrbcs lr, [r1], #1 + strbcs ip, [r0], #1 + strbcs lr, [r0], #1 + + ldmfd sp!, {r0, lr} + bx lr diff --git a/libc/arch-arm/krait/bionic/memset.S b/libc/arch-arm/krait/bionic/memset.S index 4e4788b7b..15661320b 100644 --- a/libc/arch-arm/krait/bionic/memset.S +++ b/libc/arch-arm/krait/bionic/memset.S @@ -39,8 +39,14 @@ .fpu neon ENTRY(__memset_chk) + .cfi_startproc cmp r2, r3 - bls done + bls .L_done + + // Preserve lr for backtrace. + push {lr} + .cfi_def_cfa_offset 4 + .cfi_rel_offset lr, 0 ldr r0, error_message ldr r1, error_code @@ -52,20 +58,25 @@ error_code: error_message: .word error_string-(1b+8) + .cfi_endproc END(__memset_chk) ENTRY(bzero) + .cfi_startproc mov r2, r1 mov r1, #0 -done: +.L_done: // Fall through to memset... + .cfi_endproc END(bzero) /* memset() returns its first argument. 
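memcpy_base.S above, shared by memcpy and (via #include) by the __str*_chk routines, is a staged copy: an up-to-15-byte head aligns the destination to a 16-byte boundary, a NEON loop then streams 64 bytes per iteration, 32- and 16-byte blocks mop up, and a final 0..15-byte tail finishes. Stripped of the NEON and conditional-byte tricks, the control flow is roughly this sketch (memcpy_staged is an illustrative name, not a symbol in the patch):

    #include <string.h>
    #include <stddef.h>
    #include <stdint.h>

    static void* memcpy_staged(void* dst, const void* src, size_t n) {
        unsigned char* d = dst;
        const unsigned char* s = src;

        if (n >= 16) {                                    /* "cmp r2, #16; blo 5f" */
            size_t head = (0u - (uintptr_t)d) & 0xF;      /* bytes until dst is 16-byte aligned */
            memcpy(d, s, head); d += head; s += head; n -= head;

            while (n >= 64) {                             /* main loop: two 32-byte vld1/vst1 pairs */
                memcpy(d, s, 64); d += 64; s += 64; n -= 64;
            }
            if (n >= 32) { memcpy(d, s, 32); d += 32; s += 32; n -= 32; }
            if (n >= 16) { memcpy(d, s, 16); d += 16; s += 16; n -= 16; }
        }
        memcpy(d, s, n);                                  /* trailing 0..15 bytes */
        return dst;
    }

Aligning the destination rather than the source keeps the vst1 stores on the :128 boundaries, which is what the "align destination to cache-line for the write-buffer" comment in the source refers to.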
*/ ENTRY(memset) - .save {r0} + .cfi_startproc stmfd sp!, {r0} + .cfi_def_cfa_offset 4 + .cfi_rel_offset r0, 0 vdup.8 q0, r1 @@ -98,6 +109,7 @@ ENTRY(memset) strcsb r1, [r0], #1 ldmfd sp!, {r0} bx lr + .cfi_endproc END(memset) .data diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk index 1ff18e9bb..29ab743c6 100644 --- a/libc/arch-arm/krait/krait.mk +++ b/libc/arch-arm/krait/krait.mk @@ -1,6 +1,8 @@ $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/krait/bionic/memcpy.S) $(call libc-add-cpu-variant-src,MEMSET,arch-arm/krait/bionic/memset.S) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/krait/bionic/strcmp.S) +$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/krait/bionic/__strcat_chk.S) +$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/krait/bionic/__strcpy_chk.S) # Use cortex-a15 versions of strcat/strcpy/strlen. $(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a15/bionic/strcat.S) $(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a15/bionic/strcpy.S)
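For completeness, the memset-side entry points touched earlier in this patch have the same shape: __memset_chk validates the count against the destination size and falls through to memset, and bzero rewrites its arguments into memset(p, 0, n) and falls through as well. Sketch only; the exact memset error message and event value live outside the hunks shown, so they are assumed here, as is the failure-hook prototype from the first sketch:

    void* __memset_chk(void* dst, int byte, size_t count, size_t dst_buf_size) {
        if (count > dst_buf_size) {     /* the "cmp r2, r3; bls .L_done" test, inverted */
            __fortify_chk_fail("memset buffer overflow" /* assumed */,
                               BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW /* assumed */);
        }
        return memset(dst, byte, count);
    }

    void bzero(void* dst, size_t count) {
        memset(dst, 0, count);          /* "mov r2, r1; mov r1, #0", then fall through */
    }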