am f0c3d909: Create optimized __strcpy_chk/__strcat_chk.

* commit 'f0c3d909136167fdbe32b7815e5e1e02b4c35d62': Create optimized __strcpy_chk/__strcat_chk.
2013-08-14 07:58:50 -07:00
parent d861b6cbdf f0c3d90913
commit 3f78cb428e
20 changed files with 1950 additions and 553 deletions
--- a/libc/arch-arm/arm.mk
+++ b/libc/arch-arm/arm.mk
@@ -31,6 +31,8 @@ _LIBC_ARCH_DYNAMIC_SRC_FILES := \
 _LIBC_FORTIFY_FILES_TO_REMOVE := \
    bionic/__memcpy_chk.cpp \
    bionic/__memset_chk.cpp \
+    bionic/__strcpy_chk.cpp \
+    bionic/__strcat_chk.cpp \

 libc_common_src_files := \
    $(filter-out $(_LIBC_FORTIFY_FILES_TO_REMOVE),$(libc_common_src_files))
--- a/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
+++ b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
@@ -0,0 +1,215 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include "libc_events.h"
+
+    .syntax unified
+
+    .thumb
+    .thumb_func
+
+// Get the length of src string, then get the source of the dst string.
+// Check that the two lengths together don't exceed the threshold, then
+// do a memcpy of the data.
+ENTRY(__strcat_chk)
+    .cfi_startproc
+    pld     [r0, #0]
+    push    {r0, lr}
+    .cfi_def_cfa_offset 8
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset lr, 4
+    push    {r4, r5}
+    .cfi_adjust_cfa_offset 8
+    .cfi_rel_offset r4, 0
+    .cfi_rel_offset r5, 0
+
+    mov     lr, r2
+
+    // Save the dst register to r5
+    mov     r5, r0
+
+    // Zero out r4
+    eor     r4, r4, r4
+
+    // r1 contains the address of the string to count.
+.L_strlen_start:
+    mov     r0, r1
+    ands    r3, r1, #7
+    beq     .L_mainloop
+
+    // Align to a double word (64 bits).
+    rsb     r3, r3, #8
+    lsls    ip, r3, #31
+    beq     .L_align_to_32
+
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_32:
+    bcc     .L_align_to_64
+    ands    ip, r3, #2
+    beq     .L_align_to_64
+
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_update_count_and_finish
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_64:
+    tst     r3, #4
+    beq     .L_mainloop
+    ldr     r3, [r1], #4
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+
+    .p2align 2
+.L_mainloop:
+    ldrd    r2, r3, [r1], #8
+
+    pld     [r1, #64]
+
+    sub     ip, r2, #0x01010101
+    bic     ip, ip, r2
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_first_register
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
+
+.L_update_count_and_finish:
+    sub     r3, r1, r0
+    sub     r3, r3, #1
+    b       .L_finish
+
+.L_zero_in_first_register:
+    sub     r3, r1, r0
+    lsls    r2, ip, #17
+    bne     .L_sub8_and_finish
+    bcs     .L_sub7_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub6_and_finish
+
+    sub     r3, r3, #5
+    b       .L_finish
+
+.L_sub8_and_finish:
+    sub     r3, r3, #8
+    b       .L_finish
+
+.L_sub7_and_finish:
+    sub     r3, r3, #7
+    b       .L_finish
+
+.L_sub6_and_finish:
+    sub     r3, r3, #6
+    b       .L_finish
+
+.L_zero_in_second_register:
+    sub     r3, r1, r0
+    lsls    r2, ip, #17
+    bne     .L_sub4_and_finish
+    bcs     .L_sub3_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub2_and_finish
+
+    sub     r3, r3, #1
+    b       .L_finish
+
+.L_sub4_and_finish:
+    sub     r3, r3, #4
+    b       .L_finish
+
+.L_sub3_and_finish:
+    sub     r3, r3, #3
+    b       .L_finish
+
+.L_sub2_and_finish:
+    sub     r3, r3, #2
+
+.L_finish:
+    cmp     r4, #0
+    bne     .L_strlen_done
+
+    // Time to get the dst string length.
+    mov     r1, r5
+
+    // Save the original source address to r5.
+    mov     r5, r0
+
+    // Save the current length (adding 1 for the terminator).
+    add     r4, r3, #1
+    b       .L_strlen_start
+
+    // r0 holds the pointer to the dst string.
+    // r3 holds the dst string length.
+    // r4 holds the src string length + 1.
+.L_strlen_done:
+    add     r2, r3, r4
+    cmp     r2, lr
+    bgt     .L_fortify_check_failed
+
+    // Set up the registers for the memcpy code.
+    mov     r1, r5
+    pld     [r1, #64]
+    mov     r2, r4
+    add     r0, r0, r3
+    pop     {r4, r5}
+    .cfi_adjust_cfa_offset -8
+    .cfi_restore r4
+    .cfi_restore r5
+
+    #include "memcpy_base.S"
+
+.L_fortify_check_failed:
+    .cfi_adjust_cfa_offset 8
+    .cfi_rel_offset r4, 0
+    .cfi_rel_offset r5, 4
+
+    ldr     r0, error_message
+    ldr     r1, error_code
+1:
+    add     r0, pc
+    bl      __fortify_chk_fail
+error_code:
+    .word   BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
+error_message:
+    .word   error_string-(1b+4)
+
+    .cfi_endproc
+END(__strcat_chk)
+
+    .data
+error_string:
+    .string "strcat buffer overflow"
--- a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include "libc_events.h"
+
+    .syntax unified
+
+    .thumb
+    .thumb_func
+
+// Get the length of the source string first, then do a memcpy of the data
+// instead of a strcpy.
+ENTRY(__strcpy_chk)
+    .cfi_startproc
+    pld     [r0, #0]
+    push    {r0, lr}
+    .cfi_def_cfa_offset 8
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset lr, 4
+
+    mov     lr, r2
+    mov     r0, r1
+
+    ands    r3, r1, #7
+    beq     .L_mainloop
+
+    // Align to a double word (64 bits).
+    rsb     r3, r3, #8
+    lsls    ip, r3, #31
+    beq     .L_align_to_32
+
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_32:
+    bcc     .L_align_to_64
+    ands    ip, r3, #2
+    beq     .L_align_to_64
+
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_update_count_and_finish
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_64:
+    tst     r3, #4
+    beq     .L_mainloop
+    ldr     r3, [r0], #4
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+
+    .p2align 2
+.L_mainloop:
+    ldrd    r2, r3, [r0], #8
+
+    pld     [r0, #64]
+
+    sub     ip, r2, #0x01010101
+    bic     ip, ip, r2
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_first_register
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
+
+.L_update_count_and_finish:
+    sub     r3, r0, r1
+    sub     r3, r3, #1
+    b       .L_check_size
+
+.L_zero_in_first_register:
+    sub     r3, r0, r1
+    lsls    r2, ip, #17
+    bne     .L_sub8_and_finish
+    bcs     .L_sub7_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub6_and_finish
+
+    sub     r3, r3, #5
+    b       .L_check_size
+
+.L_sub8_and_finish:
+    sub     r3, r3, #8
+    b       .L_check_size
+
+.L_sub7_and_finish:
+    sub     r3, r3, #7
+    b       .L_check_size
+
+.L_sub6_and_finish:
+    sub     r3, r3, #6
+    b       .L_check_size
+
+.L_zero_in_second_register:
+    sub     r3, r0, r1
+    lsls    r2, ip, #17
+    bne     .L_sub4_and_finish
+    bcs     .L_sub3_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub2_and_finish
+
+    sub     r3, r3, #1
+    b       .L_check_size
+
+.L_sub4_and_finish:
+    sub     r3, r3, #4
+    b       .L_check_size
+
+.L_sub3_and_finish:
+    sub     r3, r3, #3
+    b       .L_check_size
+
+.L_sub2_and_finish:
+    sub     r3, r3, #2
+
+.L_check_size:
+    pld     [r1, #0]
+    pld     [r1, #64]
+    ldr     r0, [sp]
+    cmp     r3, lr
+    bge     .L_fortify_check_failed
+
+    // Add 1 for copy length to get the string terminator.
+    add     r2, r3, #1
+
+    #include "memcpy_base.S"
+
+.L_fortify_check_failed:
+    ldr     r0, error_message
+    ldr     r1, error_code
+1:
+    add     r0, pc
+    bl      __fortify_chk_fail
+error_code:
+    .word   BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
+error_message:
+    .word   error_string-(1b+4)
+
+    .cfi_endproc
+END(__strcpy_chk)
+
+    .data
+error_string:
+    .string "strcpy buffer overflow"
--- a/libc/arch-arm/cortex-a15/bionic/memcpy.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy.S
@@ -53,11 +53,8 @@
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-    /* Prototype: void *memcpy (void *dst, const void *src, size_t count).  */
+// Prototype: void *memcpy (void *dst, const void *src, size_t count).

-        // This version is tuned for the Cortex-A15 processor.
-
-#include <machine/cpu-features.h>
 #include <machine/asm.h>
 #include "libc_events.h"

@@ -65,274 +62,34 @@
        .syntax unified
        .fpu    neon

-#define CACHE_LINE_SIZE 64
-
 ENTRY(__memcpy_chk)
+        .cfi_startproc
        cmp     r2, r3
-        bgt     fortify_check_failed
+        bgt     __memcpy_chk_fail

        // Fall through to memcpy...
+        .cfi_endproc
 END(__memcpy_chk)

 ENTRY(memcpy)
-        // Assumes that n >= 0, and dst, src are valid pointers.
-        // For any sizes less than 832 use the neon code that doesn't
-        // care about the src alignment. This avoids any checks
-        // for src alignment, and offers the best improvement since
-        // smaller sized copies are dominated by the overhead of
-        // the pre and post main loop.
-        // For larger copies, if src and dst cannot both be aligned to
-        // word boundaries, use the neon code.
-        // For all other copies, align dst to a double word boundary
-        // and copy using LDRD/STRD instructions.
-
-        // Save registers (r0 holds the return value):
-        // optimized push {r0, lr}.
-        .save   {r0, lr}
-        pld     [r1, #(CACHE_LINE_SIZE*16)]
+        .cfi_startproc
+        pld     [r1, #64]
        push    {r0, lr}
+        .cfi_def_cfa_offset 8
+        .cfi_rel_offset r0, 0
+        .cfi_rel_offset lr, 4

-        cmp     r2, #16
-        blo     copy_less_than_16_unknown_align
+        #include "memcpy_base.S"
+        .cfi_endproc
+END(memcpy)

-        cmp     r2, #832
-        bge     check_alignment
+        .cfi_startproc
+__memcpy_chk_fail:
+        // Preserve lr for backtrace.
+        push    {lr}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset lr, 0

-copy_unknown_alignment:
-        // Unknown alignment of src and dst.
-        // Assumes that the first few bytes have already been prefetched.
-
-        // Align destination to 128 bits. The mainloop store instructions
-        // require this alignment or they will throw an exception.
-        rsb         r3, r0, #0
-        ands        r3, r3, #0xF
-        beq         2f
-
-        // Copy up to 15 bytes (count in r3).
-        sub         r2, r2, r3
-        movs        ip, r3, lsl #31
-
-        itt         mi
-        ldrbmi      lr, [r1], #1
-        strbmi      lr, [r0], #1
-        itttt       cs
-        ldrbcs      ip, [r1], #1
-        ldrbcs      lr, [r1], #1
-        strbcs      ip, [r0], #1
-        strbcs      lr, [r0], #1
-
-        movs        ip, r3, lsl #29
-        bge         1f
-        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
-1:      bcc         2f
-        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [r0, :64]!
-
-2:      // Make sure we have at least 64 bytes to copy.
-        subs        r2, r2, #64
-        blo         2f
-
-1:      // The main loop copies 64 bytes at a time.
-        vld1.8      {d0  - d3},   [r1]!
-        vld1.8      {d4  - d7},   [r1]!
-        pld         [r1, #(CACHE_LINE_SIZE*4)]
-        subs        r2, r2, #64
-        vst1.8      {d0  - d3},   [r0, :128]!
-        vst1.8      {d4  - d7},   [r0, :128]!
-        bhs         1b
-
-2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
-        adds        r2, r2, #32
-        blo         3f
-
-        // 32 bytes. These cache lines were already preloaded.
-        vld1.8      {d0 - d3},  [r1]!
-        sub         r2, r2, #32
-        vst1.8      {d0 - d3},  [r0, :128]!
-3:      // Less than 32 left.
-        add         r2, r2, #32
-        tst         r2, #0x10
-        beq         copy_less_than_16_unknown_align
-        // Copies 16 bytes, destination 128 bits aligned.
-        vld1.8      {d0, d1}, [r1]!
-        vst1.8      {d0, d1}, [r0, :128]!
-
-copy_less_than_16_unknown_align:
-        // Copy up to 15 bytes (count in r2).
-        movs        ip, r2, lsl #29
-        bcc         1f
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [r0]!
-1:      bge         2f
-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-
-2:      // Copy 0 to 4 bytes.
-        lsls        r2, r2, #31
-        itt         ne
-        ldrbne      lr, [r1], #1
-        strbne      lr, [r0], #1
-        itttt       cs
-        ldrbcs      ip, [r1], #1
-        ldrbcs      lr, [r1]
-        strbcs      ip, [r0], #1
-        strbcs      lr, [r0]
-
-        pop         {r0, pc}
-
-check_alignment:
-        // If src and dst cannot both be aligned to a word boundary,
-        // use the unaligned copy version.
-        eor     r3, r0, r1
-        ands    r3, r3, #0x3
-        bne     copy_unknown_alignment
-
-        // To try and improve performance, stack layout changed,
-        // i.e., not keeping the stack looking like users expect
-        // (highest numbered register at highest address).
-        // TODO: Add debug frame directives.
-        // We don't need exception unwind directives, because the code below
-        // does not throw any exceptions and does not call any other functions.
-        // Generally, newlib functions like this lack debug information for
-        // assembler source.
-        .save   {r4, r5}
-        strd    r4, r5, [sp, #-8]!
-        .save   {r6, r7}
-        strd    r6, r7, [sp, #-8]!
-        .save   {r8, r9}
-        strd    r8, r9, [sp, #-8]!
-
-        // Optimized for already aligned dst code.
-        ands    ip, r0, #3
-        bne     dst_not_word_aligned
-
-word_aligned:
-        // Align the destination buffer to 8 bytes, to make sure double
-        // loads and stores don't cross a cache line boundary,
-        // as they are then more expensive even if the data is in the cache
-        // (require two load/store issue cycles instead of one).
-        // If only one of the buffers is not 8 bytes aligned,
-        // then it's more important to align dst than src,
-        // because there is more penalty for stores
-        // than loads that cross a cacheline boundary.
-        // This check and realignment are only done if there is >= 832
-        // bytes to copy.
-
-        // Dst is word aligned, but check if it is already double word aligned.
-        ands    r3, r0, #4
-        beq     1f
-        ldr     r3, [r1], #4
-        str     r3, [r0], #4
-        sub     r2, #4
-
-1:      // Can only get here if > 64 bytes to copy, so don't do check r2.
-        sub     r2, #64
-
-2:      // Every loop iteration copies 64 bytes.
-        .irp    offset, #0, #8, #16, #24, #32
-        ldrd    r4, r5, [r1, \offset]
-        strd    r4, r5, [r0, \offset]
-        .endr
-
-        ldrd    r4, r5, [r1, #40]
-        ldrd    r6, r7, [r1, #48]
-        ldrd    r8, r9, [r1, #56]
-
-        // Keep the pld as far from the next load as possible.
-        // The amount to prefetch was determined experimentally using
-        // large sizes, and verifying the prefetch size does not affect
-        // the smaller copies too much.
-        // WARNING: If the ldrd and strd instructions get too far away
-        //          from each other, performance suffers. Three loads
-        //          in a row is the best tradeoff.
-        pld     [r1, #(CACHE_LINE_SIZE*16)]
-        strd    r4, r5, [r0, #40]
-        strd    r6, r7, [r0, #48]
-        strd    r8, r9, [r0, #56]
-
-        add     r0, r0, #64
-        add     r1, r1, #64
-        subs    r2, r2, #64
-        bge     2b
-
-        // Fix-up the remaining count and make sure we have >= 32 bytes left.
-        adds    r2, r2, #32
-        blo     4f
-
-        // Copy 32 bytes. These cache lines were already preloaded.
-        .irp    offset, #0, #8, #16, #24
-        ldrd    r4, r5, [r1, \offset]
-        strd    r4, r5, [r0, \offset]
-        .endr
-        add     r1, r1, #32
-        add     r0, r0, #32
-        sub     r2, r2, #32
-4:      // Less than 32 left.
-        add     r2, r2, #32
-        tst     r2, #0x10
-        beq     5f
-        // Copy 16 bytes.
-        .irp    offset, #0, #8
-        ldrd    r4, r5, [r1, \offset]
-        strd    r4, r5, [r0, \offset]
-        .endr
-        add     r1, r1, #16
-        add     r0, r0, #16
-
-5:      // Copy up to 15 bytes (count in r2).
-        movs    ip, r2, lsl #29
-        bcc     1f
-        // Copy 8 bytes.
-        ldrd    r4, r5, [r1], #8
-        strd    r4, r5, [r0], #8
-1:      bge         2f
-        // Copy 4 bytes.
-        ldr     r4, [r1], #4
-        str     r4, [r0], #4
-2:      // Copy 0 to 4 bytes.
-        lsls    r2, r2, #31
-        itt     ne
-        ldrbne  lr, [r1], #1
-        strbne  lr, [r0], #1
-        itttt   cs
-        ldrbcs  ip, [r1], #1
-        ldrbcs  lr, [r1]
-        strbcs  ip, [r0], #1
-        strbcs  lr, [r0]
-
-        // Restore registers: optimized pop {r0, pc}
-        ldrd    r8, r9, [sp], #8
-        ldrd    r6, r7, [sp], #8
-        ldrd    r4, r5, [sp], #8
-        pop     {r0, pc}
-
-dst_not_word_aligned:
-        // Align dst to word.
-        rsb     ip, ip, #4
-        cmp     ip, #2
-
-        itt     gt
-        ldrbgt  lr, [r1], #1
-        strbgt  lr, [r0], #1
-
-        itt     ge
-        ldrbge  lr, [r1], #1
-        strbge  lr, [r0], #1
-
-        ldrb    lr, [r1], #1
-        strb    lr, [r0], #1
-
-        sub     r2, r2, ip
-
-        // Src is guaranteed to be at least word aligned by this point.
-        b       word_aligned
-
-
-        // Only reached when the __memcpy_chk check fails.
-fortify_check_failed:
        ldr     r0, error_message
        ldr     r1, error_code
 1:
@@ -342,7 +99,7 @@ error_code:
        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
 error_message:
        .word   error_string-(1b+8)
-END(memcpy)
+        .cfi_endproc

        .data
 error_string:
--- a/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
@@ -0,0 +1,303 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+        // Assumes that n >= 0, and dst, src are valid pointers.
+        // For any sizes less than 832 use the neon code that doesn't
+        // care about the src alignment. This avoids any checks
+        // for src alignment, and offers the best improvement since
+        // smaller sized copies are dominated by the overhead of
+        // the pre and post main loop.
+        // For larger copies, if src and dst cannot both be aligned to
+        // word boundaries, use the neon code.
+        // For all other copies, align dst to a double word boundary
+        // and copy using LDRD/STRD instructions.
+
+        cmp     r2, #16
+        blo     .L_copy_less_than_16_unknown_align
+
+        cmp     r2, #832
+        bge     .L_check_alignment
+
+.L_copy_unknown_alignment:
+        // Unknown alignment of src and dst.
+        // Assumes that the first few bytes have already been prefetched.
+
+        // Align destination to 128 bits. The mainloop store instructions
+        // require this alignment or they will throw an exception.
+        rsb         r3, r0, #0
+        ands        r3, r3, #0xF
+        beq         2f
+
+        // Copy up to 15 bytes (count in r3).
+        sub         r2, r2, r3
+        movs        ip, r3, lsl #31
+
+        itt         mi
+        ldrbmi      lr, [r1], #1
+        strbmi      lr, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1], #1
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0], #1
+
+        movs        ip, r3, lsl #29
+        bge         1f
+        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
+1:      bcc         2f
+        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0, :64]!
+
+2:      // Make sure we have at least 64 bytes to copy.
+        subs        r2, r2, #64
+        blo         2f
+
+1:      // The main loop copies 64 bytes at a time.
+        vld1.8      {d0  - d3},   [r1]!
+        vld1.8      {d4  - d7},   [r1]!
+        pld         [r1, #(64*4)]
+        subs        r2, r2, #64
+        vst1.8      {d0  - d3},   [r0, :128]!
+        vst1.8      {d4  - d7},   [r0, :128]!
+        bhs         1b
+
+2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
+        adds        r2, r2, #32
+        blo         3f
+
+        // 32 bytes. These cache lines were already preloaded.
+        vld1.8      {d0 - d3},  [r1]!
+        sub         r2, r2, #32
+        vst1.8      {d0 - d3},  [r0, :128]!
+3:      // Less than 32 left.
+        add         r2, r2, #32
+        tst         r2, #0x10
+        beq         .L_copy_less_than_16_unknown_align
+        // Copies 16 bytes, destination 128 bits aligned.
+        vld1.8      {d0, d1}, [r1]!
+        vst1.8      {d0, d1}, [r0, :128]!
+
+.L_copy_less_than_16_unknown_align:
+        // Copy up to 15 bytes (count in r2).
+        movs        ip, r2, lsl #29
+        bcc         1f
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0]!
+1:      bge         2f
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+2:      // Copy 0 to 4 bytes.
+        lsls        r2, r2, #31
+        itt         ne
+        ldrbne      lr, [r1], #1
+        strbne      lr, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1]
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0]
+
+        pop         {r0, pc}
+
+.L_check_alignment:
+        // If src and dst cannot both be aligned to a word boundary,
+        // use the unaligned copy version.
+        eor     r3, r0, r1
+        ands    r3, r3, #0x3
+        bne     .L_copy_unknown_alignment
+
+        // To try and improve performance, stack layout changed,
+        // i.e., not keeping the stack looking like users expect
+        // (highest numbered register at highest address).
+        // TODO: Add debug frame directives.
+        // We don't need exception unwind directives, because the code below
+        // does not throw any exceptions and does not call any other functions.
+        // Generally, newlib functions like this lack debug information for
+        // assembler source.
+        .save   {r4, r5}
+        strd    r4, r5, [sp, #-8]!
+        .save   {r6, r7}
+        strd    r6, r7, [sp, #-8]!
+        .save   {r8, r9}
+        strd    r8, r9, [sp, #-8]!
+
+        // Optimized for already aligned dst code.
+        ands    ip, r0, #3
+        bne     .L_dst_not_word_aligned
+
+.L_word_aligned:
+        // Align the destination buffer to 8 bytes, to make sure double
+        // loads and stores don't cross a cache line boundary,
+        // as they are then more expensive even if the data is in the cache
+        // (require two load/store issue cycles instead of one).
+        // If only one of the buffers is not 8 bytes aligned,
+        // then it's more important to align dst than src,
+        // because there is more penalty for stores
+        // than loads that cross a cacheline boundary.
+        // This check and realignment are only done if there is >= 832
+        // bytes to copy.
+
+        // Dst is word aligned, but check if it is already double word aligned.
+        ands    r3, r0, #4
+        beq     1f
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+        sub     r2, #4
+
+1:      // Can only get here if > 64 bytes to copy, so don't do check r2.
+        sub     r2, #64
+
+2:      // Every loop iteration copies 64 bytes.
+        .irp    offset, #0, #8, #16, #24, #32
+        ldrd    r4, r5, [r1, \offset]
+        strd    r4, r5, [r0, \offset]
+        .endr
+
+        ldrd    r4, r5, [r1, #40]
+        ldrd    r6, r7, [r1, #48]
+        ldrd    r8, r9, [r1, #56]
+
+        // Keep the pld as far from the next load as possible.
+        // The amount to prefetch was determined experimentally using
+        // large sizes, and verifying the prefetch size does not affect
+        // the smaller copies too much.
+        // WARNING: If the ldrd and strd instructions get too far away
+        //          from each other, performance suffers. Three loads
+        //          in a row is the best tradeoff.
+        pld     [r1, #(64*16)]
+        strd    r4, r5, [r0, #40]
+        strd    r6, r7, [r0, #48]
+        strd    r8, r9, [r0, #56]
+
+        add     r0, r0, #64
+        add     r1, r1, #64
+        subs    r2, r2, #64
+        bge     2b
+
+        // Fix-up the remaining count and make sure we have >= 32 bytes left.
+        adds    r2, r2, #32
+        blo     4f
+
+        // Copy 32 bytes. These cache lines were already preloaded.
+        .irp    offset, #0, #8, #16, #24
+        ldrd    r4, r5, [r1, \offset]
+        strd    r4, r5, [r0, \offset]
+        .endr
+        add     r1, r1, #32
+        add     r0, r0, #32
+        sub     r2, r2, #32
+4:      // Less than 32 left.
+        add     r2, r2, #32
+        tst     r2, #0x10
+        beq     5f
+        // Copy 16 bytes.
+        .irp    offset, #0, #8
+        ldrd    r4, r5, [r1, \offset]
+        strd    r4, r5, [r0, \offset]
+        .endr
+        add     r1, r1, #16
+        add     r0, r0, #16
+
+5:      // Copy up to 15 bytes (count in r2).
+        movs    ip, r2, lsl #29
+        bcc     1f
+        // Copy 8 bytes.
+        ldrd    r4, r5, [r1], #8
+        strd    r4, r5, [r0], #8
+1:      bge         2f
+        // Copy 4 bytes.
+        ldr     r4, [r1], #4
+        str     r4, [r0], #4
+2:      // Copy 0 to 4 bytes.
+        lsls    r2, r2, #31
+        itt     ne
+        ldrbne  lr, [r1], #1
+        strbne  lr, [r0], #1
+        itttt   cs
+        ldrbcs  ip, [r1], #1
+        ldrbcs  lr, [r1]
+        strbcs  ip, [r0], #1
+        strbcs  lr, [r0]
+
+        // Restore registers: optimized pop {r0, pc}
+        ldrd    r8, r9, [sp], #8
+        ldrd    r6, r7, [sp], #8
+        ldrd    r4, r5, [sp], #8
+        pop     {r0, pc}
+
+.L_dst_not_word_aligned:
+        // Align dst to word.
+        rsb     ip, ip, #4
+        cmp     ip, #2
+
+        itt     gt
+        ldrbgt  lr, [r1], #1
+        strbgt  lr, [r0], #1
+
+        itt     ge
+        ldrbge  lr, [r1], #1
+        strbge  lr, [r0], #1
+
+        ldrb    lr, [r1], #1
+        strb    lr, [r0], #1
+
+        sub     r2, r2, ip
+
+        // Src is guaranteed to be at least word aligned by this point.
+        b       .L_word_aligned
--- a/libc/arch-arm/cortex-a15/bionic/memset.S
+++ b/libc/arch-arm/cortex-a15/bionic/memset.S
@@ -40,8 +40,14 @@
        .syntax     unified

 ENTRY(__memset_chk)
+        .cfi_startproc
        cmp         r2, r3
-        bls         done
+        bls         .L_done
+
+        // Preserve lr for backtrace.
+        push        {lr}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset lr, 0

        ldr         r0, error_message
        ldr         r1, error_code
@@ -53,24 +59,28 @@ error_code:
 error_message:
        .word       error_string-(1b+8)

+        .cfi_endproc
 END(__memset_chk)

 ENTRY(bzero)
+        .cfi_startproc
        mov         r2, r1
        mov         r1, #0
-
-done:
+.L_done:
        // Fall through to memset...
+        .cfi_endproc
 END(bzero)

 ENTRY(memset)
-        .save       {r0}
+        .cfi_startproc
        stmfd       sp!, {r0}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset r0, 0

        // The new algorithm is slower for copies < 16 so use the old
        // neon code in that case.
        cmp         r2, #16
-        blo         set_less_than_16_unknown_align
+        blo         .L_set_less_than_16_unknown_align

        // Use strd which requires an even and odd register so move the
        // values so that:
@@ -84,17 +94,17 @@ ENTRY(memset)
        orr         r1, r1, r1, lsr #8
        orr         r1, r1, r1, lsr #16

-check_alignment:
+.L_check_alignment:
        // Align destination to a double word to avoid the strd crossing
        // a cache line boundary.
        ands        ip, r3, #7
-        bne         do_double_word_align
+        bne         .L_do_double_word_align

-double_word_aligned:
+.L_double_word_aligned:
        mov         r0, r1

        subs        r2, #64
-        blo         set_less_than_64
+        blo         .L_set_less_than_64

 1:      // Main loop sets 64 bytes at a time.
        .irp        offset, #0, #8, #16, #24, #32, #40, #48, #56
@@ -105,39 +115,39 @@ double_word_aligned:
        subs        r2, #64
        bge         1b

-set_less_than_64:
+.L_set_less_than_64:
        // Restore r2 to the count of bytes left to set.
        add         r2, #64
        lsls        ip, r2, #27
-        bcc         set_less_than_32
+        bcc         .L_set_less_than_32
        // Set 32 bytes.
        .irp        offset, #0, #8, #16, #24
        strd        r0, r1, [r3, \offset]
        .endr
        add         r3, #32

-set_less_than_32:
-        bpl         set_less_than_16
+.L_set_less_than_32:
+        bpl         .L_set_less_than_16
        // Set 16 bytes.
        .irp        offset, #0, #8
        strd        r0, r1, [r3, \offset]
        .endr
        add         r3, #16

-set_less_than_16:
+.L_set_less_than_16:
        // Less than 16 bytes to set.
        lsls        ip, r2, #29
-        bcc         set_less_than_8
+        bcc         .L_set_less_than_8

        // Set 8 bytes.
        strd        r0, r1, [r3], #8

-set_less_than_8:
-        bpl         set_less_than_4
+.L_set_less_than_8:
+        bpl         .L_set_less_than_4
        // Set 4 bytes
        str         r1, [r3], #4

-set_less_than_4:
+.L_set_less_than_4:
        lsls        ip, r2, #31
        it          ne
        strbne      r1, [r3], #1
@@ -148,7 +158,7 @@ set_less_than_4:
        ldmfd       sp!, {r0}
        bx          lr

-do_double_word_align:
+.L_do_double_word_align:
        rsb         ip, ip, #8
        sub         r2, r2, ip
        movs        r0, ip, lsl #31
@@ -160,11 +170,11 @@ do_double_word_align:

        // Dst is at least word aligned by this point.
        cmp         ip, #4
-        blo         double_word_aligned
+        blo         .L_double_word_aligned
        str         r1, [r3], #4
-        b           double_word_aligned
+        b           .L_double_word_aligned

-set_less_than_16_unknown_align:
+.L_set_less_than_16_unknown_align:
        // Set up to 15 bytes.
        vdup.8      d0, r1
        movs        ip, r2, lsl #29
@@ -180,6 +190,7 @@ set_less_than_16_unknown_align:
        strbcs      r1, [r0], #1
        ldmfd       sp!, {r0}
        bx          lr
+        .cfi_endproc
 END(memset)

        .data
--- a/libc/arch-arm/cortex-a15/cortex-a15.mk
+++ b/libc/arch-arm/cortex-a15/cortex-a15.mk
@@ -4,5 +4,7 @@ $(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a15/bionic/strcat.S)
 $(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a15/bionic/strcmp.S)
 $(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a15/bionic/strcpy.S)
 $(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a15/bionic/strlen.S)
+$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/cortex-a15/bionic/__strcat_chk.S)
+$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/cortex-a15/bionic/__strcpy_chk.S)

 include bionic/libc/arch-arm/generic/generic.mk
--- a/libc/arch-arm/cortex-a9/bionic/__strcat_chk.S
+++ b/libc/arch-arm/cortex-a9/bionic/__strcat_chk.S
@@ -0,0 +1,218 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include "libc_events.h"
+
+    .syntax unified
+    .fpu    neon
+    .thumb
+    .thumb_func
+
+// Get the length of src string, then get the source of the dst string.
+// Check that the two lengths together don't exceed the threshold, then
+// do a memcpy of the data.
+ENTRY(__strcat_chk)
+    .cfi_startproc
+    pld     [r0, #0]
+    push    {r0, lr}
+    .cfi_def_cfa_offset 8
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset lr, 4
+    push    {r4, r5}
+    .cfi_adjust_cfa_offset 8
+    .cfi_rel_offset r4, 0
+    .cfi_rel_offset r5, 4
+
+    mov     lr, r2
+
+    // Save the dst register to r5
+    mov     r5, r0
+
+    // Zero out r4
+    eor     r4, r4, r4
+
+    // r1 contains the address of the string to count.
+.L_strlen_start:
+    mov     r0, r1
+
+    ands    r3, r0, #7
+    bne     .L_align_src
+
+    .p2align 2
+.L_mainloop:
+    ldmia   r1!, {r2, r3}
+
+    pld     [r1, #64]
+
+    sub     ip, r2, #0x01010101
+    bic     ip, ip, r2
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_first_register
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
+
+.L_zero_in_first_register:
+    sub     r3, r1, r0
+    // Check for zero in byte 0.
+    lsls    r2, ip, #17
+    beq     .L_check_byte1_reg1
+
+    sub     r3, r3, #8
+    b       .L_finish
+
+.L_check_byte1_reg1:
+    bcc     .L_check_byte2_reg1
+
+    sub     r3, r3, #7
+    b       .L_finish
+
+.L_check_byte2_reg1:
+    // Check for zero in byte 2.
+    tst     ip, #0x800000
+    it      ne
+    subne   r3, r3, #6
+    bne     .L_finish
+    sub     r3, r3, #5
+    b       .L_finish
+
+.L_zero_in_second_register:
+    sub     r3, r1, r0
+    // Check for zero in byte 0.
+    lsls    r2, ip, #17
+    beq     .L_check_byte1_reg2
+
+    sub     r3, r3, #4
+    b       .L_finish
+
+.L_check_byte1_reg2:
+    bcc     .L_check_byte2_reg2
+
+    sub     r3, r3, #3
+    b       .L_finish
+
+.L_check_byte2_reg2:
+    // Check for zero in byte 2.
+    tst     ip, #0x800000
+    it      ne
+    subne   r3, r3, #2
+    bne     .L_finish
+    sub     r3, r3, #1
+    b       .L_finish
+
+.L_align_src:
+    // Align to a double word (64 bits).
+    rsb     r3, r3, #8
+    lsls    ip, r3, #31
+    beq     .L_align_to_32
+
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_done
+
+.L_align_to_32:
+    bcc     .L_align_to_64
+
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_done
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_done
+
+.L_align_to_64:
+    tst     r3, #4
+    beq     .L_mainloop
+    ldr     r2, [r1], #4
+
+    sub     ip, r2, #0x01010101
+    bic     ip, ip, r2
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
+
+.L_done:
+    sub     r3, r1, r0
+    sub     r3, r3, #1
+
+.L_finish:
+    cmp     r4, #0
+    bne     .L_strlen_done
+
+    // Time to get the dst string length.
+    mov     r1, r5
+
+    // Save the original source address to r5.
+    mov     r5, r0
+
+    // Save the current length (adding 1 for the terminator).
+    add     r4, r3, #1
+    b       .L_strlen_start
+
+    // r0 holds the pointer to the dst string.
+    // r3 holds the dst string length.
+    // r4 holds the src string length + 1.
+.L_strlen_done:
+    add     r2, r3, r4
+    cmp     r2, lr
+    bgt     .L_fortify_check_failed
+
+    // Set up the registers for the memcpy code.
+    mov     r1, r5
+    pld     [r1, #64]
+    mov     r2, r4
+    add     r0, r0, r3
+    pop     {r4, r5}
+    .cfi_adjust_cfa_offset -8
+    .cfi_restore r4
+    .cfi_restore r5
+
+    #include "memcpy_base.S"
+
+.L_fortify_check_failed:
+    .cfi_adjust_cfa_offset 8
+    .cfi_rel_offset r4, 0
+    .cfi_rel_offset r5, 4
+
+    ldr     r0, error_message
+    ldr     r1, error_code
+1:
+    add     r0, pc
+    bl      __fortify_chk_fail
+error_code:
+    .word   BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
+error_message:
+    .word   error_string-(1b+4)
+
+    .cfi_endproc
+END(__strcat_chk)
+
+    .data
+error_string:
+    .string "strcat buffer overflow"
--- a/libc/arch-arm/cortex-a9/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/cortex-a9/bionic/__strcpy_chk.S
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include "libc_events.h"
+
+    .syntax unified
+    .fpu    neon
+    .thumb
+    .thumb_func
+
+// Get the length of the source string first, then do a memcpy of the data
+// instead of a strcpy.
+ENTRY(__strcpy_chk)
+    .cfi_startproc
+    pld     [r0, #0]
+    push    {r0, lr}
+    .cfi_def_cfa_offset 8
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset lr, 4
+
+    mov     lr, r2
+    mov     r0, r1
+
+    ands    r3, r0, #7
+    bne     .L_align_src
+
+    .p2align 2
+.L_mainloop:
+    ldmia   r0!, {r2, r3}
+
+    pld     [r0, #64]
+
+    sub     ip, r2, #0x01010101
+    bic     ip, ip, r2
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_first_register
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
+
+.L_zero_in_first_register:
+    sub     r3, r0, r1
+    // Check for zero in byte 0.
+    lsls    r2, ip, #17
+    beq     .L_check_byte1_reg1
+
+    sub     r3, r3, #8
+    b       .L_check_size
+
+.L_check_byte1_reg1:
+    bcc     .L_check_byte2_reg1
+
+    sub     r3, r3, #7
+    b       .L_check_size
+
+.L_check_byte2_reg1:
+    // Check for zero in byte 2.
+    tst     ip, #0x800000
+    it      ne
+    subne   r3, r3, #6
+    bne     .L_check_size
+    sub     r3, r3, #5
+    b       .L_check_size
+
+.L_zero_in_second_register:
+    sub     r3, r0, r1
+    // Check for zero in byte 0.
+    lsls    r2, ip, #17
+    beq     .L_check_byte1_reg2
+
+    sub     r3, r3, #4
+    b       .L_check_size
+
+.L_check_byte1_reg2:
+    bcc     .L_check_byte2_reg2
+
+    sub     r3, r3, #3
+    b       .L_check_size
+
+.L_check_byte2_reg2:
+    // Check for zero in byte 2.
+    tst     ip, #0x800000
+    it      ne
+    subne   r3, r3, #2
+    bne     .L_check_size
+    sub     r3, r3, #1
+    b       .L_check_size
+
+.L_align_src:
+    // Align to a double word (64 bits).
+    rsb     r3, r3, #8
+    lsls    ip, r3, #31
+    beq     .L_align_to_32
+
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_done
+
+.L_align_to_32:
+    bcc     .L_align_to_64
+
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_done
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_done
+
+.L_align_to_64:
+    tst     r3, #4
+    beq     .L_mainloop
+    ldr     r2, [r0], #4
+
+    sub     ip, r2, #0x01010101
+    bic     ip, ip, r2
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
+
+.L_done:
+    sub     r3, r0, r1
+    sub     r3, r3, #1
+
+.L_check_size:
+    pld     [r1, #0]
+    pld     [r1, #64]
+    ldr     r0, [sp]
+    cmp     r3, lr
+    bge     .L_fortify_check_failed
+
+    // Add 1 for copy length to get the string terminator.
+    add     r2, r3, #1
+
+    #include "memcpy_base.S"
+
+.L_fortify_check_failed:
+    ldr     r0, error_message
+    ldr     r1, error_code
+1:
+    add     r0, pc
+    bl      __fortify_chk_fail
+error_code:
+    .word   BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
+error_message:
+    .word   error_string-(1b+4)
+
+    .cfi_endproc
+END(__strcpy_chk)
+
+    .data
+error_string:
+    .string "strcpy buffer overflow"
--- a/libc/arch-arm/cortex-a9/bionic/memcpy.S
+++ b/libc/arch-arm/cortex-a9/bionic/memcpy.S
@@ -26,7 +26,6 @@
 * SUCH DAMAGE.
 */

-#include <machine/cpu-features.h>
 #include <machine/asm.h>
 #include "libc_events.h"

@@ -36,190 +35,40 @@
 * cache line.
 */

-        .text
+        .syntax unified
        .fpu    neon
-
-#define CACHE_LINE_SIZE     32
+        .thumb
+        .thumb_func

 ENTRY(__memcpy_chk)
+        .cfi_startproc
        cmp         r2, r3
-        bgt         fortify_check_failed
+        bgt         __memcpy_chk_fail

        // Fall through to memcpy...
+        .cfi_endproc
 END(__memcpy_chk)

 ENTRY(memcpy)
-        .save       {r0, lr}
-        /* start preloading as early as possible */
-        pld         [r1, #(CACHE_LINE_SIZE * 0)]
+        .cfi_startproc
+        pld     [r1, #0]
        stmfd   sp!, {r0, lr}
-        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        .cfi_def_cfa_offset 8
+        .cfi_rel_offset r0, 0
+        .cfi_rel_offset lr, 4
+        pld     [r1, #64]

-        // Check so divider is at least 16 bytes, needed for alignment code.
-        cmp         r2, #16
-        blo         5f
+        #include "memcpy_base.S"
+        .cfi_endproc
+END(memcpy)

+        .cfi_startproc
+__memcpy_chk_fail:
+        // Preserve lr for backtrace.
+        push    {lr}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset lr, 0

-        /* check if buffers are aligned. If so, run arm-only version */
-        eor         r3, r0, r1
-        ands        r3, r3, #0x3
-        beq         11f
-
-        /* Check the upper size limit for Neon unaligned memory access in memcpy */
-        cmp         r2, #224
-        blo         3f
-
-        /* align destination to 16 bytes for the write-buffer */
-        rsb         r3, r0, #0
-        ands        r3, r3, #0xF
-        beq         3f
-
-        /* copy up to 15-bytes (count in r3) */
-        sub         r2, r2, r3
-        movs        ip, r3, lsl #31
-        ldrmib      lr, [r1], #1
-        strmib      lr, [r0], #1
-        ldrcsb      ip, [r1], #1
-        ldrcsb      lr, [r1], #1
-        strcsb      ip, [r0], #1
-        strcsb      lr, [r0], #1
-        movs        ip, r3, lsl #29
-        bge         1f
-        // copies 4 bytes, destination 32-bits aligned
-        vld1.32     {d0[0]}, [r1]!
-        vst1.32     {d0[0]}, [r0, :32]!
-1:      bcc         2f
-        // copies 8 bytes, destination 64-bits aligned
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [r0, :64]!
-2:
-        /* preload immediately the next cache line, which we may need */
-        pld         [r1, #(CACHE_LINE_SIZE * 0)]
-        pld         [r1, #(CACHE_LINE_SIZE * 2)]
-3:
-        /* make sure we have at least 64 bytes to copy */
-        subs        r2, r2, #64
-        blo         2f
-
-        /* preload all the cache lines we need */
-        pld         [r1, #(CACHE_LINE_SIZE * 4)]
-        pld         [r1, #(CACHE_LINE_SIZE * 6)]
-
-1:      /* The main loop copies 64 bytes at a time */
-        vld1.8      {d0 - d3}, [r1]!
-        vld1.8      {d4 - d7}, [r1]!
-        pld         [r1, #(CACHE_LINE_SIZE * 6)]
-        subs        r2, r2, #64
-        vst1.8      {d0 - d3}, [r0]!
-        vst1.8      {d4 - d7}, [r0]!
-        bhs         1b
-
-2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
-        add         r2, r2, #64
-        subs        r2, r2, #32
-        blo         4f
-
-3:      /* 32 bytes at a time. These cache lines were already preloaded */
-        vld1.8      {d0 - d3}, [r1]!
-        subs        r2, r2, #32
-        vst1.8      {d0 - d3}, [r0]!
-        bhs         3b
-
-4:      /* less than 32 left */
-        add         r2, r2, #32
-        tst         r2, #0x10
-        beq         5f
-        // copies 16 bytes, 128-bits aligned
-        vld1.8      {d0, d1}, [r1]!
-        vst1.8      {d0, d1}, [r0]!
-5:      /* copy up to 15-bytes (count in r2) */
-        movs        ip, r2, lsl #29
-        bcc         1f
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [r0]!
-1:      bge         2f
-        vld1.32     {d0[0]}, [r1]!
-        vst1.32     {d0[0]}, [r0]!
-2:      movs        ip, r2, lsl #31
-        ldrmib      r3, [r1], #1
-        ldrcsb      ip, [r1], #1
-        ldrcsb      lr, [r1], #1
-        strmib      r3, [r0], #1
-        strcsb      ip, [r0], #1
-        strcsb      lr, [r0], #1
-
-        ldmfd       sp!, {r0, lr}
-        bx          lr
-11:
-        /* Simple arm-only copy loop to handle aligned copy operations */
-        stmfd       sp!, {r4, r5, r6, r7, r8}
-        pld         [r1, #(CACHE_LINE_SIZE * 4)]
-
-        /* Check alignment */
-        rsb         r3, r1, #0
-        ands        r3, #3
-        beq         2f
-
-        /* align source to 32 bits. We need to insert 2 instructions between
-         * a ldr[b|h] and str[b|h] because byte and half-word instructions
-         * stall 2 cycles.
-         */
-        movs        r12, r3, lsl #31
-        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
-        ldrmib      r3, [r1], #1
-        ldrcsb      r4, [r1], #1
-        ldrcsb      r5, [r1], #1
-        strmib      r3, [r0], #1
-        strcsb      r4, [r0], #1
-        strcsb      r5, [r0], #1
-
-2:
-        subs        r2, r2, #64
-        blt         4f
-
-3:      /* Main copy loop, copying 64 bytes at a time */
-        pld         [r1, #(CACHE_LINE_SIZE * 8)]
-        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
-        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
-        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
-        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
-        subs        r2, r2, #64
-        bge         3b
-
-4:      /* Check if there are > 32 bytes left */
-        adds        r2, r2, #64
-        subs        r2, r2, #32
-        blt         5f
-
-        /* Copy 32 bytes */
-        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
-        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
-        subs        r2, #32
-
-5:      /* Handle any remaining bytes */
-        adds        r2, #32
-        beq         6f
-
-        movs        r12, r2, lsl #28
-        ldmcsia     r1!, {r3, r4, r5, r6}   /* 16 bytes */
-        ldmmiia     r1!, {r7, r8}           /*  8 bytes */
-        stmcsia     r0!, {r3, r4, r5, r6}
-        stmmiia     r0!, {r7, r8}
-        movs        r12, r2, lsl #30
-        ldrcs       r3, [r1], #4            /*  4 bytes */
-        ldrmih      r4, [r1], #2            /*  2 bytes */
-        strcs       r3, [r0], #4
-        strmih      r4, [r0], #2
-        tst         r2, #0x1
-        ldrneb      r3, [r1]                /*  last byte  */
-        strneb      r3, [r0]
-6:
-        ldmfd       sp!, {r4, r5, r6, r7, r8}
-        ldmfd       sp!, {r0, pc}
-
-
-        // Only reached when the __memcpy_chk check fails.
-fortify_check_failed:
        ldr     r0, error_message
        ldr     r1, error_code
 1:
@@ -228,8 +77,8 @@ fortify_check_failed:
 error_code:
        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
 error_message:
-        .word   error_string-(1b+8)
-END(memcpy)
+        .word   error_string-(1b+4)
+        .cfi_endproc

        .data
 error_string:
--- a/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This code assumes it is running on a processor that supports all arm v7
+ * instructions, that supports neon instructions, and that has a 32 byte
+ * cache line.
+ */
+
+        // Check so divider is at least 16 bytes, needed for alignment code.
+        cmp         r2, #16
+        blo         5f
+
+
+        /* check if buffers are aligned. If so, run arm-only version */
+        eor         r3, r0, r1
+        ands        r3, r3, #0x3
+        beq         11f
+
+        /* Check the upper size limit for Neon unaligned memory access in memcpy */
+        cmp         r2, #224
+        blo         3f
+
+        /* align destination to 16 bytes for the write-buffer */
+        rsb         r3, r0, #0
+        ands        r3, r3, #0xF
+        beq         3f
+
+        /* copy up to 15-bytes (count in r3) */
+        sub         r2, r2, r3
+        movs        ip, r3, lsl #31
+        itt         mi
+        ldrbmi      lr, [r1], #1
+        strbmi      lr, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1], #1
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0], #1
+        movs        ip, r3, lsl #29
+        bge         1f
+        // copies 4 bytes, destination 32-bits aligned
+        vld1.32     {d0[0]}, [r1]!
+        vst1.32     {d0[0]}, [r0, :32]!
+1:      bcc         2f
+        // copies 8 bytes, destination 64-bits aligned
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0, :64]!
+2:
+        /* preload immediately the next cache line, which we may need */
+        pld         [r1, #0]
+        pld         [r1, #(32 * 2)]
+3:
+        /* make sure we have at least 64 bytes to copy */
+        subs        r2, r2, #64
+        blo         2f
+
+        /* preload all the cache lines we need */
+        pld         [r1, #(32 * 4)]
+        pld         [r1, #(32 * 6)]
+
+1:      /* The main loop copies 64 bytes at a time */
+        vld1.8      {d0 - d3}, [r1]!
+        vld1.8      {d4 - d7}, [r1]!
+        pld         [r1, #(32 * 6)]
+        subs        r2, r2, #64
+        vst1.8      {d0 - d3}, [r0]!
+        vst1.8      {d4 - d7}, [r0]!
+        bhs         1b
+
+2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
+        add         r2, r2, #64
+        subs        r2, r2, #32
+        blo         4f
+
+3:      /* 32 bytes at a time. These cache lines were already preloaded */
+        vld1.8      {d0 - d3}, [r1]!
+        subs        r2, r2, #32
+        vst1.8      {d0 - d3}, [r0]!
+        bhs         3b
+
+4:      /* less than 32 left */
+        add         r2, r2, #32
+        tst         r2, #0x10
+        beq         5f
+        // copies 16 bytes, 128-bits aligned
+        vld1.8      {d0, d1}, [r1]!
+        vst1.8      {d0, d1}, [r0]!
+5:      /* copy up to 15-bytes (count in r2) */
+        movs        ip, r2, lsl #29
+        bcc         1f
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0]!
+1:      bge         2f
+        vld1.32     {d0[0]}, [r1]!
+        vst1.32     {d0[0]}, [r0]!
+2:      movs        ip, r2, lsl #31
+        itt         mi
+        ldrbmi      r3, [r1], #1
+        strbmi      r3, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1], #1
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0], #1
+
+        ldmfd       sp!, {r0, lr}
+        bx          lr
+11:
+        /* Simple arm-only copy loop to handle aligned copy operations */
+        stmfd       sp!, {r4, r5, r6, r7, r8}
+        pld         [r1, #(32 * 4)]
+
+        /* Check alignment */
+        rsb         r3, r1, #0
+        ands        r3, #3
+        beq         2f
+
+        /* align source to 32 bits. We need to insert 2 instructions between
+         * a ldr[b|h] and str[b|h] because byte and half-word instructions
+         * stall 2 cycles.
+         */
+        movs        r12, r3, lsl #31
+        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
+        itt         mi
+        ldrbmi      r3, [r1], #1
+        strbmi      r3, [r0], #1
+        itttt       cs
+        ldrbcs      r4, [r1], #1
+        ldrbcs      r5, [r1], #1
+        strbcs      r4, [r0], #1
+        strbcs      r5, [r0], #1
+
+2:
+        subs        r2, r2, #64
+        blt         4f
+
+3:      /* Main copy loop, copying 64 bytes at a time */
+        pld         [r1, #(32 * 8)]
+        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
+        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
+        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
+        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
+        subs        r2, r2, #64
+        bge         3b
+
+4:      /* Check if there are > 32 bytes left */
+        adds        r2, r2, #64
+        subs        r2, r2, #32
+        blt         5f
+
+        /* Copy 32 bytes */
+        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
+        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
+        subs        r2, #32
+
+5:      /* Handle any remaining bytes */
+        adds        r2, #32
+        beq         6f
+
+        movs        r12, r2, lsl #28
+        itt         cs
+        ldmiacs     r1!, {r3, r4, r5, r6}   /* 16 bytes */
+        stmiacs     r0!, {r3, r4, r5, r6}
+        itt         mi
+        ldmiami     r1!, {r7, r8}           /*  8 bytes */
+        stmiami     r0!, {r7, r8}
+        movs        r12, r2, lsl #30
+        itt         cs
+        ldrcs       r3, [r1], #4            /*  4 bytes */
+        strcs       r3, [r0], #4
+        itt         mi
+        ldrhmi      r4, [r1], #2            /*  2 bytes */
+        strhmi      r4, [r0], #2
+        tst         r2, #0x1
+        itt         ne
+        ldrbne      r3, [r1]                /*  last byte  */
+        strbne      r3, [r0]
+6:
+        ldmfd       sp!, {r4, r5, r6, r7, r8}
+        ldmfd       sp!, {r0, pc}
--- a/libc/arch-arm/cortex-a9/bionic/memset.S
+++ b/libc/arch-arm/cortex-a9/bionic/memset.S
@@ -38,8 +38,14 @@
    .fpu    neon

 ENTRY(__memset_chk)
+        .cfi_startproc
        cmp         r2, r3
-        bls         done
+        bls         .L_done
+
+        // Preserve lr for backtrace.
+        push        {lr}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset lr, 0

        ldr         r0, error_message
        ldr         r1, error_code
@@ -51,24 +57,29 @@ error_code:
 error_message:
        .word       error_string-(1b+8)

+        .cfi_endproc
 END(__memset_chk)

 ENTRY(bzero)
+        .cfi_startproc
        mov     r2, r1
        mov     r1, #0

-done:
+.L_done:
        // Fall through to memset...
+        .cfi_endproc
 END(bzero)

 /* memset() returns its first argument.  */
 ENTRY(memset)
+        .cfi_startproc
        # The neon memset only wins for less than 132.
        cmp         r2, #132
        bhi         11f

-        .save       {r0}
        stmfd       sp!, {r0}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset r0, 0

        vdup.8      q0, r1

@@ -106,8 +117,15 @@ ENTRY(memset)
         * offset = (4-(src&3))&3 = -src & 3
         */

-        .save       {r0, r4-r7, lr}
        stmfd       sp!, {r0, r4-r7, lr}
+        .cfi_def_cfa_offset 24
+        .cfi_rel_offset r0, 0
+        .cfi_rel_offset r4, 4
+        .cfi_rel_offset r5, 8
+        .cfi_rel_offset r6, 12
+        .cfi_rel_offset r7, 16
+        .cfi_rel_offset lr, 20
+
        rsb         r3, r0, #0
        ands        r3, r3, #3
        cmp         r3, r2
@@ -169,6 +187,7 @@ ENTRY(memset)
        strcsb      r1, [r0]
        ldmfd       sp!, {r0, r4-r7, lr}
        bx          lr
+        .cfi_endproc
 END(memset)

        .data
--- a/libc/arch-arm/cortex-a9/cortex-a9.mk
+++ b/libc/arch-arm/cortex-a9/cortex-a9.mk
@@ -4,5 +4,7 @@ $(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a9/bionic/strcat.S)
 $(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a9/bionic/strcmp.S)
 $(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a9/bionic/strcpy.S)
 $(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a9/bionic/strlen.S)
+$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/cortex-a9/bionic/__strcat_chk.S)
+$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/cortex-a9/bionic/__strcpy_chk.S)

 include bionic/libc/arch-arm/generic/generic.mk
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -4,3 +4,5 @@ $(call libc-add-cpu-variant-src,STRCAT,string/strcat.c)
 $(call libc-add-cpu-variant-src,STRCMP,arch-arm/generic/bionic/strcmp.S)
 $(call libc-add-cpu-variant-src,STRCPY,arch-arm/generic/bionic/strcpy.S)
 $(call libc-add-cpu-variant-src,STRLEN,arch-arm/generic/bionic/strlen.c)
+$(call libc-add-cpu-variant-src,__STRCAT_CHK,bionic/__strcat_chk.cpp)
+$(call libc-add-cpu-variant-src,__STRCPY_CHK,bionic/__strcpy_chk.cpp)
--- a/libc/arch-arm/krait/bionic/__strcat_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcat_chk.S
@@ -0,0 +1,215 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include "libc_events.h"
+
+    .syntax unified
+
+    .thumb
+    .thumb_func
+
+// Get the length of src string, then get the source of the dst string.
+// Check that the two lengths together don't exceed the threshold, then
+// do a memcpy of the data.
+ENTRY(__strcat_chk)
+    .cfi_startproc
+    pld     [r0, #0]
+    push    {r0, lr}
+    .cfi_def_cfa_offset 8
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset lr, 4
+    push    {r4, r5}
+    .cfi_adjust_cfa_offset 8
+    .cfi_rel_offset r4, 0
+    .cfi_rel_offset r5, 4
+
+    mov     lr, r2
+
+    // Save the dst register to r5
+    mov     r5, r0
+
+    // Zero out r4
+    eor     r4, r4, r4
+
+    // r1 contains the address of the string to count.
+.L_strlen_start:
+    mov     r0, r1
+    ands    r3, r1, #7
+    beq     .L_mainloop
+
+    // Align to a double word (64 bits).
+    rsb     r3, r3, #8
+    lsls    ip, r3, #31
+    beq     .L_align_to_32
+
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_32:
+    bcc     .L_align_to_64
+    ands    ip, r3, #2
+    beq     .L_align_to_64
+
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_update_count_and_finish
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_64:
+    tst     r3, #4
+    beq     .L_mainloop
+    ldr     r3, [r1], #4
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+
+    .p2align 2
+.L_mainloop:
+    ldrd    r2, r3, [r1], #8
+
+    pld     [r1, #64]
+
+    sub     ip, r2, #0x01010101
+    bic     ip, ip, r2
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_first_register
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
+
+.L_update_count_and_finish:
+    sub     r3, r1, r0
+    sub     r3, r3, #1
+    b       .L_finish
+
+.L_zero_in_first_register:
+    sub     r3, r1, r0
+    lsls    r2, ip, #17
+    bne     .L_sub8_and_finish
+    bcs     .L_sub7_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub6_and_finish
+
+    sub     r3, r3, #5
+    b       .L_finish
+
+.L_sub8_and_finish:
+    sub     r3, r3, #8
+    b       .L_finish
+
+.L_sub7_and_finish:
+    sub     r3, r3, #7
+    b       .L_finish
+
+.L_sub6_and_finish:
+    sub     r3, r3, #6
+    b       .L_finish
+
+.L_zero_in_second_register:
+    sub     r3, r1, r0
+    lsls    r2, ip, #17
+    bne     .L_sub4_and_finish
+    bcs     .L_sub3_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub2_and_finish
+
+    sub     r3, r3, #1
+    b       .L_finish
+
+.L_sub4_and_finish:
+    sub     r3, r3, #4
+    b       .L_finish
+
+.L_sub3_and_finish:
+    sub     r3, r3, #3
+    b       .L_finish
+
+.L_sub2_and_finish:
+    sub     r3, r3, #2
+
+.L_finish:
+    cmp     r4, #0
+    bne     .L_strlen_done
+
+    // Time to get the dst string length.
+    mov     r1, r5
+
+    // Save the original source address to r5.
+    mov     r5, r0
+
+    // Save the current length (adding 1 for the terminator).
+    add     r4, r3, #1
+    b       .L_strlen_start
+
+    // r0 holds the pointer to the dst string.
+    // r3 holds the dst string length.
+    // r4 holds the src string length + 1.
+.L_strlen_done:
+    add     r2, r3, r4
+    cmp     r2, lr
+    bgt     .L_fortify_check_failed
+
+    // Set up the registers for the memcpy code.
+    mov     r1, r5
+    pld     [r1, #64]
+    mov     r2, r4
+    add     r0, r0, r3
+    pop     {r4, r5}
+    .cfi_adjust_cfa_offset -8
+    .cfi_restore r4
+    .cfi_restore r5
+
+    #include "memcpy_base.S"
+
+.L_fortify_check_failed:
+    .cfi_adjust_cfa_offset 8
+    .cfi_rel_offset r4, 0
+    .cfi_rel_offset r5, 4
+
+    ldr     r0, error_message
+    ldr     r1, error_code
+1:
+    add     r0, pc
+    bl      __fortify_chk_fail
+error_code:
+    .word   BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
+error_message:
+    .word   error_string-(1b+4)
+
+    .cfi_endproc
+END(__strcat_chk)
+
+    .data
+error_string:
+    .string "strcat buffer overflow"
--- a/libc/arch-arm/krait/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S
@@ -0,0 +1,175 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include "libc_events.h"
+
+    .syntax unified
+
+    .thumb
+    .thumb_func
+
+// Get the length of the source string first, then do a memcpy of the data
+// instead of a strcpy.
+ENTRY(__strcpy_chk)
+    .cfi_startproc
+    pld     [r0, #0]
+    push    {r0, lr}
+    .cfi_def_cfa_offset 8
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset lr, 4
+
+    mov     lr, r2
+    mov     r0, r1
+
+    ands    r3, r1, #7
+    beq     .L_mainloop
+
+    // Align to a double word (64 bits).
+    rsb     r3, r3, #8
+    lsls    ip, r3, #31
+    beq     .L_align_to_32
+
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_32:
+    bcc     .L_align_to_64
+    ands    ip, r3, #2
+    beq     .L_align_to_64
+
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_update_count_and_finish
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_64:
+    tst     r3, #4
+    beq     .L_mainloop
+    ldr     r3, [r0], #4
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+
+    .p2align 2
+.L_mainloop:
+    ldrd    r2, r3, [r0], #8
+
+    pld     [r0, #64]
+
+    sub     ip, r2, #0x01010101
+    bic     ip, ip, r2
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_first_register
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
+
+.L_update_count_and_finish:
+    sub     r3, r0, r1
+    sub     r3, r3, #1
+    b       .L_check_size
+
+.L_zero_in_first_register:
+    sub     r3, r0, r1
+    lsls    r2, ip, #17
+    bne     .L_sub8_and_finish
+    bcs     .L_sub7_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub6_and_finish
+
+    sub     r3, r3, #5
+    b       .L_check_size
+
+.L_sub8_and_finish:
+    sub     r3, r3, #8
+    b       .L_check_size
+
+.L_sub7_and_finish:
+    sub     r3, r3, #7
+    b       .L_check_size
+
+.L_sub6_and_finish:
+    sub     r3, r3, #6
+    b       .L_check_size
+
+.L_zero_in_second_register:
+    sub     r3, r0, r1
+    lsls    r2, ip, #17
+    bne     .L_sub4_and_finish
+    bcs     .L_sub3_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub2_and_finish
+
+    sub     r3, r3, #1
+    b       .L_check_size
+
+.L_sub4_and_finish:
+    sub     r3, r3, #4
+    b       .L_check_size
+
+.L_sub3_and_finish:
+    sub     r3, r3, #3
+    b       .L_check_size
+
+.L_sub2_and_finish:
+    sub     r3, r3, #2
+
+.L_check_size:
+    pld     [r1, #0]
+    pld     [r1, #64]
+    ldr     r0, [sp]
+    cmp     r3, lr
+    bge     .L_fortify_check_failed
+
+    // Add 1 for copy length to get the string terminator.
+    add     r2, r3, #1
+
+    #include "memcpy_base.S"
+
+.L_fortify_check_failed:
+    ldr     r0, error_message
+    ldr     r1, error_code
+1:
+    add     r0, pc
+    bl      __fortify_chk_fail
+error_code:
+    .word   BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
+error_message:
+    .word   error_string-(1b+4)
+    .cfi_endproc
+END(__strcpy_chk)
+
+    .data
+error_string:
+    .string "strcpy buffer overflow"
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -28,7 +28,6 @@

 /* Assumes neon instructions and a cache line size of 32 bytes. */

-#include <machine/cpu-features.h>
 #include <machine/asm.h>
 #include "libc_events.h"

@@ -38,103 +37,40 @@
 * cache line.
 */

-#define CACHE_LINE_SIZE     32
-
        .text
+        .syntax unified
        .fpu    neon
+        .thumb
+        .thumb_func

 ENTRY(__memcpy_chk)
+        .cfi_startproc
        cmp         r2, r3
-        bgt         fortify_check_failed
+        bgt         __memcpy_chk_fail

        // Fall through to memcpy...
+        .cfi_endproc
 END(__memcpy_chk)

 ENTRY(memcpy)
-        .save       {r0, lr}
-        /* start preloading as early as possible */
-        pld         [r1, #(CACHE_LINE_SIZE*4)]
+        .cfi_startproc
+        pld     [r1, #64]
        stmfd   sp!, {r0, lr}
+        .cfi_def_cfa_offset 8
+        .cfi_rel_offset r0, 0
+        .cfi_rel_offset lr, 4

-        /* do we have at least 16-bytes to copy (needed for alignment below) */
-        cmp         r2, #16
-        blo         5f
+        #include "memcpy_base.S"
+        .cfi_endproc
+END(memcpy)

-        /* align destination to cache-line for the write-buffer */
-        rsb         r3, r0, #0
-        ands        r3, r3, #0xF
-        beq         2f
+        .cfi_startproc
+__memcpy_chk_fail:
+        // Preserve lr for backtrace.
+        push    {lr}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset lr, 0

-        /* copy up to 15-bytes (count in r3) */
-        sub         r2, r2, r3
-        movs        ip, r3, lsl #31
-        ldrmib      lr, [r1], #1
-        strmib      lr, [r0], #1
-        ldrcsb      ip, [r1], #1
-        ldrcsb      lr, [r1], #1
-        strcsb      ip, [r0], #1
-        strcsb      lr, [r0], #1
-        movs        ip, r3, lsl #29
-        bge         1f
-        // copies 4 bytes, destination 32-bits aligned
-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
-1:      bcc         2f
-        // copies 8 bytes, destination 64-bits aligned
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [r0, :64]!
-
-2:      /* make sure we have at least 64 bytes to copy */
-        subs        r2, r2, #64
-        blo         2f
-
-1:      /* The main loop copies 64 bytes at a time */
-        vld1.8      {d0  - d3},   [r1]!
-        vld1.8      {d4  - d7},   [r1]!
-        pld         [r1, #(CACHE_LINE_SIZE*2)]
-        subs        r2, r2, #64
-        vst1.8      {d0  - d3},   [r0, :128]!
-        vst1.8      {d4  - d7},   [r0, :128]!
-        bhs         1b
-
-2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
-        adds        r2, r2, #32
-        blo         4f
-
-        /* Copy 32 bytes. These cache lines were already preloaded */
-        vld1.8      {d0 - d3},  [r1]!
-        sub         r2, r2, #32
-        vst1.8      {d0 - d3},  [r0, :128]!
-
-4:      /* less than 32 left */
-        add         r2, r2, #32
-        tst         r2, #0x10
-        beq         5f
-        // copies 16 bytes, 128-bits aligned
-        vld1.8      {d0, d1}, [r1]!
-        vst1.8      {d0, d1}, [r0, :128]!
-
-5:      /* copy up to 15-bytes (count in r2) */
-        movs        ip, r2, lsl #29
-        bcc         1f
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [r0]!
-1:      bge         2f
-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-2:      movs        ip, r2, lsl #31
-        ldrmib      r3, [r1], #1
-        ldrcsb      ip, [r1], #1
-        ldrcsb      lr, [r1], #1
-        strmib      r3, [r0], #1
-        strcsb      ip, [r0], #1
-        strcsb      lr, [r0], #1
-
-        ldmfd       sp!, {r0, lr}
-        bx          lr
-
-        // Only reached when the __memcpy_chk check fails.
-fortify_check_failed:
        ldr     r0, error_message
        ldr     r1, error_code
 1:
@@ -143,8 +79,8 @@ fortify_check_failed:
 error_code:
        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
 error_message:
-        .word   error_string-(1b+8)
-END(memcpy)
+        .word   error_string-(1b+4)
+        .cfi_endproc

        .data
 error_string:
--- a/libc/arch-arm/krait/bionic/memcpy_base.S
+++ b/libc/arch-arm/krait/bionic/memcpy_base.S
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+/*
+ * This code assumes it is running on a processor that supports all arm v7
+ * instructions, that supports neon instructions, and that has a 32 byte
+ * cache line.
+ */
+
+// Assumes neon instructions and a cache line size of 32 bytes.
+
+        /* do we have at least 16-bytes to copy (needed for alignment below) */
+        cmp         r2, #16
+        blo         5f
+
+        /* align destination to cache-line for the write-buffer */
+        rsb         r3, r0, #0
+        ands        r3, r3, #0xF
+        beq         2f
+
+        /* copy up to 15-bytes (count in r3) */
+        sub         r2, r2, r3
+        movs        ip, r3, lsl #31
+        itt         mi
+        ldrbmi      lr, [r1], #1
+        strbmi      lr, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1], #1
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0], #1
+        movs        ip, r3, lsl #29
+        bge         1f
+        // copies 4 bytes, destination 32-bits aligned
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
+1:      bcc         2f
+        // copies 8 bytes, destination 64-bits aligned
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0, :64]!
+
+2:      /* make sure we have at least 64 bytes to copy */
+        subs        r2, r2, #64
+        blo         2f
+
+1:      /* The main loop copies 64 bytes at a time */
+        vld1.8      {d0  - d3},   [r1]!
+        vld1.8      {d4  - d7},   [r1]!
+        pld         [r1, #(32*2)]
+        subs        r2, r2, #64
+        vst1.8      {d0  - d3},   [r0, :128]!
+        vst1.8      {d4  - d7},   [r0, :128]!
+        bhs         1b
+
+2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
+        adds        r2, r2, #32
+        blo         4f
+
+        /* Copy 32 bytes. These cache lines were already preloaded */
+        vld1.8      {d0 - d3},  [r1]!
+        sub         r2, r2, #32
+        vst1.8      {d0 - d3},  [r0, :128]!
+
+4:      /* less than 32 left */
+        add         r2, r2, #32
+        tst         r2, #0x10
+        beq         5f
+        // copies 16 bytes, 128-bits aligned
+        vld1.8      {d0, d1}, [r1]!
+        vst1.8      {d0, d1}, [r0, :128]!
+
+5:      /* copy up to 15-bytes (count in r2) */
+        movs        ip, r2, lsl #29
+        bcc         1f
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0]!
+1:      bge         2f
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+2:      movs        ip, r2, lsl #31
+        itt         mi
+        ldrbmi      r3, [r1], #1
+        strbmi      r3, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1], #1
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0], #1
+
+        ldmfd       sp!, {r0, lr}
+        bx          lr
--- a/libc/arch-arm/krait/bionic/memset.S
+++ b/libc/arch-arm/krait/bionic/memset.S
@@ -39,8 +39,14 @@
    .fpu    neon

 ENTRY(__memset_chk)
+        .cfi_startproc
        cmp         r2, r3
-        bls         done
+        bls         .L_done
+
+        // Preserve lr for backtrace.
+        push        {lr}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset lr, 0

        ldr         r0, error_message
        ldr         r1, error_code
@@ -52,20 +58,25 @@ error_code:
 error_message:
        .word       error_string-(1b+8)

+        .cfi_endproc
 END(__memset_chk)

 ENTRY(bzero)
+        .cfi_startproc
        mov     r2, r1
        mov     r1, #0

-done:
+.L_done:
        // Fall through to memset...
+        .cfi_endproc
 END(bzero)

 /* memset() returns its first argument.  */
 ENTRY(memset)
-        .save       {r0}
+        .cfi_startproc
        stmfd       sp!, {r0}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset r0, 0

        vdup.8      q0, r1

@@ -98,6 +109,7 @@ ENTRY(memset)
        strcsb      r1, [r0], #1
        ldmfd       sp!, {r0}
        bx          lr
+        .cfi_endproc
 END(memset)

        .data
--- a/libc/arch-arm/krait/krait.mk
+++ b/libc/arch-arm/krait/krait.mk
@@ -1,6 +1,8 @@
 $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/krait/bionic/memcpy.S)
 $(call libc-add-cpu-variant-src,MEMSET,arch-arm/krait/bionic/memset.S)
 $(call libc-add-cpu-variant-src,STRCMP,arch-arm/krait/bionic/strcmp.S)
+$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/krait/bionic/__strcat_chk.S)
+$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/krait/bionic/__strcpy_chk.S)
 # Use cortex-a15 versions of strcat/strcpy/strlen.
 $(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a15/bionic/strcat.S)
 $(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a15/bionic/strcpy.S)