Merge commit 'cf052994' into manualmerge

Fixing merge conflict and pulling in all of the changes that I stupidly
marked as not requiring a merge.

Conflicts:
    libc/arch-arm/arm.mk
    libc/arch-arm/cortex-a15/bionic/memcpy.S
    libc/arch-arm/cortex-a15/bionic/memset.S
    libc/arch-arm/cortex-a9/bionic/memcpy.S
    libc/arch-arm/cortex-a9/bionic/memset.S
    libc/arch-arm/cortex-a9/cortex-a9.mk
    libc/arch-arm/krait/bionic/memcpy.S
    libc/arch-arm/krait/bionic/memset.S
    libc/arch-arm/krait/krait.mk

Change-Id: I5da280dd8700681f118719c964a4d13446db51db
Christopher Ferris 2013-08-15 19:20:43 -07:00
commit 2e544837e7
33 changed files with 4458 additions and 631 deletions


@@ -62,7 +62,6 @@ libc_common_src_files := \
string/strcspn.c \
string/strdup.c \
string/strpbrk.c \
-string/__strrchr_chk.c \
string/strsep.c \
string/strspn.c \
string/strstr.c \
@@ -181,6 +180,25 @@ libc_common_src_files := \
netbsd/nameser/ns_print.c \
netbsd/nameser/ns_samedomain.c \
+
+# Fortify implementations of libc functions.
+libc_common_src_files += \
+bionic/__fgets_chk.cpp \
+bionic/__memcpy_chk.cpp \
+bionic/__memmove_chk.cpp \
+bionic/__memset_chk.cpp \
+bionic/__strcat_chk.cpp \
+bionic/__strchr_chk.cpp \
+bionic/__strcpy_chk.cpp \
+bionic/__strlcat_chk.cpp \
+bionic/__strlcpy_chk.cpp \
+bionic/__strlen_chk.cpp \
+bionic/__strncat_chk.cpp \
+bionic/__strncpy_chk.cpp \
+bionic/__strrchr_chk.cpp \
+bionic/__umask_chk.cpp \
+bionic/__vsnprintf_chk.cpp \
+bionic/__vsprintf_chk.cpp \
libc_bionic_src_files := \
bionic/abort.cpp \
bionic/assert.cpp \
@@ -189,16 +207,12 @@ libc_bionic_src_files := \
bionic/__errno.c \
bionic/eventfd_read.cpp \
bionic/eventfd_write.cpp \
-bionic/__fgets_chk.cpp \
bionic/futimens.cpp \
bionic/getauxval.cpp \
bionic/getcwd.cpp \
bionic/libc_init_common.cpp \
bionic/libc_logging.cpp \
bionic/libgen.cpp \
-bionic/__memcpy_chk.cpp \
-bionic/__memmove_chk.cpp \
-bionic/__memset_chk.cpp \
bionic/mmap.cpp \
bionic/pthread_attr.cpp \
bionic/pthread_detach.cpp \
@@ -221,24 +235,13 @@ libc_bionic_src_files := \
bionic/signalfd.cpp \
bionic/sigwait.cpp \
bionic/statvfs.cpp \
-bionic/__strcat_chk.cpp \
-bionic/__strchr_chk.cpp \
-bionic/__strcpy_chk.cpp \
bionic/strerror.cpp \
bionic/strerror_r.cpp \
-bionic/__strlcat_chk.cpp \
-bionic/__strlcpy_chk.cpp \
-bionic/__strlen_chk.cpp \
-bionic/__strncat_chk.cpp \
-bionic/__strncpy_chk.cpp \
bionic/strsignal.cpp \
bionic/stubs.cpp \
bionic/sysconf.cpp \
bionic/tdestroy.cpp \
bionic/tmpfile.cpp \
-bionic/__umask_chk.cpp \
-bionic/__vsnprintf_chk.cpp \
-bionic/__vsprintf_chk.cpp \
bionic/wait.cpp \
bionic/wchar.cpp \
@@ -363,7 +366,6 @@ libc_common_src_files += \
bionic/memmove.c.arm \
string/bcopy.c \
string/strncmp.c \
-string/strcat.c \
string/strncat.c \
string/strncpy.c \
bionic/strchr.cpp \
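
The hunk above moves the fortify (_FORTIFY_SOURCE) wrappers out of libc_bionic_src_files and into libc_common_src_files, which lets the arm.mk change below filter out the C++ versions that now have assembler replacements. As a rough C sketch of what each __*_chk wrapper does (illustrative code and names, not bionic's actual sources): the compiler rewrites a call such as memcpy(dst, src, n) into __memcpy_chk(dst, src, n, __builtin_object_size(dst, 0)), and the wrapper refuses to overflow the destination.

    #include <stddef.h>
    #include <string.h>
    #include <stdlib.h>

    /* Sketch of the fortify check; bionic reports the overflow through
     * __fortify_chk_fail() rather than calling abort() directly. */
    static void *memcpy_chk_sketch(void *dst, const void *src, size_t count,
                                   size_t dst_len) {
        if (count > dst_len) {
            abort();
        }
        return memcpy(dst, src, count);
    }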


@@ -14,7 +14,6 @@ _LIBC_ARCH_COMMON_SRC_FILES := \
arch-arm/bionic/_setjmp.S \
arch-arm/bionic/setjmp.S \
arch-arm/bionic/sigsetjmp.S \
-arch-arm/bionic/strcpy.S \
arch-arm/bionic/syscall.S \
arch-arm/bionic/tgkill.S \
arch-arm/bionic/tkill.S \
@@ -27,6 +26,17 @@ _LIBC_ARCH_STATIC_SRC_FILES := \
_LIBC_ARCH_DYNAMIC_SRC_FILES := \
arch-arm/bionic/exidx_dynamic.c
+
+# Remove the C++ fortify function implementations for which there is an
+# arm assembler version.
+_LIBC_FORTIFY_FILES_TO_REMOVE := \
+bionic/__memcpy_chk.cpp \
+bionic/__memset_chk.cpp \
+bionic/__strcpy_chk.cpp \
+bionic/__strcat_chk.cpp \
+
+libc_common_src_files := \
+$(filter-out $(_LIBC_FORTIFY_FILES_TO_REMOVE),$(libc_common_src_files))
+
ifeq ($(strip $(wildcard bionic/libc/arch-arm/$(TARGET_CPU_VARIANT)/$(TARGET_CPU_VARIANT).mk)),)
$(error "TARGET_CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.")
endif


@@ -0,0 +1,215 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <machine/asm.h>
#include "libc_events.h"
.syntax unified
.thumb
.thumb_func
// Get the length of the src string, then get the length of the dst string.
// Check that the two lengths together don't exceed the threshold, then
// do a memcpy of the data.
ENTRY(__strcat_chk)
.cfi_startproc
pld [r0, #0]
push {r0, lr}
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
push {r4, r5}
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 0
mov lr, r2
// Save the dst register to r5
mov r5, r0
// Zero out r4
eor r4, r4, r4
// r1 contains the address of the string to count.
.L_strlen_start:
mov r0, r1
ands r3, r1, #7
beq .L_mainloop
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq .L_align_to_32
ldrb r2, [r1], #1
cbz r2, .L_update_count_and_finish
.L_align_to_32:
bcc .L_align_to_64
ands ip, r3, #2
beq .L_align_to_64
ldrb r2, [r1], #1
cbz r2, .L_update_count_and_finish
ldrb r2, [r1], #1
cbz r2, .L_update_count_and_finish
.L_align_to_64:
tst r3, #4
beq .L_mainloop
ldr r3, [r1], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
.p2align 2
.L_mainloop:
ldrd r2, r3, [r1], #8
pld [r1, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne .L_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
b .L_mainloop
.L_update_count_and_finish:
sub r3, r1, r0
sub r3, r3, #1
b .L_finish
.L_zero_in_first_register:
sub r3, r1, r0
lsls r2, ip, #17
bne .L_sub8_and_finish
bcs .L_sub7_and_finish
lsls ip, ip, #1
bne .L_sub6_and_finish
sub r3, r3, #5
b .L_finish
.L_sub8_and_finish:
sub r3, r3, #8
b .L_finish
.L_sub7_and_finish:
sub r3, r3, #7
b .L_finish
.L_sub6_and_finish:
sub r3, r3, #6
b .L_finish
.L_zero_in_second_register:
sub r3, r1, r0
lsls r2, ip, #17
bne .L_sub4_and_finish
bcs .L_sub3_and_finish
lsls ip, ip, #1
bne .L_sub2_and_finish
sub r3, r3, #1
b .L_finish
.L_sub4_and_finish:
sub r3, r3, #4
b .L_finish
.L_sub3_and_finish:
sub r3, r3, #3
b .L_finish
.L_sub2_and_finish:
sub r3, r3, #2
.L_finish:
cmp r4, #0
bne .L_strlen_done
// Time to get the dst string length.
mov r1, r5
// Save the original source address to r5.
mov r5, r0
// Save the current length (adding 1 for the terminator).
add r4, r3, #1
b .L_strlen_start
// r0 holds the pointer to the dst string.
// r3 holds the dst string length.
// r4 holds the src string length + 1.
.L_strlen_done:
add r2, r3, r4
cmp r2, lr
bgt .L_fortify_check_failed
// Set up the registers for the memcpy code.
mov r1, r5
pld [r1, #64]
mov r2, r4
add r0, r0, r3
pop {r4, r5}
.cfi_adjust_cfa_offset -8
.cfi_restore r4
.cfi_restore r5
#include "memcpy_base.S"
.L_fortify_check_failed:
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
.cfi_endproc
END(__strcat_chk)
.data
error_string:
.string "strcat buffer overflow"


@@ -0,0 +1,176 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <machine/asm.h>
#include "libc_events.h"
.syntax unified
.thumb
.thumb_func
// Get the length of the source string first, then do a memcpy of the data
// instead of a strcpy.
ENTRY(__strcpy_chk)
.cfi_startproc
pld [r0, #0]
push {r0, lr}
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
mov lr, r2
mov r0, r1
ands r3, r1, #7
beq .L_mainloop
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq .L_align_to_32
ldrb r2, [r0], #1
cbz r2, .L_update_count_and_finish
.L_align_to_32:
bcc .L_align_to_64
ands ip, r3, #2
beq .L_align_to_64
ldrb r2, [r0], #1
cbz r2, .L_update_count_and_finish
ldrb r2, [r0], #1
cbz r2, .L_update_count_and_finish
.L_align_to_64:
tst r3, #4
beq .L_mainloop
ldr r3, [r0], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
.p2align 2
.L_mainloop:
ldrd r2, r3, [r0], #8
pld [r0, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne .L_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
b .L_mainloop
.L_update_count_and_finish:
sub r3, r0, r1
sub r3, r3, #1
b .L_check_size
.L_zero_in_first_register:
sub r3, r0, r1
lsls r2, ip, #17
bne .L_sub8_and_finish
bcs .L_sub7_and_finish
lsls ip, ip, #1
bne .L_sub6_and_finish
sub r3, r3, #5
b .L_check_size
.L_sub8_and_finish:
sub r3, r3, #8
b .L_check_size
.L_sub7_and_finish:
sub r3, r3, #7
b .L_check_size
.L_sub6_and_finish:
sub r3, r3, #6
b .L_check_size
.L_zero_in_second_register:
sub r3, r0, r1
lsls r2, ip, #17
bne .L_sub4_and_finish
bcs .L_sub3_and_finish
lsls ip, ip, #1
bne .L_sub2_and_finish
sub r3, r3, #1
b .L_check_size
.L_sub4_and_finish:
sub r3, r3, #4
b .L_check_size
.L_sub3_and_finish:
sub r3, r3, #3
b .L_check_size
.L_sub2_and_finish:
sub r3, r3, #2
.L_check_size:
pld [r1, #0]
pld [r1, #64]
ldr r0, [sp]
cmp r3, lr
bge .L_fortify_check_failed
// Add 1 for copy length to get the string terminator.
add r2, r3, #1
#include "memcpy_base.S"
.L_fortify_check_failed:
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
.cfi_endproc
END(__strcpy_chk)
.data
error_string:
.string "strcpy buffer overflow"


@@ -53,272 +53,54 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-/* Prototype: void *memcpy (void *dst, const void *src, size_t count). */
+// Prototype: void *memcpy (void *dst, const void *src, size_t count).
+
+// This version is tuned for the Cortex-A15 processor.
-#include <machine/cpu-features.h>
#include <machine/asm.h>
+#include "libc_events.h"
.text
.syntax unified
.fpu neon
-#define CACHE_LINE_SIZE 64
+ENTRY(__memcpy_chk)
+.cfi_startproc
+cmp r2, r3
+bgt __memcpy_chk_fail
+
+// Fall through to memcpy...
+.cfi_endproc
+END(__memcpy_chk)
+
ENTRY(memcpy)
+.cfi_startproc
+pld [r1, #64]
-// Assumes that n >= 0, and dst, src are valid pointers.
-// For any sizes less than 832 use the neon code that doesn't
-// care about the src alignment. This avoids any checks
-// for src alignment, and offers the best improvement since
-// smaller sized copies are dominated by the overhead of
-// the pre and post main loop.
-// For larger copies, if src and dst cannot both be aligned to
-// word boundaries, use the neon code.
-// For all other copies, align dst to a double word boundary
-// and copy using LDRD/STRD instructions.
-// Save registers (r0 holds the return value):
-// optimized push {r0, lr}.
-.save {r0, lr}
-pld [r1, #(CACHE_LINE_SIZE*16)]
push {r0, lr}
+.cfi_def_cfa_offset 8
+.cfi_rel_offset r0, 0
+.cfi_rel_offset lr, 4
+
+#include "memcpy_base.S"
+.cfi_endproc
-cmp r2, #16
-blo copy_less_than_16_unknown_align
cmp r2, #832
bge check_alignment
copy_unknown_alignment:
// Unknown alignment of src and dst.
// Assumes that the first few bytes have already been prefetched.
// Align destination to 128 bits. The mainloop store instructions
// require this alignment or they will throw an exception.
rsb r3, r0, #0
ands r3, r3, #0xF
beq 2f
// Copy up to 15 bytes (count in r3).
sub r2, r2, r3
movs ip, r3, lsl #31
itt mi
ldrbmi lr, [r1], #1
strbmi lr, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1], #1
strbcs ip, [r0], #1
strbcs lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1: bcc 2f
// Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2: // Make sure we have at least 64 bytes to copy.
subs r2, r2, #64
blo 2f
1: // The main loop copies 64 bytes at a time.
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
pld [r1, #(CACHE_LINE_SIZE*4)]
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d4 - d7}, [r0, :128]!
bhs 1b
2: // Fix-up the remaining count and make sure we have >= 32 bytes left.
adds r2, r2, #32
blo 3f
// 32 bytes. These cache lines were already preloaded.
vld1.8 {d0 - d3}, [r1]!
sub r2, r2, #32
vst1.8 {d0 - d3}, [r0, :128]!
3: // Less than 32 left.
add r2, r2, #32
tst r2, #0x10
beq copy_less_than_16_unknown_align
// Copies 16 bytes, destination 128 bits aligned.
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0, :128]!
copy_less_than_16_unknown_align:
// Copy up to 15 bytes (count in r2).
movs ip, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2: // Copy 0 to 4 bytes.
lsls r2, r2, #31
itt ne
ldrbne lr, [r1], #1
strbne lr, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1]
strbcs ip, [r0], #1
strbcs lr, [r0]
pop {r0, pc}
check_alignment:
// If src and dst cannot both be aligned to a word boundary,
// use the unaligned copy version.
eor r3, r0, r1
ands r3, r3, #0x3
bne copy_unknown_alignment
// To try and improve performance, stack layout changed,
// i.e., not keeping the stack looking like users expect
// (highest numbered register at highest address).
// TODO: Add debug frame directives.
// We don't need exception unwind directives, because the code below
// does not throw any exceptions and does not call any other functions.
// Generally, newlib functions like this lack debug information for
// assembler source.
.save {r4, r5}
strd r4, r5, [sp, #-8]!
.save {r6, r7}
strd r6, r7, [sp, #-8]!
.save {r8, r9}
strd r8, r9, [sp, #-8]!
// Optimized for already aligned dst code.
ands ip, r0, #3
bne dst_not_word_aligned
word_aligned:
// Align the destination buffer to 8 bytes, to make sure double
// loads and stores don't cross a cache line boundary,
// as they are then more expensive even if the data is in the cache
// (require two load/store issue cycles instead of one).
// If only one of the buffers is not 8 bytes aligned,
// then it's more important to align dst than src,
// because there is more penalty for stores
// than loads that cross a cacheline boundary.
// This check and realignment are only done if there is >= 832
// bytes to copy.
// Dst is word aligned, but check if it is already double word aligned.
ands r3, r0, #4
beq 1f
ldr r3, [r1], #4
str r3, [r0], #4
sub r2, #4
1: // Can only get here if > 64 bytes to copy, so don't do check r2.
sub r2, #64
2: // Every loop iteration copies 64 bytes.
.irp offset, #0, #8, #16, #24, #32
ldrd r4, r5, [r1, \offset]
strd r4, r5, [r0, \offset]
.endr
ldrd r4, r5, [r1, #40]
ldrd r6, r7, [r1, #48]
ldrd r8, r9, [r1, #56]
// Keep the pld as far from the next load as possible.
// The amount to prefetch was determined experimentally using
// large sizes, and verifying the prefetch size does not affect
// the smaller copies too much.
// WARNING: If the ldrd and strd instructions get too far away
// from each other, performance suffers. Three loads
// in a row is the best tradeoff.
pld [r1, #(CACHE_LINE_SIZE*16)]
strd r4, r5, [r0, #40]
strd r6, r7, [r0, #48]
strd r8, r9, [r0, #56]
add r0, r0, #64
add r1, r1, #64
subs r2, r2, #64
bge 2b
// Fix-up the remaining count and make sure we have >= 32 bytes left.
adds r2, r2, #32
blo 4f
// Copy 32 bytes. These cache lines were already preloaded.
.irp offset, #0, #8, #16, #24
ldrd r4, r5, [r1, \offset]
strd r4, r5, [r0, \offset]
.endr
add r1, r1, #32
add r0, r0, #32
sub r2, r2, #32
4: // Less than 32 left.
add r2, r2, #32
tst r2, #0x10
beq 5f
// Copy 16 bytes.
.irp offset, #0, #8
ldrd r4, r5, [r1, \offset]
strd r4, r5, [r0, \offset]
.endr
add r1, r1, #16
add r0, r0, #16
5: // Copy up to 15 bytes (count in r2).
movs ip, r2, lsl #29
bcc 1f
// Copy 8 bytes.
ldrd r4, r5, [r1], #8
strd r4, r5, [r0], #8
1: bge 2f
// Copy 4 bytes.
ldr r4, [r1], #4
str r4, [r0], #4
2: // Copy 0 to 4 bytes.
lsls r2, r2, #31
itt ne
ldrbne lr, [r1], #1
strbne lr, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1]
strbcs ip, [r0], #1
strbcs lr, [r0]
// Restore registers: optimized pop {r0, pc}
ldrd r8, r9, [sp], #8
ldrd r6, r7, [sp], #8
ldrd r4, r5, [sp], #8
pop {r0, pc}
dst_not_word_aligned:
// Align dst to word.
rsb ip, ip, #4
cmp ip, #2
itt gt
ldrbgt lr, [r1], #1
strbgt lr, [r0], #1
itt ge
ldrbge lr, [r1], #1
strbge lr, [r0], #1
ldrb lr, [r1], #1
strb lr, [r0], #1
sub r2, r2, ip
// Src is guaranteed to be at least word aligned by this point.
b word_aligned
END(memcpy)
.cfi_startproc
__memcpy_chk_fail:
// Preserve lr for backtrace.
push {lr}
.cfi_def_cfa_offset 4
.cfi_rel_offset lr, 0
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+8)
.cfi_endproc
.data
error_string:
.string "memcpy buffer overflow"


@@ -0,0 +1,303 @@
/*
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 2013 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Assumes that n >= 0, and dst, src are valid pointers.
// For any sizes less than 832 use the neon code that doesn't
// care about the src alignment. This avoids any checks
// for src alignment, and offers the best improvement since
// smaller sized copies are dominated by the overhead of
// the pre and post main loop.
// For larger copies, if src and dst cannot both be aligned to
// word boundaries, use the neon code.
// For all other copies, align dst to a double word boundary
// and copy using LDRD/STRD instructions.
cmp r2, #16
blo .L_copy_less_than_16_unknown_align
cmp r2, #832
bge .L_check_alignment
.L_copy_unknown_alignment:
// Unknown alignment of src and dst.
// Assumes that the first few bytes have already been prefetched.
// Align destination to 128 bits. The mainloop store instructions
// require this alignment or they will throw an exception.
rsb r3, r0, #0
ands r3, r3, #0xF
beq 2f
// Copy up to 15 bytes (count in r3).
sub r2, r2, r3
movs ip, r3, lsl #31
itt mi
ldrbmi lr, [r1], #1
strbmi lr, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1], #1
strbcs ip, [r0], #1
strbcs lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1: bcc 2f
// Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2: // Make sure we have at least 64 bytes to copy.
subs r2, r2, #64
blo 2f
1: // The main loop copies 64 bytes at a time.
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
pld [r1, #(64*4)]
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d4 - d7}, [r0, :128]!
bhs 1b
2: // Fix-up the remaining count and make sure we have >= 32 bytes left.
adds r2, r2, #32
blo 3f
// 32 bytes. These cache lines were already preloaded.
vld1.8 {d0 - d3}, [r1]!
sub r2, r2, #32
vst1.8 {d0 - d3}, [r0, :128]!
3: // Less than 32 left.
add r2, r2, #32
tst r2, #0x10
beq .L_copy_less_than_16_unknown_align
// Copies 16 bytes, destination 128 bits aligned.
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0, :128]!
.L_copy_less_than_16_unknown_align:
// Copy up to 15 bytes (count in r2).
movs ip, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2: // Copy 0 to 4 bytes.
lsls r2, r2, #31
itt ne
ldrbne lr, [r1], #1
strbne lr, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1]
strbcs ip, [r0], #1
strbcs lr, [r0]
pop {r0, pc}
.L_check_alignment:
// If src and dst cannot both be aligned to a word boundary,
// use the unaligned copy version.
eor r3, r0, r1
ands r3, r3, #0x3
bne .L_copy_unknown_alignment
// To try and improve performance, stack layout changed,
// i.e., not keeping the stack looking like users expect
// (highest numbered register at highest address).
// TODO: Add debug frame directives.
// We don't need exception unwind directives, because the code below
// does not throw any exceptions and does not call any other functions.
// Generally, newlib functions like this lack debug information for
// assembler source.
.save {r4, r5}
strd r4, r5, [sp, #-8]!
.save {r6, r7}
strd r6, r7, [sp, #-8]!
.save {r8, r9}
strd r8, r9, [sp, #-8]!
// Optimized for already aligned dst code.
ands ip, r0, #3
bne .L_dst_not_word_aligned
.L_word_aligned:
// Align the destination buffer to 8 bytes, to make sure double
// loads and stores don't cross a cache line boundary,
// as they are then more expensive even if the data is in the cache
// (require two load/store issue cycles instead of one).
// If only one of the buffers is not 8 bytes aligned,
// then it's more important to align dst than src,
// because there is more penalty for stores
// than loads that cross a cacheline boundary.
// This check and realignment are only done if there is >= 832
// bytes to copy.
// Dst is word aligned, but check if it is already double word aligned.
ands r3, r0, #4
beq 1f
ldr r3, [r1], #4
str r3, [r0], #4
sub r2, #4
1: // Can only get here if > 64 bytes to copy, so don't do check r2.
sub r2, #64
2: // Every loop iteration copies 64 bytes.
.irp offset, #0, #8, #16, #24, #32
ldrd r4, r5, [r1, \offset]
strd r4, r5, [r0, \offset]
.endr
ldrd r4, r5, [r1, #40]
ldrd r6, r7, [r1, #48]
ldrd r8, r9, [r1, #56]
// Keep the pld as far from the next load as possible.
// The amount to prefetch was determined experimentally using
// large sizes, and verifying the prefetch size does not affect
// the smaller copies too much.
// WARNING: If the ldrd and strd instructions get too far away
// from each other, performance suffers. Three loads
// in a row is the best tradeoff.
pld [r1, #(64*16)]
strd r4, r5, [r0, #40]
strd r6, r7, [r0, #48]
strd r8, r9, [r0, #56]
add r0, r0, #64
add r1, r1, #64
subs r2, r2, #64
bge 2b
// Fix-up the remaining count and make sure we have >= 32 bytes left.
adds r2, r2, #32
blo 4f
// Copy 32 bytes. These cache lines were already preloaded.
.irp offset, #0, #8, #16, #24
ldrd r4, r5, [r1, \offset]
strd r4, r5, [r0, \offset]
.endr
add r1, r1, #32
add r0, r0, #32
sub r2, r2, #32
4: // Less than 32 left.
add r2, r2, #32
tst r2, #0x10
beq 5f
// Copy 16 bytes.
.irp offset, #0, #8
ldrd r4, r5, [r1, \offset]
strd r4, r5, [r0, \offset]
.endr
add r1, r1, #16
add r0, r0, #16
5: // Copy up to 15 bytes (count in r2).
movs ip, r2, lsl #29
bcc 1f
// Copy 8 bytes.
ldrd r4, r5, [r1], #8
strd r4, r5, [r0], #8
1: bge 2f
// Copy 4 bytes.
ldr r4, [r1], #4
str r4, [r0], #4
2: // Copy 0 to 4 bytes.
lsls r2, r2, #31
itt ne
ldrbne lr, [r1], #1
strbne lr, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1]
strbcs ip, [r0], #1
strbcs lr, [r0]
// Restore registers: optimized pop {r0, pc}
ldrd r8, r9, [sp], #8
ldrd r6, r7, [sp], #8
ldrd r4, r5, [sp], #8
pop {r0, pc}
.L_dst_not_word_aligned:
// Align dst to word.
rsb ip, ip, #4
cmp ip, #2
itt gt
ldrbgt lr, [r1], #1
strbgt lr, [r0], #1
itt ge
ldrbge lr, [r1], #1
strbge lr, [r0], #1
ldrb lr, [r1], #1
strb lr, [r0], #1
sub r2, r2, ip
// Src is guaranteed to be at least word aligned by this point.
b .L_word_aligned
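
The comments at the top of this file describe a three-way dispatch: copies under 16 bytes take a short path that ignores alignment, copies under 832 bytes (or copies whose source and destination cannot both be word aligned) use the NEON loop that tolerates any source alignment, and everything else aligns the destination and moves 64 bytes per iteration with LDRD/STRD. A self-contained C sketch of just that decision, with return strings echoing the labels above (illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    static const char *memcpy_path_sketch(const void *dst, const void *src, size_t n) {
        if (n < 16) return "copy_less_than_16_unknown_align";
        if (n < 832) return "copy_unknown_alignment";      /* NEON, src alignment ignored */
        if ((((uintptr_t)dst ^ (uintptr_t)src) & 3) != 0)
            return "copy_unknown_alignment";               /* can't co-align dst and src */
        return "word_aligned";                             /* 64 bytes per LDRD/STRD iteration */
    }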


@@ -28,30 +28,59 @@
#include <machine/cpu-features.h>
#include <machine/asm.h>
+#include "libc_events.h"
/*
* Optimized memset() for ARM.
*
* memset() returns its first argument.
*/
.fpu neon
.syntax unified
+
+ENTRY(__memset_chk)
+.cfi_startproc
+cmp r2, r3
+bls .L_done
+
+// Preserve lr for backtrace.
+push {lr}
+.cfi_def_cfa_offset 4
+.cfi_rel_offset lr, 0
+
+ldr r0, error_message
+ldr r1, error_code
+1:
+add r0, pc
+bl __fortify_chk_fail
+error_code:
+.word BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
+error_message:
+.word error_string-(1b+8)
+.cfi_endproc
+END(__memset_chk)
+
ENTRY(bzero)
+.cfi_startproc
mov r2, r1
mov r1, #0
+
+.L_done:
// Fall through to memset...
+.cfi_endproc
END(bzero)
+
ENTRY(memset)
-.save {r0}
+.cfi_startproc
stmfd sp!, {r0}
+.cfi_def_cfa_offset 4
+.cfi_rel_offset r0, 0
// The new algorithm is slower for copies < 16 so use the old
// neon code in that case.
cmp r2, #16
-blo set_less_than_16_unknown_align
+blo .L_set_less_than_16_unknown_align
// Use strd which requires an even and odd register so move the
// values so that:
@@ -65,17 +94,17 @@ ENTRY(memset)
orr r1, r1, r1, lsr #8
orr r1, r1, r1, lsr #16
-check_alignment:
+.L_check_alignment:
// Align destination to a double word to avoid the strd crossing
// a cache line boundary.
ands ip, r3, #7
-bne do_double_word_align
+bne .L_do_double_word_align
-double_word_aligned:
+.L_double_word_aligned:
mov r0, r1
subs r2, #64
-blo set_less_than_64
+blo .L_set_less_than_64
1: // Main loop sets 64 bytes at a time.
.irp offset, #0, #8, #16, #24, #32, #40, #48, #56
@@ -86,39 +115,39 @@ double_word_aligned:
subs r2, #64
bge 1b
-set_less_than_64:
+.L_set_less_than_64:
// Restore r2 to the count of bytes left to set.
add r2, #64
lsls ip, r2, #27
-bcc set_less_than_32
+bcc .L_set_less_than_32
// Set 32 bytes.
.irp offset, #0, #8, #16, #24
strd r0, r1, [r3, \offset]
.endr
add r3, #32
-set_less_than_32:
-bpl set_less_than_16
+.L_set_less_than_32:
+bpl .L_set_less_than_16
// Set 16 bytes.
.irp offset, #0, #8
strd r0, r1, [r3, \offset]
.endr
add r3, #16
-set_less_than_16:
+.L_set_less_than_16:
// Less than 16 bytes to set.
lsls ip, r2, #29
-bcc set_less_than_8
+bcc .L_set_less_than_8
// Set 8 bytes.
strd r0, r1, [r3], #8
-set_less_than_8:
-bpl set_less_than_4
+.L_set_less_than_8:
+bpl .L_set_less_than_4
// Set 4 bytes
str r1, [r3], #4
-set_less_than_4:
+.L_set_less_than_4:
lsls ip, r2, #31
it ne
strbne r1, [r3], #1
@@ -129,7 +158,7 @@ set_less_than_4:
ldmfd sp!, {r0}
bx lr
-do_double_word_align:
+.L_do_double_word_align:
rsb ip, ip, #8
sub r2, r2, ip
movs r0, ip, lsl #31
@@ -141,11 +170,11 @@ do_double_word_align:
// Dst is at least word aligned by this point.
cmp ip, #4
-blo double_word_aligned
+blo .L_double_word_aligned
str r1, [r3], #4
-b double_word_aligned
+b .L_double_word_aligned
-set_less_than_16_unknown_align:
+.L_set_less_than_16_unknown_align:
// Set up to 15 bytes.
vdup.8 d0, r1
movs ip, r2, lsl #29
@@ -161,4 +190,9 @@ set_less_than_16_unknown_align:
strbcs r1, [r0], #1
ldmfd sp!, {r0}
bx lr
+.cfi_endproc
END(memset)
+
+.data
+error_string:
+.string "memset buffer overflow"


@@ -0,0 +1,568 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 2013 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/asm.h>
.syntax unified
.thumb
.thumb_func
.macro m_push
push {r0, r4, r5, lr}
.endm // m_push
.macro m_pop
pop {r0, r4, r5, pc}
.endm // m_pop
.macro m_scan_byte
ldrb r3, [r0]
cbz r3, strcat_r0_scan_done
add r0, #1
.endm // m_scan_byte
.macro m_copy_byte reg, cmd, label
ldrb \reg, [r1], #1
strb \reg, [r0], #1
\cmd \reg, \label
.endm // m_copy_byte
ENTRY(strcat)
// Quick check to see if src is empty.
ldrb r2, [r1]
pld [r1, #0]
cbnz r2, strcat_continue
bx lr
strcat_continue:
// To speed up really small dst strings, unroll checking the first 4 bytes.
m_push
m_scan_byte
m_scan_byte
m_scan_byte
m_scan_byte
ands r3, r0, #7
beq strcat_mainloop
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq strcat_align_to_32
ldrb r5, [r0]
cbz r5, strcat_r0_scan_done
add r0, r0, #1
strcat_align_to_32:
bcc strcat_align_to_64
ldrb r2, [r0]
cbz r2, strcat_r0_scan_done
add r0, r0, #1
ldrb r4, [r0]
cbz r4, strcat_r0_scan_done
add r0, r0, #1
strcat_align_to_64:
tst r3, #4
beq strcat_mainloop
ldr r3, [r0], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcat_zero_in_second_register
b strcat_mainloop
strcat_r0_scan_done:
// For short copies, hard-code checking the first 8 bytes since this
// new code doesn't win until after about 8 bytes.
m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
strcpy_finish:
m_pop
strcpy_continue:
ands r3, r0, #7
beq strcpy_check_src_align
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq strcpy_align_to_32
ldrb r2, [r1], #1
strb r2, [r0], #1
cbz r2, strcpy_complete
strcpy_align_to_32:
bcc strcpy_align_to_64
ldrb r2, [r1], #1
strb r2, [r0], #1
cbz r2, strcpy_complete
ldrb r2, [r1], #1
strb r2, [r0], #1
cbz r2, strcpy_complete
strcpy_align_to_64:
tst r3, #4
beq strcpy_check_src_align
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
str r2, [r0], #4
strcpy_check_src_align:
// At this point dst is aligned to a double word, check if src
// is also aligned to a double word.
ands r3, r1, #7
bne strcpy_unaligned_copy
.p2align 2
strcpy_mainloop:
ldrd r2, r3, [r1], #8
pld [r1, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_mainloop
strcpy_complete:
m_pop
strcpy_zero_in_first_register:
lsls lr, ip, #17
bne strcpy_copy1byte
bcs strcpy_copy2bytes
lsls ip, ip, #1
bne strcpy_copy3bytes
strcpy_copy4bytes:
// Copy 4 bytes to the destination.
str r2, [r0]
m_pop
strcpy_copy1byte:
strb r2, [r0]
m_pop
strcpy_copy2bytes:
strh r2, [r0]
m_pop
strcpy_copy3bytes:
strh r2, [r0], #2
lsr r2, #16
strb r2, [r0]
m_pop
strcpy_zero_in_second_register:
lsls lr, ip, #17
bne strcpy_copy5bytes
bcs strcpy_copy6bytes
lsls ip, ip, #1
bne strcpy_copy7bytes
// Copy 8 bytes to the destination.
strd r2, r3, [r0]
m_pop
strcpy_copy5bytes:
str r2, [r0], #4
strb r3, [r0]
m_pop
strcpy_copy6bytes:
str r2, [r0], #4
strh r3, [r0]
m_pop
strcpy_copy7bytes:
str r2, [r0], #4
strh r3, [r0], #2
lsr r3, #16
strb r3, [r0]
m_pop
strcpy_unaligned_copy:
// Dst is aligned to a double word, while src is at an unknown alignment.
// There are 7 different versions of the unaligned copy code
// to prevent overreading the src. The mainloop of every single version
// will store 64 bits per loop. The difference is how much of src can
// be read without potentially crossing a page boundary.
tbb [pc, r3]
strcpy_unaligned_branchtable:
.byte 0
.byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
.p2align 2
// Can read 7 bytes before possibly crossing a page.
strcpy_unalign7:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r3, [r1]
cbz r3, strcpy_unalign7_copy5bytes
ldrb r4, [r1, #1]
cbz r4, strcpy_unalign7_copy6bytes
ldrb r5, [r1, #2]
cbz r5, strcpy_unalign7_copy7bytes
ldr r3, [r1], #4
pld [r1, #64]
lsrs ip, r3, #24
strd r2, r3, [r0], #8
beq strcpy_unalign_return
b strcpy_unalign7
strcpy_unalign7_copy5bytes:
str r2, [r0], #4
strb r3, [r0]
strcpy_unalign_return:
m_pop
strcpy_unalign7_copy6bytes:
str r2, [r0], #4
strb r3, [r0], #1
strb r4, [r0], #1
m_pop
strcpy_unalign7_copy7bytes:
str r2, [r0], #4
strb r3, [r0], #1
strb r4, [r0], #1
strb r5, [r0], #1
m_pop
.p2align 2
// Can read 6 bytes before possibly crossing a page.
strcpy_unalign6:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r4, [r1]
cbz r4, strcpy_unalign_copy5bytes
ldrb r5, [r1, #1]
cbz r5, strcpy_unalign_copy6bytes
ldr r3, [r1], #4
pld [r1, #64]
tst r3, #0xff0000
beq strcpy_copy7bytes
lsrs ip, r3, #24
strd r2, r3, [r0], #8
beq strcpy_unalign_return
b strcpy_unalign6
.p2align 2
// Can read 5 bytes before possibly crossing a page.
strcpy_unalign5:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r4, [r1]
cbz r4, strcpy_unalign_copy5bytes
ldr r3, [r1], #4
pld [r1, #64]
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_unalign5
strcpy_unalign_copy5bytes:
str r2, [r0], #4
strb r4, [r0]
m_pop
strcpy_unalign_copy6bytes:
str r2, [r0], #4
strb r4, [r0], #1
strb r5, [r0]
m_pop
.p2align 2
// Can read 4 bytes before possibly crossing a page.
strcpy_unalign4:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldr r3, [r1], #4
pld [r1, #64]
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_unalign4
.p2align 2
// Can read 3 bytes before possibly crossing a page.
strcpy_unalign3:
ldrb r2, [r1]
cbz r2, strcpy_unalign3_copy1byte
ldrb r3, [r1, #1]
cbz r3, strcpy_unalign3_copy2bytes
ldrb r4, [r1, #2]
cbz r4, strcpy_unalign3_copy3bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
lsrs lr, r2, #24
beq strcpy_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_unalign3
strcpy_unalign3_copy1byte:
strb r2, [r0]
m_pop
strcpy_unalign3_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_pop
strcpy_unalign3_copy3bytes:
strb r2, [r0], #1
strb r3, [r0], #1
strb r4, [r0]
m_pop
.p2align 2
// Can read 2 bytes before possibly crossing a page.
strcpy_unalign2:
ldrb r2, [r1]
cbz r2, strcpy_unalign_copy1byte
ldrb r4, [r1, #1]
cbz r4, strcpy_unalign_copy2bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
tst r2, #0xff0000
beq strcpy_copy3bytes
lsrs ip, r2, #24
beq strcpy_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_unalign2
.p2align 2
// Can read 1 byte before possibly crossing a page.
strcpy_unalign1:
ldrb r2, [r1]
cbz r2, strcpy_unalign_copy1byte
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_unalign1
strcpy_unalign_copy1byte:
strb r2, [r0]
m_pop
strcpy_unalign_copy2bytes:
strb r2, [r0], #1
strb r4, [r0]
m_pop
.p2align 2
strcat_mainloop:
ldrd r2, r3, [r0], #8
pld [r0, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcat_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcat_zero_in_second_register
b strcat_mainloop
strcat_zero_in_first_register:
// Prefetch the src now, it's going to be used soon.
pld [r1, #0]
lsls lr, ip, #17
bne strcat_sub8
bcs strcat_sub7
lsls ip, ip, #1
bne strcat_sub6
sub r0, r0, #5
b strcat_r0_scan_done
strcat_sub8:
sub r0, r0, #8
b strcat_r0_scan_done
strcat_sub7:
sub r0, r0, #7
b strcat_r0_scan_done
strcat_sub6:
sub r0, r0, #6
b strcat_r0_scan_done
strcat_zero_in_second_register:
// Prefetch the src now, it's going to be used soon.
pld [r1, #0]
lsls lr, ip, #17
bne strcat_sub4
bcs strcat_sub3
lsls ip, ip, #1
bne strcat_sub2
sub r0, r0, #1
b strcat_r0_scan_done
strcat_sub4:
sub r0, r0, #4
b strcat_r0_scan_done
strcat_sub3:
sub r0, r0, #3
b strcat_r0_scan_done
strcat_sub2:
sub r0, r0, #2
b strcat_r0_scan_done
END(strcat)
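
Structurally, the strcat above runs in two phases: it first scans dst a double word at a time to find the existing terminator (strcat_mainloop, with a four-byte unrolled fast path for very short destinations), then jumps into the same copying code that strcpy uses. The equivalent logic in plain C, without the word-at-a-time tricks:

    #include <stddef.h>

    static char *strcat_sketch(char *dst, const char *src) {
        if (*src == '\0') return dst;          /* the early exit when src is empty */
        char *d = dst;
        while (*d != '\0') d++;                /* phase 1: locate the end of dst */
        while ((*d++ = *src++) != '\0') {}     /* phase 2: shared strcpy tail */
        return dst;
    }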


@@ -0,0 +1,451 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 2013 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/asm.h>
.syntax unified
.thumb
.thumb_func
.macro m_push
push {r0, r4, r5, lr}
.endm // m_push
.macro m_pop
pop {r0, r4, r5, pc}
.endm // m_pop
.macro m_copy_byte reg, cmd, label
ldrb \reg, [r1], #1
strb \reg, [r0], #1
\cmd \reg, \label
.endm // m_copy_byte
ENTRY(strcpy)
// For short copies, hard-code checking the first 8 bytes since this
// new code doesn't win until after about 8 bytes.
m_push
m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
strcpy_finish:
m_pop
strcpy_continue:
pld [r1, #0]
ands r3, r0, #7
beq strcpy_check_src_align
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq strcpy_align_to_32
ldrb r2, [r1], #1
strb r2, [r0], #1
cbz r2, strcpy_complete
strcpy_align_to_32:
bcc strcpy_align_to_64
ldrb r2, [r1], #1
strb r2, [r0], #1
cbz r2, strcpy_complete
ldrb r2, [r1], #1
strb r2, [r0], #1
cbz r2, strcpy_complete
strcpy_align_to_64:
tst r3, #4
beq strcpy_check_src_align
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
str r2, [r0], #4
strcpy_check_src_align:
// At this point dst is aligned to a double word, check if src
// is also aligned to a double word.
ands r3, r1, #7
bne strcpy_unaligned_copy
.p2align 2
strcpy_mainloop:
ldrd r2, r3, [r1], #8
pld [r1, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_mainloop
strcpy_complete:
m_pop
strcpy_zero_in_first_register:
lsls lr, ip, #17
bne strcpy_copy1byte
bcs strcpy_copy2bytes
lsls ip, ip, #1
bne strcpy_copy3bytes
strcpy_copy4bytes:
// Copy 4 bytes to the destination.
str r2, [r0]
m_pop
strcpy_copy1byte:
strb r2, [r0]
m_pop
strcpy_copy2bytes:
strh r2, [r0]
m_pop
strcpy_copy3bytes:
strh r2, [r0], #2
lsr r2, #16
strb r2, [r0]
m_pop
strcpy_zero_in_second_register:
lsls lr, ip, #17
bne strcpy_copy5bytes
bcs strcpy_copy6bytes
lsls ip, ip, #1
bne strcpy_copy7bytes
// Copy 8 bytes to the destination.
strd r2, r3, [r0]
m_pop
strcpy_copy5bytes:
str r2, [r0], #4
strb r3, [r0]
m_pop
strcpy_copy6bytes:
str r2, [r0], #4
strh r3, [r0]
m_pop
strcpy_copy7bytes:
str r2, [r0], #4
strh r3, [r0], #2
lsr r3, #16
strb r3, [r0]
m_pop
strcpy_unaligned_copy:
// Dst is aligned to a double word, while src is at an unknown alignment.
// There are 7 different versions of the unaligned copy code
// to prevent overreading the src. The mainloop of every single version
// will store 64 bits per loop. The difference is how much of src can
// be read without potentially crossing a page boundary.
tbb [pc, r3]
strcpy_unaligned_branchtable:
.byte 0
.byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
.p2align 2
// Can read 7 bytes before possibly crossing a page.
strcpy_unalign7:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r3, [r1]
cbz r3, strcpy_unalign7_copy5bytes
ldrb r4, [r1, #1]
cbz r4, strcpy_unalign7_copy6bytes
ldrb r5, [r1, #2]
cbz r5, strcpy_unalign7_copy7bytes
ldr r3, [r1], #4
pld [r1, #64]
lsrs ip, r3, #24
strd r2, r3, [r0], #8
beq strcpy_unalign_return
b strcpy_unalign7
strcpy_unalign7_copy5bytes:
str r2, [r0], #4
strb r3, [r0]
strcpy_unalign_return:
m_pop
strcpy_unalign7_copy6bytes:
str r2, [r0], #4
strb r3, [r0], #1
strb r4, [r0], #1
m_pop
strcpy_unalign7_copy7bytes:
str r2, [r0], #4
strb r3, [r0], #1
strb r4, [r0], #1
strb r5, [r0], #1
m_pop
.p2align 2
// Can read 6 bytes before possibly crossing a page.
strcpy_unalign6:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r4, [r1]
cbz r4, strcpy_unalign_copy5bytes
ldrb r5, [r1, #1]
cbz r5, strcpy_unalign_copy6bytes
ldr r3, [r1], #4
pld [r1, #64]
tst r3, #0xff0000
beq strcpy_copy7bytes
lsrs ip, r3, #24
strd r2, r3, [r0], #8
beq strcpy_unalign_return
b strcpy_unalign6
.p2align 2
// Can read 5 bytes before possibly crossing a page.
strcpy_unalign5:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r4, [r1]
cbz r4, strcpy_unalign_copy5bytes
ldr r3, [r1], #4
pld [r1, #64]
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_unalign5
strcpy_unalign_copy5bytes:
str r2, [r0], #4
strb r4, [r0]
m_pop
strcpy_unalign_copy6bytes:
str r2, [r0], #4
strb r4, [r0], #1
strb r5, [r0]
m_pop
.p2align 2
// Can read 4 bytes before possibly crossing a page.
strcpy_unalign4:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldr r3, [r1], #4
pld [r1, #64]
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_unalign4
.p2align 2
// Can read 3 bytes before possibly crossing a page.
strcpy_unalign3:
ldrb r2, [r1]
cbz r2, strcpy_unalign3_copy1byte
ldrb r3, [r1, #1]
cbz r3, strcpy_unalign3_copy2bytes
ldrb r4, [r1, #2]
cbz r4, strcpy_unalign3_copy3bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
lsrs lr, r2, #24
beq strcpy_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_unalign3
strcpy_unalign3_copy1byte:
strb r2, [r0]
m_pop
strcpy_unalign3_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_pop
strcpy_unalign3_copy3bytes:
strb r2, [r0], #1
strb r3, [r0], #1
strb r4, [r0]
m_pop
.p2align 2
// Can read 2 bytes before possibly crossing a page.
strcpy_unalign2:
ldrb r2, [r1]
cbz r2, strcpy_unalign_copy1byte
ldrb r4, [r1, #1]
cbz r4, strcpy_unalign_copy2bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
tst r2, #0xff0000
beq strcpy_copy3bytes
lsrs ip, r2, #24
beq strcpy_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_unalign2
.p2align 2
// Can read 1 byte before possibly crossing a page.
strcpy_unalign1:
ldrb r2, [r1]
cbz r2, strcpy_unalign_copy1byte
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
strd r2, r3, [r0], #8
b strcpy_unalign1
strcpy_unalign_copy1byte:
strb r2, [r0]
m_pop
strcpy_unalign_copy2bytes:
strb r2, [r0], #1
strb r4, [r0]
m_pop
END(strcpy)
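
The seven strcpy_unalignN variants exist because a word or double-word load may run past the terminator, and that is only safe while the load stays on a page that is already known to be mapped. Page boundaries are multiples of 8, so the rest of the 8-byte chunk containing the next (necessarily readable) source byte can always be loaded; how many bytes that is depends only on the source misalignment, which is exactly what the tbb branch table switches on. A small sketch of that quantity:

    #include <stddef.h>
    #include <stdint.h>

    /* For a misaligned source, how many bytes can be read before a page
     * boundary check is needed (strcpy_unalign7 ... strcpy_unalign1). */
    static size_t safe_bytes_without_page_check(const char *src) {
        size_t misalignment = (uintptr_t)src & 7;   /* the tbb index, r3 */
        return 8 - misalignment;
    }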


@@ -61,34 +61,32 @@
.thumb_func
ENTRY(strlen)
-pld [r1, #128]
+pld [r0, #0]
mov r1, r0
-rsb r3, r0, #0
-ands r3, r3, #7
+ands r3, r0, #7
beq mainloop
// Align to a double word (64 bits).
-ands ip, r3, #1
+rsb r3, r3, #8
+lsls ip, r3, #31
beq align_to_32
ldrb r2, [r1], #1
-cmp r2, #0
-beq update_count_and_return
+cbz r2, update_count_and_return
align_to_32:
+bcc align_to_64
ands ip, r3, #2
beq align_to_64
ldrb r2, [r1], #1
-cmp r2, #0
-beq update_count_and_return
+cbz r2, update_count_and_return
ldrb r2, [r1], #1
-cmp r2, #0
-beq update_count_and_return
+cbz r2, update_count_and_return
align_to_64:
-ands ip, r3, #4
+tst r3, #4
beq mainloop
ldr r3, [r1], #4
@@ -97,6 +95,7 @@ align_to_64:
ands ip, ip, #0x80808080
bne zero_in_second_register
+.p2align 2
mainloop:
ldrd r2, r3, [r1], #8
@@ -113,39 +112,54 @@ mainloop:
bne zero_in_second_register
b mainloop
+update_count_and_return:
+sub r0, r1, r0
+sub r0, r0, #1
+bx lr
+
zero_in_first_register:
-sub r1, r1, #4
+sub r0, r1, r0
+lsls r3, ip, #17
+bne sub8_and_return
+bcs sub7_and_return
+lsls ip, ip, #1
+bne sub6_and_return
+sub r0, r0, #5
+bx lr
+
+sub8_and_return:
+sub r0, r0, #8
+bx lr
+sub7_and_return:
+sub r0, r0, #7
+bx lr
+sub6_and_return:
+sub r0, r0, #6
+bx lr
+
zero_in_second_register:
sub r0, r1, r0
+lsls r3, ip, #17
+bne sub4_and_return
+bcs sub3_and_return
+lsls ip, ip, #1
+bne sub2_and_return
-// Check for zero in byte 0.
-ands r1, ip, #0x80
-beq check_byte1
+sub r0, r0, #1
+bx lr
+
+sub4_and_return:
sub r0, r0, #4
bx lr
-check_byte1:
-// Check for zero in byte 1.
-ands r1, ip, #0x8000
-beq check_byte2
+sub3_and_return:
sub r0, r0, #3
bx lr
-check_byte2:
-// Check for zero in byte 2.
-ands r1, ip, #0x800000
-beq return
+sub2_and_return:
sub r0, r0, #2
bx lr
-update_count_and_return:
-sub r0, r1, r0
-return:
-sub r0, r0, #1
-bx lr
END(strlen)

View File

@ -1,6 +1,10 @@
$(call libc-add-cpu-variant-src,MEMCPY,arch-arm/cortex-a15/bionic/memcpy.S) $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/cortex-a15/bionic/memcpy.S)
$(call libc-add-cpu-variant-src,MEMSET,arch-arm/cortex-a15/bionic/memset.S) $(call libc-add-cpu-variant-src,MEMSET,arch-arm/cortex-a15/bionic/memset.S)
$(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a15/bionic/strcat.S)
$(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a15/bionic/strcmp.S) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a15/bionic/strcmp.S)
$(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a15/bionic/strcpy.S)
$(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a15/bionic/strlen.S) $(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a15/bionic/strlen.S)
$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/cortex-a15/bionic/__strcat_chk.S)
$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/cortex-a15/bionic/__strcpy_chk.S)
include bionic/libc/arch-arm/generic/generic.mk include bionic/libc/arch-arm/generic/generic.mk

View File

@ -0,0 +1,218 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <machine/asm.h>
#include "libc_events.h"
.syntax unified
.fpu neon
.thumb
.thumb_func
// Get the length of the src string, then get the length of the dst string.
// Check that the two lengths together don't exceed the threshold, then
// do a memcpy of the data.
ENTRY(__strcat_chk)
.cfi_startproc
pld [r0, #0]
push {r0, lr}
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
push {r4, r5}
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
mov lr, r2
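// lr now holds the dst buffer size (passed in r2); the combined string length is checked against it below.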
// Save the dst register to r5
mov r5, r0
// Zero out r4
eor r4, r4, r4
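// r4 doubles as a pass flag: zero while counting the src string, then src length + 1 while counting dst.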
// r1 contains the address of the string to count.
.L_strlen_start:
mov r0, r1
ands r3, r0, #7
bne .L_align_src
.p2align 2
.L_mainloop:
ldmia r1!, {r2, r3}
pld [r1, #64]
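// (word - 0x01010101) & ~word & 0x80808080 is non-zero iff the word contains a zero byte.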
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne .L_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
b .L_mainloop
.L_zero_in_first_register:
sub r3, r1, r0
// Check for zero in byte 0.
lsls r2, ip, #17
beq .L_check_byte1_reg1
sub r3, r3, #8
b .L_finish
.L_check_byte1_reg1:
bcc .L_check_byte2_reg1
sub r3, r3, #7
b .L_finish
.L_check_byte2_reg1:
// Check for zero in byte 2.
tst ip, #0x800000
it ne
subne r3, r3, #6
bne .L_finish
sub r3, r3, #5
b .L_finish
.L_zero_in_second_register:
sub r3, r1, r0
// Check for zero in byte 0.
lsls r2, ip, #17
beq .L_check_byte1_reg2
sub r3, r3, #4
b .L_finish
.L_check_byte1_reg2:
bcc .L_check_byte2_reg2
sub r3, r3, #3
b .L_finish
.L_check_byte2_reg2:
// Check for zero in byte 2.
tst ip, #0x800000
it ne
subne r3, r3, #2
bne .L_finish
sub r3, r3, #1
b .L_finish
.L_align_src:
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
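// Bit 0 of the alignment count is now reflected in Z/N and bit 1 in the carry flag.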
beq .L_align_to_32
ldrb r2, [r1], #1
cbz r2, .L_done
.L_align_to_32:
bcc .L_align_to_64
ldrb r2, [r1], #1
cbz r2, .L_done
ldrb r2, [r1], #1
cbz r2, .L_done
.L_align_to_64:
tst r3, #4
beq .L_mainloop
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
b .L_mainloop
.L_done:
sub r3, r1, r0
sub r3, r3, #1
.L_finish:
cmp r4, #0
bne .L_strlen_done
// Time to get the dst string length.
mov r1, r5
// Save the original source address to r5.
mov r5, r0
// Save the current length (adding 1 for the terminator).
add r4, r3, #1
b .L_strlen_start
// r0 holds the pointer to the dst string.
// r3 holds the dst string length.
// r4 holds the src string length + 1.
.L_strlen_done:
add r2, r3, r4
cmp r2, lr
bgt .L_fortify_check_failed
// Set up the registers for the memcpy code.
mov r1, r5
pld [r1, #64]
mov r2, r4
add r0, r0, r3
pop {r4, r5}
.cfi_adjust_cfa_offset -8
.cfi_restore r4
.cfi_restore r5
#include "memcpy_base.S"
.L_fortify_check_failed:
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
.cfi_endproc
END(__strcat_chk)
.data
error_string:
.string "strcat buffer overflow"

View File

@ -0,0 +1,178 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <machine/asm.h>
#include "libc_events.h"
.syntax unified
.fpu neon
.thumb
.thumb_func
// Get the length of the source string first, then do a memcpy of the data
// instead of a strcpy.
ENTRY(__strcpy_chk)
.cfi_startproc
pld [r0, #0]
push {r0, lr}
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
mov lr, r2
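// lr holds the dst buffer size (r2); the source length is checked against it at .L_check_size.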
mov r0, r1
ands r3, r0, #7
bne .L_align_src
.p2align 2
.L_mainloop:
ldmia r0!, {r2, r3}
pld [r0, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne .L_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
b .L_mainloop
.L_zero_in_first_register:
sub r3, r0, r1
// Check for zero in byte 0.
lsls r2, ip, #17
beq .L_check_byte1_reg1
sub r3, r3, #8
b .L_check_size
.L_check_byte1_reg1:
bcc .L_check_byte2_reg1
sub r3, r3, #7
b .L_check_size
.L_check_byte2_reg1:
// Check for zero in byte 2.
tst ip, #0x800000
it ne
subne r3, r3, #6
bne .L_check_size
sub r3, r3, #5
b .L_check_size
.L_zero_in_second_register:
sub r3, r0, r1
// Check for zero in byte 0.
lsls r2, ip, #17
beq .L_check_byte1_reg2
sub r3, r3, #4
b .L_check_size
.L_check_byte1_reg2:
bcc .L_check_byte2_reg2
sub r3, r3, #3
b .L_check_size
.L_check_byte2_reg2:
// Check for zero in byte 2.
tst ip, #0x800000
it ne
subne r3, r3, #2
bne .L_check_size
sub r3, r3, #1
b .L_check_size
.L_align_src:
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq .L_align_to_32
ldrb r2, [r0], #1
cbz r2, .L_done
.L_align_to_32:
bcc .L_align_to_64
ldrb r2, [r0], #1
cbz r2, .L_done
ldrb r2, [r0], #1
cbz r2, .L_done
.L_align_to_64:
tst r3, #4
beq .L_mainloop
ldr r2, [r0], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
b .L_mainloop
.L_done:
sub r3, r0, r1
sub r3, r3, #1
.L_check_size:
pld [r1, #0]
pld [r1, #64]
ldr r0, [sp]
cmp r3, lr
bge .L_fortify_check_failed
// Add 1 for copy length to get the string terminator.
add r2, r3, #1
#include "memcpy_base.S"
.L_fortify_check_failed:
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
.cfi_endproc
END(__strcpy_chk)
.data
error_string:
.string "strcpy buffer overflow"

View File

@ -26,8 +26,8 @@
* SUCH DAMAGE. * SUCH DAMAGE.
*/ */
#include <machine/cpu-features.h>
#include <machine/asm.h> #include <machine/asm.h>
#include "libc_events.h"
/* /*
* This code assumes it is running on a processor that supports all arm v7 * This code assumes it is running on a processor that supports all arm v7
@ -35,177 +35,51 @@
* cache line. * cache line.
*/ */
.text .syntax unified
.fpu neon .fpu neon
.thumb
.thumb_func
#define CACHE_LINE_SIZE 32 ENTRY(__memcpy_chk)
.cfi_startproc
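// r2 is the copy length and r3 is the dst buffer size supplied by the compiler.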
cmp r2, r3
bgt __memcpy_chk_fail
// Fall through to memcpy...
.cfi_endproc
END(__memcpy_chk)
ENTRY(memcpy) ENTRY(memcpy)
.save {r0, lr} .cfi_startproc
/* start preloading as early as possible */ pld [r1, #0]
pld [r1, #(CACHE_LINE_SIZE * 0)] stmfd sp!, {r0, lr}
stmfd sp!, {r0, lr} .cfi_def_cfa_offset 8
pld [r1, #(CACHE_LINE_SIZE * 2)] .cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
pld [r1, #64]
// Check so divider is at least 16 bytes, needed for alignment code. #include "memcpy_base.S"
cmp r2, #16 .cfi_endproc
blo 5f
/* check if buffers are aligned. If so, run arm-only version */
eor r3, r0, r1
ands r3, r3, #0x3
beq 11f
/* Check the upper size limit for Neon unaligned memory access in memcpy */
cmp r2, #224
blo 3f
/* align destination to 16 bytes for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 3f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
movs ip, r3, lsl #31
ldrmib lr, [r1], #1
strmib lr, [r0], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// copies 4 bytes, destination 32-bits aligned
vld1.32 {d0[0]}, [r1]!
vst1.32 {d0[0]}, [r0, :32]!
1: bcc 2f
// copies 8 bytes, destination 64-bits aligned
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2:
/* preload immediately the next cache line, which we may need */
pld [r1, #(CACHE_LINE_SIZE * 0)]
pld [r1, #(CACHE_LINE_SIZE * 2)]
3:
/* make sure we have at least 64 bytes to copy */
subs r2, r2, #64
blo 2f
/* preload all the cache lines we need */
pld [r1, #(CACHE_LINE_SIZE * 4)]
pld [r1, #(CACHE_LINE_SIZE * 6)]
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
pld [r1, #(CACHE_LINE_SIZE * 6)]
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0]!
vst1.8 {d4 - d7}, [r0]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
add r2, r2, #64
subs r2, r2, #32
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0]!
bhs 3b
4: /* less than 32 left */
add r2, r2, #32
tst r2, #0x10
beq 5f
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0]!
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
vld1.32 {d0[0]}, [r1]!
vst1.32 {d0[0]}, [r0]!
2: movs ip, r2, lsl #31
ldrmib r3, [r1], #1
ldrcsb ip, [r1], #1
ldrcsb lr, [r1], #1
strmib r3, [r0], #1
strcsb ip, [r0], #1
strcsb lr, [r0], #1
ldmfd sp!, {r0, lr}
bx lr
11:
/* Simple arm-only copy loop to handle aligned copy operations */
stmfd sp!, {r4, r5, r6, r7, r8}
pld [r1, #(CACHE_LINE_SIZE * 4)]
/* Check alignment */
rsb r3, r1, #0
ands r3, #3
beq 2f
/* align source to 32 bits. We need to insert 2 instructions between
* a ldr[b|h] and str[b|h] because byte and half-word instructions
* stall 2 cycles.
*/
movs r12, r3, lsl #31
sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
ldrmib r3, [r1], #1
ldrcsb r4, [r1], #1
ldrcsb r5, [r1], #1
strmib r3, [r0], #1
strcsb r4, [r0], #1
strcsb r5, [r0], #1
2:
subs r2, r2, #64
blt 4f
3: /* Main copy loop, copying 64 bytes at a time */
pld [r1, #(CACHE_LINE_SIZE * 8)]
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
subs r2, r2, #64
bge 3b
4: /* Check if there are > 32 bytes left */
adds r2, r2, #64
subs r2, r2, #32
blt 5f
/* Copy 32 bytes */
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
subs r2, #32
5: /* Handle any remaining bytes */
adds r2, #32
beq 6f
movs r12, r2, lsl #28
ldmcsia r1!, {r3, r4, r5, r6} /* 16 bytes */
ldmmiia r1!, {r7, r8} /* 8 bytes */
stmcsia r0!, {r3, r4, r5, r6}
stmmiia r0!, {r7, r8}
movs r12, r2, lsl #30
ldrcs r3, [r1], #4 /* 4 bytes */
ldrmih r4, [r1], #2 /* 2 bytes */
strcs r3, [r0], #4
strmih r4, [r0], #2
tst r2, #0x1
ldrneb r3, [r1] /* last byte */
strneb r3, [r0]
6:
ldmfd sp!, {r4, r5, r6, r7, r8}
ldmfd sp!, {r0, pc}
END(memcpy) END(memcpy)
.cfi_startproc
__memcpy_chk_fail:
// Preserve lr for backtrace.
push {lr}
.cfi_def_cfa_offset 4
.cfi_rel_offset lr, 0
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
.cfi_endproc
.data
error_string:
.string "memcpy buffer overflow"

View File

@ -0,0 +1,206 @@
/*
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* This code assumes it is running on a processor that supports all arm v7
* instructions, that supports neon instructions, and that has a 32 byte
* cache line.
*/
// Check that the copy length is at least 16 bytes; the alignment code below requires it.
cmp r2, #16
blo 5f
/* check if buffers are aligned. If so, run arm-only version */
eor r3, r0, r1
ands r3, r3, #0x3
beq 11f
/* Check the upper size limit for Neon unaligned memory access in memcpy */
cmp r2, #224
blo 3f
/* align destination to 16 bytes for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 3f
/* copy up to 15 bytes (count in r3) */
sub r2, r2, r3
movs ip, r3, lsl #31
itt mi
ldrbmi lr, [r1], #1
strbmi lr, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1], #1
strbcs ip, [r0], #1
strbcs lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// copies 4 bytes, destination 32-bits aligned
vld1.32 {d0[0]}, [r1]!
vst1.32 {d0[0]}, [r0, :32]!
1: bcc 2f
// copies 8 bytes, destination 64-bits aligned
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2:
/* immediately preload the next cache line, which we may need */
pld [r1, #0]
pld [r1, #(32 * 2)]
3:
/* make sure we have at least 64 bytes to copy */
subs r2, r2, #64
blo 2f
/* preload all the cache lines we need */
pld [r1, #(32 * 4)]
pld [r1, #(32 * 6)]
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
pld [r1, #(32 * 6)]
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0]!
vst1.8 {d4 - d7}, [r0]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
add r2, r2, #64
subs r2, r2, #32
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0]!
bhs 3b
4: /* less than 32 left */
add r2, r2, #32
tst r2, #0x10
beq 5f
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0]!
5: /* copy up to 15 bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
vld1.32 {d0[0]}, [r1]!
vst1.32 {d0[0]}, [r0]!
2: movs ip, r2, lsl #31
itt mi
ldrbmi r3, [r1], #1
strbmi r3, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1], #1
strbcs ip, [r0], #1
strbcs lr, [r0], #1
ldmfd sp!, {r0, lr}
bx lr
11:
/* Simple arm-only copy loop to handle aligned copy operations */
stmfd sp!, {r4, r5, r6, r7, r8}
pld [r1, #(32 * 4)]
/* Check alignment */
rsb r3, r1, #0
ands r3, #3
beq 2f
/* align source to 32 bits. We need to insert 2 instructions between
* a ldr[b|h] and str[b|h] because byte and half-word instructions
* stall 2 cycles.
*/
movs r12, r3, lsl #31
sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
itt mi
ldrbmi r3, [r1], #1
strbmi r3, [r0], #1
itttt cs
ldrbcs r4, [r1], #1
ldrbcs r5, [r1], #1
strbcs r4, [r0], #1
strbcs r5, [r0], #1
2:
subs r2, r2, #64
blt 4f
3: /* Main copy loop, copying 64 bytes at a time */
pld [r1, #(32 * 8)]
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
subs r2, r2, #64
bge 3b
4: /* Check if there are > 32 bytes left */
adds r2, r2, #64
subs r2, r2, #32
blt 5f
/* Copy 32 bytes */
ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
subs r2, #32
5: /* Handle any remaining bytes */
adds r2, #32
beq 6f
movs r12, r2, lsl #28
itt cs
ldmiacs r1!, {r3, r4, r5, r6} /* 16 bytes */
stmiacs r0!, {r3, r4, r5, r6}
itt mi
ldmiami r1!, {r7, r8} /* 8 bytes */
stmiami r0!, {r7, r8}
movs r12, r2, lsl #30
itt cs
ldrcs r3, [r1], #4 /* 4 bytes */
strcs r3, [r0], #4
itt mi
ldrhmi r4, [r1], #2 /* 2 bytes */
strhmi r4, [r0], #2
tst r2, #0x1
itt ne
ldrbne r3, [r1] /* last byte */
strbne r3, [r0]
6:
ldmfd sp!, {r4, r5, r6, r7, r8}
ldmfd sp!, {r0, pc}

View File

@ -28,6 +28,7 @@
#include <machine/cpu-features.h> #include <machine/cpu-features.h>
#include <machine/asm.h> #include <machine/asm.h>
#include "libc_events.h"
/* /*
* This code assumes it is running on a processor that supports all arm v7 * This code assumes it is running on a processor that supports all arm v7
@ -36,19 +37,49 @@
.fpu neon .fpu neon
ENTRY(__memset_chk)
.cfi_startproc
cmp r2, r3
bls .L_done
// Preserve lr for backtrace.
push {lr}
.cfi_def_cfa_offset 4
.cfi_rel_offset lr, 0
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+8)
.cfi_endproc
END(__memset_chk)
ENTRY(bzero) ENTRY(bzero)
.cfi_startproc
mov r2, r1 mov r2, r1
mov r1, #0 mov r1, #0
.L_done:
// Fall through to memset...
.cfi_endproc
END(bzero) END(bzero)
/* memset() returns its first argument. */ /* memset() returns its first argument. */
ENTRY(memset) ENTRY(memset)
.cfi_startproc
# The neon memset only wins for less than 132. # The neon memset only wins for less than 132.
cmp r2, #132 cmp r2, #132
bhi 11f bhi 11f
.save {r0}
stmfd sp!, {r0} stmfd sp!, {r0}
.cfi_def_cfa_offset 4
.cfi_rel_offset r0, 0
vdup.8 q0, r1 vdup.8 q0, r1
@ -86,8 +117,15 @@ ENTRY(memset)
* offset = (4-(src&3))&3 = -src & 3 * offset = (4-(src&3))&3 = -src & 3
*/ */
.save {r0, r4-r7, lr}
stmfd sp!, {r0, r4-r7, lr} stmfd sp!, {r0, r4-r7, lr}
.cfi_def_cfa_offset 24
.cfi_rel_offset r0, 0
.cfi_rel_offset r4, 4
.cfi_rel_offset r5, 8
.cfi_rel_offset r6, 12
.cfi_rel_offset r7, 16
.cfi_rel_offset lr, 20
rsb r3, r0, #0 rsb r3, r0, #0
ands r3, r3, #3 ands r3, r3, #3
cmp r3, r2 cmp r3, r2
@ -149,4 +187,9 @@ ENTRY(memset)
strcsb r1, [r0] strcsb r1, [r0]
ldmfd sp!, {r0, r4-r7, lr} ldmfd sp!, {r0, r4-r7, lr}
bx lr bx lr
.cfi_endproc
END(memset) END(memset)
.data
error_string:
.string "memset buffer overflow"

View File

@ -0,0 +1,548 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 2013 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/asm.h>
.syntax unified
.thumb
.thumb_func
.macro m_push
push {r0, r4, r5, lr}
.endm // m_push
.macro m_ret inst
\inst {r0, r4, r5, pc}
.endm // m_ret
.macro m_scan_byte
ldrb r3, [r0]
cbz r3, strcat_r0_scan_done
add r0, #1
.endm // m_scan_byte
.macro m_copy_byte reg, cmd, label
ldrb \reg, [r1], #1
strb \reg, [r0], #1
\cmd \reg, \label
.endm // m_copy_byte
ENTRY(strcat)
// Quick check to see if src is empty.
ldrb r2, [r1]
pld [r1, #0]
cbnz r2, strcat_continue
bx lr
strcat_continue:
// To speed up really small dst strings, unroll checking the first 4 bytes.
m_push
m_scan_byte
m_scan_byte
m_scan_byte
m_scan_byte
ands r3, r0, #7
bne strcat_align_src
.p2align 2
strcat_mainloop:
ldmia r0!, {r2, r3}
pld [r0, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcat_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcat_zero_in_second_register
b strcat_mainloop
strcat_zero_in_first_register:
sub r0, r0, #4
strcat_zero_in_second_register:
// Check for zero in byte 0.
tst ip, #0x80
it ne
subne r0, r0, #4
bne strcat_r0_scan_done
// Check for zero in byte 1.
tst ip, #0x8000
it ne
subne r0, r0, #3
bne strcat_r0_scan_done
// Check for zero in byte 2.
tst ip, #0x800000
it ne
subne r0, r0, #2
it eq
// Zero is in byte 3.
subeq r0, r0, #1
strcat_r0_scan_done:
// Unroll the first 8 bytes that will be copied.
m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
strcpy_finish:
m_ret inst=pop
strcpy_continue:
pld [r1, #0]
ands r3, r0, #7
bne strcpy_align_dst
strcpy_check_src_align:
// At this point dst is aligned to a double word; check whether src
// is also aligned to a double word.
ands r3, r1, #7
bne strcpy_unaligned_copy
.p2align 2
strcpy_mainloop:
ldmia r1!, {r2, r3}
pld [r1, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_mainloop
strcpy_zero_in_first_register:
lsls lr, ip, #17
itt ne
strbne r2, [r0]
m_ret inst=popne
itt cs
strhcs r2, [r0]
m_ret inst=popcs
lsls ip, ip, #1
itt eq
streq r2, [r0]
m_ret inst=popeq
strh r2, [r0], #2
lsr r3, r2, #16
strb r3, [r0]
m_ret inst=pop
strcpy_zero_in_second_register:
lsls lr, ip, #17
ittt ne
stmiane r0!, {r2}
strbne r3, [r0]
m_ret inst=popne
ittt cs
strcs r2, [r0], #4
strhcs r3, [r0]
m_ret inst=popcs
lsls ip, ip, #1
itt eq
stmiaeq r0, {r2, r3}
m_ret inst=popeq
stmia r0!, {r2}
strh r3, [r0], #2
lsr r4, r3, #16
strb r4, [r0]
m_ret inst=pop
strcpy_align_dst:
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq strcpy_align_to_32
ldrb r2, [r1], #1
strb r2, [r0], #1
cbz r2, strcpy_complete
strcpy_align_to_32:
bcc strcpy_align_to_64
ldrb r4, [r1], #1
strb r4, [r0], #1
cmp r4, #0
it eq
m_ret inst=popeq
ldrb r5, [r1], #1
strb r5, [r0], #1
cmp r5, #0
it eq
m_ret inst=popeq
strcpy_align_to_64:
tst r3, #4
beq strcpy_check_src_align
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
stmia r0!, {r2}
b strcpy_check_src_align
strcpy_complete:
m_ret inst=pop
strcpy_unaligned_copy:
// Dst is aligned to a double word, while src is at an unknown alignment.
// There are 7 different versions of the unaligned copy code
// to prevent overreading the src. The mainloop of every single version
// will store 64 bits per loop. The difference is how much of src can
// be read without potentially crossing a page boundary.
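// tbb branches forward by twice the selected table byte, hence the /2 on each entry below.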
tbb [pc, r3]
strcpy_unaligned_branchtable:
.byte 0
.byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
.p2align 2
// Can read 7 bytes before possibly crossing a page.
strcpy_unalign7:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r3, [r1]
cbz r3, strcpy_unalign7_copy5bytes
ldrb r4, [r1, #1]
cbz r4, strcpy_unalign7_copy6bytes
ldrb r5, [r1, #2]
cbz r5, strcpy_unalign7_copy7bytes
ldr r3, [r1], #4
pld [r1, #64]
lsrs ip, r3, #24
stmia r0!, {r2, r3}
beq strcpy_unalign_return
b strcpy_unalign7
strcpy_unalign7_copy5bytes:
stmia r0!, {r2}
strb r3, [r0]
strcpy_unalign_return:
m_ret inst=pop
strcpy_unalign7_copy6bytes:
stmia r0!, {r2}
strb r3, [r0], #1
strb r4, [r0], #1
m_ret inst=pop
strcpy_unalign7_copy7bytes:
stmia r0!, {r2}
strb r3, [r0], #1
strb r4, [r0], #1
strb r5, [r0], #1
m_ret inst=pop
.p2align 2
// Can read 6 bytes before possibly crossing a page.
strcpy_unalign6:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r4, [r1]
cbz r4, strcpy_unalign_copy5bytes
ldrb r5, [r1, #1]
cbz r5, strcpy_unalign_copy6bytes
ldr r3, [r1], #4
pld [r1, #64]
tst r3, #0xff0000
beq strcpy_unalign6_copy7bytes
lsrs ip, r3, #24
stmia r0!, {r2, r3}
beq strcpy_unalign_return
b strcpy_unalign6
strcpy_unalign6_copy7bytes:
stmia r0!, {r2}
strh r3, [r0], #2
lsr r3, #16
strb r3, [r0]
m_ret inst=pop
.p2align 2
// Can read 5 bytes before possibly crossing a page.
strcpy_unalign5:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r4, [r1]
cbz r4, strcpy_unalign_copy5bytes
ldr r3, [r1], #4
pld [r1, #64]
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_unalign5
strcpy_unalign_copy5bytes:
stmia r0!, {r2}
strb r4, [r0]
m_ret inst=pop
strcpy_unalign_copy6bytes:
stmia r0!, {r2}
strb r4, [r0], #1
strb r5, [r0]
m_ret inst=pop
.p2align 2
// Can read 4 bytes before possibly crossing a page.
strcpy_unalign4:
ldmia r1!, {r2}
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldmia r1!, {r3}
pld [r1, #64]
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_unalign4
.p2align 2
// Can read 3 bytes before possibly crossing a page.
strcpy_unalign3:
ldrb r2, [r1]
cbz r2, strcpy_unalign3_copy1byte
ldrb r3, [r1, #1]
cbz r3, strcpy_unalign3_copy2bytes
ldrb r4, [r1, #2]
cbz r4, strcpy_unalign3_copy3bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
lsrs lr, r2, #24
beq strcpy_unalign_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_unalign3
strcpy_unalign3_copy1byte:
strb r2, [r0]
m_ret inst=pop
strcpy_unalign3_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_ret inst=pop
strcpy_unalign3_copy3bytes:
strb r2, [r0], #1
strb r3, [r0], #1
strb r4, [r0]
m_ret inst=pop
.p2align 2
// Can read 2 bytes before possibly crossing a page.
strcpy_unalign2:
ldrb r2, [r1]
cbz r2, strcpy_unalign_copy1byte
ldrb r3, [r1, #1]
cbz r3, strcpy_unalign_copy2bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
tst r2, #0xff0000
beq strcpy_unalign_copy3bytes
lsrs ip, r2, #24
beq strcpy_unalign_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_unalign2
.p2align 2
// Can read 1 byte before possibly crossing a page.
strcpy_unalign1:
ldrb r2, [r1]
cbz r2, strcpy_unalign_copy1byte
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_unalign1
strcpy_unalign_copy1byte:
strb r2, [r0]
m_ret inst=pop
strcpy_unalign_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_ret inst=pop
strcpy_unalign_copy3bytes:
strh r2, [r0], #2
lsr r2, #16
strb r2, [r0]
m_ret inst=pop
strcpy_unalign_copy4bytes:
stmia r0, {r2}
m_ret inst=pop
strcat_align_src:
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq strcat_align_to_32
ldrb r2, [r0], #1
cbz r2, strcat_r0_update
strcat_align_to_32:
bcc strcat_align_to_64
ldrb r2, [r0], #1
cbz r2, strcat_r0_update
ldrb r2, [r0], #1
cbz r2, strcat_r0_update
strcat_align_to_64:
tst r3, #4
beq strcat_mainloop
ldr r3, [r0], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcat_zero_in_second_register
b strcat_mainloop
strcat_r0_update:
sub r0, r0, #1
b strcat_r0_scan_done
END(strcat)

View File

@ -0,0 +1,456 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 2013 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/asm.h>
.syntax unified
.thumb
.thumb_func
.macro m_push
push {r0, r4, r5, lr}
.endm // m_push
.macro m_ret inst
\inst {r0, r4, r5, pc}
.endm // m_ret
.macro m_copy_byte reg, cmd, label
ldrb \reg, [r1], #1
strb \reg, [r0], #1
\cmd \reg, \label
.endm // m_copy_byte
ENTRY(strcpy)
// Unroll the first 8 bytes that will be copied.
m_push
m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
strcpy_finish:
m_ret inst=pop
strcpy_continue:
pld [r1, #0]
ands r3, r0, #7
bne strcpy_align_dst
strcpy_check_src_align:
// At this point dst is aligned to a double word; check whether src
// is also aligned to a double word.
ands r3, r1, #7
bne strcpy_unaligned_copy
.p2align 2
strcpy_mainloop:
ldmia r1!, {r2, r3}
pld [r1, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_mainloop
strcpy_zero_in_first_register:
lsls lr, ip, #17
itt ne
strbne r2, [r0]
m_ret inst=popne
itt cs
strhcs r2, [r0]
m_ret inst=popcs
lsls ip, ip, #1
itt eq
streq r2, [r0]
m_ret inst=popeq
strh r2, [r0], #2
lsr r3, r2, #16
strb r3, [r0]
m_ret inst=pop
strcpy_zero_in_second_register:
lsls lr, ip, #17
ittt ne
stmiane r0!, {r2}
strbne r3, [r0]
m_ret inst=popne
ittt cs
strcs r2, [r0], #4
strhcs r3, [r0]
m_ret inst=popcs
lsls ip, ip, #1
itt eq
stmiaeq r0, {r2, r3}
m_ret inst=popeq
stmia r0!, {r2}
strh r3, [r0], #2
lsr r4, r3, #16
strb r4, [r0]
m_ret inst=pop
strcpy_align_dst:
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq strcpy_align_to_32
ldrb r2, [r1], #1
strb r2, [r0], #1
cbz r2, strcpy_complete
strcpy_align_to_32:
bcc strcpy_align_to_64
ldrb r4, [r1], #1
strb r4, [r0], #1
cmp r4, #0
it eq
m_ret inst=popeq
ldrb r5, [r1], #1
strb r5, [r0], #1
cmp r5, #0
it eq
m_ret inst=popeq
strcpy_align_to_64:
tst r3, #4
beq strcpy_check_src_align
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
stmia r0!, {r2}
b strcpy_check_src_align
strcpy_complete:
m_ret inst=pop
strcpy_unaligned_copy:
// Dst is aligned to a double word, while src is at an unknown alignment.
// There are 7 different versions of the unaligned copy code
// to prevent overreading the src. The mainloop of every single version
// will store 64 bits per loop. The difference is how much of src can
// be read without potentially crossing a page boundary.
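// tbb branches forward by twice the selected table byte, hence the /2 on each entry below.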
tbb [pc, r3]
strcpy_unaligned_branchtable:
.byte 0
.byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
.byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
.p2align 2
// Can read 7 bytes before possibly crossing a page.
strcpy_unalign7:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r3, [r1]
cbz r3, strcpy_unalign7_copy5bytes
ldrb r4, [r1, #1]
cbz r4, strcpy_unalign7_copy6bytes
ldrb r5, [r1, #2]
cbz r5, strcpy_unalign7_copy7bytes
ldr r3, [r1], #4
pld [r1, #64]
lsrs ip, r3, #24
stmia r0!, {r2, r3}
beq strcpy_unalign_return
b strcpy_unalign7
strcpy_unalign7_copy5bytes:
stmia r0!, {r2}
strb r3, [r0]
strcpy_unalign_return:
m_ret inst=pop
strcpy_unalign7_copy6bytes:
stmia r0!, {r2}
strb r3, [r0], #1
strb r4, [r0], #1
m_ret inst=pop
strcpy_unalign7_copy7bytes:
stmia r0!, {r2}
strb r3, [r0], #1
strb r4, [r0], #1
strb r5, [r0], #1
m_ret inst=pop
.p2align 2
// Can read 6 bytes before possibly crossing a page.
strcpy_unalign6:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r4, [r1]
cbz r4, strcpy_unalign_copy5bytes
ldrb r5, [r1, #1]
cbz r5, strcpy_unalign_copy6bytes
ldr r3, [r1], #4
pld [r1, #64]
tst r3, #0xff0000
beq strcpy_unalign6_copy7bytes
lsrs ip, r3, #24
stmia r0!, {r2, r3}
beq strcpy_unalign_return
b strcpy_unalign6
strcpy_unalign6_copy7bytes:
stmia r0!, {r2}
strh r3, [r0], #2
lsr r3, #16
strb r3, [r0]
m_ret inst=pop
.p2align 2
// Can read 5 bytes before possibly crossing a page.
strcpy_unalign5:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldrb r4, [r1]
cbz r4, strcpy_unalign_copy5bytes
ldr r3, [r1], #4
pld [r1, #64]
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_unalign5
strcpy_unalign_copy5bytes:
stmia r0!, {r2}
strb r4, [r0]
m_ret inst=pop
strcpy_unalign_copy6bytes:
stmia r0!, {r2}
strb r4, [r0], #1
strb r5, [r0]
m_ret inst=pop
.p2align 2
// Can read 4 bytes before possibly crossing a page.
strcpy_unalign4:
ldmia r1!, {r2}
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
ldmia r1!, {r3}
pld [r1, #64]
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_unalign4
.p2align 2
// Can read 3 bytes before possibly crossing a page.
strcpy_unalign3:
ldrb r2, [r1]
cbz r2, strcpy_unalign3_copy1byte
ldrb r3, [r1, #1]
cbz r3, strcpy_unalign3_copy2bytes
ldrb r4, [r1, #2]
cbz r4, strcpy_unalign3_copy3bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
lsrs lr, r2, #24
beq strcpy_unalign_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_unalign3
strcpy_unalign3_copy1byte:
strb r2, [r0]
m_ret inst=pop
strcpy_unalign3_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_ret inst=pop
strcpy_unalign3_copy3bytes:
strb r2, [r0], #1
strb r3, [r0], #1
strb r4, [r0]
m_ret inst=pop
.p2align 2
// Can read 2 bytes before possibly crossing a page.
strcpy_unalign2:
ldrb r2, [r1]
cbz r2, strcpy_unalign_copy1byte
ldrb r3, [r1, #1]
cbz r3, strcpy_unalign_copy2bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
tst r2, #0xff0000
beq strcpy_unalign_copy3bytes
lsrs ip, r2, #24
beq strcpy_unalign_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_unalign2
.p2align 2
// Can read 1 byte before possibly crossing a page.
strcpy_unalign1:
ldrb r2, [r1]
cbz r2, strcpy_unalign_copy1byte
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne strcpy_zero_in_second_register
stmia r0!, {r2, r3}
b strcpy_unalign1
strcpy_unalign_copy1byte:
strb r2, [r0]
m_ret inst=pop
strcpy_unalign_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_ret inst=pop
strcpy_unalign_copy3bytes:
strh r2, [r0], #2
lsr r2, #16
strb r2, [r0]
m_ret inst=pop
strcpy_unalign_copy4bytes:
stmia r0, {r2}
m_ret inst=pop
END(strcpy)

View File

@ -0,0 +1,167 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 2013 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/asm.h>
.syntax unified
.thumb
.thumb_func
ENTRY(strlen)
pld [r0, #0]
mov r1, r0
ands r3, r0, #7
bne align_src
.p2align 2
mainloop:
ldmia r1!, {r2, r3}
pld [r1, #64]
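// (word - 0x01010101) & ~word & 0x80808080 is non-zero iff the word contains a zero byte.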
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne zero_in_second_register
b mainloop
zero_in_first_register:
sub r0, r1, r0
// Check for zero in byte 0.
lsls r2, ip, #17
beq check_byte1_reg1
sub r0, r0, #8
bx lr
check_byte1_reg1:
bcc check_byte2_reg1
sub r0, r0, #7
bx lr
check_byte2_reg1:
// Check for zero in byte 2.
tst ip, #0x800000
itt ne
subne r0, r0, #6
bxne lr
sub r0, r0, #5
bx lr
zero_in_second_register:
sub r0, r1, r0
// Check for zero in byte 0.
lsls r2, ip, #17
beq check_byte1_reg2
sub r0, r0, #4
bx lr
check_byte1_reg2:
bcc check_byte2_reg2
sub r0, r0, #3
bx lr
check_byte2_reg2:
// Check for zero in byte 2.
tst ip, #0x800000
itt ne
subne r0, r0, #2
bxne lr
sub r0, r0, #1
bx lr
align_src:
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
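// Bit 0 of the alignment count is now reflected in Z/N and bit 1 in the carry flag.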
beq align_to_32
ldrb r2, [r1], #1
cbz r2, done
align_to_32:
bcc align_to_64
ldrb r2, [r1], #1
cbz r2, done
ldrb r2, [r1], #1
cbz r2, done
align_to_64:
tst r3, #4
beq mainloop
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne zero_in_second_register
b mainloop
done:
sub r0, r1, r0
sub r0, r0, #1
bx lr
END(strlen)

View File

@ -1,7 +1,10 @@
$(call libc-add-cpu-variant-src,MEMCPY,arch-arm/cortex-a9/bionic/memcpy.S) $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/cortex-a9/bionic/memcpy.S)
$(call libc-add-cpu-variant-src,MEMSET,arch-arm/cortex-a9/bionic/memset.S) $(call libc-add-cpu-variant-src,MEMSET,arch-arm/cortex-a9/bionic/memset.S)
$(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a9/bionic/strcat.S)
$(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a9/bionic/strcmp.S) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a9/bionic/strcmp.S)
# Use cortex-a15 version of strlen. $(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a9/bionic/strcpy.S)
$(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a15/bionic/strlen.S) $(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a9/bionic/strlen.S)
$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/cortex-a9/bionic/__strcat_chk.S)
$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/cortex-a9/bionic/__strcpy_chk.S)
include bionic/libc/arch-arm/generic/generic.mk include bionic/libc/arch-arm/generic/generic.mk

View File

@ -28,6 +28,7 @@
#include <machine/cpu-features.h> #include <machine/cpu-features.h>
#include <machine/asm.h> #include <machine/asm.h>
#include "libc_events.h"
/* /*
* Optimized memcpy() for ARM. * Optimized memcpy() for ARM.
@ -36,6 +37,13 @@
* so we have to preserve R0. * so we have to preserve R0.
*/ */
ENTRY(__memcpy_chk)
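// r2 is the copy length and r3 is the dst buffer size supplied by the compiler.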
cmp r2, r3
bgt fortify_check_failed
// Fall through to memcpy...
END(__memcpy_chk)
ENTRY(memcpy) ENTRY(memcpy)
/* The stack must always be 64-bits aligned to be compliant with the /* The stack must always be 64-bits aligned to be compliant with the
* ARM ABI. Since we have to save R0, we might as well save R4 * ARM ABI. Since we have to save R0, we might as well save R4
@ -377,4 +385,20 @@ copy_last_3_and_return:
add sp, sp, #28 add sp, sp, #28
ldmfd sp!, {r0, r4, lr} ldmfd sp!, {r0, r4, lr}
bx lr bx lr
// Only reached when the __memcpy_chk check fails.
fortify_check_failed:
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+8)
END(memcpy) END(memcpy)
.data
error_string:
.string "memcpy buffer overflow"

View File

@ -27,6 +27,7 @@
*/ */
#include <machine/asm.h> #include <machine/asm.h>
#include "libc_events.h"
/* /*
* Optimized memset() for ARM. * Optimized memset() for ARM.
@ -34,9 +35,28 @@
* memset() returns its first argument. * memset() returns its first argument.
*/ */
ENTRY(__memset_chk)
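// r2 is the byte count and r3 is the dst buffer size; counts larger than the buffer fall through to the failure path.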
cmp r2, r3
bls done
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+8)
END(__memset_chk)
ENTRY(bzero) ENTRY(bzero)
mov r2, r1 mov r2, r1
mov r1, #0 mov r1, #0
done:
// Fall through to memset...
END(bzero) END(bzero)
ENTRY(memset) ENTRY(memset)
@ -107,3 +127,7 @@ ENTRY(memset)
ldmfd sp!, {r0, r4-r7, lr} ldmfd sp!, {r0, r4-r7, lr}
bx lr bx lr
END(memset) END(memset)
.data
error_string:
.string "memset buffer overflow"

View File

@ -1,4 +1,8 @@
$(call libc-add-cpu-variant-src,MEMCPY,arch-arm/generic/bionic/memcpy.S) $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/generic/bionic/memcpy.S)
$(call libc-add-cpu-variant-src,MEMSET,arch-arm/generic/bionic/memset.S) $(call libc-add-cpu-variant-src,MEMSET,arch-arm/generic/bionic/memset.S)
$(call libc-add-cpu-variant-src,STRCAT,string/strcat.c)
$(call libc-add-cpu-variant-src,STRCMP,arch-arm/generic/bionic/strcmp.S) $(call libc-add-cpu-variant-src,STRCMP,arch-arm/generic/bionic/strcmp.S)
$(call libc-add-cpu-variant-src,STRCPY,arch-arm/generic/bionic/strcpy.S)
$(call libc-add-cpu-variant-src,STRLEN,arch-arm/generic/bionic/strlen.c) $(call libc-add-cpu-variant-src,STRLEN,arch-arm/generic/bionic/strlen.c)
$(call libc-add-cpu-variant-src,__STRCAT_CHK,bionic/__strcat_chk.cpp)
$(call libc-add-cpu-variant-src,__STRCPY_CHK,bionic/__strcpy_chk.cpp)

View File

@ -0,0 +1,215 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <machine/asm.h>
#include "libc_events.h"
.syntax unified
.thumb
.thumb_func
// Get the length of the src string, then get the length of the dst string.
// Check that the two lengths together don't exceed the threshold, then
// do a memcpy of the data.
ENTRY(__strcat_chk)
.cfi_startproc
pld [r0, #0]
push {r0, lr}
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
push {r4, r5}
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
mov lr, r2
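// lr now holds the dst buffer size (passed in r2); the combined string length is checked against it below.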
// Save the dst register to r5
mov r5, r0
// Zero out r4
eor r4, r4, r4
// r1 contains the address of the string to count.
.L_strlen_start:
mov r0, r1
ands r3, r1, #7
beq .L_mainloop
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq .L_align_to_32
ldrb r2, [r1], #1
cbz r2, .L_update_count_and_finish
.L_align_to_32:
bcc .L_align_to_64
ands ip, r3, #2
beq .L_align_to_64
ldrb r2, [r1], #1
cbz r2, .L_update_count_and_finish
ldrb r2, [r1], #1
cbz r2, .L_update_count_and_finish
.L_align_to_64:
tst r3, #4
beq .L_mainloop
ldr r3, [r1], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
.p2align 2
.L_mainloop:
ldrd r2, r3, [r1], #8
pld [r1, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne .L_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
b .L_mainloop
.L_update_count_and_finish:
sub r3, r1, r0
sub r3, r3, #1
b .L_finish
.L_zero_in_first_register:
sub r3, r1, r0
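// After the lsls #17 below, Z is clear iff byte 0 was zero and the carry is set iff byte 1 was zero.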
lsls r2, ip, #17
bne .L_sub8_and_finish
bcs .L_sub7_and_finish
lsls ip, ip, #1
bne .L_sub6_and_finish
sub r3, r3, #5
b .L_finish
.L_sub8_and_finish:
sub r3, r3, #8
b .L_finish
.L_sub7_and_finish:
sub r3, r3, #7
b .L_finish
.L_sub6_and_finish:
sub r3, r3, #6
b .L_finish
.L_zero_in_second_register:
sub r3, r1, r0
lsls r2, ip, #17
bne .L_sub4_and_finish
bcs .L_sub3_and_finish
lsls ip, ip, #1
bne .L_sub2_and_finish
sub r3, r3, #1
b .L_finish
.L_sub4_and_finish:
sub r3, r3, #4
b .L_finish
.L_sub3_and_finish:
sub r3, r3, #3
b .L_finish
.L_sub2_and_finish:
sub r3, r3, #2
.L_finish:
cmp r4, #0
bne .L_strlen_done
// Time to get the dst string length.
mov r1, r5
// Save the original source address to r5.
mov r5, r0
// Save the current length (adding 1 for the terminator).
add r4, r3, #1
b .L_strlen_start
// r0 holds the pointer to the dst string.
// r3 holds the dst string length.
// r4 holds the src string length + 1.
.L_strlen_done:
add r2, r3, r4
cmp r2, lr
bgt .L_fortify_check_failed
// Set up the registers for the memcpy code.
mov r1, r5
pld [r1, #64]
mov r2, r4
add r0, r0, r3
pop {r4, r5}
.cfi_adjust_cfa_offset -8
.cfi_restore r4
.cfi_restore r5
#include "memcpy_base.S"
.L_fortify_check_failed:
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
.cfi_endproc
END(__strcat_chk)
.data
error_string:
.string "strcat buffer overflow"


@ -0,0 +1,175 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <machine/asm.h>
#include "libc_events.h"
.syntax unified
.thumb
.thumb_func
// Get the length of the source string first, then do a memcpy of the data
// instead of a strcpy.
ENTRY(__strcpy_chk)
.cfi_startproc
pld [r0, #0]
push {r0, lr}
.cfi_def_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
mov lr, r2
mov r0, r1
ands r3, r1, #7
beq .L_mainloop
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
beq .L_align_to_32
ldrb r2, [r0], #1
cbz r2, .L_update_count_and_finish
.L_align_to_32:
bcc .L_align_to_64
ands ip, r3, #2
beq .L_align_to_64
ldrb r2, [r0], #1
cbz r2, .L_update_count_and_finish
ldrb r2, [r0], #1
cbz r2, .L_update_count_and_finish
.L_align_to_64:
tst r3, #4
beq .L_mainloop
ldr r3, [r0], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
.p2align 2
.L_mainloop:
ldrd r2, r3, [r0], #8
pld [r0, #64]
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
bne .L_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
bne .L_zero_in_second_register
b .L_mainloop
.L_update_count_and_finish:
sub r3, r0, r1
sub r3, r3, #1
b .L_check_size
.L_zero_in_first_register:
sub r3, r0, r1
lsls r2, ip, #17
bne .L_sub8_and_finish
bcs .L_sub7_and_finish
lsls ip, ip, #1
bne .L_sub6_and_finish
sub r3, r3, #5
b .L_check_size
.L_sub8_and_finish:
sub r3, r3, #8
b .L_check_size
.L_sub7_and_finish:
sub r3, r3, #7
b .L_check_size
.L_sub6_and_finish:
sub r3, r3, #6
b .L_check_size
.L_zero_in_second_register:
sub r3, r0, r1
lsls r2, ip, #17
bne .L_sub4_and_finish
bcs .L_sub3_and_finish
lsls ip, ip, #1
bne .L_sub2_and_finish
sub r3, r3, #1
b .L_check_size
.L_sub4_and_finish:
sub r3, r3, #4
b .L_check_size
.L_sub3_and_finish:
sub r3, r3, #3
b .L_check_size
.L_sub2_and_finish:
sub r3, r3, #2
.L_check_size:
pld [r1, #0]
pld [r1, #64]
ldr r0, [sp]
cmp r3, lr
bge .L_fortify_check_failed
// Add 1 for copy length to get the string terminator.
add r2, r3, #1
#include "memcpy_base.S"
.L_fortify_check_failed:
ldr r0, error_message
ldr r1, error_code
1:
add r0, pc
bl __fortify_chk_fail
error_code:
.word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
.cfi_endproc
END(__strcpy_chk)
.data
error_string:
.string "strcpy buffer overflow"


@ -28,8 +28,8 @@
/* Assumes neon instructions and a cache line size of 32 bytes. */
-#include <machine/cpu-features.h>
#include <machine/asm.h>
+#include "libc_events.h"
/*
* This code assumes it is running on a processor that supports all arm v7
@ -38,109 +38,50 @@
*/
.text
+.syntax unified
.fpu neon
-#define CACHE_LINE_SIZE 32
+.thumb
+.thumb_func
+ENTRY(__memcpy_chk)
+.cfi_startproc
+cmp r2, r3
+bgt __memcpy_chk_fail
+// Fall through to memcpy...
+.cfi_endproc
+END(__memcpy_chk)
ENTRY(memcpy)
-.save {r0, lr}
-/* start preloading as early as possible */
-pld [r1, #(CACHE_LINE_SIZE*0)]
-stmfd sp!, {r0, lr}
-pld [r1, #(CACHE_LINE_SIZE*2)]
-/* do we have at least 16-bytes to copy (needed for alignment below) */
-cmp r2, #16
-blo 5f
-/* align destination to cache-line for the write-buffer */
-rsb r3, r0, #0
-ands r3, r3, #0xF
-beq 0f
-/* copy up to 15-bytes (count in r3) */
-sub r2, r2, r3
-movs ip, r3, lsl #31
-ldrmib lr, [r1], #1
-strmib lr, [r0], #1
-ldrcsb ip, [r1], #1
-ldrcsb lr, [r1], #1
-strcsb ip, [r0], #1
-strcsb lr, [r0], #1
-movs ip, r3, lsl #29
-bge 1f
-// copies 4 bytes, destination 32-bits aligned
-vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
-1: bcc 2f
-// copies 8 bytes, destination 64-bits aligned
-vld1.8 {d0}, [r1]!
-vst1.8 {d0}, [r0, :64]!
-2:
-0: /* preload immediately the next cache line, which we may need */
-pld [r1, #(CACHE_LINE_SIZE*0)]
-pld [r1, #(CACHE_LINE_SIZE*2)]
-/* make sure we have at least 64 bytes to copy */
-subs r2, r2, #64
-blo 2f
-/* Preload all the cache lines we need.
-* NOTE: The number of pld below depends on CACHE_LINE_SIZE,
-* ideally we would increase the distance in the main loop to
-* avoid the goofy code below. In practice this doesn't seem to make
-* a big difference.
-* NOTE: The value CACHE_LINE_SIZE * 8 was chosen through
-* experimentation.
-*/
-pld [r1, #(CACHE_LINE_SIZE*4)]
-pld [r1, #(CACHE_LINE_SIZE*6)]
-pld [r1, #(CACHE_LINE_SIZE*8)]
-1: /* The main loop copies 64 bytes at a time */
-vld1.8 {d0 - d3}, [r1]!
-vld1.8 {d4 - d7}, [r1]!
-pld [r1, #(CACHE_LINE_SIZE*8)]
-subs r2, r2, #64
-vst1.8 {d0 - d3}, [r0, :128]!
-vst1.8 {d4 - d7}, [r0, :128]!
-bhs 1b
-2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
-add r2, r2, #64
-subs r2, r2, #32
-blo 4f
-3: /* 32 bytes at a time. These cache lines were already preloaded */
-vld1.8 {d0 - d3}, [r1]!
-subs r2, r2, #32
-vst1.8 {d0 - d3}, [r0, :128]!
-bhs 3b
-4: /* less than 32 left */
-add r2, r2, #32
-tst r2, #0x10
-beq 5f
-// copies 16 bytes, 128-bits aligned
-vld1.8 {d0, d1}, [r1]!
-vst1.8 {d0, d1}, [r0, :128]!
-5: /* copy up to 15-bytes (count in r2) */
-movs ip, r2, lsl #29
-bcc 1f
-vld1.8 {d0}, [r1]!
-vst1.8 {d0}, [r0]!
-1: bge 2f
-vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-2: movs ip, r2, lsl #31
-ldrmib r3, [r1], #1
-ldrcsb ip, [r1], #1
-ldrcsb lr, [r1], #1
-strmib r3, [r0], #1
-strcsb ip, [r0], #1
-strcsb lr, [r0], #1
-ldmfd sp!, {r0, lr}
-bx lr
+.cfi_startproc
+pld [r1, #64]
+stmfd sp!, {r0, lr}
+.cfi_def_cfa_offset 8
+.cfi_rel_offset r0, 0
+.cfi_rel_offset lr, 4
+#include "memcpy_base.S"
+.cfi_endproc
END(memcpy)
+.cfi_startproc
+__memcpy_chk_fail:
+// Preserve lr for backtrace.
+push {lr}
+.cfi_def_cfa_offset 4
+.cfi_rel_offset lr, 0
+ldr r0, error_message
+ldr r1, error_code
+1:
+add r0, pc
+bl __fortify_chk_fail
+error_code:
+.word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+error_message:
+.word error_string-(1b+4)
+.cfi_endproc
+.data
+error_string:
+.string "memcpy buffer overflow"


@ -0,0 +1,117 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* This code assumes it is running on a processor that supports all arm v7
* instructions, that supports neon instructions, and that has a 32 byte
* cache line.
*/
// Assumes neon instructions and a cache line size of 32 bytes.
/* do we have at least 16-bytes to copy (needed for alignment below) */
cmp r2, #16
blo 5f
/* align destination to cache-line for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 2f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
movs ip, r3, lsl #31
itt mi
ldrbmi lr, [r1], #1
strbmi lr, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1], #1
strbcs ip, [r0], #1
strbcs lr, [r0], #1
movs ip, r3, lsl #29
bge 1f
// copies 4 bytes, destination 32-bits aligned
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1: bcc 2f
// copies 8 bytes, destination 64-bits aligned
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2: /* make sure we have at least 64 bytes to copy */
subs r2, r2, #64
blo 2f
1: /* The main loop copies 64 bytes at a time */
vld1.8 {d0 - d3}, [r1]!
vld1.8 {d4 - d7}, [r1]!
pld [r1, #(32*2)]
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d4 - d7}, [r0, :128]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
adds r2, r2, #32
blo 4f
/* Copy 32 bytes. These cache lines were already preloaded */
vld1.8 {d0 - d3}, [r1]!
sub r2, r2, #32
vst1.8 {d0 - d3}, [r0, :128]!
4: /* less than 32 left */
add r2, r2, #32
tst r2, #0x10
beq 5f
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0, :128]!
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
1: bge 2f
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2: movs ip, r2, lsl #31
itt mi
ldrbmi r3, [r1], #1
strbmi r3, [r0], #1
itttt cs
ldrbcs ip, [r1], #1
ldrbcs lr, [r1], #1
strbcs ip, [r0], #1
strbcs lr, [r0], #1
ldmfd sp!, {r0, lr}
bx lr
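
The shared copy body follows a conventional staging: peel bytes until the destination is 16-byte aligned, stream 64-byte blocks, then drain a 32-byte block, a 16-byte block, and finally the sub-16-byte tail. The same staging written as plain C, for orientation only; the real routine performs each stage with NEON loads/stores and pld hints rather than library calls:

#include <stdint.h>
#include <string.h>

// Structural sketch of the copy above: align, bulk-copy in 64-byte blocks,
// then progressively smaller tails.
static void* memcpy_staged_sketch(void* dst, const void* src, size_t n) {
    uint8_t* d = (uint8_t*)dst;
    const uint8_t* s = (const uint8_t*)src;

    if (n >= 16) {
        // Peel 0-15 bytes until the destination is 16-byte aligned.
        size_t head = (size_t)(-(uintptr_t)d & 0xF);
        memcpy(d, s, head);
        d += head; s += head; n -= head;

        while (n >= 64) { memcpy(d, s, 64); d += 64; s += 64; n -= 64; }  // main loop
        if (n >= 32)    { memcpy(d, s, 32); d += 32; s += 32; n -= 32; }
        if (n >= 16)    { memcpy(d, s, 16); d += 16; s += 16; n -= 16; }
    }
    memcpy(d, s, n);  // 0-15 byte tail
    return dst;
}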


@ -28,6 +28,7 @@
#include <machine/cpu-features.h>
#include <machine/asm.h>
+#include "libc_events.h"
/*
* This code assumes it is running on a processor that supports all arm v7
@ -37,15 +38,45 @@
.fpu neon
+ENTRY(__memset_chk)
+.cfi_startproc
+cmp r2, r3
+bls .L_done
+// Preserve lr for backtrace.
+push {lr}
+.cfi_def_cfa_offset 4
+.cfi_rel_offset lr, 0
+ldr r0, error_message
+ldr r1, error_code
+1:
+add r0, pc
+bl __fortify_chk_fail
+error_code:
+.word BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
+error_message:
+.word error_string-(1b+8)
+.cfi_endproc
+END(__memset_chk)
ENTRY(bzero)
+.cfi_startproc
mov r2, r1
mov r1, #0
+.L_done:
+// Fall through to memset...
+.cfi_endproc
END(bzero)
/* memset() returns its first argument. */
ENTRY(memset)
-.save {r0}
+.cfi_startproc
stmfd sp!, {r0}
+.cfi_def_cfa_offset 4
+.cfi_rel_offset r0, 0
vdup.8 q0, r1
@ -78,4 +109,9 @@ ENTRY(memset)
strcsb r1, [r0], #1
ldmfd sp!, {r0}
bx lr
+.cfi_endproc
END(memset)
+.data
+error_string:
+.string "memset buffer overflow"


@ -1,7 +1,11 @@
$(call libc-add-cpu-variant-src,MEMCPY,arch-arm/krait/bionic/memcpy.S)
$(call libc-add-cpu-variant-src,MEMSET,arch-arm/krait/bionic/memset.S)
$(call libc-add-cpu-variant-src,STRCMP,arch-arm/krait/bionic/strcmp.S)
+$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/krait/bionic/__strcat_chk.S)
+$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/krait/bionic/__strcpy_chk.S)
-# Use cortex-a15 version of strlen.
+# Use cortex-a15 versions of strcat/strcpy/strlen.
+$(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a15/bionic/strcat.S)
+$(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a15/bionic/strcpy.S)
$(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a15/bionic/strlen.S)
include bionic/libc/arch-arm/generic/generic.mk


@ -31,18 +31,17 @@
#include <string.h>
#include "libc_logging.h"
-char *
-__strrchr_chk(const char *p, int ch, size_t s_len)
+extern "C" char* __strrchr_chk(const char *p, int ch, size_t s_len)
{
char *save;
for (save = NULL;; ++p, s_len--) {
if (s_len == 0)
__fortify_chk_fail("strrchr read beyond buffer", 0);
if (*p == (char) ch)
save = (char *)p;
if (!*p)
return(save);
}
/* NOTREACHED */
}


@ -0,0 +1,48 @@
/*
* Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _LIBC_EVENTS_H
#define _LIBC_EVENTS_H
// This is going to be included in assembler code so only allow #define
// values instead of defining an enum.
#define BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW 80100
#define BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW 80105
#define BIONIC_EVENT_MEMMOVE_BUFFER_OVERFLOW 80110
#define BIONIC_EVENT_STRNCAT_BUFFER_OVERFLOW 80115
#define BIONIC_EVENT_STRNCPY_BUFFER_OVERFLOW 80120
#define BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW 80125
#define BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW 80130
#define BIONIC_EVENT_RESOLVER_OLD_RESPONSE 80300
#define BIONIC_EVENT_RESOLVER_WRONG_SERVER 80305
#define BIONIC_EVENT_RESOLVER_WRONG_QUERY 80310
#endif // _LIBC_EVENTS_H


@ -36,19 +36,7 @@
__BEGIN_DECLS
-enum {
-BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW = 80100,
-BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW = 80105,
-BIONIC_EVENT_MEMMOVE_BUFFER_OVERFLOW = 80110,
-BIONIC_EVENT_STRNCAT_BUFFER_OVERFLOW = 80115,
-BIONIC_EVENT_STRNCPY_BUFFER_OVERFLOW = 80120,
-BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW = 80125,
-BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW = 80130,
-BIONIC_EVENT_RESOLVER_OLD_RESPONSE = 80300,
-BIONIC_EVENT_RESOLVER_WRONG_SERVER = 80305,
-BIONIC_EVENT_RESOLVER_WRONG_QUERY = 80310,
-};
+#include "libc_events.h"
enum {
ANDROID_LOG_UNKNOWN = 0,