diff --git a/libc/arch-arm/cortex-a15/bionic/strcat.S b/libc/arch-arm/cortex-a15/bionic/strcat.S
index b174aa9cc..157cc9f1d 100644
--- a/libc/arch-arm/cortex-a15/bionic/strcat.S
+++ b/libc/arch-arm/cortex-a15/bionic/strcat.S
@@ -169,13 +169,20 @@ ENTRY(strcat)
 .L_strcpy_align_to_64:
     tst     r3, #4
     beq     .L_strcpy_check_src_align
-    ldr     r2, [r1], #4
-
-    sub     ip, r2, #0x01010101
-    bic     ip, ip, r2
-    ands    ip, ip, #0x80808080
-    bne     .L_strcpy_zero_in_first_register
-    str     r2, [r0], #4
+    // Read one byte at a time since we don't know the src alignment
+    // and we don't want to read into a different page.
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .L_strcpy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .L_strcpy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .L_strcpy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .L_strcpy_complete
 
 .L_strcpy_check_src_align:
     // At this point dst is aligned to a double word, check if src
diff --git a/libc/arch-arm/cortex-a15/bionic/string_copy.S b/libc/arch-arm/cortex-a15/bionic/string_copy.S
index 20f0e91b0..92d1c9804 100644
--- a/libc/arch-arm/cortex-a15/bionic/string_copy.S
+++ b/libc/arch-arm/cortex-a15/bionic/string_copy.S
@@ -149,13 +149,20 @@ ENTRY(strcpy)
 .Lstringcopy_align_to_64:
     tst     r3, #4
     beq     .Lstringcopy_check_src_align
-    ldr     r2, [r1], #4
-
-    sub     ip, r2, #0x01010101
-    bic     ip, ip, r2
-    ands    ip, ip, #0x80808080
-    bne     .Lstringcopy_zero_in_first_register
-    str     r2, [r0], #4
+    // Read one byte at a time since we don't have any idea about the alignment
+    // of the source and we don't want to read into a different page.
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
 
 .Lstringcopy_check_src_align:
     // At this point dst is aligned to a double word, check if src
diff --git a/libc/arch-arm/cortex-a9/bionic/strcat.S b/libc/arch-arm/cortex-a9/bionic/strcat.S
index f5a855e39..9077a7449 100644
--- a/libc/arch-arm/cortex-a9/bionic/strcat.S
+++ b/libc/arch-arm/cortex-a9/bionic/strcat.S
@@ -70,7 +70,7 @@
 
     .macro m_scan_byte
     ldrb    r3, [r0]
-    cbz     r3, strcat_r0_scan_done
+    cbz     r3, .Lstrcat_r0_scan_done
     add     r0, #1
     .endm // m_scan_byte
 
@@ -84,10 +84,10 @@ ENTRY(strcat)
     // Quick check to see if src is empty.
     ldrb        r2, [r1]
     pld         [r1, #0]
-    cbnz        r2, strcat_continue
+    cbnz        r2, .Lstrcat_continue
     bx          lr
 
-strcat_continue:
+.Lstrcat_continue:
     // To speed up really small dst strings, unroll checking the first 4 bytes.
     m_push
     m_scan_byte
@@ -96,10 +96,10 @@ strcat_continue:
     m_scan_byte
 
     ands    r3, r0, #7
-    bne     strcat_align_src
+    bne     .Lstrcat_align_src
 
     .p2align 2
-strcat_mainloop:
+.Lstrcat_mainloop:
     ldmia   r0!, {r2, r3}
 
     pld     [r0, #64]
@@ -107,28 +107,28 @@ strcat_mainloop:
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcat_zero_in_first_register
+    bne     .Lstrcat_zero_in_first_register
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcat_zero_in_second_register
-    b       strcat_mainloop
+    bne     .Lstrcat_zero_in_second_register
+    b       .Lstrcat_mainloop
 
-strcat_zero_in_first_register:
+.Lstrcat_zero_in_first_register:
     sub     r0, r0, #4
 
-strcat_zero_in_second_register:
+.Lstrcat_zero_in_second_register:
     // Check for zero in byte 0.
     tst     ip, #0x80
     it      ne
     subne   r0, r0, #4
-    bne     strcat_r0_scan_done
+    bne     .Lstrcat_r0_scan_done
     // Check for zero in byte 1.
     tst     ip, #0x8000
     it      ne
     subne   r0, r0, #3
-    bne     strcat_r0_scan_done
+    bne     .Lstrcat_r0_scan_done
     // Check for zero in byte 2.
     tst     ip, #0x800000
     it      ne
@@ -137,33 +137,33 @@ strcat_zero_in_second_register:
     // Zero is in byte 3.
     subeq   r0, r0, #1
 
-strcat_r0_scan_done:
+.Lstrcat_r0_scan_done:
     // Unroll the first 8 bytes that will be copied.
-    m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
+    m_copy_byte reg=r2, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r3, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r4, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r5, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r2, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r3, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r4, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r5, cmd=cbnz, label=.Lstrcpy_continue
 
-strcpy_finish:
+.Lstrcpy_finish:
     m_ret   inst=pop
 
-strcpy_continue:
+.Lstrcpy_continue:
     pld     [r1, #0]
     ands    r3, r0, #7
-    bne     strcpy_align_dst
+    bne     .Lstrcpy_align_dst
 
-strcpy_check_src_align:
+.Lstrcpy_check_src_align:
     // At this point dst is aligned to a double word, check if src
     // is also aligned to a double word.
     ands    r3, r1, #7
-    bne     strcpy_unaligned_copy
+    bne     .Lstrcpy_unaligned_copy
 
     .p2align 2
-strcpy_mainloop:
+.Lstrcpy_mainloop:
     ldmia   r1!, {r2, r3}
 
     pld     [r1, #64]
@@ -171,17 +171,17 @@ strcpy_mainloop:
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_mainloop
+    b       .Lstrcpy_mainloop
 
-strcpy_zero_in_first_register:
+.Lstrcpy_zero_in_first_register:
     lsls    lr, ip, #17
     itt     ne
     strbne  r2, [r0]
@@ -198,7 +198,7 @@ strcpy_zero_in_first_register:
     strb    r3, [r0]
     m_ret   inst=pop
 
-strcpy_zero_in_second_register:
+.Lstrcpy_zero_in_second_register:
     lsls    lr, ip, #17
     ittt    ne
     stmiane r0!, {r2}
@@ -218,18 +218,18 @@ strcpy_zero_in_second_register:
     strb    r4, [r0]
     m_ret   inst=pop
 
-strcpy_align_dst:
+.Lstrcpy_align_dst:
     // Align to a double word (64 bits).
     rsb     r3, r3, #8
     lsls    ip, r3, #31
-    beq     strcpy_align_to_32
+    beq     .Lstrcpy_align_to_32
 
     ldrb    r2, [r1], #1
     strb    r2, [r0], #1
-    cbz     r2, strcpy_complete
+    cbz     r2, .Lstrcpy_complete
 
-strcpy_align_to_32:
-    bcc     strcpy_align_to_64
+.Lstrcpy_align_to_32:
+    bcc     .Lstrcpy_align_to_64
 
     ldrb    r4, [r1], #1
     strb    r4, [r0], #1
@@ -242,76 +242,83 @@ strcpy_align_to_32:
     it      eq
     m_ret   inst=popeq
 
-strcpy_align_to_64:
+.Lstrcpy_align_to_64:
     tst     r3, #4
-    beq     strcpy_check_src_align
-    ldr     r2, [r1], #4
+    beq     .Lstrcpy_check_src_align
+    // Read one byte at a time since we don't know the src alignment
+    // and we don't want to read into a different page.
+    ldrb    r4, [r1], #1
+    strb    r4, [r0], #1
+    cbz     r4, .Lstrcpy_complete
+    ldrb    r5, [r1], #1
+    strb    r5, [r0], #1
+    cbz     r5, .Lstrcpy_complete
+    ldrb    r4, [r1], #1
+    strb    r4, [r0], #1
+    cbz     r4, .Lstrcpy_complete
+    ldrb    r5, [r1], #1
+    strb    r5, [r0], #1
+    cbz     r5, .Lstrcpy_complete
+    b       .Lstrcpy_check_src_align
 
-    sub     ip, r2, #0x01010101
-    bic     ip, ip, r2
-    ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
-    stmia   r0!, {r2}
-    b       strcpy_check_src_align
-
-strcpy_complete:
+.Lstrcpy_complete:
     m_ret   inst=pop
 
-strcpy_unaligned_copy:
+.Lstrcpy_unaligned_copy:
     // Dst is aligned to a double word, while src is at an unknown alignment.
     // There are 7 different versions of the unaligned copy code
     // to prevent overreading the src. The mainloop of every single version
     // will store 64 bits per loop. The difference is how much of src can
     // be read without potentially crossing a page boundary.
     tbb     [pc, r3]
-strcpy_unaligned_branchtable:
+.Lstrcpy_unaligned_branchtable:
     .byte 0
-    .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign7 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign6 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign5 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign4 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign3 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign2 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign1 - .Lstrcpy_unaligned_branchtable)/2)
 
     .p2align 2
     // Can read 7 bytes before possibly crossing a page.
-strcpy_unalign7:
+.Lstrcpy_unalign7:
     ldr     r2, [r1], #4
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     ldrb    r3, [r1]
-    cbz     r3, strcpy_unalign7_copy5bytes
+    cbz     r3, .Lstrcpy_unalign7_copy5bytes
     ldrb    r4, [r1, #1]
-    cbz     r4, strcpy_unalign7_copy6bytes
+    cbz     r4, .Lstrcpy_unalign7_copy6bytes
     ldrb    r5, [r1, #2]
-    cbz     r5, strcpy_unalign7_copy7bytes
+    cbz     r5, .Lstrcpy_unalign7_copy7bytes
 
     ldr     r3, [r1], #4
     pld     [r1, #64]
 
     lsrs    ip, r3, #24
     stmia   r0!, {r2, r3}
-    beq     strcpy_unalign_return
-    b       strcpy_unalign7
+    beq     .Lstrcpy_unalign_return
+    b       .Lstrcpy_unalign7
 
-strcpy_unalign7_copy5bytes:
+.Lstrcpy_unalign7_copy5bytes:
     stmia   r0!, {r2}
     strb    r3, [r0]
-strcpy_unalign_return:
+.Lstrcpy_unalign_return:
     m_ret   inst=pop
 
-strcpy_unalign7_copy6bytes:
+.Lstrcpy_unalign7_copy6bytes:
     stmia   r0!, {r2}
     strb    r3, [r0], #1
     strb    r4, [r0], #1
     m_ret   inst=pop
 
-strcpy_unalign7_copy7bytes:
+.Lstrcpy_unalign7_copy7bytes:
     stmia   r0!, {r2}
     strb    r3, [r0], #1
     strb    r4, [r0], #1
@@ -320,30 +327,30 @@ strcpy_unalign7_copy7bytes:
 
     .p2align 2
     // Can read 6 bytes before possibly crossing a page.
-strcpy_unalign6:
+.Lstrcpy_unalign6:
     ldr     r2, [r1], #4
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     ldrb    r4, [r1]
-    cbz     r4, strcpy_unalign_copy5bytes
+    cbz     r4, .Lstrcpy_unalign_copy5bytes
     ldrb    r5, [r1, #1]
-    cbz     r5, strcpy_unalign_copy6bytes
+    cbz     r5, .Lstrcpy_unalign_copy6bytes
 
     ldr     r3, [r1], #4
     pld     [r1, #64]
 
     tst     r3, #0xff0000
-    beq     strcpy_unalign6_copy7bytes
+    beq     .Lstrcpy_unalign6_copy7bytes
     lsrs    ip, r3, #24
     stmia   r0!, {r2, r3}
-    beq     strcpy_unalign_return
-    b       strcpy_unalign6
+    beq     .Lstrcpy_unalign_return
+    b       .Lstrcpy_unalign6
 
-strcpy_unalign6_copy7bytes:
+.Lstrcpy_unalign6_copy7bytes:
     stmia   r0!, {r2}
     strh    r3, [r0], #2
     lsr     r3, #16
@@ -352,16 +359,16 @@ strcpy_unalign6_copy7bytes:
 
     .p2align 2
     // Can read 5 bytes before possibly crossing a page.
-strcpy_unalign5:
+.Lstrcpy_unalign5:
     ldr     r2, [r1], #4
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     ldrb    r4, [r1]
-    cbz     r4, strcpy_unalign_copy5bytes
+    cbz     r4, .Lstrcpy_unalign_copy5bytes
 
     ldr     r3, [r1], #4
 
@@ -370,17 +377,17 @@ strcpy_unalign5:
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_unalign5
+    b       .Lstrcpy_unalign5
 
-strcpy_unalign_copy5bytes:
+.Lstrcpy_unalign_copy5bytes:
     stmia   r0!, {r2}
     strb    r4, [r0]
     m_ret   inst=pop
 
-strcpy_unalign_copy6bytes:
+.Lstrcpy_unalign_copy6bytes:
     stmia   r0!, {r2}
     strb    r4, [r0], #1
     strb    r5, [r0]
@@ -388,13 +395,13 @@ strcpy_unalign_copy6bytes:
 
     .p2align 2
     // Can read 4 bytes before possibly crossing a page.
-strcpy_unalign4:
+.Lstrcpy_unalign4:
     ldmia   r1!, {r2}
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     ldmia   r1!, {r3}
     pld     [r1, #64]
@@ -402,20 +409,20 @@ strcpy_unalign4:
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_unalign4
+    b       .Lstrcpy_unalign4
 
     .p2align 2
     // Can read 3 bytes before possibly crossing a page.
-strcpy_unalign3:
+.Lstrcpy_unalign3:
     ldrb    r2, [r1]
-    cbz     r2, strcpy_unalign3_copy1byte
+    cbz     r2, .Lstrcpy_unalign3_copy1byte
     ldrb    r3, [r1, #1]
-    cbz     r3, strcpy_unalign3_copy2bytes
+    cbz     r3, .Lstrcpy_unalign3_copy2bytes
     ldrb    r4, [r1, #2]
-    cbz     r4, strcpy_unalign3_copy3bytes
+    cbz     r4, .Lstrcpy_unalign3_copy3bytes
 
     ldr     r2, [r1], #4
     ldr     r3, [r1], #4
@@ -423,26 +430,26 @@ strcpy_unalign3:
     pld     [r1, #64]
 
     lsrs    lr, r2, #24
-    beq     strcpy_unalign_copy4bytes
+    beq     .Lstrcpy_unalign_copy4bytes
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_unalign3
+    b       .Lstrcpy_unalign3
 
-strcpy_unalign3_copy1byte:
+.Lstrcpy_unalign3_copy1byte:
     strb    r2, [r0]
     m_ret   inst=pop
 
-strcpy_unalign3_copy2bytes:
+.Lstrcpy_unalign3_copy2bytes:
     strb    r2, [r0], #1
     strb    r3, [r0]
     m_ret   inst=pop
 
-strcpy_unalign3_copy3bytes:
+.Lstrcpy_unalign3_copy3bytes:
     strb    r2, [r0], #1
     strb    r3, [r0], #1
     strb    r4, [r0]
@@ -450,34 +457,34 @@ strcpy_unalign3_copy3bytes:
 
     .p2align 2
     // Can read 2 bytes before possibly crossing a page.
-strcpy_unalign2:
+.Lstrcpy_unalign2:
     ldrb    r2, [r1]
-    cbz     r2, strcpy_unalign_copy1byte
+    cbz     r2, .Lstrcpy_unalign_copy1byte
     ldrb    r3, [r1, #1]
-    cbz     r3, strcpy_unalign_copy2bytes
+    cbz     r3, .Lstrcpy_unalign_copy2bytes
 
     ldr     r2, [r1], #4
     ldr     r3, [r1], #4
     pld     [r1, #64]
 
     tst     r2, #0xff0000
-    beq     strcpy_unalign_copy3bytes
+    beq     .Lstrcpy_unalign_copy3bytes
     lsrs    ip, r2, #24
-    beq     strcpy_unalign_copy4bytes
+    beq     .Lstrcpy_unalign_copy4bytes
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_unalign2
+    b       .Lstrcpy_unalign2
 
     .p2align 2
     // Can read 1 byte before possibly crossing a page.
-strcpy_unalign1:
+.Lstrcpy_unalign1:
     ldrb    r2, [r1]
-    cbz     r2, strcpy_unalign_copy1byte
+    cbz     r2, .Lstrcpy_unalign_copy1byte
 
     ldr     r2, [r1], #4
     ldr     r3, [r1], #4
@@ -487,62 +494,62 @@ strcpy_unalign1:
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_unalign1
+    b       .Lstrcpy_unalign1
 
-strcpy_unalign_copy1byte:
+.Lstrcpy_unalign_copy1byte:
     strb    r2, [r0]
     m_ret   inst=pop
 
-strcpy_unalign_copy2bytes:
+.Lstrcpy_unalign_copy2bytes:
     strb    r2, [r0], #1
     strb    r3, [r0]
     m_ret   inst=pop
 
-strcpy_unalign_copy3bytes:
+.Lstrcpy_unalign_copy3bytes:
     strh    r2, [r0], #2
     lsr     r2, #16
     strb    r2, [r0]
     m_ret   inst=pop
 
-strcpy_unalign_copy4bytes:
+.Lstrcpy_unalign_copy4bytes:
     stmia   r0, {r2}
     m_ret   inst=pop
 
-strcat_align_src:
+.Lstrcat_align_src:
     // Align to a double word (64 bits).
     rsb     r3, r3, #8
     lsls    ip, r3, #31
-    beq     strcat_align_to_32
+    beq     .Lstrcat_align_to_32
     ldrb    r2, [r0], #1
-    cbz     r2, strcat_r0_update
+    cbz     r2, .Lstrcat_r0_update
 
-strcat_align_to_32:
-    bcc     strcat_align_to_64
+.Lstrcat_align_to_32:
+    bcc     .Lstrcat_align_to_64
     ldrb    r2, [r0], #1
-    cbz     r2, strcat_r0_update
+    cbz     r2, .Lstrcat_r0_update
     ldrb    r2, [r0], #1
-    cbz     r2, strcat_r0_update
+    cbz     r2, .Lstrcat_r0_update
 
-strcat_align_to_64:
+.Lstrcat_align_to_64:
     tst     r3, #4
-    beq     strcat_mainloop
+    beq     .Lstrcat_mainloop
     ldr     r3, [r0], #4
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcat_zero_in_second_register
-    b       strcat_mainloop
+    bne     .Lstrcat_zero_in_second_register
+    b       .Lstrcat_mainloop
 
-strcat_r0_update:
+.Lstrcat_r0_update:
     sub     r0, r0, #1
-    b strcat_r0_scan_done
+    b .Lstrcat_r0_scan_done
 END(strcat)
diff --git a/libc/arch-arm/cortex-a9/bionic/string_copy.S b/libc/arch-arm/cortex-a9/bionic/string_copy.S
index caf5a11fe..642db0f19 100644
--- a/libc/arch-arm/cortex-a9/bionic/string_copy.S
+++ b/libc/arch-arm/cortex-a9/bionic/string_copy.S
@@ -244,13 +244,20 @@ ENTRY(strcpy)
 .Lstringcopy_align_to_64:
     tst     r3, #4
     beq     .Lstringcopy_check_src_align
-    ldr     r2, [r1], #4
-
-    sub     ip, r2, #0x01010101
-    bic     ip, ip, r2
-    ands    ip, ip, #0x80808080
-    bne     .Lstringcopy_zero_in_first_register
-    stmia   r0!, {r2}
+    // Read one byte at a time since we don't have any idea about the alignment
+    // of the source and we don't want to read into a different page.
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
     b       .Lstringcopy_check_src_align
 
 .Lstringcopy_complete:
diff --git a/tests/buffer_tests.cpp b/tests/buffer_tests.cpp
index 7d830bb16..a5a0c2a16 100644
--- a/tests/buffer_tests.cpp
+++ b/tests/buffer_tests.cpp
@@ -381,15 +381,19 @@ void RunSrcDstBufferOverreadTest(void (*test_func)(uint8_t*, uint8_t*, size_t))
   // Make the second page unreadable and unwritable.
   ASSERT_TRUE(mprotect(&memory[pagesize], pagesize, PROT_NONE) == 0);
 
-  uint8_t* dst = new uint8_t[pagesize];
-  for (size_t i = 0; i < pagesize; i++) {
-    uint8_t* src = &memory[pagesize-i];
+  uint8_t* dst_buffer = new uint8_t[2*pagesize];
+  // Change the dst alignment as we change the source.
+  for (size_t i = 0; i < 16; i++) {
+    uint8_t* dst = &dst_buffer[i];
+    for (size_t j = 0; j < pagesize; j++) {
+      uint8_t* src = &memory[pagesize-j];
 
-    test_func(src, dst, i);
+      test_func(src, dst, j);
+    }
   }
   ASSERT_TRUE(mprotect(&memory[pagesize], pagesize, PROT_READ | PROT_WRITE) == 0);
   free(memory);
-  delete[] dst;
+  delete[] dst_buffer;
 }
 
 void RunCmpBufferOverreadTest(
diff --git a/tests/string_test.cpp b/tests/string_test.cpp
index 80190d7d7..842e1c7ee 100644
--- a/tests/string_test.cpp
+++ b/tests/string_test.cpp
@@ -1174,7 +1174,7 @@ static size_t LargeSetIncrement(size_t len) {
   return 1;
 }
 
-#define STRCAT_DST_LEN  128
+#define STRCAT_DST_LEN  64
 
 static void DoStrcatTest(uint8_t* src, uint8_t* dst, size_t len) {
   if (len >= 1) {
@@ -1189,7 +1189,7 @@ static void DoStrcatTest(uint8_t* src, uint8_t* dst, size_t len) {
       int value2 = 32 + (value + 2) % 96;
       memset(cmp_buf, value2, sizeof(cmp_buf));
 
-      for (size_t i = 1; i <= STRCAT_DST_LEN; i++) {
+      for (size_t i = 1; i <= STRCAT_DST_LEN;) {
         memset(dst, value2, i-1);
         memset(dst+i-1, 0, len-i);
         src[len-i] = '\0';
@@ -1197,6 +1197,13 @@ static void DoStrcatTest(uint8_t* src, uint8_t* dst, size_t len) {
                                                          reinterpret_cast<char*>(src))));
         ASSERT_TRUE(memcmp(dst, cmp_buf, i-1) == 0);
         ASSERT_TRUE(memcmp(src, dst+i-1, len-i+1) == 0);
+        // This is an expensive loop, so don't loop through every value,
+        // get to a certain size and then start doubling.
+        if (i < 16) {
+          i++;
+        } else {
+          i <<= 1;
+        }
       }
     } else {
       dst[0] = '\0';
@@ -1229,7 +1236,7 @@ static void DoStrlcatTest(uint8_t* src, uint8_t* dst, size_t len) {
       int value2 = 32 + (value + 2) % 96;
       memset(cmp_buf, value2, sizeof(cmp_buf));
 
-      for (size_t i = 1; i <= STRCAT_DST_LEN; i++) {
+      for (size_t i = 1; i <= STRCAT_DST_LEN;) {
         memset(dst, value2, i-1);
         memset(dst+i-1, 0, len-i);
         src[len-i] = '\0';
@@ -1237,6 +1244,13 @@ static void DoStrlcatTest(uint8_t* src, uint8_t* dst, size_t len) {
                                  reinterpret_cast<char*>(src), len));
         ASSERT_TRUE(memcmp(dst, cmp_buf, i-1) == 0);
         ASSERT_TRUE(memcmp(src, dst+i-1, len-i+1) == 0);
+        // This is an expensive loop, so don't loop through every value,
+        // get to a certain size and then start doubling.
+        if (i < 16) {
+          i++;
+        } else {
+          i <<= 1;
+        }
       }
     } else {
       dst[0] = '\0';