From 7123d4371a5e04337b1de5f8cdf6cdc1e08e9cad Mon Sep 17 00:00:00 2001
From: Christopher Ferris
Date: Fri, 17 Oct 2014 14:08:54 -0700
Subject: [PATCH] Fix generic __memcpy_chk implementation.

- Clean up the labels (add .L to make them local).
- Change to using cfi directives.
- Fix unwinding of the __memcpy_chk fail path.

Bug: 18033671
Change-Id: I12845f10c7ce5e6699c15c558bda64c83f6a392a
---
 libc/arch-arm/generic/bionic/memcpy.S | 74 +++++++++++++++------------
 1 file changed, 40 insertions(+), 34 deletions(-)

diff --git a/libc/arch-arm/generic/bionic/memcpy.S b/libc/arch-arm/generic/bionic/memcpy.S
index cd4a13d12..b0c79abf7 100644
--- a/libc/arch-arm/generic/bionic/memcpy.S
+++ b/libc/arch-arm/generic/bionic/memcpy.S
@@ -39,7 +39,7 @@
 
 ENTRY(__memcpy_chk)
         cmp         r2, r3
-        bgt         fortify_check_failed
+        bhi         __memcpy_chk_fail
 
         // Fall through to memcpy...
 END(__memcpy_chk)
@@ -49,11 +49,14 @@ ENTRY(memcpy)
         /* The stack must always be 64-bits aligned to be compliant with the
          * ARM ABI. Since we have to save R0, we might as well save R4
          * which we can use for better pipelining of the reads below
          */
-        .save       {r0, r4, lr}
         stmfd       sp!, {r0, r4, lr}
+        .cfi_def_cfa_offset 12
+        .cfi_rel_offset r0, 0
+        .cfi_rel_offset r4, 4
+        .cfi_rel_offset lr, 8
         /* Making room for r5-r11 which will be spilled later */
-        .pad        #28
         sub         sp, sp, #28
+        .cfi_adjust_cfa_offset 28
         // preload the destination because we'll align it to a cache line
         // with small writes. Also start the source "pump".
@@ -63,14 +66,14 @@ ENTRY(memcpy)
 
         /* it simplifies things to take care of len<4 early */
         cmp         r2, #4
-        blo         copy_last_3_and_return
+        blo         .Lcopy_last_3_and_return
 
         /* compute the offset to align the source
          * offset = (4-(src&3))&3 = -src & 3
          */
         rsb         r3, r1, #0
         ands        r3, r3, #3
-        beq         src_aligned
+        beq         .Lsrc_aligned
 
         /* align source to 32 bits. We need to insert 2 instructions between
          * a ldr[b|h] and str[b|h] because byte and half-word instructions
@@ -85,12 +88,12 @@ ENTRY(memcpy)
         strcsb      r4, [r0], #1
         strcsb      r12,[r0], #1
 
-src_aligned:
+.Lsrc_aligned:
         /* see if src and dst are aligned together (congruent) */
         eor         r12, r0, r1
         tst         r12, #3
-        bne         non_congruent
+        bne         .Lnon_congruent
         /* Use post-incriment mode for stm to spill r5-r11 to reserved stack
          * frame. Don't update sp.
          */
         stmea       sp, {r5-r11}
@@ -100,7 +103,7 @@ src_aligned:
         /* align the destination to a cache-line */
         rsb         r3, r0, #0
         ands        r3, r3, #0x1C
-        beq         congruent_aligned32
+        beq         .Lcongruent_aligned32
         cmp         r3, r2
         andhi       r3, r2, #0x1C
 
@@ -115,14 +118,14 @@ src_aligned:
         strne       r10,[r0], #4
         sub         r2, r2, r3
 
-congruent_aligned32:
+.Lcongruent_aligned32:
         /*
          * here source is aligned to 32 bytes.
          */
 
-cached_aligned32:
+.Lcached_aligned32:
         subs        r2, r2, #32
-        blo         less_than_32_left
+        blo         .Lless_than_32_left
 
         /*
          * We preload a cache-line up to 64 bytes ahead. On the 926, this will
@@ -160,10 +163,7 @@ cached_aligned32:
 
 
         add         r2, r2, #32
-
-
-
-less_than_32_left:
+.Lless_than_32_left:
         /*
          * less than 32 bytes left at this point (length in r2)
          */
@@ -197,7 +197,7 @@ less_than_32_left:
 
 /********************************************************************/
 
-non_congruent:
+.Lnon_congruent:
         /*
          * here source is aligned to 4 bytes
         * but destination is not.
@@ -207,9 +207,9 @@ non_congruent:
          * partial words in the shift queue)
          */
         cmp         r2, #4
-        blo         copy_last_3_and_return
+        blo         .Lcopy_last_3_and_return
 
-        /* Use post-incriment mode for stm to spill r5-r11 to reserved stack
+        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}
@@ -236,7 +236,7 @@ non_congruent:
 
         movcs       r3, r3, lsr #8
         cmp         r2, #4
-        blo         partial_word_tail
+        blo         .Lpartial_word_tail
 
         /* Align destination to 32 bytes (cache line boundary) */
 1:      tst         r0, #0x1c
@@ -248,11 +248,11 @@ non_congruent:
         str         r4, [r0], #4
         cmp         r2, #4
         bhs         1b
-        blo         partial_word_tail
+        blo         .Lpartial_word_tail
 
         /* copy 32 bytes at a time */
 2:      subs        r2, r2, #32
-        blo         less_than_thirtytwo
+        blo         .Lless_than_thirtytwo
 
         /* Use immediate mode for the shifts, because there is an extra cycle
          * for register shifts, which could account for up to 50% of
@@ -260,11 +260,11 @@ non_congruent:
          */
         cmp         r12, #24
-        beq         loop24
+        beq         .Lloop24
         cmp         r12, #8
-        beq         loop8
+        beq         .Lloop8
 
 
-loop16:
+.Lloop16:
         ldr         r12, [r1], #4
 1:      mov         r4, r12
         ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
@@ -289,9 +289,9 @@ loop16:
         stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
         mov         r3, r11, lsr #16
         bhs         1b
-        b           less_than_thirtytwo
+        b           .Lless_than_thirtytwo
 
-loop8:
+.Lloop8:
         ldr         r12, [r1], #4
 1:      mov         r4, r12
         ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
@@ -316,9 +316,9 @@ loop8:
         stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
         mov         r3, r11, lsr #8
         bhs         1b
-        b           less_than_thirtytwo
+        b           .Lless_than_thirtytwo
 
-loop24:
+.Lloop24:
         ldr         r12, [r1], #4
 1:      mov         r4, r12
         ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
@@ -345,12 +345,12 @@ loop24:
         stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
         bhs         1b
 
-less_than_thirtytwo:
+.Lless_than_thirtytwo:
         /* copy the last 0 to 31 bytes of the source */
         rsb         r12, lr, #32        /* we corrupted r12, recompute it */
         add         r2, r2, #32
         cmp         r2, #4
-        blo         partial_word_tail
+        blo         .Lpartial_word_tail
 
 1:      ldr         r5, [r1], #4
         sub         r2, r2, #4
@@ -360,7 +360,7 @@ less_than_thirtytwo:
         cmp         r2, #4
         bhs         1b
 
-partial_word_tail:
+.Lpartial_word_tail:
         /* we have a partial word in the input buffer */
         movs        r5, lr, lsl #(31-3)
         strmib      r3, [r0], #1
@@ -372,7 +372,7 @@ partial_word_tail:
         /* Refill spilled registers from the stack. Don't update sp. */
         ldmfd       sp, {r5-r11}
 
-copy_last_3_and_return:
+.Lcopy_last_3_and_return:
         movs        r2, r2, lsl #31  /* copy remaining 0, 1, 2 or 3 bytes */
         ldrmib      r2, [r1], #1
         ldrcsb      r3, [r1], #1
@@ -385,9 +385,15 @@ copy_last_3_and_return:
         add         sp,  sp, #28
         ldmfd       sp!, {r0, r4, lr}
         bx          lr
+END(memcpy)
 
         // Only reached when the __memcpy_chk check fails.
-fortify_check_failed:
+ENTRY_PRIVATE(__memcpy_chk_fail)
+        // Preserve lr for backtrace.
+        push        {lr}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset lr, 0
+
         ldr         r0, error_message
         ldr         r1, error_code
 1:
@@ -397,7 +403,7 @@ error_code:
         .word       BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
 error_message:
         .word       error_string-(1b+8)
-END(memcpy)
+END(__memcpy_chk_fail)
 
         .data
 error_string:
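
Reviewer note (illustrative, not part of the patch): __memcpy_chk is the FORTIFY entry point the
compiler emits in place of a plain memcpy() call when it can see the destination object's size, so
at the check above r0 = dst, r1 = src, r2 = count and r3 = dst_len. Switching the branch from bgt
(signed) to bhi (unsigned) makes the comparison match unsigned size_t lengths. A minimal C sketch
of that behaviour, under those assumptions (hypothetical helper name, not bionic's actual C code;
the real fail path reports the buffer-overflow event and error string before terminating, which the
sketch collapses into abort()):

    #include <stddef.h>
    #include <stdlib.h>
    #include <string.h>

    /* Rough equivalent of the assembly check: abort if the copy would
     * overflow the destination, otherwise fall through to memcpy(). */
    void* memcpy_chk_sketch(void* dst, const void* src,
                            size_t count, size_t dst_len) {
        if (count > dst_len) {           /* "cmp r2, r3; bhi __memcpy_chk_fail" */
            abort();                     /* stand-in for the fortify fail path */
        }
        return memcpy(dst, src, count);  /* "Fall through to memcpy..." */
    }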