Move the generic arm memcmp.S into the generic directory.
Change-Id: I48e4d14a0dcddbb246edbac6d0329619574ab44d
libc/arch-arm/generic/bionic/memcmp.S (new file, 343 lines)

@@ -0,0 +1,343 @@
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>


#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */
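
/*
 * For reference, the routine implements the usual ISO C memcmp contract:
 * compare as unsigned bytes and return the difference between the first
 * mismatching pair (a rough C sketch only, not part of the build):
 *
 *     int memcmp(const void* a, const void* b, size_t n) {
 *         const unsigned char* p = a;
 *         const unsigned char* q = b;
 *         while (n--) {
 *             if (*p != *q)
 *                 return *p - *q;   // first mismatching byte decides
 *             p++, q++;
 *         }
 *         return 0;                 // all n bytes equal
 *     }
 */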

ENTRY(memcmp)
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes; this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         10f
        /*
         * NEON optimization
         * Comparing 32 bytes at a time
         */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
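
        /* Difference test strategy for the loop below: vsub.i8 leaves a
         * non-zero byte wherever the inputs differ, the two vorr steps fold
         * the 32 result bytes down to a single doubleword, and one orrs then
         * tests them all at once. This only yields equal/not-equal, so on a
         * mismatch the code rewinds and lets the scalar tail compute the
         * signed result. */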

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16

3:      /* fix up the remaining count */
        add         r2, r2, #32

        cmp         r2, #(8+4)
        bmi         10f
#endif

        /* save registers */
        stmfd       sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
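        /* e.g. if src ends in ...3, -src & 3 = 1: a single byte compare
         * below brings r4 up to the next word boundary */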
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */
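        /* The loop below is software-pipelined: ip and lr take turns holding
         * the rhs word fetched one step ahead, so each compare overlaps with
         * the next pair of loads. */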

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b
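
        /* every path that branches to 9f on a mismatch arrives with r0
         * already holding the non-zero byte difference, i.e. the result */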
9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

10:     /* process less than 12 bytes */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
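
        /* lr now holds the aligned word with the first two rhs bytes in its
         * top half; each compare word below is rebuilt as
         * (lr >> 16) | (next word << 16), relying on little-endian layout */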

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right shift amount (offset * 8)
        // r6 = left shift amount (32 - r5)
        // r7 = scratch (read-ahead word from r1)

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */
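
        /* e.g. for offset 1: r5 = 8, r6 = 24, and each rhs word is rebuilt
         * as (previous word >> 8) | (next word << 24) */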

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b
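
        /* the loop reads one rhs word ahead; r6 >> 3 = 4 - offset, so the
         * next instruction steps r1 back to the first byte not yet compared */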
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)

@@ -1,4 +1,5 @@
 libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memcmp.S \
     arch-arm/generic/bionic/memcpy.S \
     arch-arm/generic/bionic/memset.S \
     arch-arm/generic/bionic/strcmp.S \