am c2132915
: Merge "Add optimized version of memcmp for Cortex A9"
* commit 'c2132915158014f578c3f003c9399961fe8d6da2': Add optimized version of memcmp for Cortex A9
This commit is contained in:
commit
6181aead7b
@ -29,43 +29,92 @@
|
|||||||
#include <machine/cpu-features.h>
|
#include <machine/cpu-features.h>
|
||||||
#include <machine/asm.h>
|
#include <machine/asm.h>
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef HAVE_32_BYTE_CACHE_LINE
|
||||||
|
#define CACHE_LINE_SIZE 32
|
||||||
|
#else
|
||||||
|
#define CACHE_LINE_SIZE 64
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Optimized memcmp() for ARM9.
|
* Optimized memcmp() for Cortex-A9.
|
||||||
* This would not be optimal on XScale or ARM11, where more prefetching
|
|
||||||
* and use of PLD will be needed.
|
|
||||||
 * The 2 major optimizations here are
|
|
||||||
* (1) The main loop compares 16 bytes at a time
|
|
||||||
* (2) The loads are scheduled in a way they won't stall
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
ENTRY(memcmp)
|
ENTRY(memcmp)
|
||||||
PLD (r0, #0)
|
pld [r0, #(CACHE_LINE_SIZE * 0)]
|
||||||
PLD (r1, #0)
|
pld [r0, #(CACHE_LINE_SIZE * 1)]
|
||||||
|
|
||||||
/* take care of the case where length is 0 or the buffers are the same */
|
/* take care of the case where length is 0 or the buffers are the same */
|
||||||
cmp r0, r1
|
cmp r0, r1
|
||||||
cmpne r2, #0
|
|
||||||
moveq r0, #0
|
moveq r0, #0
|
||||||
bxeq lr
|
bxeq lr
|
||||||
|
|
||||||
|
pld [r1, #(CACHE_LINE_SIZE * 0)]
|
||||||
|
pld [r1, #(CACHE_LINE_SIZE * 1)]
|
||||||
|
|
||||||
|
/* make sure we have at least 8+4 bytes, this simplifies things below
|
||||||
|
 * and avoids some overhead for small blocks
|
||||||
|
*/
|
||||||
|
cmp r2, #(8+4)
|
||||||
|
bmi 10f
|
||||||
|
/*
|
||||||
|
* Neon optimization
|
||||||
|
* Comparing 32 bytes at a time
|
||||||
|
*/
|
||||||
|
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
|
||||||
|
subs r2, r2, #32
|
||||||
|
blo 3f
|
||||||
|
|
||||||
|
/* preload all the cache lines we need. */
|
||||||
|
pld [r0, #(CACHE_LINE_SIZE * 2)]
|
||||||
|
pld [r1, #(CACHE_LINE_SIZE * 2)]
|
||||||
|
|
||||||
|
1: /* The main loop compares 32 bytes at a time */
|
||||||
|
vld1.8 {d0 - d3}, [r0]!
|
||||||
|
pld [r0, #(CACHE_LINE_SIZE * 2)]
|
||||||
|
vld1.8 {d4 - d7}, [r1]!
|
||||||
|
pld [r1, #(CACHE_LINE_SIZE * 2)]
|
||||||
|
|
||||||
|
/* Start subtracting the values and merge results */
|
||||||
|
vsub.i8 q0, q2
|
||||||
|
vsub.i8 q1, q3
|
||||||
|
vorr q2, q0, q1
|
||||||
|
vorr d4, d5
|
||||||
|
vmov r3, ip, d4
|
||||||
|
/* Check if there are any differences among the 32 bytes */
|
||||||
|
orrs r3, ip
|
||||||
|
bne 2f
|
||||||
|
subs r2, r2, #32
|
||||||
|
bhs 1b
|
||||||
|
b 3f
|
||||||
|
2:
|
||||||
|
/* Check if the difference was in the first or last 16 bytes */
|
||||||
|
sub r0, #32
|
||||||
|
vorr d0, d1
|
||||||
|
sub r1, #32
|
||||||
|
vmov r3, ip, d0
|
||||||
|
orrs r3, ip
|
||||||
|
/* if the first 16 bytes are equal, we only have to rewind 16 bytes */
|
||||||
|
ittt eq
|
||||||
|
subeq r2, #16
|
||||||
|
addeq r0, #16
|
||||||
|
addeq r1, #16
|
||||||
|
|
||||||
|
3: /* fix-up the remaining count */
|
||||||
|
add r2, r2, #32
|
||||||
|
|
||||||
|
cmp r2, #(8+4)
|
||||||
|
bmi 10f
|
||||||
|
#endif
|
||||||
|
|
||||||
.save {r4, lr}
|
.save {r4, lr}
|
||||||
/* save registers */
|
/* save registers */
|
||||||
stmfd sp!, {r4, lr}
|
stmfd sp!, {r4, lr}
|
||||||
|
|
||||||
PLD (r0, #32)
|
|
||||||
PLD (r1, #32)
|
|
||||||
|
|
||||||
/* since r0 holds the result, move the first source
|
/* since r0 holds the result, move the first source
|
||||||
* pointer somewhere else
|
* pointer somewhere else
|
||||||
*/
|
*/
|
||||||
|
|
||||||
mov r4, r0
|
mov r4, r0
|
||||||
|
|
||||||
/* make sure we have at least 8+4 bytes, this simplifies things below
|
|
||||||
 * and avoids some overhead for small blocks
|
|
||||||
*/
|
|
||||||
cmp r2, #(8+4)
|
|
||||||
bmi 8f
|
|
||||||
|
|
||||||
/* align first pointer to word boundary
|
/* align first pointer to word boundary
|
||||||
* offset = -src & 3
|
* offset = -src & 3
|
||||||
@ -103,8 +152,8 @@ ENTRY(memcmp)
|
|||||||
subs r2, r2, #(32 + 4)
|
subs r2, r2, #(32 + 4)
|
||||||
bmi 1f
|
bmi 1f
|
||||||
|
|
||||||
0: PLD (r4, #64)
|
0: pld [r4, #(CACHE_LINE_SIZE * 2)]
|
||||||
PLD (r1, #64)
|
pld [r1, #(CACHE_LINE_SIZE * 2)]
|
||||||
ldr r0, [r4], #4
|
ldr r0, [r4], #4
|
||||||
ldr lr, [r1, #4]!
|
ldr lr, [r1, #4]!
|
||||||
eors r0, r0, ip
|
eors r0, r0, ip
|
||||||
@ -170,12 +219,24 @@ ENTRY(memcmp)
|
|||||||
9: /* restore registers and return */
|
9: /* restore registers and return */
|
||||||
ldmfd sp!, {r4, lr}
|
ldmfd sp!, {r4, lr}
|
||||||
bx lr
|
bx lr
|
||||||
|
|
||||||
|
10: /* process less than 12 bytes */
|
||||||
|
cmp r2, #0
|
||||||
|
moveq r0, #0
|
||||||
|
bxeq lr
|
||||||
|
mov r3, r0
|
||||||
|
11:
|
||||||
|
ldrb r0, [r3], #1
|
||||||
|
ldrb ip, [r1], #1
|
||||||
|
subs r0, ip
|
||||||
|
bxne lr
|
||||||
|
subs r2, r2, #1
|
||||||
|
bne 11b
|
||||||
|
bx lr
|
||||||
END(memcmp)
|
END(memcmp)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
5: /*************** non-congruent case ***************/
|
5: /*************** non-congruent case ***************/
|
||||||
and r0, r1, #3
|
and r0, r1, #3
|
||||||
cmp r0, #2
|
cmp r0, #2
|
||||||
@ -192,8 +253,8 @@ END(memcmp)
|
|||||||
bic r1, r1, #3
|
bic r1, r1, #3
|
||||||
ldr lr, [r1], #4
|
ldr lr, [r1], #4
|
||||||
|
|
||||||
6: PLD (r1, #64)
|
6: pld [r1, #(CACHE_LINE_SIZE * 2)]
|
||||||
PLD (r4, #64)
|
pld [r4, #(CACHE_LINE_SIZE * 2)]
|
||||||
mov ip, lr, lsr #16
|
mov ip, lr, lsr #16
|
||||||
ldr lr, [r1], #4
|
ldr lr, [r1], #4
|
||||||
ldr r0, [r4], #4
|
ldr r0, [r4], #4
|
||||||
|
Loading…
Reference in New Issue
Block a user