bionic/libc/arch-x86_64/string/ssse3-strcmp-slm.S
Varvara Rainchik a020a244ae Add 64-bit Silvermont-optimized string/memory functions.
Add following functions:
bcopy, bzero, memcpy, memmove, memset, stpcpy, stpncpy, strcat, strcpy,
strlen, strncat, strncpy, memcmp, strcmp, strncmp.
Set all these functions as the default ones.

Change-Id: Ic66b250ad8c349a43d25e2d4dea075604f6df6ac
Signed-off-by: Varvara Rainchik <varvara.rainchik@intel.com>
2014-05-12 17:37:07 -07:00

1926 lines
41 KiB
ArmAsm

/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef USE_AS_STRNCMP
/*
* UPDATE_STRNCMP_COUNTER (strncmp build): charge the bytes compared in the
* first, partially used 16-byte block against the length counter %r11.
*
* The macro is expanded right after that first compare, while %rcx still
* holds the block offset that was used to shift the compare mask. Then
* 16 - %rcx bytes were compared and the new count is %r11 + %rcx - 16.
*
* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
* if the new counter > the old one (the subtraction wrapped) or is 0.
* Otherwise %r11 takes the new count. Clobbers %r9 and the flags.
*/
#define UPDATE_STRNCMP_COUNTER \
/* calculate number of bytes left to compare */ \
lea -16(%rcx, %r11), %r9; \
cmp %r9, %r11; \
jb L(strcmp_exitz); \
test %r9, %r9; \
je L(strcmp_exitz); \
mov %r9, %r11
#else
/* strcmp build: there is no length counter, so the update is empty. */
#define UPDATE_STRNCMP_COUNTER
/*
* Default symbol name. This default sits inside the non-strncmp branch,
* so a USE_AS_STRNCMP build has to define STRCMP itself.
*/
#ifndef STRCMP
#define STRCMP strcmp
#endif
#endif
/* L(label): assembler-local label name (.L prefix), used for every branch target. */
#ifndef L
# define L(label) .L##label
#endif
/* CFI wrappers, overridable by whoever includes or builds this file. */
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif
#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif
/*
* ENTRY(name): declare name as a global function symbol, align it to
* 16 bytes, emit its label and open the CFI region.
*/
#ifndef ENTRY
# define ENTRY(name) \
.type name, @function; \
.globl name; \
.p2align 4; \
name: \
cfi_startproc
#endif
/* END(name): close the CFI region and record the size of the symbol. */
#ifndef END
# define END(name) \
cfi_endproc; \
.size name, .-name
#endif
/* RETURN: plain near return. */
#define RETURN ret
.section .text.ssse3,"ax",@progbits
ENTRY (STRCMP)
/*
* This implementation uses SSE to compare up to 16 bytes at a time.
*
* Inputs as read by the code below (SysV AMD64 argument order assumed):
* %rdi, %rsi the two strings to compare
* %rdx length limit, read only in a USE_AS_STRNCMP build
*
* Entry fast path: when neither pointer is within the last 15 bytes of
* its 64-byte cache line, the first 16 bytes of both strings are loaded
* unaligned and compared at once. A load that stays inside one cache
* line also stays inside one page. Every other situation, and the
* continuation after 16 equal bytes, is handled from crosscache on.
*/
#ifdef USE_AS_STRNCMP
test %rdx, %rdx
je L(strcmp_exitz) /* length 0: nothing to compare */
cmp $1, %rdx
je L(Byte0) /* length 1: handled separately at Byte0 */
mov %rdx, %r11 /* %r11 = bytes left to compare */
#endif
mov %esi, %ecx
mov %edi, %eax
/* Use 64bit AND here to avoid long NOP padding. */
and $0x3f, %rcx /* rsi alignment in cache line */
and $0x3f, %rax /* rdi alignment in cache line */
cmp $0x30, %ecx
ja L(crosscache) /* rsi: 16-byte load will cross cache line */
cmp $0x30, %eax
ja L(crosscache) /* rdi: 16-byte load will cross cache line */
/* Unaligned 16-byte loads, built from two 8-byte halves each. */
movlpd (%rdi), %xmm1
movlpd (%rsi), %xmm2
movhpd 8(%rdi), %xmm1
movhpd 8(%rsi), %xmm2
pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm1, %edx /* one mask bit per byte */
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(less16bytes) /* If not, find different value or null char */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* finish comparison */
#endif
add $16, %rsi /* prepare to search next 16 bytes */
add $16, %rdi /* prepare to search next 16 bytes */
/* Execution continues into crosscache; %ecx and %eax are still valid
because adding 16 does not change the low four address bits. */
/*
* Determine source and destination string offsets from 16-byte alignment.
* Use relative offset difference between the two to determine which case
* below to use.
*
* State handed to the selected case:
* %rsi, %rdi rounded down to a 16-byte boundary and swapped if needed,
* so that %rsi is the pointer with the larger block offset
* %ecx, %eax offsets (0..15) of %rsi / %rdi inside their 16-byte blocks
* %edx 0xffff, the all-bytes-match mask before offset adjustment
* %r8d 0 when the operands are in original order, 0xffff when they
* were swapped (consumed by the exit code)
*/
.p2align 4
L(crosscache):
and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */
and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */
mov $0xffff, %edx /* for equivalent offset */
xor %r8d, %r8d /* assume no swap */
and $0xf, %ecx /* offset of rsi */
and $0xf, %eax /* offset of rdi */
cmp %eax, %ecx
je L(ashr_0) /* rsi and rdi relative offset same */
ja L(bigger) /* same flags: offset of rsi already the larger one */
mov %edx, %r8d /* r8d is offset flag for exit tail */
xchg %ecx, %eax
xchg %rsi, %rdi
L(bigger):
/* %r9 = 15 + %rax - %rcx, i.e. 0..14 because %rcx > %rax here. */
lea 15(%rax), %r9
sub %rcx, %r9
lea L(unaligned_table)(%rip), %r10
movslq (%r10, %r9,4), %r9 /* 32-bit table entry, sign-extended */
lea (%r10, %r9), %r10 /* entries are offsets from the table base */
jmp *%r10 /* jump to corresponding case */
/*
* ashr_0: both strings start at the same offset inside their 16-byte
* blocks, so aligned blocks can be compared directly without shifting.
*
* The following cases will be handled by ashr_0
* rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
* n(0~15)             n(0~15)             15(15 + n - n)   ashr_0
*/
.p2align 4
L(ashr_0):
movdqa (%rsi), %xmm1
pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm1, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx
/*
* %edx equals %r9d (difference 0) only if all 16 - %rcx bytes from the
* string start to the end of the block are equal and none of them is a
* null char.
*/
jne L(less32bytes) /* mismatch or null char */
UPDATE_STRNCMP_COUNTER
mov $16, %rcx /* index for loads */
mov $16, %r9
pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
/*
* Now both strings are aligned at 16-byte boundary. Loop over strings
* checking 32-bytes per iteration (two 16-byte steps). Aligned 16-byte
* loads never straddle a page, so no page check is needed here.
* %xmm0 stays all zero between steps because a step only continues when
* no null char was seen.
*/
.p2align 4
L(loop_ashr_0):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
/* Second copy of the same step (the loop is unrolled twice). */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
jmp L(loop_ashr_0)
/*
* ashr_1: the %rsi data starts 15 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 15). Every aligned %rsi block is
* compared with bytes 1..15 of one %rdi block followed by the first
* byte of the next %rdi block.
*
* The following cases will be handled by ashr_1
* rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
* n(15)               n - 15              0(15 + (n - 15) - n)  ashr_1
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_1):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $15, %xmm2 /* shift first string to align with second */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $1, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 1) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 1(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_1):
add $16, %r10
jg L(nibble_ashr_1) /* cross page boundary */
L(gobble_ashr_1):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_1) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_1)
/*
* Nibble avoids loads across page boundary. This is to avoid a potential
* access into unmapped memory.
*
* Reached when the next %rdi block would be the first block of a new 4K
* page. Bytes 1..15 of %xmm3 (the previous %rdi block) have not been
* compared yet; the new page may only be read if the comparison really
* continues past them.
*/
.p2align 4
L(nibble_ashr_1):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xfffe, %edx /* null char among bytes 1..15? */
jnz L(ashr_1_exittail) /* found null char */
#ifdef USE_AS_STRNCMP
/*
* 15 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 15) the next page is not needed and must not be touched.
* The previous limit of 14 let a count of exactly 15 read the next page.
*/
cmp $15, %r11
jbe L(ashr_1_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* subtract 4K from %r10 */
jmp L(gobble_ashr_1)
/*
* Once a null char is found (or the length limit ends inside the pending
* bytes), determine if there is a string mismatch before that point.
* %xmm0 (null char mask) and %xmm3 are shifted down by 1 byte so that the
* pending bytes line up with the %rsi block in %xmm1; the shared code at
* aftertail takes over from there.
*/
.p2align 4
L(ashr_1_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $1, %xmm0
psrldq $1, %xmm3
jmp L(aftertail)
/*
* ashr_2: the %rsi data starts 14 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 14). Every aligned %rsi block is
* compared with bytes 2..15 of one %rdi block followed by the first 2
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_2
* rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
* n(14~15)            n - 14              1(15 + (n - 14) - n)  ashr_2
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_2):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $14, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $2, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 2) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 2(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_2):
add $16, %r10
jg L(nibble_ashr_2) /* cross page boundary */
L(gobble_ashr_2):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_2) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_2)
/*
* nibble_ashr_2: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 2..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_2):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xfffc, %edx /* null char among bytes 2..15? */
jnz L(ashr_2_exittail)
#ifdef USE_AS_STRNCMP
/*
* 14 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 14) the next page is not needed and must not be touched.
* The previous limit of 13 let a count of exactly 14 read the next page.
*/
cmp $14, %r11
jbe L(ashr_2_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_2)
/*
* ashr_2_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 2 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_2_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $2, %xmm0
psrldq $2, %xmm3
jmp L(aftertail)
/*
* ashr_3: the %rsi data starts 13 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 13). Every aligned %rsi block is
* compared with bytes 3..15 of one %rdi block followed by the first 3
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_3
* rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
* n(13~15)            n - 13              2(15 + (n - 13) - n)  ashr_3
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_3):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $13, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $3, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 3) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 3(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_3):
add $16, %r10
jg L(nibble_ashr_3) /* cross page boundary */
L(gobble_ashr_3):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_3) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_3)
/*
* nibble_ashr_3: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 3..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_3):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xfff8, %edx /* null char among bytes 3..15? */
jnz L(ashr_3_exittail)
#ifdef USE_AS_STRNCMP
/*
* 13 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 13) the next page is not needed and must not be touched.
* The previous limit of 12 let a count of exactly 13 read the next page.
*/
cmp $13, %r11
jbe L(ashr_3_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_3)
/*
* ashr_3_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 3 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_3_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $3, %xmm0
psrldq $3, %xmm3
jmp L(aftertail)
/*
* ashr_4: the %rsi data starts 12 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 12). Every aligned %rsi block is
* compared with bytes 4..15 of one %rdi block followed by the first 4
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_4
* rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
* n(12~15)            n - 12              3(15 + (n - 12) - n)  ashr_4
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_4):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $12, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $4, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 4) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 4(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_4):
add $16, %r10
jg L(nibble_ashr_4) /* cross page boundary */
L(gobble_ashr_4):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_4) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_4)
/*
* nibble_ashr_4: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 4..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_4):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xfff0, %edx /* null char among bytes 4..15? */
jnz L(ashr_4_exittail)
#ifdef USE_AS_STRNCMP
/*
* 12 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 12) the next page is not needed and must not be touched.
* The previous limit of 11 let a count of exactly 12 read the next page.
*/
cmp $12, %r11
jbe L(ashr_4_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_4)
/*
* ashr_4_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 4 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_4_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $4, %xmm0
psrldq $4, %xmm3
jmp L(aftertail)
/*
* ashr_5: the %rsi data starts 11 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 11). Every aligned %rsi block is
* compared with bytes 5..15 of one %rdi block followed by the first 5
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_5
* rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
* n(11~15)            n - 11              4(15 + (n - 11) - n)  ashr_5
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_5):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $11, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $5, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 5) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 5(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_5):
add $16, %r10
jg L(nibble_ashr_5) /* cross page boundary */
L(gobble_ashr_5):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_5) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_5)
/*
* nibble_ashr_5: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 5..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_5):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xffe0, %edx /* null char among bytes 5..15? */
jnz L(ashr_5_exittail)
#ifdef USE_AS_STRNCMP
/*
* 11 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 11) the next page is not needed and must not be touched.
* The previous limit of 10 let a count of exactly 11 read the next page.
*/
cmp $11, %r11
jbe L(ashr_5_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_5)
/*
* ashr_5_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 5 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_5_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $5, %xmm0
psrldq $5, %xmm3
jmp L(aftertail)
/*
* ashr_6: the %rsi data starts 10 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 10). Every aligned %rsi block is
* compared with bytes 6..15 of one %rdi block followed by the first 6
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_6
* rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
* n(10~15)            n - 10              5(15 + (n - 10) - n)  ashr_6
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_6):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $10, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $6, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 6) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 6(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_6):
add $16, %r10
jg L(nibble_ashr_6) /* cross page boundary */
L(gobble_ashr_6):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_6) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_6)
/*
* nibble_ashr_6: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 6..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_6):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xffc0, %edx /* null char among bytes 6..15? */
jnz L(ashr_6_exittail)
#ifdef USE_AS_STRNCMP
/*
* 10 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 10) the next page is not needed and must not be touched.
* The previous limit of 9 let a count of exactly 10 read the next page.
*/
cmp $10, %r11
jbe L(ashr_6_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_6)
/*
* ashr_6_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 6 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_6_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $6, %xmm0
psrldq $6, %xmm3
jmp L(aftertail)
/*
* ashr_7: the %rsi data starts 9 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 9). Every aligned %rsi block is
* compared with bytes 7..15 of one %rdi block followed by the first 7
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_7
* rcx(offset of rsi)  rax(offset of rdi)  relative offset      corresponding case
* n(9~15)             n - 9               6(15 + (n - 9) - n)  ashr_7
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_7):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $9, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $7, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 7) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 7(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_7):
add $16, %r10
jg L(nibble_ashr_7) /* cross page boundary */
L(gobble_ashr_7):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_7) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_7)
/*
* nibble_ashr_7: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 7..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_7):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xff80, %edx /* null char among bytes 7..15? */
jnz L(ashr_7_exittail)
#ifdef USE_AS_STRNCMP
/*
* 9 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 9) the next page is not needed and must not be touched.
* The previous limit of 8 let a count of exactly 9 read the next page.
*/
cmp $9, %r11
jbe L(ashr_7_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_7)
/*
* ashr_7_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 7 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_7_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $7, %xmm0
psrldq $7, %xmm3
jmp L(aftertail)
/*
* ashr_8: the %rsi data starts 8 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 8). Every aligned %rsi block is
* compared with bytes 8..15 of one %rdi block followed by the first 8
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_8
* rcx(offset of rsi)  rax(offset of rdi)  relative offset      corresponding case
* n(8~15)             n - 8               7(15 + (n - 8) - n)  ashr_8
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_8):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $8, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $8, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 8) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 8(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_8):
add $16, %r10
jg L(nibble_ashr_8) /* cross page boundary */
L(gobble_ashr_8):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_8) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_8)
/*
* nibble_ashr_8: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 8..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_8):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xff00, %edx /* null char among bytes 8..15? */
jnz L(ashr_8_exittail)
#ifdef USE_AS_STRNCMP
/*
* 8 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 8) the next page is not needed and must not be touched.
* The previous limit of 7 let a count of exactly 8 read the next page.
*/
cmp $8, %r11
jbe L(ashr_8_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_8)
/*
* ashr_8_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 8 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_8_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $8, %xmm0
psrldq $8, %xmm3
jmp L(aftertail)
/*
* ashr_9: the %rsi data starts 7 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 7). Every aligned %rsi block is
* compared with bytes 9..15 of one %rdi block followed by the first 9
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_9
* rcx(offset of rsi)  rax(offset of rdi)  relative offset      corresponding case
* n(7~15)             n - 7               8(15 + (n - 7) - n)  ashr_9
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_9):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $7, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $9, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 9) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 9(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_9):
add $16, %r10
jg L(nibble_ashr_9) /* cross page boundary */
L(gobble_ashr_9):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_9) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* store for next cycle */
jmp L(loop_ashr_9)
/*
* nibble_ashr_9: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 9..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_9):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xfe00, %edx /* null char among bytes 9..15? */
jnz L(ashr_9_exittail)
#ifdef USE_AS_STRNCMP
/*
* 7 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 7) the next page is not needed and must not be touched.
* The previous limit of 6 let a count of exactly 7 read the next page.
*/
cmp $7, %r11
jbe L(ashr_9_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_9)
/*
* ashr_9_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 9 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_9_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $9, %xmm0
psrldq $9, %xmm3
jmp L(aftertail)
/*
* ashr_10: the %rsi data starts 6 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 6). Every aligned %rsi block is
* compared with bytes 10..15 of one %rdi block followed by the first 10
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_10
* rcx(offset of rsi)  rax(offset of rdi)  relative offset      corresponding case
* n(6~15)             n - 6               9(15 + (n - 6) - n)  ashr_10
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_10):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $6, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $10, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 10) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 10(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_10):
add $16, %r10
jg L(nibble_ashr_10) /* cross page boundary */
L(gobble_ashr_10):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_10) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_10)
/*
* nibble_ashr_10: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 10..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_10):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xfc00, %edx /* null char among bytes 10..15? */
jnz L(ashr_10_exittail)
#ifdef USE_AS_STRNCMP
/*
* 6 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 6) the next page is not needed and must not be touched.
* The previous limit of 5 let a count of exactly 6 read the next page.
*/
cmp $6, %r11
jbe L(ashr_10_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_10)
/*
* ashr_10_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 10 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_10_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $10, %xmm0
psrldq $10, %xmm3
jmp L(aftertail)
/*
* ashr_11: the %rsi data starts 5 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 5). Every aligned %rsi block is
* compared with bytes 11..15 of one %rdi block followed by the first 11
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_11
* rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
* n(5~15)             n - 5               10(15 + (n - 5) - n)  ashr_11
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_11):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $5, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $11, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 11) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 11(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_11):
add $16, %r10
jg L(nibble_ashr_11) /* cross page boundary */
L(gobble_ashr_11):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_11) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_11)
/*
* nibble_ashr_11: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 11..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_11):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xf800, %edx /* null char among bytes 11..15? */
jnz L(ashr_11_exittail)
#ifdef USE_AS_STRNCMP
/*
* 5 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 5) the next page is not needed and must not be touched.
* The previous limit of 4 let a count of exactly 5 read the next page.
*/
cmp $5, %r11
jbe L(ashr_11_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_11)
/*
* ashr_11_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 11 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_11_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $11, %xmm0
psrldq $11, %xmm3
jmp L(aftertail)
/*
* ashr_12: the %rsi data starts 4 bytes further into its 16-byte block
* than the %rdi data (%rcx - %rax == 4). Every aligned %rsi block is
* compared with bytes 12..15 of one %rdi block followed by the first 12
* bytes of the next %rdi block.
*
* The following cases will be handled by ashr_12
* rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
* n(4~15)             n - 4               11(15 + (n - 4) - n)  ashr_12
*
* Loop state: %rcx = byte index of the blocks being loaded, %xmm3 =
* previous %rdi block, %xmm0 = all zero between steps, %r10 = page
* crossing detector for the %rdi loads, %r11 = bytes left to compare
* (strncmp build only).
*/
.p2align 4
L(ashr_12):
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $4, %xmm2 /* shift %rdi block up to line up with %rsi block */
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* drop the bytes in front of the string start */
sub %r9d, %edx /* 0 iff every remaining byte matched */
jnz L(less32bytes) /* mismatch or null char seen */
movdqa (%rdi), %xmm3 /* first %rdi block becomes the previous block */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $12, %r9d /* byte position left over from less32bytes case */
/*
* Set up %r10 to detect a page crossing by the %rdi loads. It starts
* at (page offset of %rdi + 12) - 4096, which is negative, and grows by
* 16 per step. When it goes positive the next %rdi block is the first
* block of a new page and the nibble code decides whether it may be read.
*/
lea 12(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_12):
add $16, %r10
jg L(nibble_ashr_12) /* cross page boundary */
L(gobble_ashr_12):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0 /* null char in %rsi block? */
pcmpeqb %xmm2, %xmm1 /* compare for equality */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* 0 iff all 16 bytes are equal and non-null */
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
/* Second copy of the same step (the loop is unrolled twice). */
add $16, %r10
jg L(nibble_ashr_12) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit) /* mismatch or null char seen */
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz) /* length limit reached */
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_12)
/*
* nibble_ashr_12: reached when the next %rdi block would be the first
* block of a new 4K page. Bytes 12..15 of %xmm3 (the previous %rdi block)
* have not been compared yet; the new page may only be read if the
* comparison really continues past them.
*/
.p2align 4
L(nibble_ashr_12):
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xf000, %edx /* null char among bytes 12..15? */
jnz L(ashr_12_exittail)
#ifdef USE_AS_STRNCMP
/*
* 4 bytes are still pending in %xmm3. If the remaining count fits in
* them (%r11 <= 4) the next page is not needed and must not be touched.
* The previous limit of 3 let a count of exactly 4 read the next page.
*/
cmp $4, %r11
jbe L(ashr_12_exittail)
#endif
pxor %xmm0, %xmm0 /* %xmm0 back to all zero */
sub $0x1000, %r10 /* restart the page countdown */
jmp L(gobble_ashr_12)
/*
* ashr_12_exittail: finish using only the pending bytes. %xmm0 (null char
* mask) and %xmm3 are shifted down by 12 bytes so that the pending bytes
* line up with the %rsi block in %xmm1; aftertail takes over from there.
*/
.p2align 4
L(ashr_12_exittail):
movdqa (%rsi, %rcx), %xmm1
psrldq $12, %xmm0
psrldq $12, %xmm3
jmp L(aftertail)
/*
 * The following cases will be handled by ashr_13
 *   rcx (offset of rsi)   rax (offset of rdi)   relative offset          corresponding case
 *   n (3~15)              n - 3                 12 (15 + (n - 3) - n)    ashr_13
 *
 * rsi sits 3 bytes further into its 16-byte block than rdi, so each aligned
 * rsi block is compared against the last 3 bytes of the previous rdi block
 * followed by the first 13 bytes of the current one (palignr $13).
 *
 * Registers inside the loop:
 *   rcx    byte index of the block pair being loaded (starts at 16)
 *   r9d    13; L(exit) derives the rdi offset as r9 + rcx - 16
 *   r10    negative until the rdi load at (rdi, rcx) would start a new 4K page
 *   r11    strncmp only: bytes still to compare, counted from (rsi, rcx)
 *   xmm3   previous rdi block; its top 3 bytes have not been compared yet
 *   xmm0   all zero at the top of every iteration
 * Both pointers are read with movdqa, which requires 16-byte alignment.
 * NOTE(review): the alignment of rdi/rsi and the initial values of edx, rax,
 * r8d and r11 come from code and macros outside this section -- confirm there.
 */
.p2align 4
L(ashr_13):
/* First block pair: xmm0 = null mask of the rsi block. */
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
/* Move the rdi bytes up by 3 so they sit opposite the matching rsi bytes. */
pslldq $3, %xmm2
/* After psubb the sign bit of a byte is set only for "equal and not null". */
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
/*
 * Discard the low cl bits, i.e. the bytes in front of the rsi string start
 * (cl >= 3 per the table above, so the 3 zero-filled bytes go too).
 * NOTE(review): edx is presumably 0xffff on entry, making the difference
 * non-zero when a remaining byte differs or is null -- confirm at the caller.
 */
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
jnz L(less32bytes)
/* Keep the first rdi block: its top 3 bytes pair with the next rsi block. */
movdqa (%rdi), %xmm3
/* Macro defined outside this section (presumably adjusts r11 for strncmp). */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $13, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * r10 = ((rdi + 13) & 0xfff) - 0x1000 is negative; 16 is added per block.
 * When %r10 goes positive the next rdi load would be on a new page and
 * we need to do a nibble first.
 */
lea 13(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_13):
/* Main loop, unrolled twice; each half handles one 16-byte rsi block. */
add $16, %r10
jg L(nibble_ashr_13)
L(gobble_ashr_13):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
/* xmm4 keeps the unshifted rdi block; it becomes xmm3 for the next step. */
movdqa %xmm2, %xmm4
palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
/*
 * The mask is 0xffff only if all 16 bytes are equal and none is null.
 * Otherwise edx is non-zero and its lowest set bit is the index of the
 * first differing or null byte.
 */
sub $0xffff, %edx
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* 16 more bytes matched: equal if the count is used up (borrow or zero). */
sub $16, %r11
jbe L(strcmp_exitz)
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
add $16, %r10
jg L(nibble_ashr_13) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz)
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_13)
.p2align 4
L(nibble_ashr_13):
/*
 * The next rdi block lies on a new page. Load it only if the comparison
 * really has to continue past the 3 rdi bytes still pending in xmm3
 * (bytes 13..15, mask bits 0xe000).
 */
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xe000, %edx
jnz L(ashr_13_exittail)
#ifdef USE_AS_STRNCMP
/*
 * The bound is the number of pending bytes (3): with r11 <= 3 the whole
 * remaining count is covered by xmm3, so the next page must not be read.
 */
cmp $3, %r11
jbe L(ashr_13_exittail)
#endif
/* Continue on the new page: clear the null mask and re-arm r10. */
pxor %xmm0, %xmm0
sub $0x1000, %r10
jmp L(gobble_ashr_13)
.p2align 4
L(ashr_13_exittail):
/*
 * Finish without loading another rdi block: move the 3 pending rdi bytes
 * and their null mask down to byte 0 and compare them with the rsi block.
 */
movdqa (%rsi, %rcx), %xmm1
psrldq $13, %xmm0
psrldq $13, %xmm3
jmp L(aftertail)
/*
 * The following cases will be handled by ashr_14
 *   rcx (offset of rsi)   rax (offset of rdi)   relative offset          corresponding case
 *   n (2~15)              n - 2                 13 (15 + (n - 2) - n)    ashr_14
 *
 * rsi sits 2 bytes further into its 16-byte block than rdi, so each aligned
 * rsi block is compared against the last 2 bytes of the previous rdi block
 * followed by the first 14 bytes of the current one (palignr $14).
 *
 * Registers inside the loop:
 *   rcx    byte index of the block pair being loaded (starts at 16)
 *   r9d    14; L(exit) derives the rdi offset as r9 + rcx - 16
 *   r10    negative until the rdi load at (rdi, rcx) would start a new 4K page
 *   r11    strncmp only: bytes still to compare, counted from (rsi, rcx)
 *   xmm3   previous rdi block; its top 2 bytes have not been compared yet
 *   xmm0   all zero at the top of every iteration
 * Both pointers are read with movdqa, which requires 16-byte alignment.
 * NOTE(review): the alignment of rdi/rsi and the initial values of edx, rax,
 * r8d and r11 come from code and macros outside this section -- confirm there.
 */
.p2align 4
L(ashr_14):
/* First block pair: xmm0 = null mask of the rsi block. */
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
/* Move the rdi bytes up by 2 so they sit opposite the matching rsi bytes. */
pslldq $2, %xmm2
/* After psubb the sign bit of a byte is set only for "equal and not null". */
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
/*
 * Discard the low cl bits, i.e. the bytes in front of the rsi string start
 * (cl >= 2 per the table above, so the 2 zero-filled bytes go too).
 * NOTE(review): edx is presumably 0xffff on entry, making the difference
 * non-zero when a remaining byte differs or is null -- confirm at the caller.
 */
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
jnz L(less32bytes)
/* Keep the first rdi block: its top 2 bytes pair with the next rsi block. */
movdqa (%rdi), %xmm3
/* Macro defined outside this section (presumably adjusts r11 for strncmp). */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $14, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * r10 = ((rdi + 14) & 0xfff) - 0x1000 is negative; 16 is added per block.
 * When %r10 goes positive the next rdi load would be on a new page and
 * we need to do a nibble first.
 */
lea 14(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_14):
/* Main loop, unrolled twice; each half handles one 16-byte rsi block. */
add $16, %r10
jg L(nibble_ashr_14)
L(gobble_ashr_14):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
/* xmm4 keeps the unshifted rdi block; it becomes xmm3 for the next step. */
movdqa %xmm2, %xmm4
palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
/*
 * The mask is 0xffff only if all 16 bytes are equal and none is null.
 * Otherwise edx is non-zero and its lowest set bit is the index of the
 * first differing or null byte.
 */
sub $0xffff, %edx
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* 16 more bytes matched: equal if the count is used up (borrow or zero). */
sub $16, %r11
jbe L(strcmp_exitz)
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
add $16, %r10
jg L(nibble_ashr_14) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz)
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_14)
.p2align 4
L(nibble_ashr_14):
/*
 * The next rdi block lies on a new page. Load it only if the comparison
 * really has to continue past the 2 rdi bytes still pending in xmm3
 * (bytes 14..15, mask bits 0xc000).
 */
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0xc000, %edx
jnz L(ashr_14_exittail)
#ifdef USE_AS_STRNCMP
/*
 * The bound is the number of pending bytes (2): with r11 <= 2 the whole
 * remaining count is covered by xmm3, so the next page must not be read.
 */
cmp $2, %r11
jbe L(ashr_14_exittail)
#endif
/* Continue on the new page: clear the null mask and re-arm r10. */
pxor %xmm0, %xmm0
sub $0x1000, %r10
jmp L(gobble_ashr_14)
.p2align 4
L(ashr_14_exittail):
/*
 * Finish without loading another rdi block: move the 2 pending rdi bytes
 * and their null mask down to byte 0 and compare them with the rsi block.
 */
movdqa (%rsi, %rcx), %xmm1
psrldq $14, %xmm0
psrldq $14, %xmm3
jmp L(aftertail)
/*
 * The following cases will be handled by ashr_15
 *   rcx (offset of rsi)   rax (offset of rdi)   relative offset          corresponding case
 *   n (1~15)              n - 1                 14 (15 + (n - 1) - n)    ashr_15
 *
 * rsi sits 1 byte further into its 16-byte block than rdi, so each aligned
 * rsi block is compared against the last byte of the previous rdi block
 * followed by the first 15 bytes of the current one (palignr $15).
 *
 * Registers inside the loop:
 *   rcx    byte index of the block pair being loaded (starts at 16)
 *   r9d    15; L(exit) derives the rdi offset as r9 + rcx - 16
 *   r10    negative until the rdi load at (rdi, rcx) would start a new 4K page
 *   r11    strncmp only: bytes still to compare, counted from (rsi, rcx)
 *   xmm3   previous rdi block; its top byte has not been compared yet
 *   xmm0   all zero at the top of every iteration
 * Both pointers are read with movdqa, which requires 16-byte alignment.
 * NOTE(review): the alignment of rdi/rsi and the initial values of edx, rax,
 * r8d and r11 come from code and macros outside this section -- confirm there.
 */
.p2align 4
L(ashr_15):
/* First block pair: xmm0 = null mask of the rsi block. */
pxor %xmm0, %xmm0
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
/* Move the rdi bytes up by 1 so they sit opposite the matching rsi bytes. */
pslldq $1, %xmm2
/* After psubb the sign bit of a byte is set only for "equal and not null". */
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
/*
 * Discard the low cl bits, i.e. the bytes in front of the rsi string start
 * (cl >= 1 per the table above, so the zero-filled byte goes too).
 * NOTE(review): edx is presumably 0xffff on entry, making the difference
 * non-zero when a remaining byte differs or is null -- confirm at the caller.
 */
shr %cl, %edx
shr %cl, %r9d
sub %r9d, %edx
jnz L(less32bytes)
/* Keep the first rdi block: its top byte pairs with the next rsi block. */
movdqa (%rdi), %xmm3
/* Macro defined outside this section (presumably adjusts r11 for strncmp). */
UPDATE_STRNCMP_COUNTER
pxor %xmm0, %xmm0
mov $16, %rcx /* index for loads */
mov $15, %r9d /* byte position left over from less32bytes case */
/*
 * Setup %r10 value allows us to detect crossing a page boundary.
 * r10 = ((rdi + 15) & 0xfff) - 0x1000 is negative; 16 is added per block.
 * When %r10 goes positive the next rdi load would be on a new page and
 * we need to do a nibble first.
 */
lea 15(%rdi), %r10
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
.p2align 4
L(loop_ashr_15):
/* Main loop, unrolled twice; each half handles one 16-byte rsi block. */
add $16, %r10
jg L(nibble_ashr_15)
L(gobble_ashr_15):
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
/* xmm4 keeps the unshifted rdi block; it becomes xmm3 for the next step. */
movdqa %xmm2, %xmm4
palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
/*
 * The mask is 0xffff only if all 16 bytes are equal and none is null.
 * Otherwise edx is non-zero and its lowest set bit is the index of the
 * first differing or null byte.
 */
sub $0xffff, %edx
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* 16 more bytes matched: equal if the count is used up (borrow or zero). */
sub $16, %r11
jbe L(strcmp_exitz)
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
add $16, %r10
jg L(nibble_ashr_15) /* cross page boundary */
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz L(exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
jbe L(strcmp_exitz)
#endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_15)
.p2align 4
L(nibble_ashr_15):
/*
 * The next rdi block lies on a new page. Load it only if the comparison
 * really has to continue past the single rdi byte still pending in xmm3
 * (byte 15, mask bit 0x8000).
 */
pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
pmovmskb %xmm0, %edx
test $0x8000, %edx
jnz L(ashr_15_exittail)
#ifdef USE_AS_STRNCMP
/*
 * The bound is the number of pending bytes (1): with r11 <= 1 the whole
 * remaining count is covered by xmm3, so the next page must not be read.
 * (r11 is never 0 here, because the loop leaves as soon as it reaches 0.)
 */
cmp $1, %r11
jbe L(ashr_15_exittail)
#endif
/* Continue on the new page: clear the null mask and re-arm r10. */
pxor %xmm0, %xmm0
sub $0x1000, %r10
jmp L(gobble_ashr_15)
.p2align 4
L(ashr_15_exittail):
/*
 * Finish without loading another rdi block: move the pending rdi byte and
 * its null mask down to byte 0, then fall through into L(aftertail).
 */
movdqa (%rsi, %rcx), %xmm1
psrldq $15, %xmm3
psrldq $15, %xmm0
.p2align 4
L(aftertail):
/*
 * Common end of the ashr_N_exittail paths (ashr_15_exittail falls through,
 * the others jump here). The exittail code has put the current rsi block in
 * xmm1, the pending rdi bytes shifted down to byte 0 in xmm3, and xmm0
 * shifted by the same amount.
 */
pcmpeqb %xmm3, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edx
/*
 * Invert so that set bits mark positions that are NOT "equal and non-null".
 * Bits 16-31 become set as well, so the bsf below always finds a bit.
 */
not %edx
.p2align 4
L(exit):
/* Reached from the loops with edx != 0 and r9/rcx as set by the ashr case. */
lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
L(less32bytes):
/* Advance both pointers to the 16-byte window holding the deciding byte. */
lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
/* A non-zero r8d means the two operands are swapped; undo that. */
test %r8d, %r8d
jz L(ret)
xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
.p2align 4
L(ret):
L(less16bytes):
/* The lowest set bit of rdx is the index of the first deciding byte. */
bsf %rdx, %rdx /* find and store bit index in %rdx */
#ifdef USE_AS_STRNCMP
/* If that index is at or beyond the remaining count, the result is 0. */
sub %rdx, %r11
jbe L(strcmp_exitz)
#endif
/* Return (unsigned char) rdi[rdx] - (unsigned char) rsi[rdx] in eax. */
movzbl (%rsi, %rdx), %ecx
movzbl (%rdi, %rdx), %eax
sub %ecx, %eax
ret
L(strcmp_exitz):
/* Equal result: return 0. */
xor %eax, %eax
ret
.p2align 4
L(Byte0):
/*
 * Return the difference of the first bytes at rdi and rsi, both
 * zero-extended. NOTE(review): the jumps to this label are outside this
 * section.
 */
movzx (%rsi), %ecx
movzx (%rdi), %eax
sub %ecx, %eax
ret
END (STRCMP)
/*
 * Jump table for the ashr_N cases, placed in read-only data and aligned to
 * 8 bytes. Each entry is a 32-bit offset of the case label relative to the
 * table itself, so no absolute code address is stored. Entries 0..14 are
 * ashr_1..ashr_15 and entry 15 is ashr_0.
 * NOTE(review): the code that indexes this table is outside this section;
 * going by the case header comments the index is presumably the "relative
 * offset" value (for example 11 selects ashr_12) -- confirm at the dispatch.
 */
.section .rodata,"a",@progbits
.p2align 3
L(unaligned_table):
.int L(ashr_1) - L(unaligned_table)
.int L(ashr_2) - L(unaligned_table)
.int L(ashr_3) - L(unaligned_table)
.int L(ashr_4) - L(unaligned_table)
.int L(ashr_5) - L(unaligned_table)
.int L(ashr_6) - L(unaligned_table)
.int L(ashr_7) - L(unaligned_table)
.int L(ashr_8) - L(unaligned_table)
.int L(ashr_9) - L(unaligned_table)
.int L(ashr_10) - L(unaligned_table)
.int L(ashr_11) - L(unaligned_table)
.int L(ashr_12) - L(unaligned_table)
.int L(ashr_13) - L(unaligned_table)
.int L(ashr_14) - L(unaligned_table)
.int L(ashr_15) - L(unaligned_table)
.int L(ashr_0) - L(unaligned_table)