bionic/libc/arch-x86/string/ssse3-strcmp-atom.S
Liubov Dmitrieva 0a490665a3 bionic/x86: Optimization for string routines
Optimized strcpy, strcat,
strncpy, strncat, strlcpy, strlcat,
memchr, memrchr, strchr, strrchr, index,
strnlen, strlen, wcslen, wmemcmp, wcscmp,
wcschr, wcsrchr, wcscpy, wcscat

Change-Id: I82b29132edf9a2e144e0bb3ee4ff5217df8d2a6d
Signed-off-by: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
2013-05-31 13:37:03 +04:00

2279 lines
38 KiB
ArmAsm

/*
Copyright (c) 2010, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* L(label): spell an assembler-local label (".L" prefix). */
#ifndef L
# define L(label) .L##label
#endif
/*
 * Fallback definitions mapping the cfi_* helper names onto the GNU as
 * .cfi_* directives; each one is skipped if the name is already defined.
 */
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif
#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif
#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif
#ifndef cfi_remember_state
# define cfi_remember_state .cfi_remember_state
#endif
#ifndef cfi_restore_state
# define cfi_restore_state .cfi_restore_state
#endif
/*
 * ENTRY(name): declare a global function symbol aligned to 16 bytes and open
 * its CFI region.  END(name): close the CFI region and set the symbol size.
 */
#ifndef ENTRY
# define ENTRY(name) \
.type name, @function; \
.globl name; \
.p2align 4; \
name: \
cfi_startproc
#endif
#ifndef END
# define END(name) \
cfi_endproc; \
.size name, .-name
#endif
/*
 * CFI bookkeeping for one 4-byte push / pop of REG, and PUSH / POP wrappers
 * that emit the instruction together with that bookkeeping.
 */
#define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
#define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)
/*
 * Build-variant settings.
 *   STR1 / STR2 / CNT : %esp-relative offsets of the stack arguments.  The
 *                       strncmp build starts at 8 instead of 4 because the
 *                       function pushes %ebp before reading them.
 *   RETURN            : function exit; the strncmp build pops %ebp first and
 *                       re-applies CFI_PUSH for the code that follows.
 *   UPDATE_STRNCMP_COUNTER : empty for strcmp.  For strncmp, with %ecx the
 *                       offset at which the first 16-byte block was entered,
 *                       it charges the 16 - %ecx bytes of that block against
 *                       the remaining count in %ebp and leaves through
 *                       L(more8byteseq) when the count does not exceed them.
 *                       Clobbers %esi and the flags.
 */
#ifndef USE_AS_STRNCMP
# define STR1 4
# define STR2 STR1+4
# define RETURN ret
# define UPDATE_STRNCMP_COUNTER
#else
# define STR1 8
# define STR2 STR1+4
# define CNT STR2+4
# define RETURN POP (%ebp); ret; CFI_PUSH (%ebp)
# define UPDATE_STRNCMP_COUNTER \
/* esi = 16 - ecx bytes just compared; subtract them from the count in ebp */ \
mov $16, %esi; \
sub %ecx, %esi; \
cmpl %esi, %ebp; \
jbe L(more8byteseq); \
sub %esi, %ebp
#endif
/* Name of the generated function; strcmp unless the includer overrides it. */
#ifndef STRCMP
# define STRCMP strcmp
#endif
.section .text.ssse3,"ax",@progbits
/*
 * STRCMP: compare the two strings whose addresses are read from the stack at
 * STR1(%esp) and STR2(%esp).  When built with USE_AS_STRNCMP a third stack
 * argument, CNT(%esp), bounds the number of bytes compared.
 *
 * Register roles in this prologue:
 *   %edx = first string, %eax = second string, %ecx = scratch,
 *   %ebp = remaining count (strncmp build only; saved by the PUSH below).
 *
 * Flow: the strcmp build checks the first 8 bytes one at a time, then both
 * builds attempt one unaligned 16-byte compare unless either pointer is
 * within 16 bytes of a 0x1000 boundary, and finally fall into L(crosspage).
 */
ENTRY (STRCMP)
#ifdef USE_AS_STRNCMP
PUSH (%ebp)
#endif
movl STR1(%esp), %edx
movl STR2(%esp), %eax
#ifdef USE_AS_STRNCMP
movl CNT(%esp), %ebp
cmpl $16, %ebp
jb L(less16bytes_sncmp)
/* The jump is unconditional, so the byte-wise checks below are only
   executed by the strcmp build. */
jmp L(more16bytes)
#endif
/* Bytes 0..7: leave at the first difference (L(neq)) or at a NUL that is
   present in both strings (L(eq)).  %ecx is zero-extended, so testing it
   against itself checks the byte for NUL. */
movzbl (%eax), %ecx
cmpb %cl, (%edx)
jne L(neq)
test %ecx, %ecx
je L(eq)
movzbl 1(%eax), %ecx
cmpb %cl, 1(%edx)
jne L(neq)
test %ecx, %ecx
je L(eq)
movzbl 2(%eax), %ecx
cmpb %cl, 2(%edx)
jne L(neq)
test %ecx, %ecx
je L(eq)
movzbl 3(%eax), %ecx
cmpb %cl, 3(%edx)
jne L(neq)
test %ecx, %ecx
je L(eq)
movzbl 4(%eax), %ecx
cmpb %cl, 4(%edx)
jne L(neq)
test %ecx, %ecx
je L(eq)
movzbl 5(%eax), %ecx
cmpb %cl, 5(%edx)
jne L(neq)
test %ecx, %ecx
je L(eq)
movzbl 6(%eax), %ecx
cmpb %cl, 6(%edx)
jne L(neq)
test %ecx, %ecx
je L(eq)
movzbl 7(%eax), %ecx
cmpb %cl, 7(%edx)
jne L(neq)
test %ecx, %ecx
je L(eq)
add $8, %edx
add $8, %eax
#ifdef USE_AS_STRNCMP
/* lea does not touch the flags, so the je still tests the cmpl. */
cmpl $8, %ebp
lea -8(%ebp), %ebp
je L(eq)
L(more16bytes):
#endif
/* A 16-byte load starting above offset 0xff0 would run past the 0x1000
   boundary; in that case go straight to the aligned code. */
movl %edx, %ecx
and $0xfff, %ecx
cmpl $0xff0, %ecx
ja L(crosspage)
mov %eax, %ecx
and $0xfff, %ecx
cmpl $0xff0, %ecx
ja L(crosspage)
/* Load 16 bytes of each string as two 8-byte halves (no alignment needed),
   then build a mask with one bit per byte that is equal and not NUL:
   xmm0 = NUL bytes of the second string, xmm1 = equal bytes minus those. */
pxor %xmm0, %xmm0
movlpd (%eax), %xmm1
movlpd (%edx), %xmm2
movhpd 8(%eax), %xmm1
movhpd 8(%edx), %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %ecx
/* Non-zero after the subtraction: some byte differs or is NUL. */
sub $0xffff, %ecx
jnz L(less16bytes)
#ifdef USE_AS_STRNCMP
/* 16 equal bytes consumed; equal overall if the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(eq)
#endif
add $16, %eax
add $16, %edx
/*
 * L(crosspage): switch to aligned 16-byte loads.
 * Saves %ebx, %edi and %esi, rounds both pointers down to a 16-byte
 * boundary and dispatches on the difference of their in-block offsets.
 * After this block:
 *   %eax = aligned block of the string with the larger offset,
 *   %ecx = that larger offset,
 *   %edx = aligned block of the other string,
 *   %ebx = 0x20 if the two strings were exchanged to get there, else 0.
 */
L(crosspage):
PUSH (%ebx)
PUSH (%edi)
PUSH (%esi)
#ifdef USE_AS_STRNCMP
cfi_remember_state
#endif
/* ecx / edi = low four address bits of %eax / %edx; the xor clears them. */
movl %edx, %edi
movl %eax, %ecx
and $0xf, %ecx
and $0xf, %edi
xor %ecx, %eax
xor %edi, %edx
xor %ebx, %ebx
/* Same offset: no shifting needed.  Otherwise make %ecx the larger one,
   swapping the strings and noting it in bit 0x20 of %ebx when necessary. */
cmpl %edi, %ecx
je L(ashr_0)
ja L(bigger)
or $0x20, %ebx
xchg %edx, %eax
xchg %ecx, %edi
L(bigger):
/* edi = 15 + edi - ecx, in the range 0..14; value k selects ashr_(k+1). */
lea 15(%edi), %edi
sub %ecx, %edi
cmpl $8, %edi
jle L(ashr_less_8)
cmpl $14, %edi
je L(ashr_15)
cmpl $13, %edi
je L(ashr_14)
cmpl $12, %edi
je L(ashr_13)
cmpl $11, %edi
je L(ashr_12)
cmpl $10, %edi
je L(ashr_11)
cmpl $9, %edi
je L(ashr_10)
L(ashr_less_8):
/* Still using the flags of "cmpl $8, %edi" from the jle above. */
je L(ashr_9)
cmpl $7, %edi
je L(ashr_8)
cmpl $6, %edi
je L(ashr_7)
cmpl $5, %edi
je L(ashr_6)
cmpl $4, %edi
je L(ashr_5)
cmpl $3, %edi
je L(ashr_4)
cmpl $2, %edi
je L(ashr_3)
cmpl $1, %edi
je L(ashr_2)
cmpl $0, %edi
je L(ashr_1)
/*
 * ashr_0: both strings start at the same offset inside their 16-byte block
 * (the dispatcher jumps here when ecx == edi), so aligned blocks can be
 * compared directly without any shifting.
 * On entry %eax and %edx are 16-byte aligned and %ecx is the common offset.
 */
.p2align 4
L(ashr_0):
/* First block: build a mask with one bit per byte that is equal and not NUL
   (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
movdqa (%eax), %xmm1
pxor %xmm0, %xmm0
pcmpeqb %xmm1, %xmm0
pcmpeqb (%edx), %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* mov leaves the flags of the sub intact for the jne. */
mov %ecx, %edi
jne L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* ebx = 0x10 makes L(exit) compute edi = ecx; ecx = offset of next block. */
mov $0x10, %ebx
mov $0x10, %ecx
pxor %xmm0, %xmm0
.p2align 4
L(loop_ashr_0):
/* xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
jmp L(loop_ashr_0)
/*
 * ashr_1: taken when the in-block offsets differ by 15 (ecx - edi == 15,
 * dispatcher value 15 + edi - ecx == 0).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 1 byte across two consecutive blocks.
 */
.p2align 4
L(ashr_1):
/* First block: slide the %edx block up by 15 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $15, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 15, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -15(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 1 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $1, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 1(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_1):
add $16, %edi
jg L(nibble_ashr_1)
L(gobble_ashr_1):
/* xmm2 = bytes 1..15 of the previous %edx block followed by byte 0 of the
   current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $1, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_1)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $1, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_1)
.p2align 4
L(nibble_ashr_1):
/* The next %edx block lies across the boundary.  Only bytes 1..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xfffe), or for strncmp the
   remaining count is at most 15, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xfffe, %esi
jnz L(ashr_1_exittail)
#ifdef USE_AS_STRNCMP
cmpl $15, %ebp
jbe L(ashr_1_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_1)
.p2align 4
L(ashr_1_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $1, %xmm0
psrldq $1, %xmm3
jmp L(aftertail)
/*
 * ashr_2: taken when the in-block offsets differ by 14 (ecx - edi == 14,
 * dispatcher value 15 + edi - ecx == 1).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 2 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_2):
/* First block: slide the %edx block up by 14 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $14, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 14, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -14(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 2 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $2, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 2(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_2):
add $16, %edi
jg L(nibble_ashr_2)
L(gobble_ashr_2):
/* xmm2 = bytes 2..15 of the previous %edx block followed by bytes 0..1 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $2, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_2)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $2, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_2)
.p2align 4
L(nibble_ashr_2):
/* The next %edx block lies across the boundary.  Only bytes 2..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xfffc), or for strncmp the
   remaining count is at most 14, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xfffc, %esi
jnz L(ashr_2_exittail)
#ifdef USE_AS_STRNCMP
cmpl $14, %ebp
jbe L(ashr_2_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_2)
.p2align 4
L(ashr_2_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $2, %xmm0
psrldq $2, %xmm3
jmp L(aftertail)
/*
 * ashr_3: taken when the in-block offsets differ by 13 (ecx - edi == 13,
 * dispatcher value 15 + edi - ecx == 2).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 3 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_3):
/* First block: slide the %edx block up by 13 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $13, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 13, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -13(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 3 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $3, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 3(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_3):
add $16, %edi
jg L(nibble_ashr_3)
L(gobble_ashr_3):
/* xmm2 = bytes 3..15 of the previous %edx block followed by bytes 0..2 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $3, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_3)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $3, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_3)
.p2align 4
L(nibble_ashr_3):
/* The next %edx block lies across the boundary.  Only bytes 3..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xfff8), or for strncmp the
   remaining count is at most 13, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xfff8, %esi
jnz L(ashr_3_exittail)
#ifdef USE_AS_STRNCMP
cmpl $13, %ebp
jbe L(ashr_3_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_3)
.p2align 4
L(ashr_3_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $3, %xmm0
psrldq $3, %xmm3
jmp L(aftertail)
/*
 * ashr_4: taken when the in-block offsets differ by 12 (ecx - edi == 12,
 * dispatcher value 15 + edi - ecx == 3).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 4 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_4):
/* First block: slide the %edx block up by 12 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $12, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 12, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -12(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 4 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $4, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 4(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_4):
add $16, %edi
jg L(nibble_ashr_4)
L(gobble_ashr_4):
/* xmm2 = bytes 4..15 of the previous %edx block followed by bytes 0..3 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $4, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_4)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $4, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_4)
.p2align 4
L(nibble_ashr_4):
/* The next %edx block lies across the boundary.  Only bytes 4..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xfff0), or for strncmp the
   remaining count is at most 12, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xfff0, %esi
jnz L(ashr_4_exittail)
#ifdef USE_AS_STRNCMP
cmpl $12, %ebp
jbe L(ashr_4_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_4)
.p2align 4
L(ashr_4_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $4, %xmm0
psrldq $4, %xmm3
jmp L(aftertail)
/*
 * ashr_5: taken when the in-block offsets differ by 11 (ecx - edi == 11,
 * dispatcher value 15 + edi - ecx == 4).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 5 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_5):
/* First block: slide the %edx block up by 11 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $11, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 11, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -11(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 5 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $5, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 5(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_5):
add $16, %edi
jg L(nibble_ashr_5)
L(gobble_ashr_5):
/* xmm2 = bytes 5..15 of the previous %edx block followed by bytes 0..4 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $5, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_5)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $5, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_5)
.p2align 4
L(nibble_ashr_5):
/* The next %edx block lies across the boundary.  Only bytes 5..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xffe0), or for strncmp the
   remaining count is at most 11, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xffe0, %esi
jnz L(ashr_5_exittail)
#ifdef USE_AS_STRNCMP
cmpl $11, %ebp
jbe L(ashr_5_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_5)
.p2align 4
L(ashr_5_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $5, %xmm0
psrldq $5, %xmm3
jmp L(aftertail)
/*
 * ashr_6: taken when the in-block offsets differ by 10 (ecx - edi == 10,
 * dispatcher value 15 + edi - ecx == 5).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 6 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_6):
/* First block: slide the %edx block up by 10 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $10, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 10, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -10(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 6 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $6, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 6(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_6):
add $16, %edi
jg L(nibble_ashr_6)
L(gobble_ashr_6):
/* xmm2 = bytes 6..15 of the previous %edx block followed by bytes 0..5 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $6, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_6)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $6, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_6)
.p2align 4
L(nibble_ashr_6):
/* The next %edx block lies across the boundary.  Only bytes 6..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xffc0), or for strncmp the
   remaining count is at most 10, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xffc0, %esi
jnz L(ashr_6_exittail)
#ifdef USE_AS_STRNCMP
cmpl $10, %ebp
jbe L(ashr_6_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_6)
.p2align 4
L(ashr_6_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $6, %xmm0
psrldq $6, %xmm3
jmp L(aftertail)
/*
 * ashr_7: taken when the in-block offsets differ by 9 (ecx - edi == 9,
 * dispatcher value 15 + edi - ecx == 6).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 7 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_7):
/* First block: slide the %edx block up by 9 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $9, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 9, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -9(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 7 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $7, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary.
   The seed uses the shift count 7, like every other ashr_N case; any seed
   in 1..15 yields the same jg outcome because %edx is 16-byte aligned. */
lea 7(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_7):
add $16, %edi
jg L(nibble_ashr_7)
L(gobble_ashr_7):
/* xmm2 = bytes 7..15 of the previous %edx block followed by bytes 0..6 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $7, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_7)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $7, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_7)
.p2align 4
L(nibble_ashr_7):
/* The next %edx block lies across the boundary.  Only bytes 7..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xff80), or for strncmp the
   remaining count is at most 9, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xff80, %esi
jnz L(ashr_7_exittail)
#ifdef USE_AS_STRNCMP
cmpl $9, %ebp
jbe L(ashr_7_exittail)
#endif
/* Otherwise the string continues: clear the NUL mask once, re-arm the
   countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_7)
.p2align 4
L(ashr_7_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $7, %xmm0
psrldq $7, %xmm3
jmp L(aftertail)
/*
 * ashr_8: taken when the in-block offsets differ by 8 (ecx - edi == 8,
 * dispatcher value 15 + edi - ecx == 7).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 8 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_8):
/* First block: slide the %edx block up by 8 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $8, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 8, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -8(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 8 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $8, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 8(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_8):
add $16, %edi
jg L(nibble_ashr_8)
L(gobble_ashr_8):
/* xmm2 = bytes 8..15 of the previous %edx block followed by bytes 0..7 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $8, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_8)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $8, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_8)
.p2align 4
L(nibble_ashr_8):
/* The next %edx block lies across the boundary.  Only bytes 8..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xff00), or for strncmp the
   remaining count is at most 8, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xff00, %esi
jnz L(ashr_8_exittail)
#ifdef USE_AS_STRNCMP
cmpl $8, %ebp
jbe L(ashr_8_exittail)
#endif
/* Otherwise the string continues: clear the NUL mask once, re-arm the
   countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_8)
.p2align 4
L(ashr_8_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $8, %xmm0
psrldq $8, %xmm3
jmp L(aftertail)
/*
 * ashr_9: taken when the in-block offsets differ by 7 (ecx - edi == 7,
 * dispatcher value 15 + edi - ecx == 8).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 9 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_9):
/* First block: slide the %edx block up by 7 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $7, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 7, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -7(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 9 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $9, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 9(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_9):
add $16, %edi
jg L(nibble_ashr_9)
L(gobble_ashr_9):
/* xmm2 = bytes 9..15 of the previous %edx block followed by bytes 0..8 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $9, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_9)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $9, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_9)
.p2align 4
L(nibble_ashr_9):
/* The next %edx block lies across the boundary.  Only bytes 9..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xfe00), or for strncmp the
   remaining count is at most 7, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xfe00, %esi
jnz L(ashr_9_exittail)
#ifdef USE_AS_STRNCMP
cmpl $7, %ebp
jbe L(ashr_9_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_9)
.p2align 4
L(ashr_9_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $9, %xmm0
psrldq $9, %xmm3
jmp L(aftertail)
/*
 * ashr_10: taken when the in-block offsets differ by 6 (ecx - edi == 6,
 * dispatcher value 15 + edi - ecx == 9).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 10 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_10):
/* First block: slide the %edx block up by 6 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $6, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 6, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -6(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 10 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $10, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 10(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_10):
add $16, %edi
jg L(nibble_ashr_10)
L(gobble_ashr_10):
/* xmm2 = bytes 10..15 of the previous %edx block followed by bytes 0..9 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $10, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_10)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $10, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_10)
.p2align 4
L(nibble_ashr_10):
/* The next %edx block lies across the boundary.  Only bytes 10..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xfc00), or for strncmp the
   remaining count is at most 6, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xfc00, %esi
jnz L(ashr_10_exittail)
#ifdef USE_AS_STRNCMP
cmpl $6, %ebp
jbe L(ashr_10_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_10)
.p2align 4
L(ashr_10_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $10, %xmm0
psrldq $10, %xmm3
jmp L(aftertail)
/*
 * ashr_11: taken when the in-block offsets differ by 5 (ecx - edi == 5,
 * dispatcher value 15 + edi - ecx == 10).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 11 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_11):
/* First block: slide the %edx block up by 5 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $5, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 5, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -5(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 11 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $11, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 11(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_11):
add $16, %edi
jg L(nibble_ashr_11)
L(gobble_ashr_11):
/* xmm2 = bytes 11..15 of the previous %edx block followed by bytes 0..10 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $11, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_11)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $11, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_11)
.p2align 4
L(nibble_ashr_11):
/* The next %edx block lies across the boundary.  Only bytes 11..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xf800), or for strncmp the
   remaining count is at most 5, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xf800, %esi
jnz L(ashr_11_exittail)
#ifdef USE_AS_STRNCMP
cmpl $5, %ebp
jbe L(ashr_11_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_11)
.p2align 4
L(ashr_11_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $11, %xmm0
psrldq $11, %xmm3
jmp L(aftertail)
/*
 * ashr_12: taken when the in-block offsets differ by 4 (ecx - edi == 4,
 * dispatcher value 15 + edi - ecx == 11).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 12 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_12):
/* First block: slide the %edx block up by 4 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $4, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 4, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -4(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 12 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $12, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 12(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_12):
add $16, %edi
jg L(nibble_ashr_12)
L(gobble_ashr_12):
/* xmm2 = bytes 12..15 of the previous %edx block followed by bytes 0..11 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $12, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_12)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $12, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_12)
.p2align 4
L(nibble_ashr_12):
/* The next %edx block lies across the boundary.  Only bytes 12..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xf000), or for strncmp the
   remaining count is at most 4, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xf000, %esi
jnz L(ashr_12_exittail)
#ifdef USE_AS_STRNCMP
cmpl $4, %ebp
jbe L(ashr_12_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_12)
.p2align 4
L(ashr_12_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $12, %xmm0
psrldq $12, %xmm3
jmp L(aftertail)
/*
 * ashr_13: taken when the in-block offsets differ by 3 (ecx - edi == 3,
 * dispatcher value 15 + edi - ecx == 12).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 13 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_13):
/* First block: slide the %edx block up by 3 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $3, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 3, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -3(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 13 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $13, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 13(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_13):
add $16, %edi
jg L(nibble_ashr_13)
L(gobble_ashr_13):
/* xmm2 = bytes 13..15 of the previous %edx block followed by bytes 0..12 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $13, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_13)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $13, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_13)
.p2align 4
L(nibble_ashr_13):
/* The next %edx block lies across the boundary.  Only bytes 13..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xe000), or for strncmp the
   remaining count is at most 3, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xe000, %esi
jnz L(ashr_13_exittail)
#ifdef USE_AS_STRNCMP
cmpl $3, %ebp
jbe L(ashr_13_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_13)
.p2align 4
L(ashr_13_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $13, %xmm0
psrldq $13, %xmm3
jmp L(aftertail)
/*
 * ashr_14: taken when the in-block offsets differ by 2 (ecx - edi == 2,
 * dispatcher value 15 + edi - ecx == 13).
 * On entry %eax and %edx are 16-byte aligned, %ecx is the offset of the first
 * string byte inside the %eax block and bit 0x20 of %ebx records whether the
 * strings were swapped.  Each %eax block is compared with the %edx stream
 * shifted right by 14 bytes across two consecutive blocks.
 */
.p2align 4
L(ashr_14):
/* First block: slide the %edx block up by 2 bytes so it lines up with the
   %eax block, then build a mask with one bit per byte that is equal and
   not NUL (xmm0 = NUL bytes of the %eax block). */
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $2, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
/* Ignore the %cl bytes in front of the string start: %esi ends up non-zero
   iff a remaining byte differs or is NUL, lowest set bit first. */
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
/* edi = ecx - 2, the start offset inside the %edx block; lea leaves the
   flags of the sub intact for the jnz. */
lea -2(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
/* Loop setup: xmm3 = previous %edx block, ecx = offset of the next block,
   low bits of ebx = shift 14 (read back at L(exit)). */
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $14, %ebx
/* edi is a negative countdown: adding 16 per block makes it positive when
   the block at (%edx,%ecx) starts at the next 0x1000-byte (page) boundary. */
lea 14(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
.p2align 4
L(loop_ashr_14):
add $16, %edi
jg L(nibble_ashr_14)
L(gobble_ashr_14):
/* xmm2 = bytes 14..15 of the previous %edx block followed by bytes 0..13 of
   the current one: the 16 bytes that pair with the %eax block in xmm1.
   xmm0 is all-zero here and stays so while no NUL is seen. */
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $14, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/* esi != 0: some byte differs or is NUL. */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags of the cmpl; stop when the count was at most 16. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second copy of the loop body (unrolled once). */
add $16, %edi
jg L(nibble_ashr_14)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $14, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_14)
.p2align 4
L(nibble_ashr_14):
/* The next %edx block lies across the boundary.  Only bytes 14..15 of xmm3
   are still unconsumed: if one is NUL (mask 0xc000), or for strncmp the
   remaining count is at most 2, finish from that tail without loading the
   next %edx block. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0xc000, %esi
jnz L(ashr_14_exittail)
#ifdef USE_AS_STRNCMP
cmpl $2, %ebp
jbe L(ashr_14_exittail)
#endif
/* Otherwise the string continues: re-arm the countdown and resume. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_14)
.p2align 4
L(ashr_14_exittail):
/* Shift the unconsumed tail and its NUL mask down to byte 0 so they pair
   with the %eax block, then let L(aftertail) build the result mask. */
movdqa (%eax, %ecx), %xmm1
psrldq $14, %xmm0
psrldq $14, %xmm3
jmp L(aftertail)
/*
* The following cases will be handled by ashr_14
* ecx(offset of esi) eax(offset of edi) relative offset corresponding case
* n(1~15) n - 1 14(15 +(n-1) - n) ashr_15
*/
.p2align 4
L(ashr_15):
mov $0xffff, %esi
pxor %xmm0, %xmm0
movdqa (%edx), %xmm2
movdqa (%eax), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $1, %xmm2
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %edi
shr %cl, %esi
shr %cl, %edi
sub %edi, %esi
lea -1(%ecx), %edi
jnz L(less32bytes)
UPDATE_STRNCMP_COUNTER
movdqa (%edx), %xmm3
pxor %xmm0, %xmm0
mov $16, %ecx
or $15, %ebx
lea 15(%edx), %edi
and $0xfff, %edi
sub $0x1000, %edi
/*
 * loop_ashr_15 / gobble_ashr_15: main compare loop for the 15-byte shift
 * case, unrolled twice (two 16-byte steps per iteration).
 * State on entry:
 *   %ecx  = byte offset of the current block in both streams
 *   %xmm3 = previous 16-byte block from the %edx stream (the carry)
 *   %xmm0 = all zero
 *   %edi  = negative guard counter, advanced by 16 per step; when it turns
 *           positive control goes to L(nibble_ashr_15) first
 * Each step leaves via L(exit) when a mismatch or NUL byte is found.
 */
.p2align 4
L(loop_ashr_15):
add $16, %edi
jg L(nibble_ashr_15)
L(gobble_ashr_15):
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
/* Keep the raw %edx block; it becomes the carry for the next step. */
movdqa %xmm2, %xmm4
/* %xmm2 = byte 15 of the carry followed by bytes 0-14 of this block. */
palignr $15, %xmm3, %xmm2
/* %xmm0 (zero before) becomes the NUL-byte mask of %xmm1. */
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
/* A position keeps its high bit only if it is equal and not NUL. */
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
/*
 * Zero only when all 16 positions are equal and non-NUL; otherwise the
 * lowest set bit of %esi marks the first differing or NUL position.
 */
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
/* lea keeps the flags from cmpl: count <= 16 means the strings are equal. */
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
/* Second unrolled step: identical to the first. */
add $16, %edi
jg L(nibble_ashr_15)
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
palignr $15, %xmm3, %xmm2
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
sub $0xffff, %esi
jnz L(exit)
#ifdef USE_AS_STRNCMP
cmpl $16, %ebp
lea -16(%ebp), %ebp
jbe L(more8byteseq)
#endif
add $16, %ecx
movdqa %xmm4, %xmm3
jmp L(loop_ashr_15)
/*
 * nibble_ashr_15: entered from the loop when the %edi guard counter has
 * turned positive.  Checks the single pending byte of the carry (%xmm3
 * byte 15, mask bit 0x8000) for NUL before another block is loaded.
 * NOTE(review): the 0xfff / 0x1000 constants used for %edi suggest a
 * 4 KiB page-crossing guard on the %edx stream -- confirm.
 */
.p2align 4
L(nibble_ashr_15):
/* %xmm0 is zero on the loop's continue path, so this is a NUL mask of %xmm3. */
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %esi
test $0x8000, %esi
jnz L(ashr_15_exittail)
#ifdef USE_AS_STRNCMP
/* Count of 1 or less: only the pending byte matters, use the tail path. */
cmpl $1, %ebp
jbe L(ashr_15_exittail)
#endif
/* Pending byte is not NUL: clear the mask, re-bias %edi, resume the loop. */
pxor %xmm0, %xmm0
sub $0x1000, %edi
jmp L(gobble_ashr_15)
/*
 * ashr_15_exittail: prepare the final compare for the ashr_15 case.
 * Loads the current %eax block and shifts the NUL mask (%xmm0) and the
 * carry (%xmm3) right by 15 bytes so their byte 15 lands in byte 0,
 * then joins L(aftertail).
 */
.p2align 4
L(ashr_15_exittail):
movdqa (%eax, %ecx), %xmm1
psrldq $15, %xmm0
psrldq $15, %xmm3
jmp L(aftertail)
/*
 * aftertail: common tail compare reached from the *_exittail stubs.
 * Inputs: %xmm1 = current %eax block, %xmm3 = shifted carry bytes,
 *         %xmm0 = shifted NUL mask.
 * Output: %esi with a set bit for every position that is not
 *         "equal and non-NUL" (the mask is inverted by the not).
 * Falls through into L(exit).
 */
.p2align 4
L(aftertail):
pcmpeqb %xmm3, %xmm1
psubb %xmm0, %xmm1
pmovmskb %xmm1, %esi
not %esi
/*
 * exit: compute the %edx adjustment for the block that decided the result.
 * %edi = (%ebx & 0x1f) + %ecx - 16, where the low bits of %ebx hold the
 * shift amount stored by the ashr_N setup code (e.g. "or $15, %ebx").
 * Falls through into L(less32bytes).
 */
L(exit):
mov %ebx, %edi
and $0x1f, %edi
lea -16(%edi, %ecx), %edi
/*
 * less32bytes: advance both pointers to the start of the deciding block
 * (%edx by %edi, %eax by %ecx).
 * NOTE(review): bit 0x20 of %ebx appears to flag that the two string
 * pointers were exchanged earlier (outside this chunk); when it is set
 * they are swapped back here so the result sign is correct -- confirm.
 */
L(less32bytes):
add %edi, %edx
add %ecx, %eax
test $0x20, %ebx
jz L(ret2)
xchg %eax, %edx
/*
 * ret2: move the result mask from %esi to %ecx, then restore %esi, %edi
 * and %ebx with the POP macro (defined outside this chunk).
 * Falls through into L(less16bytes), which decodes the mask in %ecx.
 */
.p2align 4
L(ret2):
mov %esi, %ecx
POP (%esi)
POP (%edi)
POP (%ebx)
/*
 * less16bytes: decode the low 8 bits of the result mask in %ecx.
 * Bits are tested from least significant upward and control goes to the
 * ByteN stub for the first set bit; if %cl is zero the deciding byte is in
 * positions 8-15 and L(2next_8_bytes) handles it.  If none of bits 0-6 is
 * set, byte 7 is compared inline.
 * Result: %eax = byte at N(%edx) minus byte at N(%eax), both zero-extended.
 */
L(less16bytes):
test %cl, %cl
jz L(2next_8_bytes)
test $0x01, %cl
jnz L(Byte0)
test $0x02, %cl
jnz L(Byte1)
test $0x04, %cl
jnz L(Byte2)
test $0x08, %cl
jnz L(Byte3)
test $0x10, %cl
jnz L(Byte4)
test $0x20, %cl
jnz L(Byte5)
test $0x40, %cl
jnz L(Byte6)
#ifdef USE_AS_STRNCMP
/* Byte 7 lies at or beyond the count in %ebp: treat the strings as equal. */
cmpl $7, %ebp
jbe L(eq)
#endif
movzbl 7(%eax), %ecx
movzbl 7(%edx), %eax
sub %ecx, %eax
/* RETURN is a macro defined outside this chunk. */
RETURN
/*
 * Byte0 .. Byte6: one return stub per byte position N (0-6).
 * Each stub loads the zero-extended bytes at N(%eax) and N(%edx) and
 * returns %eax = byte at N(%edx) - byte at N(%eax).
 * Under USE_AS_STRNCMP each stub first goes to L(eq) when the count in
 * %ebp is N or less (unsigned), i.e. position N is outside the count.
 * RETURN is a macro defined outside this chunk.
 */
.p2align 4
L(Byte0):
#ifdef USE_AS_STRNCMP
/* Unsigned "<= 0" is true only when %ebp is zero. */
cmpl $0, %ebp
jbe L(eq)
#endif
movzbl (%eax), %ecx
movzbl (%edx), %eax
sub %ecx, %eax
RETURN
.p2align 4
L(Byte1):
#ifdef USE_AS_STRNCMP
cmpl $1, %ebp
jbe L(eq)
#endif
movzbl 1(%eax), %ecx
movzbl 1(%edx), %eax
sub %ecx, %eax
RETURN
.p2align 4
L(Byte2):
#ifdef USE_AS_STRNCMP
cmpl $2, %ebp
jbe L(eq)
#endif
movzbl 2(%eax), %ecx
movzbl 2(%edx), %eax
sub %ecx, %eax
RETURN
.p2align 4
L(Byte3):
#ifdef USE_AS_STRNCMP
cmpl $3, %ebp
jbe L(eq)
#endif
movzbl 3(%eax), %ecx
movzbl 3(%edx), %eax
sub %ecx, %eax
RETURN
.p2align 4
L(Byte4):
#ifdef USE_AS_STRNCMP
cmpl $4, %ebp
jbe L(eq)
#endif
movzbl 4(%eax), %ecx
movzbl 4(%edx), %eax
sub %ecx, %eax
RETURN
.p2align 4
L(Byte5):
#ifdef USE_AS_STRNCMP
cmpl $5, %ebp
jbe L(eq)
#endif
movzbl 5(%eax), %ecx
movzbl 5(%edx), %eax
sub %ecx, %eax
RETURN
.p2align 4
L(Byte6):
#ifdef USE_AS_STRNCMP
cmpl $6, %ebp
jbe L(eq)
#endif
movzbl 6(%eax), %ecx
movzbl 6(%edx), %eax
sub %ecx, %eax
RETURN
/*
 * 2next_8_bytes: the low 8 mask bits were clear, so the deciding byte is
 * in positions 8-15.  Both pointers are advanced by 8 and bits 8-15 of the
 * mask (%ch) are decoded with the same ByteN stubs used for %cl.
 * If none of bits 8-14 is set, the byte at offset 7 from the advanced
 * pointers is compared inline.
 */
.p2align 4
L(2next_8_bytes):
add $8, %eax
add $8, %edx
#ifdef USE_AS_STRNCMP
/* lea keeps the flags from cmpl: count <= 8 means equal; else count -= 8. */
cmpl $8, %ebp
lea -8(%ebp), %ebp
jbe L(eq)
#endif
test $0x01, %ch
jnz L(Byte0)
test $0x02, %ch
jnz L(Byte1)
test $0x04, %ch
jnz L(Byte2)
test $0x08, %ch
jnz L(Byte3)
test $0x10, %ch
jnz L(Byte4)
test $0x20, %ch
jnz L(Byte5)
test $0x40, %ch
jnz L(Byte6)
#ifdef USE_AS_STRNCMP
cmpl $7, %ebp
jbe L(eq)
#endif
movzbl 7(%eax), %ecx
movzbl 7(%edx), %eax
sub %ecx, %eax
/* RETURN is a macro defined outside this chunk. */
RETURN
/*
 * neq: return +1 or -1 for a byte mismatch.
 * Relies on the flags left by the "cmpb %cl, N(%edx)" at the jump site
 * (mov does not alter flags): "above" means the %edx byte is larger as an
 * unsigned value, giving +1; otherwise the result is negated to -1.
 */
.p2align 4
L(neq):
mov $1, %eax
ja L(neq_bigger)
neg %eax
L(neq_bigger):
/* RETURN is a macro defined outside this chunk. */
RETURN
/*
 * more8byteseq / eq: "strings are equal" exits, both returning 0 in %eax.
 * more8byteseq (strncmp build only) is used while %esi, %edi and %ebx are
 * still saved on the stack; it restores them and falls through into eq.
 * eq additionally restores %ebp in the strncmp build before returning.
 * NOTE(review): cfi_restore_state presumably pairs with a remember-state
 * directive earlier in the file, outside this chunk -- confirm.
 */
#ifdef USE_AS_STRNCMP
cfi_restore_state
.p2align 4
L(more8byteseq):
POP (%esi)
POP (%edi)
POP (%ebx)
#endif
L(eq):
#ifdef USE_AS_STRNCMP
POP (%ebp)
#endif
xorl %eax, %eax
ret
/*
 * less16bytes_sncmp (strncmp build only): byte-by-byte compare of up to
 * 16 bytes at (%eax) and (%edx), bounded by the count in %ebp.
 * For each position N (0-15), in order:
 *   - if the bytes differ, go to L(neq) with the cmpb flags intact;
 *   - if the (equal) byte is NUL, go to L(eq);
 *   - if the count equals N + 1, go to L(eq).
 * The first step also returns equal immediately when the count is zero.
 * After position 15 the routine restores %ebp and returns 0.
 * NOTE(review): the fall-through "return 0" after byte 15 is only right if
 * this path is entered with a count of at most 16 -- confirm at the caller.
 * NOTE(review): CFI_PUSH is a macro defined outside this chunk; presumably
 * it re-declares %ebp as pushed for unwind info after the preceding ret.
 */
#ifdef USE_AS_STRNCMP
CFI_PUSH (%ebp)
.p2align 4
L(less16bytes_sncmp):
/* Count of zero: nothing to compare. */
test %ebp, %ebp
jz L(eq)
/* Byte 0. */
movzbl (%eax), %ecx
cmpb %cl, (%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $1, %ebp
je L(eq)
/* Byte 1. */
movzbl 1(%eax), %ecx
cmpb %cl, 1(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $2, %ebp
je L(eq)
/* Byte 2. */
movzbl 2(%eax), %ecx
cmpb %cl, 2(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $3, %ebp
je L(eq)
/* Byte 3. */
movzbl 3(%eax), %ecx
cmpb %cl, 3(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $4, %ebp
je L(eq)
/* Byte 4. */
movzbl 4(%eax), %ecx
cmpb %cl, 4(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $5, %ebp
je L(eq)
/* Byte 5. */
movzbl 5(%eax), %ecx
cmpb %cl, 5(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $6, %ebp
je L(eq)
/* Byte 6. */
movzbl 6(%eax), %ecx
cmpb %cl, 6(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $7, %ebp
je L(eq)
/* Byte 7. */
movzbl 7(%eax), %ecx
cmpb %cl, 7(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $8, %ebp
je L(eq)
/* Byte 8. */
movzbl 8(%eax), %ecx
cmpb %cl, 8(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $9, %ebp
je L(eq)
/* Byte 9. */
movzbl 9(%eax), %ecx
cmpb %cl, 9(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $10, %ebp
je L(eq)
/* Byte 10. */
movzbl 10(%eax), %ecx
cmpb %cl, 10(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $11, %ebp
je L(eq)
/* Byte 11. */
movzbl 11(%eax), %ecx
cmpb %cl, 11(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $12, %ebp
je L(eq)
/* Byte 12. */
movzbl 12(%eax), %ecx
cmpb %cl, 12(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $13, %ebp
je L(eq)
/* Byte 13. */
movzbl 13(%eax), %ecx
cmpb %cl, 13(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $14, %ebp
je L(eq)
/* Byte 14. */
movzbl 14(%eax), %ecx
cmpb %cl, 14(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
cmpl $15, %ebp
je L(eq)
/* Byte 15: last position; no further count check is made after it. */
movzbl 15(%eax), %ecx
cmpb %cl, 15(%edx)
jne L(neq)
test %cl, %cl
je L(eq)
/* All 16 bytes equal and non-NUL: restore %ebp and return 0. */
POP (%ebp)
xor %eax, %eax
ret
#endif
END (STRCMP)