bionic/libc/arch-x86_64/string/sse2-strlcpy-slm.S
Varvara Rainchik 2e7145c048 Add 64-bit slm optimized strlcpy and strlcat.
Change-Id: Ic948934d91c83bbfdfd00c05ee8b14952e012549
Signed-off-by: Varvara Rainchik <varvara.rainchik@intel.com>
2014-11-12 17:32:28 +03:00

1063 lines
20 KiB
ArmAsm
Executable File

/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Helper macros.  Those guarded by a defined() test are only fallbacks: a
   definition supplied before this point takes precedence.  */

/* L(name) expands to .Lname, i.e. the label spelled with a ".L" prefix.  */
#if !defined(L)
# define L(label)	.L##label
#endif

/* CFI directive wrappers used by ENTRY and END.  */
#if !defined(cfi_startproc)
# define cfi_startproc	.cfi_startproc
#endif
#if !defined(cfi_endproc)
# define cfi_endproc	.cfi_endproc
#endif

/* ENTRY(name): declare a global function symbol aligned to 16 bytes and
   open its CFI region.  */
#if !defined(ENTRY)
# define ENTRY(name)			\
	.type name, @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name): close the CFI region and record the size of the symbol.  */
#if !defined(END)
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* Symbol name given to the routine assembled from this file.  */
#if !defined(STRLCPY)
# define STRLCPY	strlcpy
#endif

/* JMPTBL(I, B): jump-table entry, the distance of label I from base B.  */
#define JMPTBL(I, B)	I - B

/* BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE): read the signed 32-bit entry
   at TABLE + INDEX * SCALE, add the address of TABLE to it and jump to the
   result.  Overwrites %r11 and %rcx.  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
	lea	TABLE(%rip), %r11;			\
	movslq	(%r11, INDEX, SCALE), %rcx;		\
	lea	(%r11, %rcx), %rcx;			\
	jmp	*%rcx

/* RETURN: the result is the running count in %rax plus the base kept in
   %r9.  */
#define RETURN			\
	add	%r9, %rax;	\
	ret
/* Entry point.  Register use in this file: %rdi is written through
   (destination), %rsi is read through (source) and %rdx is the size limit,
   i.e. presumably the usual strlcpy(dst, src, size) contract (strlcat when
   USE_AS_STRLCAT is defined).
   %rax = running byte count, %r9 = base added to %rax by RETURN,
   %r8  = size still available.  */
	.text
ENTRY (STRLCPY)
	xor	%eax, %eax		/* rax = 0 (32-bit write clears the upper half) */
	xor	%r9d, %r9d		/* r9 = 0 */
	mov	%rdx, %r8		/* r8 = size */
	test	%r8, %r8
	jz	L(CalculateSrcLen)	/* size == 0: nothing may be stored, only measure the source */
#ifdef USE_AS_STRLCAT
/* strlcat only: find the NUL that ends the destination string, looking at
   no more than %r8 (size) bytes.
   On entry %rax = 0, %r9 = 0, %r8 = size (non-zero), %rdi = destination.
   Outcomes:
     - NUL found inside the limit: %rdi is advanced to the NUL, %r8 is
       reduced by the destination length, and the source is then copied.
     - no NUL inside the limit: %r9 = size and control goes to
       L(CalculateSrcLenCase1), so the result is size + length of source.
   NOTE(review): the two movdqu probes below load 16 bytes from the
   destination before the size limit is taken into account, so they can
   touch bytes beyond the first `size` bytes -- confirm this is acceptable.  */
xor %rcx, %rcx
pxor %xmm0, %xmm0
/* Bytes 0..15 of the destination; %rdx = bit mask of NUL positions.  */
movdqu (%rdi), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %rdx
/* size <= 16: the limit falls inside this chunk.  */
cmp $17, %r8
jb L(SizeEndCase1)
test %rdx, %rdx
jnz L(StringEndCase1)
/* Bytes 16..31.  %xmm0 is still all zero because the previous compare
   matched nothing.  */
add $16, %rax
movdqu 16(%rdi), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %rdx
/* size <= 32: the limit falls inside this chunk.  */
cmp $33, %r8
jb L(SizeEndCase1)
test %rdx, %rdx
jnz L(StringEndCase1)
/* Switch to aligned loads: %rcx = destination & 15, %rdi rounded down to a
   16-byte boundary.  %r8 becomes the size left counted from the aligned
   base + 16, which is where the loop starts (%rax is 16 here).  */
mov %rdi, %rcx
and $15, %rcx
and $-16, %rdi
add %rcx, %r8
sub $16, %r8
L(DstLenLoop):
/* One aligned 16-byte chunk per iteration at aligned base + %rax.  */
movdqa (%rdi, %rax), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %rdx
/* Leave the loop when the size limit ends inside this chunk.  */
sub $16, %r8
jbe L(SizeEndCase2)
test %rdx, %rdx
jnz L(StringEndCase2)
add $16, %rax
jmp L(DstLenLoop)
L(StringEndCase2):
/* NUL found by the aligned loop, limit not reached.  Undo the loop's
   subtraction, then let %rdx = index of the NUL inside the chunk.  */
add $16, %r8
bsf %rdx, %rdx
sub %rdx, %r8
add %rdx, %rax
/* %r9 = -misalignment; L(CopySrcString) adds %rax to it, which yields the
   destination length.  %rdi = address of the destination NUL.  */
sub %rcx, %r9
add %rax, %rdi
/* NOTE(review): this jump skips the (%rsi & 63) > 32 test made after the
   #endif, so the unaligned loads at L(CopySrcString) are not restricted to
   one 64-byte block on this path -- confirm.  */
jmp L(CopySrcString)
L(SizeEndCase1):
/* Limit falls inside one of the two unaligned chunks.  %rax is the chunk
   offset (0 or 16).  */
test %rdx, %rdx
jz L(SizeEnd)
bsf %rdx, %rdx
add %rdx, %rax
/* NUL position below the limit: the destination string is terminated.  */
cmp %r8, %rax
jb L(StringEnd)
L(SizeEnd):
/* No NUL within the limit: base of the result is the size itself.  */
mov %r8, %r9
jmp L(CalculateSrcLenCase1)
L(SizeEndCase2):
/* Limit falls inside an aligned chunk.  After the add, %r8 = number of
   bytes of this chunk that are still inside the limit.  */
add $16, %r8
test %rdx, %rdx
jz L(StringEndCase4)
bsf %rdx, %rdx
cmp %r8, %rdx
jb L(StringEndCase3)
L(StringEndCase4):
/* No NUL within the limit: %rax + %r8 - %rcx equals the size, which becomes
   the base of the result.  */
add %r8, %rax
sub %rcx, %rax
mov %rax, %r9
jmp L(CalculateSrcLenCase1)
L(StringEndCase3):
/* NUL found inside the limit in the last chunk; same bookkeeping as
   L(StringEndCase2) (%r8 was already restored at L(SizeEndCase2)).
   NOTE(review): like L(StringEndCase2), this path reaches L(CopySrcString)
   without the 64-byte offset test on %rsi -- confirm.  */
add %rdx, %rax
sub %rcx, %r9
add %rax, %rdi
sub %rdx, %r8
jmp L(CopySrcString)
L(StringEndCase1):
/* NUL found in one of the unaligned chunks with the limit beyond the chunk.
   %rcx is still 0 here, so the subtraction leaves %rax = destination
   length.  */
bsf %rdx, %rdx
add %rdx, %rax
sub %rcx, %rax
L(StringEnd):
/* %rdi = destination NUL, %r8 = size left for the appended text.  %rax
   keeps the destination length and falls through into the copy code.  */
add %rax, %rdi
sub %rax, %r8
#endif
/* Choose how to take the first look at the source.  %rcx = source & 63.  */
mov %rsi, %rcx
and $63, %rcx
cmp $32, %rcx
/* Offset <= 32: the two unaligned 16-byte loads at L(CopySrcString) end at
   or before offset 64, i.e. inside the same 64-byte block.  */
jbe L(CopySrcString)
/* Otherwise probe with aligned loads: %rsi is rounded down to a 16-byte
   boundary and %rcx = source & 15 (offset of the string in that block).  */
and $-16, %rsi
and $15, %rcx
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
/* %rdx = NUL mask of the aligned block, shifted right by %cl so that bit 0
   corresponds to the first source byte.  */
pcmpeqb (%rsi), %xmm1
pmovmskb %xmm1, %rdx
shr %cl, %rdx
/* %r10 = 16 - %rcx = number of source bytes covered so far.  */
mov $16, %r10
sub %rcx, %r10
/* Size left does not extend beyond the bytes covered: limit case.  */
cmp %r10, %r8
jbe L(CopyFrom1To16BytesTailCase2OrCase3)
test %rdx, %rdx
jnz L(CopyFrom1To16BytesTail)
/* Same check on the next aligned block; %r10 becomes 32 - %rcx.  */
pcmpeqb 16(%rsi), %xmm0
pmovmskb %xmm0, %rdx
add $16, %r10
cmp %r10, %r8
jbe L(CopyFrom1To32BytesCase2OrCase3)
test %rdx, %rdx
jnz L(CopyFrom1To32Bytes)
/* No NUL and no limit in the bytes covered: copy the first 16 source bytes
   (unaligned load from aligned base + %rcx) and enter the aligned loop.  */
movdqu (%rsi, %rcx), %xmm1
movdqu %xmm1, (%rdi)
#ifdef USE_AS_STRLCAT
/* L(LoopStart) overwrites %rax, so move the count held there into %r9.  */
add %rax, %r9
#endif
jmp L(LoopStart)
.p2align 4
L(CopySrcString):
/* Copy path that starts with unaligned 16-byte loads from the source.  */
#ifdef USE_AS_STRLCAT
/* Fold the count held in %rax into the base %r9 and restart %rax at 0.  */
add %rax, %r9
xor %rax, %rax
#endif
pxor %xmm0, %xmm0
/* Source bytes 0..15; %rdx = NUL mask.  */
movdqu (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %rdx
/* size <= 16: the limit ends inside this chunk.  */
cmp $17, %r8
jb L(CopyFrom1To16BytesTail1Case2OrCase3)
test %rdx, %rdx
jnz L(CopyFrom1To16BytesTail1)
/* Source bytes 16..31 are examined while bytes 0..15 are stored.  */
movdqu 16(%rsi), %xmm2
pcmpeqb %xmm2, %xmm0
movdqu %xmm1, (%rdi)
pmovmskb %xmm0, %rdx
add $16, %rax
/* size <= 32: the limit ends inside the second chunk.  */
cmp $33, %r8
jb L(CopyFrom1To32Bytes1Case2OrCase3)
test %rdx, %rdx
jnz L(CopyFrom1To32Bytes1)
/* Continue with aligned loads: %rcx = source & 15, %rsi rounded down.  */
mov %rsi, %rcx
and $15, %rcx
and $-16, %rsi
L(LoopStart):
/* Bias %rdi by -%rcx so that (%rdi, %rax) is the destination byte matching
   the aligned source byte (%rsi, %rax).  %r8 becomes the size left counted
   from aligned base + 16, and %rax restarts at 16.  */
sub %rcx, %rdi
add %rcx, %r8
sub $16, %r8
mov $16, %rax
L(16Loop):
/* One aligned 16-byte chunk per iteration.  %xmm0 stays all zero as long as
   no NUL has been seen.  */
movdqa (%rsi, %rax), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %rdx
/* Size limit ends inside this chunk: leave the loop.  */
sub $16, %r8
jbe L(CopyFrom1To16BytesCase2OrCase3)
/* NUL inside this chunk: copy the tail and return.  */
test %rdx, %rdx
jnz L(CopyFrom1To16BytesXmmExit)
movdqu %xmm1, (%rdi, %rax)
add $16, %rax
jmp L(16Loop)
/*------End of main part with loops---------------------*/

/* Case1: a NUL was found and it lies inside the size limit.
   Every stub below makes %rsi/%rdi point at the first byte still to be
   copied, sets %rdx to the number of characters in front of the NUL, adds
   that number to the count in %rax and dispatches through
   L(ExitStringTailTable).  The selected L(StringTailN) stub copies the
   characters together with the NUL and returns.  */

/* From the aligned first probe: NUL in the first aligned block.
   %rsi is the aligned base, %rcx the offset of the string inside it and
   %rdx the NUL mask already shifted to start at the string.  */
	.p2align 4
L(CopyFrom1To16BytesTail):
	add	%rcx, %rsi		/* back to the real source address */
	bsf	%rdx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

/* From L(CopySrcString): NUL in source bytes 16..31.  Bytes 0..15 have been
   stored and counted already, so step both pointers over them.  */
	.p2align 4
L(CopyFrom1To32Bytes1):
	add	$16, %rsi
	add	$16, %rdi
	sub	$16, %r8
/* From L(CopySrcString): NUL in source bytes 0..15.  */
L(CopyFrom1To16BytesTail1):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

/* From the aligned first probe: NUL in the second aligned block.
   %rdx becomes (index in block) + 16 - %rcx, the index of the NUL counted
   from the start of the string.  */
	.p2align 4
L(CopyFrom1To32Bytes):
	bsf	%rdx, %rdx
	add	%rcx, %rsi
	add	$16, %rdx
	sub	%rcx, %rdx
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)

/* Shared tail of the Case2 stubs once they have established that the NUL
   index in %rdx is below the size left.  */
	.p2align 4
L(CopyFrom1To16BytesExit):
	add	%rdx, %rax
	BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
/* Case2: the chunk just examined contains a NUL and the size limit also
   ends inside (or exactly at the end of) the bytes examined.  Each stub
   computes the NUL index in %rdx and compares it with the size left in %r8:
     - index below %r8: the string fits, continue at
       L(CopyFrom1To16BytesExit);
     - otherwise: truncate.  %r8 is added to the count and L(ExitTable) is
       indexed by %r8; the selected L(ExitN) stub stores the bytes and the
       NUL and continues at L(CalculateSrcLen).  */
.p2align 4
L(CopyFrom1To16BytesCase2):
/* From the aligned loop.  Undo the loop's subtraction so that %r8 is the
   size left at the start of this chunk, and point %rdi/%rsi at the chunk.  */
add $16, %r8
add %rax, %rdi
add %rax, %rsi
bsf %rdx, %rdx
/* %rax = number of bytes between the start of the string and this chunk.  */
sub %rcx, %rax
cmp %r8, %rdx
jb L(CopyFrom1To16BytesExit)
add %r8, %rax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
.p2align 4
L(CopyFrom1To32BytesCase2):
/* From the aligned first probe, NUL in the second aligned block.
   %rdx = (index in block) + 16 - %rcx = NUL index from the string start.  */
add %rcx, %rsi
bsf %rdx, %rdx
add $16, %rdx
sub %rcx, %rdx
cmp %r8, %rdx
jb L(CopyFrom1To16BytesExit)
add %r8, %rax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
L(CopyFrom1To16BytesTailCase2):
/* From the aligned first probe, NUL in the first aligned block (the mask
   was already shifted to start at the string).  */
add %rcx, %rsi
bsf %rdx, %rdx
cmp %r8, %rdx
jb L(CopyFrom1To16BytesExit)
add %r8, %rax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
.p2align 4
L(CopyFrom1To16BytesTail1Case2):
/* From L(CopySrcString); %rsi/%rdi already point at the chunk.  */
bsf %rdx, %rdx
cmp %r8, %rdx
jb L(CopyFrom1To16BytesExit)
add %r8, %rax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
/* Case2 or Case3: the size limit ends inside the bytes just examined.
   If the NUL mask in %rdx is non-zero this is Case2 (handled by the stubs
   of that name).  Otherwise it is Case3: no NUL before the limit, so the
   copy is truncated -- %r8 is added to the count and L(ExitTable) is
   indexed by %r8.  */
.p2align 4
L(CopyFrom1To16BytesCase2OrCase3):
/* From the aligned loop.  */
test %rdx, %rdx
jnz L(CopyFrom1To16BytesCase2)
/* Undo the loop's subtraction, point %rdi/%rsi at the chunk, and set
   %rax = chunk offset + size left - %rcx.  */
add $16, %r8
add %rax, %rdi
add %rax, %rsi
add %r8, %rax
sub %rcx, %rax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
.p2align 4
L(CopyFrom1To32BytesCase2OrCase3):
/* From the aligned first probe, limit inside the second aligned block.  */
test %rdx, %rdx
jnz L(CopyFrom1To32BytesCase2)
add %rcx, %rsi
add %r8, %rax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
.p2align 4
L(CopyFrom1To16BytesTailCase2OrCase3):
/* From the aligned first probe, limit inside the first aligned block.  */
test %rdx, %rdx
jnz L(CopyFrom1To16BytesTailCase2)
add %rcx, %rsi
add %r8, %rax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
.p2align 4
L(CopyFrom1To32Bytes1Case2OrCase3):
/* From L(CopySrcString), limit inside source bytes 16..31.  Bytes 0..15
   have been stored and counted, so step over them.  */
add $16, %rdi
add $16, %rsi
sub $16, %r8
L(CopyFrom1To16BytesTail1Case2OrCase3):
/* From L(CopySrcString), limit inside source bytes 0..15.  */
test %rdx, %rdx
jnz L(CopyFrom1To16BytesTail1Case2)
add %r8, %rax
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
.p2align 4
L(CopyFrom1To16BytesXmmExit):
/* From the aligned loop: NUL inside the chunk and the limit lies beyond the
   chunk.  %rdx = NUL index in the chunk; %rax = chunk offset + index - %rcx,
   the number of characters in front of the NUL.  */
bsf %rdx, %rdx
add %rax, %rdi
add %rax, %rsi
add %rdx, %rax
sub %rcx, %rax
BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
/*------------End of the labels for copying 1-16 and 1-32 bytes----*/
/* L(ExitTable) targets 0..16 (truncating copies).
   L(ExitN) loads N - 1 bytes from (%rsi), stores them at (%rdi), writes a
   NUL at offset N - 1 and continues at L(CalculateSrcLen).  All loads are
   issued before the stores.  L(Exit0) stores nothing and returns.
   Scratch registers: %rcx, %rdx.  */
	.p2align 4
L(Exit0):
	RETURN

	.p2align 4
L(Exit1):				/* NUL only */
	movb	$0, (%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit2):				/* 1 byte */
	movb	(%rsi), %dl
	movb	%dl, (%rdi)
	movb	$0, 1(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit3):				/* 2 bytes */
	movw	(%rsi), %dx
	movw	%dx, (%rdi)
	movb	$0, 2(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit4):				/* 3 bytes = 2 + 1 */
	movw	(%rsi), %cx
	movb	2(%rsi), %dl
	movw	%cx, (%rdi)
	movb	%dl, 2(%rdi)
	movb	$0, 3(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit5):				/* 4 bytes */
	movl	(%rsi), %edx
	movl	%edx, (%rdi)
	movb	$0, 4(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit6):				/* 5 bytes = 4 + 1 */
	movl	(%rsi), %ecx
	movb	4(%rsi), %dl
	movl	%ecx, (%rdi)
	movb	%dl, 4(%rdi)
	movb	$0, 5(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit7):				/* 6 bytes = 4 + 2 */
	movl	(%rsi), %ecx
	movw	4(%rsi), %dx
	movl	%ecx, (%rdi)
	movw	%dx, 4(%rdi)
	movb	$0, 6(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit8):				/* 7 bytes = two overlapping dwords */
	movl	(%rsi), %ecx
	movl	3(%rsi), %edx
	movl	%ecx, (%rdi)
	movl	%edx, 3(%rdi)
	movb	$0, 7(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit9):				/* 8 bytes */
	movq	(%rsi), %rdx
	movq	%rdx, (%rdi)
	movb	$0, 8(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit10):				/* 9 bytes = 8 + 1 */
	movq	(%rsi), %rcx
	movb	8(%rsi), %dl
	movq	%rcx, (%rdi)
	movb	%dl, 8(%rdi)
	movb	$0, 9(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit11):				/* 10 bytes = 8 + 2 */
	movq	(%rsi), %rcx
	movw	8(%rsi), %dx
	movq	%rcx, (%rdi)
	movw	%dx, 8(%rdi)
	movb	$0, 10(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit12):				/* 11 bytes = qword + overlapping dword */
	movq	(%rsi), %rcx
	movl	7(%rsi), %edx
	movq	%rcx, (%rdi)
	movl	%edx, 7(%rdi)
	movb	$0, 11(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit13):				/* 12 bytes = 8 + 4 */
	movq	(%rsi), %rcx
	movl	8(%rsi), %edx
	movq	%rcx, (%rdi)
	movl	%edx, 8(%rdi)
	movb	$0, 12(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit14):				/* 13 bytes = two overlapping qwords */
	movq	(%rsi), %rcx
	movq	5(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 5(%rdi)
	movb	$0, 13(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit15):				/* 14 bytes = two overlapping qwords */
	movq	(%rsi), %rcx
	movq	6(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 6(%rdi)
	movb	$0, 14(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit16):				/* 15 bytes = two overlapping qwords */
	movq	(%rsi), %rcx
	movq	7(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 7(%rdi)
	movb	$0, 15(%rdi)
	jmp	L(CalculateSrcLen)
/* L(ExitTable) targets 17..32 (truncating copies).
   L(ExitN) loads N - 1 bytes from (%rsi), stores them at (%rdi), writes a
   NUL at offset N - 1 and continues at L(CalculateSrcLen).  All loads are
   issued before the stores.
   Scratch registers: %rcx, %rdx, %xmm0, %xmm2.  */
	.p2align 4
L(Exit17):				/* 16 bytes */
	movdqu	(%rsi), %xmm0
	movdqu	%xmm0, (%rdi)
	movb	$0, 16(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit18):				/* 17 bytes = 16 + 1 */
	movdqu	(%rsi), %xmm0
	movb	16(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movb	%dl, 16(%rdi)
	movb	$0, 17(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit19):				/* 18 bytes = 16 + 2 */
	movdqu	(%rsi), %xmm0
	movw	16(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movw	%cx, 16(%rdi)
	movb	$0, 18(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit20):				/* 19 bytes = 16 + overlapping dword */
	movdqu	(%rsi), %xmm0
	movl	15(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 15(%rdi)
	movb	$0, 19(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit21):				/* 20 bytes = 16 + 4 */
	movdqu	(%rsi), %xmm0
	movl	16(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 16(%rdi)
	movb	$0, 20(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit22):				/* 21 bytes = 16 + 4 + 1 */
	movdqu	(%rsi), %xmm0
	movl	16(%rsi), %ecx
	movb	20(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 16(%rdi)
	movb	%dl, 20(%rdi)
	movb	$0, 21(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit23):				/* 22 bytes = 16 + overlapping qword */
	movdqu	(%rsi), %xmm0
	movq	14(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 14(%rdi)
	movb	$0, 22(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit24):				/* 23 bytes = 16 + overlapping qword */
	movdqu	(%rsi), %xmm0
	movq	15(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 15(%rdi)
	movb	$0, 23(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit25):				/* 24 bytes = 16 + 8 */
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 16(%rdi)
	movb	$0, 24(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit26):				/* 25 bytes = 16 + 8 + 1 */
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rcx
	movb	24(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 16(%rdi)
	movb	%dl, 24(%rdi)
	movb	$0, 25(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit27):				/* 26 bytes = 16 + 8 + 2 */
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movw	24(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movw	%cx, 24(%rdi)
	movb	$0, 26(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit28):				/* 27 bytes = 16 + 8 + overlapping dword */
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movl	23(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movl	%ecx, 23(%rdi)
	movb	$0, 27(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit29):				/* 28 bytes = 16 + 8 + 4 */
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movl	24(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movl	%ecx, 24(%rdi)
	movb	$0, 28(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit30):				/* 29 bytes = two overlapping 16-byte moves */
	movdqu	(%rsi), %xmm0
	movdqu	13(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 13(%rdi)
	movb	$0, 29(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit31):				/* 30 bytes = two overlapping 16-byte moves */
	movdqu	(%rsi), %xmm0
	movdqu	14(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 14(%rdi)
	movb	$0, 30(%rdi)
	jmp	L(CalculateSrcLen)

	.p2align 4
L(Exit32):				/* 31 bytes = two overlapping 16-byte moves */
	movdqu	(%rsi), %xmm0
	movdqu	15(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 15(%rdi)
	movb	$0, 31(%rdi)
	jmp	L(CalculateSrcLen)
/* L(ExitStringTailTable) targets 0..15 (complete copies).
   L(StringTailN) moves N + 1 bytes from (%rsi) to (%rdi) -- the N
   characters in front of the NUL plus the NUL itself -- and returns through
   RETURN.  All loads are issued before the stores.
   Scratch registers: %rcx, %rdx, %xmm0.  */
	.p2align 4
L(StringTail0):				/* 1 byte */
	movb	(%rsi), %dl
	movb	%dl, (%rdi)
	RETURN

	.p2align 4
L(StringTail1):				/* 2 bytes */
	movw	(%rsi), %dx
	movw	%dx, (%rdi)
	RETURN

	.p2align 4
L(StringTail2):				/* 3 bytes = 2 + 1 */
	movw	(%rsi), %cx
	movb	2(%rsi), %dl
	movw	%cx, (%rdi)
	movb	%dl, 2(%rdi)
	RETURN

	.p2align 4
L(StringTail3):				/* 4 bytes */
	movl	(%rsi), %edx
	movl	%edx, (%rdi)
	RETURN

	.p2align 4
L(StringTail4):				/* 5 bytes = 4 + 1 */
	movl	(%rsi), %ecx
	movb	4(%rsi), %dl
	movl	%ecx, (%rdi)
	movb	%dl, 4(%rdi)
	RETURN

	.p2align 4
L(StringTail5):				/* 6 bytes = 4 + 2 */
	movl	(%rsi), %ecx
	movw	4(%rsi), %dx
	movl	%ecx, (%rdi)
	movw	%dx, 4(%rdi)
	RETURN

	.p2align 4
L(StringTail6):				/* 7 bytes = two overlapping dwords */
	movl	(%rsi), %ecx
	movl	3(%rsi), %edx
	movl	%ecx, (%rdi)
	movl	%edx, 3(%rdi)
	RETURN

	.p2align 4
L(StringTail7):				/* 8 bytes */
	movq	(%rsi), %rdx
	movq	%rdx, (%rdi)
	RETURN

	.p2align 4
L(StringTail8):				/* 9 bytes = 8 + 1 */
	movq	(%rsi), %rcx
	movb	8(%rsi), %dl
	movq	%rcx, (%rdi)
	movb	%dl, 8(%rdi)
	RETURN

	.p2align 4
L(StringTail9):				/* 10 bytes = 8 + 2 */
	movq	(%rsi), %rcx
	movw	8(%rsi), %dx
	movq	%rcx, (%rdi)
	movw	%dx, 8(%rdi)
	RETURN

	.p2align 4
L(StringTail10):			/* 11 bytes = qword + overlapping dword */
	movq	(%rsi), %rcx
	movl	7(%rsi), %edx
	movq	%rcx, (%rdi)
	movl	%edx, 7(%rdi)
	RETURN

	.p2align 4
L(StringTail11):			/* 12 bytes = 8 + 4 */
	movq	(%rsi), %rcx
	movl	8(%rsi), %edx
	movq	%rcx, (%rdi)
	movl	%edx, 8(%rdi)
	RETURN

	.p2align 4
L(StringTail12):			/* 13 bytes = two overlapping qwords */
	movq	(%rsi), %rcx
	movq	5(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 5(%rdi)
	RETURN

	.p2align 4
L(StringTail13):			/* 14 bytes = two overlapping qwords */
	movq	(%rsi), %rcx
	movq	6(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 6(%rdi)
	RETURN

	.p2align 4
L(StringTail14):			/* 15 bytes = two overlapping qwords */
	movq	(%rsi), %rcx
	movq	7(%rsi), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, 7(%rdi)
	RETURN

	.p2align 4
L(StringTail15):			/* 16 bytes */
	movdqu	(%rsi), %xmm0
	movdqu	%xmm0, (%rdi)
	RETURN
/* L(ExitStringTailTable) targets 16..32 (complete copies).
   L(StringTailN) moves N + 1 bytes from (%rsi) to (%rdi) -- the N
   characters in front of the NUL plus the NUL itself -- and returns through
   RETURN.  All loads are issued before the stores.
   Scratch registers: %rcx, %rdx, %xmm0, %xmm2.  */
	.p2align 4
L(StringTail16):			/* 17 bytes = 16 + 1 */
	movdqu	(%rsi), %xmm0
	movb	16(%rsi), %cl
	movdqu	%xmm0, (%rdi)
	movb	%cl, 16(%rdi)
	RETURN

	.p2align 4
L(StringTail17):			/* 18 bytes = 16 + 2 */
	movdqu	(%rsi), %xmm0
	movw	16(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movw	%cx, 16(%rdi)
	RETURN

	.p2align 4
L(StringTail18):			/* 19 bytes = 16 + overlapping dword */
	movdqu	(%rsi), %xmm0
	movl	15(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 15(%rdi)
	RETURN

	.p2align 4
L(StringTail19):			/* 20 bytes = 16 + 4 */
	movdqu	(%rsi), %xmm0
	movl	16(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 16(%rdi)
	RETURN

	.p2align 4
L(StringTail20):			/* 21 bytes = 16 + 4 + 1 */
	movdqu	(%rsi), %xmm0
	movl	16(%rsi), %ecx
	movb	20(%rsi), %dl
	movdqu	%xmm0, (%rdi)
	movl	%ecx, 16(%rdi)
	movb	%dl, 20(%rdi)
	RETURN

	.p2align 4
L(StringTail21):			/* 22 bytes = 16 + overlapping qword */
	movdqu	(%rsi), %xmm0
	movq	14(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 14(%rdi)
	RETURN

	.p2align 4
L(StringTail22):			/* 23 bytes = 16 + overlapping qword */
	movdqu	(%rsi), %xmm0
	movq	15(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 15(%rdi)
	RETURN

	.p2align 4
L(StringTail23):			/* 24 bytes = 16 + 8 */
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rcx
	movdqu	%xmm0, (%rdi)
	movq	%rcx, 16(%rdi)
	RETURN

	.p2align 4
L(StringTail24):			/* 25 bytes = 16 + 8 + 1 */
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movb	24(%rsi), %cl
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movb	%cl, 24(%rdi)
	RETURN

	.p2align 4
L(StringTail25):			/* 26 bytes = 16 + 8 + 2 */
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movw	24(%rsi), %cx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movw	%cx, 24(%rdi)
	RETURN

	.p2align 4
L(StringTail26):			/* 27 bytes = 16 + 8 + overlapping dword */
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movl	23(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movl	%ecx, 23(%rdi)
	RETURN

	.p2align 4
L(StringTail27):			/* 28 bytes = 16 + 8 + 4 */
	movdqu	(%rsi), %xmm0
	movq	16(%rsi), %rdx
	movl	24(%rsi), %ecx
	movdqu	%xmm0, (%rdi)
	movq	%rdx, 16(%rdi)
	movl	%ecx, 24(%rdi)
	RETURN

	.p2align 4
L(StringTail28):			/* 29 bytes = two overlapping 16-byte moves */
	movdqu	(%rsi), %xmm0
	movdqu	13(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 13(%rdi)
	RETURN

	.p2align 4
L(StringTail29):			/* 30 bytes = two overlapping 16-byte moves */
	movdqu	(%rsi), %xmm0
	movdqu	14(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 14(%rdi)
	RETURN

	.p2align 4
L(StringTail30):			/* 31 bytes = two overlapping 16-byte moves */
	movdqu	(%rsi), %xmm0
	movdqu	15(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 15(%rdi)
	RETURN

	.p2align 4
L(StringTail31):			/* 32 bytes = 16 + 16 */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm2
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 16(%rdi)
	RETURN

	.p2align 4
L(StringTail32):			/* 33 bytes = 16 + 16 + 1 */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm2
	movb	32(%rsi), %cl
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 16(%rdi)
	movb	%cl, 32(%rdi)
	RETURN
/* L(StringTail33): L(ExitStringTailTable) target 33.  Moves 34 bytes from
   (%rsi) to (%rdi): 33 characters plus the NUL at offset 33, as 16 + 16 + 2.
   The final move must be a word; a single byte at offset 32 would leave the
   NUL at offset 33 uncopied.  Scratch registers: %rcx, %xmm0, %xmm2.  */
	.p2align 4
L(StringTail33):
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm2
	movw	32(%rsi), %cx		/* bytes 32 and 33 (last character + NUL) */
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm2, 16(%rdi)
	movw	%cx, 32(%rdi)
	RETURN
/* Measure the part of the source that was not copied and return.
   L(CalculateSrcLenCase1): entry that first clears %r8 and %rax.
   L(CalculateSrcLen): on entry %rsi + %r8 is the first source byte not yet
   counted, %rax the count so far and %r9 the base added by RETURN.
   Result: %rax + %r9 + number of bytes from %rsi + %r8 up to the NUL.

   Only aligned 16-byte loads are used.  The block that contains the start
   address is loaded whole and the mask bits of the bytes in front of the
   start are shifted out, so no load reaches beyond the aligned block that
   holds the terminating NUL.  */
	.p2align 4
L(CalculateSrcLenCase1):
	xor	%r8, %r8
	xor	%rax, %rax
L(CalculateSrcLen):
	add	%r8, %rsi		/* rsi = first byte still to be counted */
	mov	%rsi, %rcx
	and	$15, %rcx		/* rcx = offset of rsi in its 16-byte block */
	and	$-16, %rsi		/* rsi = start of that block */
	pxor	%xmm0, %xmm0
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %rdx
	shr	%cl, %rdx		/* bit 0 now corresponds to the start byte */
	test	%rdx, %rdx
	jnz	L(SrcLenFirstBlockEnd)
	/* Bytes in front of the start may have matched; clear the compare
	   register again before it is reused as the zero operand.  */
	pxor	%xmm0, %xmm0
	add	%rax, %r9		/* keep the count so far in the base */
	mov	$16, %rax		/* rax = offset from the aligned block */
L(SrcLenLoop):
	movdqa	(%rsi, %rax), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %rdx
	test	%rdx, %rdx
	jnz	L(SrcLenLoopEnd)
	add	$16, %rax
	jmp	L(SrcLenLoop)

	/* NUL inside the first block: its bit index is already relative to
	   the start byte.  */
	.p2align 4
L(SrcLenFirstBlockEnd):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	RETURN

	/* NUL inside a later block: %rax + index is relative to the aligned
	   block, so take the start offset %rcx off again.  */
	.p2align 4
L(SrcLenLoopEnd):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	sub	%rcx, %rax
	RETURN
END (STRLCPY)
/* Jump tables used by BRANCH_TO_JMPTBL_ENTRY.  Every entry is the 32-bit
   distance of a stub from the start of its table (see JMPTBL).
   The alignment directive comes after the section switch so that it applies
   to the tables in .rodata and not to the end of .text.  */
	.section .rodata
	.p2align 4
/* Indexed by the size left: entry N is L(ExitN), N = 0..32.  */
L(ExitTable):
	.int	JMPTBL(L(Exit0), L(ExitTable))
	.int	JMPTBL(L(Exit1), L(ExitTable))
	.int	JMPTBL(L(Exit2), L(ExitTable))
	.int	JMPTBL(L(Exit3), L(ExitTable))
	.int	JMPTBL(L(Exit4), L(ExitTable))
	.int	JMPTBL(L(Exit5), L(ExitTable))
	.int	JMPTBL(L(Exit6), L(ExitTable))
	.int	JMPTBL(L(Exit7), L(ExitTable))
	.int	JMPTBL(L(Exit8), L(ExitTable))
	.int	JMPTBL(L(Exit9), L(ExitTable))
	.int	JMPTBL(L(Exit10), L(ExitTable))
	.int	JMPTBL(L(Exit11), L(ExitTable))
	.int	JMPTBL(L(Exit12), L(ExitTable))
	.int	JMPTBL(L(Exit13), L(ExitTable))
	.int	JMPTBL(L(Exit14), L(ExitTable))
	.int	JMPTBL(L(Exit15), L(ExitTable))
	.int	JMPTBL(L(Exit16), L(ExitTable))
	.int	JMPTBL(L(Exit17), L(ExitTable))
	.int	JMPTBL(L(Exit18), L(ExitTable))
	.int	JMPTBL(L(Exit19), L(ExitTable))
	.int	JMPTBL(L(Exit20), L(ExitTable))
	.int	JMPTBL(L(Exit21), L(ExitTable))
	.int	JMPTBL(L(Exit22), L(ExitTable))
	.int	JMPTBL(L(Exit23), L(ExitTable))
	.int	JMPTBL(L(Exit24), L(ExitTable))
	.int	JMPTBL(L(Exit25), L(ExitTable))
	.int	JMPTBL(L(Exit26), L(ExitTable))
	.int	JMPTBL(L(Exit27), L(ExitTable))
	.int	JMPTBL(L(Exit28), L(ExitTable))
	.int	JMPTBL(L(Exit29), L(ExitTable))
	.int	JMPTBL(L(Exit30), L(ExitTable))
	.int	JMPTBL(L(Exit31), L(ExitTable))
	.int	JMPTBL(L(Exit32), L(ExitTable))
/* Indexed by the number of characters in front of the NUL: entry N is
   L(StringTailN), N = 0..33.  */
L(ExitStringTailTable):
	.int	JMPTBL(L(StringTail0), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail1), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail2), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail3), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail4), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail5), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail6), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail7), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail8), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail9), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail10), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail11), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail12), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail13), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail14), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail15), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail16), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail17), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail18), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail19), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail20), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail21), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail22), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail23), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail24), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail25), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail26), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail27), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail28), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail29), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail30), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail31), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail32), L(ExitStringTailTable))
	.int	JMPTBL(L(StringTail33), L(ExitStringTailTable))