diff --git a/libc/arch-x86_64/string/sse2-strlcat-slm.S b/libc/arch-x86_64/string/sse2-strlcat-slm.S new file mode 100644 index 000000000..d79e8c14e --- /dev/null +++ b/libc/arch-x86_64/string/sse2-strlcat-slm.S @@ -0,0 +1,37 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define USE_AS_STRLCAT + +#ifndef STRLCPY +# define STRLCPY strlcat +#endif + +#include "sse2-strlcpy-slm.S" diff --git a/libc/arch-x86_64/string/sse2-strlcpy-slm.S b/libc/arch-x86_64/string/sse2-strlcpy-slm.S new file mode 100755 index 000000000..9d4b52f60 --- /dev/null +++ b/libc/arch-x86_64/string/sse2-strlcpy-slm.S @@ -0,0 +1,1062 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + + +#ifndef STRLCPY +# define STRLCPY strlcpy +#endif + +#define JMPTBL(I, B) I - B +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + lea (%r11, %rcx), %rcx; \ + jmp *%rcx + +#define RETURN \ + add %r9, %rax; \ + ret + +.text +ENTRY (STRLCPY) + xor %rax, %rax + xor %r9, %r9 + mov %rdx, %r8 + cmp $0, %r8 + jz L(CalculateSrcLen) + +#ifdef USE_AS_STRLCAT + xor %rcx, %rcx + pxor %xmm0, %xmm0 + + movdqu (%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + + cmp $17, %r8 + jb L(SizeEndCase1) + test %rdx, %rdx + jnz L(StringEndCase1) + + add $16, %rax + movdqu 16(%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + + cmp $33, %r8 + jb L(SizeEndCase1) + test %rdx, %rdx + jnz L(StringEndCase1) + + mov %rdi, %rcx + and $15, %rcx + and $-16, %rdi + + add %rcx, %r8 + sub $16, %r8 + +L(DstLenLoop): + movdqa (%rdi, %rax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + sub $16, %r8 + jbe L(SizeEndCase2) + test %rdx, %rdx + jnz L(StringEndCase2) + add $16, %rax + jmp L(DstLenLoop) + +L(StringEndCase2): + add $16, %r8 + bsf %rdx, %rdx + sub %rdx, %r8 + add %rdx, %rax + sub %rcx, %r9 + add %rax, %rdi + jmp L(CopySrcString) + +L(SizeEndCase1): + test %rdx, %rdx + jz L(SizeEnd) + bsf %rdx, %rdx + add %rdx, %rax + cmp %r8, %rax + jb L(StringEnd) +L(SizeEnd): + mov %r8, %r9 + jmp L(CalculateSrcLenCase1) + +L(SizeEndCase2): + add $16, %r8 + test %rdx, %rdx + jz L(StringEndCase4) + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(StringEndCase3) +L(StringEndCase4): + add %r8, %rax + sub %rcx, %rax + mov %rax, %r9 + jmp L(CalculateSrcLenCase1) + +L(StringEndCase3): + add %rdx, %rax + sub %rcx, %r9 + add %rax, %rdi + sub %rdx, %r8 + jmp L(CopySrcString) + +L(StringEndCase1): + bsf %rdx, %rdx + add %rdx, %rax + sub %rcx, %rax +L(StringEnd): + add %rax, %rdi + sub %rax, %r8 +#endif + + mov %rsi, %rcx + and $63, %rcx + cmp $32, %rcx + jbe L(CopySrcString) + + and $-16, %rsi + and $15, %rcx + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%rsi), %xmm1 + pmovmskb %xmm1, %rdx + shr %cl, %rdx + mov $16, %r10 + sub %rcx, %r10 + cmp %r10, %r8 + jbe L(CopyFrom1To16BytesTailCase2OrCase3) + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm0, %rdx + add $16, %r10 + cmp %r10, %r8 + jbe L(CopyFrom1To32BytesCase2OrCase3) + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes) + + movdqu (%rsi, %rcx), %xmm1 + movdqu %xmm1, (%rdi) +#ifdef USE_AS_STRLCAT + add %rax, %r9 +#endif + jmp L(LoopStart) + + .p2align 4 +L(CopySrcString): +#ifdef USE_AS_STRLCAT + add %rax, %r9 + xor %rax, %rax +#endif + pxor 
%xmm0, %xmm0 + movdqu (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + + cmp $17, %r8 + jb L(CopyFrom1To16BytesTail1Case2OrCase3) + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1) + + movdqu 16(%rsi), %xmm2 + pcmpeqb %xmm2, %xmm0 + movdqu %xmm1, (%rdi) + pmovmskb %xmm0, %rdx + add $16, %rax + + cmp $33, %r8 + jb L(CopyFrom1To32Bytes1Case2OrCase3) + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes1) + + mov %rsi, %rcx + and $15, %rcx + and $-16, %rsi + +L(LoopStart): + sub %rcx, %rdi + add %rcx, %r8 + sub $16, %r8 + mov $16, %rax + +L(16Loop): + movdqa (%rsi, %rax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx + jnz L(CopyFrom1To16BytesXmmExit) + movdqu %xmm1, (%rdi, %rax) + add $16, %rax + jmp L(16Loop) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16Bytes): + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + add %rdx, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTail): + add %rcx, %rsi + bsf %rdx, %rdx + add %rdx, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %rsi + add $16, %rdi + sub $16, %r8 +L(CopyFrom1To16BytesTail1): + bsf %rdx, %rdx + add %rdx, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %rdx, %rdx + add %rcx, %rsi + add $16, %rdx + sub %rcx, %rdx + add %rdx, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To16BytesExit): + add %rdx, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rax, %rdi + add %rax, %rsi + bsf %rdx, %rdx + sub %rcx, %rax + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + add %r8, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + add %rcx, %rsi + bsf %rdx, %rdx + add $16, %rdx + sub %rcx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + add %r8, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4) + +L(CopyFrom1To16BytesTailCase2): + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + add %r8, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To16BytesTail1Case2): + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + add %r8, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesCase2) + add $16, %r8 + add %rax, %rdi + add %rax, %rsi + add %r8, %rax + sub %rcx, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To32BytesCase2) + add %rcx, %rsi + add %r8, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTailCase2) + add %rcx, %rsi + add %r8, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %rdi + add $16, %rsi + sub $16, %r8 +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1Case2) + add %r8, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %rdx, %rdx + add %rax, %rdi + add %rax, %rsi + add %rdx, %rax + sub 
%rcx, %rax + BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4) + +/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/ + + + .p2align 4 +L(Exit0): + RETURN + + .p2align 4 +L(Exit1): + movb $0, (%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit2): + movb (%rsi), %dh + movb %dh, (%rdi) + movb $0, 1(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit3): + movw (%rsi), %dx + movw %dx, (%rdi) + movb $0, 2(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit4): + movw (%rsi), %cx + movb 2(%rsi), %dh + movw %cx, (%rdi) + movb %dh, 2(%rdi) + movb $0, 3(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit5): + movl (%rsi), %edx + movl %edx, (%rdi) + movb $0, 4(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit6): + movl (%rsi), %ecx + movb 4(%rsi), %dh + movl %ecx, (%rdi) + movb %dh, 4(%rdi) + movb $0, 5(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit7): + movl (%rsi), %ecx + movw 4(%rsi), %dx + movl %ecx, (%rdi) + movw %dx, 4(%rdi) + movb $0, 6(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit8): + movl (%rsi), %ecx + movl 3(%rsi), %edx + movl %ecx, (%rdi) + movl %edx, 3(%rdi) + movb $0, 7(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit9): + movq (%rsi), %rdx + movq %rdx, (%rdi) + movb $0, 8(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit10): + movq (%rsi), %rcx + movb 8(%rsi), %dh + movq %rcx, (%rdi) + movb %dh, 8(%rdi) + movb $0, 9(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit11): + movq (%rsi), %rcx + movw 8(%rsi), %dx + movq %rcx, (%rdi) + movw %dx, 8(%rdi) + movb $0, 10(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit12): + movq (%rsi), %rcx + movl 7(%rsi), %edx + movq %rcx, (%rdi) + movl %edx, 7(%rdi) + movb $0, 11(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit13): + movq (%rsi), %rcx + movl 8(%rsi), %edx + movq %rcx, (%rdi) + movl %edx, 8(%rdi) + movb $0, 12(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit14): + movq (%rsi), %rcx + movq 5(%rsi), %rdx + movq %rcx, (%rdi) + movq %rdx, 5(%rdi) + movb $0, 13(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit15): + movq (%rsi), %rcx + movq 6(%rsi), %rdx + movq %rcx, (%rdi) + movq %rdx, 6(%rdi) + movb $0, 14(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit16): + movq (%rsi), %rcx + movq 7(%rsi), %rdx + movq %rcx, (%rdi) + movq %rdx, 7(%rdi) + movb $0, 15(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit17): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) + movb $0, 16(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit18): + movdqu (%rsi), %xmm0 + movb 16(%rsi), %dh + movdqu %xmm0, (%rdi) + movb %dh, 16(%rdi) + movb $0, 17(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit19): + movdqu (%rsi), %xmm0 + movw 16(%rsi), %cx + movdqu %xmm0, (%rdi) + movw %cx, 16(%rdi) + movb $0, 18(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit20): + movdqu (%rsi), %xmm0 + movl 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + movl %ecx, 15(%rdi) + movb $0, 19(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit21): + movdqu (%rsi), %xmm0 + movl 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + movl %ecx, 16(%rdi) + movb $0, 20(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit22): + movdqu (%rsi), %xmm0 + movl 16(%rsi), %ecx + movb 20(%rsi), %dh + movdqu %xmm0, (%rdi) + movl %ecx, 16(%rdi) + movb %dh, 20(%rdi) + movb $0, 21(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit23): + movdqu (%rsi), %xmm0 + movq 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + movq %rcx, 14(%rdi) + movb $0, 22(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit24): + movdqu (%rsi), %xmm0 + movq 
15(%rsi), %rcx + movdqu %xmm0, (%rdi) + movq %rcx, 15(%rdi) + movb $0, 23(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit25): + movdqu (%rsi), %xmm0 + movq 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + movq %rcx, 16(%rdi) + movb $0, 24(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit26): + movdqu (%rsi), %xmm0 + movq 16(%rsi), %rcx + movb 24(%rsi), %dh + movdqu %xmm0, (%rdi) + movq %rcx, 16(%rdi) + mov %dh, 24(%rdi) + movb $0, 25(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit27): + movdqu (%rsi), %xmm0 + movq 16(%rsi), %rdx + movw 24(%rsi), %cx + movdqu %xmm0, (%rdi) + movq %rdx, 16(%rdi) + movw %cx, 24(%rdi) + movb $0, 26(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit28): + movdqu (%rsi), %xmm0 + movq 16(%rsi), %rdx + movl 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + movq %rdx, 16(%rdi) + movl %ecx, 23(%rdi) + movb $0, 27(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit29): + movdqu (%rsi), %xmm0 + movq 16(%rsi), %rdx + movl 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + movq %rdx, 16(%rdi) + movl %ecx, 24(%rdi) + movb $0, 28(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit30): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) + movb $0, 29(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit31): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) + movb $0, 30(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(Exit32): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) + movb $0, 31(%rdi) + jmp L(CalculateSrcLen) + + .p2align 4 +L(StringTail0): + mov (%rsi), %dl + mov %dl, (%rdi) + RETURN + + .p2align 4 +L(StringTail1): + mov (%rsi), %dx + mov %dx, (%rdi) + RETURN + + .p2align 4 +L(StringTail2): + mov (%rsi), %cx + mov 2(%rsi), %dl + mov %cx, (%rdi) + mov %dl, 2(%rdi) + RETURN + + .p2align 4 +L(StringTail3): + mov (%rsi), %edx + mov %edx, (%rdi) + RETURN + + .p2align 4 +L(StringTail4): + mov (%rsi), %ecx + mov 4(%rsi), %dl + mov %ecx, (%rdi) + mov %dl, 4(%rdi) + RETURN + + .p2align 4 +L(StringTail5): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) + RETURN + + .p2align 4 +L(StringTail6): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) + RETURN + + .p2align 4 +L(StringTail7): + mov (%rsi), %rdx + mov %rdx, (%rdi) + RETURN + + .p2align 4 +L(StringTail8): + mov (%rsi), %rcx + mov 8(%rsi), %dl + mov %rcx, (%rdi) + mov %dl, 8(%rdi) + RETURN + + .p2align 4 +L(StringTail9): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) + RETURN + + .p2align 4 +L(StringTail10): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) + RETURN + + .p2align 4 +L(StringTail11): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) + RETURN + + .p2align 4 +L(StringTail12): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) + RETURN + + .p2align 4 +L(StringTail13): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) + RETURN + + .p2align 4 +L(StringTail14): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) + RETURN + + .p2align 4 +L(StringTail15): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) + RETURN + + .p2align 4 +L(StringTail16): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %cl, 16(%rdi) + RETURN + + .p2align 4 +L(StringTail17): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) + RETURN + + 
.p2align 4 +L(StringTail18): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) + RETURN + + .p2align 4 +L(StringTail19): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + RETURN + + .p2align 4 +L(StringTail20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + mov 20(%rsi), %dl + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dl, 20(%rdi) + RETURN + + .p2align 4 +L(StringTail21): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) + RETURN + + .p2align 4 +L(StringTail22): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) + RETURN + + .p2align 4 +L(StringTail23): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) + RETURN + + .p2align 4 +L(StringTail24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cl, 24(%rdi) + RETURN + + .p2align 4 +L(StringTail25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) + RETURN + + .p2align 4 +L(StringTail26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) + RETURN + + .p2align 4 +L(StringTail27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) + RETURN + + .p2align 4 +L(StringTail28): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) + RETURN + + .p2align 4 +L(StringTail29): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) + RETURN + + .p2align 4 +L(StringTail30): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) + RETURN + + .p2align 4 +L(StringTail31): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) + RETURN + + .p2align 4 +L(StringTail32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + mov 32(%rsi), %cl + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) + mov %cl, 32(%rdi) + RETURN + + .p2align 4 +L(StringTail33): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + mov 32(%rsi), %cl + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) + mov %cl, 32(%rdi) + RETURN + + .p2align 4 +L(CalculateSrcLenCase1): + xor %r8, %r8 + xor %rax, %rax +L(CalculateSrcLen): + pxor %xmm0, %xmm0 + xor %rcx, %rcx + add %r8, %rsi + movdqu (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + test %rdx, %rdx + jnz L(SrcLenLoopEnd) + + add %rax, %r9 + mov $16, %rax + mov %rsi, %rcx + and $15, %rcx + and $-16, %rsi +L(SrcLenLoop): + movdqa (%rsi, %rax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + test %rdx, %rdx + jnz L(SrcLenLoopEnd) + add $16, %rax + jmp L(SrcLenLoop) + + .p2align 4 +L(SrcLenLoopEnd): + bsf %rdx, %rdx + add %rdx, %rax + sub %rcx, %rax + RETURN + +END (STRLCPY) + + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit0), L(ExitTable)) + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int 
JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +L(ExitStringTailTable): + .int JMPTBL(L(StringTail0), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail1), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail2), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail3), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail4), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail5), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail6), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail7), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail8), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail9), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail10), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail11), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail12), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail13), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail14), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail15), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail16), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail17), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail18), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail19), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail20), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail21), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail22), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail23), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail24), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail25), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail26), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail27), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail28), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail29), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail30), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail31), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail32), L(ExitStringTailTable)) + .int JMPTBL(L(StringTail33), L(ExitStringTailTable)) diff --git a/libc/arch-x86_64/x86_64.mk b/libc/arch-x86_64/x86_64.mk index 8675ef45a..00a376300 100644 --- a/libc/arch-x86_64/x86_64.mk +++ b/libc/arch-x86_64/x86_64.mk @@ -25,10 +25,6 @@ libc_freebsd_src_files_x86_64 += \ upstream-freebsd/lib/libc/string/wmemcmp.c \ upstream-freebsd/lib/libc/string/wmemmove.c \ -libc_openbsd_src_files_x86_64 += \ - upstream-openbsd/lib/libc/string/strlcat.c \ - upstream-openbsd/lib/libc/string/strlcpy.c \ - # # Inherently architecture-specific code. 
 #
@@ -56,6 +52,8 @@ libc_bionic_src_files_x86_64 += \
     arch-x86_64/string/sse2-stpncpy-slm.S \
     arch-x86_64/string/sse2-strcat-slm.S \
     arch-x86_64/string/sse2-strcpy-slm.S \
+    arch-x86_64/string/sse2-strlcat-slm.S \
+    arch-x86_64/string/sse2-strlcpy-slm.S \
     arch-x86_64/string/sse2-strlen-slm.S \
     arch-x86_64/string/sse2-strncat-slm.S \
     arch-x86_64/string/sse2-strncpy-slm.S \
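
Note for reviewers: the x86_64.mk hunks above drop the upstream-openbsd C sources and route strlcpy/strlcat to the new SSE2 files, so the contract the assembly must preserve is the standard strlcpy/strlcat one. The sketch below is a minimal C reference of that contract (the return value is the total length of the string the call tried to create), not a translation of the assembly; ref_strlcpy and ref_strlcat are illustrative names that are not part of this patch.

/*
 * Reference semantics only: a minimal sketch of the strlcpy/strlcat
 * contract the new SSE2 routines are expected to match.
 */
#include <stddef.h>
#include <string.h>

size_t ref_strlcpy(char* dst, const char* src, size_t size) {
  size_t src_len = strlen(src);
  if (size != 0) {
    /* Copy at most size-1 bytes and always NUL-terminate. */
    size_t copy_len = (src_len < size - 1) ? src_len : size - 1;
    memcpy(dst, src, copy_len);
    dst[copy_len] = '\0';
  }
  /* Length of the string the call tried to create, i.e. strlen(src). */
  return src_len;
}

size_t ref_strlcat(char* dst, const char* src, size_t size) {
  /* dst may be unterminated within the first size bytes; cap the scan. */
  size_t dst_len = strnlen(dst, size);
  if (dst_len == size) {
    /* No room to append anything: report the length that was needed. */
    return size + strlen(src);
  }
  return dst_len + ref_strlcpy(dst + dst_len, src, size - dst_len);
}

In both cases a caller detects truncation by checking whether the return value is greater than or equal to the size argument; in the assembly that value is produced by the RETURN macro as %rax plus the %r9 running total.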