/* Optimized strcpy, strcat, strncpy, strncat, strlcpy, strlcat, memchr,
   memrchr, strchr, strrchr, index, strnlen, strlen, wcslen, wmemcmp,
   wcscmp, wcschr, wcsrchr, wcscpy, wcscat

   Change-Id: I82b29132edf9a2e144e0bb3ee4ff5217df8d2a6d
   Signed-off-by: Liubov Dmitrieva <liubov.dmitrieva@intel.com> */
/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef USE_AS_WCSCAT

# ifndef L
#  define L(label)	.L##label
# endif

# ifndef cfi_startproc
#  define cfi_startproc	.cfi_startproc
# endif

# ifndef cfi_endproc
#  define cfi_endproc	.cfi_endproc
# endif

# ifndef cfi_rel_offset
#  define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
# endif

# ifndef cfi_restore
#  define cfi_restore(reg)	.cfi_restore reg
# endif

# ifndef cfi_adjust_cfa_offset
#  define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
# endif

# ifndef ENTRY
#  define ENTRY(name)	\
	.type name, @function;	\
	.globl name;	\
	.p2align 4;	\
name:	\
	cfi_startproc
# endif

# ifndef END
#  define END(name)	\
	cfi_endproc;	\
	.size name, .-name
# endif

/* Push/pop a register while keeping the CFI unwind info in sync
   (each pushl moves the CFA by 4 bytes on i386).  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* i386 cdecl: return address at (%esp) on entry, arguments above it.  */
# define PARMS	4
/* Common exit for paths that pushed %edi: restore it and return; the
   trailing CFI_PUSH re-declares %edi saved for the code that follows.  */
# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)

# define STR1	PARMS
# define STR2	STR1+4
# define LEN	STR2+4

.text
/* wchar_t *wcscpy (wchar_t *dst, const wchar_t *src)

   SSE2/SSSE3 implementation for 32-bit x86; assumes 4-byte wchar_t
   (null terminator detected with pcmpeqd against zero).

   Register roles throughout:
     %edx  dst write cursor
     %ecx  src read cursor
     %edi  saved original dst (the return value) -- callee-saved, pushed
     %esi  running byte offset between cursors / scratch -- pushed
     %xmm0 all-zero / per-dword null-match mask
     %eax  pmovmskb null-position bitmask (bit set per matching byte).

   When this file is included with USE_AS_WCSCAT defined, the includer
   provides the macros and entry sequence; everything up to the #endif
   below is skipped.  */
ENTRY (wcscpy)
	mov	STR1(%esp), %edx
	mov	STR2(%esp), %ecx

	/* Fast path: strings of 1..4 wide chars are copied without
	   saving any callee-saved registers.  */
	cmp	$0, (%ecx)
	jz	L(ExitTail4)
	cmp	$0, 4(%ecx)
	jz	L(ExitTail8)
	cmp	$0, 8(%ecx)
	jz	L(ExitTail12)
	cmp	$0, 12(%ecx)
	jz	L(ExitTail16)

	PUSH	(%edi)
	mov	%edx, %edi		/* Preserve dst for the return value.  */
#endif
	PUSH	(%esi)
	lea	16(%ecx), %esi

	and	$-16, %esi		/* First 16-byte-aligned block of src.  */

	/* Scan that aligned block for a null wchar while the (possibly
	   unaligned) first 16 bytes are copied.  */
	pxor	%xmm0, %xmm0
	pcmpeqd	(%esi), %xmm0
	movdqu	(%ecx), %xmm1
	movdqu	%xmm1, (%edx)

	pmovmskb %xmm0, %eax
	sub	%ecx, %esi		/* %esi = bytes already handled.  */

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Advance both cursors so that dst becomes 16-byte aligned;
	   src then has the same misalignment mod 16 as before.  */
	mov	%edx, %eax
	lea	16(%edx), %edx
	and	$-16, %edx
	sub	%edx, %eax

	sub	%eax, %ecx
	mov	%ecx, %eax
	and	$0xf, %eax		/* Residual misalignment of src: 0/4/8/12.  */
	mov	$0, %esi

	/* Dispatch on src alignment relative to dst.  */
	jz	L(Align16Both)
	cmp	$4, %eax
	je	L(Shl4)
	cmp	$8, %eax
	je	L(Shl8)
	jmp	L(Shl12)

/* src and dst are mutually 16-byte aligned: copy 16 bytes at a time,
   always checking the NEXT block for null before storing it, so no
   byte past the terminator is written.  %esi counts bytes copied.  */
L(Align16Both):
	movaps	(%ecx), %xmm1
	movaps	16(%ecx), %xmm2
	movaps	%xmm1, (%edx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm4
	movaps	%xmm3, (%edx, %esi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm1
	movaps	%xmm4, (%edx, %esi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm2
	movaps	%xmm1, (%edx, %esi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm3, (%edx, %esi)
	/* Round src down to a 64-byte boundary and move dst by the same
	   amount so the main loop can run cache-line at a time.  */
	mov	%ecx, %eax
	lea	16(%ecx, %esi), %ecx
	and	$-0x40, %ecx
	sub	%ecx, %eax
	sub	%eax, %edx

	mov	$-0x40, %esi

/* Main aligned loop: load 64 bytes, fold the four blocks with pminub
   (a null dword survives the unsigned-byte min), test once, then store.
   Stores happen only after the whole 64 bytes are known null-free.  */
L(Aligned64Loop):
	movaps	(%ecx), %xmm2
	movaps	32(%ecx), %xmm3
	movaps	%xmm2, %xmm4
	movaps	16(%ecx), %xmm5
	movaps	%xmm3, %xmm6
	movaps	48(%ecx), %xmm7
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	lea	64(%edx), %edx
	pcmpeqd	%xmm0, %xmm3
	lea	64(%ecx), %ecx
	pmovmskb %xmm3, %eax

	test	%eax, %eax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%edx)
	movaps	%xmm5, -48(%edx)
	movaps	%xmm6, -32(%edx)
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

/* A null lies somewhere in the 64 bytes just loaded: re-test each
   16-byte block in order, storing the blocks that precede the null.
   %esi starts at -0x40 so that %edx/%ecx + %esi addresses the block
   that contains the terminator.  */
L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm4, -64(%edx)
	lea	16(%esi), %esi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm5, -48(%edx)
	lea	16(%esi), %esi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%edx)
	pcmpeqd	%xmm7, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	mov	$-0x40, %esi
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

/* src is 4 bytes ahead of dst's alignment: read aligned 16-byte blocks
   from src-4 and realign them into dst-sized stores with palignr $4.
   Each block is checked for null before it is shifted/stored.  */
	.p2align 4
L(Shl4):
	movaps	-4(%ecx), %xmm1
	movaps	12(%ecx), %xmm2
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	28(%ecx), %ecx
	lea	16(%edx), %edx

	/* Align src down to 64 bytes for the unrolled shift loop,
	   keeping the 4-byte skew (hence the -12 adjustment).  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-12(%ecx), %ecx
	sub	%eax, %edx

	movaps	-4(%ecx), %xmm1

L(Shl4LoopStart):
	movaps	12(%ecx), %xmm2
	movaps	28(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$4, %xmm4, %xmm5
	palignr	$4, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl4Start)

	palignr	$4, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl4LoopStart)

L(Shl4LoopExit):
	/* 12 tail bytes precede the aligned block holding the null.  */
	movlpd	(%ecx), %xmm0
	movl	8(%ecx), %esi
	movlpd	%xmm0, (%edx)
	movl	%esi, 8(%edx)
	POP	(%esi)
	add	$12, %edx
	add	$12, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	CFI_PUSH	(%esi)

/* src is 8 bytes ahead of dst's alignment: same scheme with palignr $8.  */
	.p2align 4
L(Shl8):
	movaps	-8(%ecx), %xmm1
	movaps	8(%ecx), %xmm2
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	24(%ecx), %ecx
	lea	16(%edx), %edx

	/* 64-byte align src for the unrolled loop, keeping the 8-byte skew.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-8(%ecx), %ecx
	sub	%eax, %edx

	movaps	-8(%ecx), %xmm1

L(Shl8LoopStart):
	movaps	8(%ecx), %xmm2
	movaps	24(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl8LoopStart)

L(Shl8LoopExit):
	/* 8 tail bytes precede the aligned block holding the null.  */
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	POP	(%esi)
	add	$8, %edx
	add	$8, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	CFI_PUSH	(%esi)

/* src is 12 bytes ahead of dst's alignment: same scheme with palignr $12.  */
	.p2align 4
L(Shl12):
	movaps	-12(%ecx), %xmm1
	movaps	4(%ecx), %xmm2
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	20(%ecx), %ecx
	lea	16(%edx), %edx

	/* 64-byte align src for the unrolled loop, keeping the 12-byte skew.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-4(%ecx), %ecx
	sub	%eax, %edx

	movaps	-12(%ecx), %xmm1

L(Shl12LoopStart):
	movaps	4(%ecx), %xmm2
	movaps	20(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl12Start)

	palignr	$12, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl12LoopStart)

L(Shl12LoopExit):
	/* 4 tail bytes precede the aligned block holding the null;
	   fall through into the common tail with %esi = 4.  */
	movl	(%ecx), %esi
	movl	%esi, (%edx)
	mov	$4, %esi

/* Common tail: %esi is the byte offset of the 16-byte block that holds
   the terminator; %al/%ah carry the pcmpeqd byte mask locating it.
   Copy exactly 4, 8, 12 or 16 final bytes (terminator included).  */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)
	test	%al, %al
	jz	L(ExitHigh)		/* Null is in the upper 8 bytes.  */
	test	$0x01, %al
	jnz	L(Exit4)
L(Exit8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit12)
L(Exit16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edi, %eax
	RETURN

CFI_POP	(%edi)

/* Short-string exits: nothing was pushed, %edx still equals dst.  */
	.p2align 4
L(ExitTail4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	ret

#ifndef USE_AS_WCSCAT
END (wcscpy)
#endif