1771 lines
40 KiB
ArmAsm
1771 lines
40 KiB
ArmAsm
|
/*
|
||
|
Copyright (c) 2010, Intel Corporation
|
||
|
All rights reserved.
|
||
|
|
||
|
Redistribution and use in source and binary forms, with or without
|
||
|
modification, are permitted provided that the following conditions are met:
|
||
|
|
||
|
* Redistributions of source code must retain the above copyright notice,
|
||
|
* this list of conditions and the following disclaimer.
|
||
|
|
||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||
|
* this list of conditions and the following disclaimer in the documentation
|
||
|
* and/or other materials provided with the distribution.
|
||
|
|
||
|
* Neither the name of Intel Corporation nor the names of its contributors
|
||
|
* may be used to endorse or promote products derived from this software
|
||
|
* without specific prior written permission.
|
||
|
|
||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
/* Name under which the routine is emitted.  A build may predefine MEMCPY
   to assemble the same code under another symbol.  */
#ifndef MEMCPY
# define MEMCPY		ssse3_memcpy5
#endif

/* L(name) pastes ".L" in front of NAME; every label in this file is
   written through it.  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n) expands to ".p2align n".  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif
|
||
|
|
||
|
/* Thin wrappers around the assembler's .cfi_* directives.  Each one is
   guarded by #ifndef, so a build may supply its own definition before this
   point.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore (reg)
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif
|
||
|
|
||
|
/* ENTRY(name): declare NAME as a global function symbol, align it to
   16 bytes (.p2align 4), place the label and open its CFI region.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name, @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name): close the CFI region opened by ENTRY and record the size of
   NAME as the distance from its label to this point.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
|
||
|
|
||
|
/* Stack offsets of the three arguments, relative to ESP once ENTRANCE has
   run.  PARMS is the offset of the first argument.  When built as bcopy
   the source comes first and the destination second; otherwise the
   destination comes first.  The length is always the third argument.  */
#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif
|
||
|
|
||
|
/* CFI_PUSH(REG): unwind annotation for a 4-byte push of REG.  The CFA
   offset grows by 4 and REG is recorded as saved at offset 0 from the
   current CFA base register.  */
#define CFI_PUSH(REG)				\
	cfi_adjust_cfa_offset (4);		\
	cfi_rel_offset (REG, 0)

/* CFI_POP(REG): unwind annotation for the matching pop.  The CFA offset
   shrinks by 4 and REG is marked as restored.  */
#define CFI_POP(REG)				\
	cfi_adjust_cfa_offset (-4);		\
	cfi_restore (REG)

/* PUSH/POP: the real instruction followed by its unwind annotation.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)
|
||
|
|
||
|
/* Prologue, epilogue and jump-table dispatch macros.  The SHARED variant is
   position independent: EBX is saved on entry, jump tables hold offsets
   relative to the table, and EBX carries the computed target.  The
   non-SHARED variant uses tables of absolute addresses and needs no
   register.  */
#ifdef SHARED
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* RETURN is used in the middle of the function: after the ret the unwind
   state is put back to "EBX pushed" for the code that follows.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register containing the
   index into the jump table.  SCALE is the scale of INDEX.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)			\
    /* We first load PC into EBX.  */					\
    call	__i686.get_pc_thunk.bx;					\
    /* Get the address of the jump table.  EBX holds the address of	\
       this very instruction, so adding (TABLE - .) yields TABLE.  */	\
    addl	$(TABLE - .), %ebx;					\
    /* Get the entry and convert the relative offset to the absolute	\
       address.  */							\
    addl	(%ebx,INDEX,SCALE), %ebx;				\
    /* We loaded the jump table.  Go.  */				\
    jmp		*%ebx

/* Convert EBX to the address of TABLE.  The expansion assumes EBX already
   equals the address of the instruction this macro expands to.  */
# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)				\
    addl	$(TABLE - .), %ebx

/* Finish a dispatch whose table address is already in EBX.  */
# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
    addl	(%ebx,INDEX,SCALE), %ebx;				\
    /* We loaded the jump table.  Go.  */				\
    jmp		*%ebx

/* PC thunk: copies its own return address into EBX.  */
	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
	.globl	__i686.get_pc_thunk.bx
	.hidden	__i686.get_pc_thunk.bx
	ALIGN (4)
	.type	__i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register containing the index into the
   jump table.  SCALE is the scale of INDEX.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)			\
    jmp		*TABLE(,INDEX,SCALE)

/* Nothing to precompute when the table holds absolute addresses.  */
# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)

# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
    jmp		*TABLE(,INDEX,SCALE)
#endif
|
||
|
|
||
|
/* Entry point.  The three arguments are read from the stack into
   ECX = length, EAX = source, EDX = destination.  Copies of 48 bytes or
   more go to L(48bytesormore); shorter ones are dispatched on the length
   through a jump table.  */
	.section	.text.ssse3,"ax",@progbits
ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

#ifdef USE_AS_MEMMOVE
	/* dest below src: copy forward.  dest == src: nothing to copy.  */
	cmp	%eax, %edx
	jb	L(copy_forward)
	je	L(fwd_write_0bytes)
	/* dest above src.  Lengths under 32 go to a handler that is defined
	   outside the part of the file shown here.  */
	cmp	$32, %ecx
	jae	L(memmove_bwd)
	jmp	L(bk_write_less32bytes_2)
L(memmove_bwd):
	/* Compare dest with src + len.  The reload of EAX is a MOV, so the
	   flags from the CMP are still intact at the JB.  */
	add	%ecx, %eax
	cmp	%eax, %edx
	movl	SRC(%esp), %eax
	jb	L(copy_backward)

L(copy_forward):
#endif
	cmp	$48, %ecx
	jae	L(48bytesormore)

	/* Fewer than 48 bytes from here on; the label name still says 32.  */
L(fwd_write_less32bytes):
#ifndef USE_AS_MEMMOVE
	/* Compares only the low bytes of the two pointers and, when the
	   source's is below the destination's, takes the backward table with
	   the pointers left at the start of the buffers.  */
	cmp	%dl, %al
	jb	L(bk_write)
#endif
	/* The forward handlers address the data with negative offsets, so
	   move both pointers to one past the end.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
#ifndef USE_AS_MEMMOVE
L(bk_write):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
#endif
|
||
|
|
||
|
/* ECX >= 48 here (the only visible branch to this label is the JAE after
   "cmp $48, %ecx").  Nothing is known about the alignment of EDX yet.
   This block keeps the first 16 source bytes in XMM0, rounds the
   destination up to the next 16-byte boundary and shortens the copy to
   match.  Afterwards:
     ESI  = original destination (target of the deferred XMM0 store)
     EDX  = (dest & -16) + 16
     EAX  = source advanced by the same 1..16 bytes
     ECX  = length reduced by those bytes
   The caller's EDI and ESI are left on the stack for the next stage.  */
	ALIGN (4)
L(48bytesormore):
	movdqu	(%eax), %xmm0
	PUSH (%edi)
	movl	%edx, %edi
	and	$-16, %edx
	PUSH (%esi)
	add	$16, %edx
	movl	%edi, %esi
	/* EDI = dest - aligned dest, a value in -16 .. -1.  */
	sub	%edx, %edi
	add	%edi, %ecx
	sub	%edi, %eax

	/* Compare the remaining length with half the shared cache size,
	   either a build-time constant or a variable read at run time.  */
#ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
#else
# ifdef SHARED
	/* The thunk overwrites EBX; ENTRANCE saved the caller's value.  */
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_shared_cache_size_half, %ecx
# endif
#endif

	/* MOV leaves the flags of the CMP above untouched for the JAE.  */
	mov	%eax, %edi
	jae	L(large_page)
	/* Dispatch on the low four bits of the adjusted source pointer.  */
	and	$0xf, %edi
	jz	L(shl_0)

	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
|
||
|
|
||
|
/* Source and destination are both 16-byte aligned after the adjustment in
   L(48bytesormore).  Store the saved head, then copy in 32-byte steps with
   aligned loads and stores.  More than 127 remaining bytes go to
   L(shl_0_gobble) instead.  EDI (the caller's value is still on the stack)
   is the running offset.  */
	ALIGN (4)
L(shl_0):
	movdqu	%xmm0, (%esi)
	xor	%edi, %edi
	POP (%esi)
	cmp	$127, %ecx
	ja	L(shl_0_gobble)
	/* Bias the count by 32 so the borrow of each SUB below marks the
	   last full 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Despite the name there is no backward branch: this is a run of
	   four unrolled steps.  In each, the JB tests the carry of the SUB;
	   the MOVDQA and LEA instructions in between do not change it.  */
L(shl_0_loop):
	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
L(shl_0_end):
	/* Undo the bias so ECX is the tail length, then move both pointers
	   to one past the end for the negative-offset tail handlers.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	add	%edi, %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* Aligned copy of more than 127 bytes.  Chooses between the loop below and
   L(shl_0_gobble_mem_loop) by comparing the length with half the data
   cache size, then copies 128 bytes per iteration and finishes with 64-,
   32- and 16-byte steps before the jump-table tail.

   Unwind fix: this label is reached by the JA in L(shl_0) while the
   caller's EDI is still on the stack, but the code assembled just before
   it has already executed POP (%edi).  Declare the save again so that the
   POP (%edi) below is balanced in the unwind information.  */
	CFI_PUSH (%edi)
L(shl_0_gobble):

#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# ifdef SHARED
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif

	/* POPL and LEA leave the flags of the CMP alone, so the JAE still
	   acts on the cache-size comparison.  ECX is biased by 128.  */
	POP (%edi)
	lea	-128(%ecx), %ecx
	jae	L(shl_0_gobble_mem_loop)
L(shl_0_gobble_cache_loop):
	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	/* Loop while the SUB above did not borrow.  */
	jae	L(shl_0_gobble_cache_loop)
	/* ECX is now (bytes left - 128).  Test for fewer than 64 bytes left,
	   then remove the bias with a flag-preserving LEA.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(shl_0_cache_less_64bytes)

	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx
L(shl_0_cache_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_cache_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx
L(shl_0_cache_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_cache_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx
L(shl_0_cache_less_16bytes):
	/* Fewer than 16 bytes left: point both registers one past the end
	   and let the jump table copy the tail.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
|
||
|
/* Variant of the 128-byte aligned copy loop taken when the length is at
   least half the data cache size.  It differs from the cache loop only in
   the three PREFETCHT0 hints issued at the top of every iteration.  On
   entry ECX is already biased by -128 and the caller's EDI has been
   popped.  */
	ALIGN (4)
L(shl_0_gobble_mem_loop):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x280(%eax)
	prefetcht0 0x1c0(%edx)

	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$0x80, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	/* Loop while the SUB above did not borrow.  */
	jae	L(shl_0_gobble_mem_loop)
	/* ECX is (bytes left - 128).  Test for fewer than 64 bytes left,
	   then remove the bias with a flag-preserving LEA.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(shl_0_mem_less_64bytes)

	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx
L(shl_0_mem_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_mem_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx
L(shl_0_mem_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_mem_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx
L(shl_0_mem_less_16bytes):
	/* Fewer than 16 bytes left: point both registers one past the end
	   and let the jump table copy the tail.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
|
||
|
/* L(shl_1): forward copy for a source that sits 1 byte past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_1):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 1 so the 16-byte loads are aligned.  */
	lea	-1(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_1_loop):
	/* First half: XMM1 holds the previous chunk; XMM4 keeps the newest
	   chunk for the second half.  Each PALIGNR shifts a pair of
	   neighbouring chunks right by 1 byte.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is still valid here: the SSE instructions
	   and LEA above do not write the flags.  */
	jb	L(shl_1_end)

	/* Second half: same step with XMM4 as the previous chunk and XMM1
	   receiving the newest one.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$1, %xmm2, %xmm3
	palignr	$1, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_1_loop)

L(shl_1_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +1 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	1(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_2): forward copy for a source that sits 2 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_2):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 2 so the 16-byte loads are aligned.  */
	lea	-2(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_2_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_2_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$2, %xmm2, %xmm3
	palignr	$2, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_2_loop)

L(shl_2_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +2 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	2(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_3): forward copy for a source that sits 3 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_3):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 3 so the 16-byte loads are aligned.  */
	lea	-3(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_3_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_3_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$3, %xmm2, %xmm3
	palignr	$3, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_3_loop)

L(shl_3_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +3 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	3(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_4): forward copy for a source that sits 4 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_4):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 4 so the 16-byte loads are aligned.  */
	lea	-4(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_4_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_4_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$4, %xmm2, %xmm3
	palignr	$4, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_4_loop)

L(shl_4_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +4 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	4(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_5): forward copy for a source that sits 5 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_5):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 5 so the 16-byte loads are aligned.  */
	lea	-5(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_5_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_5_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$5, %xmm2, %xmm3
	palignr	$5, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_5_loop)

L(shl_5_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +5 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	5(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
|
||
|
/* L(shl_6): forward copy for a source that sits 6 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_6):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 6 so the 16-byte loads are aligned.  */
	lea	-6(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_6_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_6_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_6_loop)

L(shl_6_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +6 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	6(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_7): forward copy for a source that sits 7 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_7):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 7 so the 16-byte loads are aligned.  */
	lea	-7(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_7_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_7_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_7_loop)

L(shl_7_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +7 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	7(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_8): forward copy for a source that sits 8 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_8):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 8 so the 16-byte loads are aligned.  */
	lea	-8(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_8_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_8_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_8_loop)

L(shl_8_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +8 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	8(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_9): forward copy for a source that sits 9 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_9):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 9 so the 16-byte loads are aligned.  */
	lea	-9(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_9_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_9_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_9_loop)

L(shl_9_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +9 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	9(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_10): forward copy for a source that sits 10 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_10):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 10 so the 16-byte loads are aligned.  */
	lea	-10(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_10_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_10_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_10_loop)

L(shl_10_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +10 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	10(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_11): forward copy for a source that sits 11 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_11):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 11 so the 16-byte loads are aligned.  */
	lea	-11(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_11_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_11_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_11_loop)

L(shl_11_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +11 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	11(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_12): forward copy for a source that sits 12 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_12):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 12 so the 16-byte loads are aligned.  */
	lea	-12(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_12_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_12_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_12_loop)

L(shl_12_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +12 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	12(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_13): forward copy for a source that sits 13 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_13):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 13 so the 16-byte loads are aligned.  */
	lea	-13(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_13_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_13_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_13_loop)

L(shl_13_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +13 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	13(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
/* L(shl_14): forward copy for a source that sits 14 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_14):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 14 so the 16-byte loads are aligned.  */
	lea	-14(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_14_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_14_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_14_loop)

L(shl_14_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +14 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	14(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
|
||
|
/* L(shl_15): forward copy for a source that sits 15 bytes past a 16-byte
   boundary once the destination has been aligned (presumably reached
   through L(shl_table), which is not shown in this part of the file).
   Register state as left by L(48bytesormore): EAX = source, EDX = aligned
   destination, ECX = bytes left, ESI = original destination, XMM0 = first
   16 source bytes, caller's EDI and ESI on the stack.

   Unwind fix: the code assembled just before this label ends after popping
   EDI and ESI, yet control arrives here with both still pushed.  Declare
   the two saves again so the POPs in this block are balanced in the unwind
   information.  */
	ALIGN (4)
	CFI_PUSH (%edi)
	CFI_PUSH (%esi)
L(shl_15):
	/* SHARED builds: EBX equals this label's address after the table
	   jump; turn it into the address of the tail table.  */
	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
	/* Back the source up by 15 so the 16-byte loads are aligned.  */
	lea	-15(%eax), %eax
	movaps	(%eax), %xmm1
	xor	%edi, %edi
	/* Bias the count so the borrow of SUB marks the last 32-byte step.  */
	lea	-32(%ecx), %ecx
	/* Store the saved head at the original, unaligned destination.  */
	movdqu	%xmm0, (%esi)
	POP (%esi)
L(shl_15_loop):
	/* First half: XMM1 is the previous chunk, XMM4 keeps the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	/* The carry of the SUB is untouched by the SSE ops and LEA.  */
	jb	L(shl_15_end)

	/* Second half: XMM4 is the previous chunk, XMM1 gets the newest.  */
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)

	jae	L(shl_15_loop)

L(shl_15_end):
	/* Remove the bias (ECX = tail length), then point EDX and EAX one
	   past the end; the +15 undoes the earlier back-up of EAX.  */
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	lea	15(%edi, %eax), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
|
||
|
/* Tail handlers for lengths that are a multiple of 4 (44 down to 0).
   EAX and EDX point one past the end of source and destination, so each
   entry copies one dword at a negative offset and falls through to the
   next shorter length.  ECX is used as scratch.  */
	ALIGN (4)
L(fwd_write_44bytes):
	movl	-44(%eax), %ecx
	movl	%ecx, -44(%edx)
L(fwd_write_40bytes):
	movl	-40(%eax), %ecx
	movl	%ecx, -40(%edx)
L(fwd_write_36bytes):
	movl	-36(%eax), %ecx
	movl	%ecx, -36(%edx)
L(fwd_write_32bytes):
	movl	-32(%eax), %ecx
	movl	%ecx, -32(%edx)
L(fwd_write_28bytes):
	movl	-28(%eax), %ecx
	movl	%ecx, -28(%edx)
L(fwd_write_24bytes):
	movl	-24(%eax), %ecx
	movl	%ecx, -24(%edx)
L(fwd_write_20bytes):
	movl	-20(%eax), %ecx
	movl	%ecx, -20(%edx)
L(fwd_write_16bytes):
	movl	-16(%eax), %ecx
	movl	%ecx, -16(%edx)
L(fwd_write_12bytes):
	movl	-12(%eax), %ecx
	movl	%ecx, -12(%edx)
L(fwd_write_8bytes):
	movl	-8(%eax), %ecx
	movl	%ecx, -8(%edx)
L(fwd_write_4bytes):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
L(fwd_write_0bytes):
	/* Return value: none for bcopy, the end of the destination for
	   mempcpy, otherwise the destination argument reloaded from the
	   stack.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
|
||
|
|
||
|
/* Stand-alone 5-byte tail: two overlapping dword copies, at offsets -5 and
   -4 from the end pointers.  Both loads are done before either store.
   EAX is reused as the second scratch register, so the source pointer is
   gone afterwards.  */
	ALIGN (4)
L(fwd_write_5bytes):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
	/* Return value: none for bcopy, the end of the destination for
	   mempcpy, otherwise the destination argument reloaded from the
	   stack.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
|
||
|
|
||
|
/* Tail handlers for lengths of the form 4k + 1 (45 down to 1).  EAX and
   EDX point one past the end of source and destination.  Each entry
   copies one dword at a negative offset and falls through; the 9-byte
   entry also copies the dword at -5, and the last byte is copied by the
   1-byte entry.  ECX is used as scratch.  */
	ALIGN (4)
L(fwd_write_45bytes):
	movl	-45(%eax), %ecx
	movl	%ecx, -45(%edx)
L(fwd_write_41bytes):
	movl	-41(%eax), %ecx
	movl	%ecx, -41(%edx)
L(fwd_write_37bytes):
	movl	-37(%eax), %ecx
	movl	%ecx, -37(%edx)
L(fwd_write_33bytes):
	movl	-33(%eax), %ecx
	movl	%ecx, -33(%edx)
L(fwd_write_29bytes):
	movl	-29(%eax), %ecx
	movl	%ecx, -29(%edx)
L(fwd_write_25bytes):
	movl	-25(%eax), %ecx
	movl	%ecx, -25(%edx)
L(fwd_write_21bytes):
	movl	-21(%eax), %ecx
	movl	%ecx, -21(%edx)
L(fwd_write_17bytes):
	movl	-17(%eax), %ecx
	movl	%ecx, -17(%edx)
L(fwd_write_13bytes):
	movl	-13(%eax), %ecx
	movl	%ecx, -13(%edx)
L(fwd_write_9bytes):
	movl	-9(%eax), %ecx
	movl	%ecx, -9(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
L(fwd_write_1bytes):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
	/* Return value: none for bcopy, the end of the destination for
	   mempcpy, otherwise the destination argument reloaded from the
	   stack.  */
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
|
||
|
|
||
|
/* Forward tail chain for lengths N with N % 4 == 2 (46, 42, ..., 6, 2).
   Each L(fwd_write_<N>bytes) label copies the dword at offset -N from EAX
   to offset -N from EDX and falls through to the label for N - 4.
   The chain ends in L(fwd_write_2bytes), which copies the final 16-bit
   word at -2.  ECX is the only scratch register used.  */
ALIGN (4)
|
||
|
L(fwd_write_46bytes):
|
||
|
movl -46(%eax), %ecx
|
||
|
movl %ecx, -46(%edx)
|
||
|
L(fwd_write_42bytes):
|
||
|
movl -42(%eax), %ecx
|
||
|
movl %ecx, -42(%edx)
|
||
|
L(fwd_write_38bytes):
|
||
|
movl -38(%eax), %ecx
|
||
|
movl %ecx, -38(%edx)
|
||
|
L(fwd_write_34bytes):
|
||
|
movl -34(%eax), %ecx
|
||
|
movl %ecx, -34(%edx)
|
||
|
L(fwd_write_30bytes):
|
||
|
movl -30(%eax), %ecx
|
||
|
movl %ecx, -30(%edx)
|
||
|
L(fwd_write_26bytes):
|
||
|
movl -26(%eax), %ecx
|
||
|
movl %ecx, -26(%edx)
|
||
|
L(fwd_write_22bytes):
|
||
|
movl -22(%eax), %ecx
|
||
|
movl %ecx, -22(%edx)
|
||
|
L(fwd_write_18bytes):
|
||
|
movl -18(%eax), %ecx
|
||
|
movl %ecx, -18(%edx)
|
||
|
L(fwd_write_14bytes):
|
||
|
movl -14(%eax), %ecx
|
||
|
movl %ecx, -14(%edx)
|
||
|
L(fwd_write_10bytes):
|
||
|
movl -10(%eax), %ecx
|
||
|
movl %ecx, -10(%edx)
|
||
|
L(fwd_write_6bytes):
|
||
|
movl -6(%eax), %ecx
|
||
|
movl %ecx, -6(%edx)
|
||
|
L(fwd_write_2bytes):
|
||
|
/* Last two bytes: zero-extending 16-bit load, 16-bit store.  */
movzwl -2(%eax), %ecx
|
||
|
movw %cx, -2(%edx)
|
||
|
/* Return value (skipped when built as bcopy): EDX, which points just past
   the bytes written, for mempcpy; otherwise the value at DEST(%esp).
   NOTE(review): DEST and RETURN are macros defined outside this chunk.  */
#ifndef USE_AS_BCOPY
|
||
|
# ifdef USE_AS_MEMPCPY
|
||
|
movl %edx, %eax
|
||
|
# else
|
||
|
movl DEST(%esp), %eax
|
||
|
# endif
|
||
|
#endif
|
||
|
RETURN
|
||
|
|
||
|
/* Forward tail chain for lengths N with N % 4 == 3 (47, 43, ..., 7, 3).
   Each L(fwd_write_<N>bytes) label copies the dword at offset -N from EAX
   to offset -N from EDX and falls through to the label for N - 4.
   The chain ends in L(fwd_write_3bytes), which copies a 16-bit word at -3
   and a byte at -1.  ECX is the scratch register; the final step also
   overwrites EAX.  */
ALIGN (4)
|
||
|
L(fwd_write_47bytes):
|
||
|
movl -47(%eax), %ecx
|
||
|
movl %ecx, -47(%edx)
|
||
|
L(fwd_write_43bytes):
|
||
|
movl -43(%eax), %ecx
|
||
|
movl %ecx, -43(%edx)
|
||
|
L(fwd_write_39bytes):
|
||
|
movl -39(%eax), %ecx
|
||
|
movl %ecx, -39(%edx)
|
||
|
L(fwd_write_35bytes):
|
||
|
movl -35(%eax), %ecx
|
||
|
movl %ecx, -35(%edx)
|
||
|
L(fwd_write_31bytes):
|
||
|
movl -31(%eax), %ecx
|
||
|
movl %ecx, -31(%edx)
|
||
|
L(fwd_write_27bytes):
|
||
|
movl -27(%eax), %ecx
|
||
|
movl %ecx, -27(%edx)
|
||
|
L(fwd_write_23bytes):
|
||
|
movl -23(%eax), %ecx
|
||
|
movl %ecx, -23(%edx)
|
||
|
L(fwd_write_19bytes):
|
||
|
movl -19(%eax), %ecx
|
||
|
movl %ecx, -19(%edx)
|
||
|
L(fwd_write_15bytes):
|
||
|
movl -15(%eax), %ecx
|
||
|
movl %ecx, -15(%edx)
|
||
|
L(fwd_write_11bytes):
|
||
|
movl -11(%eax), %ecx
|
||
|
movl %ecx, -11(%edx)
|
||
|
L(fwd_write_7bytes):
|
||
|
movl -7(%eax), %ecx
|
||
|
movl %ecx, -7(%edx)
|
||
|
L(fwd_write_3bytes):
|
||
|
/* Last three bytes: both loads are issued before either store, and EAX is
   reused as the second scratch register (the source pointer is no longer
   needed after this point).  */
movzwl -3(%eax), %ecx
|
||
|
movzbl -1(%eax), %eax
|
||
|
movw %cx, -3(%edx)
|
||
|
movb %al, -1(%edx)
|
||
|
/* Return value (skipped when built as bcopy): EDX, which points just past
   the bytes written, for mempcpy; otherwise the value at DEST(%esp).
   NOTE(review): DEST and RETURN are macros defined outside this chunk.  */
#ifndef USE_AS_BCOPY
|
||
|
# ifdef USE_AS_MEMPCPY
|
||
|
movl %edx, %eax
|
||
|
# else
|
||
|
movl DEST(%esp), %eax
|
||
|
# endif
|
||
|
#endif
|
||
|
RETURN
|
||
|
|
||
|
/* Bulk forward copy using non-temporal stores (movntdq).
   Register use visible in this block:
     EAX = read pointer, EDX = write pointer, ECX = byte counter,
     XMM0 / ESI = a pending 16-byte value and the address it is stored to.
   NOTE(review): XMM0, ESI and the pushed ESI/EDI values are set up by code
   outside this chunk; XMM0 presumably holds the first 16 source bytes and
   ESI the original destination -- confirm against the caller.
   NOTE(review): movntdq faults on a destination that is not 16-byte
   aligned, so EDX is presumably already aligned when control arrives here.
   Flow: copy 16 bytes, then 128 bytes per loop iteration, then optional
   64- and 32-byte steps, then dispatch the last < 32 bytes through
   L(table_48bytes_fwd) after an sfence.  */
ALIGN (4)
|
||
|
L(large_page):
|
||
|
movdqu (%eax), %xmm1
|
||
|
lea 16(%eax), %eax
|
||
|
movdqu %xmm0, (%esi)
|
||
|
movntdq %xmm1, (%edx)
|
||
|
lea 16(%edx), %edx
|
||
|
POP (%esi)
|
||
|
/* 0x90 = 0x10 + 0x80: presumably the 16 bytes copied above plus a bias of
   one loop iteration, so that the `sub $0x80` below borrows exactly when
   fewer than 0x80 bytes are left after the current iteration -- confirm
   how the caller accounts ECX.  */
lea -0x90(%ecx), %ecx
|
||
|
POP (%edi)
|
||
|
L(large_page_loop):
|
||
|
/* Load 128 bytes with unaligned loads into XMM0-XMM7.  The body runs at
   least once, so at least 0x80 readable bytes are assumed here.  */
movdqu (%eax), %xmm0
|
||
|
movdqu 0x10(%eax), %xmm1
|
||
|
movdqu 0x20(%eax), %xmm2
|
||
|
movdqu 0x30(%eax), %xmm3
|
||
|
movdqu 0x40(%eax), %xmm4
|
||
|
movdqu 0x50(%eax), %xmm5
|
||
|
movdqu 0x60(%eax), %xmm6
|
||
|
movdqu 0x70(%eax), %xmm7
|
||
|
lea 0x80(%eax), %eax
|
||
|
|
||
|
/* This sub sets the carry flag tested by the jae at the loop bottom; the
   movntdq and lea instructions in between do not modify flags.  */
sub $0x80, %ecx
|
||
|
movntdq %xmm0, (%edx)
|
||
|
movntdq %xmm1, 0x10(%edx)
|
||
|
movntdq %xmm2, 0x20(%edx)
|
||
|
movntdq %xmm3, 0x30(%edx)
|
||
|
movntdq %xmm4, 0x40(%edx)
|
||
|
movntdq %xmm5, 0x50(%edx)
|
||
|
movntdq %xmm6, 0x60(%edx)
|
||
|
movntdq %xmm7, 0x70(%edx)
|
||
|
lea 0x80(%edx), %edx
|
||
|
jae L(large_page_loop)
|
||
|
/* ECX is now negative (biased by -0x80).  Compare first, then remove the
   bias with lea (which leaves the flags intact): the signed jl is taken
   when fewer than 0x40 bytes remain.  */
cmp $-0x40, %ecx
|
||
|
lea 0x80(%ecx), %ecx
|
||
|
jl L(large_page_less_64bytes)
|
||
|
|
||
|
/* At least 64 bytes remain: copy one 64-byte group.  */
movdqu (%eax), %xmm0
|
||
|
movdqu 0x10(%eax), %xmm1
|
||
|
movdqu 0x20(%eax), %xmm2
|
||
|
movdqu 0x30(%eax), %xmm3
|
||
|
lea 0x40(%eax), %eax
|
||
|
|
||
|
movntdq %xmm0, (%edx)
|
||
|
movntdq %xmm1, 0x10(%edx)
|
||
|
movntdq %xmm2, 0x20(%edx)
|
||
|
movntdq %xmm3, 0x30(%edx)
|
||
|
lea 0x40(%edx), %edx
|
||
|
sub $0x40, %ecx
|
||
|
L(large_page_less_64bytes):
|
||
|
/* Fewer than 64 bytes remain; copy one 32-byte group if possible.  */
cmp $32, %ecx
|
||
|
jb L(large_page_less_32bytes)
|
||
|
movdqu (%eax), %xmm0
|
||
|
movdqu 0x10(%eax), %xmm1
|
||
|
lea 0x20(%eax), %eax
|
||
|
movntdq %xmm0, (%edx)
|
||
|
movntdq %xmm1, 0x10(%edx)
|
||
|
lea 0x20(%edx), %edx
|
||
|
sub $0x20, %ecx
|
||
|
L(large_page_less_32bytes):
|
||
|
/* Advance both pointers past the remaining ECX (< 32) bytes: the
   fwd_write_* tails address their data with negative offsets from the
   end pointers.  */
add %ecx, %edx
|
||
|
add %ecx, %eax
|
||
|
/* Store fence after the non-temporal stores, before the ordinary stores
   done by the tail code.  */
sfence
|
||
|
/* NOTE(review): BRANCH_TO_JMPTBL_ENTRY is defined outside this chunk;
   presumably an indirect jump to entry ECX (scale 4) of the table.  */
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
|
||
|
|
||
|
|
||
|
/* Backward tail chain for lengths N that are multiples of 4 (44, 40, ..., 4, 0).
   Here EAX and EDX point at the START of the remaining source and
   destination bytes.  Each L(bk_write_<N>bytes) label copies the dword at
   offset N - 4 and falls through to the label for N - 4, so the data is
   copied from the highest address downwards.  ECX is the scratch register.  */
ALIGN (4)
|
||
|
L(bk_write_44bytes):
|
||
|
movl 40(%eax), %ecx
|
||
|
movl %ecx, 40(%edx)
|
||
|
L(bk_write_40bytes):
|
||
|
movl 36(%eax), %ecx
|
||
|
movl %ecx, 36(%edx)
|
||
|
L(bk_write_36bytes):
|
||
|
movl 32(%eax), %ecx
|
||
|
movl %ecx, 32(%edx)
|
||
|
L(bk_write_32bytes):
|
||
|
movl 28(%eax), %ecx
|
||
|
movl %ecx, 28(%edx)
|
||
|
L(bk_write_28bytes):
|
||
|
movl 24(%eax), %ecx
|
||
|
movl %ecx, 24(%edx)
|
||
|
L(bk_write_24bytes):
|
||
|
movl 20(%eax), %ecx
|
||
|
movl %ecx, 20(%edx)
|
||
|
L(bk_write_20bytes):
|
||
|
movl 16(%eax), %ecx
|
||
|
movl %ecx, 16(%edx)
|
||
|
L(bk_write_16bytes):
|
||
|
movl 12(%eax), %ecx
|
||
|
movl %ecx, 12(%edx)
|
||
|
L(bk_write_12bytes):
|
||
|
movl 8(%eax), %ecx
|
||
|
movl %ecx, 8(%edx)
|
||
|
L(bk_write_8bytes):
|
||
|
movl 4(%eax), %ecx
|
||
|
movl %ecx, 4(%edx)
|
||
|
L(bk_write_4bytes):
|
||
|
movl (%eax), %ecx
|
||
|
movl %ecx, (%edx)
|
||
|
L(bk_write_0bytes):
|
||
|
/* Return value (skipped when built as bcopy): the value at DEST(%esp),
   plus the value at LEN(%esp) when built as mempcpy.
   NOTE(review): DEST, LEN and RETURN are macros defined outside this
   chunk; presumably the caller's destination and length arguments.  */
#ifndef USE_AS_BCOPY
|
||
|
movl DEST(%esp), %eax
|
||
|
# ifdef USE_AS_MEMPCPY
|
||
|
movl LEN(%esp), %ecx
|
||
|
add %ecx, %eax
|
||
|
# endif
|
||
|
#endif
|
||
|
RETURN
|
||
|
|
||
|
/* Backward tail chain for lengths N with N % 4 == 1 (45, 41, ..., 5, 1).
   EAX and EDX point at the START of the remaining source and destination
   bytes.  Each L(bk_write_<N>bytes) label copies the dword at offset N - 4
   and falls through to the label for N - 4; L(bk_write_1bytes) copies the
   final byte at offset 0.  ECX is the scratch register.  */
ALIGN (4)
|
||
|
L(bk_write_45bytes):
|
||
|
movl 41(%eax), %ecx
|
||
|
movl %ecx, 41(%edx)
|
||
|
L(bk_write_41bytes):
|
||
|
movl 37(%eax), %ecx
|
||
|
movl %ecx, 37(%edx)
|
||
|
L(bk_write_37bytes):
|
||
|
movl 33(%eax), %ecx
|
||
|
movl %ecx, 33(%edx)
|
||
|
L(bk_write_33bytes):
|
||
|
movl 29(%eax), %ecx
|
||
|
movl %ecx, 29(%edx)
|
||
|
L(bk_write_29bytes):
|
||
|
movl 25(%eax), %ecx
|
||
|
movl %ecx, 25(%edx)
|
||
|
L(bk_write_25bytes):
|
||
|
movl 21(%eax), %ecx
|
||
|
movl %ecx, 21(%edx)
|
||
|
L(bk_write_21bytes):
|
||
|
movl 17(%eax), %ecx
|
||
|
movl %ecx, 17(%edx)
|
||
|
L(bk_write_17bytes):
|
||
|
movl 13(%eax), %ecx
|
||
|
movl %ecx, 13(%edx)
|
||
|
L(bk_write_13bytes):
|
||
|
movl 9(%eax), %ecx
|
||
|
movl %ecx, 9(%edx)
|
||
|
L(bk_write_9bytes):
|
||
|
movl 5(%eax), %ecx
|
||
|
movl %ecx, 5(%edx)
|
||
|
L(bk_write_5bytes):
|
||
|
movl 1(%eax), %ecx
|
||
|
movl %ecx, 1(%edx)
|
||
|
L(bk_write_1bytes):
|
||
|
/* First (lowest-address) byte: zero-extending load, byte-sized store.  */
movzbl (%eax), %ecx
|
||
|
movb %cl, (%edx)
|
||
|
/* Return value (skipped when built as bcopy): the value at DEST(%esp),
   plus the value at LEN(%esp) when built as mempcpy.
   NOTE(review): DEST, LEN and RETURN are macros defined outside this chunk.  */
#ifndef USE_AS_BCOPY
|
||
|
movl DEST(%esp), %eax
|
||
|
# ifdef USE_AS_MEMPCPY
|
||
|
movl LEN(%esp), %ecx
|
||
|
add %ecx, %eax
|
||
|
# endif
|
||
|
#endif
|
||
|
RETURN
|
||
|
|
||
|
/* Backward tail chain for lengths N with N % 4 == 2 (46, 42, ..., 6, 2).
   EAX and EDX point at the START of the remaining source and destination
   bytes.  Each L(bk_write_<N>bytes) label copies the dword at offset N - 4
   and falls through to the label for N - 4; L(bk_write_2bytes) copies the
   final 16-bit word at offset 0.  ECX is the scratch register.  */
ALIGN (4)
|
||
|
L(bk_write_46bytes):
|
||
|
movl 42(%eax), %ecx
|
||
|
movl %ecx, 42(%edx)
|
||
|
L(bk_write_42bytes):
|
||
|
movl 38(%eax), %ecx
|
||
|
movl %ecx, 38(%edx)
|
||
|
L(bk_write_38bytes):
|
||
|
movl 34(%eax), %ecx
|
||
|
movl %ecx, 34(%edx)
|
||
|
L(bk_write_34bytes):
|
||
|
movl 30(%eax), %ecx
|
||
|
movl %ecx, 30(%edx)
|
||
|
L(bk_write_30bytes):
|
||
|
movl 26(%eax), %ecx
|
||
|
movl %ecx, 26(%edx)
|
||
|
L(bk_write_26bytes):
|
||
|
movl 22(%eax), %ecx
|
||
|
movl %ecx, 22(%edx)
|
||
|
L(bk_write_22bytes):
|
||
|
movl 18(%eax), %ecx
|
||
|
movl %ecx, 18(%edx)
|
||
|
L(bk_write_18bytes):
|
||
|
movl 14(%eax), %ecx
|
||
|
movl %ecx, 14(%edx)
|
||
|
L(bk_write_14bytes):
|
||
|
movl 10(%eax), %ecx
|
||
|
movl %ecx, 10(%edx)
|
||
|
L(bk_write_10bytes):
|
||
|
movl 6(%eax), %ecx
|
||
|
movl %ecx, 6(%edx)
|
||
|
L(bk_write_6bytes):
|
||
|
movl 2(%eax), %ecx
|
||
|
movl %ecx, 2(%edx)
|
||
|
L(bk_write_2bytes):
|
||
|
/* First (lowest-address) two bytes: zero-extending 16-bit load, 16-bit store.  */
movzwl (%eax), %ecx
|
||
|
movw %cx, (%edx)
|
||
|
/* Return value (skipped when built as bcopy): the value at DEST(%esp),
   plus the value at LEN(%esp) when built as mempcpy.
   NOTE(review): DEST, LEN and RETURN are macros defined outside this chunk.  */
#ifndef USE_AS_BCOPY
|
||
|
movl DEST(%esp), %eax
|
||
|
# ifdef USE_AS_MEMPCPY
|
||
|
movl LEN(%esp), %ecx
|
||
|
add %ecx, %eax
|
||
|
# endif
|
||
|
#endif
|
||
|
RETURN
|
||
|
|
||
|
/* Backward tail chain for lengths N with N % 4 == 3 (47, 43, ..., 7, 3).
   EAX and EDX point at the START of the remaining source and destination
   bytes.  Each L(bk_write_<N>bytes) label copies the dword at offset N - 4
   and falls through to the label for N - 4; L(bk_write_3bytes) copies a
   16-bit word at offset 1 and then the byte at offset 0.  ECX is the
   scratch register; the final step also overwrites EAX.  */
ALIGN (4)
|
||
|
L(bk_write_47bytes):
|
||
|
movl 43(%eax), %ecx
|
||
|
movl %ecx, 43(%edx)
|
||
|
L(bk_write_43bytes):
|
||
|
movl 39(%eax), %ecx
|
||
|
movl %ecx, 39(%edx)
|
||
|
L(bk_write_39bytes):
|
||
|
movl 35(%eax), %ecx
|
||
|
movl %ecx, 35(%edx)
|
||
|
L(bk_write_35bytes):
|
||
|
movl 31(%eax), %ecx
|
||
|
movl %ecx, 31(%edx)
|
||
|
L(bk_write_31bytes):
|
||
|
movl 27(%eax), %ecx
|
||
|
movl %ecx, 27(%edx)
|
||
|
L(bk_write_27bytes):
|
||
|
movl 23(%eax), %ecx
|
||
|
movl %ecx, 23(%edx)
|
||
|
L(bk_write_23bytes):
|
||
|
movl 19(%eax), %ecx
|
||
|
movl %ecx, 19(%edx)
|
||
|
L(bk_write_19bytes):
|
||
|
movl 15(%eax), %ecx
|
||
|
movl %ecx, 15(%edx)
|
||
|
L(bk_write_15bytes):
|
||
|
movl 11(%eax), %ecx
|
||
|
movl %ecx, 11(%edx)
|
||
|
L(bk_write_11bytes):
|
||
|
movl 7(%eax), %ecx
|
||
|
movl %ecx, 7(%edx)
|
||
|
L(bk_write_7bytes):
|
||
|
movl 3(%eax), %ecx
|
||
|
movl %ecx, 3(%edx)
|
||
|
L(bk_write_3bytes):
|
||
|
/* Bytes 1-2 first (higher address), then byte 0.  The word is stored
   before byte 0 is loaded, keeping the high-to-low order of the chain.
   EAX is overwritten by the last load.  */
movzwl 1(%eax), %ecx
|
||
|
movw %cx, 1(%edx)
|
||
|
movzbl (%eax), %eax
|
||
|
movb %al, (%edx)
|
||
|
/* Return value (skipped when built as bcopy): the value at DEST(%esp),
   plus the value at LEN(%esp) when built as mempcpy.
   NOTE(review): DEST, LEN and RETURN_END are macros defined outside this
   chunk; RETURN_END is presumably the final-return variant of RETURN.  */
#ifndef USE_AS_BCOPY
|
||
|
movl DEST(%esp), %eax
|
||
|
# ifdef USE_AS_MEMPCPY
|
||
|
movl LEN(%esp), %ecx
|
||
|
add %ecx, %eax
|
||
|
# endif
|
||
|
#endif
|
||
|
RETURN_END
|
||
|
|
||
|
|
||
|
.pushsection .rodata.ssse3,"a",@progbits
|
||
|
/* Jump table for the forward tail copies: 48 32-bit entries, entry i
   referring to L(fwd_write_<i>bytes) for i = 0..47.  It is indexed by the
   number of bytes still to copy (see the BRANCH_TO_JMPTBL_ENTRY use with
   ECX and scale 4).  With the default ALIGN macro, ALIGN (2) expands to
   `.p2align 2` (4-byte alignment).
   NOTE(review): JMPTBL is defined outside this chunk; presumably it encodes
   the target either absolutely or relative to the table base -- confirm.  */
ALIGN (2)
|
||
|
L(table_48bytes_fwd):
|
||
|
.int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
|
||
|
.int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
|
||
|
|
||
|
/* Jump table with 16 32-bit entries, entry i referring to L(shl_<i>) for
   i = 0..15.  The L(shl_*) routines and the dispatch site are outside this
   chunk.
   NOTE(review): the index is presumably the source misalignment relative
   to a 16-byte boundary -- confirm at the dispatch site.  JMPTBL is also
   defined outside this chunk.  */
ALIGN (2)
|
||
|
L(shl_table):
|
||
|
.int JMPTBL (L(shl_0), L(shl_table))
|
||
|
.int JMPTBL (L(shl_1), L(shl_table))
|
||
|
.int JMPTBL (L(shl_2), L(shl_table))
|
||
|
.int JMPTBL (L(shl_3), L(shl_table))
|
||
|
.int JMPTBL (L(shl_4), L(shl_table))
|
||
|
.int JMPTBL (L(shl_5), L(shl_table))
|
||
|
.int JMPTBL (L(shl_6), L(shl_table))
|
||
|
.int JMPTBL (L(shl_7), L(shl_table))
|
||
|
.int JMPTBL (L(shl_8), L(shl_table))
|
||
|
.int JMPTBL (L(shl_9), L(shl_table))
|
||
|
.int JMPTBL (L(shl_10), L(shl_table))
|
||
|
.int JMPTBL (L(shl_11), L(shl_table))
|
||
|
.int JMPTBL (L(shl_12), L(shl_table))
|
||
|
.int JMPTBL (L(shl_13), L(shl_table))
|
||
|
.int JMPTBL (L(shl_14), L(shl_table))
|
||
|
.int JMPTBL (L(shl_15), L(shl_table))
|
||
|
|
||
|
/* Jump table for the backward tail copies: 48 32-bit entries, entry i
   referring to L(bk_write_<i>bytes) for i = 0..47.  It is indexed by the
   number of bytes still to copy (see the BRANCH_TO_JMPTBL_ENTRY use with
   ECX and scale 4 in the backward-copy path).
   NOTE(review): JMPTBL is defined outside this chunk; presumably it encodes
   the target either absolutely or relative to the table base -- confirm.  */
ALIGN (2)
|
||
|
L(table_48_bytes_bwd):
|
||
|
.int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
|
||
|
.int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
|
||
|
|
||
|
.popsection
|
||
|
|
||
|
#ifdef USE_AS_MEMMOVE
|
||
|
/* Backward (high-to-low address) copy entry, built only for memmove.
   On entry EAX is the read pointer, EDX the write pointer and ECX the
   byte count, as shown by how they are used below.  ESI is saved with
   PUSH and takes over as read pointer so EAX can serve as scratch.  Both
   pointers are moved to the END of their buffers and the data is copied
   downwards:
     - L(bk_align) first brings the destination end to a 4-byte boundary,
     - counts >= 64 go through L(bk_write_more64bytes),
     - one 32-byte step is done here when 32 <= ECX < 64,
     - the last < 32 bytes are dispatched through L(table_48_bytes_bwd).
   NOTE(review): PUSH/POP are macros defined outside this chunk
   (presumably push/pop plus CFI bookkeeping).  */
ALIGN (4)
|
||
|
L(copy_backward):
|
||
|
PUSH (%esi)
|
||
|
movl %eax, %esi
|
||
|
/* EDX = destination end, ESI = source end (one past the last byte).  */
lea (%ecx,%edx,1),%edx
|
||
|
lea (%ecx,%esi,1),%esi
|
||
|
testl $0x3, %edx
|
||
|
jnz L(bk_align)
|
||
|
|
||
|
L(bk_aligned_4):
|
||
|
cmp $64, %ecx
|
||
|
jae L(bk_write_more64bytes)
|
||
|
|
||
|
L(bk_write_64bytesless):
|
||
|
cmp $32, %ecx
|
||
|
jb L(bk_write_less32bytes)
|
||
|
|
||
|
L(bk_write_more32bytes):
|
||
|
/* Copy 32 bytes at a time. */
|
||
|
/* Eight dword moves from the top down, through EAX.  This path is only
   reached with ECX < 64, so fewer than 32 bytes are left afterwards.  */
sub $32, %ecx
|
||
|
movl -4(%esi), %eax
|
||
|
movl %eax, -4(%edx)
|
||
|
movl -8(%esi), %eax
|
||
|
movl %eax, -8(%edx)
|
||
|
movl -12(%esi), %eax
|
||
|
movl %eax, -12(%edx)
|
||
|
movl -16(%esi), %eax
|
||
|
movl %eax, -16(%edx)
|
||
|
movl -20(%esi), %eax
|
||
|
movl %eax, -20(%edx)
|
||
|
movl -24(%esi), %eax
|
||
|
movl %eax, -24(%edx)
|
||
|
movl -28(%esi), %eax
|
||
|
movl %eax, -28(%edx)
|
||
|
movl -32(%esi), %eax
|
||
|
movl %eax, -32(%edx)
|
||
|
sub $32, %edx
|
||
|
sub $32, %esi
|
||
|
|
||
|
L(bk_write_less32bytes):
|
||
|
/* Rewind both pointers by the remaining count so EAX/EDX point at the
   START of the bytes still to copy, which is what the bk_write_* tails
   expect (they use non-negative offsets).  ESI is restored here.  */
movl %esi, %eax
|
||
|
sub %ecx, %edx
|
||
|
sub %ecx, %eax
|
||
|
POP (%esi)
|
||
|
L(bk_write_less32bytes_2):
|
||
|
/* NOTE(review): BRANCH_TO_JMPTBL_ENTRY is defined outside this chunk;
   presumably an indirect jump to entry ECX (scale 4) of the table.  */
BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
|
||
|
|
||
|
/* Bring the destination end pointer (EDX) down to a 4-byte boundary for
   the backward copy.  ESI is the source end pointer, ECX the byte count,
   EAX scratch.  Counts of 8 or less skip alignment and go straight to the
   tail dispatch.  Otherwise one byte is copied if EDX is odd, then two
   bytes if bit 1 of EDX is set, and control returns to L(bk_aligned_4).  */
ALIGN (4)
|
||
|
L(bk_align):
|
||
|
cmp $8, %ecx
|
||
|
jbe L(bk_write_less32bytes)
|
||
|
testl $1, %edx
|
||
|
/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
|
||
|
then (EDX & 2) must be != 0. */
|
||
|
jz L(bk_got2)
|
||
|
/* EDX is odd: step back one byte and copy it.  */
sub $1, %esi
|
||
|
sub $1, %ecx
|
||
|
sub $1, %edx
|
||
|
movzbl (%esi), %eax
|
||
|
movb %al, (%edx)
|
||
|
|
||
|
testl $2, %edx
|
||
|
jz L(bk_aligned_4)
|
||
|
|
||
|
L(bk_got2):
|
||
|
/* EDX is even but not a multiple of 4: step back two bytes and copy them.  */
sub $2, %esi
|
||
|
sub $2, %ecx
|
||
|
sub $2, %edx
|
||
|
movzwl (%esi), %eax
|
||
|
movw %ax, (%edx)
|
||
|
jmp L(bk_aligned_4)
|
||
|
|
||
|
/* Backward copy of 64 bytes or more.  EDX (destination end) is 4-byte
   aligned on every path that reaches this label; ESI is the source end,
   ECX the byte count, EAX scratch.
   Step 1: copy up to three dwords downwards until EDX is 16-byte aligned.
   Step 2: copy 64 bytes per iteration with unaligned loads (movdqu) and
           aligned stores (movdqa), highest 16 bytes first.
   Step 3: hand the remaining < 64 bytes to L(bk_write_64bytesless) or
           L(bk_write_more32bytes).  */
ALIGN (4)
|
||
|
L(bk_write_more64bytes):
|
||
|
/* Check alignment of last byte. */
|
||
|
testl $15, %edx
|
||
|
jz L(bk_ssse3_cpy_pre)
|
||
|
|
||
|
/* EDX is aligned 4 bytes, but not 16 bytes. */
|
||
|
L(bk_ssse3_align):
|
||
|
/* First dword step.  */
sub $4, %esi
|
||
|
sub $4, %ecx
|
||
|
sub $4, %edx
|
||
|
movl (%esi), %eax
|
||
|
movl %eax, (%edx)
|
||
|
|
||
|
testl $15, %edx
|
||
|
jz L(bk_ssse3_cpy_pre)
|
||
|
|
||
|
/* Second dword step.  */
sub $4, %esi
|
||
|
sub $4, %ecx
|
||
|
sub $4, %edx
|
||
|
movl (%esi), %eax
|
||
|
movl %eax, (%edx)
|
||
|
|
||
|
testl $15, %edx
|
||
|
jz L(bk_ssse3_cpy_pre)
|
||
|
|
||
|
/* Third dword step.  No test follows: a 4-byte-aligned address that was
   still unaligned after two steps is 16-byte aligned after the third.  */
sub $4, %esi
|
||
|
sub $4, %ecx
|
||
|
sub $4, %edx
|
||
|
movl (%esi), %eax
|
||
|
movl %eax, (%edx)
|
||
|
|
||
|
L(bk_ssse3_cpy_pre):
|
||
|
/* Alignment consumed at most 12 bytes of a count that was >= 64, so at
   least 32 bytes remain on the branch to L(bk_write_more32bytes).  */
cmp $64, %ecx
|
||
|
jb L(bk_write_more32bytes)
|
||
|
|
||
|
L(bk_ssse3_cpy):
|
||
|
/* Move both pointers down by 64, then copy that 64-byte window.  */
sub $64, %esi
|
||
|
sub $64, %ecx
|
||
|
sub $64, %edx
|
||
|
movdqu 0x30(%esi), %xmm3
|
||
|
movdqa %xmm3, 0x30(%edx)
|
||
|
movdqu 0x20(%esi), %xmm2
|
||
|
movdqa %xmm2, 0x20(%edx)
|
||
|
movdqu 0x10(%esi), %xmm1
|
||
|
movdqa %xmm1, 0x10(%edx)
|
||
|
movdqu (%esi), %xmm0
|
||
|
movdqa %xmm0, (%edx)
|
||
|
cmp $64, %ecx
|
||
|
jae L(bk_ssse3_cpy)
|
||
|
jmp L(bk_write_64bytesless)
|
||
|
|
||
|
#endif
|
||
|
|
||
|
END (MEMCPY)
|