;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifndef BUFFER_UTILS
%define BUFFER_UTILS

%include "options.asm"

extern pshufb_shf_table
extern mask3

%ifdef FIX_CACHE_READ
%define vmovntdqa vmovdqa
%else
%macro prefetchnta 1
%endm
%endif
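
; Note: when FIX_CACHE_READ is defined, the non-temporal loads (vmovntdqa)
; used below are demoted to ordinary aligned loads; otherwise prefetchnta is
; turned into an empty macro so any prefetch hints expand to nothing.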

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; code for doing the CRC calculation as part of copy-in, using pclmulqdq
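
; The FOLD* macros below implement CRC folding with carry-less multiplication:
; each 128-bit state register is multiplied by a precomputed fold constant
; (two vpclmulqdq products, one per 64-bit half, selected by the 0x01 and 0x10
; immediates) and the products are XORed together; the caller then XORs the
; folded state with the newly copied data.  The fold constant itself is
; supplied by the caller in the "const"/"xfold" argument; it is not defined
; in this file.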
; "shift" 4 input registers down 4 places
|
|
; macro FOLD4 xmm0, xmm1, xmm2, xmm3, const, tmp0, tmp1
|
|
%macro FOLD4 7
|
|
%define %%xmm0 %1 ; xmm reg, in/out
|
|
%define %%xmm1 %2 ; xmm reg, in/out
|
|
%define %%xmm2 %3 ; xmm reg, in/out
|
|
%define %%xmm3 %4 ; xmm reg, in/out
|
|
%define %%const %5 ; xmm reg, in
|
|
%define %%tmp0 %6 ; xmm reg, tmp
|
|
%define %%tmp1 %7 ; xmm reg, tmp
|
|
|
|
vmovaps %%tmp0, %%xmm0
|
|
vmovaps %%tmp1, %%xmm1
|
|
|
|
vpclmulqdq %%xmm0, %%const, 0x01
|
|
vpclmulqdq %%xmm1, %%const, 0x01
|
|
|
|
vpclmulqdq %%tmp0, %%const, 0x10
|
|
vpclmulqdq %%tmp1, %%const, 0x10
|
|
|
|
vxorps %%xmm0, %%tmp0
|
|
vxorps %%xmm1, %%tmp1
|
|
|
|
|
|
vmovaps %%tmp0, %%xmm2
|
|
vmovaps %%tmp1, %%xmm3
|
|
|
|
vpclmulqdq %%xmm2, %%const, 0x01
|
|
vpclmulqdq %%xmm3, %%const, 0x01
|
|
|
|
vpclmulqdq %%tmp0, %%const, 0x10
|
|
vpclmulqdq %%tmp1, %%const, 0x10
|
|
|
|
vxorps %%xmm2, %%tmp0
|
|
vxorps %%xmm3, %%tmp1
|
|
%endm
|
|
|
|
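
; Usage sketch (illustrative only; the register assignments below are
; hypothetical, not fixed by this file).  With the running CRC state in
; xmm0-xmm3 and the fold-by-4 constant already loaded into xmm15, one round
; of a 64-bytes-per-iteration loop folds the state and then XORs in the
; freshly copied data, as COPY_IN_CRC does below:
;
;	FOLD4	xmm0, xmm1, xmm2, xmm3, xmm15, xmm8, xmm9
;	vpxor	xmm0, xmm4		; xmm4-xmm7 = newly loaded 64 bytes
;	vpxor	xmm1, xmm5
;	vpxor	xmm2, xmm6
;	vpxor	xmm3, xmm7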

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 3 input registers down 4 places
; macro FOLD3	x0, x1, x2, x3, const, tmp0
;      x0  x1  x2  x3
; In    A   B   C   D
; Out   D  A'  B'  C'
%macro FOLD3	6
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%const	%5	; xmm reg, in
%define	%%tmp0	%6	; xmm reg, tmp

	vmovdqa		%%tmp0, %%x3

	vmovaps		%%x3, %%x2
	vpclmulqdq	%%x2, %%const, 0x01
	vpclmulqdq	%%x3, %%const, 0x10
	vxorps		%%x3, %%x2

	vmovaps		%%x2, %%x1
	vpclmulqdq	%%x1, %%const, 0x01
	vpclmulqdq	%%x2, %%const, 0x10
	vxorps		%%x2, %%x1

	vmovaps		%%x1, %%x0
	vpclmulqdq	%%x0, %%const, 0x01
	vpclmulqdq	%%x1, %%const, 0x10
	vxorps		%%x1, %%x0

	vmovdqa		%%x0, %%tmp0
%endm
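
; Note: FOLD3 is the three-block variant used by COPY_IN_CRC when 48 bytes
; (three full 16-byte blocks) remain: A, B and C are folded into x1-x3, where
; the caller XORs them with the three newly read blocks, while D rotates down
; to x0 unchanged.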

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 2 input registers down 4 places
; macro FOLD2	x0, x1, x2, x3, const, tmp0
;      x0  x1  x2  x3
; In    A   B   C   D
; Out   C   D  A'  B'
%macro FOLD2	6
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%const	%5	; xmm reg, in
%define	%%tmp0	%6	; xmm reg, tmp

	vmovdqa		%%tmp0, %%x3

	vmovaps		%%x3, %%x1
	vpclmulqdq	%%x1, %%const, 0x01
	vpclmulqdq	%%x3, %%const, 0x10
	vxorps		%%x3, %%x1

	vmovdqa		%%x1, %%tmp0
	vmovdqa		%%tmp0, %%x2

	vmovaps		%%x2, %%x0
	vpclmulqdq	%%x0, %%const, 0x01
	vpclmulqdq	%%x2, %%const, 0x10
	vxorps		%%x2, %%x0

	vmovdqa		%%x0, %%tmp0
%endm
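
; Note: FOLD2 is the two-block variant used when 32 bytes remain: A and B are
; folded into x2/x3 (and then XORed with the two newly read blocks by the
; caller), while C and D rotate down to x0/x1 unchanged.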

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 1 input register down 4 places
; macro FOLD1	x0, x1, x2, x3, const, tmp0
;      x0  x1  x2  x3
; In    A   B   C   D
; Out   B   C   D  A'
%macro FOLD1	6
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%const	%5	; xmm reg, in
%define	%%tmp0	%6	; xmm reg, tmp

	vmovdqa		%%tmp0, %%x3

	vmovaps		%%x3, %%x0
	vpclmulqdq	%%x0, %%const, 0x01
	vpclmulqdq	%%x3, %%const, 0x10
	vxorps		%%x3, %%x0

	vmovdqa		%%x0, %%x1
	vmovdqa		%%x1, %%x2
	vmovdqa		%%x2, %%tmp0
%endm
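
; Note: FOLD1 is the single-block variant used when 16 bytes remain: only A
; is folded (into x3, where the caller XORs it with the newly read block);
; B, C and D rotate down one position unchanged.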

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; macro PARTIAL_FOLD	x0, x1, x2, x3, xp, size, const, shl, shr, tmp2, tmp3, gtmp

;                  XP    X3    X2    X1    X0    tmp2
; Initial state    xI    HG    FE    DC    BA
; after shift            IH    GF    ED    CB    A0
; after fold             ff    GF    ED    CB         ff = merge(IH, A0)
;
%macro PARTIAL_FOLD 12
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%xp	%5	; xmm partial reg, in/clobbered
%define	%%size	%6	; GPR, in/clobbered (1...15)
%define	%%const	%7	; xmm reg, in
%define	%%shl	%8	; xmm reg, tmp
%define	%%shr	%9	; xmm reg, tmp
%define	%%tmp2	%10	; xmm reg, tmp
%define	%%tmp3	%11	; xmm reg, tmp
%define	%%gtmp	%12	; GPR, tmp

	; {XP X3 X2 X1 X0} = {xI HG FE DC BA}
	shl	%%size, 4				; size *= 16
	lea	%%gtmp, [pshufb_shf_table - 16 WRT_OPT]
	vmovdqa	%%shl, [%%gtmp + %%size]		; shl constant
	vmovdqa	%%shr, %%shl
	vpxor	%%shr, [mask3 WRT_OPT]			; shr constant

	vmovdqa	%%tmp2, %%x0		; tmp2 = BA
	vpshufb	%%tmp2, %%shl		; tmp2 = A0

	vpshufb	%%x0, %%shr		; x0 = 0B
	vmovdqa	%%tmp3, %%x1		; tmp3 = DC
	vpshufb	%%tmp3, %%shl		; tmp3 = C0
	vpor	%%x0, %%tmp3		; x0 = CB

	vpshufb	%%x1, %%shr		; x1 = 0D
	vmovdqa	%%tmp3, %%x2		; tmp3 = FE
	vpshufb	%%tmp3, %%shl		; tmp3 = E0
	vpor	%%x1, %%tmp3		; x1 = ED

	vpshufb	%%x2, %%shr		; x2 = 0F
	vmovdqa	%%tmp3, %%x3		; tmp3 = HG
	vpshufb	%%tmp3, %%shl		; tm3 = G0
	vpor	%%x2, %%tmp3		; x2 = GF

	vpshufb	%%x3, %%shr		; x3 = 0H
	vpshufb	%%xp, %%shl		; xp = I0
	vpor	%%x3, %%xp		; x3 = IH

	; fold tmp2 into X3
	vmovaps		%%tmp3, %%tmp2
	vpclmulqdq	%%tmp2, %%const, 0x01
	vpclmulqdq	%%tmp3, %%const, 0x10
	vxorps		%%x3, %%tmp2
	vxorps		%%x3, %%tmp3
%endm
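
; Usage sketch (illustrative; the register choices below are hypothetical).
; PARTIAL_FOLD advances the CRC state by "size" bytes (1...15) that have just
; been copied: xp holds the 16-byte block containing those bytes, and the
; pshufb_shf_table / mask3 constants are used to build the byte-shift masks.
; A call after copying a short tail might look like:
;
;	mov	rax, rdx		; rdx = tail length, 1...15
;	PARTIAL_FOLD	xmm0, xmm1, xmm2, xmm3, xmm4, rax, xmm15, \
;			xmm8, xmm9, xmm10, xmm11, rbx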

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LOAD_FRACTIONAL_XMM: packs an xmm register with data when the input is less than 16 bytes.
; Returns a zeroed xmm_out if size is 0.
; Input: the input data (src) and that data's length in bytes (size).
; Output: the packed xmm register (xmm_out).
; size is clobbered.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro LOAD_FRACTIONAL_XMM 3
%define	%%xmm_out	%1	; %%xmm_out is an xmm register
%define	%%src		%2
%define	%%size		%3

	vpxor	%%xmm_out, %%xmm_out

	cmp	%%size, 0
	je	%%_done

	add	%%src, %%size

	cmp	%%size, 8
	jl	%%_byte_loop

	sub	%%src, 8
	vpinsrq	%%xmm_out, [%%src], 0		; read in 8 bytes if they exist
	sub	%%size, 8

	je	%%_done

%%_byte_loop:					; read in data 1 byte at a time while data is left
	vpslldq	%%xmm_out, 1

	dec	%%src
	vpinsrb	%%xmm_out, BYTE [%%src], 0
	dec	%%size

	jg	%%_byte_loop

%%_done:

%endmacro ; LOAD_FRACTIONAL_XMM
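
; Usage sketch (illustrative; the register choices are hypothetical).  Load a
; trailing fragment of fewer than 16 bytes into xmm1, with the byte count in
; rax and the source pointer in rsi:
;
;	mov	rax, rdx		; rdx = remaining length, 0...15
;	LOAD_FRACTIONAL_XMM	xmm1, rsi, rax
;
; rax is clobbered, as noted in the macro header.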

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; copy x bytes (rounded up to 16 bytes) from src to dst
; src & dst are unaligned
; macro COPY_IN_CRC	dst, src, size_in_bytes, tmp, x0, x1, x2, x3, xfold,
;			xt0, xt1, xt2, xt3, xt4
%macro COPY_IN_CRC 14
%define	%%dst	%1	; reg, in/clobbered
%define	%%src	%2	; reg, in/clobbered
%define	%%size	%3	; reg, in/clobbered
%define	%%tmp	%4	; reg, tmp
%define	%%x0	%5	; xmm, in/out: crc state
%define	%%x1	%6	; xmm, in/out: crc state
%define	%%x2	%7	; xmm, in/out: crc state
%define	%%x3	%8	; xmm, in/out: crc state
%define	%%xfold	%9	; xmm, in: (loaded from fold4)
%define	%%xtmp0	%10	; xmm, tmp
%define	%%xtmp1	%11	; xmm, tmp
%define	%%xtmp2	%12	; xmm, tmp
%define	%%xtmp3	%13	; xmm, tmp
%define	%%xtmp4	%14	; xmm, tmp

	cmp	%%size, 16
	jl	%%lt_16

	; align source
	xor	%%tmp, %%tmp
	sub	%%tmp, %%src
	and	%%tmp, 15
	jz	%%already_aligned

	; need to align, tmp contains number of bytes to transfer
	vmovdqu	%%xtmp0, [%%src]
	vmovdqu	[%%dst], %%xtmp0
	add	%%dst, %%tmp
	add	%%src, %%tmp
	sub	%%size, %%tmp

%ifndef DEFLATE
	push	%%dst

	PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
			%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
	pop	%%dst
%endif

%%already_aligned:
	sub	%%size, 64
	jl	%%end_loop
	jmp	%%loop
align 16
%%loop:
	vmovntdqa	%%xtmp0, [%%src+0*16]
	vmovntdqa	%%xtmp1, [%%src+1*16]
	vmovntdqa	%%xtmp2, [%%src+2*16]

%ifndef DEFLATE
	FOLD4	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3, %%xtmp4
%endif
	vmovntdqa	%%xtmp3, [%%src+3*16]

	vmovdqu	[%%dst+0*16], %%xtmp0
	vmovdqu	[%%dst+1*16], %%xtmp1
	vmovdqu	[%%dst+2*16], %%xtmp2
	vmovdqu	[%%dst+3*16], %%xtmp3

%ifndef DEFLATE
	vpxor	%%x0, %%xtmp0
	vpxor	%%x1, %%xtmp1
	vpxor	%%x2, %%xtmp2
	vpxor	%%x3, %%xtmp3
%endif
	add	%%src,  4*16
	add	%%dst,  4*16
	sub	%%size, 4*16
	jge	%%loop

%%end_loop:
	; %%size contains (num bytes left - 64)
	add	%%size, 16
	jge	%%three_full_regs
	add	%%size, 16
	jge	%%two_full_regs
	add	%%size, 16
	jge	%%one_full_reg
	add	%%size, 16

%%no_full_regs:		; 0 <= %%size < 16, no full regs
	jz	%%done	; if no bytes left, we're done
	jmp	%%partial

	;; Handle case where input is < 16 bytes
%%lt_16:
	test	%%size, %%size
	jz	%%done	; if no bytes left, we're done
	jmp	%%partial


%%one_full_reg:
	vmovntdqa	%%xtmp0, [%%src+0*16]

%ifndef DEFLATE
	FOLD1	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
	vmovdqu	[%%dst+0*16], %%xtmp0

%ifndef DEFLATE
	vpxor	%%x3, %%xtmp0
%endif
	test	%%size, %%size
	jz	%%done	; if no bytes left, we're done

	add	%%dst, 1*16
	add	%%src, 1*16
	jmp	%%partial


%%two_full_regs:
	vmovntdqa	%%xtmp0, [%%src+0*16]
	vmovntdqa	%%xtmp1, [%%src+1*16]

%ifndef DEFLATE
	FOLD2	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
	vmovdqu	[%%dst+0*16], %%xtmp0
	vmovdqu	[%%dst+1*16], %%xtmp1

%ifndef DEFLATE
	vpxor	%%x2, %%xtmp0
	vpxor	%%x3, %%xtmp1
%endif
	test	%%size, %%size
	jz	%%done	; if no bytes left, we're done

	add	%%dst, 2*16
	add	%%src, 2*16
	jmp	%%partial


%%three_full_regs:
	vmovntdqa	%%xtmp0, [%%src+0*16]
	vmovntdqa	%%xtmp1, [%%src+1*16]
	vmovntdqa	%%xtmp2, [%%src+2*16]

%ifndef DEFLATE
	FOLD3	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
	vmovdqu	[%%dst+0*16], %%xtmp0
	vmovdqu	[%%dst+1*16], %%xtmp1
	vmovdqu	[%%dst+2*16], %%xtmp2

%ifndef DEFLATE
	vpxor	%%x1, %%xtmp0
	vpxor	%%x2, %%xtmp1
	vpxor	%%x3, %%xtmp2
%endif
	test	%%size, %%size
	jz	%%done	; if no bytes left, we're done

	add	%%dst, 3*16
	add	%%src, 3*16

	; fall through to %%partial
%%partial:		; 0 <= %%size < 16

%ifndef DEFLATE
	mov	%%tmp, %%size
%endif

	LOAD_FRACTIONAL_XMM %%xtmp0, %%src, %%size

	vmovdqu	[%%dst], %%xtmp0

%ifndef DEFLATE
	PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
			%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
%endif

%%done:
%endm
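
; Usage note (illustrative): COPY_IN_CRC copies %%size bytes from src to dst
; (with stores rounded up to 16-byte units, as noted above) and, unless
; DEFLATE is defined, folds every copied byte into the four-register CRC
; state x0-x3 using the fold constant in xfold.  A hypothetical invocation
; might look like:
;
;	COPY_IN_CRC	rcx, rdx, r8, r9, xmm0, xmm1, xmm2, xmm3, xmm15, \
;			xmm4, xmm5, xmm6, xmm7, xmm8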

;%assign K   1024;
;%assign D   8 * K;	; Amount of history
;%assign LA  17 * 16;	; Max look-ahead, rounded up to 32 byte boundary

; copy D + LA bytes from src to dst
; dst is aligned
;void copy_D_LA(uint8_t *dst, uint8_t *src);
; arg 1: rcx : dst
; arg 2: rdx : src
; copy_D_LA	dst, src, tmp, xtmp0, xtmp1, xtmp2, xtmp3
%macro copy_D_LA 7
%define	%%dst	%1	; reg, clobbered
%define	%%src	%2	; reg, clobbered
%define	%%tmp	%3
%define	%%ytmp0	%4
%define	%%ytmp1	%5
%define	%%ytmp2	%6
%define	%%ytmp3	%7

%define	%%xtmp0	%4x

%assign %%SIZE	(D + LA) / 32	; number of 32-byte blocks to be copied
%assign %%SIZE4	%%SIZE/4
%assign %%MOD16	((D + LA) - 32 * %%SIZE) / 16

	lea	%%tmp, [%%dst + 4 * 32 * %%SIZE4]
	jmp	%%copy_D_LA_1
align 16
%%copy_D_LA_1:
	vmovdqu	%%ytmp0, [%%src]
	vmovdqu	%%ytmp1, [%%src + 1 * 32]
	vmovdqu	%%ytmp2, [%%src + 2 * 32]
	vmovdqu	%%ytmp3, [%%src + 3 * 32]
	vmovdqa	[%%dst], %%ytmp0
	vmovdqa	[%%dst + 1 * 32], %%ytmp1
	vmovdqa	[%%dst + 2 * 32], %%ytmp2
	vmovdqa	[%%dst + 3 * 32], %%ytmp3
	add	%%src, 4*32
	add	%%dst, 4*32
	cmp	%%dst, %%tmp
	jne	%%copy_D_LA_1
%assign %%i 0
%rep (%%SIZE - 4 * %%SIZE4)

	%if (%%i == 0)
	vmovdqu	%%ytmp0, [%%src + %%i*32]
	%elif (%%i == 1)
	vmovdqu	%%ytmp1, [%%src + %%i*32]
	%elif (%%i == 2)
	vmovdqu	%%ytmp2, [%%src + %%i*32]
	%elif (%%i == 3)
	vmovdqu	%%ytmp3, [%%src + %%i*32]
	%else
	%error too many i
	%endif

%assign %%i %%i+1
%endrep
%assign %%i 0
%rep (%%SIZE - 4 * %%SIZE4)

	%if (%%i == 0)
	vmovdqa	[%%dst + %%i*32], %%ytmp0
	%elif (%%i == 1)
	vmovdqa	[%%dst + %%i*32], %%ytmp1
	%elif (%%i == 2)
	vmovdqa	[%%dst + %%i*32], %%ytmp2
	%elif (%%i == 3)
	vmovdqa	[%%dst + %%i*32], %%ytmp3
	%else
	%error too many i
	%endif

%assign %%i %%i+1
%endrep

%rep %%MOD16
	vmovdqu	%%xtmp0, [%%src + (%%SIZE - 4 * %%SIZE4)*32]
	vmovdqa	[%%dst + (%%SIZE - 4 * %%SIZE4)*32], %%xtmp0
%endrep

%endm
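
; Usage note (illustrative): copy_D_LA expects D (history size) and LA
; (look-ahead size) to be %assign'd by the including file before the macro is
; expanded, and copies D + LA bytes in 32-byte chunks.  A hypothetical
; expansion could be:
;
;	copy_D_LA	rcx, rdx, rax, y0, y1, y2, y3
;
; where y0-y3 name ymm temporaries and y0x etc. are defined as the matching
; xmm registers, since the 16-byte tail copy uses %%xtmp0 = %4x.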
%endif