mirror of
https://github.com/intel/isa-l.git
synced 2025-12-11 20:37:28 +01:00
erasure_code: load xgfts with VBROADCASTI128 in x86 AVX2 impl
To generate the side-by-side pattern of two 128-bit xgfts within a YMM register, loading them directly from memory with VBROADCASTI128 can be faster than loading them with VMOVDQU and then swapping the lanes with VPERM2I128. Remove some out-of-date macros as well. Signed-off-by: Maodi Ma <mamaodi@hygon.cn>
This commit is contained in:
@@ -183,24 +183,19 @@ func(gf_2vect_dot_prod_avx2)
|
|||||||
.next_vect:
|
.next_vect:
|
||||||
SLDR src, src_m
|
SLDR src, src_m
|
||||||
mov ptr, [src+vec_i]
|
mov ptr, [src+vec_i]
|
||||||
|
|
||||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
|
||||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
|
||||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
|
|
||||||
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
|
|
||||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
|
||||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
|
||||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
|
||||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
|
||||||
|
|
||||||
XLDR x0, [ptr+pos] ;Get next source vector
|
XLDR x0, [ptr+pos] ;Get next source vector
|
||||||
add tmp, 32
|
|
||||||
add vec_i, PS
|
|
||||||
|
|
||||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
|
|
||||||
|
vbroadcasti128 xgft1_lo, [tmp] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft1_hi, [tmp+16] ; hi | hi
|
||||||
|
add vec_i, PS
|
||||||
|
vbroadcasti128 xgft2_lo, [tmp+vec*(32/PS)] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft2_hi, [tmp+vec*(32/PS)+16] ; hi | hi
|
||||||
|
add tmp, 32
|
||||||
|
|
||||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||||
|
|||||||
@@ -175,17 +175,14 @@ func(gf_2vect_mad_avx2)
|
|||||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||||
|
|
||||||
sal vec_i, 5 ;Multiply by 32
|
sal vec_i, 5 ;Multiply by 32
|
||||||
sal vec, 5
|
|
||||||
lea tmp, [mul_array + vec_i]
|
lea tmp, [mul_array + vec_i]
|
||||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
sal vec, 5
|
||||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
|
||||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
vbroadcasti128 xgft1_lo, [tmp] ;Load array: lo | lo
|
||||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
vbroadcasti128 xgft1_hi, [tmp+16] ; hi | hi
|
||||||
|
vbroadcasti128 xgft2_lo, [tmp+vec] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft2_hi, [tmp+vec+16] ; hi | hi
|
||||||
|
|
||||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
|
|
||||||
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
|
|
||||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
|
||||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
|
||||||
mov dest2, [dest1+PS] ; reuse mul_array
|
mov dest2, [dest1+PS] ; reuse mul_array
|
||||||
mov dest1, [dest1]
|
mov dest1, [dest1]
|
||||||
|
|
||||||
|
|||||||
@@ -199,29 +199,21 @@ func(gf_3vect_dot_prod_avx2)
|
|||||||
.next_vect:
|
.next_vect:
|
||||||
SLDR src, src_m
|
SLDR src, src_m
|
||||||
mov ptr, [src+vec_i]
|
mov ptr, [src+vec_i]
|
||||||
|
|
||||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
|
||||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
|
||||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
|
|
||||||
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
|
|
||||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
|
||||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
|
||||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
|
||||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
|
||||||
|
|
||||||
vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
|
||||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
|
||||||
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
|
|
||||||
vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
|
|
||||||
|
|
||||||
add tmp, 32
|
|
||||||
add vec_i, PS
|
|
||||||
XLDR x0, [ptr+pos] ;Get next source vector
|
XLDR x0, [ptr+pos] ;Get next source vector
|
||||||
|
|
||||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
|
|
||||||
|
vbroadcasti128 xgft1_lo, [tmp] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft1_hi, [tmp+16] ; hi | hi
|
||||||
|
vbroadcasti128 xgft2_lo, [tmp+vec*(32/PS)] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft2_hi, [tmp+vec*(32/PS)+16] ; hi | hi
|
||||||
|
add vec_i, PS
|
||||||
|
vbroadcasti128 xgft3_lo, [tmp+vec*(64/PS)] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft3_hi, [tmp+vec*(64/PS)+16] ; hi | hi
|
||||||
|
add tmp, 32
|
||||||
|
|
||||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||||
|
|||||||
@@ -47,6 +47,7 @@
|
|||||||
%define tmp r11
|
%define tmp r11
|
||||||
%define tmp.w r11d
|
%define tmp.w r11d
|
||||||
%define tmp.b r11b
|
%define tmp.b r11b
|
||||||
|
%define tmp2 r10
|
||||||
%define return rax
|
%define return rax
|
||||||
%define return.w eax
|
%define return.w eax
|
||||||
%define stack_size 16*10 + 3*8
|
%define stack_size 16*10 + 3*8
|
||||||
@@ -100,6 +101,7 @@
|
|||||||
%define tmp r11
|
%define tmp r11
|
||||||
%define tmp.w r11d
|
%define tmp.w r11d
|
||||||
%define tmp.b r11b
|
%define tmp.b r11b
|
||||||
|
%define tmp2 r10
|
||||||
%define return rax
|
%define return rax
|
||||||
%define return.w eax
|
%define return.w eax
|
||||||
|
|
||||||
@@ -149,6 +151,7 @@ section .text
|
|||||||
%define xgft1_hi ymm13
|
%define xgft1_hi ymm13
|
||||||
%define xgft2_lo ymm12
|
%define xgft2_lo ymm12
|
||||||
%define xgft3_lo ymm11
|
%define xgft3_lo ymm11
|
||||||
|
%define xgft2_hi xgft3_lo ; Reuse ymm11
|
||||||
|
|
||||||
%define x0 ymm0
|
%define x0 ymm0
|
||||||
%define xtmpa ymm1
|
%define xtmpa ymm1
|
||||||
@@ -176,18 +179,15 @@ func(gf_3vect_mad_avx2)
|
|||||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||||
|
|
||||||
sal vec_i, 5 ;Multiply by 32
|
sal vec_i, 5 ;Multiply by 32
|
||||||
sal vec, 5
|
|
||||||
lea tmp, [mul_array + vec_i]
|
lea tmp, [mul_array + vec_i]
|
||||||
|
mov tmp2, tmp
|
||||||
|
sal vec, 5
|
||||||
|
|
||||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
vbroadcasti128 xgft1_lo, [tmp] ;Load array: lo | lo
|
||||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
vbroadcasti128 xgft1_hi, [tmp+16] ; hi | hi
|
||||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
|
vbroadcasti128 xgft2_lo, [tmp+vec] ;Load array: lo | lo
|
||||||
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
|
vbroadcasti128 xgft2_hi, [tmp+vec+16] ; hi | hi
|
||||||
|
|
||||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
|
||||||
; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
|
||||||
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
|
||||||
; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
|
||||||
mov dest2, [dest1+PS] ; reuse mul_array
|
mov dest2, [dest1+PS] ; reuse mul_array
|
||||||
mov dest3, [dest1+2*PS] ; reuse vec_i
|
mov dest3, [dest1+2*PS] ; reuse vec_i
|
||||||
mov dest1, [dest1]
|
mov dest1, [dest1]
|
||||||
@@ -197,11 +197,9 @@ func(gf_3vect_mad_avx2)
|
|||||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||||
XLDR xd3, [dest3+pos] ;Get next dest vector
|
XLDR xd3, [dest3+pos] ;Get next dest vector
|
||||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
|
||||||
vperm2i128 xtmpl2, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
|
||||||
|
|
||||||
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
|
vbroadcasti128 xtmpl3, [tmp+2*vec] ;Load array: lo | lo
|
||||||
vperm2i128 xtmpl3, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
|
vbroadcasti128 xtmph3, [tmp+2*vec+16] ; hi | hi
|
||||||
|
|
||||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
@@ -214,10 +212,10 @@ func(gf_3vect_mad_avx2)
|
|||||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||||
|
|
||||||
; dest2
|
; dest2
|
||||||
vpshufb xtmph2, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xtmph2, xtmpl2 ;GF add high and low partials
|
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||||
vpxor xd2, xtmph2 ;xd2 += partial
|
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||||
|
|
||||||
; dest3
|
; dest3
|
||||||
vpshufb xtmph3, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph3, x0 ;Lookup mul table of high nibble
|
||||||
@@ -259,11 +257,10 @@ func(gf_3vect_mad_avx2)
|
|||||||
vpshufb xtmpl3, xtmpl3, xtmpl2 ;Broadcast len to all bytes. xtmpl2=0x1f1f1f...
|
vpshufb xtmpl3, xtmpl3, xtmpl2 ;Broadcast len to all bytes. xtmpl2=0x1f1f1f...
|
||||||
vpcmpgtb xtmpl3, xtmpl3, xtmph3
|
vpcmpgtb xtmpl3, xtmpl3, xtmph3
|
||||||
|
|
||||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
vbroadcasti128 xgft2_lo, [tmp2+vec] ; Load array: lo | lo
|
||||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
vbroadcasti128 xtmph2, [tmp2+vec+16] ; hi | hi
|
||||||
|
vbroadcasti128 xgft3_lo, [tmp2+2*vec] ; Load array: lo | lo
|
||||||
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
|
vbroadcasti128 xtmph3, [tmp2+2*vec+16]; hi | hi
|
||||||
vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
|
|
||||||
|
|
||||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
|
|||||||
@@ -228,26 +228,19 @@ func(gf_4vect_dot_prod_avx2)
|
|||||||
mov ptr, [src+vec_i]
|
mov ptr, [src+vec_i]
|
||||||
XLDR x0, [ptr+pos] ;Get next source vector
|
XLDR x0, [ptr+pos] ;Get next source vector
|
||||||
|
|
||||||
add vec_i, PS
|
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
|
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
vperm2i128 xtmpa, xgft4_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
|
||||||
vperm2i128 x0, xgft4_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
|
||||||
|
|
||||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
vbroadcasti128 xgft1_lo, [tmp] ;Load array: lo | lo
|
||||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
vbroadcasti128 xgft1_hi, [tmp+16] ; hi | hi
|
||||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
vbroadcasti128 xgft2_lo, [tmp+vec*(32/PS)] ;Load array: lo | lo
|
||||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
vbroadcasti128 xgft2_hi, [tmp+vec*(32/PS)+16] ; hi | hi
|
||||||
vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
add vec_i, PS
|
||||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
vbroadcasti128 xgft3_lo, [tmp+vec*(64/PS)] ;Load array: lo | lo
|
||||||
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
vbroadcasti128 xgft3_hi, [tmp+vec*(64/PS)+16] ; hi | hi
|
||||||
; " Dx{00}, Dx{10}, ..., Dx{f0}
|
vbroadcasti128 xgft4_lo, [tmp+vskip3] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft4_hi, [tmp+vskip3+16] ; hi | hi
|
||||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
|
||||||
add tmp, 32
|
add tmp, 32
|
||||||
|
|
||||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||||
|
|||||||
@@ -46,9 +46,11 @@
|
|||||||
%define tmp r11
|
%define tmp r11
|
||||||
%define tmp.w r11d
|
%define tmp.w r11d
|
||||||
%define tmp.b r11b
|
%define tmp.b r11b
|
||||||
|
%define tmp2 r13
|
||||||
|
%define tmp3 r14
|
||||||
|
%define tmp4 r10
|
||||||
%define return rax
|
%define return rax
|
||||||
%define return.w eax
|
%define stack_size 16*10 + 5*8
|
||||||
%define stack_size 16*10 + 3*8
|
|
||||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||||
%define func(x) proc_frame x
|
%define func(x) proc_frame x
|
||||||
|
|
||||||
@@ -66,6 +68,8 @@
|
|||||||
vmovdqa [rsp+16*9],xmm15
|
vmovdqa [rsp+16*9],xmm15
|
||||||
save_reg r12, 10*16 + 0*8
|
save_reg r12, 10*16 + 0*8
|
||||||
save_reg r15, 10*16 + 1*8
|
save_reg r15, 10*16 + 1*8
|
||||||
|
save_reg r13, 10*16 + 2*8
|
||||||
|
save_reg r14, 10*16 + 3*8
|
||||||
end_prolog
|
end_prolog
|
||||||
mov arg4, arg(4)
|
mov arg4, arg(4)
|
||||||
mov arg5, arg(5)
|
mov arg5, arg(5)
|
||||||
@@ -84,6 +88,8 @@
|
|||||||
vmovdqa xmm15, [rsp+16*9]
|
vmovdqa xmm15, [rsp+16*9]
|
||||||
mov r12, [rsp + 10*16 + 0*8]
|
mov r12, [rsp + 10*16 + 0*8]
|
||||||
mov r15, [rsp + 10*16 + 1*8]
|
mov r15, [rsp + 10*16 + 1*8]
|
||||||
|
mov r13, [rsp + 10*16 + 2*8]
|
||||||
|
mov r14, [rsp + 10*16 + 3*8]
|
||||||
add rsp, stack_size
|
add rsp, stack_size
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
@@ -98,12 +104,20 @@
|
|||||||
%define tmp r11
|
%define tmp r11
|
||||||
%define tmp.w r11d
|
%define tmp.w r11d
|
||||||
%define tmp.b r11b
|
%define tmp.b r11b
|
||||||
|
%define tmp2 r10
|
||||||
|
%define tmp3 r12 ; must be saved and restored
|
||||||
|
%define tmp4 r13 ; must be saved and restored
|
||||||
%define return rax
|
%define return rax
|
||||||
%define return.w eax
|
|
||||||
|
|
||||||
%define func(x) x: endbranch
|
%define func(x) x: endbranch
|
||||||
%define FUNC_SAVE
|
%macro FUNC_SAVE 0
|
||||||
%define FUNC_RESTORE
|
push r12
|
||||||
|
push r13
|
||||||
|
%endmacro
|
||||||
|
%macro FUNC_RESTORE 0
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
%endmacro
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
|
|
||||||
@@ -116,7 +130,6 @@
|
|||||||
%define src arg4
|
%define src arg4
|
||||||
%define dest1 arg5
|
%define dest1 arg5
|
||||||
%define pos return
|
%define pos return
|
||||||
%define pos.w return.w
|
|
||||||
|
|
||||||
%define dest2 mul_array
|
%define dest2 mul_array
|
||||||
%define dest3 vec
|
%define dest3 vec
|
||||||
@@ -176,24 +189,18 @@ func(gf_4vect_mad_avx2)
|
|||||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||||
|
|
||||||
sal vec_i, 5 ;Multiply by 32
|
sal vec_i, 5 ;Multiply by 32
|
||||||
sal vec, 5 ;Multiply by 32
|
|
||||||
lea tmp, [mul_array + vec_i]
|
lea tmp, [mul_array + vec_i]
|
||||||
|
mov tmp2, tmp
|
||||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
sal vec, 5 ;Multiply by 32
|
||||||
; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
mov tmp3, vec
|
||||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
|
||||||
; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
|
||||||
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
|
||||||
; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
|
||||||
add tmp, vec
|
|
||||||
vmovdqu xgft4_lo, [tmp+2*vec] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
|
|
||||||
; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
|
|
||||||
|
|
||||||
mov dest2, [dest1+PS] ; reuse mul_array
|
mov dest2, [dest1+PS] ; reuse mul_array
|
||||||
mov dest3, [dest1+2*PS] ; reuse vec
|
mov dest3, [dest1+2*PS] ; reuse vec
|
||||||
mov dest4, [dest1+3*PS] ; reuse vec_i
|
mov dest4, [dest1+3*PS] ; reuse vec_i
|
||||||
mov dest1, [dest1]
|
mov dest1, [dest1]
|
||||||
|
|
||||||
|
lea tmp4, [tmp3+2*tmp3]
|
||||||
|
|
||||||
.loop32:
|
.loop32:
|
||||||
XLDR x0, [src+pos] ;Get next source vector
|
XLDR x0, [src+pos] ;Get next source vector
|
||||||
|
|
||||||
@@ -206,37 +213,40 @@ func(gf_4vect_mad_avx2)
|
|||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
|
|
||||||
vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
vbroadcasti128 xgft1_lo, [tmp] ;Load array: lo | lo
|
||||||
vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
vbroadcasti128 xtmph1, [tmp+16] ; hi | hi
|
||||||
|
vbroadcasti128 xgft2_lo, [tmp+tmp3] ;Load array: lo | lo
|
||||||
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xtmph2, [tmp+tmp3+16] ; hi | hi
|
||||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
|
||||||
|
|
||||||
; dest1
|
; dest1
|
||||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft1_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
|
vpxor xtmph1, xgft1_lo ;GF add high and low partials
|
||||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
vpxor xd1, xtmph1 ;xd1 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft3_lo, [tmp+2*tmp3] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph3, [tmp+2*tmp3+16] ; hi | hi
|
||||||
|
|
||||||
; dest2
|
; dest2
|
||||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft2_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
|
vpxor xtmph2, xgft2_lo ;GF add high and low partials
|
||||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
vpxor xd2, xtmph2 ;xd2 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft4_lo, [tmp+tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph4, [tmp+tmp4+16] ; hi | hi
|
||||||
|
|
||||||
; dest3
|
; dest3
|
||||||
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph3, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft3_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpxor xtmph3, xtmph3, xtmpl ;GF add high and low partials
|
vpxor xtmph3, xgft3_lo ;GF add high and low partials
|
||||||
vpxor xd3, xd3, xtmph3 ;xd3 += partial
|
vpxor xd3, xtmph3 ;xd3 += partial
|
||||||
|
|
||||||
; dest4
|
; dest4
|
||||||
vpshufb xtmph4, xtmph4, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph4, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft4_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpxor xtmph4, xtmph4, xtmpl ;GF add high and low partials
|
vpxor xtmph4, xgft4_lo ;GF add high and low partials
|
||||||
vpxor xd4, xd4, xtmph4 ;xd4 += partial
|
vpxor xd4, xtmph4 ;xd4 += partial
|
||||||
|
|
||||||
XSTR [dest1+pos], xd1
|
XSTR [dest1+pos], xd1
|
||||||
XSTR [dest2+pos], xd2
|
XSTR [dest2+pos], xd2
|
||||||
@@ -275,17 +285,14 @@ func(gf_4vect_mad_avx2)
|
|||||||
vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
|
vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
|
||||||
vpcmpgtb xtmpl, xtmpl, xtmph2
|
vpcmpgtb xtmpl, xtmpl, xtmph2
|
||||||
|
|
||||||
vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
|
|
||||||
vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
vbroadcasti128 xgft1_lo, [tmp2] ;Load array: lo | lo
|
||||||
vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
vbroadcasti128 xtmph1, [tmp2+16] ; hi | hi
|
||||||
|
vbroadcasti128 xgft2_lo, [tmp2+tmp3] ;Load array: lo | lo
|
||||||
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xtmph2, [tmp2+tmp3+16]; hi | hi
|
||||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
|
||||||
|
|
||||||
; dest1
|
; dest1
|
||||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||||
@@ -294,6 +301,9 @@ func(gf_4vect_mad_avx2)
|
|||||||
vpand xtmph1, xtmph1, xtmpl
|
vpand xtmph1, xtmph1, xtmpl
|
||||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft3_lo, [tmp2+2*tmp3] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph3, [tmp2+2*tmp3+16] ; hi | hi
|
||||||
|
|
||||||
; dest2
|
; dest2
|
||||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
@@ -301,6 +311,9 @@ func(gf_4vect_mad_avx2)
|
|||||||
vpand xtmph2, xtmph2, xtmpl
|
vpand xtmph2, xtmph2, xtmpl
|
||||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft4_lo, [tmp2+tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph4, [tmp2+tmp4+16]; hi | hi
|
||||||
|
|
||||||
; dest3
|
; dest3
|
||||||
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
|
|||||||
@@ -220,43 +220,35 @@ func(gf_5vect_dot_prod_avx2)
|
|||||||
.next_vect:
|
.next_vect:
|
||||||
mov ptr, [src+vec_i]
|
mov ptr, [src+vec_i]
|
||||||
XLDR x0, [ptr+pos] ;Get next source vector
|
XLDR x0, [ptr+pos] ;Get next source vector
|
||||||
add vec_i, PS
|
|
||||||
|
|
||||||
vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
vperm2i128 xtmpa, xgft4_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
|
||||||
vperm2i128 x0, xgft4_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
|
||||||
|
|
||||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
vbroadcasti128 xgft1_lo, [tmp] ;Load array: lo | lo
|
||||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
vbroadcasti128 xgft1_hi, [tmp+16] ; hi | hi
|
||||||
vmovdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
vbroadcasti128 xgft2_lo, [tmp+vskip1*1] ;Load array: lo | lo
|
||||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
vbroadcasti128 xgft2_hi, [tmp+vskip1*1+16] ; hi | hi
|
||||||
vmovdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
add vec_i, PS
|
||||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
vbroadcasti128 xgft3_lo, [tmp+vskip1*2] ;Load array: lo | lo
|
||||||
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
vbroadcasti128 xgft3_hi, [tmp+vskip1*2+16] ; hi | hi
|
||||||
; " Dx{00}, Dx{10}, ..., Dx{f0}
|
vbroadcasti128 xgft4_lo, [tmp+vskip3] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft4_hi, [tmp+vskip3+16] ; hi | hi
|
||||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
|
||||||
|
|
||||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||||
vpxor xp1, xgft1_hi ;xp1 += partial
|
vpxor xp1, xgft1_hi ;xp1 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft1_lo, [tmp+vskip1*4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft1_hi, [tmp+vskip1*4+16] ; hi | hi
|
||||||
|
add tmp, 32
|
||||||
|
|
||||||
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||||
vpxor xp2, xgft2_hi ;xp2 += partial
|
vpxor xp2, xgft2_hi ;xp2 += partial
|
||||||
|
|
||||||
vmovdqu xgft1_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
|
||||||
; " Ex{00}, Ex{10}, ..., Ex{f0}
|
|
||||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
|
||||||
add tmp, 32
|
|
||||||
|
|
||||||
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
|
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
|
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
|
||||||
|
|||||||
@@ -47,9 +47,11 @@
|
|||||||
%define tmp.w r11d
|
%define tmp.w r11d
|
||||||
%define tmp.b r11b
|
%define tmp.b r11b
|
||||||
%define tmp2 r10
|
%define tmp2 r10
|
||||||
|
%define tmp3 r13
|
||||||
|
%define tmp4 r14
|
||||||
|
%define tmp5 rdi
|
||||||
%define return rax
|
%define return rax
|
||||||
%define return.w eax
|
%define stack_size 16*10 + 5*8
|
||||||
%define stack_size 16*10 + 3*8
|
|
||||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||||
%define func(x) proc_frame x
|
%define func(x) proc_frame x
|
||||||
|
|
||||||
@@ -67,6 +69,9 @@
|
|||||||
vmovdqa [rsp+16*9],xmm15
|
vmovdqa [rsp+16*9],xmm15
|
||||||
save_reg r12, 10*16 + 0*8
|
save_reg r12, 10*16 + 0*8
|
||||||
save_reg r15, 10*16 + 1*8
|
save_reg r15, 10*16 + 1*8
|
||||||
|
save_reg r13, 10*16 + 2*8
|
||||||
|
save_reg r14, 10*16 + 3*8
|
||||||
|
save_reg rdi, 10*16 + 4*8
|
||||||
end_prolog
|
end_prolog
|
||||||
mov arg4, arg(4)
|
mov arg4, arg(4)
|
||||||
mov arg5, arg(5)
|
mov arg5, arg(5)
|
||||||
@@ -85,6 +90,9 @@
|
|||||||
vmovdqa xmm15, [rsp+16*9]
|
vmovdqa xmm15, [rsp+16*9]
|
||||||
mov r12, [rsp + 10*16 + 0*8]
|
mov r12, [rsp + 10*16 + 0*8]
|
||||||
mov r15, [rsp + 10*16 + 1*8]
|
mov r15, [rsp + 10*16 + 1*8]
|
||||||
|
mov r13, [rsp + 10*16 + 2*8]
|
||||||
|
mov r14, [rsp + 10*16 + 3*8]
|
||||||
|
mov rdi, [rsp + 10*16 + 4*8]
|
||||||
add rsp, stack_size
|
add rsp, stack_size
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
@@ -100,12 +108,22 @@
|
|||||||
%define tmp.w r11d
|
%define tmp.w r11d
|
||||||
%define tmp.b r11b
|
%define tmp.b r11b
|
||||||
%define tmp2 r10
|
%define tmp2 r10
|
||||||
|
%define tmp3 r12 ; must be saved and restored
|
||||||
|
%define tmp4 r13 ; must be saved and restored
|
||||||
|
%define tmp5 r14 ; must be saved and restored
|
||||||
%define return rax
|
%define return rax
|
||||||
%define return.w eax
|
|
||||||
|
|
||||||
%define func(x) x: endbranch
|
%define func(x) x: endbranch
|
||||||
%define FUNC_SAVE
|
%macro FUNC_SAVE 0
|
||||||
%define FUNC_RESTORE
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
%endmacro
|
||||||
|
%macro FUNC_RESTORE 0
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
%endmacro
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
|
;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
|
||||||
@@ -117,7 +135,6 @@
|
|||||||
%define src arg4
|
%define src arg4
|
||||||
%define dest1 arg5
|
%define dest1 arg5
|
||||||
%define pos return
|
%define pos return
|
||||||
%define pos.w return.w
|
|
||||||
|
|
||||||
%define dest2 tmp2
|
%define dest2 tmp2
|
||||||
%define dest3 mul_array
|
%define dest3 mul_array
|
||||||
@@ -177,20 +194,11 @@ func(gf_5vect_mad_avx2)
|
|||||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||||
|
|
||||||
sal vec_i, 5 ;Multiply by 32
|
sal vec_i, 5 ;Multiply by 32
|
||||||
sal vec, 5 ;Multiply by 32
|
|
||||||
lea tmp, [mul_array + vec_i]
|
lea tmp, [mul_array + vec_i]
|
||||||
|
mov tmp3, tmp
|
||||||
|
sal vec, 5 ;Multiply by 32
|
||||||
|
|
||||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
mov tmp4, vec
|
||||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
|
||||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
|
||||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
|
||||||
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
|
||||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
|
||||||
vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
|
||||||
; " Ex{00}, Ex{10}, ..., Ex{f0}
|
|
||||||
add tmp, vec
|
|
||||||
vmovdqu xgft4_lo, [tmp+2*vec] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
|
||||||
; " Dx{00}, Dx{10}, ..., Dx{f0}
|
|
||||||
|
|
||||||
mov dest3, [dest1+2*PS] ; reuse mul_array
|
mov dest3, [dest1+2*PS] ; reuse mul_array
|
||||||
mov dest4, [dest1+3*PS] ; reuse vec
|
mov dest4, [dest1+3*PS] ; reuse vec
|
||||||
@@ -198,6 +206,8 @@ func(gf_5vect_mad_avx2)
|
|||||||
mov dest2, [dest1+PS]
|
mov dest2, [dest1+PS]
|
||||||
mov dest1, [dest1]
|
mov dest1, [dest1]
|
||||||
|
|
||||||
|
lea tmp5, [tmp4+2*tmp4] ; vec*3, for addressing
|
||||||
|
|
||||||
.loop32:
|
.loop32:
|
||||||
XLDR x0, [src+pos] ;Get next source vector
|
XLDR x0, [src+pos] ;Get next source vector
|
||||||
|
|
||||||
@@ -210,44 +220,50 @@ func(gf_5vect_mad_avx2)
|
|||||||
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
|
||||||
vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
|
||||||
|
|
||||||
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft1_lo, [tmp] ;Load array: lo | lo
|
||||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xtmph1, [tmp+16] ; hi | hi
|
||||||
|
vbroadcasti128 xgft2_lo, [tmp+tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph2, [tmp+tmp4+16] ; hi | hi
|
||||||
|
|
||||||
; dest1
|
; dest1
|
||||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft1_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
|
vpxor xtmph1, xgft1_lo ;GF add high and low partials
|
||||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
vpxor xd1, xtmph1 ;xd1 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft3_lo, [tmp+2*tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph1, [tmp+2*tmp4+16] ; hi | hi
|
||||||
|
|
||||||
vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
|
||||||
; dest2
|
; dest2
|
||||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft2_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
|
vpxor xtmph2, xgft2_lo ;GF add high and low partials
|
||||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
vpxor xd2, xtmph2 ;xd2 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft4_lo, [tmp+tmp5] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph2, [tmp+tmp5+16] ; hi | hi
|
||||||
|
|
||||||
vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
|
||||||
; dest3
|
; dest3
|
||||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft3_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
|
vpxor xtmph1, xgft3_lo ;GF add high and low partials
|
||||||
vpxor xd3, xd3, xtmph1 ;xd3 += partial
|
vpxor xd3, xtmph1 ;xd3 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft5_lo, [tmp+4*tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph1, [tmp+4*tmp4+16] ; hi | hi
|
||||||
|
|
||||||
vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
|
||||||
; dest4
|
; dest4
|
||||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft4_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
|
vpxor xtmph2, xgft4_lo ;GF add high and low partials
|
||||||
vpxor xd4, xd4, xtmph2 ;xd4 += partial
|
vpxor xd4, xtmph2 ;xd4 += partial
|
||||||
|
|
||||||
; dest5
|
; dest5
|
||||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft5_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
|
vpxor xtmph1, xgft5_lo ;GF add high and low partials
|
||||||
vpxor xd5, xd5, xtmph1 ;xd5 += partial
|
vpxor xd5, xtmph1 ;xd5 += partial
|
||||||
|
|
||||||
XSTR [dest1+pos], xd1
|
XSTR [dest1+pos], xd1
|
||||||
XSTR [dest2+pos], xd2
|
XSTR [dest2+pos], xd2
|
||||||
@@ -288,14 +304,14 @@ func(gf_5vect_mad_avx2)
|
|||||||
vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
|
vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
|
||||||
vpcmpgtb xtmpl, xtmpl, xtmph2
|
vpcmpgtb xtmpl, xtmpl, xtmph2
|
||||||
|
|
||||||
vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
|
||||||
vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
|
||||||
|
|
||||||
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft1_lo, [tmp3] ;Load array: lo | lo
|
||||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xtmph1, [tmp3+16] ; hi | hi
|
||||||
|
vbroadcasti128 xgft2_lo, [tmp3+tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph2, [tmp3+tmp4+16]; hi | hi
|
||||||
|
|
||||||
; dest1
|
; dest1
|
||||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||||
@@ -304,7 +320,9 @@ func(gf_5vect_mad_avx2)
|
|||||||
vpand xtmph1, xtmph1, xtmpl
|
vpand xtmph1, xtmph1, xtmpl
|
||||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||||
|
|
||||||
vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft3_lo, [tmp3+2*tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph1, [tmp3+2*tmp4+16] ; hi | hi
|
||||||
|
|
||||||
; dest2
|
; dest2
|
||||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
@@ -312,7 +330,9 @@ func(gf_5vect_mad_avx2)
|
|||||||
vpand xtmph2, xtmph2, xtmpl
|
vpand xtmph2, xtmph2, xtmpl
|
||||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||||
|
|
||||||
vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft4_lo, [tmp3+tmp5] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph2, [tmp3+tmp5+16]; hi | hi
|
||||||
|
|
||||||
; dest3
|
; dest3
|
||||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
@@ -320,7 +340,9 @@ func(gf_5vect_mad_avx2)
|
|||||||
vpand xtmph1, xtmph1, xtmpl
|
vpand xtmph1, xtmph1, xtmpl
|
||||||
vpxor xd3, xd3, xtmph1 ;xd3 += partial
|
vpxor xd3, xd3, xtmph1 ;xd3 += partial
|
||||||
|
|
||||||
vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft5_lo, [tmp3+4*tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph1, [tmp3+4*tmp4+16] ; hi | hi
|
||||||
|
|
||||||
; dest4
|
; dest4
|
||||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
|
|||||||
@@ -218,52 +218,44 @@ func(gf_6vect_dot_prod_avx2)
|
|||||||
.next_vect:
|
.next_vect:
|
||||||
mov ptr, [src+vec_i]
|
mov ptr, [src+vec_i]
|
||||||
XLDR x0, [ptr+pos] ;Get next source vector
|
XLDR x0, [ptr+pos] ;Get next source vector
|
||||||
add vec_i, PS
|
|
||||||
|
|
||||||
vpand xgft3_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
vperm2i128 xtmpa, xgft3_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
|
||||||
vperm2i128 x0, xgft3_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
|
||||||
|
|
||||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
vbroadcasti128 xgft1_lo, [tmp] ;Load array: lo | lo
|
||||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
vbroadcasti128 xgft1_hi, [tmp+16] ; hi | hi
|
||||||
vmovdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
vbroadcasti128 xgft2_lo, [tmp+vskip1*1] ;Load array: lo | lo
|
||||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
vbroadcasti128 xgft2_hi, [tmp+vskip1*1+16] ; hi | hi
|
||||||
vmovdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
lea ptr, [vskip1 + vskip1*4]
|
||||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
vbroadcasti128 xgft3_lo, [tmp+vskip1*2] ;Load array: lo | lo
|
||||||
lea ptr, [vskip1 + vskip1*4] ;ptr = vskip5
|
vbroadcasti128 xgft3_hi, [tmp+vskip1*2+16] ; hi | hi
|
||||||
|
|
||||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
|
||||||
|
|
||||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||||
vpxor xp1, xgft1_hi ;xp1 += partial
|
vpxor xp1, xgft1_hi ;xp1 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft1_lo, [tmp+vskip3] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft1_hi, [tmp+vskip3+16] ; hi | hi
|
||||||
|
|
||||||
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||||
vpxor xp2, xgft2_hi ;xp2 += partial
|
vpxor xp2, xgft2_hi ;xp2 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft2_lo, [tmp+vskip1*4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xgft2_hi, [tmp+vskip1*4+16] ; hi | hi
|
||||||
|
|
||||||
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
|
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
|
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
|
||||||
vpxor xp3, xgft3_hi ;xp3 += partial
|
vpxor xp3, xgft3_hi ;xp3 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft3_lo, [tmp+ptr] ;Load array: lo | lo
|
||||||
vmovdqu xgft1_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
vbroadcasti128 xgft3_hi, [tmp+ptr+16] ; hi | hi
|
||||||
; " Dx{00}, Dx{10}, ..., Dx{f0}
|
|
||||||
vmovdqu xgft2_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
|
||||||
; " Ex{00}, Ex{10}, ..., Ex{f0}
|
|
||||||
vmovdqu xgft3_lo, [tmp+ptr] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
|
|
||||||
; " Fx{00}, Fx{10}, ..., Fx{f0}
|
|
||||||
add tmp, 32
|
add tmp, 32
|
||||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
add vec_i, PS
|
||||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
|
||||||
|
|
||||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
|
|||||||
@@ -48,9 +48,11 @@
|
|||||||
%define tmp.b r11b
|
%define tmp.b r11b
|
||||||
%define tmp2 r10
|
%define tmp2 r10
|
||||||
%define tmp3 r13
|
%define tmp3 r13
|
||||||
|
%define tmp4 rdi
|
||||||
|
%define tmp5 rsi
|
||||||
|
%define tmp6 r14
|
||||||
%define return rax
|
%define return rax
|
||||||
%define return.w eax
|
%define stack_size 16*10 + 7*8
|
||||||
%define stack_size 16*10 + 3*8
|
|
||||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||||
%define func(x) proc_frame x
|
%define func(x) proc_frame x
|
||||||
|
|
||||||
@@ -69,6 +71,9 @@
|
|||||||
save_reg r12, 10*16 + 0*8
|
save_reg r12, 10*16 + 0*8
|
||||||
save_reg r13, 10*16 + 1*8
|
save_reg r13, 10*16 + 1*8
|
||||||
save_reg r15, 10*16 + 2*8
|
save_reg r15, 10*16 + 2*8
|
||||||
|
save_reg rdi, 10*16 + 3*8
|
||||||
|
save_reg rsi, 10*16 + 4*8
|
||||||
|
save_reg r14, 10*16 + 5*8
|
||||||
end_prolog
|
end_prolog
|
||||||
mov arg4, arg(4)
|
mov arg4, arg(4)
|
||||||
mov arg5, arg(5)
|
mov arg5, arg(5)
|
||||||
@@ -88,6 +93,9 @@
|
|||||||
mov r12, [rsp + 10*16 + 0*8]
|
mov r12, [rsp + 10*16 + 0*8]
|
||||||
mov r13, [rsp + 10*16 + 1*8]
|
mov r13, [rsp + 10*16 + 1*8]
|
||||||
mov r15, [rsp + 10*16 + 2*8]
|
mov r15, [rsp + 10*16 + 2*8]
|
||||||
|
mov rdi, [rsp + 10*16 + 3*8]
|
||||||
|
mov rsi, [rsp + 10*16 + 4*8]
|
||||||
|
mov r14, [rsp + 10*16 + 5*8]
|
||||||
add rsp, stack_size
|
add rsp, stack_size
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
@@ -103,15 +111,23 @@
|
|||||||
%define tmp.w r11d
|
%define tmp.w r11d
|
||||||
%define tmp.b r11b
|
%define tmp.b r11b
|
||||||
%define tmp2 r10
|
%define tmp2 r10
|
||||||
%define tmp3 r12
|
%define tmp3 r12 ; must be saved and restored
|
||||||
|
%define tmp4 r13 ; must be saved and restored
|
||||||
|
%define tmp5 r14 ; must be saved and restored
|
||||||
|
%define tmp6 r15 ; must be saved and restored
|
||||||
%define return rax
|
%define return rax
|
||||||
%define return.w eax
|
|
||||||
|
|
||||||
%define func(x) x: endbranch
|
%define func(x) x: endbranch
|
||||||
%macro FUNC_SAVE 0
|
%macro FUNC_SAVE 0
|
||||||
push r12
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
%endmacro
|
%endmacro
|
||||||
%macro FUNC_RESTORE 0
|
%macro FUNC_RESTORE 0
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
pop r12
|
pop r12
|
||||||
%endmacro
|
%endmacro
|
||||||
%endif
|
%endif
|
||||||
@@ -125,7 +141,6 @@
|
|||||||
%define src arg4
|
%define src arg4
|
||||||
%define dest1 arg5
|
%define dest1 arg5
|
||||||
%define pos return
|
%define pos return
|
||||||
%define pos.w return.w
|
|
||||||
|
|
||||||
%define dest2 tmp3
|
%define dest2 tmp3
|
||||||
%define dest3 tmp2
|
%define dest3 tmp2
|
||||||
@@ -190,6 +205,7 @@ func(gf_6vect_mad_avx2)
|
|||||||
sal vec_i, 5 ;Multiply by 32
|
sal vec_i, 5 ;Multiply by 32
|
||||||
sal vec, 5 ;Multiply by 32
|
sal vec, 5 ;Multiply by 32
|
||||||
lea tmp, [mul_array + vec_i]
|
lea tmp, [mul_array + vec_i]
|
||||||
|
mov tmp6, tmp
|
||||||
mov vec_i, vec
|
mov vec_i, vec
|
||||||
mov mul_array, vec
|
mov mul_array, vec
|
||||||
sal vec_i, 1
|
sal vec_i, 1
|
||||||
@@ -197,18 +213,7 @@ func(gf_6vect_mad_avx2)
|
|||||||
add vec_i, vec ;vec_i=vec*96
|
add vec_i, vec ;vec_i=vec*96
|
||||||
add mul_array, vec_i ;vec_i=vec*160
|
add mul_array, vec_i ;vec_i=vec*160
|
||||||
|
|
||||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
mov tmp4, vec
|
||||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
|
||||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
|
||||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
|
||||||
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
|
||||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
|
||||||
vmovdqu xgft4_lo, [tmp+vec_i] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
|
|
||||||
; " Fx{00}, Fx{10}, ..., Fx{f0}
|
|
||||||
vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
|
||||||
; " Ex{00}, Ex{10}, ..., Ex{f0}
|
|
||||||
vmovdqu xgft6_lo, [tmp+mul_array] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
|
||||||
; " Dx{00}, Dx{10}, ..., Dx{f0}
|
|
||||||
|
|
||||||
mov dest2, [dest1+PS] ; reuse tmp3
|
mov dest2, [dest1+PS] ; reuse tmp3
|
||||||
mov dest3, [dest1+2*PS] ; reuse tmp2
|
mov dest3, [dest1+2*PS] ; reuse tmp2
|
||||||
@@ -225,57 +230,70 @@ func(gf_6vect_mad_avx2)
|
|||||||
XLDR xd4, [dest4+pos] ;Get next dest vector
|
XLDR xd4, [dest4+pos] ;Get next dest vector
|
||||||
XLDR xd5, [dest5+pos] ;Get next dest vector
|
XLDR xd5, [dest5+pos] ;Get next dest vector
|
||||||
|
|
||||||
|
lea tmp5, [tmp4+2*tmp4] ;3*vec, for addressing
|
||||||
|
|
||||||
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
|
||||||
vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
vbroadcasti128 xgft1_lo, [tmp] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp+16] ; hi | hi
|
||||||
|
|
||||||
;dest1
|
;dest1
|
||||||
vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
vpshufb xtmph, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
vpshufb xgft1_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
vpxor xtmph, xgft1_lo ;GF add high and low partials
|
||||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
vpxor xd1, xtmph ;xd1 += partial
|
||||||
vpxor xd1, xd1, xtmph ;xd1 += partial
|
|
||||||
|
|
||||||
XSTR [dest1+pos], xd1 ;Store result into dest1
|
XSTR [dest1+pos], xd1 ;Store result into dest1
|
||||||
|
|
||||||
;dest2
|
|
||||||
vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
|
||||||
vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
|
||||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
|
||||||
vpxor xd2, xd2, xtmph ;xd2 += partial
|
|
||||||
|
|
||||||
;dest3
|
|
||||||
vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
|
||||||
vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
|
||||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
|
||||||
vpxor xd3, xd3, xtmph ;xd3 += partial
|
|
||||||
|
|
||||||
XLDR xd6, [dest6+pos] ;reuse xd1. Get next dest vector
|
XLDR xd6, [dest6+pos] ;reuse xd1. Get next dest vector
|
||||||
|
|
||||||
|
vbroadcasti128 xgft2_lo, [tmp+tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp+tmp4+16] ; hi | hi
|
||||||
|
|
||||||
|
;dest2
|
||||||
|
vpshufb xtmph, x0 ;Lookup mul table of high nibble
|
||||||
|
vpshufb xgft2_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
|
vpxor xtmph, xgft2_lo ;GF add high and low partials
|
||||||
|
vpxor xd2, xtmph ;xd2 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft3_lo, [tmp+2*tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp+2*tmp4+16] ; hi | hi
|
||||||
|
|
||||||
|
;dest3
|
||||||
|
vpshufb xtmph, x0 ;Lookup mul table of high nibble
|
||||||
|
vpshufb xgft3_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
|
vpxor xtmph, xgft3_lo ;GF add high and low partials
|
||||||
|
vpxor xd3, xtmph ;xd3 += partial
|
||||||
|
|
||||||
|
vbroadcasti128 xgft4_lo, [tmp+tmp5] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp+tmp5+16] ; hi | hi
|
||||||
|
|
||||||
|
lea tmp5, [tmp5+2*tmp4] ;5*vec, for addressing
|
||||||
|
|
||||||
;dest4
|
;dest4
|
||||||
vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
vpshufb xtmph, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
vpshufb xgft4_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
vpxor xtmph, xgft4_lo ;GF add high and low partials
|
||||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
vpxor xd4, xtmph ;xd4 += partial
|
||||||
vpxor xd4, xd4, xtmph ;xd4 += partial
|
|
||||||
|
vbroadcasti128 xgft5_lo, [tmp+4*tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp+4*tmp4+16] ; hi | hi
|
||||||
|
|
||||||
;dest5
|
;dest5
|
||||||
vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
vpshufb xtmph, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
vpshufb xgft5_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
vpxor xtmph, xgft5_lo ;GF add high and low partials
|
||||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
vpxor xd5, xtmph ;xd5 += partial
|
||||||
vpxor xd5, xd5, xtmph ;xd5 += partial
|
|
||||||
|
vbroadcasti128 xgft6_lo, [tmp+tmp5] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp+tmp5+16] ; hi | hi
|
||||||
|
|
||||||
;dest6
|
;dest6
|
||||||
vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
|
vpshufb xtmph, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
vpshufb xgft6_lo, xtmpl ;Lookup mul table of low nibble
|
||||||
vpshufb xtmpl, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
vpxor xtmph, xgft6_lo ;GF add high and low partials
|
||||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
vpxor xd6, xtmph ;xd6 += partial
|
||||||
vpxor xd6, xd6, xtmph ;xd6 += partial
|
|
||||||
|
|
||||||
XSTR [dest2+pos], xd2 ;Store result into dest2
|
XSTR [dest2+pos], xd2 ;Store result into dest2
|
||||||
XSTR [dest3+pos], xd3 ;Store result into dest3
|
XSTR [dest3+pos], xd3 ;Store result into dest3
|
||||||
@@ -308,20 +326,21 @@ func(gf_6vect_mad_avx2)
|
|||||||
XLDR xd5, [dest5+tmp] ;Get next dest vector
|
XLDR xd5, [dest5+tmp] ;Get next dest vector
|
||||||
|
|
||||||
sub len, pos
|
sub len, pos
|
||||||
|
lea tmp5, [tmp4+2*tmp4] ;3*vec, for addressing
|
||||||
|
|
||||||
vpinsrb xtmplx, xtmplx, len.w, 15
|
vpinsrb xtmplx, xtmplx, len.w, 15
|
||||||
vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
|
vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
|
||||||
vpshufb xtmpl, xtmpl, xtmph ;Broadcast len to all bytes. xtmph=0x1f1f1f...
|
vpshufb xtmpl, xtmpl, xtmph ;Broadcast len to all bytes. xtmph=0x1f1f1f...
|
||||||
vpcmpgtb xtmpl, xtmpl, [constip32]
|
vpcmpgtb xtmpl, xtmpl, [constip32]
|
||||||
|
|
||||||
vpand xtmph, x0, xmask0f ;Mask low src nibble in bits 4-0
|
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||||
vperm2i128 xtmpa, xtmph, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
|
||||||
vperm2i128 x0, xtmph, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
|
||||||
|
|
||||||
;dest1
|
;dest1
|
||||||
vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft1_lo, [tmp6] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp6+16] ; hi | hi
|
||||||
|
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xtmph, xtmph, xgft1_lo ;GF add high and low partials
|
vpxor xtmph, xtmph, xgft1_lo ;GF add high and low partials
|
||||||
@@ -329,9 +348,12 @@ func(gf_6vect_mad_avx2)
|
|||||||
vpxor xd1, xd1, xtmph ;xd1 += partial
|
vpxor xd1, xd1, xtmph ;xd1 += partial
|
||||||
|
|
||||||
XSTR [dest1+tmp], xd1 ;Store result into dest1
|
XSTR [dest1+tmp], xd1 ;Store result into dest1
|
||||||
|
XLDR xd6, [dest6+tmp] ;reuse xd1. Get next dest vector
|
||||||
|
|
||||||
;dest2
|
;dest2
|
||||||
vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft2_lo, [tmp6+tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp6+tmp4+16]; hi | hi
|
||||||
|
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xtmph, xtmph, xgft2_lo ;GF add high and low partials
|
vpxor xtmph, xtmph, xgft2_lo ;GF add high and low partials
|
||||||
@@ -339,25 +361,31 @@ func(gf_6vect_mad_avx2)
|
|||||||
vpxor xd2, xd2, xtmph ;xd2 += partial
|
vpxor xd2, xd2, xtmph ;xd2 += partial
|
||||||
|
|
||||||
;dest3
|
;dest3
|
||||||
vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft3_lo, [tmp6+2*tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp6+2*tmp4+16] ; hi | hi
|
||||||
|
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xtmph, xtmph, xgft3_lo ;GF add high and low partials
|
vpxor xtmph, xtmph, xgft3_lo ;GF add high and low partials
|
||||||
vpand xtmph, xtmph, xtmpl
|
vpand xtmph, xtmph, xtmpl
|
||||||
vpxor xd3, xd3, xtmph ;xd3 += partial
|
vpxor xd3, xd3, xtmph ;xd3 += partial
|
||||||
|
|
||||||
XLDR xd6, [dest6+tmp] ;reuse xd1. Get next dest vector
|
|
||||||
|
|
||||||
;dest4
|
;dest4
|
||||||
vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft4_lo, [tmp6+tmp5] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp6+tmp5+16]; hi | hi
|
||||||
|
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xtmph, xtmph, xgft4_lo ;GF add high and low partials
|
vpxor xtmph, xtmph, xgft4_lo ;GF add high and low partials
|
||||||
vpand xtmph, xtmph, xtmpl
|
vpand xtmph, xtmph, xtmpl
|
||||||
vpxor xd4, xd4, xtmph ;xd4 += partial
|
vpxor xd4, xd4, xtmph ;xd4 += partial
|
||||||
|
|
||||||
|
lea tmp5, [tmp5+2*tmp4] ; 5*vec, for addressing
|
||||||
|
|
||||||
;dest5
|
;dest5
|
||||||
vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft5_lo, [tmp6+4*tmp4] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp6+4*tmp4+16] ; hi | hi
|
||||||
|
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xtmph, xtmph, xgft5_lo ;GF add high and low partials
|
vpxor xtmph, xtmph, xgft5_lo ;GF add high and low partials
|
||||||
@@ -365,7 +393,9 @@ func(gf_6vect_mad_avx2)
|
|||||||
vpxor xd5, xd5, xtmph ;xd5 += partial
|
vpxor xd5, xd5, xtmph ;xd5 += partial
|
||||||
|
|
||||||
;dest6
|
;dest6
|
||||||
vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
|
vbroadcasti128 xgft6_lo, [tmp6+tmp5] ;Load array: lo | lo
|
||||||
|
vbroadcasti128 xtmph, [tmp6+tmp5+16]; hi | hi
|
||||||
|
|
||||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||||
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||||
vpxor xtmph, xtmph, xgft6_lo ;GF add high and low partials
|
vpxor xtmph, xtmph, xgft6_lo ;GF add high and low partials
|
||||||
|
|||||||
@@ -151,11 +151,8 @@ func(gf_vect_dot_prod_avx2)
|
|||||||
|
|
||||||
mov ptr, [src+vec_i*PS]
|
mov ptr, [src+vec_i*PS]
|
||||||
|
|
||||||
vmovdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
vbroadcasti128 xgft_lo, [tmp] ;Load array: lo | lo
|
||||||
; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
vbroadcasti128 xgft_hi, [tmp+16] ; hi | hi
|
||||||
vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
|
|
||||||
vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
|
|
||||||
|
|
||||||
XLDR x0, [ptr+pos] ;Get next source vector
|
XLDR x0, [ptr+pos] ;Get next source vector
|
||||||
|
|
||||||
add tmp, 32
|
add tmp, 32
|
||||||
|
|||||||
@@ -150,10 +150,8 @@ func(gf_vect_mad_avx2)
|
|||||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||||
|
|
||||||
sal vec_i, 5 ;Multiply by 32
|
sal vec_i, 5 ;Multiply by 32
|
||||||
vmovdqu xgft_lo, [vec_i+mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
vbroadcasti128 xgft_lo, [vec_i+mul_array] ;Load array: lo | lo
|
||||||
; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
vbroadcasti128 xgft_hi, [vec_i+mul_array+16] ; hi | hi
|
||||||
vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
|
|
||||||
vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
|
|
||||||
|
|
||||||
XLDR xtmpd, [dest+len] ;backup the last 32 bytes in dest
|
XLDR xtmpd, [dest+len] ;backup the last 32 bytes in dest
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user