erasure_code: load xgfts with VBROADCASTI128 in x86 AVX2 impl

To generate the side-by-side pattern of two 128-bit xgfts within a
YMM reg, loading them directly from memory with VBROADCASTI128 can be
faster than loading and then swapping them with VMOVDQU + VPERM2I128.

Remove some out-of-date macros as well.

Signed-off-by: Maodi Ma <mamaodi@hygon.cn>
Author:    Maodi Ma <mamaodi@hygon.cn>
Date:      2025-09-24 16:57:32 +00:00
Committer: Pablo de Lara
Parent:    63471d7f29
Commit:    a439f0dd5d

12 changed files with 325 additions and 307 deletions
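For illustration, here is the pattern change in isolation, a standalone sketch
using the patch's register names (tmp points at one 32-byte GF multiply table,
low-nibble half first):

    ; before: one 32-byte load, then two cross-lane permutes
    vmovdqu        xgft1_lo, [tmp]                     ; lo | hi
    vperm2i128     xgft1_hi, xgft1_lo, xgft1_lo, 0x11  ; hi | hi
    vperm2i128     xgft1_lo, xgft1_lo, xgft1_lo, 0x00  ; lo | lo

    ; after: two 16-byte broadcast loads straight from memory
    vbroadcasti128 xgft1_lo, [tmp]                     ; lo | lo
    vbroadcasti128 xgft1_hi, [tmp+16]                  ; hi | hi

VBROADCASTI128 reads a 128-bit memory operand and duplicates it into both lanes
of the destination YMM register, so the lo|lo and hi|hi pairs are produced
directly by the loads and the shuffle-port VPERM2I128 uops drop out of the loop.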

erasure_code/gf_2vect_dot_prod_avx2.asm

@@ -183,24 +183,19 @@ func(gf_2vect_dot_prod_avx2)
 .next_vect:
 	SLDR	src, src_m
 	mov	ptr, [src+vec_i]
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					;     "     Bx{00}, Bx{10}, ..., Bx{f0}
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
 	XLDR	x0, [ptr+pos]		;Get next source vector
-	add	tmp, 32
-	add	vec_i, PS
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+	vbroadcasti128 xgft1_lo, [tmp]			;Load array: lo | lo
+	vbroadcasti128 xgft1_hi, [tmp+16]		;            hi | hi
+	add	vec_i, PS
+	vbroadcasti128 xgft2_lo, [tmp+vec*(32/PS)]	;Load array: lo | lo
+	vbroadcasti128 xgft2_hi, [tmp+vec*(32/PS)+16]	;            hi | hi
+	add	tmp, 32
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials

erasure_code/gf_2vect_mad_avx2.asm

@@ -175,17 +175,14 @@ func(gf_2vect_mad_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 	sal	vec_i, 5		;Multiply by 32
-	sal	vec, 5
 	lea	tmp, [mul_array + vec_i]
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					;     "     Bx{00}, Bx{10}, ..., Bx{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+	sal	vec, 5
+	vbroadcasti128 xgft1_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128 xgft1_hi, [tmp+16]	;            hi | hi
+	vbroadcasti128 xgft2_lo, [tmp+vec]	;Load array: lo | lo
+	vbroadcasti128 xgft2_hi, [tmp+vec+16]	;            hi | hi
 	mov	dest2, [dest1+PS]	; reuse mul_array
 	mov	dest1, [dest1]

erasure_code/gf_3vect_dot_prod_avx2.asm

@@ -199,29 +199,21 @@ func(gf_3vect_dot_prod_avx2)
 .next_vect:
 	SLDR	src, src_m
 	mov	ptr, [src+vec_i]
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					;     "     Bx{00}, Bx{10}, ..., Bx{f0}
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
-	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-					;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
-	add	tmp, 32
-	add	vec_i, PS
 	XLDR	x0, [ptr+pos]		;Get next source vector
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+	vbroadcasti128 xgft1_lo, [tmp]			;Load array: lo | lo
+	vbroadcasti128 xgft1_hi, [tmp+16]		;            hi | hi
+	vbroadcasti128 xgft2_lo, [tmp+vec*(32/PS)]	;Load array: lo | lo
+	vbroadcasti128 xgft2_hi, [tmp+vec*(32/PS)+16]	;            hi | hi
+	add	vec_i, PS
+	vbroadcasti128 xgft3_lo, [tmp+vec*(64/PS)]	;Load array: lo | lo
+	vbroadcasti128 xgft3_hi, [tmp+vec*(64/PS)+16]	;            hi | hi
+	add	tmp, 32
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials

erasure_code/gf_3vect_mad_avx2.asm

@@ -47,6 +47,7 @@
 %define tmp r11
 %define tmp.w r11d
 %define tmp.b r11b
+%define tmp2 r10
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8
@@ -100,6 +101,7 @@
 %define tmp r11
 %define tmp.w r11d
 %define tmp.b r11b
+%define tmp2 r10
 %define return rax
 %define return.w eax
@@ -149,6 +151,7 @@ section .text
 %define xgft1_hi ymm13
 %define xgft2_lo ymm12
 %define xgft3_lo ymm11
+%define xgft2_hi xgft3_lo	; Reuse ymm11
 %define x0 ymm0
 %define xtmpa ymm1
@@ -176,18 +179,15 @@ func(gf_3vect_mad_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 	sal	vec_i, 5		;Multiply by 32
-	sal	vec, 5
 	lea	tmp, [mul_array + vec_i]
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
-					;     "     Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
-					;     "     Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+	mov	tmp2, tmp
+	sal	vec, 5
+	vbroadcasti128 xgft1_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128 xgft1_hi, [tmp+16]	;            hi | hi
+	vbroadcasti128 xgft2_lo, [tmp+vec]	;Load array: lo | lo
+	vbroadcasti128 xgft2_hi, [tmp+vec+16]	;            hi | hi
 	mov	dest2, [dest1+PS]	; reuse mul_array
 	mov	dest3, [dest1+2*PS]	; reuse vec_i
 	mov	dest1, [dest1]
@@ -197,11 +197,9 @@ func(gf_3vect_mad_avx2)
 	XLDR	xd1, [dest1+pos]	;Get next dest vector
 	XLDR	xd2, [dest2+pos]	;Get next dest vector
 	XLDR	xd3, [dest3+pos]	;Get next dest vector
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xtmpl2, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
-	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xtmpl3, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
+	vbroadcasti128 xtmpl3, [tmp+2*vec]	;Load array: lo | lo
+	vbroadcasti128 xtmph3, [tmp+2*vec+16]	;            hi | hi
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
@@ -214,10 +212,10 @@ func(gf_3vect_mad_avx2)
 	vpxor	xd1, xd1, xtmph1	;xd1 += partial
 	; dest2
-	vpshufb	xtmph2, x0		;Lookup mul table of high nibble
-	vpshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
-	vpxor	xtmph2, xtmpl2		;GF add high and low partials
-	vpxor	xd2, xtmph2		;xd2 += partial
+	vpshufb	xtmph2, xgft2_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl2	;GF add high and low partials
+	vpxor	xd2, xd2, xtmph2	;xd2 += partial
 	; dest3
 	vpshufb	xtmph3, x0		;Lookup mul table of high nibble
@@ -259,11 +257,10 @@ func(gf_3vect_mad_avx2)
 	vpshufb	xtmpl3, xtmpl3, xtmpl2	;Broadcast len to all bytes. xtmpl2=0x1f1f1f...
 	vpcmpgtb xtmpl3, xtmpl3, xtmph3
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
-	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
+	vbroadcasti128 xgft2_lo, [tmp2+vec]	;Load array: lo | lo
+	vbroadcasti128 xtmph2, [tmp2+vec+16]	;            hi | hi
+	vbroadcasti128 xgft3_lo, [tmp2+2*vec]	;Load array: lo | lo
+	vbroadcasti128 xtmph3, [tmp2+2*vec+16]	;            hi | hi
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0

erasure_code/gf_4vect_dot_prod_avx2.asm

@@ -228,26 +228,19 @@ func(gf_4vect_dot_prod_avx2)
 	mov	ptr, [src+vec_i]
 	XLDR	x0, [ptr+pos]		;Get next source vector
-	add	vec_i, PS
-	vpand	xgft4_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xgft4_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xgft4_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					;     "     Bx{00}, Bx{10}, ..., Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-					;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-	vmovdqu	xgft4_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
-					;     "     Dx{00}, Dx{10}, ..., Dx{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft1_lo, [tmp]			;Load array: lo | lo
+	vbroadcasti128 xgft1_hi, [tmp+16]		;            hi | hi
+	vbroadcasti128 xgft2_lo, [tmp+vec*(32/PS)]	;Load array: lo | lo
+	vbroadcasti128 xgft2_hi, [tmp+vec*(32/PS)+16]	;            hi | hi
+	add	vec_i, PS
+	vbroadcasti128 xgft3_lo, [tmp+vec*(64/PS)]	;Load array: lo | lo
+	vbroadcasti128 xgft3_hi, [tmp+vec*(64/PS)+16]	;            hi | hi
+	vbroadcasti128 xgft4_lo, [tmp+vskip3]		;Load array: lo | lo
+	vbroadcasti128 xgft4_hi, [tmp+vskip3+16]	;            hi | hi
 	add	tmp, 32
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble

erasure_code/gf_4vect_mad_avx2.asm

@@ -46,9 +46,11 @@
 %define tmp r11
 %define tmp.w r11d
 %define tmp.b r11b
+%define tmp2 r13
+%define tmp3 r14
+%define tmp4 r10
 %define return rax
-%define return.w eax
-%define stack_size 16*10 + 3*8
+%define stack_size 16*10 + 5*8
 %define arg(x) [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x
@@ -66,6 +68,8 @@
vmovdqa [rsp+16*9],xmm15 vmovdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8 save_reg r12, 10*16 + 0*8
save_reg r15, 10*16 + 1*8 save_reg r15, 10*16 + 1*8
save_reg r13, 10*16 + 2*8
save_reg r14, 10*16 + 3*8
end_prolog end_prolog
mov arg4, arg(4) mov arg4, arg(4)
mov arg5, arg(5) mov arg5, arg(5)
@@ -84,6 +88,8 @@
 	vmovdqa	xmm15, [rsp+16*9]
 	mov	r12, [rsp + 10*16 + 0*8]
 	mov	r15, [rsp + 10*16 + 1*8]
+	mov	r13, [rsp + 10*16 + 2*8]
+	mov	r14, [rsp + 10*16 + 3*8]
 	add	rsp, stack_size
 %endmacro
@@ -98,12 +104,20 @@
 %define tmp r11
 %define tmp.w r11d
 %define tmp.b r11b
+%define tmp2 r10
+%define tmp3 r12	; must be saved and restored
+%define tmp4 r13	; must be saved and restored
 %define return rax
-%define return.w eax
 %define func(x) x: endbranch
-%define FUNC_SAVE
-%define FUNC_RESTORE
+%macro FUNC_SAVE 0
+	push	r12
+	push	r13
+%endmacro
+%macro FUNC_RESTORE 0
+	pop	r13
+	pop	r12
+%endmacro
 %endif
@@ -116,7 +130,6 @@
 %define src arg4
 %define dest1 arg5
 %define pos return
-%define pos.w return.w
 %define dest2 mul_array
 %define dest3 vec
@@ -176,24 +189,18 @@ func(gf_4vect_mad_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 	sal	vec_i, 5		;Multiply by 32
-	sal	vec, 5			;Multiply by 32
 	lea	tmp, [mul_array + vec_i]
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
-					;     "     Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
-					;     "     Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
-					;     "     Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
-	add	tmp, vec
-	vmovdqu	xgft4_lo, [tmp+2*vec]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
-					;     "     Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+	mov	tmp2, tmp
+	sal	vec, 5			;Multiply by 32
+	mov	tmp3, vec
 	mov	dest2, [dest1+PS]	; reuse mul_array
 	mov	dest3, [dest1+2*PS]	; reuse vec
 	mov	dest4, [dest1+3*PS]	; reuse vec_i
 	mov	dest1, [dest1]
+	lea	tmp4, [tmp3+2*tmp3]
 .loop32:
 	XLDR	x0, [src+pos]		;Get next source vector
@@ -206,37 +213,40 @@ func(gf_4vect_mad_avx2)
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xtmpl, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmpl, x0, 0x12		;swap x0 from 1hi|2hi to 1hi|2lo
-	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft1_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128 xtmph1, [tmp+16]		;            hi | hi
+	vbroadcasti128 xgft2_lo, [tmp+tmp3]	;Load array: lo | lo
+	vbroadcasti128 xtmph2, [tmp+tmp3+16]	;            hi | hi
 	; dest1
-	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft1_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph1, xtmph1, xtmpl	;GF add high and low partials
-	vpxor	xd1, xd1, xtmph1	;xd1 += partial
+	vpshufb	xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph1, xgft1_lo	;GF add high and low partials
+	vpxor	xd1, xtmph1		;xd1 += partial
+	vbroadcasti128 xgft3_lo, [tmp+2*tmp3]	;Load array: lo | lo
+	vbroadcasti128 xtmph3, [tmp+2*tmp3+16]	;            hi | hi
 	; dest2
-	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft2_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph2, xtmph2, xtmpl	;GF add high and low partials
-	vpxor	xd2, xd2, xtmph2	;xd2 += partial
+	vpshufb	xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph2, xgft2_lo	;GF add high and low partials
+	vpxor	xd2, xtmph2		;xd2 += partial
+	vbroadcasti128 xgft4_lo, [tmp+tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph4, [tmp+tmp4+16]	;            hi | hi
 	; dest3
-	vpshufb	xtmph3, xtmph3, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft3_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph3, xtmph3, xtmpl	;GF add high and low partials
-	vpxor	xd3, xd3, xtmph3	;xd3 += partial
+	vpshufb	xtmph3, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph3, xgft3_lo	;GF add high and low partials
+	vpxor	xd3, xtmph3		;xd3 += partial
 	; dest4
-	vpshufb	xtmph4, xtmph4, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft4_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph4, xtmph4, xtmpl	;GF add high and low partials
-	vpxor	xd4, xd4, xtmph4	;xd4 += partial
+	vpshufb	xtmph4, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph4, xgft4_lo	;GF add high and low partials
+	vpxor	xd4, xtmph4		;xd4 += partial
 	XSTR	[dest1+pos], xd1
 	XSTR	[dest2+pos], xd2
@@ -275,17 +285,14 @@ func(gf_4vect_mad_avx2)
vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f... vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
vpcmpgtb xtmpl, xtmpl, xtmph2 vpcmpgtb xtmpl, xtmpl, xtmph2
vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0 vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi vbroadcasti128 xgft1_lo, [tmp2] ;Load array: lo | lo
vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo vbroadcasti128 xtmph1, [tmp2+16] ; hi | hi
vbroadcasti128 xgft2_lo, [tmp2+tmp3] ;Load array: lo | lo
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo vbroadcasti128 xtmph2, [tmp2+tmp3+16]; hi | hi
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
; dest1 ; dest1
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
@@ -294,6 +301,9 @@ func(gf_4vect_mad_avx2)
 	vpand	xtmph1, xtmph1, xtmpl
 	vpxor	xd1, xd1, xtmph1	;xd1 += partial
+	vbroadcasti128 xgft3_lo, [tmp2+2*tmp3]	;Load array: lo | lo
+	vbroadcasti128 xtmph3, [tmp2+2*tmp3+16]	;            hi | hi
 	; dest2
 	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
@@ -301,6 +311,9 @@ func(gf_4vect_mad_avx2)
 	vpand	xtmph2, xtmph2, xtmpl
 	vpxor	xd2, xd2, xtmph2	;xd2 += partial
+	vbroadcasti128 xgft4_lo, [tmp2+tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph4, [tmp2+tmp4+16]	;            hi | hi
 	; dest3
 	vpshufb	xtmph3, xtmph3, x0	;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble

erasure_code/gf_5vect_dot_prod_avx2.asm

@@ -220,43 +220,35 @@ func(gf_5vect_dot_prod_avx2)
 .next_vect:
 	mov	ptr, [src+vec_i]
 	XLDR	x0, [ptr+pos]		;Get next source vector
-	add	vec_i, PS
-	vpand	xgft4_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xgft4_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xgft4_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vskip1*1]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					;     "     Bx{00}, Bx{10}, ..., Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+vskip1*2]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-					;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-	vmovdqu	xgft4_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
-					;     "     Dx{00}, Dx{10}, ..., Dx{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft1_lo, [tmp]			;Load array: lo | lo
+	vbroadcasti128 xgft1_hi, [tmp+16]		;            hi | hi
+	vbroadcasti128 xgft2_lo, [tmp+vskip1*1]		;Load array: lo | lo
+	vbroadcasti128 xgft2_hi, [tmp+vskip1*1+16]	;            hi | hi
+	add	vec_i, PS
+	vbroadcasti128 xgft3_lo, [tmp+vskip1*2]		;Load array: lo | lo
+	vbroadcasti128 xgft3_hi, [tmp+vskip1*2+16]	;            hi | hi
+	vbroadcasti128 xgft4_lo, [tmp+vskip3]		;Load array: lo | lo
+	vbroadcasti128 xgft4_hi, [tmp+vskip3+16]	;            hi | hi
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
+	vbroadcasti128 xgft1_lo, [tmp+vskip1*4]		;Load array: lo | lo
+	vbroadcasti128 xgft1_hi, [tmp+vskip1*4+16]	;            hi | hi
+	add	tmp, 32
 	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
 	vpxor	xp2, xgft2_hi		;xp2 += partial
-	vmovdqu	xgft1_lo, [tmp+vskip1*4]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
-					;     "     Ex{00}, Ex{10}, ..., Ex{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	add	tmp, 32
 	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials

erasure_code/gf_5vect_mad_avx2.asm

@@ -47,9 +47,11 @@
%define tmp.w r11d %define tmp.w r11d
%define tmp.b r11b %define tmp.b r11b
%define tmp2 r10 %define tmp2 r10
%define tmp3 r13
%define tmp4 r14
%define tmp5 rdi
%define return rax %define return rax
%define return.w eax %define stack_size 16*10 + 5*8
%define stack_size 16*10 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x] %define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x %define func(x) proc_frame x
@@ -67,6 +69,9 @@
 	vmovdqa	[rsp+16*9], xmm15
 	save_reg	r12, 10*16 + 0*8
 	save_reg	r15, 10*16 + 1*8
+	save_reg	r13, 10*16 + 2*8
+	save_reg	r14, 10*16 + 3*8
+	save_reg	rdi, 10*16 + 4*8
 	end_prolog
 	mov	arg4, arg(4)
 	mov	arg5, arg(5)
@@ -85,6 +90,9 @@
 	vmovdqa	xmm15, [rsp+16*9]
 	mov	r12, [rsp + 10*16 + 0*8]
 	mov	r15, [rsp + 10*16 + 1*8]
+	mov	r13, [rsp + 10*16 + 2*8]
+	mov	r14, [rsp + 10*16 + 3*8]
+	mov	rdi, [rsp + 10*16 + 4*8]
 	add	rsp, stack_size
 %endmacro
@@ -100,12 +108,22 @@
 %define tmp.w r11d
 %define tmp.b r11b
 %define tmp2 r10
+%define tmp3 r12	; must be saved and restored
+%define tmp4 r13	; must be saved and restored
+%define tmp5 r14	; must be saved and restored
 %define return rax
-%define return.w eax
 %define func(x) x: endbranch
-%define FUNC_SAVE
-%define FUNC_RESTORE
+%macro FUNC_SAVE 0
+	push	r12
+	push	r13
+	push	r14
+%endmacro
+%macro FUNC_RESTORE 0
+	pop	r14
+	pop	r13
+	pop	r12
+%endmacro
 %endif

 ;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
@@ -117,7 +135,6 @@
 %define src arg4
 %define dest1 arg5
 %define pos return
-%define pos.w return.w
 %define dest2 tmp2
 %define dest3 mul_array
@@ -177,20 +194,11 @@ func(gf_5vect_mad_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 	sal	vec_i, 5		;Multiply by 32
-	sal	vec, 5			;Multiply by 32
 	lea	tmp, [mul_array + vec_i]
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					;     "     Bx{00}, Bx{10}, ..., Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-					;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-	vmovdqu	xgft5_lo, [tmp+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
-					;     "     Ex{00}, Ex{10}, ..., Ex{f0}
-	add	tmp, vec
-	vmovdqu	xgft4_lo, [tmp+2*vec]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
-					;     "     Dx{00}, Dx{10}, ..., Dx{f0}
+	mov	tmp3, tmp
+	sal	vec, 5			;Multiply by 32
+	mov	tmp4, vec
 	mov	dest3, [dest1+2*PS]	; reuse mul_array
 	mov	dest4, [dest1+3*PS]	; reuse vec
@@ -198,6 +206,8 @@ func(gf_5vect_mad_avx2)
 	mov	dest2, [dest1+PS]
 	mov	dest1, [dest1]
+	lea	tmp5, [tmp4+2*tmp4]	; vec*3, for addressing
 .loop32:
 	XLDR	x0, [src+pos]		;Get next source vector
@@ -210,44 +220,50 @@ func(gf_5vect_mad_avx2)
 	vpand	xtmpl, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xtmpl, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmpl, x0, 0x12		;swap x0 from 1hi|2hi to 1hi|2lo
-	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft1_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128 xtmph1, [tmp+16]		;            hi | hi
+	vbroadcasti128 xgft2_lo, [tmp+tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph2, [tmp+tmp4+16]	;            hi | hi
 	; dest1
-	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft1_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph1, xtmph1, xtmpl	;GF add high and low partials
-	vpxor	xd1, xd1, xtmph1	;xd1 += partial
-	vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph1, xgft1_lo	;GF add high and low partials
+	vpxor	xd1, xtmph1		;xd1 += partial
+	vbroadcasti128 xgft3_lo, [tmp+2*tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph1, [tmp+2*tmp4+16]	;            hi | hi
 	; dest2
-	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft2_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph2, xtmph2, xtmpl	;GF add high and low partials
-	vpxor	xd2, xd2, xtmph2	;xd2 += partial
-	vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph2, xgft2_lo	;GF add high and low partials
+	vpxor	xd2, xtmph2		;xd2 += partial
+	vbroadcasti128 xgft4_lo, [tmp+tmp5]	;Load array: lo | lo
+	vbroadcasti128 xtmph2, [tmp+tmp5+16]	;            hi | hi
 	; dest3
-	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft3_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph1, xtmph1, xtmpl	;GF add high and low partials
-	vpxor	xd3, xd3, xtmph1	;xd3 += partial
-	vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+	vpshufb	xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph1, xgft3_lo	;GF add high and low partials
+	vpxor	xd3, xtmph1		;xd3 += partial
+	vbroadcasti128 xgft5_lo, [tmp+4*tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph1, [tmp+4*tmp4+16]	;            hi | hi
 	; dest4
-	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft4_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph2, xtmph2, xtmpl	;GF add high and low partials
-	vpxor	xd4, xd4, xtmph2	;xd4 += partial
+	vpshufb	xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph2, xgft4_lo	;GF add high and low partials
+	vpxor	xd4, xtmph2		;xd4 += partial
 	; dest5
-	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft5_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph1, xtmph1, xtmpl	;GF add high and low partials
-	vpxor	xd5, xd5, xtmph1	;xd5 += partial
+	vpshufb	xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft5_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph1, xgft5_lo	;GF add high and low partials
+	vpxor	xd5, xtmph1		;xd5 += partial
 	XSTR	[dest1+pos], xd1
 	XSTR	[dest2+pos], xd2
@@ -288,14 +304,14 @@ func(gf_5vect_mad_avx2)
 	vpshufb	xtmpl, xtmpl, xtmph1	;Broadcast len to all bytes. xtmph1=0x1f1f1f...
 	vpcmpgtb xtmpl, xtmpl, xtmph2
-	vpand	xtmph1, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xtmph1, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmph1, x0, 0x12		;swap x0 from 1hi|2hi to 1hi|2lo
-	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft1_lo, [tmp3]		;Load array: lo | lo
+	vbroadcasti128 xtmph1, [tmp3+16]	;            hi | hi
+	vbroadcasti128 xgft2_lo, [tmp3+tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph2, [tmp3+tmp4+16]	;            hi | hi
 	; dest1
 	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
@@ -304,7 +320,9 @@ func(gf_5vect_mad_avx2)
 	vpand	xtmph1, xtmph1, xtmpl
 	vpxor	xd1, xd1, xtmph1	;xd1 += partial
-	vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft3_lo, [tmp3+2*tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph1, [tmp3+2*tmp4+16]	;            hi | hi
 	; dest2
 	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
@@ -312,7 +330,9 @@ func(gf_5vect_mad_avx2)
 	vpand	xtmph2, xtmph2, xtmpl
 	vpxor	xd2, xd2, xtmph2	;xd2 += partial
-	vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft4_lo, [tmp3+tmp5]	;Load array: lo | lo
+	vbroadcasti128 xtmph2, [tmp3+tmp5+16]	;            hi | hi
 	; dest3
 	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
@@ -320,7 +340,9 @@ func(gf_5vect_mad_avx2)
 	vpand	xtmph1, xtmph1, xtmpl
 	vpxor	xd3, xd3, xtmph1	;xd3 += partial
-	vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft5_lo, [tmp3+4*tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph1, [tmp3+4*tmp4+16]	;            hi | hi
 	; dest4
 	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
 	vpshufb	xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble

erasure_code/gf_6vect_dot_prod_avx2.asm

@@ -218,52 +218,44 @@ func(gf_6vect_dot_prod_avx2)
 .next_vect:
 	mov	ptr, [src+vec_i]
 	XLDR	x0, [ptr+pos]		;Get next source vector
-	add	vec_i, PS
-	vpand	xgft3_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xgft3_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xgft3_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vskip1*1]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					;     "     Bx{00}, Bx{10}, ..., Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+vskip1*2]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-					;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-	lea	ptr, [vskip1 + vskip1*4]	;ptr = vskip5
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft1_lo, [tmp]			;Load array: lo | lo
+	vbroadcasti128 xgft1_hi, [tmp+16]		;            hi | hi
+	vbroadcasti128 xgft2_lo, [tmp+vskip1*1]		;Load array: lo | lo
+	vbroadcasti128 xgft2_hi, [tmp+vskip1*1+16]	;            hi | hi
+	lea	ptr, [vskip1 + vskip1*4]
+	vbroadcasti128 xgft3_lo, [tmp+vskip1*2]		;Load array: lo | lo
+	vbroadcasti128 xgft3_hi, [tmp+vskip1*2+16]	;            hi | hi
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
+	vbroadcasti128 xgft1_lo, [tmp+vskip3]		;Load array: lo | lo
+	vbroadcasti128 xgft1_hi, [tmp+vskip3+16]	;            hi | hi
 	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
 	vpxor	xp2, xgft2_hi		;xp2 += partial
+	vbroadcasti128 xgft2_lo, [tmp+vskip1*4]		;Load array: lo | lo
+	vbroadcasti128 xgft2_hi, [tmp+vskip1*4+16]	;            hi | hi
 	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
 	vpxor	xp3, xgft3_hi		;xp3 += partial
-	vmovdqu	xgft1_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
-					;     "     Dx{00}, Dx{10}, ..., Dx{f0}
-	vmovdqu	xgft2_lo, [tmp+vskip1*4]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
-					;     "     Ex{00}, Ex{10}, ..., Ex{f0}
-	vmovdqu	xgft3_lo, [tmp+ptr]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
-					;     "     Fx{00}, Fx{10}, ..., Fx{f0}
+	vbroadcasti128 xgft3_lo, [tmp+ptr]		;Load array: lo | lo
+	vbroadcasti128 xgft3_hi, [tmp+ptr+16]		;            hi | hi
 	add	tmp, 32
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	add	vec_i, PS
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble

erasure_code/gf_6vect_mad_avx2.asm

@@ -48,9 +48,11 @@
%define tmp.b r11b %define tmp.b r11b
%define tmp2 r10 %define tmp2 r10
%define tmp3 r13 %define tmp3 r13
%define tmp4 rdi
%define tmp5 rsi
%define tmp6 r14
%define return rax %define return rax
%define return.w eax %define stack_size 16*10 + 7*8
%define stack_size 16*10 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x] %define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x %define func(x) proc_frame x
@@ -69,6 +71,9 @@
 	save_reg	r12, 10*16 + 0*8
 	save_reg	r13, 10*16 + 1*8
 	save_reg	r15, 10*16 + 2*8
+	save_reg	rdi, 10*16 + 3*8
+	save_reg	rsi, 10*16 + 4*8
+	save_reg	r14, 10*16 + 5*8
 	end_prolog
 	mov	arg4, arg(4)
 	mov	arg5, arg(5)
@@ -88,6 +93,9 @@
 	mov	r12, [rsp + 10*16 + 0*8]
 	mov	r13, [rsp + 10*16 + 1*8]
 	mov	r15, [rsp + 10*16 + 2*8]
+	mov	rdi, [rsp + 10*16 + 3*8]
+	mov	rsi, [rsp + 10*16 + 4*8]
+	mov	r14, [rsp + 10*16 + 5*8]
 	add	rsp, stack_size
 %endmacro
@@ -103,15 +111,23 @@
 %define tmp.w r11d
 %define tmp.b r11b
 %define tmp2 r10
-%define tmp3 r12
+%define tmp3 r12	; must be saved and restored
+%define tmp4 r13	; must be saved and restored
+%define tmp5 r14	; must be saved and restored
+%define tmp6 r15	; must be saved and restored
 %define return rax
-%define return.w eax
 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
 	push	r12
+	push	r13
+	push	r14
+	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
+	pop	r15
+	pop	r14
+	pop	r13
 	pop	r12
 %endmacro
 %endif
@@ -125,7 +141,6 @@
 %define src arg4
 %define dest1 arg5
 %define pos return
-%define pos.w return.w
 %define dest2 tmp3
 %define dest3 tmp2
@@ -190,6 +205,7 @@ func(gf_6vect_mad_avx2)
sal vec_i, 5 ;Multiply by 32 sal vec_i, 5 ;Multiply by 32
sal vec, 5 ;Multiply by 32 sal vec, 5 ;Multiply by 32
lea tmp, [mul_array + vec_i] lea tmp, [mul_array + vec_i]
mov tmp6, tmp
mov vec_i, vec mov vec_i, vec
mov mul_array, vec mov mul_array, vec
sal vec_i, 1 sal vec_i, 1
@@ -197,18 +213,7 @@ func(gf_6vect_mad_avx2)
 	add	vec_i, vec		;vec_i=vec*96
 	add	mul_array, vec_i	;vec_i=vec*160
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					;     "     Bx{00}, Bx{10}, ..., Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-					;     "     Cx{00}, Cx{10}, ..., Cx{f0}
-	vmovdqu	xgft4_lo, [tmp+vec_i]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
-					;     "     Fx{00}, Fx{10}, ..., Fx{f0}
-	vmovdqu	xgft5_lo, [tmp+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
-					;     "     Ex{00}, Ex{10}, ..., Ex{f0}
-	vmovdqu	xgft6_lo, [tmp+mul_array]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
-					;     "     Dx{00}, Dx{10}, ..., Dx{f0}
+	mov	tmp4, vec
 	mov	dest2, [dest1+PS]	; reuse tmp3
 	mov	dest3, [dest1+2*PS]	; reuse tmp2
@@ -225,57 +230,70 @@ func(gf_6vect_mad_avx2)
 	XLDR	xd4, [dest4+pos]	;Get next dest vector
 	XLDR	xd5, [dest5+pos]	;Get next dest vector
+	lea	tmp5, [tmp4+2*tmp4]	;3*vec, for addressing
 	vpand	xtmpl, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xtmpl, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmpl, x0, 0x12		;swap x0 from 1hi|2hi to 1hi|2lo
+	vbroadcasti128 xgft1_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp+16]		;            hi | hi
 	;dest1
-	vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft1_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd1, xd1, xtmph		;xd1 += partial
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft1_lo		;GF add high and low partials
+	vpxor	xd1, xtmph		;xd1 += partial
 	XSTR	[dest1+pos], xd1	;Store result into dest1
-	;dest2
-	vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft2_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd2, xd2, xtmph		;xd2 += partial
-	;dest3
-	vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft3_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd3, xd3, xtmph		;xd3 += partial
 	XLDR	xd6, [dest6+pos]	;reuse xd1. Get next dest vector
+	vbroadcasti128 xgft2_lo, [tmp+tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp+tmp4+16]	;            hi | hi
+	;dest2
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft2_lo		;GF add high and low partials
+	vpxor	xd2, xtmph		;xd2 += partial
+	vbroadcasti128 xgft3_lo, [tmp+2*tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp+2*tmp4+16]	;            hi | hi
+	;dest3
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft3_lo		;GF add high and low partials
+	vpxor	xd3, xtmph		;xd3 += partial
+	vbroadcasti128 xgft4_lo, [tmp+tmp5]	;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp+tmp5+16]	;            hi | hi
+	lea	tmp5, [tmp5+2*tmp4]	;5*vec, for addressing
 	;dest4
-	vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft4_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd4, xd4, xtmph		;xd4 += partial
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft4_lo		;GF add high and low partials
+	vpxor	xd4, xtmph		;xd4 += partial
+	vbroadcasti128 xgft5_lo, [tmp+4*tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp+4*tmp4+16]	;            hi | hi
 	;dest5
-	vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft5_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd5, xd5, xtmph		;xd5 += partial
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft5_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft5_lo		;GF add high and low partials
+	vpxor	xd5, xtmph		;xd5 += partial
+	vbroadcasti128 xgft6_lo, [tmp+tmp5]	;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp+tmp5+16]	;            hi | hi
 	;dest6
-	vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft6_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd6, xd6, xtmph		;xd6 += partial
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft6_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft6_lo		;GF add high and low partials
+	vpxor	xd6, xtmph		;xd6 += partial
 	XSTR	[dest2+pos], xd2	;Store result into dest2
 	XSTR	[dest3+pos], xd3	;Store result into dest3
@@ -308,20 +326,21 @@ func(gf_6vect_mad_avx2)
 	XLDR	xd5, [dest5+tmp]	;Get next dest vector
 	sub	len, pos
+	lea	tmp5, [tmp4+2*tmp4]	;3*vec, for addressing
 	vpinsrb	xtmplx, xtmplx, len.w, 15
 	vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
 	vpshufb	xtmpl, xtmpl, xtmph	;Broadcast len to all bytes. xtmph=0x1f1f1f...
 	vpcmpgtb xtmpl, xtmpl, [constip32]
-	vpand	xtmph, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xtmph, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmph, x0, 0x12		;swap x0 from 1hi|2hi to 1hi|2lo
 	;dest1
-	vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft1_lo, [tmp6]		;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp6+16]		;            hi | hi
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft1_lo	;GF add high and low partials
@@ -329,9 +348,12 @@ func(gf_6vect_mad_avx2)
 	vpxor	xd1, xd1, xtmph		;xd1 += partial
 	XSTR	[dest1+tmp], xd1	;Store result into dest1
+	XLDR	xd6, [dest6+tmp]	;reuse xd1. Get next dest vector
 	;dest2
-	vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft2_lo, [tmp6+tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp6+tmp4+16]	;            hi | hi
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft2_lo	;GF add high and low partials
@@ -339,25 +361,31 @@ func(gf_6vect_mad_avx2)
 	vpxor	xd2, xd2, xtmph		;xd2 += partial
 	;dest3
-	vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft3_lo, [tmp6+2*tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp6+2*tmp4+16]	;            hi | hi
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft3_lo	;GF add high and low partials
 	vpand	xtmph, xtmph, xtmpl
 	vpxor	xd3, xd3, xtmph		;xd3 += partial
-	XLDR	xd6, [dest6+tmp]	;reuse xd1. Get next dest vector
 	;dest4
-	vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft4_lo, [tmp6+tmp5]	;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp6+tmp5+16]	;            hi | hi
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft4_lo	;GF add high and low partials
 	vpand	xtmph, xtmph, xtmpl
 	vpxor	xd4, xd4, xtmph		;xd4 += partial
+	lea	tmp5, [tmp5+2*tmp4]	; 5*vec, for addressing
 	;dest5
-	vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft5_lo, [tmp6+4*tmp4]	;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp6+4*tmp4+16]	;            hi | hi
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft5_lo	;GF add high and low partials
@@ -365,7 +393,9 @@ func(gf_6vect_mad_avx2)
 	vpxor	xd5, xd5, xtmph		;xd5 += partial
 	;dest6
-	vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128 xgft6_lo, [tmp6+tmp5]	;Load array: lo | lo
+	vbroadcasti128 xtmph, [tmp6+tmp5+16]	;            hi | hi
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft6_lo	;GF add high and low partials

erasure_code/gf_vect_dot_prod_avx2.asm

@@ -151,11 +151,8 @@ func(gf_vect_dot_prod_avx2)
 	mov	ptr, [src+vec_i*PS]
-	vmovdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, Cx{02}, ...
-					;     "     Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
-	vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
+	vbroadcasti128 xgft_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128 xgft_hi, [tmp+16]	;            hi | hi
 	XLDR	x0, [ptr+pos]		;Get next source vector
 	add	tmp, 32

erasure_code/gf_vect_mad_avx2.asm

@@ -150,10 +150,8 @@ func(gf_vect_mad_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 	sal	vec_i, 5		;Multiply by 32
-	vmovdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
-					;     "     Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
-	vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
+	vbroadcasti128 xgft_lo, [vec_i+mul_array]	;Load array: lo | lo
+	vbroadcasti128 xgft_hi, [vec_i+mul_array+16]	;            hi | hi
 	XLDR	xtmpd, [dest+len]	;backup the last 32 bytes in dest