From a439f0dd5d682f4ae7e12c0e1b29eb1314b22f14 Mon Sep 17 00:00:00 2001
From: Maodi Ma
Date: Wed, 24 Sep 2025 16:57:32 +0000
Subject: [PATCH] erasure_code: load xgfts with VBROADCASTI128 in x86 AVX2 impl

To generate the side-by-side pattern of two 128-bit xgfts within a YMM
register, loading them directly from memory with VBROADCASTI128 can be
faster than loading and then swapping them with VMOVDQU + VPERM2I128.

Remove some out-of-date macros as well.

Signed-off-by: Maodi Ma
---
 erasure_code/gf_2vect_dot_prod_avx2.asm |  19 ++-
 erasure_code/gf_2vect_mad_avx2.asm      |  15 +--
 erasure_code/gf_3vect_dot_prod_avx2.asm |  34 ++---
 erasure_code/gf_3vect_mad_avx2.asm      |  47 ++++---
 erasure_code/gf_4vect_dot_prod_avx2.asm |  27 ++--
 erasure_code/gf_4vect_mad_avx2.asm      | 109 +++++++++-------
 erasure_code/gf_5vect_dot_prod_avx2.asm |  36 +++---
 erasure_code/gf_5vect_mad_avx2.asm      | 128 +++++++++++--------
 erasure_code/gf_6vect_dot_prod_avx2.asm |  42 +++----
 erasure_code/gf_6vect_mad_avx2.asm      | 160 ++++++++++++++----------
 erasure_code/gf_vect_dot_prod_avx2.asm  |   9 +-
 erasure_code/gf_vect_mad_avx2.asm       |   6 +-
 12 files changed, 325 insertions(+), 307 deletions(-)

diff --git a/erasure_code/gf_2vect_dot_prod_avx2.asm b/erasure_code/gf_2vect_dot_prod_avx2.asm
index 1c21816..db97e1f 100644
--- a/erasure_code/gf_2vect_dot_prod_avx2.asm
+++ b/erasure_code/gf_2vect_dot_prod_avx2.asm
@@ -183,24 +183,19 @@ func(gf_2vect_dot_prod_avx2)
 .next_vect:
 	SLDR	src, src_m
 	mov	ptr, [src+vec_i]
-
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					; " Ax{00}, Ax{10}, ..., Ax{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-						; " Bx{00}, Bx{10}, ..., Bx{f0}
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
-
 	XLDR	x0, [ptr+pos]		;Get next source vector
-	add	tmp, 32
-	add	vec_i, PS
 
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
 
+	vbroadcasti128	xgft1_lo, [tmp]			;Load array: lo | lo
+	vbroadcasti128	xgft1_hi, [tmp+16]		;            hi | hi
+	add	vec_i, PS
+	vbroadcasti128	xgft2_lo, [tmp+vec*(32/PS)]	;Load array: lo | lo
+	vbroadcasti128	xgft2_hi, [tmp+vec*(32/PS)+16]	;            hi | hi
+	add	tmp, 32
+
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
diff --git a/erasure_code/gf_2vect_mad_avx2.asm b/erasure_code/gf_2vect_mad_avx2.asm
index 13d1b78..63c87b5 100644
--- a/erasure_code/gf_2vect_mad_avx2.asm
+++ b/erasure_code/gf_2vect_mad_avx2.asm
@@ -175,17 +175,14 @@ func(gf_2vect_mad_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 	sal	vec_i, 5		;Multiply by 32
-	sal	vec, 5
 	lea	tmp, [mul_array + vec_i]
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					; " Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					; " Bx{00}, Bx{10}, ..., Bx{f0}
+	sal	vec, 5
+
+	vbroadcasti128	xgft1_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128	xgft1_hi, [tmp+16]	;            hi | hi
+	vbroadcasti128	xgft2_lo, [tmp+vec]	;Load array: lo | lo
+	vbroadcasti128	xgft2_hi, [tmp+vec+16]	;            hi | hi
 
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
 	mov	dest2, [dest1+PS]	; reuse mul_array
 	mov	dest1, [dest1]
diff --git a/erasure_code/gf_3vect_dot_prod_avx2.asm b/erasure_code/gf_3vect_dot_prod_avx2.asm
index 27e6416..9c7b153 100644
--- a/erasure_code/gf_3vect_dot_prod_avx2.asm
+++ b/erasure_code/gf_3vect_dot_prod_avx2.asm
@@ -199,38 +199,30 @@ func(gf_3vect_dot_prod_avx2)
 .next_vect:
 	SLDR	src, src_m
 	mov	ptr, [src+vec_i]
-
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					; " Ax{00}, Ax{10}, ..., Ax{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-						; " Bx{00}, Bx{10}, ..., Bx{f0}
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
-
-	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-						; " Cx{00}, Cx{10}, ..., Cx{f0}
-	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
-
-	add	tmp, 32
-	add	vec_i, PS
 	XLDR	x0, [ptr+pos]		;Get next source vector
 
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
 
+	vbroadcasti128	xgft1_lo, [tmp]			;Load array: lo | lo
+	vbroadcasti128	xgft1_hi, [tmp+16]		;            hi | hi
+	vbroadcasti128	xgft2_lo, [tmp+vec*(32/PS)]	;Load array: lo | lo
+	vbroadcasti128	xgft2_hi, [tmp+vec*(32/PS)+16]	;            hi | hi
+	add	vec_i, PS
+	vbroadcasti128	xgft3_lo, [tmp+vec*(64/PS)]	;Load array: lo | lo
+	vbroadcasti128	xgft3_hi, [tmp+vec*(64/PS)+16]	;            hi | hi
+	add	tmp, 32
+
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
 
-	vpshufb xgft2_hi, x0		;Lookup mul table of high nibble
-	vpshufb xgft2_lo, xtmpa		;Lookup mul table of low nibble
-	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
-	vpxor	xp2, xgft2_hi		;xp2 += partial
+	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
+	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
+	vpxor	xp2, xgft2_hi		;xp2 += partial
 
 	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
diff --git a/erasure_code/gf_3vect_mad_avx2.asm b/erasure_code/gf_3vect_mad_avx2.asm
index 794e0e7..4c07424 100644
--- a/erasure_code/gf_3vect_mad_avx2.asm
+++ b/erasure_code/gf_3vect_mad_avx2.asm
@@ -47,6 +47,7 @@
 %define tmp   r11
 %define tmp.w r11d
 %define tmp.b r11b
+%define tmp2  r10
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8
@@ -97,9 +98,10 @@
 %define arg4  r8
 %define arg5  r9
 
-%define tmp   r11
-%define tmp.w r11d
-%define tmp.b r11b
+%define tmp    r11
+%define tmp.w  r11d
+%define tmp.b  r11b
+%define tmp2   r10
 %define return rax
 %define return.w eax
@@ -149,6 +151,7 @@ section .text
 %define xgft1_hi  ymm13
 %define xgft2_lo  ymm12
 %define xgft3_lo  ymm11
+%define xgft2_hi  xgft3_lo ; Reuse ymm11
 
 %define x0     ymm0
 %define xtmpa  ymm1
@@ -176,18 +179,15 @@ func(gf_3vect_mad_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 	sal	vec_i, 5		;Multiply by 32
-	sal	vec, 5
 	lea	tmp, [mul_array + vec_i]
+	mov	tmp2, tmp
+	sal	vec, 5
 
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					; " Ax{00}, Ax{10}, ..., Ax{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
+	vbroadcasti128	xgft1_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128	xgft1_hi, [tmp+16]	;            hi | hi
+	vbroadcasti128	xgft2_lo, [tmp+vec]	;Load array: lo | lo
+	vbroadcasti128	xgft2_hi, [tmp+vec+16]	;            hi | hi
 
-	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
-					; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
-					; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
 	mov	dest2, [dest1+PS]	; reuse mul_array
 	mov	dest3, [dest1+2*PS]	; reuse vec_i
 	mov	dest1, [dest1]
@@ -197,11 +197,9 @@ func(gf_3vect_mad_avx2)
 	XLDR	xd1, [dest1+pos]	;Get next dest vector
 	XLDR	xd2, [dest2+pos]	;Get next dest vector
 	XLDR	xd3, [dest3+pos]	;Get next dest vector
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xtmpl2, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
-	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xtmpl3, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
+	vbroadcasti128	xtmpl3, [tmp+2*vec]	;Load array: lo | lo
+	vbroadcasti128	xtmph3, [tmp+2*vec+16]	;            hi | hi
 
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
@@ -214,10 +212,10 @@ func(gf_3vect_mad_avx2)
 	vpxor	xd1, xd1, xtmph1	;xd1 += partial
 
 	; dest2
-	vpshufb	xtmph2, x0		;Lookup mul table of high nibble
-	vpshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
-	vpxor	xtmph2, xtmpl2		;GF add high and low partials
-	vpxor	xd2, xtmph2		;xd2 += partial
+	vpshufb	xtmph2, xgft2_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl2, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxor	xtmph2, xtmph2, xtmpl2	;GF add high and low partials
+	vpxor	xd2, xd2, xtmph2	;xd2 += partial
 
 	; dest3
 	vpshufb	xtmph3, x0		;Lookup mul table of high nibble
 	vpshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
@@ -259,11 +257,10 @@ func(gf_3vect_mad_avx2)
 	vpshufb	xtmpl3, xtmpl3, xtmpl2	;Broadcast len to all bytes. xtmpl2=0x1f1f1f...
 	vpcmpgtb xtmpl3, xtmpl3, xtmph3
 
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
-
-	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
+	vbroadcasti128	xgft2_lo, [tmp2+vec]	; Load array: lo | lo
+	vbroadcasti128	xtmph2, [tmp2+vec+16]	;             hi | hi
+	vbroadcasti128	xgft3_lo, [tmp2+2*vec]	; Load array: lo | lo
+	vbroadcasti128	xtmph3, [tmp2+2*vec+16]	;             hi | hi
 
 	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
diff --git a/erasure_code/gf_4vect_dot_prod_avx2.asm b/erasure_code/gf_4vect_dot_prod_avx2.asm
index ebe7e15..a80cf9f 100644
--- a/erasure_code/gf_4vect_dot_prod_avx2.asm
+++ b/erasure_code/gf_4vect_dot_prod_avx2.asm
@@ -228,26 +228,19 @@ func(gf_4vect_dot_prod_avx2)
 	mov	ptr, [src+vec_i]
 	XLDR	x0, [ptr+pos]		;Get next source vector
-	add	vec_i, PS
 
-	vpand	xgft4_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xgft4_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xgft4_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
 
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					; " Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-						; " Bx{00}, Bx{10}, ..., Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-						; " Cx{00}, Cx{10}, ..., Cx{f0}
-	vmovdqu	xgft4_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
-					; " Dx{00}, Dx{10}, ..., Dx{f0}
-
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft1_lo, [tmp]			;Load array: lo | lo
+	vbroadcasti128	xgft1_hi, [tmp+16]		;            hi | hi
+	vbroadcasti128	xgft2_lo, [tmp+vec*(32/PS)]	;Load array: lo | lo
+	vbroadcasti128	xgft2_hi, [tmp+vec*(32/PS)+16]	;            hi | hi
+	add	vec_i, PS
+	vbroadcasti128	xgft3_lo, [tmp+vec*(64/PS)]	;Load array: lo | lo
+	vbroadcasti128	xgft3_hi, [tmp+vec*(64/PS)+16]	;            hi | hi
+	vbroadcasti128	xgft4_lo, [tmp+vskip3]		;Load array: lo | lo
+	vbroadcasti128	xgft4_hi, [tmp+vskip3+16]	;            hi | hi
 	add	tmp, 32
 
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
diff --git a/erasure_code/gf_4vect_mad_avx2.asm b/erasure_code/gf_4vect_mad_avx2.asm
index b02f381..5d35092 100644
--- a/erasure_code/gf_4vect_mad_avx2.asm
+++ b/erasure_code/gf_4vect_mad_avx2.asm
@@ -46,9 +46,11 @@
 %define tmp   r11
 %define tmp.w r11d
 %define tmp.b r11b
+%define tmp2  r13
+%define tmp3  r14
+%define tmp4  r10
 %define return rax
-%define return.w eax
-%define stack_size 16*10 + 3*8
+%define stack_size 16*10 + 5*8
 %define arg(x) [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x
@@ -66,6 +68,8 @@
 	vmovdqa	[rsp+16*9],xmm15
 	save_reg	r12, 10*16 + 0*8
 	save_reg	r15, 10*16 + 1*8
+	save_reg	r13, 10*16 + 2*8
+	save_reg	r14, 10*16 + 3*8
 	end_prolog
 	mov	arg4, arg(4)
 	mov	arg5, arg(5)
@@ -84,6 +88,8 @@
 	vmovdqa	xmm15, [rsp+16*9]
 	mov	r12, [rsp + 10*16 + 0*8]
 	mov	r15, [rsp + 10*16 + 1*8]
+	mov	r13, [rsp + 10*16 + 2*8]
+	mov	r14, [rsp + 10*16 + 3*8]
 	add	rsp, stack_size
 %endmacro
@@ -98,12 +104,20 @@
 %define tmp   r11
 %define tmp.w r11d
 %define tmp.b r11b
+%define tmp2  r10
+%define tmp3  r12 ; must be saved and restored
+%define tmp4  r13 ; must be saved and restored
 %define return rax
-%define return.w eax
 %define func(x) x: endbranch
-%define FUNC_SAVE
-%define FUNC_RESTORE
+%macro FUNC_SAVE 0
+	push	r12
+	push	r13
+%endmacro
+%macro FUNC_RESTORE 0
+	pop	r13
+	pop	r12
+%endmacro
 %endif
@@ -116,7 +130,6 @@
 %define src   arg4
 %define dest1 arg5
 %define pos   return
-%define pos.w return.w
 
 %define dest2 mul_array
 %define dest3 vec
@@ -176,24 +189,18 @@ func(gf_4vect_mad_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 	sal	vec_i, 5		;Multiply by 32
-	sal	vec, 5			;Multiply by 32
 	lea	tmp, [mul_array + vec_i]
-
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
-					; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
-					; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
-					; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
-	add	tmp, vec
-	vmovdqu	xgft4_lo, [tmp+2*vec]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
-					; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+	mov	tmp2, tmp
+	sal	vec, 5			;Multiply by 32
+	mov	tmp3, vec
 
 	mov	dest2, [dest1+PS]	; reuse mul_array
 	mov	dest3, [dest1+2*PS]	; reuse vec
 	mov	dest4, [dest1+3*PS]	; reuse vec_i
 	mov	dest1, [dest1]
 
+	lea	tmp4, [tmp3+2*tmp3]
+
 .loop32:
 	XLDR	x0, [src+pos]		;Get next source vector
@@ -206,37 +213,40 @@ func(gf_4vect_mad_avx2)
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
 
-	vperm2i128 xtmpa, xtmpl, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmpl, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
-
-	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft1_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128	xtmph1, [tmp+16]	;            hi | hi
+	vbroadcasti128	xgft2_lo, [tmp+tmp3]	;Load array: lo | lo
+	vbroadcasti128	xtmph2, [tmp+tmp3+16]	;            hi | hi
 
 	; dest1
-	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft1_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph1, xtmph1, xtmpl	;GF add high and low partials
-	vpxor	xd1, xd1, xtmph1	;xd1 += partial
+	vpshufb	xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph1, xgft1_lo	;GF add high and low partials
+	vpxor	xd1, xtmph1		;xd1 += partial
+
+	vbroadcasti128	xgft3_lo, [tmp+2*tmp3]	;Load array: lo | lo
+	vbroadcasti128	xtmph3, [tmp+2*tmp3+16]	;            hi | hi
 
 	; dest2
-	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft2_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph2, xtmph2, xtmpl	;GF add high and low partials
-	vpxor	xd2, xd2, xtmph2	;xd2 += partial
+	vpshufb	xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph2, xgft2_lo	;GF add high and low partials
+	vpxor	xd2, xtmph2		;xd2 += partial
+
+	vbroadcasti128	xgft4_lo, [tmp+tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph4, [tmp+tmp4+16]	;            hi | hi
 
 	; dest3
-	vpshufb	xtmph3, xtmph3, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft3_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph3, xtmph3, xtmpl	;GF add high and low partials
-	vpxor	xd3, xd3, xtmph3	;xd3 += partial
+	vpshufb	xtmph3, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph3, xgft3_lo	;GF add high and low partials
+	vpxor	xd3, xtmph3		;xd3 += partial
 
 	; dest4
-	vpshufb	xtmph4, xtmph4, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft4_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph4, xtmph4, xtmpl	;GF add high and low partials
-	vpxor	xd4, xd4, xtmph4	;xd4 += partial
+	vpshufb	xtmph4, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph4, xgft4_lo	;GF add high and low partials
+	vpxor	xd4, xtmph4		;xd4 += partial
 
 	XSTR	[dest1+pos], xd1
 	XSTR	[dest2+pos], xd2
@@ -275,17 +285,14 @@ func(gf_4vect_mad_avx2)
 	vpshufb	xtmpl, xtmpl, xtmph1	;Broadcast len to all bytes. xtmph1=0x1f1f1f...
 	vpcmpgtb xtmpl, xtmpl, xtmph2
 
-	vpand	xtmph1, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xtmph1, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmph1, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
-
-	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft1_lo, [tmp2]	;Load array: lo | lo
+	vbroadcasti128	xtmph1, [tmp2+16]	;            hi | hi
+	vbroadcasti128	xgft2_lo, [tmp2+tmp3]	;Load array: lo | lo
+	vbroadcasti128	xtmph2, [tmp2+tmp3+16]	;            hi | hi
 
 	; dest1
 	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
@@ -294,6 +301,9 @@ func(gf_4vect_mad_avx2)
 	vpand	xtmph1, xtmph1, xtmpl
 	vpxor	xd1, xd1, xtmph1	;xd1 += partial
 
+	vbroadcasti128	xgft3_lo, [tmp2+2*tmp3]		;Load array: lo | lo
+	vbroadcasti128	xtmph3, [tmp2+2*tmp3+16]	;            hi | hi
+
 	; dest2
 	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
@@ -301,6 +311,9 @@ func(gf_4vect_mad_avx2)
 	vpand	xtmph2, xtmph2, xtmpl
 	vpxor	xd2, xd2, xtmph2	;xd2 += partial
 
+	vbroadcasti128	xgft4_lo, [tmp2+tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph4, [tmp2+tmp4+16]	;            hi | hi
+
 	; dest3
 	vpshufb	xtmph3, xtmph3, x0	;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
diff --git a/erasure_code/gf_5vect_dot_prod_avx2.asm b/erasure_code/gf_5vect_dot_prod_avx2.asm
index d21c35b..bfca4b2 100644
--- a/erasure_code/gf_5vect_dot_prod_avx2.asm
+++ b/erasure_code/gf_5vect_dot_prod_avx2.asm
@@ -220,43 +220,35 @@ func(gf_5vect_dot_prod_avx2)
 .next_vect:
 	mov	ptr, [src+vec_i]
 	XLDR	x0, [ptr+pos]		;Get next source vector
-	add	vec_i, PS
 
-	vpand	xgft4_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xgft4_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xgft4_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
 
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					; " Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vskip1*1]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-						; " Bx{00}, Bx{10}, ..., Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+vskip1*2]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-						; " Cx{00}, Cx{10}, ..., Cx{f0}
-	vmovdqu	xgft4_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
-					; " Dx{00}, Dx{10}, ..., Dx{f0}
-
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft1_lo, [tmp]			;Load array: lo | lo
+	vbroadcasti128	xgft1_hi, [tmp+16]		;            hi | hi
+	vbroadcasti128	xgft2_lo, [tmp+vskip1*1]	;Load array: lo | lo
+	vbroadcasti128	xgft2_hi, [tmp+vskip1*1+16]	;            hi | hi
+	add	vec_i, PS
+	vbroadcasti128	xgft3_lo, [tmp+vskip1*2]	;Load array: lo | lo
+	vbroadcasti128	xgft3_hi, [tmp+vskip1*2+16]	;            hi | hi
+	vbroadcasti128	xgft4_lo, [tmp+vskip3]		;Load array: lo | lo
+	vbroadcasti128	xgft4_hi, [tmp+vskip3+16]	;            hi | hi
 
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
 
+	vbroadcasti128	xgft1_lo, [tmp+vskip1*4]	;Load array: lo | lo
+	vbroadcasti128	xgft1_hi, [tmp+vskip1*4+16]	;            hi | hi
+	add	tmp, 32
+
 	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
 	vpxor	xp2, xgft2_hi		;xp2 += partial
 
-	vmovdqu	xgft1_lo, [tmp+vskip1*4]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
-						; " Ex{00}, Ex{10}, ..., Ex{f0}
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	add	tmp, 32
-
 	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
diff --git a/erasure_code/gf_5vect_mad_avx2.asm b/erasure_code/gf_5vect_mad_avx2.asm
index 68c4c95..a0f99e9 100644
--- a/erasure_code/gf_5vect_mad_avx2.asm
+++ b/erasure_code/gf_5vect_mad_avx2.asm
@@ -47,9 +47,11 @@
 %define tmp.w r11d
 %define tmp.b r11b
 %define tmp2  r10
+%define tmp3  r13
+%define tmp4  r14
+%define tmp5  rdi
 %define return rax
-%define return.w eax
-%define stack_size 16*10 + 3*8
+%define stack_size 16*10 + 5*8
 %define arg(x) [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x
@@ -67,6 +69,9 @@
 	vmovdqa	[rsp+16*9],xmm15
 	save_reg	r12, 10*16 + 0*8
 	save_reg	r15, 10*16 + 1*8
+	save_reg	r13, 10*16 + 2*8
+	save_reg	r14, 10*16 + 3*8
+	save_reg	rdi, 10*16 + 4*8
 	end_prolog
 	mov	arg4, arg(4)
 	mov	arg5, arg(5)
@@ -85,6 +90,9 @@
 	vmovdqa	xmm15, [rsp+16*9]
 	mov	r12, [rsp + 10*16 + 0*8]
 	mov	r15, [rsp + 10*16 + 1*8]
+	mov	r13, [rsp + 10*16 + 2*8]
+	mov	r14, [rsp + 10*16 + 3*8]
+	mov	rdi, [rsp + 10*16 + 4*8]
 	add	rsp, stack_size
 %endmacro
@@ -100,12 +108,22 @@
 %define tmp.w r11d
 %define tmp.b r11b
 %define tmp2  r10
+%define tmp3  r12 ; must be saved and restored
+%define tmp4  r13 ; must be saved and restored
+%define tmp5  r14 ; must be saved and restored
 %define return rax
-%define return.w eax
 %define func(x) x: endbranch
-%define FUNC_SAVE
-%define FUNC_RESTORE
+%macro FUNC_SAVE 0
+	push	r12
+	push	r13
+	push	r14
+%endmacro
+%macro FUNC_RESTORE 0
+	pop	r14
+	pop	r13
+	pop	r12
+%endmacro
 %endif
 
;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
@@ -117,7 +135,6 @@
 %define src   arg4
 %define dest1 arg5
 %define pos   return
-%define pos.w return.w
 
 %define dest2 tmp2
 %define dest3 mul_array
@@ -177,20 +194,11 @@ func(gf_5vect_mad_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 	sal	vec_i, 5		;Multiply by 32
-	sal	vec, 5			;Multiply by 32
 	lea	tmp, [mul_array + vec_i]
+	mov	tmp3, tmp
+	sal	vec, 5			;Multiply by 32
 
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					; " Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					; " Bx{00}, Bx{10}, ..., Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-					; " Cx{00}, Cx{10}, ..., Cx{f0}
-	vmovdqu	xgft5_lo, [tmp+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
-					; " Ex{00}, Ex{10}, ..., Ex{f0}
-	add	tmp, vec
-	vmovdqu	xgft4_lo, [tmp+2*vec]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
-					; " Dx{00}, Dx{10}, ..., Dx{f0}
+	mov	tmp4, vec
 
 	mov	dest3, [dest1+2*PS]	; reuse mul_array
 	mov	dest4, [dest1+3*PS]	; reuse vec
@@ -198,6 +206,8 @@ func(gf_5vect_mad_avx2)
 	mov	dest2, [dest1+PS]
 	mov	dest1, [dest1]
 
+	lea	tmp5, [tmp4+2*tmp4]	; vec*3, for addressing
+
 .loop32:
 	XLDR	x0, [src+pos]		;Get next source vector
@@ -210,44 +220,50 @@ func(gf_5vect_mad_avx2)
 	vpand	xtmpl, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xtmpl, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmpl, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
 
-	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft1_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128	xtmph1, [tmp+16]	;            hi | hi
+	vbroadcasti128	xgft2_lo, [tmp+tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph2, [tmp+tmp4+16]	;            hi | hi
 
 	; dest1
-	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft1_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph1, xtmph1, xtmpl	;GF add high and low partials
-	vpxor	xd1, xd1, xtmph1	;xd1 += partial
+	vpshufb	xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph1, xgft1_lo	;GF add high and low partials
+	vpxor	xd1, xtmph1		;xd1 += partial
+
+	vbroadcasti128	xgft3_lo, [tmp+2*tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph1, [tmp+2*tmp4+16]	;            hi | hi
 
-	vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
 	; dest2
-	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft2_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph2, xtmph2, xtmpl	;GF add high and low partials
-	vpxor	xd2, xd2, xtmph2	;xd2 += partial
+	vpshufb	xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph2, xgft2_lo	;GF add high and low partials
+	vpxor	xd2, xtmph2		;xd2 += partial
+
+	vbroadcasti128	xgft4_lo, [tmp+tmp5]	;Load array: lo | lo
+	vbroadcasti128	xtmph2, [tmp+tmp5+16]	;            hi | hi
 
-	vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
 	; dest3
-	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft3_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph1, xtmph1, xtmpl	;GF add high and low partials
-	vpxor	xd3, xd3, xtmph1	;xd3 += partial
+	vpshufb	xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph1, xgft3_lo	;GF add high and low partials
+	vpxor	xd3, xtmph1		;xd3 += partial
+
+	vbroadcasti128	xgft5_lo, [tmp+4*tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph1, [tmp+4*tmp4+16]	;            hi | hi
 
-	vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
 	; dest4
-	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft4_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph2, xtmph2, xtmpl	;GF add high and low partials
-	vpxor	xd4, xd4, xtmph2	;xd4 += partial
+	vpshufb	xtmph2, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph2, xgft4_lo	;GF add high and low partials
+	vpxor	xd4, xtmph2		;xd4 += partial
 
 	; dest5
-	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft5_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph1, xtmph1, xtmpl	;GF add high and low partials
-	vpxor	xd5, xd5, xtmph1	;xd5 += partial
+	vpshufb	xtmph1, x0		;Lookup mul table of high nibble
+	vpshufb	xgft5_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph1, xgft5_lo	;GF add high and low partials
+	vpxor	xd5, xtmph1		;xd5 += partial
 
 	XSTR	[dest1+pos], xd1
 	XSTR	[dest2+pos], xd2
@@ -288,14 +304,14 @@ func(gf_5vect_mad_avx2)
 	vpshufb	xtmpl, xtmpl, xtmph1	;Broadcast len to all bytes. xtmph1=0x1f1f1f...
 	vpcmpgtb xtmpl, xtmpl, xtmph2
 
-	vpand	xtmph1, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xtmph1, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmph1, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
 
-	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft1_lo, [tmp3]	;Load array: lo | lo
+	vbroadcasti128	xtmph1, [tmp3+16]	;            hi | hi
+	vbroadcasti128	xgft2_lo, [tmp3+tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph2, [tmp3+tmp4+16]	;            hi | hi
 
 	; dest1
 	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
@@ -304,7 +320,9 @@ func(gf_5vect_mad_avx2)
 	vpand	xtmph1, xtmph1, xtmpl
 	vpxor	xd1, xd1, xtmph1	;xd1 += partial
 
-	vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft3_lo, [tmp3+2*tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph1, [tmp3+2*tmp4+16]	;            hi | hi
+
 	; dest2
 	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
@@ -312,7 +330,9 @@ func(gf_5vect_mad_avx2)
 	vpand	xtmph2, xtmph2, xtmpl
 	vpxor	xd2, xd2, xtmph2	;xd2 += partial
 
-	vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft4_lo, [tmp3+tmp5]	;Load array: lo | lo
+	vbroadcasti128	xtmph2, [tmp3+tmp5+16]	;            hi | hi
+
 	; dest3
 	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
@@ -320,7 +340,9 @@ func(gf_5vect_mad_avx2)
 	vpand	xtmph1, xtmph1, xtmpl
 	vpxor	xd3, xd3, xtmph1	;xd3 += partial
 
-	vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft5_lo, [tmp3+4*tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph1, [tmp3+4*tmp4+16]	;            hi | hi
+
 	; dest4
 	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
 	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
diff --git a/erasure_code/gf_6vect_dot_prod_avx2.asm b/erasure_code/gf_6vect_dot_prod_avx2.asm
index 43118d6..ffee956 100644
--- a/erasure_code/gf_6vect_dot_prod_avx2.asm
+++ b/erasure_code/gf_6vect_dot_prod_avx2.asm
@@ -218,52 +218,44 @@ func(gf_6vect_dot_prod_avx2)
 .next_vect:
 	mov	ptr, [src+vec_i]
 	XLDR	x0, [ptr+pos]		;Get next source vector
-	add	vec_i, PS
 
-	vpand	xgft3_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xgft3_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xgft3_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
 
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					; " Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vskip1*1]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-						; " Bx{00}, Bx{10}, ..., Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+vskip1*2]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-						; " Cx{00}, Cx{10}, ..., Cx{f0}
-	lea	ptr, [vskip1 + vskip1*4]	;ptr = vskip5
-
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft1_lo, [tmp]			;Load array: lo | lo
+	vbroadcasti128	xgft1_hi, [tmp+16]		;            hi | hi
+	vbroadcasti128	xgft2_lo, [tmp+vskip1*1]	;Load array: lo | lo
+	vbroadcasti128	xgft2_hi, [tmp+vskip1*1+16]	;            hi | hi
+	lea	ptr, [vskip1 + vskip1*4]
+	vbroadcasti128	xgft3_lo, [tmp+vskip1*2]	;Load array: lo | lo
+	vbroadcasti128	xgft3_hi, [tmp+vskip1*2+16]	;            hi | hi
 
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
 	vpxor	xp1, xgft1_hi		;xp1 += partial
 
+	vbroadcasti128	xgft1_lo, [tmp+vskip3]		;Load array: lo | lo
+	vbroadcasti128	xgft1_hi, [tmp+vskip3+16]	;            hi | hi
+
 	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
 	vpxor	xp2, xgft2_hi		;xp2 += partial
 
+	vbroadcasti128	xgft2_lo, [tmp+vskip1*4]	;Load array: lo | lo
+	vbroadcasti128	xgft2_hi, [tmp+vskip1*4+16]	;            hi | hi
+
 	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
 	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
 	vpxor	xp3, xgft3_hi		;xp3 += partial
-
-	vmovdqu	xgft1_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
-					; " Dx{00}, Dx{10}, ..., Dx{f0}
-	vmovdqu	xgft2_lo, [tmp+vskip1*4]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
-						; " Ex{00}, Ex{10}, ..., Ex{f0}
-	vmovdqu	xgft3_lo, [tmp+ptr]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
-					; " Fx{00}, Fx{10}, ..., Fx{f0}
+	vbroadcasti128	xgft3_lo, [tmp+ptr]	;Load array: lo | lo
+	vbroadcasti128	xgft3_hi, [tmp+ptr+16]	;            hi | hi
 	add	tmp, 32
 
-	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	add	vec_i, PS
 
 	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
diff --git a/erasure_code/gf_6vect_mad_avx2.asm b/erasure_code/gf_6vect_mad_avx2.asm
index 651759e..ab9b370 100644
--- a/erasure_code/gf_6vect_mad_avx2.asm
+++ b/erasure_code/gf_6vect_mad_avx2.asm
@@ -48,9 +48,11 @@
 %define tmp.b r11b
 %define tmp2  r10
 %define tmp3  r13
+%define tmp4  rdi
+%define tmp5  rsi
+%define tmp6  r14
 %define return rax
-%define return.w eax
-%define stack_size 16*10 + 3*8
+%define stack_size 16*10 + 7*8
 %define arg(x) [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x
@@ -69,6 +71,9 @@
 	save_reg	r12, 10*16 + 0*8
 	save_reg	r13, 10*16 + 1*8
 	save_reg	r15, 10*16 + 2*8
+	save_reg	rdi, 10*16 + 3*8
+	save_reg	rsi, 10*16 + 4*8
+	save_reg	r14, 10*16 + 5*8
 	end_prolog
 	mov	arg4, arg(4)
 	mov	arg5, arg(5)
@@ -88,6 +93,9 @@
 	mov	r12, [rsp + 10*16 + 0*8]
 	mov	r13, [rsp + 10*16 + 1*8]
 	mov	r15, [rsp + 10*16 + 2*8]
+	mov	rdi, [rsp + 10*16 + 3*8]
+	mov	rsi, [rsp + 10*16 + 4*8]
+	mov	r14, [rsp + 10*16 + 5*8]
 	add	rsp, stack_size
 %endmacro
@@ -103,15 +111,23 @@
 %define tmp.w r11d
 %define tmp.b r11b
 %define tmp2  r10
-%define tmp3  r12
+%define tmp3  r12 ; must be saved and restored
+%define tmp4  r13 ; must be saved and restored
+%define tmp5  r14 ; must be saved and restored
+%define tmp6  r15 ; must be saved and restored
 %define return rax
-%define return.w eax
 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
 	push	r12
+	push	r13
+	push	r14
+	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
+	pop	r15
+	pop	r14
+	pop	r13
 	pop	r12
 %endmacro
 %endif
@@ -125,7 +141,6 @@
 %define src   arg4
 %define dest1 arg5
 %define pos   return
-%define pos.w return.w
 
 %define dest2 tmp3
 %define dest3 tmp2
@@ -190,6 +205,7 @@ func(gf_6vect_mad_avx2)
 	sal	vec_i, 5		;Multiply by 32
 	sal	vec, 5			;Multiply by 32
 	lea	tmp, [mul_array + vec_i]
+	mov	tmp6, tmp
 	mov	vec_i, vec
 	mov	mul_array, vec
 	sal	vec_i, 1
@@ -197,18 +213,7 @@ func(gf_6vect_mad_avx2)
 	add	vec_i, vec		;vec_i=vec*96
 	add	mul_array, vec_i	;vec_i=vec*160
 
-	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
-					; " Ax{00}, Ax{10}, ..., Ax{f0}
-	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
-					; " Bx{00}, Bx{10}, ..., Bx{f0}
-	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
-					; " Cx{00}, Cx{10}, ..., Cx{f0}
-	vmovdqu	xgft4_lo, [tmp+vec_i]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
-					; " Fx{00}, Fx{10}, ..., Fx{f0}
-	vmovdqu	xgft5_lo, [tmp+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
-					; " Ex{00}, Ex{10}, ..., Ex{f0}
-	vmovdqu	xgft6_lo, [tmp+mul_array]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
-						; " Dx{00}, Dx{10}, ..., Dx{f0}
+	mov	tmp4, vec
 
 	mov	dest2, [dest1+PS]	; reuse tmp3
 	mov	dest3, [dest1+2*PS]	; reuse tmp2
@@ -225,57 +230,70 @@ func(gf_6vect_mad_avx2)
 	XLDR	xd4, [dest4+pos]	;Get next dest vector
 	XLDR	xd5, [dest5+pos]	;Get next dest vector
 
+	lea	tmp5, [tmp4+2*tmp4]	;3*vec, for addressing
+
 	vpand	xtmpl, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xtmpl, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmpl, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
+
+	vbroadcasti128	xgft1_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp+16]		;            hi | hi
 
 	;dest1
-	vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft1_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd1, xd1, xtmph		;xd1 += partial
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft1_lo		;GF add high and low partials
+	vpxor	xd1, xtmph		;xd1 += partial
 
 	XSTR	[dest1+pos], xd1	;Store result into dest1
-
-	;dest2
-	vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft2_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd2, xd2, xtmph		;xd2 += partial
-
-	;dest3
-	vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft3_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd3, xd3, xtmph		;xd3 += partial
-
 	XLDR	xd6, [dest6+pos]	;reuse xd1. Get next dest vector
+	vbroadcasti128	xgft2_lo, [tmp+tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp+tmp4+16]	;            hi | hi
+
+	;dest2
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft2_lo		;GF add high and low partials
+	vpxor	xd2, xtmph		;xd2 += partial
+
+	vbroadcasti128	xgft3_lo, [tmp+2*tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp+2*tmp4+16]	;            hi | hi
+
+	;dest3
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft3_lo		;GF add high and low partials
+	vpxor	xd3, xtmph		;xd3 += partial
+
+	vbroadcasti128	xgft4_lo, [tmp+tmp5]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp+tmp5+16]	;            hi | hi
+
+	lea	tmp5, [tmp5+2*tmp4]	;5*vec, for addressing
+
 	;dest4
-	vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft4_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd4, xd4, xtmph		;xd4 += partial
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft4_lo		;GF add high and low partials
+	vpxor	xd4, xtmph		;xd4 += partial
+
+	vbroadcasti128	xgft5_lo, [tmp+4*tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp+4*tmp4+16]	;            hi | hi
 
 	;dest5
-	vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft5_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd5, xd5, xtmph		;xd5 += partial
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft5_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft5_lo		;GF add high and low partials
+	vpxor	xd5, xtmph		;xd5 += partial
+
+	vbroadcasti128	xgft6_lo, [tmp+tmp5]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp+tmp5+16]	;            hi | hi
 
 	;dest6
-	vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
-	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
-	vpshufb	xtmpl, xgft6_lo, xtmpa	;Lookup mul table of low nibble
-	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
-	vpxor	xd6, xd6, xtmph		;xd6 += partial
+	vpshufb	xtmph, x0		;Lookup mul table of high nibble
+	vpshufb	xgft6_lo, xtmpl		;Lookup mul table of low nibble
+	vpxor	xtmph, xgft6_lo		;GF add high and low partials
+	vpxor	xd6, xtmph		;xd6 += partial
 
 	XSTR	[dest2+pos], xd2	;Store result into dest2
 	XSTR	[dest3+pos], xd3	;Store result into dest3
@@ -308,20 +326,21 @@ func(gf_6vect_mad_avx2)
 	XLDR	xd5, [dest5+tmp]	;Get next dest vector
 	sub	len, pos
 
+	lea	tmp5, [tmp4+2*tmp4]	;3*vec, for addressing
+
 	vpinsrb	xtmplx, xtmplx, len.w, 15
 	vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
 	vpshufb	xtmpl, xtmpl, xtmph	;Broadcast len to all bytes. xtmph=0x1f1f1f...
 	vpcmpgtb xtmpl, xtmpl, [constip32]
 
-	vpand	xtmph, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
 	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
 	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
-	vperm2i128 xtmpa, xtmph, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
-	vperm2i128 x0, xtmph, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo
 
 	;dest1
-	vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft1_lo, [tmp6]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp6+16]	;            hi | hi
+
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft1_lo	;GF add high and low partials
@@ -329,9 +348,12 @@ func(gf_6vect_mad_avx2)
 	vpxor	xd1, xd1, xtmph		;xd1 += partial
 
 	XSTR	[dest1+tmp], xd1	;Store result into dest1
+	XLDR	xd6, [dest6+tmp]	;reuse xd1. Get next dest vector
 
 	;dest2
-	vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft2_lo, [tmp6+tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp6+tmp4+16]	;            hi | hi
+
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft2_lo	;GF add high and low partials
@@ -339,25 +361,31 @@ func(gf_6vect_mad_avx2)
 	vpxor	xd2, xd2, xtmph		;xd2 += partial
 
 	;dest3
-	vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft3_lo, [tmp6+2*tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp6+2*tmp4+16]	;            hi | hi
+
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft3_lo	;GF add high and low partials
 	vpand	xtmph, xtmph, xtmpl
 	vpxor	xd3, xd3, xtmph		;xd3 += partial
 
-	XLDR	xd6, [dest6+tmp]	;reuse xd1. Get next dest vector
-
 	;dest4
-	vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft4_lo, [tmp6+tmp5]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp6+tmp5+16]	;            hi | hi
+
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft4_lo	;GF add high and low partials
 	vpand	xtmph, xtmph, xtmpl
 	vpxor	xd4, xd4, xtmph		;xd4 += partial
 
+	lea	tmp5, [tmp5+2*tmp4]	; 5*vec, for addressing
+
 	;dest5
-	vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft5_lo, [tmp6+4*tmp4]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp6+4*tmp4+16]	;            hi | hi
+
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft5_lo, xgft5_lo, xtmpa	;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft5_lo	;GF add high and low partials
@@ -365,7 +393,9 @@ func(gf_6vect_mad_avx2)
 	vpxor	xd5, xd5, xtmph		;xd5 += partial
 
 	;dest6
-	vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
+	vbroadcasti128	xgft6_lo, [tmp6+tmp5]	;Load array: lo | lo
+	vbroadcasti128	xtmph, [tmp6+tmp5+16]	;            hi | hi
+
 	vpshufb	xtmph, xtmph, x0	;Lookup mul table of high nibble
 	vpshufb	xgft6_lo, xgft6_lo, xtmpa	;Lookup mul table of low nibble
 	vpxor	xtmph, xtmph, xgft6_lo	;GF add high and low partials
diff --git a/erasure_code/gf_vect_dot_prod_avx2.asm b/erasure_code/gf_vect_dot_prod_avx2.asm
index dd79f1c..94a16b1 100644
--- a/erasure_code/gf_vect_dot_prod_avx2.asm
+++ b/erasure_code/gf_vect_dot_prod_avx2.asm
@@ -151,12 +151,9 @@ func(gf_vect_dot_prod_avx2)
 	mov	ptr, [src+vec_i*PS]
 
-	vmovdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, Cx{02}, ...
-					; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
-	vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
-
-	XLDR	x0, [ptr+pos]		;Get next source vector
+	vbroadcasti128	xgft_lo, [tmp]		;Load array: lo | lo
+	vbroadcasti128	xgft_hi, [tmp+16]	;            hi | hi
+	XLDR	x0, [ptr+pos]		;Get next source vector
 
 	add	tmp, 32
 	add	vec_i, 1
diff --git a/erasure_code/gf_vect_mad_avx2.asm b/erasure_code/gf_vect_mad_avx2.asm
index bf5b567..390700f 100644
--- a/erasure_code/gf_vect_mad_avx2.asm
+++ b/erasure_code/gf_vect_mad_avx2.asm
@@ -150,10 +150,8 @@ func(gf_vect_mad_avx2)
 	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
 
 	sal	vec_i, 5		;Multiply by 32
-	vmovdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
-						; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
-	vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
-	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
+	vbroadcasti128	xgft_lo, [vec_i+mul_array]	;Load array: lo | lo
+	vbroadcasti128	xgft_hi, [vec_i+mul_array+16]	;            hi | hi
 
 	XLDR	xtmpd, [dest+len]	;backup the last 32 bytes in dest