erasure_code: optimize AVX2 GFNI 2 vector dot product

Signed-off-by: Marcel Cornu <marcel.d.cornu@intel.com>
Authored by Marcel Cornu on 2023-12-08 10:07:48 +00:00; committed by Tomasz Kantecki
parent 3f87141d03
commit 0052080f53

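The patch widens the kernel's main loop from 64 to 96 bytes per iteration by carrying a third 32-byte lane (x0x, xp1x, xp2x) in ymm10-ymm12, which is also why the Windows prologue below must now preserve xmm10-xmm12. For orientation, here is a scalar model of what the routine computes; function and variable names are illustrative, not isa-l's API, and coef1/coef2 stand in for the per-source constants the assembly broadcasts as 8-byte GFNI affine matrices via vbroadcastsd:

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative GF(2^8) multiply over x^8+x^4+x^3+x^2+1 (0x11d), the
     * field isa-l uses; the kernel performs this with GFNI affine
     * instructions behind the GF_MUL_XOR macro. */
    static uint8_t gf_mul(uint8_t a, uint8_t b)
    {
            uint8_t p = 0;
            for (int i = 0; i < 8; i++) {
                    if (b & 1)
                            p ^= a;
                    b >>= 1;
                    uint8_t hi = a & 0x80;
                    a <<= 1;
                    if (hi)
                            a ^= 0x1d;      /* reduce modulo 0x11d */
            }
            return p;
    }

    /* Scalar model of the 2-vector dot product: each byte of the two
     * parity buffers is a GF(2^8) dot product across all k sources,
     * built in a single pass. */
    static void dot_prod_2(size_t len, int k, const uint8_t *coef1,
                           const uint8_t *coef2, uint8_t **src,
                           uint8_t *p1, uint8_t *p2)
    {
            for (size_t pos = 0; pos < len; pos++) {
                    uint8_t s1 = 0, s2 = 0;
                    for (int v = 0; v < k; v++) {
                            s1 ^= gf_mul(coef1[v], src[v][pos]);
                            s2 ^= gf_mul(coef2[v], src[v][pos]);
                    }
                    p1[pos] = s1;
                    p2[pos] = s2;
            }
    }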

@@ -80,36 +80,42 @@
 %define tmp3   r13      ; must be saved and restored
 %define tmp4   r14      ; must be saved and restored
 %define tmp5   rdi      ; must be saved and restored
-%define stack_size  4*16 + 5*8  ; must be an odd multiple of 8
+%define stack_size  7*16 + 5*8  ; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + 8 + 8*x]
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
        alloc_stack     stack_size
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
-       mov     [rsp + 4*16 + 0*8], r12
-       mov     [rsp + 4*16 + 1*8], r13
-       mov     [rsp + 4*16 + 2*8], r14
-       mov     [rsp + 4*16 + 3*8], r15
-       mov     [rsp + 4*16 + 4*8], rdi
+       vmovdqa [rsp + 4*16], xmm10
+       vmovdqa [rsp + 5*16], xmm11
+       vmovdqa [rsp + 6*16], xmm12
+       mov     [rsp + 7*16 + 0*8], r12
+       mov     [rsp + 7*16 + 1*8], r13
+       mov     [rsp + 7*16 + 2*8], r14
+       mov     [rsp + 7*16 + 3*8], r15
+       mov     [rsp + 7*16 + 4*8], rdi
        end_prolog
        mov     arg4, arg(4)
 %endmacro
 %macro FUNC_RESTORE 0
        vmovdqa xmm6, [rsp + 0*16]
        vmovdqa xmm7, [rsp + 1*16]
        vmovdqa xmm8, [rsp + 2*16]
        vmovdqa xmm9, [rsp + 3*16]
-       mov     r12, [rsp + 4*16 + 0*8]
-       mov     r13, [rsp + 4*16 + 1*8]
-       mov     r14, [rsp + 4*16 + 2*8]
-       mov     r15, [rsp + 4*16 + 3*8]
-       mov     rdi, [rsp + 4*16 + 4*8]
+       vmovdqa xmm10, [rsp + 4*16]
+       vmovdqa xmm11, [rsp + 5*16]
+       vmovdqa xmm12, [rsp + 6*16]
+       mov     r12, [rsp + 7*16 + 0*8]
+       mov     r13, [rsp + 7*16 + 1*8]
+       mov     r14, [rsp + 7*16 + 2*8]
+       mov     r15, [rsp + 7*16 + 3*8]
+       mov     rdi, [rsp + 7*16 + 4*8]
        add     rsp, stack_size
 %endmacro
 %endif
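A note on the "must be an odd multiple of 8" comment, which the new value preserves: on Win64 entry RSP is 8 mod 16 (the call pushed an 8-byte return address), so subtracting an odd multiple of 8 leaves RSP 16-byte aligned for the aligned vmovdqa spills. The new size works out to 7*16 + 5*8 = 152 = 19*8. A compile-time sketch of that invariant (not part of the patch):

    /* Sketch only: the alignment argument behind the asm comment. */
    #define STACK_SIZE (7*16 + 5*8)    /* 152 bytes */
    _Static_assert((STACK_SIZE / 8) % 2 == 1,
                   "odd multiple of 8: entry RSP == 8 (mod 16), so an odd "
                   "multiple of 8 realigns RSP to 16 for vmovdqa");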
@@ -142,17 +148,22 @@
 %define x0l    ymm0
 %define x0h    ymm1
-%define xgft1  ymm2
-%define xgft2  ymm3
-%define xtmp1  ymm4
-%define xtmp2  ymm5
-%define xp1l   ymm6
-%define xp2l   ymm7
-%define xp1h   ymm8
-%define xp2h   ymm9
+%define x0x    ymm2
+%define xgft1  ymm3
+%define xgft2  ymm4
+%define xtmp1  ymm5
+%define xtmp2  ymm6
+%define xp1l   ymm7
+%define xp2l   ymm8
+%define xp1h   ymm9
+%define xp2h   ymm10
+%define xp1x   ymm11
+%define xp2x   ymm12
 %define x0     x0l
 %define xp1    xp1l
@@ -163,64 +174,105 @@ default rel
 section .text
+;;
+;; Encodes 96 bytes of all "k" sources into 2x 96 bytes (parity disks)
+;;
+%macro ENCODE_96B_2 0
+       vpxor   xp1l, xp1l, xp1l
+       vpxor   xp1h, xp1h, xp1h
+       vpxor   xp1x, xp1x, xp1x
+       vpxor   xp2l, xp2l, xp2l
+       vpxor   xp2h, xp2h, xp2h
+       vpxor   xp2x, xp2x, xp2x
+       mov     tmp, mul_array
+       xor     vec_i, vec_i
+%%next_vect:
+       ;; load next source vector
+       mov     ptr, [src + vec_i]
+       XLDR    x0l, [ptr + pos]
+       XLDR    x0h, [ptr + pos + 32]
+       XLDR    x0x, [ptr + pos + 64]
+       add     vec_i, 8
+       vbroadcastsd    xgft1, [tmp]
+       vbroadcastsd    xgft2, [tmp + vec]
+       GF_MUL_XOR      VEX, x0l, xgft1, xtmp1, xp1l, xgft2, xtmp2, xp2l
+       GF_MUL_XOR      VEX, x0h, xgft1, xtmp1, xp1h, xgft2, xtmp2, xp2h
+       GF_MUL_XOR      VEX, x0x, xgft1, xtmp1, xp1x, xgft2, xtmp2, xp2x
+       add     tmp, 8
+       cmp     vec_i, vec
+       jl      %%next_vect
+       XSTR    [dest1 + pos], xp1l
+       XSTR    [dest1 + pos + 32], xp1h
+       XSTR    [dest1 + pos + 64], xp1x
+       XSTR    [dest2 + pos], xp2l
+       XSTR    [dest2 + pos + 32], xp2h
+       XSTR    [dest2 + pos + 64], xp2x
+%endmacro
 ;;
 ;; Encodes 64 bytes of all "k" sources into 2x 64 bytes (parity disks)
 ;;
 %macro ENCODE_64B_2 0
        vpxor   xp1l, xp1l, xp1l
        vpxor   xp1h, xp1h, xp1h
        vpxor   xp2l, xp2l, xp2l
        vpxor   xp2h, xp2h, xp2h
        mov     tmp, mul_array
        xor     vec_i, vec_i
 %%next_vect:
        mov     ptr, [src + vec_i]
        XLDR    x0l, [ptr + pos]        ;; Get next source vector low 32 bytes
        XLDR    x0h, [ptr + pos + 32]   ;; Get next source vector high 32 bytes
        add     vec_i, 8
        vbroadcastsd    xgft1, [tmp]
        vbroadcastsd    xgft2, [tmp + vec]
        add     tmp, 8
        GF_MUL_XOR      VEX, x0l, xgft1, xtmp1, xp1l, xgft2, xtmp2, xp2l
        GF_MUL_XOR      VEX, x0h, xgft1, xgft1, xp1h, xgft2, xgft2, xp2h
        cmp     vec_i, vec
        jl      %%next_vect
        XSTR    [dest1 + pos], xp1l
        XSTR    [dest1 + pos + 32], xp1h
        XSTR    [dest2 + pos], xp2l
        XSTR    [dest2 + pos + 32], xp2h
 %endmacro
 ;;
 ;; Encodes 32 bytes of all "k" sources into 2x 32 bytes (parity disks)
 ;;
 %macro ENCODE_32B_2 0
        vpxor   xp1, xp1, xp1
        vpxor   xp2, xp2, xp2
        mov     tmp, mul_array
        xor     vec_i, vec_i
 %%next_vect:
        mov     ptr, [src + vec_i]
        XLDR    x0, [ptr + pos]         ;Get next source vector (32 bytes)
        add     vec_i, 8
        vbroadcastsd    xgft1, [tmp]
        vbroadcastsd    xgft2, [tmp + vec]
        add     tmp, 8
        GF_MUL_XOR      VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2
        cmp     vec_i, vec
        jl      %%next_vect
        XSTR    [dest1 + pos], xp1
        XSTR    [dest2 + pos], xp2
 %endmacro
 ;;
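The new ENCODE_96B_2 above is the heart of the change: it keeps six 32-byte accumulators (xp1l/h/x and xp2l/h/x) live across the whole source loop, so each iteration issues three loads and six GFNI multiply-xors and nothing is stored until all k sources are folded in. A scalar sketch of the same data flow, reusing gf_mul from the first sketch (names illustrative):

    #include <string.h>

    /* One 96-byte block at offset pos: accumulate across all k sources,
     * then store both parity lanes once (mirrors XLDR/GF_MUL_XOR/XSTR). */
    static void encode_96b_2(int k, size_t pos, uint8_t **src,
                             const uint8_t *coef1, const uint8_t *coef2,
                             uint8_t *dest1, uint8_t *dest2)
    {
            uint8_t p1[96] = {0}, p2[96] = {0};      /* xp1l/h/x, xp2l/h/x */
            for (int v = 0; v < k; v++) {            /* %%next_vect loop   */
                    const uint8_t *s = src[v] + pos; /* XLDR x0l/x0h/x0x   */
                    for (int b = 0; b < 96; b++) {
                            p1[b] ^= gf_mul(coef1[v], s[b]);
                            p2[b] ^= gf_mul(coef2[v], s[b]);
                    }
            }
            memcpy(dest1 + pos, p1, 96);             /* XSTR stores */
            memcpy(dest2 + pos, p2, 96);
    }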
@@ -229,73 +281,82 @@ section .text
 %macro ENCODE_LT_32B_2 1
 %define %%LEN  %1
        vpxor   xp1, xp1, xp1
        vpxor   xp2, xp2, xp2
        xor     vec_i, vec_i
 %%next_vect:
        mov     ptr, [src + vec_i]
        simd_load_avx2  x0, ptr + pos, %%LEN, tmp, tmp4 ;Get next source vector
        add     vec_i, 8
        vbroadcastsd    xgft1, [mul_array]
        vbroadcastsd    xgft2, [mul_array + vec]
        add     mul_array, 8
        GF_MUL_XOR      VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2
        cmp     vec_i, vec
        jl      %%next_vect
        ;Store updated encoded data
        lea     ptr, [dest1 + pos]
-       simd_store_avx2 ptr, xp1, %%LEN, tmp, vec_i
+       simd_store_avx2 ptr, xp1, %%LEN, tmp, tmp4
        lea     ptr, [dest2 + pos]
-       simd_store_avx2 ptr, xp2, %%LEN, tmp, vec_i
+       simd_store_avx2 ptr, xp2, %%LEN, tmp, tmp4
 %endmacro
 align 16
 mk_global gf_2vect_dot_prod_avx2_gfni, function
 func(gf_2vect_dot_prod_avx2_gfni)
        FUNC_SAVE
        xor     pos, pos
        shl     vec, 3                  ;; vec *= 8. Make vec_i count by 8
        mov     dest1, [dest]
        mov     dest2, [dest + 8]
-       cmp     len, 64
-       jb      .len_lt_64
-.loop64:
-       ENCODE_64B_2
-       add     pos, 64                 ;; Loop on 64 bytes at a time first
-       sub     len, 64
-       cmp     len, 64
-       jge     .loop64
+       cmp     len, 96
+       jl      .len_lt_96
+.loop96:
+       ENCODE_96B_2
+       add     pos, 96                 ;; Loop on 96 bytes at a time first
+       sub     len, 96
+       cmp     len, 96
+       jge     .loop96
+.len_lt_96:
+       cmp     len, 64
+       jl      .len_lt_64
+       ENCODE_64B_2
+       add     pos, 64                 ;; encode next 64 bytes
+       sub     len, 64
 .len_lt_64:
        cmp     len, 32
-       jb      .len_lt_32
+       jl      .len_lt_32
        ENCODE_32B_2
        add     pos, 32                 ;; encode next 32 bytes
        sub     len, 32
 .len_lt_32:
        cmp     len, 0
        jle     .exit
        ENCODE_LT_32B_2 len             ;; encode remaining bytes
 .exit:
        vzeroupper
        FUNC_RESTORE
        ret
 endproc_frame
 %endif ; if AS_FEATURE_LEVEL >= 10
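The driver change follows from the new macro: the old code looped on 64-byte blocks, whereas now a 96-byte loop runs first and each smaller block size executes at most once on the remainder (the sub-32-byte tail uses simd_load_avx2/simd_store_avx2, whose scratch register the patch also switches from vec_i to tmp4). A minimal C model of the control flow, with a hypothetical stand-in for the block encoders:

    #include <stddef.h>

    /* Stand-in for ENCODE_96B_2 / ENCODE_64B_2 / ENCODE_32B_2 /
     * ENCODE_LT_32B_2 (hypothetical, for illustration only). */
    static void encode_block(size_t pos, size_t n) { (void)pos; (void)n; }

    static void drive(size_t len)
    {
            size_t pos = 0;
            while (len >= 96) {                      /* .loop96          */
                    encode_block(pos, 96);
                    pos += 96; len -= 96;
            }
            if (len >= 64) {                         /* .len_lt_96 block */
                    encode_block(pos, 64);
                    pos += 64; len -= 64;
            }
            if (len >= 32) {                         /* .len_lt_64 block */
                    encode_block(pos, 32);
                    pos += 32; len -= 32;
            }
            if (len > 0)                             /* ENCODE_LT_32B_2  */
                    encode_block(pos, len);
    }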