crc: only prefetch data that will be consumed for VPCLMUL functions

Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
This commit is contained in:
Pablo de Lara
2025-08-11 07:49:37 +00:00
committed by Marcel Cornu
parent 510de484c4
commit e677f668c8
6 changed files with 226 additions and 14 deletions

View File

@@ -145,7 +145,13 @@ FUNCTION_NAME:
; Entry to the 256-byte folding stage: load the fold constants, pre-decrement
; the counter, then choose between the prefetching and non-prefetching loops.
; vbroadcasti32x4 copies the 128-bit value at rk_1 into all four lanes of zmm16.
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; arg3 is presumably the remaining byte count (the comments below speak of
; "the buffer"); it is lowered by one 256-byte block before the loops start.
sub arg3, 256
; NOTE(review): .fold_256_B_loop is defined here and again after the %endif
; further down in this listing. NASM rejects a label defined twice in the same
; scope, and a "jb" to this spot would re-run the check forever. This looks
; like a removed diff line whose '-' marker was lost -- confirm.
.fold_256_B_loop:
%if fetch_dist != 0
; check that at least fetch_dist (1.5KB per the original comment; the value is
; defined outside this view) + 256B remain in the buffer
; NOTE(review): this entry test is unsigned (jb) while the loop-back test on
; the same threshold is signed (jge). They agree only while arg3 is
; non-negative here -- confirm against the length check before this point.
cmp arg3, (fetch_dist + 256)
jb .fold_256_B_loop
align 16
.fold_and_prefetch_256_B_loop:
; advance the data pointer by one 256-byte block
add arg2, 256
; PREFETCH is a macro defined outside this view; it targets the address
; fetch_dist bytes ahead of the block being folded
PREFETCH [arg2+fetch_dist+0]
; load the first 64 bytes of the block
vmovdqu8 zmm3, [arg2+16*0]
@@ -175,6 +181,39 @@ FUNCTION_NAME:
; (tail of the prefetching loop; its middle part is not shown in this listing)
vpclmulqdq zmm8, zmm8, zmm16, 0x11
; imm 0x96 is a three-input XOR: zmm8 = zmm8 ^ zmm14 ^ zmm17
vpternlogq zmm8, zmm14, zmm17, 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
; (signed compare; falls through to the non-prefetching loop when there is not)
cmp arg3, (fetch_dist + 256)
jge .fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
; Non-prefetching fold loop: the same 256-byte fold step with no PREFETCH.
; Reached by fall-through from the prefetching loop, by the entry "jb", or
; directly when fetch_dist == 0 (the %if block above is then not assembled).
; NOTE(review): this label also appears earlier in the listing, right after the
; first "sub arg3, 256"; only one definition can exist in the real file.
.fold_256_B_loop:
; advance the data pointer; the four loads below cover [arg2, arg2+256)
add arg2, 256
; Each group of five instructions handles one 64-byte slice:
;   load it, shuffle its bytes with the control vector in zmm18 (presumably a
;   byte-order mask set up outside this view), carry-less multiply the low
;   (0x00) and high (0x11) qwords of each 128-bit accumulator lane by the
;   matching qwords of zmm16, then XOR both products with the data (0x96).
vmovdqu8 zmm3, [arg2+16*0]
vpshufb zmm3, zmm3, zmm18
vpclmulqdq zmm1, zmm0, zmm16, 0x00
vpclmulqdq zmm0, zmm0, zmm16, 0x11
vpternlogq zmm0, zmm1, zmm3, 0x96
; second slice: bytes 64..127, accumulator zmm4
vmovdqu8 zmm9, [arg2+16*4]
vpshufb zmm9, zmm9, zmm18
vpclmulqdq zmm5, zmm4, zmm16, 0x00
vpclmulqdq zmm4, zmm4, zmm16, 0x11
vpternlogq zmm4, zmm5, zmm9, 0x96
; third slice: bytes 128..191, accumulator zmm7
vmovdqu8 zmm11, [arg2+16*8]
vpshufb zmm11, zmm11, zmm18
vpclmulqdq zmm12, zmm7, zmm16, 0x00
vpclmulqdq zmm7, zmm7, zmm16, 0x11
vpternlogq zmm7, zmm12, zmm11, 0x96
; fourth slice: bytes 192..255, accumulator zmm8
vmovdqu8 zmm17, [arg2+16*12]
vpshufb zmm17, zmm17, zmm18
vpclmulqdq zmm14, zmm8, zmm16, 0x00
vpclmulqdq zmm8, zmm8, zmm16, 0x11
vpternlogq zmm8, zmm14, zmm17, 0x96
; loop while the counter stays non-negative after removing this block (signed)
sub arg3, 256
jge .fold_256_B_loop

View File

@@ -133,8 +133,13 @@ FUNCTION_NAME:
; Entry to the 256-byte folding stage: load the fold constants, pre-decrement
; the counter, then choose between the prefetching and non-prefetching loops.
; vbroadcasti32x4 copies the 128-bit value at rk_1 into all four lanes of zmm16.
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; arg3 is presumably the remaining byte count; it is lowered by one 256-byte
; block before the loops start.
sub arg3, 256
%if fetch_dist != 0
; check that at least fetch_dist (1.5KB per the original comment; the value is
; defined outside this view) + 256B remain in the buffer
; NOTE(review): unsigned test here (jb) versus signed loop-back test (jge) on
; the same threshold; equivalent only while arg3 is non-negative -- confirm.
cmp arg3, (fetch_dist + 256)
jb .fold_256_B_loop
align 16
; NOTE(review): .fold_256_B_loop is listed here and again after the %endif.
; As written, the "jb" above would land on the prefetching code and NASM would
; reject the second definition. This line is probably a removed diff line
; whose '-' marker was lost -- confirm.
.fold_256_B_loop:
.fold_and_prefetch_256_B_loop:
; advance the data pointer by one 256-byte block
add arg2, 256
; PREFETCH is a macro defined outside this view; it targets the address
; fetch_dist bytes ahead of the block being folded
PREFETCH [arg2+fetch_dist+0]
; imm 0x10: low qword of zmm0 times high qword of zmm16, per 128-bit lane
vpclmulqdq zmm1, zmm0, zmm16, 0x10
@@ -156,6 +161,32 @@ align 16
; (tail of the prefetching loop; its middle part is not shown in this listing)
vpclmulqdq zmm8, zmm8, zmm16, 0x01
; imm 0x96 is a three-input XOR: zmm8 = zmm8 ^ zmm5 ^ 64 bytes at arg2+192
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
; (signed compare; falls through to the non-prefetching loop when there is not)
cmp arg3, (fetch_dist + 256)
jge .fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
; Non-prefetching fold loop: the same 256-byte fold step with no PREFETCH.
; Reached by fall-through from the prefetching loop, by the entry "jb", or
; directly when fetch_dist == 0 (the %if block above is then not assembled).
align 16
; NOTE(review): this label also appears earlier in the listing, next to
; .fold_and_prefetch_256_B_loop; only one definition can exist in the real file.
.fold_256_B_loop:
; advance the data pointer; the memory operands below cover [arg2, arg2+256)
add arg2, 256
; Each group of three instructions folds one accumulator with one 64-byte
; slice read straight from memory:
;   0x10 = low qword of the accumulator  x high qword of zmm16 (per 128-bit lane)
;   0x01 = high qword of the accumulator x low qword of zmm16
;   0x96 = XOR of both products with the data
vpclmulqdq zmm1, zmm0, zmm16, 0x10
vpclmulqdq zmm0, zmm0, zmm16, 0x01
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
; second slice: bytes 64..127, accumulator zmm4
vpclmulqdq zmm2, zmm4, zmm16, 0x10
vpclmulqdq zmm4, zmm4, zmm16, 0x01
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
; third slice: bytes 128..191, accumulator zmm7
vpclmulqdq zmm3, zmm7, zmm16, 0x10
vpclmulqdq zmm7, zmm7, zmm16, 0x01
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
; fourth slice: bytes 192..255, accumulator zmm8
vpclmulqdq zmm5, zmm8, zmm16, 0x10
vpclmulqdq zmm8, zmm8, zmm16, 0x01
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
; loop while the counter stays non-negative after removing this block (signed)
sub arg3, 256
jge .fold_256_B_loop

View File

@@ -133,8 +133,13 @@ FUNCTION_NAME:
; Entry to the 256-byte folding stage: load the fold constants, pre-decrement
; the counter, then choose between the prefetching and non-prefetching loops.
; vbroadcasti32x4 copies the 128-bit value at rk_1 into all four lanes of zmm16.
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; arg3 is presumably the remaining byte count; it is lowered by one 256-byte
; block before the loops start.
sub arg3, 256
%if fetch_dist != 0
; check that at least fetch_dist (1.5KB per the original comment; the value is
; defined outside this view) + 256B remain in the buffer
; NOTE(review): unsigned test here (jb) versus signed loop-back test (jge) on
; the same threshold; equivalent only while arg3 is non-negative -- confirm.
cmp arg3, (fetch_dist + 256)
jb .fold_256_B_loop
align 16
; NOTE(review): .fold_256_B_loop is listed here and again after the %endif.
; As written, the "jb" above would land on the prefetching code and NASM would
; reject the second definition. This line is probably a removed diff line
; whose '-' marker was lost -- confirm.
.fold_256_B_loop:
.fold_and_prefetch_256_B_loop:
; advance the data pointer by one 256-byte block
add arg2, 256
; load the first 64 bytes of the block
vmovdqu8 zmm3, [arg2+16*0]
; PREFETCH is a macro defined outside this view; it targets the address
; fetch_dist bytes ahead of the block being folded
PREFETCH [arg2+fetch_dist+0]
@@ -164,6 +169,40 @@ align 16
; (tail of the prefetching loop; its middle part is not shown in this listing)
vpclmulqdq zmm8, zmm8, zmm16, 0x11
; imm 0x96 is a three-input XOR: zmm8 = zmm8 ^ zmm14 ^ zmm17
vpternlogq zmm8, zmm14, zmm17, 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
; (signed compare; falls through to the non-prefetching loop when there is not)
cmp arg3, (fetch_dist + 256)
jge .fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
; Non-prefetching fold loop: the same 256-byte fold step with no PREFETCH.
; Reached by fall-through from the prefetching loop, by the entry "jb", or
; directly when fetch_dist == 0 (the %if block above is then not assembled).
align 16
; NOTE(review): this label also appears earlier in the listing, next to
; .fold_and_prefetch_256_B_loop; only one definition can exist in the real file.
.fold_256_B_loop:
; advance the data pointer; the four loads below cover [arg2, arg2+256)
add arg2, 256
; Each group of five instructions handles one 64-byte slice:
;   load it, shuffle its bytes with the control vector in zmm18 (presumably a
;   byte-order mask set up outside this view), carry-less multiply the low
;   (0x00) and high (0x11) qwords of each 128-bit accumulator lane by the
;   matching qwords of zmm16, then XOR both products with the data (0x96).
vmovdqu8 zmm3, [arg2+16*0]
vpshufb zmm3, zmm3, zmm18
vpclmulqdq zmm1, zmm0, zmm16, 0x00
vpclmulqdq zmm0, zmm0, zmm16, 0x11
vpternlogq zmm0, zmm1, zmm3, 0x96
; second slice: bytes 64..127, accumulator zmm4
vmovdqu8 zmm9, [arg2+16*4]
vpshufb zmm9, zmm9, zmm18
vpclmulqdq zmm5, zmm4, zmm16, 0x00
vpclmulqdq zmm4, zmm4, zmm16, 0x11
vpternlogq zmm4, zmm5, zmm9, 0x96
; third slice: bytes 128..191, accumulator zmm7
vmovdqu8 zmm11, [arg2+16*8]
vpshufb zmm11, zmm11, zmm18
vpclmulqdq zmm12, zmm7, zmm16, 0x00
vpclmulqdq zmm7, zmm7, zmm16, 0x11
vpternlogq zmm7, zmm12, zmm11, 0x96
; fourth slice: bytes 192..255, accumulator zmm8
vmovdqu8 zmm17, [arg2+16*12]
vpshufb zmm17, zmm17, zmm18
vpclmulqdq zmm14, zmm8, zmm16, 0x00
vpclmulqdq zmm8, zmm8, zmm16, 0x11
vpternlogq zmm8, zmm14, zmm17, 0x96
; loop while the counter stays non-negative after removing this block (signed)
sub arg3, 256
jge .fold_256_B_loop

View File

@@ -122,8 +122,13 @@ FUNCTION_NAME:
; Entry to the 256-byte folding stage: load the fold constants, pre-decrement
; the counter, then choose between the prefetching and non-prefetching loops.
; vbroadcasti32x4 copies the 128-bit value at rk_1 into all four lanes of zmm16.
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; arg3 is presumably the remaining byte count; it is lowered by one 256-byte
; block before the loops start.
sub arg3, 256
%if fetch_dist != 0
; check that at least fetch_dist (1.5KB per the original comment; the value is
; defined outside this view) + 256B remain in the buffer
; NOTE(review): unsigned test here (jb) versus signed loop-back test (jge) on
; the same threshold; equivalent only while arg3 is non-negative -- confirm.
cmp arg3, (fetch_dist + 256)
jb .fold_256_B_loop
align 16
; NOTE(review): .fold_256_B_loop is listed here and again after the %endif.
; As written, the "jb" above would land on the prefetching code and NASM would
; reject the second definition. This line is probably a removed diff line
; whose '-' marker was lost -- confirm.
.fold_256_B_loop:
.fold_and_prefetch_256_B_loop:
; advance the data pointer by one 256-byte block
add arg2, 256
; PREFETCH is a macro defined outside this view; it targets the address
; fetch_dist bytes ahead of the block being folded
PREFETCH [arg2+fetch_dist+0]
; imm 0x10: low qword of zmm0 times high qword of zmm16, per 128-bit lane
vpclmulqdq zmm1, zmm0, zmm16, 0x10
@@ -145,6 +150,32 @@ align 16
; (tail of the prefetching loop; its middle part is not shown in this listing)
vpclmulqdq zmm8, zmm8, zmm16, 0x01
; imm 0x96 is a three-input XOR: zmm8 = zmm8 ^ zmm5 ^ 64 bytes at arg2+192
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
; (signed compare; falls through to the non-prefetching loop when there is not)
cmp arg3, (fetch_dist + 256)
jge .fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
; Non-prefetching fold loop: the same 256-byte fold step with no PREFETCH.
; Reached by fall-through from the prefetching loop, by the entry "jb", or
; directly when fetch_dist == 0 (the %if block above is then not assembled).
align 16
; NOTE(review): this label also appears earlier in the listing, next to
; .fold_and_prefetch_256_B_loop; only one definition can exist in the real file.
.fold_256_B_loop:
; advance the data pointer; the memory operands below cover [arg2, arg2+256)
add arg2, 256
; Each group of three instructions folds one accumulator with one 64-byte
; slice read straight from memory:
;   0x10 = low qword of the accumulator  x high qword of zmm16 (per 128-bit lane)
;   0x01 = high qword of the accumulator x low qword of zmm16
;   0x96 = XOR of both products with the data
vpclmulqdq zmm1, zmm0, zmm16, 0x10
vpclmulqdq zmm0, zmm0, zmm16, 0x01
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
; second slice: bytes 64..127, accumulator zmm4
vpclmulqdq zmm2, zmm4, zmm16, 0x10
vpclmulqdq zmm4, zmm4, zmm16, 0x01
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
; third slice: bytes 128..191, accumulator zmm7
vpclmulqdq zmm3, zmm7, zmm16, 0x10
vpclmulqdq zmm7, zmm7, zmm16, 0x01
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
; fourth slice: bytes 192..255, accumulator zmm8
vpclmulqdq zmm5, zmm8, zmm16, 0x10
vpclmulqdq zmm8, zmm8, zmm16, 0x01
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
; loop while the counter stays non-negative after removing this block (signed)
sub arg3, 256
jge .fold_256_B_loop

View File

@@ -115,30 +115,70 @@ FUNCTION_NAME:
; Entry to the 256-byte folding stage, followed by the prefetching fold loop.
; vbroadcasti32x4 copies the 128-bit value at rk_1 into all four lanes of zmm16.
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; arg3 is presumably the remaining byte count; it is lowered by one 256-byte
; block before the loops start.
sub arg3, 256
; NOTE(review): _fold_256_B_loop is defined here and again after the %endif
; below. NASM rejects a duplicate label, and a "jb" to this spot would re-run
; the check forever. Probably a removed diff line whose '-' marker was lost --
; confirm.
_fold_256_B_loop:
%if fetch_dist != 0
; check that at least fetch_dist (1.5KB per the original comment; the value is
; defined outside this view) + 256B remain in the buffer
; NOTE(review): unsigned test here (jb) versus signed loop-back test (jge) on
; the same threshold; equivalent only while arg3 is non-negative -- confirm.
cmp arg3, (fetch_dist + 256)
jb _fold_256_B_loop
align 16
_fold_and_prefetch_256_B_loop:
; advance the data pointer; the four loads below cover [arg2, arg2+256)
add arg2, 256
; Each group of six instructions handles one 64-byte slice:
;   PREFETCH (macro defined outside this view) targets the matching 64 bytes
;   fetch_dist ahead; then load the slice, shuffle its bytes with zmm18
;   (presumably a byte-order mask set up outside this view), carry-less
;   multiply the low (0x00) and high (0x11) qwords of each 128-bit accumulator
;   lane by the matching qwords of zmm16, and XOR both products with the data
;   (0x96 = three-input XOR).
PREFETCH [arg2+fetch_dist+0]
vmovdqu8 zmm3, [arg2+16*0]
vpshufb zmm3, zmm3, zmm18
vpclmulqdq zmm1, zmm0, zmm16, 0x00
vpclmulqdq zmm0, zmm0, zmm16, 0x11
vpternlogq zmm0, zmm1, zmm3, 0x96
; second slice: bytes 64..127, accumulator zmm4
PREFETCH [arg2+fetch_dist+64]
vmovdqu8 zmm9, [arg2+16*4]
vpshufb zmm9, zmm9, zmm18
vpclmulqdq zmm5, zmm4, zmm16, 0x00
vpclmulqdq zmm4, zmm4, zmm16, 0x11
vpternlogq zmm4, zmm5, zmm9, 0x96
; third slice: bytes 128..191, accumulator zmm7
PREFETCH [arg2+fetch_dist+64*2]
vmovdqu8 zmm11, [arg2+16*8]
vpshufb zmm11, zmm11, zmm18
vpclmulqdq zmm12, zmm7, zmm16, 0x00
vpclmulqdq zmm7, zmm7, zmm16, 0x11
vpternlogq zmm7, zmm12, zmm11, 0x96
; fourth slice: bytes 192..255, accumulator zmm8
PREFETCH [arg2+fetch_dist+64*3]
vmovdqu8 zmm17, [arg2+16*12]
vpshufb zmm17, zmm17, zmm18
vpclmulqdq zmm14, zmm8, zmm16, 0x00
vpclmulqdq zmm8, zmm8, zmm16, 0x11
vpternlogq zmm8, zmm14, zmm17, 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
; (signed compare; falls through to the loop after %endif when there is not)
cmp arg3, (fetch_dist + 256)
jge _fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
align 16
; Fold loop reached by fall-through from the prefetching loop, by the entry
; "jb", or directly when fetch_dist == 0. The listing is cut off before the
; end of the loop body.
; NOTE(review): as listed, this loop still issues PREFETCH at
; arg2+fetch_dist+..., which is what the %if-guarded loop above was added to
; avoid once fewer than fetch_dist + 256 bytes remain. The matching loops in
; the .fold_256_B_loop hunks of this listing contain no PREFETCH. These may be
; removed diff lines whose '-' markers were lost -- confirm against the file.
; NOTE(review): _fold_256_B_loop also appears right after the first
; "sub arg3, 256" earlier in this hunk; only one definition can exist.
_fold_256_B_loop:
; advance the data pointer; the four loads below cover [arg2, arg2+256)
add arg2, 256
; Per 64-byte slice: load, shuffle bytes with zmm18, carry-less multiply the
; low (0x00) / high (0x11) qwords of each 128-bit accumulator lane by zmm16,
; then XOR both products with the data (0x96 = three-input XOR).
PREFETCH [arg2+fetch_dist+0]
vmovdqu8 zmm3, [arg2+16*0]
vpshufb zmm3, zmm3, zmm18
vpclmulqdq zmm1, zmm0, zmm16, 0x00
vpclmulqdq zmm0, zmm0, zmm16, 0x11
vpternlogq zmm0, zmm1, zmm3, 0x96
; second slice: bytes 64..127, accumulator zmm4
PREFETCH [arg2+fetch_dist+64]
vmovdqu8 zmm9, [arg2+16*4]
vpshufb zmm9, zmm9, zmm18
vpclmulqdq zmm5, zmm4, zmm16, 0x00
vpclmulqdq zmm4, zmm4, zmm16, 0x11
vpternlogq zmm4, zmm5, zmm9, 0x96
; third slice: bytes 128..191, accumulator zmm7
PREFETCH [arg2+fetch_dist+64*2]
vmovdqu8 zmm11, [arg2+16*8]
vpshufb zmm11, zmm11, zmm18
vpclmulqdq zmm12, zmm7, zmm16, 0x00
vpclmulqdq zmm7, zmm7, zmm16, 0x11
vpternlogq zmm7, zmm12, zmm11, 0x96
; fourth slice: bytes 192..255, accumulator zmm8 (remainder not shown)
PREFETCH [arg2+fetch_dist+64*3]
vmovdqu8 zmm17, [arg2+16*12]
vpshufb zmm17, zmm17, zmm18
vpclmulqdq zmm14, zmm8, zmm16, 0x00

View File

@@ -116,24 +116,56 @@ FUNCTION_NAME:
; Entry to the 256-byte folding stage, followed by the prefetching fold loop.
; vbroadcasti32x4 copies the 128-bit value at rk_1 into all four lanes of zmm16.
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; arg3 is presumably the remaining byte count; it is lowered by one 256-byte
; block before the loops start.
sub arg3, 256
; NOTE(review): _fold_256_B_loop is defined here and again after the %endif
; below. NASM rejects a duplicate label, and a "jb" to this spot would re-run
; the check forever. Probably a removed diff line whose '-' marker was lost --
; confirm.
_fold_256_B_loop:
%if fetch_dist != 0
; check that at least fetch_dist (1.5KB per the original comment; the value is
; defined outside this view) + 256B remain in the buffer
; NOTE(review): unsigned test here (jb) versus signed loop-back test (jge) on
; the same threshold; equivalent only while arg3 is non-negative -- confirm.
cmp arg3, (fetch_dist + 256)
jb _fold_256_B_loop
align 16
_fold_and_prefetch_256_B_loop:
; advance the data pointer; the memory operands below cover [arg2, arg2+256)
add arg2, 256
; Each group of four instructions handles one 64-byte slice:
;   PREFETCH (macro defined outside this view) targets the matching 64 bytes
;   fetch_dist ahead; then
;   0x10 = low qword of the accumulator  x high qword of zmm16 (per 128-bit lane)
;   0x01 = high qword of the accumulator x low qword of zmm16
;   0x96 = XOR of both products with the 64 bytes read from memory
PREFETCH [arg2+fetch_dist+0]
vpclmulqdq zmm1, zmm0, zmm16, 0x10
vpclmulqdq zmm0, zmm0, zmm16, 0x01
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
; second slice: bytes 64..127, accumulator zmm4
PREFETCH [arg2+fetch_dist+64]
vpclmulqdq zmm2, zmm4, zmm16, 0x10
vpclmulqdq zmm4, zmm4, zmm16, 0x01
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
; third slice: bytes 128..191, accumulator zmm7
PREFETCH [arg2+fetch_dist+64*2]
vpclmulqdq zmm3, zmm7, zmm16, 0x10
vpclmulqdq zmm7, zmm7, zmm16, 0x01
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
; fourth slice: bytes 192..255, accumulator zmm8
PREFETCH [arg2+fetch_dist+64*3]
vpclmulqdq zmm5, zmm8, zmm16, 0x10
vpclmulqdq zmm8, zmm8, zmm16, 0x01
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
; (signed compare; falls through to the loop after %endif when there is not)
cmp arg3, (fetch_dist + 256)
jge _fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
align 16
; Fold loop reached by fall-through from the prefetching loop, by the entry
; "jb", or directly when fetch_dist == 0. The listing ends before the loop's
; counter update and back-branch.
; NOTE(review): as listed, this loop still issues PREFETCH at
; arg2+fetch_dist+..., which is what the %if-guarded loop above was added to
; avoid once fewer than fetch_dist + 256 bytes remain. The matching loops in
; the .fold_256_B_loop hunks of this listing contain no PREFETCH. These may be
; removed diff lines whose '-' markers were lost -- confirm against the file.
; NOTE(review): _fold_256_B_loop also appears right after the first
; "sub arg3, 256" earlier in this hunk; only one definition can exist.
_fold_256_B_loop:
; advance the data pointer; the memory operands below cover [arg2, arg2+256)
add arg2, 256
; Per 64-byte slice:
;   0x10 = low qword of the accumulator  x high qword of zmm16 (per 128-bit lane)
;   0x01 = high qword of the accumulator x low qword of zmm16
;   0x96 = XOR of both products with the 64 bytes read from memory
PREFETCH [arg2+fetch_dist+0]
vpclmulqdq zmm1, zmm0, zmm16, 0x10
vpclmulqdq zmm0, zmm0, zmm16, 0x01
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
; second slice: bytes 64..127, accumulator zmm4
PREFETCH [arg2+fetch_dist+64]
vpclmulqdq zmm2, zmm4, zmm16, 0x10
vpclmulqdq zmm4, zmm4, zmm16, 0x01
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
; third slice: bytes 128..191, accumulator zmm7
PREFETCH [arg2+fetch_dist+64*2]
vpclmulqdq zmm3, zmm7, zmm16, 0x10
vpclmulqdq zmm7, zmm7, zmm16, 0x01
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
; fourth slice: bytes 192..255, accumulator zmm8
PREFETCH [arg2+fetch_dist+64*3]
vpclmulqdq zmm5, zmm8, zmm16, 0x10
vpclmulqdq zmm8, zmm8, zmm16, 0x01
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96