crc: only prefetch data that will be consumed for VPCLMUL functions

Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
This commit is contained in:
Pablo de Lara
2025-08-11 07:49:37 +00:00
committed by Marcel Cornu
parent 510de484c4
commit e677f668c8
6 changed files with 226 additions and 14 deletions

View File

@@ -145,7 +145,13 @@ FUNCTION_NAME:
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
sub arg3, 256 sub arg3, 256
.fold_256_B_loop: %if fetch_dist != 0
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jb .fold_256_B_loop
align 16
.fold_and_prefetch_256_B_loop:
add arg2, 256 add arg2, 256
PREFETCH [arg2+fetch_dist+0] PREFETCH [arg2+fetch_dist+0]
vmovdqu8 zmm3, [arg2+16*0] vmovdqu8 zmm3, [arg2+16*0]
@@ -175,6 +181,39 @@ FUNCTION_NAME:
vpclmulqdq zmm8, zmm8, zmm16, 0x11 vpclmulqdq zmm8, zmm8, zmm16, 0x11
vpternlogq zmm8, zmm14, zmm17, 0x96 vpternlogq zmm8, zmm14, zmm17, 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jge .fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
; Tail loop with no PREFETCH. Reached when arg3 is below (fetch_dist + 256)
; at one of the preceding checks, or unconditionally when fetch_dist == 0
; (the prefetching variant is then not assembled).
; Each pass advances arg2 by 256 and folds the four 64-byte accumulators
; zmm0, zmm4, zmm7 and zmm8 with the constants in zmm16, XORing in the next
; 4 x 64 bytes of input read from arg2.
.fold_256_B_loop:
add arg2, 256
; accumulator zmm0 <- input bytes at [arg2 + 0]
vmovdqu8 zmm3, [arg2+16*0]
vpshufb zmm3, zmm3, zmm18 ; byte shuffle by the mask in zmm18 (presumably a byte reversal; mask setup is not visible here)
vpclmulqdq zmm1, zmm0, zmm16, 0x00 ; low qword x low qword in every 128-bit lane
vpclmulqdq zmm0, zmm0, zmm16, 0x11 ; high qword x high qword in every 128-bit lane
vpternlogq zmm0, zmm1, zmm3, 0x96 ; imm 0x96 = three-way XOR: zmm0 ^ zmm1 ^ zmm3
; accumulator zmm4 <- input bytes at [arg2 + 64]
vmovdqu8 zmm9, [arg2+16*4]
vpshufb zmm9, zmm9, zmm18
vpclmulqdq zmm5, zmm4, zmm16, 0x00
vpclmulqdq zmm4, zmm4, zmm16, 0x11
vpternlogq zmm4, zmm5, zmm9, 0x96
; accumulator zmm7 <- input bytes at [arg2 + 128]
vmovdqu8 zmm11, [arg2+16*8]
vpshufb zmm11, zmm11, zmm18
vpclmulqdq zmm12, zmm7, zmm16, 0x00
vpclmulqdq zmm7, zmm7, zmm16, 0x11
vpternlogq zmm7, zmm12, zmm11, 0x96
; accumulator zmm8 <- input bytes at [arg2 + 192]
vmovdqu8 zmm17, [arg2+16*12]
vpshufb zmm17, zmm17, zmm18
vpclmulqdq zmm14, zmm8, zmm16, 0x00
vpclmulqdq zmm8, zmm8, zmm16, 0x11
vpternlogq zmm8, zmm14, zmm17, 0x96
; Loop while the signed result of arg3 - 256 is non-negative.
; NOTE: the two lines below are unchanged context; the diff view renders the
; old and new columns on the same line, each is a single instruction.
sub arg3, 256 sub arg3, 256
jge .fold_256_B_loop jge .fold_256_B_loop

View File

@@ -133,8 +133,13 @@ FUNCTION_NAME:
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
sub arg3, 256 sub arg3, 256
%if fetch_dist != 0
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jb .fold_256_B_loop
align 16 align 16
.fold_256_B_loop: .fold_and_prefetch_256_B_loop:
add arg2, 256 add arg2, 256
PREFETCH [arg2+fetch_dist+0] PREFETCH [arg2+fetch_dist+0]
vpclmulqdq zmm1, zmm0, zmm16, 0x10 vpclmulqdq zmm1, zmm0, zmm16, 0x10
@@ -156,6 +161,32 @@ align 16
vpclmulqdq zmm8, zmm8, zmm16, 0x01 vpclmulqdq zmm8, zmm8, zmm16, 0x01
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96 vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jge .fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
; Tail loop with no PREFETCH. Reached when arg3 is below (fetch_dist + 256)
; at one of the preceding checks, or unconditionally when fetch_dist == 0
; (the prefetching variant is then not assembled).
; Each pass advances arg2 by 256 and folds the four 64-byte accumulators
; zmm0, zmm4, zmm7 and zmm8 with the constants in zmm16. The input is XORed
; in straight from memory (no byte shuffle in this variant).
align 16
.fold_256_B_loop:
add arg2, 256
; accumulator zmm0 <- input bytes at [arg2 + 0]
vpclmulqdq zmm1, zmm0, zmm16, 0x10 ; low qword of zmm0 x high qword of zmm16, per 128-bit lane
vpclmulqdq zmm0, zmm0, zmm16, 0x01 ; high qword of zmm0 x low qword of zmm16, per 128-bit lane
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96 ; imm 0x96 = three-way XOR with the memory operand
; accumulator zmm4 <- input bytes at [arg2 + 64]
vpclmulqdq zmm2, zmm4, zmm16, 0x10
vpclmulqdq zmm4, zmm4, zmm16, 0x01
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
; accumulator zmm7 <- input bytes at [arg2 + 128]
vpclmulqdq zmm3, zmm7, zmm16, 0x10
vpclmulqdq zmm7, zmm7, zmm16, 0x01
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
; accumulator zmm8 <- input bytes at [arg2 + 192]
vpclmulqdq zmm5, zmm8, zmm16, 0x10
vpclmulqdq zmm8, zmm8, zmm16, 0x01
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
; Loop while the signed result of arg3 - 256 is non-negative.
; NOTE: the two lines below are unchanged context; the diff view renders the
; old and new columns on the same line, each is a single instruction.
sub arg3, 256 sub arg3, 256
jge .fold_256_B_loop jge .fold_256_B_loop

View File

@@ -133,8 +133,13 @@ FUNCTION_NAME:
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
sub arg3, 256 sub arg3, 256
%if fetch_dist != 0
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jb .fold_256_B_loop
align 16 align 16
.fold_256_B_loop: .fold_and_prefetch_256_B_loop:
add arg2, 256 add arg2, 256
vmovdqu8 zmm3, [arg2+16*0] vmovdqu8 zmm3, [arg2+16*0]
PREFETCH [arg2+fetch_dist+0] PREFETCH [arg2+fetch_dist+0]
@@ -164,6 +169,40 @@ align 16
vpclmulqdq zmm8, zmm8, zmm16, 0x11 vpclmulqdq zmm8, zmm8, zmm16, 0x11
vpternlogq zmm8, zmm14, zmm17, 0x96 vpternlogq zmm8, zmm14, zmm17, 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jge .fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
; Tail loop with no PREFETCH. Reached when arg3 is below (fetch_dist + 256)
; at one of the preceding checks, or unconditionally when fetch_dist == 0
; (the prefetching variant is then not assembled).
; Each pass advances arg2 by 256 and folds the four 64-byte accumulators
; zmm0, zmm4, zmm7 and zmm8 with the constants in zmm16, XORing in the next
; 4 x 64 bytes of input read from arg2.
align 16
.fold_256_B_loop:
add arg2, 256
; accumulator zmm0 <- input bytes at [arg2 + 0]
vmovdqu8 zmm3, [arg2+16*0]
vpshufb zmm3, zmm3, zmm18 ; byte shuffle by the mask in zmm18 (presumably a byte reversal; mask setup is not visible here)
vpclmulqdq zmm1, zmm0, zmm16, 0x00 ; low qword x low qword in every 128-bit lane
vpclmulqdq zmm0, zmm0, zmm16, 0x11 ; high qword x high qword in every 128-bit lane
vpternlogq zmm0, zmm1, zmm3, 0x96 ; imm 0x96 = three-way XOR: zmm0 ^ zmm1 ^ zmm3
; accumulator zmm4 <- input bytes at [arg2 + 64]
vmovdqu8 zmm9, [arg2+16*4]
vpshufb zmm9, zmm9, zmm18
vpclmulqdq zmm5, zmm4, zmm16, 0x00
vpclmulqdq zmm4, zmm4, zmm16, 0x11
vpternlogq zmm4, zmm5, zmm9, 0x96
; accumulator zmm7 <- input bytes at [arg2 + 128]
vmovdqu8 zmm11, [arg2+16*8]
vpshufb zmm11, zmm11, zmm18
vpclmulqdq zmm12, zmm7, zmm16, 0x00
vpclmulqdq zmm7, zmm7, zmm16, 0x11
vpternlogq zmm7, zmm12, zmm11, 0x96
; accumulator zmm8 <- input bytes at [arg2 + 192]
vmovdqu8 zmm17, [arg2+16*12]
vpshufb zmm17, zmm17, zmm18
vpclmulqdq zmm14, zmm8, zmm16, 0x00
vpclmulqdq zmm8, zmm8, zmm16, 0x11
vpternlogq zmm8, zmm14, zmm17, 0x96
; Loop while the signed result of arg3 - 256 is non-negative.
; NOTE: the two lines below are unchanged context; the diff view renders the
; old and new columns on the same line, each is a single instruction.
sub arg3, 256 sub arg3, 256
jge .fold_256_B_loop jge .fold_256_B_loop

View File

@@ -122,8 +122,13 @@ FUNCTION_NAME:
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
sub arg3, 256 sub arg3, 256
%if fetch_dist != 0
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jb .fold_256_B_loop
align 16 align 16
.fold_256_B_loop: .fold_and_prefetch_256_B_loop:
add arg2, 256 add arg2, 256
PREFETCH [arg2+fetch_dist+0] PREFETCH [arg2+fetch_dist+0]
vpclmulqdq zmm1, zmm0, zmm16, 0x10 vpclmulqdq zmm1, zmm0, zmm16, 0x10
@@ -145,6 +150,32 @@ align 16
vpclmulqdq zmm8, zmm8, zmm16, 0x01 vpclmulqdq zmm8, zmm8, zmm16, 0x01
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96 vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jge .fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
; Tail loop with no PREFETCH. Reached when arg3 is below (fetch_dist + 256)
; at one of the preceding checks, or unconditionally when fetch_dist == 0
; (the prefetching variant is then not assembled).
; Each pass advances arg2 by 256 and folds the four 64-byte accumulators
; zmm0, zmm4, zmm7 and zmm8 with the constants in zmm16. The input is XORed
; in straight from memory (no byte shuffle in this variant).
align 16
.fold_256_B_loop:
add arg2, 256
; accumulator zmm0 <- input bytes at [arg2 + 0]
vpclmulqdq zmm1, zmm0, zmm16, 0x10 ; low qword of zmm0 x high qword of zmm16, per 128-bit lane
vpclmulqdq zmm0, zmm0, zmm16, 0x01 ; high qword of zmm0 x low qword of zmm16, per 128-bit lane
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96 ; imm 0x96 = three-way XOR with the memory operand
; accumulator zmm4 <- input bytes at [arg2 + 64]
vpclmulqdq zmm2, zmm4, zmm16, 0x10
vpclmulqdq zmm4, zmm4, zmm16, 0x01
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
; accumulator zmm7 <- input bytes at [arg2 + 128]
vpclmulqdq zmm3, zmm7, zmm16, 0x10
vpclmulqdq zmm7, zmm7, zmm16, 0x01
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
; accumulator zmm8 <- input bytes at [arg2 + 192]
vpclmulqdq zmm5, zmm8, zmm16, 0x10
vpclmulqdq zmm8, zmm8, zmm16, 0x01
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
; Loop while the signed result of arg3 - 256 is non-negative.
; NOTE: the two lines below are unchanged context; the diff view renders the
; old and new columns on the same line, each is a single instruction.
sub arg3, 256 sub arg3, 256
jge .fold_256_B_loop jge .fold_256_B_loop

View File

@@ -115,30 +115,70 @@ FUNCTION_NAME:
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
sub arg3, 256 sub arg3, 256
_fold_256_B_loop: %if fetch_dist != 0
; Prefetching fold loop, assembled only when fetch_dist != 0.
; It runs while arg3 is at least (fetch_dist + 256), so the PREFETCH
; addresses issued here are not past what the checks consider "in the buffer".
; Once arg3 drops below that threshold, control falls through to
; _fold_256_B_loop.
; NOTE(review): the entry check below branches with unsigned `jb`, while the
; loop-back check uses signed `jge`. The two agree only if arg3 is
; non-negative at this point - confirm against the code before this hunk.
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jb _fold_256_B_loop
align 16
_fold_and_prefetch_256_B_loop:
add arg2, 256
; accumulator zmm0 <- input bytes at [arg2 + 0]; prefetch 1st 64B line at fetch_dist
PREFETCH [arg2+fetch_dist+0]
vmovdqu8 zmm3, [arg2+16*0]
vpshufb zmm3, zmm3, zmm18 ; byte shuffle by the mask in zmm18 (presumably a byte reversal; mask setup is not visible here)
vpclmulqdq zmm1, zmm0, zmm16, 0x00 ; low qword x low qword in every 128-bit lane
vpclmulqdq zmm0, zmm0, zmm16, 0x11 ; high qword x high qword in every 128-bit lane
vpternlogq zmm0, zmm1, zmm3, 0x96 ; imm 0x96 = three-way XOR: zmm0 ^ zmm1 ^ zmm3
; accumulator zmm4 <- input bytes at [arg2 + 64]; prefetch 2nd 64B line
PREFETCH [arg2+fetch_dist+64]
vmovdqu8 zmm9, [arg2+16*4]
vpshufb zmm9, zmm9, zmm18
vpclmulqdq zmm5, zmm4, zmm16, 0x00
vpclmulqdq zmm4, zmm4, zmm16, 0x11
vpternlogq zmm4, zmm5, zmm9, 0x96
; accumulator zmm7 <- input bytes at [arg2 + 128]; prefetch 3rd 64B line
PREFETCH [arg2+fetch_dist+64*2]
vmovdqu8 zmm11, [arg2+16*8]
vpshufb zmm11, zmm11, zmm18
vpclmulqdq zmm12, zmm7, zmm16, 0x00
vpclmulqdq zmm7, zmm7, zmm16, 0x11
vpternlogq zmm7, zmm12, zmm11, 0x96
; accumulator zmm8 <- input bytes at [arg2 + 192]; prefetch 4th 64B line
PREFETCH [arg2+fetch_dist+64*3]
vmovdqu8 zmm17, [arg2+16*12]
vpshufb zmm17, zmm17, zmm18
vpclmulqdq zmm14, zmm8, zmm16, 0x00
vpclmulqdq zmm8, zmm8, zmm16, 0x11
vpternlogq zmm8, zmm14, zmm17, 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jge _fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
align 16
_fold_256_B_loop:
add arg2, 256 add arg2, 256
PREFETCH [arg2+fetch_dist+0]
vmovdqu8 zmm3, [arg2+16*0] vmovdqu8 zmm3, [arg2+16*0]
vpshufb zmm3, zmm3, zmm18 vpshufb zmm3, zmm3, zmm18
vpclmulqdq zmm1, zmm0, zmm16, 0x00 vpclmulqdq zmm1, zmm0, zmm16, 0x00
vpclmulqdq zmm0, zmm0, zmm16, 0x11 vpclmulqdq zmm0, zmm0, zmm16, 0x11
vpternlogq zmm0, zmm1, zmm3, 0x96 vpternlogq zmm0, zmm1, zmm3, 0x96
PREFETCH [arg2+fetch_dist+64]
vmovdqu8 zmm9, [arg2+16*4] vmovdqu8 zmm9, [arg2+16*4]
vpshufb zmm9, zmm9, zmm18 vpshufb zmm9, zmm9, zmm18
vpclmulqdq zmm5, zmm4, zmm16, 0x00 vpclmulqdq zmm5, zmm4, zmm16, 0x00
vpclmulqdq zmm4, zmm4, zmm16, 0x11 vpclmulqdq zmm4, zmm4, zmm16, 0x11
vpternlogq zmm4, zmm5, zmm9, 0x96 vpternlogq zmm4, zmm5, zmm9, 0x96
PREFETCH [arg2+fetch_dist+64*2]
vmovdqu8 zmm11, [arg2+16*8] vmovdqu8 zmm11, [arg2+16*8]
vpshufb zmm11, zmm11, zmm18 vpshufb zmm11, zmm11, zmm18
vpclmulqdq zmm12, zmm7, zmm16, 0x00 vpclmulqdq zmm12, zmm7, zmm16, 0x00
vpclmulqdq zmm7, zmm7, zmm16, 0x11 vpclmulqdq zmm7, zmm7, zmm16, 0x11
vpternlogq zmm7, zmm12, zmm11, 0x96 vpternlogq zmm7, zmm12, zmm11, 0x96
PREFETCH [arg2+fetch_dist+64*3]
vmovdqu8 zmm17, [arg2+16*12] vmovdqu8 zmm17, [arg2+16*12]
vpshufb zmm17, zmm17, zmm18 vpshufb zmm17, zmm17, zmm18
vpclmulqdq zmm14, zmm8, zmm16, 0x00 vpclmulqdq zmm14, zmm8, zmm16, 0x00

View File

@@ -116,24 +116,56 @@ FUNCTION_NAME:
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
sub arg3, 256 sub arg3, 256
_fold_256_B_loop: %if fetch_dist != 0
; Prefetching fold loop, assembled only when fetch_dist != 0.
; It runs while arg3 is at least (fetch_dist + 256), so the PREFETCH
; addresses issued here are not past what the checks consider "in the buffer".
; Once arg3 drops below that threshold, control falls through to
; _fold_256_B_loop.
; NOTE(review): the entry check below branches with unsigned `jb`, while the
; loop-back check uses signed `jge`. The two agree only if arg3 is
; non-negative at this point - confirm against the code before this hunk.
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jb _fold_256_B_loop
align 16
_fold_and_prefetch_256_B_loop:
add arg2, 256
; accumulator zmm0 <- input bytes at [arg2 + 0]; prefetch 1st 64B line at fetch_dist
PREFETCH [arg2+fetch_dist+0]
vpclmulqdq zmm1, zmm0, zmm16, 0x10 ; low qword of zmm0 x high qword of zmm16, per 128-bit lane
vpclmulqdq zmm0, zmm0, zmm16, 0x01 ; high qword of zmm0 x low qword of zmm16, per 128-bit lane
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96 ; imm 0x96 = three-way XOR with the memory operand
; accumulator zmm4 <- input bytes at [arg2 + 64]; prefetch 2nd 64B line
PREFETCH [arg2+fetch_dist+64]
vpclmulqdq zmm2, zmm4, zmm16, 0x10
vpclmulqdq zmm4, zmm4, zmm16, 0x01
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
; accumulator zmm7 <- input bytes at [arg2 + 128]; prefetch 3rd 64B line
PREFETCH [arg2+fetch_dist+64*2]
vpclmulqdq zmm3, zmm7, zmm16, 0x10
vpclmulqdq zmm7, zmm7, zmm16, 0x01
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
; accumulator zmm8 <- input bytes at [arg2 + 192]; prefetch 4th 64B line
PREFETCH [arg2+fetch_dist+64*3]
vpclmulqdq zmm5, zmm8, zmm16, 0x10
vpclmulqdq zmm8, zmm8, zmm16, 0x01
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
sub arg3, 256
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
cmp arg3, (fetch_dist + 256)
jge _fold_and_prefetch_256_B_loop
%endif ; fetch_dist != 0
align 16
_fold_256_B_loop:
add arg2, 256 add arg2, 256
PREFETCH [arg2+fetch_dist+0]
vpclmulqdq zmm1, zmm0, zmm16, 0x10 vpclmulqdq zmm1, zmm0, zmm16, 0x10
vpclmulqdq zmm0, zmm0, zmm16, 0x01 vpclmulqdq zmm0, zmm0, zmm16, 0x01
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96 vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
PREFETCH [arg2+fetch_dist+64]
vpclmulqdq zmm2, zmm4, zmm16, 0x10 vpclmulqdq zmm2, zmm4, zmm16, 0x10
vpclmulqdq zmm4, zmm4, zmm16, 0x01 vpclmulqdq zmm4, zmm4, zmm16, 0x01
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96 vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
PREFETCH [arg2+fetch_dist+64*2]
vpclmulqdq zmm3, zmm7, zmm16, 0x10 vpclmulqdq zmm3, zmm7, zmm16, 0x10
vpclmulqdq zmm7, zmm7, zmm16, 0x01 vpclmulqdq zmm7, zmm7, zmm16, 0x01
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96 vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
PREFETCH [arg2+fetch_dist+64*3]
vpclmulqdq zmm5, zmm8, zmm16, 0x10 vpclmulqdq zmm5, zmm8, zmm16, 0x10
vpclmulqdq zmm8, zmm8, zmm16, 0x01 vpclmulqdq zmm8, zmm8, zmm16, 0x01
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96 vpternlogq zmm8, zmm5, [arg2+16*12], 0x96