mirror of
https://github.com/intel/isa-l.git
synced 2025-11-01 11:52:52 +01:00
crc: only prefetch data that will be consumed for VPCLMUL functions
Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
This commit is contained in:
committed by
Marcel Cornu
parent
510de484c4
commit
e677f668c8
@@ -145,7 +145,13 @@ FUNCTION_NAME:
|
||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||
sub arg3, 256
|
||||
|
||||
.fold_256_B_loop:
|
||||
%if fetch_dist != 0
|
||||
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jb .fold_256_B_loop
|
||||
|
||||
align 16
|
||||
.fold_and_prefetch_256_B_loop:
|
||||
add arg2, 256
|
||||
PREFETCH [arg2+fetch_dist+0]
|
||||
vmovdqu8 zmm3, [arg2+16*0]
|
||||
@@ -175,6 +181,39 @@ FUNCTION_NAME:
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||
|
||||
sub arg3, 256
|
||||
|
||||
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jge .fold_and_prefetch_256_B_loop
|
||||
%endif ; fetch_dist != 0
|
||||
|
||||
.fold_256_B_loop:
|
||||
add arg2, 256
|
||||
vmovdqu8 zmm3, [arg2+16*0]
|
||||
vpshufb zmm3, zmm3, zmm18
|
||||
vpclmulqdq zmm1, zmm0, zmm16, 0x00
|
||||
vpclmulqdq zmm0, zmm0, zmm16, 0x11
|
||||
vpternlogq zmm0, zmm1, zmm3, 0x96
|
||||
|
||||
vmovdqu8 zmm9, [arg2+16*4]
|
||||
vpshufb zmm9, zmm9, zmm18
|
||||
vpclmulqdq zmm5, zmm4, zmm16, 0x00
|
||||
vpclmulqdq zmm4, zmm4, zmm16, 0x11
|
||||
vpternlogq zmm4, zmm5, zmm9, 0x96
|
||||
|
||||
vmovdqu8 zmm11, [arg2+16*8]
|
||||
vpshufb zmm11, zmm11, zmm18
|
||||
vpclmulqdq zmm12, zmm7, zmm16, 0x00
|
||||
vpclmulqdq zmm7, zmm7, zmm16, 0x11
|
||||
vpternlogq zmm7, zmm12, zmm11, 0x96
|
||||
|
||||
vmovdqu8 zmm17, [arg2+16*12]
|
||||
vpshufb zmm17, zmm17, zmm18
|
||||
vpclmulqdq zmm14, zmm8, zmm16, 0x00
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||
|
||||
sub arg3, 256
|
||||
jge .fold_256_B_loop
|
||||
|
||||
|
||||
@@ -133,8 +133,13 @@ FUNCTION_NAME:
|
||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||
sub arg3, 256
|
||||
|
||||
%if fetch_dist != 0
|
||||
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jb .fold_256_B_loop
|
||||
|
||||
align 16
|
||||
.fold_256_B_loop:
|
||||
.fold_and_prefetch_256_B_loop:
|
||||
add arg2, 256
|
||||
PREFETCH [arg2+fetch_dist+0]
|
||||
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||
@@ -156,6 +161,32 @@ align 16
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||
|
||||
sub arg3, 256
|
||||
|
||||
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jge .fold_and_prefetch_256_B_loop
|
||||
%endif ; fetch_dist != 0
|
||||
|
||||
align 16
|
||||
.fold_256_B_loop:
|
||||
add arg2, 256
|
||||
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||
vpclmulqdq zmm0, zmm0, zmm16, 0x01
|
||||
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
|
||||
|
||||
vpclmulqdq zmm2, zmm4, zmm16, 0x10
|
||||
vpclmulqdq zmm4, zmm4, zmm16, 0x01
|
||||
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
|
||||
|
||||
vpclmulqdq zmm3, zmm7, zmm16, 0x10
|
||||
vpclmulqdq zmm7, zmm7, zmm16, 0x01
|
||||
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
|
||||
|
||||
vpclmulqdq zmm5, zmm8, zmm16, 0x10
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||
|
||||
sub arg3, 256
|
||||
jge .fold_256_B_loop
|
||||
|
||||
|
||||
@@ -133,8 +133,13 @@ FUNCTION_NAME:
|
||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||
sub arg3, 256
|
||||
|
||||
%if fetch_dist != 0
|
||||
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jb .fold_256_B_loop
|
||||
|
||||
align 16
|
||||
.fold_256_B_loop:
|
||||
.fold_and_prefetch_256_B_loop:
|
||||
add arg2, 256
|
||||
vmovdqu8 zmm3, [arg2+16*0]
|
||||
PREFETCH [arg2+fetch_dist+0]
|
||||
@@ -164,6 +169,40 @@ align 16
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||
|
||||
sub arg3, 256
|
||||
|
||||
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jge .fold_and_prefetch_256_B_loop
|
||||
%endif ; fetch_dist != 0
|
||||
|
||||
align 16
|
||||
.fold_256_B_loop:
|
||||
add arg2, 256
|
||||
vmovdqu8 zmm3, [arg2+16*0]
|
||||
vpshufb zmm3, zmm3, zmm18
|
||||
vpclmulqdq zmm1, zmm0, zmm16, 0x00
|
||||
vpclmulqdq zmm0, zmm0, zmm16, 0x11
|
||||
vpternlogq zmm0, zmm1, zmm3, 0x96
|
||||
|
||||
vmovdqu8 zmm9, [arg2+16*4]
|
||||
vpshufb zmm9, zmm9, zmm18
|
||||
vpclmulqdq zmm5, zmm4, zmm16, 0x00
|
||||
vpclmulqdq zmm4, zmm4, zmm16, 0x11
|
||||
vpternlogq zmm4, zmm5, zmm9, 0x96
|
||||
|
||||
vmovdqu8 zmm11, [arg2+16*8]
|
||||
vpshufb zmm11, zmm11, zmm18
|
||||
vpclmulqdq zmm12, zmm7, zmm16, 0x00
|
||||
vpclmulqdq zmm7, zmm7, zmm16, 0x11
|
||||
vpternlogq zmm7, zmm12, zmm11, 0x96
|
||||
|
||||
vmovdqu8 zmm17, [arg2+16*12]
|
||||
vpshufb zmm17, zmm17, zmm18
|
||||
vpclmulqdq zmm14, zmm8, zmm16, 0x00
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||
|
||||
sub arg3, 256
|
||||
jge .fold_256_B_loop
|
||||
|
||||
|
||||
@@ -122,8 +122,13 @@ FUNCTION_NAME:
|
||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||
sub arg3, 256
|
||||
|
||||
%if fetch_dist != 0
|
||||
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jb .fold_256_B_loop
|
||||
|
||||
align 16
|
||||
.fold_256_B_loop:
|
||||
.fold_and_prefetch_256_B_loop:
|
||||
add arg2, 256
|
||||
PREFETCH [arg2+fetch_dist+0]
|
||||
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||
@@ -145,6 +150,32 @@ align 16
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||
|
||||
sub arg3, 256
|
||||
|
||||
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jge .fold_and_prefetch_256_B_loop
|
||||
%endif ; fetch_dist != 0
|
||||
|
||||
align 16
|
||||
.fold_256_B_loop:
|
||||
add arg2, 256
|
||||
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||
vpclmulqdq zmm0, zmm0, zmm16, 0x01
|
||||
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
|
||||
|
||||
vpclmulqdq zmm2, zmm4, zmm16, 0x10
|
||||
vpclmulqdq zmm4, zmm4, zmm16, 0x01
|
||||
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
|
||||
|
||||
vpclmulqdq zmm3, zmm7, zmm16, 0x10
|
||||
vpclmulqdq zmm7, zmm7, zmm16, 0x01
|
||||
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
|
||||
|
||||
vpclmulqdq zmm5, zmm8, zmm16, 0x10
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||
|
||||
sub arg3, 256
|
||||
jge .fold_256_B_loop
|
||||
|
||||
|
||||
@@ -115,30 +115,70 @@ FUNCTION_NAME:
|
||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||
sub arg3, 256
|
||||
|
||||
_fold_256_B_loop:
|
||||
%if fetch_dist != 0
|
||||
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jb _fold_256_B_loop
|
||||
|
||||
align 16
|
||||
_fold_and_prefetch_256_B_loop:
|
||||
add arg2, 256
|
||||
PREFETCH [arg2+fetch_dist+0]
|
||||
vmovdqu8 zmm3, [arg2+16*0]
|
||||
vpshufb zmm3, zmm3, zmm18
|
||||
vpclmulqdq zmm1, zmm0, zmm16, 0x00
|
||||
vpclmulqdq zmm0, zmm0, zmm16, 0x11
|
||||
vpternlogq zmm0, zmm1, zmm3, 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64]
|
||||
vmovdqu8 zmm9, [arg2+16*4]
|
||||
vpshufb zmm9, zmm9, zmm18
|
||||
vpclmulqdq zmm5, zmm4, zmm16, 0x00
|
||||
vpclmulqdq zmm4, zmm4, zmm16, 0x11
|
||||
vpternlogq zmm4, zmm5, zmm9, 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64*2]
|
||||
vmovdqu8 zmm11, [arg2+16*8]
|
||||
vpshufb zmm11, zmm11, zmm18
|
||||
vpclmulqdq zmm12, zmm7, zmm16, 0x00
|
||||
vpclmulqdq zmm7, zmm7, zmm16, 0x11
|
||||
vpternlogq zmm7, zmm12, zmm11, 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64*3]
|
||||
vmovdqu8 zmm17, [arg2+16*12]
|
||||
vpshufb zmm17, zmm17, zmm18
|
||||
vpclmulqdq zmm14, zmm8, zmm16, 0x00
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||
|
||||
sub arg3, 256
|
||||
|
||||
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jge _fold_and_prefetch_256_B_loop
|
||||
%endif
|
||||
|
||||
align 16
|
||||
_fold_256_B_loop:
|
||||
add arg2, 256
|
||||
PREFETCH [arg2+fetch_dist+0]
|
||||
vmovdqu8 zmm3, [arg2+16*0]
|
||||
vpshufb zmm3, zmm3, zmm18
|
||||
vpclmulqdq zmm1, zmm0, zmm16, 0x00
|
||||
vpclmulqdq zmm0, zmm0, zmm16, 0x11
|
||||
vpternlogq zmm0, zmm1, zmm3, 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64]
|
||||
vmovdqu8 zmm9, [arg2+16*4]
|
||||
vpshufb zmm9, zmm9, zmm18
|
||||
vpclmulqdq zmm5, zmm4, zmm16, 0x00
|
||||
vpclmulqdq zmm4, zmm4, zmm16, 0x11
|
||||
vpternlogq zmm4, zmm5, zmm9, 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64*2]
|
||||
vmovdqu8 zmm11, [arg2+16*8]
|
||||
vpshufb zmm11, zmm11, zmm18
|
||||
vpclmulqdq zmm12, zmm7, zmm16, 0x00
|
||||
vpclmulqdq zmm7, zmm7, zmm16, 0x11
|
||||
vpternlogq zmm7, zmm12, zmm11, 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64*3]
|
||||
vmovdqu8 zmm17, [arg2+16*12]
|
||||
vpshufb zmm17, zmm17, zmm18
|
||||
vpclmulqdq zmm14, zmm8, zmm16, 0x00
|
||||
|
||||
@@ -116,24 +116,56 @@ FUNCTION_NAME:
|
||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||
sub arg3, 256
|
||||
|
||||
_fold_256_B_loop:
|
||||
%if fetch_dist != 0
|
||||
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jb _fold_256_B_loop
|
||||
|
||||
align 16
|
||||
_fold_and_prefetch_256_B_loop:
|
||||
add arg2, 256
|
||||
PREFETCH [arg2+fetch_dist+0]
|
||||
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||
vpclmulqdq zmm0, zmm0, zmm16, 0x01
|
||||
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64]
|
||||
vpclmulqdq zmm2, zmm4, zmm16, 0x10
|
||||
vpclmulqdq zmm4, zmm4, zmm16, 0x01
|
||||
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64*2]
|
||||
vpclmulqdq zmm3, zmm7, zmm16, 0x10
|
||||
vpclmulqdq zmm7, zmm7, zmm16, 0x01
|
||||
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64*3]
|
||||
vpclmulqdq zmm5, zmm8, zmm16, 0x10
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||
|
||||
sub arg3, 256
|
||||
|
||||
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||
cmp arg3, (fetch_dist + 256)
|
||||
jge _fold_and_prefetch_256_B_loop
|
||||
%endif
|
||||
|
||||
align 16
|
||||
_fold_256_B_loop:
|
||||
add arg2, 256
|
||||
PREFETCH [arg2+fetch_dist+0]
|
||||
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||
vpclmulqdq zmm0, zmm0, zmm16, 0x01
|
||||
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64]
|
||||
vpclmulqdq zmm2, zmm4, zmm16, 0x10
|
||||
vpclmulqdq zmm4, zmm4, zmm16, 0x01
|
||||
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64*2]
|
||||
vpclmulqdq zmm3, zmm7, zmm16, 0x10
|
||||
vpclmulqdq zmm7, zmm7, zmm16, 0x01
|
||||
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
|
||||
|
||||
PREFETCH [arg2+fetch_dist+64*3]
|
||||
vpclmulqdq zmm5, zmm8, zmm16, 0x10
|
||||
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||
|
||||
Reference in New Issue
Block a user