Mirror of https://github.com/intel/isa-l.git (last synced 2025-11-04 20:30:59 +01:00)
crc: only prefetch data that will be consumed for VPCLMUL functions
Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
This commit is contained in:
Committed by: Marcel Cornu
Parent: 510de484c4
Commit: e677f668c8
@@ -145,7 +145,13 @@ FUNCTION_NAME:
|
|||||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
|
|
||||||
.fold_256_B_loop:
|
%if fetch_dist != 0
|
||||||
|
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jb .fold_256_B_loop
|
||||||
|
|
||||||
|
align 16
|
||||||
|
.fold_and_prefetch_256_B_loop:
|
||||||
add arg2, 256
|
add arg2, 256
|
||||||
PREFETCH [arg2+fetch_dist+0]
|
PREFETCH [arg2+fetch_dist+0]
|
||||||
vmovdqu8 zmm3, [arg2+16*0]
|
vmovdqu8 zmm3, [arg2+16*0]
|
||||||
@@ -175,6 +181,39 @@ FUNCTION_NAME:
|
|||||||
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||||
vpternlogq zmm8, zmm14, zmm17, 0x96
|
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||||
|
|
||||||
|
sub arg3, 256
|
||||||
|
|
||||||
|
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jge .fold_and_prefetch_256_B_loop
|
||||||
|
%endif ; fetch_dist != 0
|
||||||
|
|
||||||
|
.fold_256_B_loop:
|
||||||
|
add arg2, 256
|
||||||
|
vmovdqu8 zmm3, [arg2+16*0]
|
||||||
|
vpshufb zmm3, zmm3, zmm18
|
||||||
|
vpclmulqdq zmm1, zmm0, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm0, zmm0, zmm16, 0x11
|
||||||
|
vpternlogq zmm0, zmm1, zmm3, 0x96
|
||||||
|
|
||||||
|
vmovdqu8 zmm9, [arg2+16*4]
|
||||||
|
vpshufb zmm9, zmm9, zmm18
|
||||||
|
vpclmulqdq zmm5, zmm4, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm4, zmm4, zmm16, 0x11
|
||||||
|
vpternlogq zmm4, zmm5, zmm9, 0x96
|
||||||
|
|
||||||
|
vmovdqu8 zmm11, [arg2+16*8]
|
||||||
|
vpshufb zmm11, zmm11, zmm18
|
||||||
|
vpclmulqdq zmm12, zmm7, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm7, zmm7, zmm16, 0x11
|
||||||
|
vpternlogq zmm7, zmm12, zmm11, 0x96
|
||||||
|
|
||||||
|
vmovdqu8 zmm17, [arg2+16*12]
|
||||||
|
vpshufb zmm17, zmm17, zmm18
|
||||||
|
vpclmulqdq zmm14, zmm8, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||||
|
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||||
|
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
jge .fold_256_B_loop
|
jge .fold_256_B_loop
|
||||||
|
|
||||||
|
|||||||
@@ -133,8 +133,13 @@ FUNCTION_NAME:
|
|||||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
|
|
||||||
|
%if fetch_dist != 0
|
||||||
|
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jb .fold_256_B_loop
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
.fold_256_B_loop:
|
.fold_and_prefetch_256_B_loop:
|
||||||
add arg2, 256
|
add arg2, 256
|
||||||
PREFETCH [arg2+fetch_dist+0]
|
PREFETCH [arg2+fetch_dist+0]
|
||||||
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||||
@@ -156,6 +161,32 @@ align 16
|
|||||||
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||||
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||||
|
|
||||||
|
sub arg3, 256
|
||||||
|
|
||||||
|
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jge .fold_and_prefetch_256_B_loop
|
||||||
|
%endif ; fetch_dist != 0
|
||||||
|
|
||||||
|
align 16
|
||||||
|
.fold_256_B_loop:
|
||||||
|
add arg2, 256
|
||||||
|
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm0, zmm0, zmm16, 0x01
|
||||||
|
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
|
||||||
|
|
||||||
|
vpclmulqdq zmm2, zmm4, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm4, zmm4, zmm16, 0x01
|
||||||
|
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
|
||||||
|
|
||||||
|
vpclmulqdq zmm3, zmm7, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm7, zmm7, zmm16, 0x01
|
||||||
|
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
|
||||||
|
|
||||||
|
vpclmulqdq zmm5, zmm8, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||||
|
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||||
|
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
jge .fold_256_B_loop
|
jge .fold_256_B_loop
|
||||||
|
|
||||||
|
|||||||
@@ -133,8 +133,13 @@ FUNCTION_NAME:
|
|||||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
|
|
||||||
|
%if fetch_dist != 0
|
||||||
|
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jb .fold_256_B_loop
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
.fold_256_B_loop:
|
.fold_and_prefetch_256_B_loop:
|
||||||
add arg2, 256
|
add arg2, 256
|
||||||
vmovdqu8 zmm3, [arg2+16*0]
|
vmovdqu8 zmm3, [arg2+16*0]
|
||||||
PREFETCH [arg2+fetch_dist+0]
|
PREFETCH [arg2+fetch_dist+0]
|
||||||
@@ -164,6 +169,40 @@ align 16
|
|||||||
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||||
vpternlogq zmm8, zmm14, zmm17, 0x96
|
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||||
|
|
||||||
|
sub arg3, 256
|
||||||
|
|
||||||
|
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jge .fold_and_prefetch_256_B_loop
|
||||||
|
%endif ; fetch_dist != 0
|
||||||
|
|
||||||
|
align 16
|
||||||
|
.fold_256_B_loop:
|
||||||
|
add arg2, 256
|
||||||
|
vmovdqu8 zmm3, [arg2+16*0]
|
||||||
|
vpshufb zmm3, zmm3, zmm18
|
||||||
|
vpclmulqdq zmm1, zmm0, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm0, zmm0, zmm16, 0x11
|
||||||
|
vpternlogq zmm0, zmm1, zmm3, 0x96
|
||||||
|
|
||||||
|
vmovdqu8 zmm9, [arg2+16*4]
|
||||||
|
vpshufb zmm9, zmm9, zmm18
|
||||||
|
vpclmulqdq zmm5, zmm4, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm4, zmm4, zmm16, 0x11
|
||||||
|
vpternlogq zmm4, zmm5, zmm9, 0x96
|
||||||
|
|
||||||
|
vmovdqu8 zmm11, [arg2+16*8]
|
||||||
|
vpshufb zmm11, zmm11, zmm18
|
||||||
|
vpclmulqdq zmm12, zmm7, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm7, zmm7, zmm16, 0x11
|
||||||
|
vpternlogq zmm7, zmm12, zmm11, 0x96
|
||||||
|
|
||||||
|
vmovdqu8 zmm17, [arg2+16*12]
|
||||||
|
vpshufb zmm17, zmm17, zmm18
|
||||||
|
vpclmulqdq zmm14, zmm8, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||||
|
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||||
|
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
jge .fold_256_B_loop
|
jge .fold_256_B_loop
|
||||||
|
|
||||||
|
|||||||
@@ -122,8 +122,13 @@ FUNCTION_NAME:
|
|||||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
|
|
||||||
|
%if fetch_dist != 0
|
||||||
|
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jb .fold_256_B_loop
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
.fold_256_B_loop:
|
.fold_and_prefetch_256_B_loop:
|
||||||
add arg2, 256
|
add arg2, 256
|
||||||
PREFETCH [arg2+fetch_dist+0]
|
PREFETCH [arg2+fetch_dist+0]
|
||||||
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||||
@@ -145,6 +150,32 @@ align 16
|
|||||||
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||||
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||||
|
|
||||||
|
sub arg3, 256
|
||||||
|
|
||||||
|
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jge .fold_and_prefetch_256_B_loop
|
||||||
|
%endif ; fetch_dist != 0
|
||||||
|
|
||||||
|
align 16
|
||||||
|
.fold_256_B_loop:
|
||||||
|
add arg2, 256
|
||||||
|
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm0, zmm0, zmm16, 0x01
|
||||||
|
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
|
||||||
|
|
||||||
|
vpclmulqdq zmm2, zmm4, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm4, zmm4, zmm16, 0x01
|
||||||
|
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
|
||||||
|
|
||||||
|
vpclmulqdq zmm3, zmm7, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm7, zmm7, zmm16, 0x01
|
||||||
|
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
|
||||||
|
|
||||||
|
vpclmulqdq zmm5, zmm8, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||||
|
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||||
|
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
jge .fold_256_B_loop
|
jge .fold_256_B_loop
|
||||||
|
|
||||||
|
|||||||
@@ -115,7 +115,13 @@ FUNCTION_NAME:
|
|||||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
|
|
||||||
_fold_256_B_loop:
|
%if fetch_dist != 0
|
||||||
|
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jb _fold_256_B_loop
|
||||||
|
|
||||||
|
align 16
|
||||||
|
_fold_and_prefetch_256_B_loop:
|
||||||
add arg2, 256
|
add arg2, 256
|
||||||
PREFETCH [arg2+fetch_dist+0]
|
PREFETCH [arg2+fetch_dist+0]
|
||||||
vmovdqu8 zmm3, [arg2+16*0]
|
vmovdqu8 zmm3, [arg2+16*0]
|
||||||
@@ -145,6 +151,40 @@ _fold_256_B_loop:
|
|||||||
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||||
vpternlogq zmm8, zmm14, zmm17, 0x96
|
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||||
|
|
||||||
|
sub arg3, 256
|
||||||
|
|
||||||
|
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jge _fold_and_prefetch_256_B_loop
|
||||||
|
%endif
|
||||||
|
|
||||||
|
align 16
|
||||||
|
_fold_256_B_loop:
|
||||||
|
add arg2, 256
|
||||||
|
vmovdqu8 zmm3, [arg2+16*0]
|
||||||
|
vpshufb zmm3, zmm3, zmm18
|
||||||
|
vpclmulqdq zmm1, zmm0, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm0, zmm0, zmm16, 0x11
|
||||||
|
vpternlogq zmm0, zmm1, zmm3, 0x96
|
||||||
|
|
||||||
|
vmovdqu8 zmm9, [arg2+16*4]
|
||||||
|
vpshufb zmm9, zmm9, zmm18
|
||||||
|
vpclmulqdq zmm5, zmm4, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm4, zmm4, zmm16, 0x11
|
||||||
|
vpternlogq zmm4, zmm5, zmm9, 0x96
|
||||||
|
|
||||||
|
vmovdqu8 zmm11, [arg2+16*8]
|
||||||
|
vpshufb zmm11, zmm11, zmm18
|
||||||
|
vpclmulqdq zmm12, zmm7, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm7, zmm7, zmm16, 0x11
|
||||||
|
vpternlogq zmm7, zmm12, zmm11, 0x96
|
||||||
|
|
||||||
|
vmovdqu8 zmm17, [arg2+16*12]
|
||||||
|
vpshufb zmm17, zmm17, zmm18
|
||||||
|
vpclmulqdq zmm14, zmm8, zmm16, 0x00
|
||||||
|
vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
||||||
|
vpternlogq zmm8, zmm14, zmm17, 0x96
|
||||||
|
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
jge _fold_256_B_loop
|
jge _fold_256_B_loop
|
||||||
|
|
||||||
|
|||||||
@@ -116,7 +116,13 @@ FUNCTION_NAME:
|
|||||||
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
|
|
||||||
_fold_256_B_loop:
|
%if fetch_dist != 0
|
||||||
|
; check if there is at least 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jb _fold_256_B_loop
|
||||||
|
|
||||||
|
align 16
|
||||||
|
_fold_and_prefetch_256_B_loop:
|
||||||
add arg2, 256
|
add arg2, 256
|
||||||
PREFETCH [arg2+fetch_dist+0]
|
PREFETCH [arg2+fetch_dist+0]
|
||||||
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||||
@@ -138,6 +144,32 @@ _fold_256_B_loop:
|
|||||||
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||||
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||||
|
|
||||||
|
sub arg3, 256
|
||||||
|
|
||||||
|
; check if there is another 1.5KB (fetch distance) + 256B in the buffer
|
||||||
|
cmp arg3, (fetch_dist + 256)
|
||||||
|
jge _fold_and_prefetch_256_B_loop
|
||||||
|
%endif
|
||||||
|
|
||||||
|
align 16
|
||||||
|
_fold_256_B_loop:
|
||||||
|
add arg2, 256
|
||||||
|
vpclmulqdq zmm1, zmm0, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm0, zmm0, zmm16, 0x01
|
||||||
|
vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
|
||||||
|
|
||||||
|
vpclmulqdq zmm2, zmm4, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm4, zmm4, zmm16, 0x01
|
||||||
|
vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
|
||||||
|
|
||||||
|
vpclmulqdq zmm3, zmm7, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm7, zmm7, zmm16, 0x01
|
||||||
|
vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
|
||||||
|
|
||||||
|
vpclmulqdq zmm5, zmm8, zmm16, 0x10
|
||||||
|
vpclmulqdq zmm8, zmm8, zmm16, 0x01
|
||||||
|
vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
|
||||||
|
|
||||||
sub arg3, 256
|
sub arg3, 256
|
||||||
jge _fold_256_B_loop
|
jge _fold_256_B_loop
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user