diff --git a/crc/crc16_t10dif_by16_10.asm b/crc/crc16_t10dif_by16_10.asm
index a315bea..4a2ff3e 100644
--- a/crc/crc16_t10dif_by16_10.asm
+++ b/crc/crc16_t10dif_by16_10.asm
@@ -145,7 +145,13 @@ FUNCTION_NAME:
 	vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
 	sub arg3, 256
 
-.fold_256_B_loop:
+%if fetch_dist != 0
+	; enter the prefetching loop only if arg3 (already reduced by 256 above) is at least fetch_dist + 256; the original note gives the fetch distance as 1.5KB
+	cmp arg3, (fetch_dist + 256)
+	jb .fold_256_B_loop	; unsigned compare: arg3 < fetch_dist + 256 -> go straight to the loop without PREFETCH
+
+align 16
+.fold_and_prefetch_256_B_loop:	; 256B fold step that also issues PREFETCH at arg2+fetch_dist
 	add arg2, 256
 	PREFETCH [arg2+fetch_dist+0]
 	vmovdqu8 zmm3, [arg2+16*0]
@@ -175,6 +181,39 @@ FUNCTION_NAME:
 	vpclmulqdq zmm8, zmm8, zmm16, 0x11
 	vpternlogq zmm8, zmm14, zmm17, 0x96
 
+	sub arg3, 256	; arg3 -= 256 for the block just folded
+
+	; keep prefetching only while arg3 is still >= fetch_dist + 256; otherwise fall through to the loop without PREFETCH
+	cmp arg3, (fetch_dist + 256)
+	jge .fold_and_prefetch_256_B_loop	; signed compare (the entry test above uses unsigned jb)
+%endif ; fetch_dist != 0
+
+.fold_256_B_loop:	; same 256B fold step with no PREFETCH; NOTE(review): no "align 16" here, unlike the other by16_10 files in this patch - confirm intended
+	add arg2, 256	; advance the load base by one 256B block
+	vmovdqu8 zmm3, [arg2+16*0]	; load 64B of input from arg2+0
+	vpshufb zmm3, zmm3, zmm18	; byte-shuffle the data with the control held in zmm18 (set up outside this hunk)
+	vpclmulqdq zmm1, zmm0, zmm16, 0x00	; per 128-bit lane: carry-less multiply, low qword of zmm0 by low qword of zmm16
+	vpclmulqdq zmm0, zmm0, zmm16, 0x11	; per 128-bit lane: carry-less multiply, high qword of zmm0 by high qword of zmm16
+	vpternlogq zmm0, zmm1, zmm3, 0x96	; zmm0 = zmm0 ^ zmm1 ^ zmm3 (imm 0x96 is the 3-input XOR truth table)
+
+	vmovdqu8 zmm9, [arg2+16*4]	; second 64B: same fold applied to zmm4
+	vpshufb zmm9, zmm9, zmm18
+	vpclmulqdq zmm5, zmm4, zmm16, 0x00
+	vpclmulqdq zmm4, zmm4, zmm16, 0x11
+	vpternlogq zmm4, zmm5, zmm9, 0x96
+
+	vmovdqu8 zmm11, [arg2+16*8]	; third 64B: same fold applied to zmm7
+	vpshufb zmm11, zmm11, zmm18
+	vpclmulqdq zmm12, zmm7, zmm16, 0x00
+	vpclmulqdq zmm7, zmm7, zmm16, 0x11
+	vpternlogq zmm7, zmm12, zmm11, 0x96
+
+	vmovdqu8 zmm17, [arg2+16*12]	; fourth 64B: same fold applied to zmm8
+	vpshufb zmm17, zmm17, zmm18
+	vpclmulqdq zmm14, zmm8, zmm16, 0x00
+	vpclmulqdq zmm8, zmm8, zmm16, 0x11
+	vpternlogq zmm8, zmm14, zmm17, 0x96
+
 	sub arg3, 256
 	jge .fold_256_B_loop
 
diff --git a/crc/crc32_gzip_refl_by16_10.asm b/crc/crc32_gzip_refl_by16_10.asm
index 140b432..3be7759 100644
--- a/crc/crc32_gzip_refl_by16_10.asm
+++ b/crc/crc32_gzip_refl_by16_10.asm
@@ -133,8 +133,13 @@ FUNCTION_NAME:
 	vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
 	sub arg3, 256
 
+%if fetch_dist != 0
+	; enter the prefetching loop only if arg3 (already reduced by 256 above) is at least fetch_dist + 256; the original note gives the fetch distance as 1.5KB
+	cmp arg3, (fetch_dist + 256)
+	jb .fold_256_B_loop	; unsigned compare: arg3 < fetch_dist + 256 -> go straight to the loop without PREFETCH
+
 align 16
-.fold_256_B_loop:
+.fold_and_prefetch_256_B_loop:	; 256B fold step that also issues PREFETCH at arg2+fetch_dist
 	add arg2, 256
 	PREFETCH [arg2+fetch_dist+0]
 	vpclmulqdq zmm1, zmm0, zmm16, 0x10
@@ -156,6 +161,32 @@ align 16
 	vpclmulqdq zmm8, zmm8, zmm16, 0x01
 	vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
 
+	sub arg3, 256	; arg3 -= 256 for the block just folded
+
+	; keep prefetching only while arg3 is still >= fetch_dist + 256; otherwise fall through to the loop without PREFETCH
+	cmp arg3, (fetch_dist + 256)
+	jge .fold_and_prefetch_256_B_loop	; signed compare (the entry test above uses unsigned jb)
+%endif ; fetch_dist != 0
+
+align 16
+.fold_256_B_loop:	; same 256B fold step with no PREFETCH
+	add arg2, 256	; advance the load base by one 256B block
+	vpclmulqdq zmm1, zmm0, zmm16, 0x10	; per 128-bit lane: carry-less multiply, low qword of zmm0 by high qword of zmm16
+	vpclmulqdq zmm0, zmm0, zmm16, 0x01	; per 128-bit lane: carry-less multiply, high qword of zmm0 by low qword of zmm16
+	vpternlogq zmm0, zmm1, [arg2+16*0], 0x96	; zmm0 = zmm0 ^ zmm1 ^ 64B read from arg2+0 (imm 0x96 is the 3-input XOR truth table)
+
+	vpclmulqdq zmm2, zmm4, zmm16, 0x10	; second 64B: same fold applied to zmm4
+	vpclmulqdq zmm4, zmm4, zmm16, 0x01
+	vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
+
+	vpclmulqdq zmm3, zmm7, zmm16, 0x10	; third 64B: same fold applied to zmm7
+	vpclmulqdq zmm7, zmm7, zmm16, 0x01
+	vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
+
+	vpclmulqdq zmm5, zmm8, zmm16, 0x10	; fourth 64B: same fold applied to zmm8
+	vpclmulqdq zmm8, zmm8, zmm16, 0x01
+	vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
+
 	sub arg3, 256
 	jge .fold_256_B_loop
 
diff --git a/crc/crc32_ieee_by16_10.asm b/crc/crc32_ieee_by16_10.asm
index 56450d4..3a5c0da 100644
--- a/crc/crc32_ieee_by16_10.asm
+++ b/crc/crc32_ieee_by16_10.asm
@@ -133,8 +133,13 @@ FUNCTION_NAME:
 	vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
 	sub arg3, 256
 
+%if fetch_dist != 0
+	; enter the prefetching loop only if arg3 (already reduced by 256 above) is at least fetch_dist + 256; the original note gives the fetch distance as 1.5KB
+	cmp arg3, (fetch_dist + 256)
+	jb .fold_256_B_loop	; unsigned compare: arg3 < fetch_dist + 256 -> go straight to the loop without PREFETCH
+
 align 16
-.fold_256_B_loop:
+.fold_and_prefetch_256_B_loop:	; 256B fold step that also issues PREFETCH at arg2+fetch_dist
 	add arg2, 256
 	vmovdqu8 zmm3, [arg2+16*0]
 	PREFETCH [arg2+fetch_dist+0]
@@ -164,6 +169,40 @@ align 16
 	vpclmulqdq zmm8, zmm8, zmm16, 0x11
 	vpternlogq zmm8, zmm14, zmm17, 0x96
 
+	sub arg3, 256	; arg3 -= 256 for the block just folded
+
+	; keep prefetching only while arg3 is still >= fetch_dist + 256; otherwise fall through to the loop without PREFETCH
+	cmp arg3, (fetch_dist + 256)
+	jge .fold_and_prefetch_256_B_loop	; signed compare (the entry test above uses unsigned jb)
+%endif ; fetch_dist != 0
+
+align 16
+.fold_256_B_loop:	; same 256B fold step with no PREFETCH
+	add arg2, 256	; advance the load base by one 256B block
+	vmovdqu8 zmm3, [arg2+16*0]	; load 64B of input from arg2+0
+	vpshufb zmm3, zmm3, zmm18	; byte-shuffle the data with the control held in zmm18 (set up outside this hunk)
+	vpclmulqdq zmm1, zmm0, zmm16, 0x00	; per 128-bit lane: carry-less multiply, low qword of zmm0 by low qword of zmm16
+	vpclmulqdq zmm0, zmm0, zmm16, 0x11	; per 128-bit lane: carry-less multiply, high qword of zmm0 by high qword of zmm16
+	vpternlogq zmm0, zmm1, zmm3, 0x96	; zmm0 = zmm0 ^ zmm1 ^ zmm3 (imm 0x96 is the 3-input XOR truth table)
+
+	vmovdqu8 zmm9, [arg2+16*4]	; second 64B: same fold applied to zmm4
+	vpshufb zmm9, zmm9, zmm18
+	vpclmulqdq zmm5, zmm4, zmm16, 0x00
+	vpclmulqdq zmm4, zmm4, zmm16, 0x11
+	vpternlogq zmm4, zmm5, zmm9, 0x96
+
+	vmovdqu8 zmm11, [arg2+16*8]	; third 64B: same fold applied to zmm7
+	vpshufb zmm11, zmm11, zmm18
+	vpclmulqdq zmm12, zmm7, zmm16, 0x00
+	vpclmulqdq zmm7, zmm7, zmm16, 0x11
+	vpternlogq zmm7, zmm12, zmm11, 0x96
+
+	vmovdqu8 zmm17, [arg2+16*12]	; fourth 64B: same fold applied to zmm8
+	vpshufb zmm17, zmm17, zmm18
+	vpclmulqdq zmm14, zmm8, zmm16, 0x00
+	vpclmulqdq zmm8, zmm8, zmm16, 0x11
+	vpternlogq zmm8, zmm14, zmm17, 0x96
+
 	sub arg3, 256
 	jge .fold_256_B_loop
 
diff --git a/crc/crc32_iscsi_by16_10.asm b/crc/crc32_iscsi_by16_10.asm
index 07bdd68..aed2ece 100644
--- a/crc/crc32_iscsi_by16_10.asm
+++ b/crc/crc32_iscsi_by16_10.asm
@@ -122,8 +122,13 @@ FUNCTION_NAME:
 	vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
 	sub arg3, 256
 
+%if fetch_dist != 0
+	; enter the prefetching loop only if arg3 (already reduced by 256 above) is at least fetch_dist + 256; the original note gives the fetch distance as 1.5KB
+	cmp arg3, (fetch_dist + 256)
+	jb .fold_256_B_loop	; unsigned compare: arg3 < fetch_dist + 256 -> go straight to the loop without PREFETCH
+
 align 16
-.fold_256_B_loop:
+.fold_and_prefetch_256_B_loop:	; 256B fold step that also issues PREFETCH at arg2+fetch_dist
 	add arg2, 256
 	PREFETCH [arg2+fetch_dist+0]
 	vpclmulqdq zmm1, zmm0, zmm16, 0x10
@@ -145,6 +150,32 @@ align 16
 	vpclmulqdq zmm8, zmm8, zmm16, 0x01
 	vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
 
+	sub arg3, 256	; arg3 -= 256 for the block just folded
+
+	; keep prefetching only while arg3 is still >= fetch_dist + 256; otherwise fall through to the loop without PREFETCH
+	cmp arg3, (fetch_dist + 256)
+	jge .fold_and_prefetch_256_B_loop	; signed compare (the entry test above uses unsigned jb)
+%endif ; fetch_dist != 0
+
+align 16
+.fold_256_B_loop:	; same 256B fold step with no PREFETCH
+	add arg2, 256	; advance the load base by one 256B block
+	vpclmulqdq zmm1, zmm0, zmm16, 0x10	; per 128-bit lane: carry-less multiply, low qword of zmm0 by high qword of zmm16
+	vpclmulqdq zmm0, zmm0, zmm16, 0x01	; per 128-bit lane: carry-less multiply, high qword of zmm0 by low qword of zmm16
+	vpternlogq zmm0, zmm1, [arg2+16*0], 0x96	; zmm0 = zmm0 ^ zmm1 ^ 64B read from arg2+0 (imm 0x96 is the 3-input XOR truth table)
+
+	vpclmulqdq zmm2, zmm4, zmm16, 0x10	; second 64B: same fold applied to zmm4
+	vpclmulqdq zmm4, zmm4, zmm16, 0x01
+	vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
+
+	vpclmulqdq zmm3, zmm7, zmm16, 0x10	; third 64B: same fold applied to zmm7
+	vpclmulqdq zmm7, zmm7, zmm16, 0x01
+	vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
+
+	vpclmulqdq zmm5, zmm8, zmm16, 0x10	; fourth 64B: same fold applied to zmm8
+	vpclmulqdq zmm8, zmm8, zmm16, 0x01
+	vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
+
 	sub arg3, 256
 	jge .fold_256_B_loop
 
diff --git a/crc/crc64_iso_norm_by16_10.asm b/crc/crc64_iso_norm_by16_10.asm
index 1ace1fb..fed322a 100644
--- a/crc/crc64_iso_norm_by16_10.asm
+++ b/crc/crc64_iso_norm_by16_10.asm
@@ -115,30 +115,70 @@ FUNCTION_NAME:
 	vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
 	sub arg3, 256
 
-_fold_256_B_loop:
+%if fetch_dist != 0
+	; enter the prefetching loop only if arg3 (already reduced by 256 above) is at least fetch_dist + 256; the original note gives the fetch distance as 1.5KB
+	cmp arg3, (fetch_dist + 256)
+	jb _fold_256_B_loop	; unsigned compare: arg3 < fetch_dist + 256 -> go straight to the loop without PREFETCH
+
+align 16
+_fold_and_prefetch_256_B_loop:	; 256B fold step that also prefetches four 64B-spaced addresses starting at arg2+fetch_dist
+	add arg2, 256	; advance the load base by one 256B block
+	PREFETCH [arg2+fetch_dist+0]	; prefetch fetch_dist bytes ahead of the current block
+	vmovdqu8 zmm3, [arg2+16*0]	; load 64B of input from arg2+0
+	vpshufb zmm3, zmm3, zmm18	; byte-shuffle the data with the control held in zmm18 (set up outside this hunk)
+	vpclmulqdq zmm1, zmm0, zmm16, 0x00	; per 128-bit lane: carry-less multiply, low qword of zmm0 by low qword of zmm16
+	vpclmulqdq zmm0, zmm0, zmm16, 0x11	; per 128-bit lane: carry-less multiply, high qword of zmm0 by high qword of zmm16
+	vpternlogq zmm0, zmm1, zmm3, 0x96	; zmm0 = zmm0 ^ zmm1 ^ zmm3 (imm 0x96 is the 3-input XOR truth table)
+
+	PREFETCH [arg2+fetch_dist+64]
+	vmovdqu8 zmm9, [arg2+16*4]	; second 64B: same fold applied to zmm4
+	vpshufb zmm9, zmm9, zmm18
+	vpclmulqdq zmm5, zmm4, zmm16, 0x00
+	vpclmulqdq zmm4, zmm4, zmm16, 0x11
+	vpternlogq zmm4, zmm5, zmm9, 0x96
+
+	PREFETCH [arg2+fetch_dist+64*2]
+	vmovdqu8 zmm11, [arg2+16*8]	; third 64B: same fold applied to zmm7
+	vpshufb zmm11, zmm11, zmm18
+	vpclmulqdq zmm12, zmm7, zmm16, 0x00
+	vpclmulqdq zmm7, zmm7, zmm16, 0x11
+	vpternlogq zmm7, zmm12, zmm11, 0x96
+
+	PREFETCH [arg2+fetch_dist+64*3]
+	vmovdqu8 zmm17, [arg2+16*12]	; fourth 64B: same fold applied to zmm8
+	vpshufb zmm17, zmm17, zmm18
+	vpclmulqdq zmm14, zmm8, zmm16, 0x00
+	vpclmulqdq zmm8, zmm8, zmm16, 0x11
+	vpternlogq zmm8, zmm14, zmm17, 0x96
+
+	sub arg3, 256	; arg3 -= 256 for the block just folded
+
+	; keep prefetching only while arg3 is still >= fetch_dist + 256; otherwise fall through to the loop without PREFETCH
+	cmp arg3, (fetch_dist + 256)
+	jge _fold_and_prefetch_256_B_loop	; signed compare (the entry test above uses unsigned jb)
+%endif ; fetch_dist != 0
+
+align 16
+_fold_256_B_loop:	; 256B fold step with the PREFETCH lines removed by this patch
 	add arg2, 256
-	PREFETCH [arg2+fetch_dist+0]
 	vmovdqu8 zmm3, [arg2+16*0]
 	vpshufb zmm3, zmm3, zmm18
 	vpclmulqdq zmm1, zmm0, zmm16, 0x00
 	vpclmulqdq zmm0, zmm0, zmm16, 0x11
 	vpternlogq zmm0, zmm1, zmm3, 0x96
 
-	PREFETCH [arg2+fetch_dist+64]
 	vmovdqu8 zmm9, [arg2+16*4]
 	vpshufb zmm9, zmm9, zmm18
 	vpclmulqdq zmm5, zmm4, zmm16, 0x00
 	vpclmulqdq zmm4, zmm4, zmm16, 0x11
 	vpternlogq zmm4, zmm5, zmm9, 0x96
 
-	PREFETCH [arg2+fetch_dist+64*2]
 	vmovdqu8 zmm11, [arg2+16*8]
 	vpshufb zmm11, zmm11, zmm18
 	vpclmulqdq zmm12, zmm7, zmm16, 0x00
 	vpclmulqdq zmm7, zmm7, zmm16, 0x11
 	vpternlogq zmm7, zmm12, zmm11, 0x96
 
-	PREFETCH [arg2+fetch_dist+64*3]
 	vmovdqu8 zmm17, [arg2+16*12]
 	vpshufb zmm17, zmm17, zmm18
 	vpclmulqdq zmm14, zmm8, zmm16, 0x00
diff --git a/crc/crc64_iso_refl_by16_10.asm b/crc/crc64_iso_refl_by16_10.asm
index 7f131fc..a2b629a 100644
--- a/crc/crc64_iso_refl_by16_10.asm
+++ b/crc/crc64_iso_refl_by16_10.asm
@@ -116,24 +116,56 @@ FUNCTION_NAME:
 	vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
 	sub arg3, 256
 
-_fold_256_B_loop:
+%if fetch_dist != 0
+	; enter the prefetching loop only if arg3 (already reduced by 256 above) is at least fetch_dist + 256; the original note gives the fetch distance as 1.5KB
+	cmp arg3, (fetch_dist + 256)
+	jb _fold_256_B_loop	; unsigned compare: arg3 < fetch_dist + 256 -> go straight to the loop without PREFETCH
+
+align 16
+_fold_and_prefetch_256_B_loop:	; 256B fold step that also prefetches four 64B-spaced addresses starting at arg2+fetch_dist
+	add arg2, 256	; advance the load base by one 256B block
+	PREFETCH [arg2+fetch_dist+0]	; prefetch fetch_dist bytes ahead of the current block
+	vpclmulqdq zmm1, zmm0, zmm16, 0x10	; per 128-bit lane: carry-less multiply, low qword of zmm0 by high qword of zmm16
+	vpclmulqdq zmm0, zmm0, zmm16, 0x01	; per 128-bit lane: carry-less multiply, high qword of zmm0 by low qword of zmm16
+	vpternlogq zmm0, zmm1, [arg2+16*0], 0x96	; zmm0 = zmm0 ^ zmm1 ^ 64B read from arg2+0 (imm 0x96 is the 3-input XOR truth table)
+
+	PREFETCH [arg2+fetch_dist+64]
+	vpclmulqdq zmm2, zmm4, zmm16, 0x10	; second 64B: same fold applied to zmm4
+	vpclmulqdq zmm4, zmm4, zmm16, 0x01
+	vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
+
+	PREFETCH [arg2+fetch_dist+64*2]
+	vpclmulqdq zmm3, zmm7, zmm16, 0x10	; third 64B: same fold applied to zmm7
+	vpclmulqdq zmm7, zmm7, zmm16, 0x01
+	vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
+
+	PREFETCH [arg2+fetch_dist+64*3]
+	vpclmulqdq zmm5, zmm8, zmm16, 0x10	; fourth 64B: same fold applied to zmm8
+	vpclmulqdq zmm8, zmm8, zmm16, 0x01
+	vpternlogq zmm8, zmm5, [arg2+16*12], 0x96
+
+	sub arg3, 256	; arg3 -= 256 for the block just folded
+
+	; keep prefetching only while arg3 is still >= fetch_dist + 256; otherwise fall through to the loop without PREFETCH
+	cmp arg3, (fetch_dist + 256)
+	jge _fold_and_prefetch_256_B_loop	; signed compare (the entry test above uses unsigned jb)
+%endif ; fetch_dist != 0
+
+align 16
+_fold_256_B_loop:	; 256B fold step with the PREFETCH lines removed by this patch
 	add arg2, 256
-	PREFETCH [arg2+fetch_dist+0]
 	vpclmulqdq zmm1, zmm0, zmm16, 0x10
 	vpclmulqdq zmm0, zmm0, zmm16, 0x01
 	vpternlogq zmm0, zmm1, [arg2+16*0], 0x96
 
-	PREFETCH [arg2+fetch_dist+64]
 	vpclmulqdq zmm2, zmm4, zmm16, 0x10
 	vpclmulqdq zmm4, zmm4, zmm16, 0x01
 	vpternlogq zmm4, zmm2, [arg2+16*4], 0x96
 
-	PREFETCH [arg2+fetch_dist+64*2]
 	vpclmulqdq zmm3, zmm7, zmm16, 0x10
 	vpclmulqdq zmm7, zmm7, zmm16, 0x01
 	vpternlogq zmm7, zmm3, [arg2+16*8], 0x96
 
-	PREFETCH [arg2+fetch_dist+64*3]
 	vpclmulqdq zmm5, zmm8, zmm16, 0x10
 	vpclmulqdq zmm8, zmm8, zmm16, 0x01
 	vpternlogq zmm8, zmm5, [arg2+16*12], 0x96