diff --git a/crc/crc32_gzip_refl_by16_10.asm b/crc/crc32_gzip_refl_by16_10.asm
index c70881e..c5ea81f 100644
--- a/crc/crc32_gzip_refl_by16_10.asm
+++ b/crc/crc32_gzip_refl_by16_10.asm
@@ -83,34 +83,27 @@ section .text
         %xdefine arg1_low32 edi
 %endif
 
-%define TMP 16*0
-%ifidn __OUTPUT_FORMAT__, win64
-        %define XMM_SAVE 16*2
-        %define VARIABLE_OFFSET 16*12+8
-%else
-        %define VARIABLE_OFFSET 16*2+8
-%endif
-
 align 16
 mk_global FUNCTION_NAME, function
 FUNCTION_NAME:
         endbranch
         not arg1_low32
-        sub rsp, VARIABLE_OFFSET
 
 %ifidn __OUTPUT_FORMAT__, win64
+        sub rsp, (16*10 + 8)
+
         ; push the xmm registers into the stack to maintain
-        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
-        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
-        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
-        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
-        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
-        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
-        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
-        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
-        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
-        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
+        vmovdqa [rsp + 16*0], xmm6
+        vmovdqa [rsp + 16*1], xmm7
+        vmovdqa [rsp + 16*2], xmm8
+        vmovdqa [rsp + 16*3], xmm9
+        vmovdqa [rsp + 16*4], xmm10
+        vmovdqa [rsp + 16*5], xmm11
+        vmovdqa [rsp + 16*6], xmm12
+        vmovdqa [rsp + 16*7], xmm13
+        vmovdqa [rsp + 16*8], xmm14
+        vmovdqa [rsp + 16*9], xmm15
 %endif
 
         ; check if smaller than 256B
@@ -228,9 +221,7 @@ FUNCTION_NAME:
 .16B_reduction_loop:
         vpclmulqdq xmm8, xmm7, xmm10, 0x1
         vpclmulqdq xmm7, xmm7, xmm10, 0x10
-        vpxor xmm7, xmm8
-        vmovdqu xmm0, [arg2]
-        vpxor xmm7, xmm0
+        vpternlogq xmm7, xmm8, [arg2], 0x96
         add arg2, 16
         sub arg3, 16
         ; instead of a cmp instruction, we utilize the flags with the jge instruction
@@ -257,7 +248,7 @@ FUNCTION_NAME:
 
         ; get rid of the extra data that was loaded before
         ; load the shift constant
-        lea rax, [pshufb_shf_table]
+        lea rax, [rel pshufb_shf_table]
         add rax, arg3
         vmovdqu xmm0, [rax]
@@ -269,8 +260,7 @@ FUNCTION_NAME:
         ;;;;;;;;;;
         vpclmulqdq xmm8, xmm7, xmm10, 0x1
         vpclmulqdq xmm7, xmm7, xmm10, 0x10
-        vpxor xmm7, xmm8
-        vpxor xmm7, xmm2
+        vpternlogq xmm7, xmm8, xmm2, 0x96
 
 .128_done:
         ; compute crc of a 128-bit value
@@ -297,12 +287,10 @@ FUNCTION_NAME:
         vmovdqa xmm10, [rk7]
 
         vpclmulqdq xmm7, xmm10, 0
-        vpxor xmm7, xmm2
-        vpand xmm7, [mask]
+        vpternlogq xmm7, xmm2, [mask], 0x28
         vmovdqa xmm2, xmm7
         vpclmulqdq xmm7, xmm10, 0x10
-        vpxor xmm7, xmm2
-        vpxor xmm7, xmm1
+        vpternlogq xmm7, xmm2, xmm1, 0x96
         vpextrd eax, xmm7, 2
 
 .cleanup:
@@ -310,18 +298,19 @@
 
 %ifidn __OUTPUT_FORMAT__, win64
-        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
-        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
-        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
-        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
-        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
-        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
-        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
-        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
-        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
-        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
+        vmovdqa xmm6, [rsp + 16*0]
+        vmovdqa xmm7, [rsp + 16*1]
+        vmovdqa xmm8, [rsp + 16*2]
+        vmovdqa xmm9, [rsp + 16*3]
+        vmovdqa xmm10, [rsp + 16*4]
+        vmovdqa xmm11, [rsp + 16*5]
+        vmovdqa xmm12, [rsp + 16*6]
+        vmovdqa xmm13, [rsp + 16*7]
+        vmovdqa xmm14, [rsp + 16*8]
+        vmovdqa xmm15, [rsp + 16*9]
+
+        add rsp, (16*10 + 8)
 %endif
-        add rsp, VARIABLE_OFFSET
         ret
@@ -375,62 +364,19 @@ align 16
 align 16
 .less_than_16_left:
-        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+        xor r10, r10
+        bts r10, arg3
+        dec r10
+        kmovw k2, r10w
+        vmovdqu8 xmm7{k2}{z}, [arg2]
 
-        vpxor xmm1, xmm1
-        mov r11, rsp
-        vmovdqa [r11], xmm1
-
-        cmp arg3, 4
-        jl .only_less_than_4
-
-        ; backup the counter value
-        mov r9, arg3
-        cmp arg3, 8
-        jl .less_than_8_left
-
-        ; load 8 Bytes
-        mov rax, [arg2]
-        mov [r11], rax
-        add r11, 8
-        sub arg3, 8
-        add arg2, 8
-.less_than_8_left:
-
-        cmp arg3, 4
-        jl .less_than_4_left
-
-        ; load 4 Bytes
-        mov eax, [arg2]
-        mov [r11], eax
-        add r11, 4
-        sub arg3, 4
-        add arg2, 4
-.less_than_4_left:
-
-        cmp arg3, 2
-        jl .less_than_2_left
-
-        ; load 2 Bytes
-        mov ax, [arg2]
-        mov [r11], ax
-        add r11, 2
-        sub arg3, 2
-        add arg2, 2
-.less_than_2_left:
-        cmp arg3, 1
-        jl .zero_left
-
-        ; load 1 Byte
-        mov al, [arg2]
-        mov [r11], al
-
-.zero_left:
-        vmovdqa xmm7, [rsp]
         vpxor xmm7, xmm0        ; xor the initial crc value
 
-        lea rax,[pshufb_shf_table]
-        vmovdqu xmm0, [rax + r9]
+        cmp arg3, 4
+        jb .only_less_than_4
+
+        lea rax, [rel pshufb_shf_table]
+        vmovdqu xmm0, [rax + arg3]
         vpshufb xmm7,xmm0
         jmp .128_done
@@ -441,52 +387,10 @@ align 16
         jmp .128_done
 
 .only_less_than_4:
-        cmp arg3, 3
-        jl .only_less_than_3
-
-        ; load 3 Bytes
-        mov al, [arg2]
-        mov [r11], al
-
-        mov al, [arg2+1]
-        mov [r11+1], al
-
-        mov al, [arg2+2]
-        mov [r11+2], al
-
-        vmovdqa xmm7, [rsp]
-        vpxor xmm7, xmm0        ; xor the initial crc value
-
-        vpslldq xmm7, 5
-        jmp .barrett
-
-.only_less_than_3:
-        cmp arg3, 2
-        jl .only_less_than_2
-
-        ; load 2 Bytes
-        mov al, [arg2]
-        mov [r11], al
-
-        mov al, [arg2+1]
-        mov [r11+1], al
-
-        vmovdqa xmm7, [rsp]
-        vpxor xmm7, xmm0        ; xor the initial crc value
-
-        vpslldq xmm7, 6
-        jmp .barrett
-
-.only_less_than_2:
-        ; load 1 Byte
-        mov al, [arg2]
-        mov [r11], al
-
-        vmovdqa xmm7, [rsp]
-        vpxor xmm7, xmm0        ; xor the initial crc value
-
-        vpslldq xmm7, 7
-        jmp .barrett
+        lea r11, [rel pshufb_shift_table]
+        vmovdqu xmm0, [r11 + arg3]
+        vpshufb xmm7, xmm0
+        jmp .barrett
 
 section .data
 align 32
@@ -545,6 +449,13 @@ pshufb_shf_table:
         dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
         dq 0x0706050403020100, 0x000e0d0c0b0a0908
 
+align 16
+pshufb_shift_table:
+        ;; use these values to shift data for the pshufb instruction
+        db 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+        db 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+        db 0x08, 0x09, 0x0A
+
 mask:   dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
 mask2:  dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
 mask3:  dq 0x8080808080808080, 0x8080808080808080
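Note: the pattern in the file above recurs throughout this patch: two dependent vpxor instructions (and, in the Barrett step, a vpxor/vpand pair) collapse into one vpternlogq, whose 8-bit immediate is the truth table of an arbitrary three-input Boolean function; 0x96 encodes A ^ B ^ C and 0x28 encodes (A ^ B) & C. It also lets the 16B reduction loop take its third operand straight from memory, dropping the separate vmovdqu. A minimal, standalone C sketch of what those two immediates compute, via the AVX-512VL intrinsic (file name and values are mine, purely illustrative, not code from this patch):

    /* ternlog_demo.c -- imm 0x96 is a^b^c, imm 0x28 is (a^b)&c.
       Build: gcc -O2 -mavx512f -mavx512vl ternlog_demo.c */
    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
            __m128i a = _mm_set1_epi8((char)0x5A);
            __m128i b = _mm_set1_epi8((char)0x3C);
            __m128i c = _mm_set1_epi8((char)0x0F);

            /* dst = a ^ b ^ c: replaces the chained vpxor pairs */
            __m128i x = _mm_ternarylogic_epi64(a, b, c, 0x96);
            /* dst = (a ^ b) & c: replaces the vpxor + vpand pair */
            __m128i y = _mm_ternarylogic_epi64(a, b, c, 0x28);

            printf("0x96: %02x (expect %02x)\n",
                   _mm_extract_epi8(x, 0) & 0xFF, 0x5A ^ 0x3C ^ 0x0F);
            printf("0x28: %02x (expect %02x)\n",
                   _mm_extract_epi8(y, 0) & 0xFF, (0x5A ^ 0x3C) & 0x0F);
            return 0;
    }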
diff --git a/crc/crc32_ieee_by16_10.asm b/crc/crc32_ieee_by16_10.asm
index b563982..aa756b2 100644
--- a/crc/crc32_ieee_by16_10.asm
+++ b/crc/crc32_ieee_by16_10.asm
@@ -73,34 +73,27 @@ section .text
         %xdefine arg1_low32 edi
 %endif
 
-%define TMP 16*0
-%ifidn __OUTPUT_FORMAT__, win64
-        %define XMM_SAVE 16*2
-        %define VARIABLE_OFFSET 16*12+8
-%else
-        %define VARIABLE_OFFSET 16*2+8
-%endif
-
 align 16
 mk_global FUNCTION_NAME, function
 FUNCTION_NAME:
         endbranch
         not arg1_low32
-        sub rsp, VARIABLE_OFFSET
 
 %ifidn __OUTPUT_FORMAT__, win64
+        sub rsp, (16*10 + 8)
+
         ; push the xmm registers into the stack to maintain
-        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
-        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
-        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
-        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
-        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
-        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
-        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
-        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
-        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
-        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
+        vmovdqa [rsp + 16*0], xmm6
+        vmovdqa [rsp + 16*1], xmm7
+        vmovdqa [rsp + 16*2], xmm8
+        vmovdqa [rsp + 16*3], xmm9
+        vmovdqa [rsp + 16*4], xmm10
+        vmovdqa [rsp + 16*5], xmm11
+        vmovdqa [rsp + 16*6], xmm12
+        vmovdqa [rsp + 16*7], xmm13
+        vmovdqa [rsp + 16*8], xmm14
+        vmovdqa [rsp + 16*9], xmm15
 %endif
 
         vbroadcasti32x4 zmm18, [SHUF_MASK]
@@ -269,7 +262,7 @@ FUNCTION_NAME:
 
         ; get rid of the extra data that was loaded before
         ; load the shift constant
-        lea rax, [pshufb_shf_table + 16]
+        lea rax, [rel pshufb_shf_table + 16]
         sub rax, arg3
         vmovdqu xmm0, [rax]
@@ -280,8 +273,7 @@ FUNCTION_NAME:
 
         vpclmulqdq xmm8, xmm7, xmm10, 0x11
         vpclmulqdq xmm7, xmm7, xmm10, 0x00
-        vpxor xmm7, xmm8
-        vpxor xmm7, xmm1
+        vpternlogq xmm7, xmm8, xmm1, 0x96
 
 .128_done:
         ; compute crc of a 128-bit value
@@ -294,8 +286,7 @@ FUNCTION_NAME:
         vpxor xmm7, xmm0
 
         ;32b fold
-        vmovdqa xmm0, xmm7
-        vpand xmm0, [mask2]
+        vpand xmm0, xmm7, [mask2]
         vpsrldq xmm7, 12
         vpclmulqdq xmm7, xmm10, 0x10
         vpxor xmm7, xmm0
@@ -317,18 +308,18 @@ FUNCTION_NAME:
 
 %ifidn __OUTPUT_FORMAT__, win64
-        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
-        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
-        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
-        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
-        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
-        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
-        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
-        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
-        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
-        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
+        vmovdqa xmm6, [rsp + 16*0]
+        vmovdqa xmm7, [rsp + 16*1]
+        vmovdqa xmm8, [rsp + 16*2]
+        vmovdqa xmm9, [rsp + 16*3]
+        vmovdqa xmm10, [rsp + 16*4]
+        vmovdqa xmm11, [rsp + 16*5]
+        vmovdqa xmm12, [rsp + 16*6]
+        vmovdqa xmm13, [rsp + 16*7]
+        vmovdqa xmm14, [rsp + 16*8]
+        vmovdqa xmm15, [rsp + 16*9]
+        add rsp, (16*10 + 8)
 %endif
-        add rsp, VARIABLE_OFFSET
         ret
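Note: the hunk below replaces the old cmp/jl ladder, which shifted the partial block with hard-coded vpsrldq counts, by indexing into the new pshufb_shift_table so a single vpshufb performs a variable byte shift. The trick is to slide a 16-byte window over a table of ascending indices followed by top-bit-set filler: pshufb writes zero wherever the control byte has its top bit set (which is why the tables pad with 0xFF), so the window offset becomes the shift count. A standalone C sketch of the idea; the table layout here is my own illustration and deliberately simpler than the patch's:

    /* pshufb_shift_demo.c -- variable right-shift by n bytes with one pshufb.
       Build: gcc -O2 -mssse3 pshufb_shift_demo.c */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    /* indices 0..15, then top-bit-set filler: a window at offset n shifts by n */
    static const uint8_t shift_tab[32] = {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
            0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
            0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
    };

    static __m128i srl_bytes(__m128i v, size_t n)       /* n <= 16 */
    {
            __m128i ctl = _mm_loadu_si128((const __m128i *)(shift_tab + n));
            return _mm_shuffle_epi8(v, ctl);            /* v >> (8*n) */
    }

    int main(void)
    {
            uint8_t in[16], out[16];
            for (int i = 0; i < 16; i++)
                    in[i] = (uint8_t)i;
            _mm_storeu_si128((__m128i *)out,
                             srl_bytes(_mm_loadu_si128((const __m128i *)in), 5));
            for (int i = 0; i < 16; i++)
                    printf("%02x ", out[i]);    /* 05 06 ... 0f 00 00 00 00 00 */
            printf("\n");
            return 0;
    }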
@@ -386,127 +377,39 @@ align 16
 
 align 16
 .less_than_16_left:
-        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
+        xor r10, r10
+        bts r10, arg3
+        dec r10
+        kmovw k2, r10w
+        vmovdqu8 xmm7{k2}{z}, [arg2]
+        vpshufb xmm7, xmm18     ; byte-reflect the plaintext
 
-        vpxor xmm1, xmm1
-        mov r11, rsp
-        vmovdqa [r11], xmm1
-
-        cmp arg3, 4
-        jl .only_less_than_4
-
-        ; backup the counter value
-        mov r9, arg3
-        cmp arg3, 8
-        jl .less_than_8_left
-
-        ; load 8 Bytes
-        mov rax, [arg2]
-        mov [r11], rax
-        add r11, 8
-        sub arg3, 8
-        add arg2, 8
-.less_than_8_left:
-
-        cmp arg3, 4
-        jl .less_than_4_left
-
-        ; load 4 Bytes
-        mov eax, [arg2]
-        mov [r11], eax
-        add r11, 4
-        sub arg3, 4
-        add arg2, 4
-.less_than_4_left:
-
-        cmp arg3, 2
-        jl .less_than_2_left
-
-        ; load 2 Bytes
-        mov ax, [arg2]
-        mov [r11], ax
-        add r11, 2
-        sub arg3, 2
-        add arg2, 2
-.less_than_2_left:
-        cmp arg3, 1
-        jl .zero_left
-
-        ; load 1 Byte
-        mov al, [arg2]
-        mov [r11], al
-
-.zero_left:
-        vmovdqa xmm7, [rsp]
-        vpshufb xmm7, xmm18
         vpxor xmm7, xmm0        ; xor the initial crc value
 
-        lea rax, [pshufb_shf_table + 16]
-        sub rax, r9
+        cmp arg3, 4
+        jb .only_less_than_4
+
+        lea rax, [rel pshufb_shf_table + 16]
+        sub rax, arg3
         vmovdqu xmm0, [rax]
         vpxor xmm0, [mask1]
         vpshufb xmm7,xmm0
         jmp .128_done
-
-align 16
+.only_less_than_4:
+        lea r11, [rel pshufb_shift_table + 3]
+        sub r11, arg3
+        vmovdqu xmm0, [r11]
+        vpshufb xmm7, xmm0
+        jmp .barrett
 
+align 32
 .exact_16_left:
         vmovdqu xmm7, [arg2]
-        vpshufb xmm7,  xmm18
+        vpshufb xmm7, xmm18
         vpxor xmm7, xmm0        ; xor the initial crc value
+        jmp .128_done
 
-.only_less_than_4:
-        cmp arg3, 3
-        jl .only_less_than_3
-
-        ; load 3 Bytes
-        mov al, [arg2]
-        mov [r11], al
-
-        mov al, [arg2+1]
-        mov [r11+1], al
-
-        mov al, [arg2+2]
-        mov [r11+2], al
-
-        vmovdqa xmm7, [rsp]
-        vpshufb xmm7, xmm18
-        vpxor xmm7, xmm0        ; xor the initial crc value
-
-        vpsrldq xmm7, 5
-        jmp .barrett
-
-.only_less_than_3:
-        cmp arg3, 2
-        jl .only_less_than_2
-
-        ; load 2 Bytes
-        mov al, [arg2]
-        mov [r11], al
-
-        mov al, [arg2+1]
-        mov [r11+1], al
-
-        vmovdqa xmm7, [rsp]
-        vpshufb xmm7, xmm18
-        vpxor xmm7, xmm0        ; xor the initial crc value
-
-        vpsrldq xmm7, 6
-        jmp .barrett
-
-.only_less_than_2:
-        ; load 1 Byte
-        mov al, [arg2]
-        mov [r11], al
-
-        vmovdqa xmm7, [rsp]
-        vpshufb xmm7, xmm18
-        vpxor xmm7, xmm0        ; xor the initial crc value
-
-        vpsrldq xmm7, 7
-        jmp .barrett
-
 section .data
 align 32
@@ -543,6 +446,13 @@ rk_2b: dq 0x17d3315d00000000
 INCLUDE_CONSTS
 %endif
 
+align 16
+pshufb_shift_table:
+        ;; use these values to shift data for the pshufb instruction
+        db 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C
+        db 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+        db 0xFF, 0xFF
+
 mask1:  dq 0x8080808080808080, 0x8080808080808080
 mask2:  dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
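Note: in all three files the sub-16-byte tail no longer round-trips through a zeroed stack buffer. The xor/bts/dec sequence materializes (1 << len) - 1 as a byte mask, and the zero-masked vmovdqu8 then loads exactly len bytes: masked-off lanes are zeroed and their addresses are not accessed, so the load cannot fault past the buffer. A standalone C sketch of the same idea, assuming AVX-512BW/VL; the names are illustrative:

    /* masked_tail_demo.c -- load a len<16 tail without overrunning the buffer.
       Build: gcc -O2 -mavx512bw -mavx512vl masked_tail_demo.c */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    static __m128i load_tail(const uint8_t *buf, size_t len)    /* len < 16 */
    {
            __mmask16 k = (__mmask16)((1u << len) - 1); /* xor/bts/dec in the asm */
            return _mm_maskz_loadu_epi8(k, buf);        /* other lanes zeroed */
    }

    int main(void)
    {
            uint8_t msg[5] = { 1, 2, 3, 4, 5 };
            uint8_t out[16];
            _mm_storeu_si128((__m128i *)out, load_tail(msg, sizeof(msg)));
            for (int i = 0; i < 16; i++)
                    printf("%02x ", out[i]);    /* 01 02 03 04 05 00 ... 00 */
            printf("\n");
            return 0;
    }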
diff --git a/crc/crc32_iscsi_by16_10.asm b/crc/crc32_iscsi_by16_10.asm
index 775a348..71102d2 100644
--- a/crc/crc32_iscsi_by16_10.asm
+++ b/crc/crc32_iscsi_by16_10.asm
@@ -73,32 +73,24 @@ section .text
         %xdefine arg1_low32 edx
 %endif
 
-%define TMP 16*0
-%ifidn __OUTPUT_FORMAT__, win64
-        %define XMM_SAVE 16*2
-        %define VARIABLE_OFFSET 16*12+8
-%else
-        %define VARIABLE_OFFSET 16*2+8
-%endif
-
 align 16
 mk_global FUNCTION_NAME, function
 FUNCTION_NAME:
         endbranch
-        sub rsp, VARIABLE_OFFSET
-
 %ifidn __OUTPUT_FORMAT__, win64
+        sub rsp, (16*10 + 8)
+
         ; push the xmm registers into the stack to maintain
-        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
-        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
-        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
-        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
-        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
-        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
-        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
-        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
-        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
-        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
+        vmovdqa [rsp + 16*0], xmm6
+        vmovdqa [rsp + 16*1], xmm7
+        vmovdqa [rsp + 16*2], xmm8
+        vmovdqa [rsp + 16*3], xmm9
+        vmovdqa [rsp + 16*4], xmm10
+        vmovdqa [rsp + 16*5], xmm11
+        vmovdqa [rsp + 16*6], xmm12
+        vmovdqa [rsp + 16*7], xmm13
+        vmovdqa [rsp + 16*8], xmm14
+        vmovdqa [rsp + 16*9], xmm15
 %endif
 
         ; check if smaller than 256B
@@ -216,9 +208,7 @@ FUNCTION_NAME:
 .16B_reduction_loop:
         vpclmulqdq xmm8, xmm7, xmm10, 0x1
         vpclmulqdq xmm7, xmm7, xmm10, 0x10
-        vpxor xmm7, xmm8
-        vmovdqu xmm0, [arg2]
-        vpxor xmm7, xmm0
+        vpternlogq xmm7, xmm8, [arg2], 0x96
         add arg2, 16
         sub arg3, 16
         ; instead of a cmp instruction, we utilize the flags with the jge instruction
@@ -245,7 +235,7 @@ FUNCTION_NAME:
 
         ; get rid of the extra data that was loaded before
         ; load the shift constant
-        lea rax, [pshufb_shf_table]
+        lea rax, [rel pshufb_shf_table]
         add rax, arg3
         vmovdqu xmm0, [rax]
@@ -257,57 +247,32 @@ FUNCTION_NAME:
         ;;;;;;;;;;
         vpclmulqdq xmm8, xmm7, xmm10, 0x1
         vpclmulqdq xmm7, xmm7, xmm10, 0x10
-        vpxor xmm7, xmm8
-        vpxor xmm7, xmm2
+        vpternlogq xmm7, xmm8, xmm2, 0x96
 
 .128_done:
         ; compute crc of a 128-bit value
-        vmovdqa xmm10, [rk5]
-        vmovdqa xmm0, xmm7
-
-        ;64b fold
-        vpclmulqdq xmm7, xmm10, 0
-        vpsrldq xmm0, 8
-        vpxor xmm7, xmm0
-
-        ;32b fold
-        vmovdqa xmm0, xmm7
-        vpslldq xmm7, 4
-        vpclmulqdq xmm7, xmm10, 0x10
-        vpxor xmm7, xmm0
-
-
-        ;barrett reduction
-.barrett:
-        vpand xmm7, [mask2]
-        vmovdqa xmm1, xmm7
-        vmovdqa xmm2, xmm7
-        vmovdqa xmm10, [rk7]
-
-        vpclmulqdq xmm7, xmm10, 0
-        vpxor xmm7, xmm2
-        vpand xmm7, [mask]
-        vmovdqa xmm2, xmm7
-        vpclmulqdq xmm7, xmm10, 0x10
-        vpxor xmm7, xmm2
-        vpxor xmm7, xmm1
-        vpextrd eax, xmm7, 2
+        xor rax, rax
+        vmovq r11, xmm7
+        crc32 rax, r11
+        vpextrq r11, xmm7, 1
+        crc32 rax, r11
 
 .cleanup:
 
 %ifidn __OUTPUT_FORMAT__, win64
-        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
-        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
-        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
-        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
-        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
-        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
-        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
-        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
-        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
-        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
+        vmovdqa xmm6, [rsp + 16*0]
+        vmovdqa xmm7, [rsp + 16*1]
+        vmovdqa xmm8, [rsp + 16*2]
+        vmovdqa xmm9, [rsp + 16*3]
+        vmovdqa xmm10, [rsp + 16*4]
+        vmovdqa xmm11, [rsp + 16*5]
+        vmovdqa xmm12, [rsp + 16*6]
+        vmovdqa xmm13, [rsp + 16*7]
+        vmovdqa xmm14, [rsp + 16*8]
+        vmovdqa xmm15, [rsp + 16*9]
+
+        add rsp, (16*10 + 8)
 %endif
-        add rsp, VARIABLE_OFFSET
         ret
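Note: only this iSCSI variant can retire the rk5/Barrett constants in .128_done, because the SSE4.2 crc32 instruction implements exactly this polynomial (CRC32-C, 0x11EDC6F41, bit-reflected). Once the input has been folded down to one 16-byte value, two 8-byte crc32 steps finish the reduction, and the accumulator starts at zero because the running CRC is already folded into xmm7. A standalone C sketch with illustrative names:

    /* crc32c_finish_demo.c -- finish a folded 128-bit remainder with crc32.
       Build: gcc -O2 -msse4.2 crc32c_finish_demo.c */
    #include <nmmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t crc32c_finish_128(uint64_t lo, uint64_t hi)
    {
            uint64_t crc = 0;               /* xor     rax, rax         */
            crc = _mm_crc32_u64(crc, lo);   /* vmovq   + crc32 rax, r11 */
            crc = _mm_crc32_u64(crc, hi);   /* vpextrq + crc32 rax, r11 */
            return (uint32_t)crc;
    }

    int main(void)
    {
            printf("%08x\n", crc32c_finish_128(0x0123456789abcdefULL,
                                               0xfedcba9876543210ULL));
            return 0;
    }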
@@ -361,62 +326,19 @@ align 16
 
 align 16
 .less_than_16_left:
-        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
-
-        vpxor xmm1, xmm1
-        mov r11, rsp
-        vmovdqa [r11], xmm1
-
         cmp arg3, 4
         jl .only_less_than_4
 
-        ; backup the counter value
-        mov r9, arg3
-        cmp arg3, 8
-        jl .less_than_8_left
+        xor r10, r10
+        bts r10, arg3
+        dec r10
+        kmovw k2, r10w
+        vmovdqu8 xmm7{k2}{z}, [arg2]
 
-        ; load 8 Bytes
-        mov rax, [arg2]
-        mov [r11], rax
-        add r11, 8
-        sub arg3, 8
-        add arg2, 8
-.less_than_8_left:
-
-        cmp arg3, 4
-        jl .less_than_4_left
-
-        ; load 4 Bytes
-        mov eax, [arg2]
-        mov [r11], eax
-        add r11, 4
-        sub arg3, 4
-        add arg2, 4
-.less_than_4_left:
-
-        cmp arg3, 2
-        jl .less_than_2_left
-
-        ; load 2 Bytes
-        mov ax, [arg2]
-        mov [r11], ax
-        add r11, 2
-        sub arg3, 2
-        add arg2, 2
-.less_than_2_left:
-        cmp arg3, 1
-        jl .zero_left
-
-        ; load 1 Byte
-        mov al, [arg2]
-        mov [r11], al
-
-.zero_left:
-        vmovdqa xmm7, [rsp]
         vpxor xmm7, xmm0        ; xor the initial crc value
 
-        lea rax,[pshufb_shf_table]
-        vmovdqu xmm0, [rax + r9]
+        lea rax, [rel pshufb_shf_table]
+        vmovdqu xmm0, [rax + arg3]
         vpshufb xmm7,xmm0
         jmp .128_done
@@ -427,52 +349,23 @@ align 16
         jmp .128_done
 
 .only_less_than_4:
-        cmp arg3, 3
-        jl .only_less_than_3
-
-        ; load 3 Bytes
-        mov al, [arg2]
-        mov [r11], al
-
-        mov al, [arg2+1]
-        mov [r11+1], al
-
-        mov al, [arg2+2]
-        mov [r11+2], al
-
-        vmovdqa xmm7, [rsp]
-        vpxor xmm7, xmm0        ; xor the initial crc value
-
-        vpslldq xmm7, 5
-        jmp .barrett
-
-.only_less_than_3:
+        mov eax, arg1_low32
         cmp arg3, 2
-        jl .only_less_than_2
+        jb .only_1_left
+        je .only_2_left
 
-        ; load 2 Bytes
-        mov al, [arg2]
-        mov [r11], al
+        ; 3 bytes left
+        crc32 eax, word [arg2]
+        crc32 eax, byte [arg2 + 2]
+        jmp .cleanup
 
-        mov al, [arg2+1]
-        mov [r11+1], al
+.only_2_left:
+        crc32 eax, word [arg2]
+        jmp .cleanup
 
-        vmovdqa xmm7, [rsp]
-        vpxor xmm7, xmm0        ; xor the initial crc value
-
-        vpslldq xmm7, 6
-        jmp .barrett
-
-.only_less_than_2:
-        ; load 1 Byte
-        mov al, [arg2]
-        mov [r11], al
-
-        vmovdqa xmm7, [rsp]
-        vpxor xmm7, xmm0        ; xor the initial crc value
-
-        vpslldq xmm7, 7
-        jmp .barrett
+.only_1_left:
+        crc32 eax, byte [arg2]
+        jmp .cleanup
 
 section .data
 align 32
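Note: the rewritten .only_less_than_4 above likewise drops the stack staging and the whole vpslldq/.barrett path for 1-3 trailing bytes; it seeds crc32 with the caller's running CRC (mov eax, arg1_low32) and consumes the tail directly from the source buffer. A standalone C sketch mirroring the three branches, with illustrative names:

    /* crc32c_tail_demo.c -- 1..3 trailing bytes, seeded with the running CRC.
       Build: gcc -O2 -msse4.2 crc32c_tail_demo.c */
    #include <nmmintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static uint32_t crc32c_tail(uint32_t crc, const uint8_t *p, size_t len)
    {
            if (len >= 2) {                         /* 2- and 3-byte tails */
                    uint16_t w;
                    memcpy(&w, p, sizeof(w));       /* crc32 eax, word [arg2] */
                    crc = _mm_crc32_u16(crc, w);
                    p += 2;
                    len -= 2;
            }
            if (len)                                /* odd trailing byte */
                    crc = _mm_crc32_u8(crc, *p);    /* crc32 eax, byte [...] */
            return crc;
    }

    int main(void)
    {
            const uint8_t msg[3] = { 0xDE, 0xAD, 0xBE };
            printf("%08x\n", crc32c_tail(0, msg, sizeof(msg)));
            return 0;
    }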