crc: optimize last bytes

Change-Id: I4b8f73b23eb50c4c50ca65fab19716f217fe5780
Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
Pablo de Lara 2023-08-11 16:00:12 +01:00
parent e53db85631
commit beab678fb8
3 changed files with 155 additions and 441 deletions
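The recurring pattern in all three files below is the replacement of a pair of vpxor instructions (plus, in some spots, a separate vmovdqu load) with a single vpternlogq whose immediate 0x96 encodes a three-way XOR. As a rough, hedged illustration only (not code from the commit; function names are invented), the equivalence in C intrinsics terms:

    #include <immintrin.h>

    /* Folding step "result = clmul_lo ^ clmul_hi ^ data" written two ways.
     * _mm_ternarylogic_epi64 requires AVX-512F + AVX-512VL. */
    static inline __m128i fold_xor3_sse(__m128i clmul_lo, __m128i clmul_hi,
                                        const void *src)
    {
        __m128i data = _mm_loadu_si128((const __m128i *)src);
        return _mm_xor_si128(_mm_xor_si128(clmul_lo, clmul_hi), data); /* two XORs */
    }

    static inline __m128i fold_xor3_avx512(__m128i clmul_lo, __m128i clmul_hi,
                                           const void *src)
    {
        /* Immediate 0x96 is the truth table of A ^ B ^ C, so one instruction
         * replaces both XORs; the assembly additionally folds the load into
         * the memory operand of vpternlogq. */
        return _mm_ternarylogic_epi64(clmul_lo, clmul_hi,
                                      _mm_loadu_si128((const __m128i *)src), 0x96);
    }

The same immediate trick appears below with 0x28, which is the truth table of (A ^ B) & C, in the Barrett reduction step.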


@@ -83,34 +83,27 @@ section .text
%xdefine arg1_low32 edi
%endif
%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
%define XMM_SAVE 16*2
%define VARIABLE_OFFSET 16*12+8
%else
%define VARIABLE_OFFSET 16*2+8
%endif
align 16
mk_global FUNCTION_NAME, function
FUNCTION_NAME:
endbranch
not arg1_low32
sub rsp, VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
sub rsp, (16*10 + 8)
; push the xmm registers into the stack to maintain
vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
vmovdqa [rsp + 16*0], xmm6
vmovdqa [rsp + 16*1], xmm7
vmovdqa [rsp + 16*2], xmm8
vmovdqa [rsp + 16*3], xmm9
vmovdqa [rsp + 16*4], xmm10
vmovdqa [rsp + 16*5], xmm11
vmovdqa [rsp + 16*6], xmm12
vmovdqa [rsp + 16*7], xmm13
vmovdqa [rsp + 16*8], xmm14
vmovdqa [rsp + 16*9], xmm15
%endif
; check if smaller than 256B
@@ -228,9 +221,7 @@ FUNCTION_NAME:
.16B_reduction_loop:
vpclmulqdq xmm8, xmm7, xmm10, 0x1
vpclmulqdq xmm7, xmm7, xmm10, 0x10
vpxor xmm7, xmm8
vmovdqu xmm0, [arg2]
vpxor xmm7, xmm0
vpternlogq xmm7, xmm8, [arg2], 0x96
add arg2, 16
sub arg3, 16
; instead of a cmp instruction, we utilize the flags with the jge instruction
@@ -257,7 +248,7 @@ FUNCTION_NAME:
; get rid of the extra data that was loaded before
; load the shift constant
lea rax, [pshufb_shf_table]
lea rax, [rel pshufb_shf_table]
add rax, arg3
vmovdqu xmm0, [rax]
@@ -269,8 +260,7 @@ FUNCTION_NAME:
;;;;;;;;;;
vpclmulqdq xmm8, xmm7, xmm10, 0x1
vpclmulqdq xmm7, xmm7, xmm10, 0x10
vpxor xmm7, xmm8
vpxor xmm7, xmm2
vpternlogq xmm7, xmm8, xmm2, 0x96
.128_done:
; compute crc of a 128-bit value
@@ -297,12 +287,10 @@ FUNCTION_NAME:
vmovdqa xmm10, [rk7]
vpclmulqdq xmm7, xmm10, 0
vpxor xmm7, xmm2
vpand xmm7, [mask]
vpternlogq xmm7, xmm2, [mask], 0x28
vmovdqa xmm2, xmm7
vpclmulqdq xmm7, xmm10, 0x10
vpxor xmm7, xmm2
vpxor xmm7, xmm1
vpternlogq xmm7, xmm2, xmm1, 0x96
vpextrd eax, xmm7, 2
.cleanup:
@@ -310,18 +298,19 @@ FUNCTION_NAME:
%ifidn __OUTPUT_FORMAT__, win64
vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
vmovdqa xmm6, [rsp + 16*0]
vmovdqa xmm7, [rsp + 16*1]
vmovdqa xmm8, [rsp + 16*2]
vmovdqa xmm9, [rsp + 16*3]
vmovdqa xmm10, [rsp + 16*4]
vmovdqa xmm11, [rsp + 16*5]
vmovdqa xmm12, [rsp + 16*6]
vmovdqa xmm13, [rsp + 16*7]
vmovdqa xmm14, [rsp + 16*8]
vmovdqa xmm15, [rsp + 16*9]
add rsp, (16*10 + 8)
%endif
add rsp, VARIABLE_OFFSET
ret
@@ -375,62 +364,19 @@ align 16
align 16
.less_than_16_left:
; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
xor r10, r10
bts r10, arg3
dec r10
kmovw k2, r10w
vmovdqu8 xmm7{k2}{z}, [arg2]
vpxor xmm1, xmm1
mov r11, rsp
vmovdqa [r11], xmm1
cmp arg3, 4
jl .only_less_than_4
; backup the counter value
mov r9, arg3
cmp arg3, 8
jl .less_than_8_left
; load 8 Bytes
mov rax, [arg2]
mov [r11], rax
add r11, 8
sub arg3, 8
add arg2, 8
.less_than_8_left:
cmp arg3, 4
jl .less_than_4_left
; load 4 Bytes
mov eax, [arg2]
mov [r11], eax
add r11, 4
sub arg3, 4
add arg2, 4
.less_than_4_left:
cmp arg3, 2
jl .less_than_2_left
; load 2 Bytes
mov ax, [arg2]
mov [r11], ax
add r11, 2
sub arg3, 2
add arg2, 2
.less_than_2_left:
cmp arg3, 1
jl .zero_left
; load 1 Byte
mov al, [arg2]
mov [r11], al
.zero_left:
vmovdqa xmm7, [rsp]
vpxor xmm7, xmm0 ; xor the initial crc value
lea rax,[pshufb_shf_table]
vmovdqu xmm0, [rax + r9]
cmp arg3, 4
jb .only_less_than_4
lea rax, [rel pshufb_shf_table]
vmovdqu xmm0, [rax + arg3]
vpshufb xmm7,xmm0
jmp .128_done
@@ -441,52 +387,10 @@ align 16
jmp .128_done
.only_less_than_4:
cmp arg3, 3
jl .only_less_than_3
; load 3 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
mov al, [arg2+2]
mov [r11+2], al
vmovdqa xmm7, [rsp]
vpxor xmm7, xmm0 ; xor the initial crc value
vpslldq xmm7, 5
jmp .barrett
.only_less_than_3:
cmp arg3, 2
jl .only_less_than_2
; load 2 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
vmovdqa xmm7, [rsp]
vpxor xmm7, xmm0 ; xor the initial crc value
vpslldq xmm7, 6
jmp .barrett
.only_less_than_2:
; load 1 Byte
mov al, [arg2]
mov [r11], al
vmovdqa xmm7, [rsp]
vpxor xmm7, xmm0 ; xor the initial crc value
vpslldq xmm7, 7
jmp .barrett
lea r11, [rel pshufb_shift_table]
vmovdqu xmm0, [r11 + arg3]
vpshufb xmm7, xmm0
jmp .barrett
section .data
align 32
@@ -545,6 +449,13 @@ pshufb_shf_table:
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
align 16
pshufb_shift_table:
;; use these values to shift data for the pshufb instruction
db 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
db 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
db 0x08, 0x09, 0x0A
mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3: dq 0x8080808080808080, 0x8080808080808080
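
The .less_than_16_left rewrite in this file (and in the two files below) drops the old byte ladder that copied 8/4/2/1 bytes into a zeroed 16-byte stack slot, and instead builds the mask (1 << len) - 1 with bts/dec and performs a single masked, zero-extending load (vmovdqu8 xmm7{k2}{z}). A minimal C intrinsics sketch of that idea, assuming AVX512BW/VL, with an invented function name and purely for illustration:

    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Load len (< 16) bytes into an XMM register with the upper bytes zeroed,
     * without a stack bounce and without reading past the end of the buffer. */
    static inline __m128i load_tail(const uint8_t *src, size_t len)
    {
        __mmask16 k = (__mmask16)((1u << len) - 1u);  /* same as bts/dec in the asm */
        return _mm_maskz_loadu_epi8(k, src);
    }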


@@ -73,34 +73,27 @@ section .text
%xdefine arg1_low32 edi
%endif
%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
%define XMM_SAVE 16*2
%define VARIABLE_OFFSET 16*12+8
%else
%define VARIABLE_OFFSET 16*2+8
%endif
align 16
mk_global FUNCTION_NAME, function
FUNCTION_NAME:
endbranch
not arg1_low32
sub rsp, VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
sub rsp, (16*10 + 8)
; push the xmm registers into the stack to maintain
vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
vmovdqa [rsp + 16*0], xmm6
vmovdqa [rsp + 16*1], xmm7
vmovdqa [rsp + 16*2], xmm8
vmovdqa [rsp + 16*3], xmm9
vmovdqa [rsp + 16*4], xmm10
vmovdqa [rsp + 16*5], xmm11
vmovdqa [rsp + 16*6], xmm12
vmovdqa [rsp + 16*7], xmm13
vmovdqa [rsp + 16*8], xmm14
vmovdqa [rsp + 16*9], xmm15
%endif
vbroadcasti32x4 zmm18, [SHUF_MASK]
@@ -269,7 +262,7 @@ FUNCTION_NAME:
; get rid of the extra data that was loaded before
; load the shift constant
lea rax, [pshufb_shf_table + 16]
lea rax, [rel pshufb_shf_table + 16]
sub rax, arg3
vmovdqu xmm0, [rax]
@@ -280,8 +273,7 @@ FUNCTION_NAME:
vpclmulqdq xmm8, xmm7, xmm10, 0x11
vpclmulqdq xmm7, xmm7, xmm10, 0x00
vpxor xmm7, xmm8
vpxor xmm7, xmm1
vpternlogq xmm7, xmm8, xmm1, 0x96
.128_done:
; compute crc of a 128-bit value
@@ -294,8 +286,7 @@ FUNCTION_NAME:
vpxor xmm7, xmm0
;32b fold
vmovdqa xmm0, xmm7
vpand xmm0, [mask2]
vpand xmm0, xmm7, [mask2]
vpsrldq xmm7, 12
vpclmulqdq xmm7, xmm10, 0x10
vpxor xmm7, xmm0
@@ -317,18 +308,18 @@ FUNCTION_NAME:
%ifidn __OUTPUT_FORMAT__, win64
vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
vmovdqa xmm6, [rsp + 16*0]
vmovdqa xmm7, [rsp + 16*1]
vmovdqa xmm8, [rsp + 16*2]
vmovdqa xmm9, [rsp + 16*3]
vmovdqa xmm10, [rsp + 16*4]
vmovdqa xmm11, [rsp + 16*5]
vmovdqa xmm12, [rsp + 16*6]
vmovdqa xmm13, [rsp + 16*7]
vmovdqa xmm14, [rsp + 16*8]
vmovdqa xmm15, [rsp + 16*9]
add rsp, (16*10 + 8)
%endif
add rsp, VARIABLE_OFFSET
ret
@@ -386,127 +377,39 @@ align 16
align 16
.less_than_16_left:
; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
xor r10, r10
bts r10, arg3
dec r10
kmovw k2, r10w
vmovdqu8 xmm7{k2}{z}, [arg2]
vpshufb xmm7, xmm18 ; byte-reflect the plaintext
vpxor xmm1, xmm1
mov r11, rsp
vmovdqa [r11], xmm1
cmp arg3, 4
jl .only_less_than_4
; backup the counter value
mov r9, arg3
cmp arg3, 8
jl .less_than_8_left
; load 8 Bytes
mov rax, [arg2]
mov [r11], rax
add r11, 8
sub arg3, 8
add arg2, 8
.less_than_8_left:
cmp arg3, 4
jl .less_than_4_left
; load 4 Bytes
mov eax, [arg2]
mov [r11], eax
add r11, 4
sub arg3, 4
add arg2, 4
.less_than_4_left:
cmp arg3, 2
jl .less_than_2_left
; load 2 Bytes
mov ax, [arg2]
mov [r11], ax
add r11, 2
sub arg3, 2
add arg2, 2
.less_than_2_left:
cmp arg3, 1
jl .zero_left
; load 1 Byte
mov al, [arg2]
mov [r11], al
.zero_left:
vmovdqa xmm7, [rsp]
vpshufb xmm7, xmm18
vpxor xmm7, xmm0 ; xor the initial crc value
lea rax, [pshufb_shf_table + 16]
sub rax, r9
cmp arg3, 4
jb .only_less_than_4
lea rax, [rel pshufb_shf_table + 16]
sub rax, arg3
vmovdqu xmm0, [rax]
vpxor xmm0, [mask1]
vpshufb xmm7,xmm0
jmp .128_done
align 16
.only_less_than_4:
lea r11, [rel pshufb_shift_table + 3]
sub r11, arg3
vmovdqu xmm0, [r11]
vpshufb xmm7, xmm0
jmp .barrett
align 32
.exact_16_left:
vmovdqu xmm7, [arg2]
vpshufb xmm7, xmm18
vpxor xmm7, xmm0 ; xor the initial crc value
jmp .128_done
.only_less_than_4:
cmp arg3, 3
jl .only_less_than_3
; load 3 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
mov al, [arg2+2]
mov [r11+2], al
vmovdqa xmm7, [rsp]
vpshufb xmm7, xmm18
vpxor xmm7, xmm0 ; xor the initial crc value
vpsrldq xmm7, 5
jmp .barrett
.only_less_than_3:
cmp arg3, 2
jl .only_less_than_2
; load 2 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
vmovdqa xmm7, [rsp]
vpshufb xmm7, xmm18
vpxor xmm7, xmm0 ; xor the initial crc value
vpsrldq xmm7, 6
jmp .barrett
.only_less_than_2:
; load 1 Byte
mov al, [arg2]
mov [r11], al
vmovdqa xmm7, [rsp]
vpshufb xmm7, xmm18
vpxor xmm7, xmm0 ; xor the initial crc value
vpsrldq xmm7, 7
jmp .barrett
section .data
align 32
@@ -543,6 +446,13 @@ rk_2b: dq 0x17d3315d00000000
INCLUDE_CONSTS
%endif
align 16
pshufb_shift_table:
;; use these values to shift data for the pshufb instruction
db 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
db 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF
db 0xFF, 0xFF
mask1: dq 0x8080808080808080, 0x8080808080808080
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
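
Both new pshufb_shift_table blocks rely on the same pshufb property: a control byte with its top bit set (the 0xFF entries) zeroes that destination lane, so a single vpshufb both moves the short tail into the position the reduction expects and clears the rest of the register, replacing the old stack copy plus vpslldq/vpsrldq paths. A hedged, generic C sketch of that mechanism (not the commit's exact tables; names invented):

    #include <immintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* Byte-wise right shift of an XMM value by n, zero-filling from the top,
     * done with one pshufb and a control vector laid out the way the shift
     * tables in the diff are. */
    static inline __m128i bsrl_var(__m128i v, int n)   /* 0 <= n <= 16 */
    {
        uint8_t ctl[16];
        for (int i = 0; i < 16; i++)
            ctl[i] = (uint8_t)((i + n < 16) ? (i + n) : 0xFF); /* 0xFF => zero lane */
        __m128i mask;
        memcpy(&mask, ctl, sizeof(mask));
        return _mm_shuffle_epi8(v, mask);   /* SSSE3 pshufb */
    }

Indexing a small static table at (base + len), as the assembly does, yields the same control vector without building it at run time.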


@@ -73,32 +73,24 @@ section .text
%xdefine arg1_low32 edx
%endif
%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
%define XMM_SAVE 16*2
%define VARIABLE_OFFSET 16*12+8
%else
%define VARIABLE_OFFSET 16*2+8
%endif
align 16
mk_global FUNCTION_NAME, function
FUNCTION_NAME:
endbranch
sub rsp, VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
sub rsp, (16*10 + 8)
; push the xmm registers into the stack to maintain
vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
vmovdqa [rsp + 16*0], xmm6
vmovdqa [rsp + 16*1], xmm7
vmovdqa [rsp + 16*2], xmm8
vmovdqa [rsp + 16*3], xmm9
vmovdqa [rsp + 16*4], xmm10
vmovdqa [rsp + 16*5], xmm11
vmovdqa [rsp + 16*6], xmm12
vmovdqa [rsp + 16*7], xmm13
vmovdqa [rsp + 16*8], xmm14
vmovdqa [rsp + 16*9], xmm15
%endif
; check if smaller than 256B
@@ -216,9 +208,7 @@ FUNCTION_NAME:
.16B_reduction_loop:
vpclmulqdq xmm8, xmm7, xmm10, 0x1
vpclmulqdq xmm7, xmm7, xmm10, 0x10
vpxor xmm7, xmm8
vmovdqu xmm0, [arg2]
vpxor xmm7, xmm0
vpternlogq xmm7, xmm8, [arg2], 0x96
add arg2, 16
sub arg3, 16
; instead of a cmp instruction, we utilize the flags with the jge instruction
@@ -245,7 +235,7 @@ FUNCTION_NAME:
; get rid of the extra data that was loaded before
; load the shift constant
lea rax, [pshufb_shf_table]
lea rax, [rel pshufb_shf_table]
add rax, arg3
vmovdqu xmm0, [rax]
@@ -257,57 +247,32 @@ FUNCTION_NAME:
;;;;;;;;;;
vpclmulqdq xmm8, xmm7, xmm10, 0x1
vpclmulqdq xmm7, xmm7, xmm10, 0x10
vpxor xmm7, xmm8
vpxor xmm7, xmm2
vpternlogq xmm7, xmm8, xmm2, 0x96
.128_done:
; compute crc of a 128-bit value
vmovdqa xmm10, [rk5]
vmovdqa xmm0, xmm7
;64b fold
vpclmulqdq xmm7, xmm10, 0
vpsrldq xmm0, 8
vpxor xmm7, xmm0
;32b fold
vmovdqa xmm0, xmm7
vpslldq xmm7, 4
vpclmulqdq xmm7, xmm10, 0x10
vpxor xmm7, xmm0
;barrett reduction
.barrett:
vpand xmm7, [mask2]
vmovdqa xmm1, xmm7
vmovdqa xmm2, xmm7
vmovdqa xmm10, [rk7]
vpclmulqdq xmm7, xmm10, 0
vpxor xmm7, xmm2
vpand xmm7, [mask]
vmovdqa xmm2, xmm7
vpclmulqdq xmm7, xmm10, 0x10
vpxor xmm7, xmm2
vpxor xmm7, xmm1
vpextrd eax, xmm7, 2
xor rax, rax
vmovq r11, xmm7
crc32 rax, r11
vpextrq r11, xmm7, 1
crc32 rax, r11
.cleanup:
%ifidn __OUTPUT_FORMAT__, win64
vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
vmovdqa xmm6, [rsp + 16*0]
vmovdqa xmm7, [rsp + 16*1]
vmovdqa xmm8, [rsp + 16*2]
vmovdqa xmm9, [rsp + 16*3]
vmovdqa xmm10, [rsp + 16*4]
vmovdqa xmm11, [rsp + 16*5]
vmovdqa xmm12, [rsp + 16*6]
vmovdqa xmm13, [rsp + 16*7]
vmovdqa xmm14, [rsp + 16*8]
vmovdqa xmm15, [rsp + 16*9]
add rsp, (16*10 + 8)
%endif
add rsp, VARIABLE_OFFSET
ret
@@ -361,62 +326,19 @@ align 16
align 16
.less_than_16_left:
; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
vpxor xmm1, xmm1
mov r11, rsp
vmovdqa [r11], xmm1
cmp arg3, 4
jl .only_less_than_4
; backup the counter value
mov r9, arg3
cmp arg3, 8
jl .less_than_8_left
xor r10, r10
bts r10, arg3
dec r10
kmovw k2, r10w
vmovdqu8 xmm7{k2}{z}, [arg2]
; load 8 Bytes
mov rax, [arg2]
mov [r11], rax
add r11, 8
sub arg3, 8
add arg2, 8
.less_than_8_left:
cmp arg3, 4
jl .less_than_4_left
; load 4 Bytes
mov eax, [arg2]
mov [r11], eax
add r11, 4
sub arg3, 4
add arg2, 4
.less_than_4_left:
cmp arg3, 2
jl .less_than_2_left
; load 2 Bytes
mov ax, [arg2]
mov [r11], ax
add r11, 2
sub arg3, 2
add arg2, 2
.less_than_2_left:
cmp arg3, 1
jl .zero_left
; load 1 Byte
mov al, [arg2]
mov [r11], al
.zero_left:
vmovdqa xmm7, [rsp]
vpxor xmm7, xmm0 ; xor the initial crc value
lea rax,[pshufb_shf_table]
vmovdqu xmm0, [rax + r9]
lea rax, [rel pshufb_shf_table]
vmovdqu xmm0, [rax + arg3]
vpshufb xmm7,xmm0
jmp .128_done
@@ -427,52 +349,23 @@ align 16
jmp .128_done
.only_less_than_4:
cmp arg3, 3
jl .only_less_than_3
; load 3 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
mov al, [arg2+2]
mov [r11+2], al
vmovdqa xmm7, [rsp]
vpxor xmm7, xmm0 ; xor the initial crc value
vpslldq xmm7, 5
jmp .barrett
.only_less_than_3:
mov eax, arg1_low32
cmp arg3, 2
jl .only_less_than_2
jb .only_1_left
je .only_2_left
; load 2 Bytes
mov al, [arg2]
mov [r11], al
; 3 bytes left
crc32 eax, word [arg2]
crc32 eax, byte [arg2 + 2]
jmp .cleanup
mov al, [arg2+1]
mov [r11+1], al
.only_2_left:
crc32 eax, word [arg2]
jmp .cleanup
vmovdqa xmm7, [rsp]
vpxor xmm7, xmm0 ; xor the initial crc value
vpslldq xmm7, 6
jmp .barrett
.only_less_than_2:
; load 1 Byte
mov al, [arg2]
mov [r11], al
vmovdqa xmm7, [rsp]
vpxor xmm7, xmm0 ; xor the initial crc value
vpslldq xmm7, 7
jmp .barrett
.only_1_left:
crc32 eax, byte [arg2]
jmp .cleanup
section .data
align 32
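
In this last file the final-reduction and short-tail paths move off the SIMD Barrett reduction onto the hardware crc32 instruction: two 64-bit crc32 steps over the folded 128-bit value, then word/byte crc32 for any 1-3 leftover bytes. That is only valid because this variant appears to use the CRC32-C (Castagnoli) polynomial that the instruction implements. A hedged C sketch of the 1-3 byte tail with the SSE4.2 intrinsics, with invented names and not taken from the commit:

    #include <nmmintrin.h>   /* SSE4.2 _mm_crc32_* */
    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Fold the last 1-3 bytes into a running CRC32-C value, mirroring the new
     * .only_less_than_3 / .only_2_left / .only_1_left branches. */
    static inline uint32_t crc32c_tail(uint32_t crc, const uint8_t *p, size_t len)
    {
        uint16_t w;
        switch (len) {
        case 3:
            memcpy(&w, p, sizeof(w));
            crc = _mm_crc32_u16(crc, w);      /* crc32 eax, word [arg2]     */
            crc = _mm_crc32_u8(crc, p[2]);    /* crc32 eax, byte [arg2 + 2] */
            break;
        case 2:
            memcpy(&w, p, sizeof(w));
            crc = _mm_crc32_u16(crc, w);
            break;
        case 1:
            crc = _mm_crc32_u8(crc, p[0]);
            break;
        default:
            break;
        }
        return crc;
    }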