;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2025 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc32_iscsi_by8_02(
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len,               // buffer length in bytes (64-bit data)
;               UINT32 init_crc           // initial CRC value, 32 bits
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
;       The CRC32C (Castagnoli) checksum used by iSCSI is specified in RFC 3720:
;       http://www.ietf.org/rfc/rfc3720.txt

%include "reg_sizes.asm"

%ifndef fetch_dist
%define fetch_dist 4096
%endif

%ifndef PREFETCH
%define PREFETCH prefetcht1
%endif

[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg3_low32 r8d
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg3_low32 edx
%endif

%define in_buf          arg1
%define buf_len         arg2
%define init_crc        arg3_low32

%xdefine        tmp     r10
%xdefine        tmp2    r11

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_OFFSET 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
mk_global crc32_iscsi_by8_02, function
crc32_iscsi_by8_02:
        endbranch

        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack to maintain the win64 ABI
        ; (xmm6-xmm15 are callee-saved; only xmm6-xmm13 are used here)
        vmovdqa [rsp + XMM_OFFSET + 16*0], xmm6
        vmovdqa [rsp + XMM_OFFSET + 16*1], xmm7
        vmovdqa [rsp + XMM_OFFSET + 16*2], xmm8
        vmovdqa [rsp + XMM_OFFSET + 16*3], xmm9
        vmovdqa [rsp + XMM_OFFSET + 16*4], xmm10
        vmovdqa [rsp + XMM_OFFSET + 16*5], xmm11
        vmovdqa [rsp + XMM_OFFSET + 16*6], xmm12
        vmovdqa [rsp + XMM_OFFSET + 16*7], xmm13
%endif

        ;; fastpath for short data
        mov     eax, init_crc           ; running crc; also the return value for zero-length buffers
        cmp     buf_len, 4
        jb      _less_than_4
        cmp     buf_len, 8
        jb      _less_than_8
        cmp     buf_len, 16
        jbe     _no_more_than_16
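        ; The wide-buffer path below uses the PCLMULQDQ folding method from the
        ; paper cited in the header. A sketch of the recurrence (operand halves
        ; are selected by the pclmulqdq immediate; the rk constant exponents are
        ; not re-derived here):
        ;
        ;       A' = clmul(A.lo64, K.hi64) xor clmul(A.hi64, K.lo64) xor D
        ;
        ; where A is a 128-bit accumulator, K is the rk3:rk4 constant pair and
        ; D is the next 16B of data. Eight accumulators (xmm0 - xmm7) are folded
        ; in parallel, so each loop iteration consumes 128 bytes.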
        ; check if smaller than 256
        cmp     buf_len, 256

        ; for sizes less than 256, we can't fold 128B at a time...
        jl      _less_than_256

        ; load the initial crc value
        vmovd   xmm10, init_crc         ; initial crc

        ; load the initial 128B of data and xor in the initial crc value
        vmovdqu xmm0, [in_buf+16*0]
        vmovdqu xmm1, [in_buf+16*1]
        vmovdqu xmm2, [in_buf+16*2]
        vmovdqu xmm3, [in_buf+16*3]
        vmovdqu xmm4, [in_buf+16*4]
        vmovdqu xmm5, [in_buf+16*5]
        vmovdqu xmm6, [in_buf+16*6]
        vmovdqu xmm7, [in_buf+16*7]

        ; XOR the initial_crc value
        vpxor   xmm0, xmm10

        vmovdqa xmm10, [rk3]    ; xmm10 has rk3 and rk4
                                ; imm value of the pclmulqdq instruction will determine which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop
        sub     buf_len, 256

        ; at this point there are 128*x + y (0 <= y < 128) bytes of buffer.
        ; _fold_128_B_loop will fold 128B at a time until 128+y bytes remain

%if fetch_dist != 0
        ; check if there is at least 4KB (fetch distance) + 128B in the buffer
        cmp     buf_len, (fetch_dist + 128)
        jb      _fold_128_B_loop

        ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
align 16
_fold_and_prefetch_128_B_loop:
        ; update the buffer pointer
        add     in_buf, 128             ; buf += 128;

        PREFETCH [in_buf+fetch_dist+0]
        vmovdqu xmm9, [in_buf+16*0]
        vmovdqu xmm12, [in_buf+16*1]
        vpclmulqdq      xmm8, xmm0, xmm10, 0x10
        vpclmulqdq      xmm0, xmm0, xmm10, 0x1
        vpclmulqdq      xmm13, xmm1, xmm10, 0x10
        vpclmulqdq      xmm1, xmm1, xmm10, 0x1
        vpxor   xmm0, xmm9
        vpxor   xmm0, xmm8
        vpxor   xmm1, xmm12
        vpxor   xmm1, xmm13

        vmovdqu xmm9, [in_buf+16*2]
        vmovdqu xmm12, [in_buf+16*3]
        vpclmulqdq      xmm8, xmm2, xmm10, 0x10
        vpclmulqdq      xmm2, xmm2, xmm10, 0x1
        vpclmulqdq      xmm13, xmm3, xmm10, 0x10
        vpclmulqdq      xmm3, xmm3, xmm10, 0x1
        vpxor   xmm2, xmm9
        vpxor   xmm2, xmm8
        vpxor   xmm3, xmm12
        vpxor   xmm3, xmm13

        PREFETCH [in_buf+fetch_dist+64]
        vmovdqu xmm9, [in_buf+16*4]
        vmovdqu xmm12, [in_buf+16*5]
        vpclmulqdq      xmm8, xmm4, xmm10, 0x10
        vpclmulqdq      xmm4, xmm4, xmm10, 0x1
        vpclmulqdq      xmm13, xmm5, xmm10, 0x10
        vpclmulqdq      xmm5, xmm5, xmm10, 0x1
        vpxor   xmm4, xmm9
        vpxor   xmm4, xmm8
        vpxor   xmm5, xmm12
        vpxor   xmm5, xmm13

        vmovdqu xmm9, [in_buf+16*6]
        vmovdqu xmm12, [in_buf+16*7]
        vmovdqa xmm8, xmm6
        vmovdqa xmm13, xmm7
        vpclmulqdq      xmm6, xmm10, 0x10
        vpclmulqdq      xmm8, xmm10, 0x1
        vpclmulqdq      xmm7, xmm10, 0x10
        vpclmulqdq      xmm13, xmm10, 0x1
        vpxor   xmm6, xmm9
        vxorps  xmm6, xmm8
        vpxor   xmm7, xmm12
        vxorps  xmm7, xmm13

        sub     buf_len, 128

        ; check if there is another 4KB (fetch distance) + 128B in the buffer
        cmp     buf_len, (fetch_dist + 128)
        jge     _fold_and_prefetch_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%endif  ; fetch_dist != 0

align 16
_fold_128_B_loop:
        ; update the buffer pointer
        add     in_buf, 128             ; buf += 128;

        vmovdqu xmm9, [in_buf+16*0]
        vmovdqu xmm12, [in_buf+16*1]
        vmovdqa xmm8, xmm0
        vmovdqa xmm13, xmm1
        vpclmulqdq      xmm0, xmm10, 0x10
        vpclmulqdq      xmm8, xmm10, 0x1
        vpclmulqdq      xmm1, xmm10, 0x10
        vpclmulqdq      xmm13, xmm10, 0x1
        vpxor   xmm0, xmm9
        vxorps  xmm0, xmm8
        vpxor   xmm1, xmm12
        vxorps  xmm1, xmm13

        vmovdqu xmm9, [in_buf+16*2]
        vmovdqu xmm12, [in_buf+16*3]
        vmovdqa xmm8, xmm2
        vmovdqa xmm13, xmm3
        vpclmulqdq      xmm2, xmm10, 0x10
        vpclmulqdq      xmm8, xmm10, 0x1
        vpclmulqdq      xmm3, xmm10, 0x10
        vpclmulqdq      xmm13, xmm10, 0x1
        vpxor   xmm2, xmm9
        vxorps  xmm2, xmm8
        vpxor   xmm3, xmm12
        vxorps  xmm3, xmm13

        vmovdqu xmm9, [in_buf+16*4]
        vmovdqu xmm12, [in_buf+16*5]
        vmovdqa xmm8, xmm4
        vmovdqa xmm13, xmm5
        vpclmulqdq      xmm4, xmm10, 0x10
        vpclmulqdq      xmm8, xmm10, 0x1
        vpclmulqdq      xmm5, xmm10, 0x10
        vpclmulqdq      xmm13, xmm10, 0x1
        vpxor   xmm4, xmm9
        vxorps  xmm4, xmm8
        vpxor   xmm5, xmm12
        vxorps  xmm5, xmm13
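        ; last 32B of this 128B block: fold into xmm6/xmm7 using the
        ; three-operand pclmulqdq form, which needs no vmovdqa copies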
        vmovdqu xmm9, [in_buf+16*6]
        vmovdqu xmm12, [in_buf+16*7]
        vpclmulqdq      xmm8, xmm6, xmm10, 0x10
        vpclmulqdq      xmm6, xmm6, xmm10, 0x1
        vpclmulqdq      xmm13, xmm7, xmm10, 0x10
        vpclmulqdq      xmm7, xmm7, xmm10, 0x1
        vpxor   xmm6, xmm9
        vpxor   xmm6, xmm8
        vpxor   xmm7, xmm12
        vpxor   xmm7, xmm13

        sub     buf_len, 128

        ; check if there is another 128B in the buffer to be able to fold
        jge     _fold_128_B_loop

        add     in_buf, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer
        ; the 128 bytes of folded data are held in 8 xmm registers: xmm0 - xmm7

        ; fold the 8 xmm registers to 1 xmm register with different constants

        vmovdqa xmm10, [rk9]
        vpclmulqdq      xmm8, xmm0, xmm10, 0x1
        vpclmulqdq      xmm0, xmm0, xmm10, 0x10
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm0

        vmovdqa xmm10, [rk11]
        vpclmulqdq      xmm8, xmm1, xmm10, 0x1
        vpclmulqdq      xmm1, xmm1, xmm10, 0x10
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm1

        vmovdqa xmm10, [rk13]
        vpclmulqdq      xmm8, xmm2, xmm10, 0x1
        vpclmulqdq      xmm2, xmm2, xmm10, 0x10
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm2

        vmovdqa xmm10, [rk15]
        vpclmulqdq      xmm8, xmm3, xmm10, 0x1
        vpclmulqdq      xmm3, xmm3, xmm10, 0x10
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm3

        vmovdqa xmm10, [rk17]
        vpclmulqdq      xmm8, xmm4, xmm10, 0x1
        vpclmulqdq      xmm4, xmm4, xmm10, 0x10
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm4

        vmovdqa xmm10, [rk19]
        vpclmulqdq      xmm8, xmm5, xmm10, 0x1
        vpclmulqdq      xmm5, xmm5, xmm10, 0x10
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm5

        vmovdqa xmm10, [rk1]    ; xmm10 has rk1 and rk2
                                ; imm value of the pclmulqdq instruction will determine which constant to use
        vpclmulqdq      xmm8, xmm6, xmm10, 0x1
        vpclmulqdq      xmm6, xmm6, xmm10, 0x10
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm6

        ; instead of 128, we add 112 (128-16) to the loop counter to save one instruction from the loop
        ; instead of a cmp instruction, we use the sign flag with the jl instruction
        add     buf_len, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y >= 16
        ; continue folding 16B at a time

_16B_reduction_loop:
        vpclmulqdq      xmm8, xmm7, xmm10, 0x1
        vpclmulqdq      xmm7, xmm7, xmm10, 0x10
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [in_buf]
        vpxor   xmm7, xmm0
        add     in_buf, 16
        sub     buf_len, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp buf_len, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register

_final_reduction_for_128:
        ; check if any more data to fold. If not, compute the CRC of the final 128 bits
        add     buf_len, 16
        je      _128_done

        ; here we are handling a tail of less than 16 bytes.
        ; since we know that there was data before the pointer, we can offset the input pointer
        ; before the actual point, to receive exactly 16 bytes.
        ; after that the registers need to be adjusted.
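        ; Illustration of the adjustment below, with y = buf_len (1..15): the
        ; mask loaded from pshufb_shf_table + y shifts xmm7 up by 16-y byte
        ; lanes; xoring that mask with mask3 gives its complement, which shifts
        ; the copy in xmm2 down by y lanes; vpblendvb then replaces xmm2's top
        ; y lanes with the y new tail bytes reloaded in xmm1. The pair
        ; (xmm7, xmm2) again represents exactly 16 bytes of pending data, so
        ; one more 16B fold finishes the reduction.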
_get_last_two_xmms:
        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [in_buf - 16 + buf_len]

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table]
        add     rax, buf_len
        vmovdqu xmm0, [rax]

        vpshufb xmm7, xmm0
        vpxor   xmm0, [mask3]
        vpshufb xmm2, xmm0
        vpblendvb       xmm2, xmm2, xmm1, xmm0
        ;;;;;;;;;;
        vpclmulqdq      xmm8, xmm7, xmm10, 0x1
        vpclmulqdq      xmm7, xmm7, xmm10, 0x10
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm2

_128_done:
        ; compute the crc of the final 128-bit value
        ; feeding both qwords through the CRC32 instruction is simpler here than a Barrett reduction
        vmovq   tmp, xmm7
        vpextrq tmp2, xmm7, 1
        xor     rax, rax
        crc32   rax, tmp
        crc32   rax, tmp2

_cleanup:
%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_OFFSET + 16*0]
        vmovdqa xmm7, [rsp + XMM_OFFSET + 16*1]
        vmovdqa xmm8, [rsp + XMM_OFFSET + 16*2]
        vmovdqa xmm9, [rsp + XMM_OFFSET + 16*3]
        vmovdqa xmm10, [rsp + XMM_OFFSET + 16*4]
        vmovdqa xmm11, [rsp + XMM_OFFSET + 16*5]
        vmovdqa xmm12, [rsp + XMM_OFFSET + 16*6]
        vmovdqa xmm13, [rsp + XMM_OFFSET + 16*7]
%endif
        add     rsp, VARIABLE_OFFSET
        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:
        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     buf_len, 32
        jl      _less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]    ; rk1 and rk2 in xmm10

        vmovd   xmm0, init_crc  ; get the initial crc value
        vmovdqu xmm7, [in_buf]  ; load the plaintext
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     in_buf, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     buf_len, 32

        jmp     _16B_reduction_loop

align 16
_less_than_32:
        ; the length is 17..31 bytes here; shorter buffers were already handled by the fastpath
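        ; fold the first 16 bytes once; the remaining 1..15 byte tail is
        ; merged by _get_last_two_xmms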
        vmovd   xmm0, init_crc  ; get the initial crc value
        vmovdqu xmm7, [in_buf]  ; load the plaintext
        vpxor   xmm7, xmm0      ; xor the initial crc value

        add     in_buf, 16
        sub     buf_len, 16
        vmovdqa xmm10, [rk1]    ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms

        ; fastpath for short data
align 16
_no_more_than_16:
        test    buf_len, 16             ; check if exactly 16 bytes
        jz      _less_than_16           ; no, go test 8 bytes
        crc32   rax, qword[in_buf]
        crc32   rax, qword[in_buf+8]
        jmp     _cleanup                ; done

align 16
_less_than_16:
        test    buf_len, 8              ; check if at least 8 bytes remain
        jz      _less_than_8            ; no, go test 4 bytes
        crc32   rax, qword[in_buf]      ; process 8 bytes
        add     in_buf, 8
_less_than_8:
        test    buf_len, 4              ; check if at least 4 bytes remain
        jz      _less_than_4            ; no, go test 2 bytes
        crc32   eax, dword[in_buf]      ; process 4 bytes
        add     in_buf, 4
_less_than_4:
        test    buf_len, 2              ; check if at least 2 bytes remain
        jz      _less_than_2            ; no, go test 1 byte
        crc32   eax, word[in_buf]       ; process 2 bytes
        add     in_buf, 2
_less_than_2:
        test    buf_len, 1              ; check if 1 byte remains
        jz      _cleanup                ; no, done
        crc32   eax, byte[in_buf]       ; process the final byte
        jmp     _cleanup                ; all done

section .data

; precomputed constants
align 16
rk1:    dq 0x00000000493c7d27
rk2:    dq 0x0000000ec1068c50
rk3:    dq 0x0000000206e38d70
rk4:    dq 0x000000006992cea2
rk5:    dq 0x00000000493c7d27
rk6:    dq 0x00000000dd45aab8
rk7:    dq 0x00000000dea713f0
rk8:    dq 0x0000000105ec76f0
rk9:    dq 0x0000000047db8317
rk10:   dq 0x000000002ad91c30
rk11:   dq 0x000000000715ce53
rk12:   dq 0x00000000c49f4f67
rk13:   dq 0x0000000039d3b296
rk14:   dq 0x00000000083a6eec
rk15:   dq 0x000000009e4addf8
rk16:   dq 0x00000000740eef02
rk17:   dq 0x00000000ddc0152b
rk18:   dq 0x000000001c291d04
rk19:   dq 0x00000000ba4fc28e
rk20:   dq 0x000000003da6d0cb

mask:   dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2:  dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3:  dq 0x8080808080808080, 0x8080808080808080

pshufb_shf_table:
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x000e0d0c0b0a0908
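; each 16-byte window of pshufb_shf_table, indexed by the residual byte count
; (1..15), is read as a vpshufb mask: bytes with the high bit set (0x8x) zero
; their lane, the rest select an input byte, so a single unaligned load yields
; the realignment mask used in _get_last_two_xmms; xoring the mask with mask3
; (0x80 in every byte) flips which lanes are kept, producing the complementary
; shift for the vpblendvb merge.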