From 84cfe25fff490ffaba980910b0ecec42f6af90af Mon Sep 17 00:00:00 2001
From: Pablo de Lara
Date: Wed, 5 Nov 2025 14:27:37 +0000
Subject: [PATCH] crc: remove Avoton optimized implementations

The crc16_t10dif_by4 and crc32_ieee_by4 routines were only ever selected
on Avoton parts, via a CPUID stepping check in the multibinary
dispatchers. Remove them and let those dispatchers fall through to the
01/base implementations.

Signed-off-by: Pablo de Lara
---
 Makefile.nmake           |    2 -
 cmake/crc.cmake          |    2 -
 crc/Makefile.am          |    2 -
 crc/crc16_t10dif_by4.asm |  625 --------------------------------------
 crc/crc32_ieee_by4.asm   |  626 ---------------------------------------
 crc/crc_multibinary.asm  |   14 -
 include/reg_sizes.asm    |    1 -
 7 files changed, 1272 deletions(-)
 delete mode 100644 crc/crc16_t10dif_by4.asm
 delete mode 100644 crc/crc32_ieee_by4.asm

diff --git a/Makefile.nmake b/Makefile.nmake
index a850a6c..8a2b1f9 100644
--- a/Makefile.nmake
+++ b/Makefile.nmake
@@ -132,14 +132,12 @@ objs = \
 	bin\pq_gen_avx512_gfni.obj \
 	bin\raid_multibinary.obj \
 	bin\crc16_t10dif_01.obj \
-	bin\crc16_t10dif_by4.obj \
 	bin\crc16_t10dif_02.obj \
 	bin\crc16_t10dif_by16_10.obj \
 	bin\crc16_t10dif_copy_by4.obj \
 	bin\crc16_t10dif_copy_by4_02.obj \
 	bin\crc32_ieee_01.obj \
 	bin\crc32_ieee_02.obj \
-	bin\crc32_ieee_by4.obj \
 	bin\crc32_ieee_by16_10.obj \
 	bin\crc32_iscsi_01.obj \
 	bin\crc32_iscsi_by8_02.obj \
diff --git a/cmake/crc.cmake b/cmake/crc.cmake
index 5e9a915..2e4a0f0 100644
--- a/cmake/crc.cmake
+++ b/cmake/crc.cmake
@@ -37,14 +37,12 @@ set(CRC_BASE_ALIASES_SOURCES
 
 set(CRC_X86_64_SOURCES
     crc/crc16_t10dif_01.asm
-    crc/crc16_t10dif_by4.asm
     crc/crc16_t10dif_02.asm
     crc/crc16_t10dif_by16_10.asm
     crc/crc16_t10dif_copy_by4.asm
     crc/crc16_t10dif_copy_by4_02.asm
    crc/crc32_ieee_01.asm
     crc/crc32_ieee_02.asm
-    crc/crc32_ieee_by4.asm
     crc/crc32_ieee_by16_10.asm
     crc/crc32_iscsi_01.asm
     crc/crc32_iscsi_by8_02.asm
diff --git a/crc/Makefile.am b/crc/Makefile.am
index 1d66656..f3db0da 100644
--- a/crc/Makefile.am
+++ b/crc/Makefile.am
@@ -39,14 +39,12 @@ lsrc_riscv64 += crc/crc_base_aliases.c
 
 lsrc_x86_64 += \
 	crc/crc16_t10dif_01.asm \
-	crc/crc16_t10dif_by4.asm \
 	crc/crc16_t10dif_02.asm \
 	crc/crc16_t10dif_by16_10.asm \
 	crc/crc16_t10dif_copy_by4.asm \
 	crc/crc16_t10dif_copy_by4_02.asm \
 	crc/crc32_ieee_01.asm \
 	crc/crc32_ieee_02.asm \
-	crc/crc32_ieee_by4.asm \
 	crc/crc32_ieee_by16_10.asm \
 	crc/crc32_iscsi_01.asm \
 	crc/crc32_iscsi_by8_02.asm \
diff --git a/crc/crc16_t10dif_by4.asm b/crc/crc16_t10dif_by4.asm
deleted file mode 100644
index 0f63fb0..0000000
--- a/crc/crc16_t10dif_by4.asm
+++ /dev/null
@@ -1,625 +0,0 @@
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-;
-;  Redistribution and use in source and binary forms, with or without
-;  modification, are permitted provided that the following conditions
-;  are met:
-;    * Redistributions of source code must retain the above copyright
-;      notice, this list of conditions and the following disclaimer.
-;    * Redistributions in binary form must reproduce the above copyright
-;      notice, this list of conditions and the following disclaimer in
-;      the documentation and/or other materials provided with the
-;      distribution.
-;    * Neither the name of Intel Corporation nor the names of its
-;      contributors may be used to endorse or promote products derived
-;      from this software without specific prior written permission.
-;
-;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
-;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;       Function API:
-;       UINT16 crc16_t10dif_by4(
-;               UINT16 init_crc, //initial CRC value, 16 bits
-;               const unsigned char *buf, //buffer pointer to calculate CRC on
-;               UINT64 len //buffer length in bytes (64-bit data)
-;       );
-;
-;       Authors:
-;               Erdinc Ozturk
-;               Vinodh Gopal
-;               James Guilford
-;
-;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
-;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
-;
-
-%include "reg_sizes.asm"
-
-%ifndef fetch_dist
-%define fetch_dist 4096
-%endif
-
-%ifndef PREFETCH
-%define PREFETCH prefetcht1
-%endif
-
-[bits 64]
-default rel
-
-section .text
-%ifidn __OUTPUT_FORMAT__, win64
-	%xdefine	arg1 rcx
-	%xdefine	arg2 rdx
-	%xdefine	arg3 r8
-
-	%xdefine	arg1_low32 ecx
-%else
-	%xdefine	arg1 rdi
-	%xdefine	arg2 rsi
-	%xdefine	arg3 rdx
-
-	%xdefine	arg1_low32 edi
-%endif
-
-align 16
-mk_global crc16_t10dif_by4, function
-crc16_t10dif_by4:
-	endbranch
-
-	; adjust the 16-bit initial_crc value, scale it to 32 bits
-	shl	arg1_low32, 16
-
-	; After this point, code flow is exactly same as a 32-bit CRC.
-	; The only difference is before returning eax, we will shift
-	; it right 16 bits, to scale back to 16 bits.
-
-	sub	rsp, 16*4+8
-
-	; push the xmm registers into the stack to maintain
-	movdqa	[rsp+16*2], xmm6
-	movdqa	[rsp+16*3], xmm7
-
-	; check if smaller than 128B
-	cmp	arg3, 128
-
-	; for sizes less than 128, we can't fold 64B at a time...
-	jl	_less_than_128
-
-
-	; load the initial crc value
-	movd	xmm6, arg1_low32	; initial crc
-
-	; crc value does not need to be byte-reflected, but it needs to
-	; be moved to the high part of the register.
-	; because data will be byte-reflected and will align with
-	; initial crc at correct place.
-	pslldq	xmm6, 12
-
-	movdqa	xmm7, [SHUF_MASK]
-	; receive the initial 64B data, xor the initial crc value
-	movdqu	xmm0, [arg2]
-	movdqu	xmm1, [arg2+16]
-	movdqu	xmm2, [arg2+32]
-	movdqu	xmm3, [arg2+48]
-
-	pshufb	xmm0, xmm7
-	; XOR the initial_crc value
-	pxor	xmm0, xmm6
-	pshufb	xmm1, xmm7
-	pshufb	xmm2, xmm7
-	pshufb	xmm3, xmm7
-
-	movdqa	xmm6, [rk3]	;xmm6 has rk3 and rk4
-				;imm value of pclmulqdq instruction
-				;will determine which constant to use
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	; we subtract 128 instead of 64 to save one instruction from the loop
-	sub	arg3, 128
-
-	; at this section of the code, there is 64*x+y (0<=y<64) bytes of
-	; buffer. The _fold_64_B_loop
-	; loop will fold 64B at a time until we have 64+y Bytes of buffer
-
-%if fetch_dist != 0
-	; check if there is at least 4KB (fetch distance) + 64B in the buffer
-	cmp	arg3, (fetch_dist + 64)
-	jb	_fold_64_B_loop
-
-	; fold 64B at a time. This section of the code folds 4 xmm
-	; registers in parallel
-align 16
-_fold_and_prefetch_64_B_loop:
-
-	; update the buffer pointer
-	add	arg2, 64		; buf += 64;
-
-	PREFETCH [arg2+fetch_dist+0]
-	movdqu	xmm4, xmm0
-	movdqu	xmm5, xmm1
-
-	pclmulqdq	xmm0, xmm6, 0x11
-	pclmulqdq	xmm1, xmm6, 0x11
-
-	pclmulqdq	xmm4, xmm6, 0x0
-	pclmulqdq	xmm5, xmm6, 0x0
-
-	pxor	xmm0, xmm4
-	pxor	xmm1, xmm5
-
-	movdqu	xmm4, xmm2
-	movdqu	xmm5, xmm3
-
-	pclmulqdq	xmm2, xmm6, 0x11
-	pclmulqdq	xmm3, xmm6, 0x11
-
-	pclmulqdq	xmm4, xmm6, 0x0
-	pclmulqdq	xmm5, xmm6, 0x0
-
-	pxor	xmm2, xmm4
-	pxor	xmm3, xmm5
-
-	movdqu	xmm4, [arg2]
-	movdqu	xmm5, [arg2+16]
-	pshufb	xmm4, xmm7
-	pshufb	xmm5, xmm7
-	pxor	xmm0, xmm4
-	pxor	xmm1, xmm5
-
-	movdqu	xmm4, [arg2+32]
-	movdqu	xmm5, [arg2+48]
-	pshufb	xmm4, xmm7
-	pshufb	xmm5, xmm7
-
-	pxor	xmm2, xmm4
-	pxor	xmm3, xmm5
-
-	sub	arg3, 64
-
-	; check if there is another 64B in the buffer to be able to fold
-	cmp	arg3, (fetch_dist + 64)
-	jge	_fold_and_prefetch_64_B_loop
-%endif ; fetch_dist != 0
-
-	; fold 64B at a time. This section of the code folds 4 xmm
-	; registers in parallel
-
-align 16
-_fold_64_B_loop:
-
-	; update the buffer pointer
-	add	arg2, 64		; buf += 64;
-
-	movdqu	xmm4, xmm0
-	movdqu	xmm5, xmm1
-
-	pclmulqdq	xmm0, xmm6, 0x11
-	pclmulqdq	xmm1, xmm6, 0x11
-
-	pclmulqdq	xmm4, xmm6, 0x0
-	pclmulqdq	xmm5, xmm6, 0x0
-
-	pxor	xmm0, xmm4
-	pxor	xmm1, xmm5
-
-	movdqu	xmm4, xmm2
-	movdqu	xmm5, xmm3
-
-	pclmulqdq	xmm2, xmm6, 0x11
-	pclmulqdq	xmm3, xmm6, 0x11
-
-	pclmulqdq	xmm4, xmm6, 0x0
-	pclmulqdq	xmm5, xmm6, 0x0
-
-	pxor	xmm2, xmm4
-	pxor	xmm3, xmm5
-
-	movdqu	xmm4, [arg2]
-	movdqu	xmm5, [arg2+16]
-	pshufb	xmm4, xmm7
-	pshufb	xmm5, xmm7
-	pxor	xmm0, xmm4
-	pxor	xmm1, xmm5
-
-	movdqu	xmm4, [arg2+32]
-	movdqu	xmm5, [arg2+48]
-	pshufb	xmm4, xmm7
-	pshufb	xmm5, xmm7
-
-	pxor	xmm2, xmm4
-	pxor	xmm3, xmm5
-
-	sub	arg3, 64
-
-	; check if there is another 64B in the buffer to be able to fold
-	jge	_fold_64_B_loop
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-
-	add	arg2, 64
-	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
-	; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
-
-
-	; fold the 4 xmm registers to 1 xmm register with different constants
-
-	movdqa	xmm6, [rk1]	;xmm6 has rk1 and rk2
-				;imm value of pclmulqdq instruction will
-				;determine which constant to use
-
-	movdqa	xmm4, xmm0
-	pclmulqdq	xmm0, xmm6, 0x11
-	pclmulqdq	xmm4, xmm6, 0x0
-	pxor	xmm1, xmm4
-	pxor	xmm1, xmm0
-
-	movdqa	xmm4, xmm1
-	pclmulqdq	xmm1, xmm6, 0x11
-	pclmulqdq	xmm4, xmm6, 0x0
-	pxor	xmm2, xmm4
-	pxor	xmm2, xmm1
-
-	movdqa	xmm4, xmm2
-	pclmulqdq	xmm2, xmm6, 0x11
-	pclmulqdq	xmm4, xmm6, 0x0
-	pxor	xmm3, xmm4
-	pxor	xmm3, xmm2
-
-
-	; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
-	; instead of a cmp instruction, we use the negative flag with the jl instruction
-	add	arg3, 64-16
-	jl	_final_reduction_for_128
-
-	; now we have 16+y bytes left to reduce. 16 Bytes
-	; is in register xmm3 and the rest is in memory
-	; we can fold 16 bytes at a time if y>=16
-	; continue folding 16B at a time
-
-_16B_reduction_loop:
-	movdqa	xmm4, xmm3
-	pclmulqdq	xmm3, xmm6, 0x11
-	pclmulqdq	xmm4, xmm6, 0x0
-	pxor	xmm3, xmm4
-	movdqu	xmm0, [arg2]
-	pshufb	xmm0, xmm7
-	pxor	xmm3, xmm0
-	add	arg2, 16
-	sub	arg3, 16
-	; instead of a cmp instruction, we utilize the flags with the jge instruction
-	; equivalent of: cmp arg3, 16-16
-	; check if there is any more 16B in the buffer to be able to fold
-	jge	_16B_reduction_loop
-
-	;now we have 16+z bytes left to reduce, where 0<= z < 16.
-	;first, we reduce the data in the xmm3 register
-
-
-_final_reduction_for_128:
-	; check if any more data to fold. If not, compute the CRC of the final 128 bits
-	add	arg3, 16
-	je	_128_done
-
-	; here we are getting data that is less than 16 bytes.
-	; since we know that there was data before the pointer,
-	; we can offset the input pointer before the actual point,
-	; to receive exactly 16 bytes.
-	; after that the registers need to be adjusted.
-_get_last_two_xmms:
-	movdqa	xmm2, xmm3
-
-	movdqu	xmm1, [arg2 - 16 + arg3]
-	pshufb	xmm1, xmm7
-
-	; get rid of the extra data that was loaded before
-	; load the shift constant
-	lea	rax, [pshufb_shf_table + 16]
-	sub	rax, arg3
-	movdqu	xmm0, [rax]
-
-	; shift xmm2 to the left by arg3 bytes
-	pshufb	xmm2, xmm0
-
-	; shift xmm3 to the right by 16-arg3 bytes
-	pxor	xmm0, [mask1]
-	pshufb	xmm3, xmm0
-	pblendvb	xmm1, xmm2	;xmm0 is implicit
-
-	; fold 16 Bytes
-	movdqa	xmm2, xmm1
-	movdqa	xmm4, xmm3
-	pclmulqdq	xmm3, xmm6, 0x11
-	pclmulqdq	xmm4, xmm6, 0x0
-	pxor	xmm3, xmm4
-	pxor	xmm3, xmm2
-
-_128_done:
-	; compute crc of a 128-bit value
-	movdqa	xmm6, [rk5]	; rk5 and rk6 in xmm6
-	movdqa	xmm0, xmm3
-
-	;64b fold
-	pclmulqdq	xmm3, xmm6, 0x1
-	pslldq	xmm0, 8
-	pxor	xmm3, xmm0
-
-	;32b fold
-	movdqa	xmm0, xmm3
-
-	pand	xmm0, [mask2]
-
-	psrldq	xmm3, 12
-	pclmulqdq	xmm3, xmm6, 0x10
-	pxor	xmm3, xmm0
-
-	;barrett reduction
-_barrett:
-	movdqa	xmm6, [rk7]	; rk7 and rk8 in xmm6
-	movdqa	xmm0, xmm3
-	pclmulqdq	xmm3, xmm6, 0x01
-	pslldq	xmm3, 4
-	pclmulqdq	xmm3, xmm6, 0x11
-
-	pslldq	xmm3, 4
-	pxor	xmm3, xmm0
-	pextrd	eax, xmm3, 1
-
-_cleanup:
-	; scale the result back to 16 bits
-	shr	eax, 16
-	movdqa	xmm6, [rsp+16*2]
-	movdqa	xmm7, [rsp+16*3]
-	add	rsp, 16*4+8
-	ret
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-_less_than_128:
-
-	; check if there is enough buffer to be able to fold 16B at a time
-	cmp	arg3, 32
-	jl	_less_than_32
-	movdqa	xmm7, [SHUF_MASK]
-
-	; if there is, load the constants
-	movdqa	xmm6, [rk1]	; rk1 and rk2 in xmm6
-
-	movd	xmm0, arg1_low32	; get the initial crc value
-	pslldq	xmm0, 12		; align it to its correct place
-	movdqu	xmm3, [arg2]		; load the plaintext
-	pshufb	xmm3, xmm7		; byte-reflect the plaintext
-	pxor	xmm3, xmm0
-
-
-	; update the buffer pointer
-	add	arg2, 16
-
-	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
-	sub	arg3, 32
-
-	jmp	_16B_reduction_loop
-
-
-align 16
-_less_than_32:
-	; mov initial crc to the return value. this is necessary for zero-length buffers.
-	mov	eax, arg1_low32
-	test	arg3, arg3
-	je	_cleanup
-
-	movdqa	xmm7, [SHUF_MASK]
-
-	movd	xmm0, arg1_low32	; get the initial crc value
-	pslldq	xmm0, 12		; align it to its correct place
-
-	cmp	arg3, 16
-	je	_exact_16_left
-	jl	_less_than_16_left
-
-	movdqu	xmm3, [arg2]		; load the plaintext
-	pshufb	xmm3, xmm7		; byte-reflect the plaintext
-	pxor	xmm3, xmm0		; xor the initial crc value
-	add	arg2, 16
-	sub	arg3, 16
-	movdqa	xmm6, [rk1]		; rk1 and rk2 in xmm6
-	jmp	_get_last_two_xmms
-
-
-align 16
-_less_than_16_left:
-	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
-
-	pxor	xmm1, xmm1
-	mov	r11, rsp
-	movdqa	[r11], xmm1
-
-	cmp	arg3, 4
-	jl	_only_less_than_4
-
-	; backup the counter value
-	mov	r9, arg3
-	cmp	arg3, 8
-	jl	_less_than_8_left
-
-	; load 8 Bytes
-	mov	rax, [arg2]
-	mov	[r11], rax
-	add	r11, 8
-	sub	arg3, 8
-	add	arg2, 8
-_less_than_8_left:
-
-	cmp	arg3, 4
-	jl	_less_than_4_left
-
-	; load 4 Bytes
-	mov	eax, [arg2]
-	mov	[r11], eax
-	add	r11, 4
-	sub	arg3, 4
-	add	arg2, 4
-_less_than_4_left:
-
-	cmp	arg3, 2
-	jl	_less_than_2_left
-
-	; load 2 Bytes
-	mov	ax, [arg2]
-	mov	[r11], ax
-	add	r11, 2
-	sub	arg3, 2
-	add	arg2, 2
-_less_than_2_left:
-	cmp	arg3, 1
-	jl	_zero_left
-
-	; load 1 Byte
-	mov	al, [arg2]
-	mov	[r11], al
-_zero_left:
-	movdqa	xmm3, [rsp]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0	; xor the initial crc value
-
-	; shl r9, 4
-	lea	rax, [pshufb_shf_table + 16]
-	sub	rax, r9
-	movdqu	xmm0, [rax]
-	pxor	xmm0, [mask1]
-
-	pshufb	xmm3, xmm0
-	jmp	_128_done
-
-align 16
-_exact_16_left:
-	movdqu	xmm3, [arg2]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0	; xor the initial crc value
-
-	jmp	_128_done
-
-_only_less_than_4:
-	cmp	arg3, 3
-	jl	_only_less_than_3
-
-	; load 3 Bytes
-	mov	al, [arg2]
-	mov	[r11], al
-
-	mov	al, [arg2+1]
-	mov	[r11+1], al
-
-	mov	al, [arg2+2]
-	mov	[r11+2], al
-
-	movdqa	xmm3, [rsp]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0	; xor the initial crc value
-
-	psrldq	xmm3, 5
-
-	jmp	_barrett
-_only_less_than_3:
-	cmp	arg3, 2
-	jl	_only_less_than_2
-
-	; load 2 Bytes
-	mov	al, [arg2]
-	mov	[r11], al
-
-	mov	al, [arg2+1]
-	mov	[r11+1], al
-
-	movdqa	xmm3, [rsp]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0	; xor the initial crc value
-
-	psrldq	xmm3, 6
-
-	jmp	_barrett
-_only_less_than_2:
-
-	; load 1 Byte
-	mov	al, [arg2]
-	mov	[r11], al
-
-	movdqa	xmm3, [rsp]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0	; xor the initial crc value
-
-	psrldq	xmm3, 7
-
-	jmp	_barrett
-
-section .data
-
-; precomputed constants
-; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
-align 16
-; Q = 0x18BB70000
-; rk1 = 2^(32*3) mod Q << 32
-; rk2 = 2^(32*5) mod Q << 32
-; rk3 = 2^(32*15) mod Q << 32
-; rk4 = 2^(32*17) mod Q << 32
-; rk5 = 2^(32*3) mod Q << 32
-; rk6 = 2^(32*2) mod Q << 32
-; rk7 = floor(2^64/Q)
-; rk8 = Q
-rk1:
-DQ 0x2d56000000000000
-rk2:
-DQ 0x06df000000000000
-rk3:
-DQ 0x044c000000000000
-rk4:
-DQ 0xe658000000000000
-rk5:
-DQ 0x2d56000000000000
-rk6:
-DQ 0x1368000000000000
-rk7:
-DQ 0x00000001f65a57f8
-rk8:
-DQ 0x000000018bb70000
-mask1:
-dq 0x8080808080808080, 0x8080808080808080
-mask2:
-dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
-
-SHUF_MASK:
-dq 0x08090A0B0C0D0E0F, 0x0001020304050607
-
-pshufb_shf_table:
-; use these values for shift constants for the pshufb instruction
-; different alignments result in values as shown:
-;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
-;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
-;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
-;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
-;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
-;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
-;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
-;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
-;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
-;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
-;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
-;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
-;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
-;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
-;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
-dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
-dq 0x0706050403020100, 0x000e0d0c0b0a0908
diff --git a/crc/crc32_ieee_by4.asm b/crc/crc32_ieee_by4.asm
deleted file mode 100644
index faf30e7..0000000
--- a/crc/crc32_ieee_by4.asm
+++ /dev/null
@@ -1,626 +0,0 @@
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
-;
-;  Redistribution and use in source and binary forms, with or without
-;  modification, are permitted provided that the following conditions
-;  are met:
-;    * Redistributions of source code must retain the above copyright
-;      notice, this list of conditions and the following disclaimer.
-;    * Redistributions in binary form must reproduce the above copyright
-;      notice, this list of conditions and the following disclaimer in
-;      the documentation and/or other materials provided with the
-;      distribution.
-;    * Neither the name of Intel Corporation nor the names of its
-;      contributors may be used to endorse or promote products derived
-;      from this software without specific prior written permission.
-;
-;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-;       Function API:
-;       UINT32 crc32_ieee_by4(
-;               UINT32 init_crc, //initial CRC value, 32 bits
-;               const unsigned char *buf, //buffer pointer to calculate CRC on
-;               UINT64 len //buffer length in bytes (64-bit data)
-;       );
-;
-;       Authors:
-;               Erdinc Ozturk
-;               Vinodh Gopal
-;               James Guilford
-;
-;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
-;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
-;
-
-%include "reg_sizes.asm"
-
-%ifndef fetch_dist
-%define fetch_dist 4096
-%endif
-
-%ifndef PREFETCH
-%define PREFETCH prefetcht1
-%endif
-
-[bits 64]
-default rel
-
-section .text
-
-%ifidn __OUTPUT_FORMAT__, win64
-	%xdefine	arg1 rcx
-	%xdefine	arg2 rdx
-	%xdefine	arg3 r8
-
-	%xdefine	arg1_low32 ecx
-%else
-	%xdefine	arg1 rdi
-	%xdefine	arg2 rsi
-	%xdefine	arg3 rdx
-
-	%xdefine	arg1_low32 edi
-%endif
-
-%ifidn __OUTPUT_FORMAT__, win64
-	%define XMM_SAVE 16*2
-	%define VARIABLE_OFFSET 16*4+8
-%else
-	%define VARIABLE_OFFSET 16*2+8
-%endif
-
-align 16
-mk_global crc32_ieee_by4, function
-crc32_ieee_by4:
-	endbranch
-
-	not	arg1_low32
-
-	sub	rsp, VARIABLE_OFFSET
-
-%ifidn __OUTPUT_FORMAT__, win64
-	; push the xmm registers into the stack to maintain
-	movdqa	[rsp + XMM_SAVE + 16*0], xmm6
-	movdqa	[rsp + XMM_SAVE + 16*1], xmm7
-%endif
-
-	; check if smaller than 128B
-	cmp	arg3, 128
-	jl	_less_than_128
-
-
-	; load the initial crc value
-	movd	xmm6, arg1_low32	; initial crc
-	; crc value does not need to be byte-reflected, but it needs to be
-	; moved to the high part of the register.
-	; because data will be byte-reflected and will align with initial
-	; crc at correct place.
-	pslldq	xmm6, 12
-
-
-
-	movdqa	xmm7, [SHUF_MASK]
-	; receive the initial 64B data, xor the initial crc value
-	movdqu	xmm0, [arg2]
-	movdqu	xmm1, [arg2+16]
-	movdqu	xmm2, [arg2+32]
-	movdqu	xmm3, [arg2+48]
-
-
-
-	pshufb	xmm0, xmm7
-	; XOR the initial_crc value
-	pxor	xmm0, xmm6
-	pshufb	xmm1, xmm7
-	pshufb	xmm2, xmm7
-	pshufb	xmm3, xmm7
-
-	movdqa	xmm6, [rk3]	; k3=2^480 mod POLY << 32
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	;we subtract 128 instead of 64 to save one instruction from the loop
-	sub	arg3, 128
-
-	; at this section of the code, there is 64*x+y (0<=y<64) bytes of
-	; buffer. The _fold_64_B_loop loop will fold 64B at a time until we
-	; have 64+y Bytes of buffer
-
-%if fetch_dist != 0
-	; check if there is at least 4KB (fetch distance) + 64B in the buffer
-	cmp	arg3, (fetch_dist + 64)
-	jb	_fold_64_B_loop
-
-	; fold 64B at a time. This section of the code folds 4 xmm registers in parallel
-align 16
-_fold_and_prefetch_64_B_loop:
-	;update the buffer pointer
-	add	arg2, 64
-
-	PREFETCH [arg2+fetch_dist+0]
-	movdqa	xmm4, xmm0
-	movdqa	xmm5, xmm1
-
-	pclmulqdq	xmm0, xmm6, 0x11
-	pclmulqdq	xmm1, xmm6, 0x11
-
-	pclmulqdq	xmm4, xmm6, 0x0
-	pclmulqdq	xmm5, xmm6, 0x0
-
-	pxor	xmm0, xmm4
-	pxor	xmm1, xmm5
-
-	movdqa	xmm4, xmm2
-	movdqa	xmm5, xmm3
-
-	pclmulqdq	xmm2, xmm6, 0x11
-	pclmulqdq	xmm3, xmm6, 0x11
-
-	pclmulqdq	xmm4, xmm6, 0x0
-	pclmulqdq	xmm5, xmm6, 0x0
-
-	pxor	xmm2, xmm4
-	pxor	xmm3, xmm5
-
-	movdqu	xmm4, [arg2]
-	movdqu	xmm5, [arg2+16]
-	pshufb	xmm4, xmm7
-	pshufb	xmm5, xmm7
-	pxor	xmm0, xmm4
-	pxor	xmm1, xmm5
-
-	movdqu	xmm4, [arg2+32]
-	movdqu	xmm5, [arg2+48]
-	pshufb	xmm4, xmm7
-	pshufb	xmm5, xmm7
-
-	pxor	xmm2, xmm4
-	pxor	xmm3, xmm5
-
-	sub	arg3, 64
-
-	; check if there is another 4KB (fetch distance) + 64B in the buffer
-	cmp	arg3, (fetch_dist + 64)
-	jge	_fold_and_prefetch_64_B_loop
-
-%endif ; fetch_dist != 0
-
-	; fold 64B at a time. This section of the code folds 4 xmm registers in parallel
-align 16
-_fold_64_B_loop:
-
-	;update the buffer pointer
-	add	arg2, 64
-
-	movdqa	xmm4, xmm0
-	movdqa	xmm5, xmm1
-
-	pclmulqdq	xmm0, xmm6, 0x11
-	pclmulqdq	xmm1, xmm6, 0x11
-
-	pclmulqdq	xmm4, xmm6, 0x0
-	pclmulqdq	xmm5, xmm6, 0x0
-
-	pxor	xmm0, xmm4
-	pxor	xmm1, xmm5
-
-	movdqa	xmm4, xmm2
-	movdqa	xmm5, xmm3
-
-	pclmulqdq	xmm2, xmm6, 0x11
-	pclmulqdq	xmm3, xmm6, 0x11
-
-	pclmulqdq	xmm4, xmm6, 0x0
-	pclmulqdq	xmm5, xmm6, 0x0
-
-	pxor	xmm2, xmm4
-	pxor	xmm3, xmm5
-
-	movdqu	xmm4, [arg2]
-	movdqu	xmm5, [arg2+16]
-	pshufb	xmm4, xmm7
-	pshufb	xmm5, xmm7
-	pxor	xmm0, xmm4
-	pxor	xmm1, xmm5
-
-	movdqu	xmm4, [arg2+32]
-	movdqu	xmm5, [arg2+48]
-	pshufb	xmm4, xmm7
-	pshufb	xmm5, xmm7
-
-	pxor	xmm2, xmm4
-	pxor	xmm3, xmm5
-
-	sub	arg3, 64
-
-	; check if there is another 64B in the buffer to be able to fold
-	jge	_fold_64_B_loop
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-
-	add	arg2, 64
-	;at this point, the arg2 is pointing at the last y Bytes of the buffer
-	; the 64B of data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
-
-
-	movdqa	xmm6, [rk1]	;k1
-
-	; fold the 4 xmm registers to 1 xmm register with different constants
-	movdqa	xmm4, xmm0
-	pclmulqdq	xmm0, xmm6, 0x11
-	pclmulqdq	xmm4, xmm6, 0x0
-	pxor	xmm1, xmm4
-	xorps	xmm1, xmm0
-
-	movdqa	xmm4, xmm1
-	pclmulqdq	xmm1, xmm6, 0x11
-	pclmulqdq	xmm4, xmm6, 0x0
-	pxor	xmm2, xmm4
-	xorps	xmm2, xmm1
-
-	movdqa	xmm4, xmm2
-	pclmulqdq	xmm2, xmm6, 0x11
-	pclmulqdq	xmm4, xmm6, 0x0
-	pxor	xmm3, xmm4
-	pxor	xmm3, xmm2
-
-
-	;instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
-	; instead of a cmp instruction, we use the negative flag with the jl instruction
-	add	arg3, 64-16
-	jl	_final_reduction_for_128
-
-; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm3 and the rest is in memory
-; we can fold 16 bytes at a time if y>=16
-; continue folding 16B at a time
-
-_16B_reduction_loop:
-	movdqa	xmm4, xmm3
-	pclmulqdq	xmm3, xmm6, 0x11
-	pclmulqdq	xmm4, xmm6, 0x0
-	pxor	xmm3, xmm4
-	movdqu	xmm0, [arg2]
-	pshufb	xmm0, xmm7
-	pxor	xmm3, xmm0
-	add	arg2, 16
-	sub	arg3, 16
-	; instead of a cmp instruction, we utilize the flags with the jge instruction
-	; equivalent of: cmp arg3, 16-16
-	; check if there is any more 16B in the buffer to be able to fold
-	jge	_16B_reduction_loop
-
-	;now we have 16+z bytes left to reduce, where 0<= z < 16.
-	;first, we reduce the data in the xmm3 register
-
-
-
-_final_reduction_for_128:
-	; check if any more data to fold. If not, compute the CRC of the final 128 bits
-	add	arg3, 16
-	je	_128_done
-
-	; here we are getting data that is less than 16 bytes.
-	; since we know that there was data before the pointer, we can offset
-	; the input pointer before the actual point, to receive exactly 16 bytes.
-	; after that the registers need to be adjusted.
-_get_last_two_xmms:
-	movdqa	xmm2, xmm3
-
-	movdqu	xmm1, [arg2 - 16 + arg3]
-	pshufb	xmm1, xmm7
-
-	shl	arg3, 4
-	lea	rax, [pshufb_shf_table + 15*16]
-	sub	rax, arg3
-	movdqu	xmm0, [rax]
-
-	pshufb	xmm2, xmm0
-
-	pxor	xmm0, [mask3]
-
-	pshufb	xmm3, xmm0
-
-	pblendvb	xmm1, xmm2	;xmm0 is implicit
-
-	movdqa	xmm2, xmm1
-
-	movdqa	xmm4, xmm3
-	pclmulqdq	xmm3, xmm6, 0x11
-
-	pclmulqdq	xmm4, xmm6, 0x0
-	pxor	xmm3, xmm4
-	pxor	xmm3, xmm2
-
-_128_done:
-
-	movdqa	xmm6, [rk5]
-	movdqa	xmm0, xmm3
-
-	;64b fold
-	pclmulqdq	xmm3, xmm6, 0x1
-	pslldq	xmm0, 8
-	pxor	xmm3, xmm0
-
-	;32b fold
-	movdqa	xmm0, xmm3
-
-	pand	xmm0, [mask4]
-
-	psrldq	xmm3, 12
-	pclmulqdq	xmm3, xmm6, 0x10
-	pxor	xmm3, xmm0
-
-	;barrett reduction
-_barrett:
-	movdqa	xmm6, [rk7]
-	movdqa	xmm0, xmm3
-	pclmulqdq	xmm3, xmm6, 0x01
-	pslldq	xmm3, 4
-	pclmulqdq	xmm3, xmm6, 0x11
-
-	pslldq	xmm3, 4
-	pxor	xmm3, xmm0
-	pextrd	eax, xmm3, 1
-
-_cleanup:
-	not	eax
-%ifidn __OUTPUT_FORMAT__, win64
-	movdqa	xmm6, [rsp + XMM_SAVE + 16*0]
-	movdqa	xmm7, [rsp + XMM_SAVE + 16*1]
-%endif
-	add	rsp, VARIABLE_OFFSET
-
-
-	ret
-
-
-
-
-
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-align 16
-_less_than_128:
-
-	;check if there is enough buffer to be able to fold 16B at a time
-	cmp	arg3, 32
-	jl	_less_than_32
-	movdqa	xmm7, [SHUF_MASK]
-
-	;if there is, load the constants
-	movdqa	xmm6, [rk1]	;k1
-
-	movd	xmm0, arg1_low32
-	pslldq	xmm0, 12
-	movdqu	xmm3, [arg2]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0
-
-
-	;update the buffer pointer
-	add	arg2, 16
-
-	;update the counter. subtract 32 instead of 16 to save one instruction from the loop
-	sub	arg3, 32
-
-	jmp	_16B_reduction_loop
-
-
-align 16
-_less_than_32:
-	mov	eax, arg1_low32
-	test	arg3, arg3
-	je	_cleanup
-
-	movdqa	xmm7, [SHUF_MASK]
-
-	movd	xmm0, arg1_low32
-	pslldq	xmm0, 12
-
-	cmp	arg3, 16
-	je	_exact_16_left
-	jl	_less_than_16_left
-	movd	xmm0, arg1_low32
-	pslldq	xmm0, 12
-	movdqu	xmm3, [arg2]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0
-	add	arg2, 16
-	sub	arg3, 16
-	movdqa	xmm6, [rk1]	;k1
-	jmp	_get_last_two_xmms
-
-
-align 16
-_less_than_16_left:
-	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
-
-	pxor	xmm1, xmm1
-	mov	r11, rsp
-	movdqa	[r11], xmm1
-
-
-
-	cmp	arg3, 4
-	jl	_only_less_than_4
-
-	mov	r9, arg3
-
-
-	cmp	arg3, 8
-	jl	_less_than_8_left
-	mov	rax, [arg2]
-	mov	[r11], rax
-	add	r11, 8
-	sub	arg3, 8
-	add	arg2, 8
-_less_than_8_left:
-
-	cmp	arg3, 4
-	jl	_less_than_4_left
-	mov	eax, [arg2]
-	mov	[r11], eax
-	add	r11, 4
-	sub	arg3, 4
-	add	arg2, 4
-_less_than_4_left:
-
-	cmp	arg3, 2
-	jl	_less_than_2_left
-	mov	ax, [arg2]
-	mov	[r11], ax
-	add	r11, 2
-	sub	arg3, 2
-	add	arg2, 2
-_less_than_2_left:
-	cmp	arg3, 1
-	jl	_zero_left
-
-	mov	al, [arg2]
-	mov	[r11], al
-
-_zero_left:
-	movdqa	xmm3, [rsp]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0
-
-	shl	r9, 4
-	lea	rax, [pshufb_shf_table + 15*16]
-	sub	rax, r9
-	movdqu	xmm0, [rax]
-	pxor	xmm0, [mask3]
-
-	pshufb	xmm3, xmm0
-	jmp	_128_done
-
-align 16
-_exact_16_left:
-	movdqu	xmm3, [arg2]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0
-
-	jmp	_128_done
-
-_only_less_than_4:
-	cmp	arg3, 3
-	jl	_only_less_than_3
-	mov	al, [arg2]
-	mov	[r11], al
-
-	mov	al, [arg2+1]
-	mov	[r11+1], al
-
-	mov	al, [arg2+2]
-	mov	[r11+2], al
-
-	movdqa	xmm3, [rsp]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0
-
-	psrldq	xmm3, 5
-
-	jmp	_barrett
-_only_less_than_3:
-	cmp	arg3, 2
-	jl	_only_less_than_2
-	mov	al, [arg2]
-	mov	[r11], al
-
-	mov	al, [arg2+1]
-	mov	[r11+1], al
-
-	movdqa	xmm3, [rsp]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0
-
-	psrldq	xmm3, 6
-
-	jmp	_barrett
-_only_less_than_2:
-	mov	al, [arg2]
-	mov	[r11], al
-
-	movdqa	xmm3, [rsp]
-	pshufb	xmm3, xmm7
-	pxor	xmm3, xmm0
-
-	psrldq	xmm3, 7
-
-	jmp	_barrett
-
-; precomputed constants
-section .data
-
-align 16
-rk1:
-DQ 0xf200aa6600000000
-rk2:
-DQ 0x17d3315d00000000
-rk3:
-DQ 0xd3504ec700000000
-rk4:
-DQ 0x57a8445500000000
-rk5:
-DQ 0xf200aa6600000000
-rk6:
-DQ 0x490d678d00000000
-rk7:
-DQ 0x0000000104d101df
-rk8:
-DQ 0x0000000104c11db7
-mask:
-dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
-mask2:
-dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
-mask3:
-dq 0x8080808080808080, 0x8080808080808080
-mask4:
-dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
-align 32
-pshufb_shf_table:
-
-	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
-
-	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
-
-	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
-
-	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
-
-	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
-
-	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
-
-	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
-
-	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
-
-	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
-
-	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
-
-	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
-
-	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
-
-	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
-
-	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
-
-	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
-
-
-SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607
diff --git a/crc/crc_multibinary.asm b/crc/crc_multibinary.asm
index 056489f..2347257 100644
--- a/crc/crc_multibinary.asm
+++ b/crc/crc_multibinary.asm
@@ -37,12 +37,10 @@ extern crc32_iscsi_by8_02
 extern crc32_iscsi_base
 
 extern crc32_ieee_01
-extern crc32_ieee_by4	;; Optimized for SLM
 extern crc32_ieee_02
 extern crc32_ieee_base
 
 extern crc16_t10dif_01
-extern crc16_t10dif_by4	;; Optimized for SLM
 extern crc16_t10dif_02
 extern crc16_t10dif_base
 
@@ -101,12 +99,6 @@ crc32_ieee_dispatch_init:
 	jz	.crc_ieee_init_done		; use ieee_base
 	lea	rsi, [crc32_ieee_01 WRT_OPT]
 
-	;; Extra Avoton test
-	lea	rdx, [crc32_ieee_by4 WRT_OPT]
-	and	eax, FLAG_CPUID1_EAX_STEP_MASK
-	cmp	eax, FLAG_CPUID1_EAX_AVOTON
-	cmove	rsi, rdx
-
 	;; Test for XMM_YMM support/AVX
 	test	ecx, FLAG_CPUID1_ECX_OSXSAVE
 	je	.crc_ieee_init_done
@@ -180,12 +172,6 @@ crc16_t10dif_dispatch_init:
 	jz	.t10dif_init_done		; use t10dif_base
 	lea	rsi, [crc16_t10dif_01 WRT_OPT]
 
-	;; Extra Avoton test
-	lea	rdx, [crc16_t10dif_by4 WRT_OPT]
-	and	eax, FLAG_CPUID1_EAX_STEP_MASK
-	cmp	eax, FLAG_CPUID1_EAX_AVOTON
-	cmove	rsi, rdx
-
 	;; Test for XMM_YMM support/AVX
 	test	ecx, FLAG_CPUID1_ECX_OSXSAVE
 	je	.t10dif_init_done
diff --git a/include/reg_sizes.asm b/include/reg_sizes.asm
index 0136f13..5b8613b 100644
--- a/include/reg_sizes.asm
+++ b/include/reg_sizes.asm
@@ -70,7 +70,6 @@
 %define FLAG_XGETBV_EAX_XMM_YMM		0x6
 %define FLAG_XGETBV_EAX_ZMM_OPM		0xe0
 
-%define FLAG_CPUID1_EAX_AVOTON		0x000406d0
 %define FLAG_CPUID1_EAX_STEP_MASK	0xfffffff0
 
 ; define d and w variants for registers
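
Note for reviewers (not part of the patch): nothing here changes isa-l's public API. The removed by4 routines were reachable only through the multibinary dispatchers above, so existing callers of the crc16_t10dif()/crc32_ieee() entry points are simply routed to the 01/base implementations on Avoton from now on. The sketch below illustrates that call path; it assumes the prototypes from isa-l's crc.h (uint16_t crc16_t10dif(uint16_t init_crc, const unsigned char *buf, uint64_t len) and uint32_t crc32_ieee(uint32_t seed, const unsigned char *buf, uint64_t len)) and linking with -lisal. The include path shown is an assumption; some installs use "crc.h" directly.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <isa-l/crc.h>  /* assumed install location of the isa-l headers */

int main(void)
{
	unsigned char buf[1024];

	/* arbitrary test pattern */
	memset(buf, 0x5a, sizeof(buf));

	/* T10-DIF CRC (poly 0x8bb7): 16-bit seed, 0 to start */
	uint16_t t10 = crc16_t10dif(0, buf, sizeof(buf));

	/* IEEE/802.3 CRC32: seed is the running CRC value, 0 to start */
	uint32_t ieee = crc32_ieee(0, buf, sizeof(buf));

	printf("crc16_t10dif = 0x%04x\n", t10);
	printf("crc32_ieee   = 0x%08x\n", ieee);
	return 0;
}

Since every implementation behind a dispatcher computes the same CRC, the printed values should be identical before and after this patch on any CPU, including Avoton; only the selected code path changes.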