Add crc unit

New crc unit adds three different polynomials: T10dif, ieee and iscsi.

Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>
This commit is contained in:
Greg Tucker 2016-05-03 10:00:35 -07:00
parent d6c5e9620d
commit 61164e105b
22 changed files with 5036 additions and 7 deletions

View File

@ -26,6 +26,7 @@ perf_tests32=
include erasure_code/Makefile.am
include raid/Makefile.am
include crc/Makefile.am
# LIB version info not necessarily the same as package version
LIBISAL_CURRENT=2

View File

@ -28,9 +28,10 @@
########################################################################
objs = bin\ec_base.obj bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_avx512.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_avx512.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_avx512.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_avx512.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_avx512.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_avx512.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_avx512.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_avx512.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj bin\pq_check_sse.obj bin\pq_gen_avx.obj bin\pq_gen_avx2.obj bin\pq_gen_sse.obj bin\raid_base.obj bin\raid_multibinary.obj bin\xor_check_sse.obj bin\xor_gen_avx.obj bin\xor_gen_sse.obj
# Object files that make up the library. Every entry uses the bin\ prefix with a
# backslash, including the CRC objects, so the whole list is spelled the same way.
objs = bin\ec_base.obj bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_avx512.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_avx512.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_avx512.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_avx512.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_avx512.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_avx512.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_avx512.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_avx512.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj bin\pq_check_sse.obj bin\pq_gen_avx.obj bin\pq_gen_avx2.obj bin\pq_gen_sse.obj bin\raid_base.obj bin\raid_multibinary.obj bin\xor_check_sse.obj bin\xor_gen_avx.obj bin\xor_gen_sse.obj bin\crc16_t10dif_01.obj bin\crc16_t10dif_by4.obj bin\crc32_ieee_01.obj bin\crc32_ieee_by4.obj bin\crc32_iscsi_01.obj bin\crc32_iscsi_00.obj bin\crc_multibinary.obj bin\crc_base.obj
INCLUDES = -I./ -Ierasure_code/ -Iraid/ -Iinclude/
INCLUDES = -I./ -Ierasure_code/ -Iraid/ -Icrc/ -Iinclude/
LINKFLAGS = /nologo
CFLAGS = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) $(D)
AFLAGS = -f win64 $(INCLUDES) $(D)
@ -59,9 +60,14 @@ isa-l.dll: $(objs)
{raid}.asm.obj:
$(AS) $(AFLAGS) -o $@ $?
{crc}.c.obj:
$(CC) $(CFLAGS) /c -Fo$@ $?
{crc}.asm.obj:
$(AS) $(AFLAGS) -o $@ $?
# Examples
ex = xor_example.exe
ex = xor_example.exe crc_simple_test.exe
ex: lib $(ex)
$(ex): $(@B).obj
@ -71,7 +77,8 @@ $(ex): $(@B).obj
# Check tests
checks = erasure_code_test.exe erasure_code_update_test.exe gf_inverse_test.exe gf_vect_mul_test.exe \
pq_check_test.exe pq_gen_test.exe xor_check_test.exe xor_gen_test.exe
pq_check_test.exe pq_gen_test.exe xor_check_test.exe xor_gen_test.exe \
crc16_t10dif_test.exe crc32_ieee_test.exe crc32_iscsi_test.exe
checks: lib $(checks)
$(checks): $(@B).obj
@ -85,7 +92,7 @@ tests: lib $(tests)
$(tests): $(@B).obj
# Performance tests
perfs = erasure_code_base_perf.exe erasure_code_perf.exe erasure_code_sse_perf.exe erasure_code_update_perf.exe gf_2vect_dot_prod_sse_perf.exe gf_3vect_dot_prod_sse_perf.exe gf_4vect_dot_prod_sse_perf.exe gf_5vect_dot_prod_sse_perf.exe gf_6vect_dot_prod_sse_perf.exe gf_vect_dot_prod_1tbl.exe gf_vect_dot_prod_avx_perf.exe gf_vect_dot_prod_perf.exe gf_vect_dot_prod_sse_perf.exe gf_vect_mad_perf.exe gf_vect_mul_avx_perf.exe gf_vect_mul_perf.exe gf_vect_mul_sse_perf.exe pq_gen_perf.exe xor_gen_perf.exe
perfs = erasure_code_base_perf.exe erasure_code_perf.exe erasure_code_sse_perf.exe erasure_code_update_perf.exe gf_2vect_dot_prod_sse_perf.exe gf_3vect_dot_prod_sse_perf.exe gf_4vect_dot_prod_sse_perf.exe gf_5vect_dot_prod_sse_perf.exe gf_6vect_dot_prod_sse_perf.exe gf_vect_dot_prod_1tbl.exe gf_vect_dot_prod_avx_perf.exe gf_vect_dot_prod_perf.exe gf_vect_dot_prod_sse_perf.exe gf_vect_mad_perf.exe gf_vect_mul_avx_perf.exe gf_vect_mul_perf.exe gf_vect_mul_sse_perf.exe pq_gen_perf.exe xor_gen_perf.exe crc16_t10dif_perf.exe crc32_ieee_perf.exe crc32_iscsi_perf.exe
perfs: lib $(perfs)
$(perfs): $(@B).obj

View File

@ -27,7 +27,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
units = erasure_code raid
units = erasure_code raid crc
default: lib

48
crc/Makefile.am Normal file
View File

@ -0,0 +1,48 @@
########################################################################
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
# CRC unit fragment: appends this unit's files to list variables that are
# defined outside this file.
# NOTE(review): the meaning of each list (library sources, installed headers,
# test programs, ...) is inferred from the variable names -- confirm against the
# top-level Makefile.am that includes this fragment.

# Assembly implementations, crc_multibinary.asm and the C file crc_base.c.
lsrc += \
crc/crc16_t10dif_01.asm \
crc/crc16_t10dif_by4.asm \
crc/crc32_ieee_01.asm \
crc/crc32_ieee_by4.asm \
crc/crc32_iscsi_01.asm \
crc/crc32_iscsi_00.asm \
crc/crc_multibinary.asm \
crc/crc_base.c
# Header for the CRC unit.
extern_hdrs += include/crc.h
# Additional files from include/ that are not listed in lsrc.
other_src += include/reg_sizes.asm include/types.h include/test.h
# One check test and one perf test per polynomial (t10dif, ieee, iscsi).
check_tests += crc/crc16_t10dif_test crc/crc32_ieee_test crc/crc32_iscsi_test
perf_tests += crc/crc16_t10dif_perf crc/crc32_ieee_perf crc/crc32_iscsi_perf
examples += crc/crc_simple_test

659
crc/crc16_t10dif_01.asm Normal file
View File

@ -0,0 +1,659 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Function API:
; UINT16 crc16_t10dif_01(
; UINT16 init_crc, //initial CRC value, 16 bits
; const unsigned char *buf, //buffer pointer to calculate CRC on
; UINT64 len //buffer length in bytes (64-bit data)
; );
;
; Authors:
; Erdinc Ozturk
; Vinodh Gopal
; James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
%include "reg_sizes.asm"
[bits 64]
default rel
section .text
; Argument registers, selected by output format:
;   win64:     arg1 = rcx, arg2 = rdx, arg3 = r8
;   otherwise: arg1 = rdi, arg2 = rsi, arg3 = rdx
; Per the Function API above: arg1 = init_crc, arg2 = buf, arg3 = len.
; arg1_low32 is the 32-bit view of arg1.
%ifidn __OUTPUT_FORMAT__, win64
%xdefine arg1 rcx
%xdefine arg2 rdx
%xdefine arg3 r8
%xdefine arg1_low32 ecx
%else
%xdefine arg1 rdi
%xdefine arg2 rsi
%xdefine arg3 rdx
%xdefine arg1_low32 edi
%endif
; VARIABLE_OFFSET is the number of bytes reserved on the stack.
; The 16 bytes at [rsp] are used as scratch by the short-buffer (< 16 bytes) paths;
; on win64 the slots [rsp+16*2] .. [rsp+16*9] hold the saved xmm6-xmm13.
; NOTE(review): the extra 8 bytes presumably restore 16-byte alignment of rsp after
; the caller's call pushed the return address (the movdqa accesses to the stack
; need an aligned address) -- confirm.
; NOTE(review): XMM_SAVE is not referenced in the code visible here; the save and
; restore sequences use the literal offset 16*2.
%ifidn __OUTPUT_FORMAT__, win64
%define XMM_SAVE 16*2
%define VARIABLE_OFFSET 16*10+8
%else
%define VARIABLE_OFFSET 16*2+8
%endif
align 16
global crc16_t10dif_01:function
crc16_t10dif_01:
; adjust the 16-bit initial_crc value, scale it to 32 bits
shl arg1_low32, 16
; After this point, code flow is exactly same as a 32-bit CRC.
; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
sub rsp, VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
; save xmm6-xmm13 on the stack; they are restored in _cleanup before returning
; (these registers are callee-saved under the Windows x64 calling convention)
movdqa [rsp+16*2],xmm6
movdqa [rsp+16*3],xmm7
movdqa [rsp+16*4],xmm8
movdqa [rsp+16*5],xmm9
movdqa [rsp+16*6],xmm10
movdqa [rsp+16*7],xmm11
movdqa [rsp+16*8],xmm12
movdqa [rsp+16*9],xmm13
%endif
; check if smaller than 256
; NOTE(review): jl is a signed comparison, so a len with bit 63 set would also take
; the short-buffer path -- confirm such lengths are out of scope.
cmp arg3, 256
; for sizes less than 256, we can't fold 128B at a time...
jl _less_than_256
; load the initial crc value
movd xmm10, arg1_low32 ; initial crc
; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
; because data will be byte-reflected and will align with initial crc at correct place.
pslldq xmm10, 12
; xmm11 holds the byte-reversal shuffle mask for the rest of the function
movdqa xmm11, [SHUF_MASK]
; receive the initial 128B data, xor the initial crc value
movdqu xmm0, [arg2+16*0]
movdqu xmm1, [arg2+16*1]
movdqu xmm2, [arg2+16*2]
movdqu xmm3, [arg2+16*3]
movdqu xmm4, [arg2+16*4]
movdqu xmm5, [arg2+16*5]
movdqu xmm6, [arg2+16*6]
movdqu xmm7, [arg2+16*7]
pshufb xmm0, xmm11
; XOR the initial_crc value
pxor xmm0, xmm10
pshufb xmm1, xmm11
pshufb xmm2, xmm11
pshufb xmm3, xmm11
pshufb xmm4, xmm11
pshufb xmm5, xmm11
pshufb xmm6, xmm11
pshufb xmm7, xmm11
movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
;imm value of pclmulqdq instruction will determine which constant to use
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; we subtract 256 instead of 128 to save one instruction from the loop
sub arg3, 256
; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
; loop will fold 128B at a time until we have 128+y Bytes of buffer
; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
; Main loop. xmm0-xmm7 are the eight 16-byte accumulators, xmm10 holds rk3 (low
; qword) and rk4 (high qword), xmm11 holds the byte-reversal mask.
; For each accumulator the loop computes
;   acc = clmul(acc.low, xmm10.low) ^ clmul(acc.high, xmm10.high) ^ next 16 input bytes
; (pclmulqdq imm 0x0 selects the low qwords, imm 0x11 the high qwords).
; xmm8/xmm13 hold copies of the accumulators, xmm9/xmm12 the byte-reflected input.
; pxor and xorps both XOR the full 128-bit register.
; NOTE(review): the pxor/xorps mix is presumably an instruction-scheduling choice -- confirm.
_fold_128_B_loop:
; update the buffer pointer
add arg2, 128 ; buf += 128;
; accumulators xmm0 and xmm1
movdqu xmm9, [arg2+16*0]
movdqu xmm12, [arg2+16*1]
pshufb xmm9, xmm11
pshufb xmm12, xmm11
movdqa xmm8, xmm0
movdqa xmm13, xmm1
pclmulqdq xmm0, xmm10, 0x0
pclmulqdq xmm8, xmm10 , 0x11
pclmulqdq xmm1, xmm10, 0x0
pclmulqdq xmm13, xmm10 , 0x11
pxor xmm0, xmm9
xorps xmm0, xmm8
pxor xmm1, xmm12
xorps xmm1, xmm13
; accumulators xmm2 and xmm3
movdqu xmm9, [arg2+16*2]
movdqu xmm12, [arg2+16*3]
pshufb xmm9, xmm11
pshufb xmm12, xmm11
movdqa xmm8, xmm2
movdqa xmm13, xmm3
pclmulqdq xmm2, xmm10, 0x0
pclmulqdq xmm8, xmm10 , 0x11
pclmulqdq xmm3, xmm10, 0x0
pclmulqdq xmm13, xmm10 , 0x11
pxor xmm2, xmm9
xorps xmm2, xmm8
pxor xmm3, xmm12
xorps xmm3, xmm13
; accumulators xmm4 and xmm5
movdqu xmm9, [arg2+16*4]
movdqu xmm12, [arg2+16*5]
pshufb xmm9, xmm11
pshufb xmm12, xmm11
movdqa xmm8, xmm4
movdqa xmm13, xmm5
pclmulqdq xmm4, xmm10, 0x0
pclmulqdq xmm8, xmm10 , 0x11
pclmulqdq xmm5, xmm10, 0x0
pclmulqdq xmm13, xmm10 , 0x11
pxor xmm4, xmm9
xorps xmm4, xmm8
pxor xmm5, xmm12
xorps xmm5, xmm13
; accumulators xmm6 and xmm7
movdqu xmm9, [arg2+16*6]
movdqu xmm12, [arg2+16*7]
pshufb xmm9, xmm11
pshufb xmm12, xmm11
movdqa xmm8, xmm6
movdqa xmm13, xmm7
pclmulqdq xmm6, xmm10, 0x0
pclmulqdq xmm8, xmm10 , 0x11
pclmulqdq xmm7, xmm10, 0x0
pclmulqdq xmm13, xmm10 , 0x11
pxor xmm6, xmm9
xorps xmm6, xmm8
pxor xmm7, xmm12
xorps xmm7, xmm13
sub arg3, 128
; check if there is another 128B in the buffer to be able to fold
jge _fold_128_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
add arg2, 128
; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
; fold the 8 xmm registers to 1 xmm register with different constants
; xmm0..xmm6 are folded into xmm7 one at a time; each uses its own 16-byte constant
; pair: rk9/rk10, rk11/rk12, rk13/rk14, rk15/rk16, rk17/rk18, rk19/rk20, rk1/rk2.
; fold xmm0 (rk9, rk10)
movdqa xmm10, [rk9]
movdqa xmm8, xmm0
pclmulqdq xmm0, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
xorps xmm7, xmm0
; fold xmm1 (rk11, rk12)
movdqa xmm10, [rk11]
movdqa xmm8, xmm1
pclmulqdq xmm1, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
xorps xmm7, xmm1
; fold xmm2 (rk13, rk14)
movdqa xmm10, [rk13]
movdqa xmm8, xmm2
pclmulqdq xmm2, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
pxor xmm7, xmm2
; fold xmm3 (rk15, rk16)
movdqa xmm10, [rk15]
movdqa xmm8, xmm3
pclmulqdq xmm3, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
xorps xmm7, xmm3
; fold xmm4 (rk17, rk18)
movdqa xmm10, [rk17]
movdqa xmm8, xmm4
pclmulqdq xmm4, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
pxor xmm7, xmm4
; fold xmm5 (rk19, rk20)
movdqa xmm10, [rk19]
movdqa xmm8, xmm5
pclmulqdq xmm5, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
xorps xmm7, xmm5
; fold xmm6 (rk1, rk2); xmm10 keeps rk1/rk2 for the 16-byte loop that follows
movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
;imm value of pclmulqdq instruction will determine which constant to use
movdqa xmm8, xmm6
pclmulqdq xmm6, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
pxor xmm7, xmm6
; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
; instead of a cmp instruction, we use the negative flag with the jl instruction
add arg3, 128-16
jl _final_reduction_for_128
; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
; we can fold 16 bytes at a time if y>=16
; continue folding 16B at a time
; Register state expected here (also when entered from _less_than_256):
;   xmm7 = current 16-byte accumulator, xmm10 = rk1/rk2, xmm11 = byte-reversal mask,
;   arg2 = next unread input byte, arg3 = remaining bytes minus 16.
_16B_reduction_loop:
movdqa xmm8, xmm7
pclmulqdq xmm7, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
movdqu xmm0, [arg2]
pshufb xmm0, xmm11
pxor xmm7, xmm0
add arg2, 16
sub arg3, 16
; instead of a cmp instruction, we utilize the flags with the jge instruction
; equivalent of: cmp arg3, 16-16
; check if there is any more 16B in the buffer to be able to fold
jge _16B_reduction_loop
;now we have 16+z bytes left to reduce, where 0<= z < 16.
;first, we reduce the data in the xmm7 register
_final_reduction_for_128:
; check if any more data to fold. If not, compute the CRC of the final 128 bits
; after this add, arg3 is the number of trailing bytes still in memory (0..15)
add arg3, 16
je _128_done
; here we are getting data that is less than 16 bytes.
; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
; after that the registers need to be adjusted.
; Also entered from _less_than_32 with xmm7 = first 16 bytes (crc applied),
; xmm10 = rk1/rk2 and arg3 = len - 16.
_get_last_two_xmms:
movdqa xmm2, xmm7
; load the last 16 bytes of the buffer; this overlaps bytes already folded into xmm7
movdqu xmm1, [arg2 - 16 + arg3]
pshufb xmm1, xmm11
; get rid of the extra data that was loaded before
; load the shift constant
; the mask is read at pshufb_shf_table + 16 - arg3
lea rax, [pshufb_shf_table + 16]
sub rax, arg3
movdqu xmm0, [rax]
; shift xmm2 to the left by arg3 bytes
pshufb xmm2, xmm0
; shift xmm7 to the right by 16-arg3 bytes
; XOR with mask1 (0x80 in every byte) flips the top bit of every selector byte
pxor xmm0, [mask1]
pshufb xmm7, xmm0
; merge xmm2 into xmm1, selecting per byte by the top bit of the corresponding xmm0 byte
pblendvb xmm1, xmm2 ;xmm0 is implicit
; fold 16 Bytes
movdqa xmm2, xmm1
movdqa xmm8, xmm7
pclmulqdq xmm7, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
pxor xmm7, xmm2
; Final reduction: xmm7 holds the remaining 128-bit value. It is reduced in three
; steps (64-bit fold, 32-bit fold, Barrett reduction) and the result is taken from
; dword 1 of xmm7.
_128_done:
; compute crc of a 128-bit value
movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
movdqa xmm0, xmm7
;64b fold
; imm 0x1: high qword of xmm7 times low qword of xmm10 (rk5)
pclmulqdq xmm7, xmm10, 0x1
pslldq xmm0, 8
pxor xmm7, xmm0
;32b fold
movdqa xmm0, xmm7
; mask2 clears the top 32 bits of the copy in xmm0
pand xmm0, [mask2]
psrldq xmm7, 12
; imm 0x10: low qword of xmm7 times high qword of xmm10 (rk6)
pclmulqdq xmm7, xmm10, 0x10
pxor xmm7, xmm0
;barrett reduction
; The 1-, 2- and 3-byte paths (_only_less_than_*) jump here directly with xmm7 set up.
_barrett:
movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
movdqa xmm0, xmm7
; imm 0x01: high qword of xmm7 times low qword of xmm10 (rk7)
pclmulqdq xmm7, xmm10, 0x01
pslldq xmm7, 4
; imm 0x11: high qword of xmm7 times high qword of xmm10 (rk8)
pclmulqdq xmm7, xmm10, 0x11
pslldq xmm7, 4
pxor xmm7, xmm0
; the 32-bit result is dword 1 of xmm7
pextrd eax, xmm7,1
; Common exit. The zero-length path arrives here with eax = init_crc << 16.
_cleanup:
; scale the result back to 16 bits
shr eax, 16
%ifidn __OUTPUT_FORMAT__, win64
; restore xmm6-xmm13 from the slots written in the prologue
movdqa xmm6, [rsp+16*2]
movdqa xmm7, [rsp+16*3]
movdqa xmm8, [rsp+16*4]
movdqa xmm9, [rsp+16*5]
movdqa xmm10, [rsp+16*6]
movdqa xmm11, [rsp+16*7]
movdqa xmm12, [rsp+16*8]
movdqa xmm13, [rsp+16*9]
%endif
; release the stack space reserved in the prologue
add rsp, VARIABLE_OFFSET
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
; Buffers shorter than 256 bytes: skip the 128-byte folding. With at least 32 bytes,
; load the first 16 bytes into xmm7 and continue in the 16-byte loop.
_less_than_256:
; check if there is enough buffer to be able to fold 16B at a time
cmp arg3, 32
jl _less_than_32
movdqa xmm11, [SHUF_MASK]
; if there is, load the constants
movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
movd xmm0, arg1_low32 ; get the initial crc value
pslldq xmm0, 12 ; align it to its correct place
movdqu xmm7, [arg2] ; load the plaintext
pshufb xmm7, xmm11 ; byte-reflect the plaintext
pxor xmm7, xmm0
; update the buffer pointer
add arg2, 16
; update the counter. subtract 32 instead of 16 to save one instruction from the loop
sub arg3, 32
jmp _16B_reduction_loop
align 16
; Buffers shorter than 32 bytes. Dispatches on len:
;   0      -> return the initial crc unchanged (via _cleanup)
;   16     -> _exact_16_left
;   < 16   -> _less_than_16_left
;   17..31 -> take the first 16 bytes here, the rest is handled by _get_last_two_xmms
_less_than_32:
; mov initial crc to the return value. this is necessary for zero-length buffers.
mov eax, arg1_low32
test arg3, arg3
je _cleanup
movdqa xmm11, [SHUF_MASK]
movd xmm0, arg1_low32 ; get the initial crc value
pslldq xmm0, 12 ; align it to its correct place
; one cmp feeds both conditional jumps below
cmp arg3, 16
je _exact_16_left
jl _less_than_16_left
movdqu xmm7, [arg2] ; load the plaintext
pshufb xmm7, xmm11 ; byte-reflect the plaintext
pxor xmm7, xmm0 ; xor the initial crc value
add arg2, 16
sub arg3, 16
movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
jmp _get_last_two_xmms
align 16
; Buffers of 1..15 bytes. The input is copied piecewise (8/4/2/1 bytes) into a zeroed
; 16-byte scratch area at [rsp], so no load reads past the end of the buffer.
; On entry: xmm0 = initial crc in the top dword, xmm11 = byte-reversal mask.
; r11 is the write cursor into the scratch area; rax/eax/ax/al carry the copied data.
_less_than_16_left:
; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
pxor xmm1, xmm1
mov r11, rsp
movdqa [r11], xmm1
cmp arg3, 4
jl _only_less_than_4
; backup the counter value
; (arg3 is decremented by the copies below; r9 keeps the byte count, 4..15,
; for the shift-table lookup in _zero_left)
mov r9, arg3
cmp arg3, 8
jl _less_than_8_left
; load 8 Bytes
mov rax, [arg2]
mov [r11], rax
add r11, 8
sub arg3, 8
add arg2, 8
_less_than_8_left:
cmp arg3, 4
jl _less_than_4_left
; load 4 Bytes
mov eax, [arg2]
mov [r11], eax
add r11, 4
sub arg3, 4
add arg2, 4
_less_than_4_left:
cmp arg3, 2
jl _less_than_2_left
; load 2 Bytes
mov ax, [arg2]
mov [r11], ax
add r11, 2
sub arg3, 2
add arg2, 2
_less_than_2_left:
cmp arg3, 1
jl _zero_left
; load 1 Byte
mov al, [arg2]
mov [r11], al
_zero_left:
; reload the assembled bytes from the scratch area, byte-reflect, apply the crc
movdqa xmm7, [rsp]
pshufb xmm7, xmm11
pxor xmm7, xmm0 ; xor the initial crc value
; same table lookup as in _get_last_two_xmms, indexed by the saved count in r9;
; the mask is XORed with mask1 before it is used to shuffle xmm7
lea rax, [pshufb_shf_table + 16]
sub rax, r9
movdqu xmm0, [rax]
pxor xmm0, [mask1]
pshufb xmm7, xmm0
jmp _128_done
align 16
; Exactly 16 bytes: a single 16-byte load, no folding needed before the final reduction.
_exact_16_left:
movdqu xmm7, [arg2]
pshufb xmm7, xmm11
pxor xmm7, xmm0 ; xor the initial crc value
jmp _128_done
; 1..3 bytes: the bytes are copied one at a time into the zeroed scratch area, then
; xmm7 is shifted right by a fixed amount (5, 6 or 7 bytes for 3, 2 or 1 input bytes)
; and control goes straight to _barrett, skipping the 64-bit and 32-bit folds.
_only_less_than_4:
cmp arg3, 3
jl _only_less_than_3
; load 3 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
mov al, [arg2+2]
mov [r11+2], al
movdqa xmm7, [rsp]
pshufb xmm7, xmm11
pxor xmm7, xmm0 ; xor the initial crc value
psrldq xmm7, 5
jmp _barrett
_only_less_than_3:
cmp arg3, 2
jl _only_less_than_2
; load 2 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
movdqa xmm7, [rsp]
pshufb xmm7, xmm11
pxor xmm7, xmm0 ; xor the initial crc value
psrldq xmm7, 6
jmp _barrett
_only_less_than_2:
; load 1 Byte
mov al, [arg2]
mov [r11], al
movdqa xmm7, [rsp]
pshufb xmm7, xmm11
pxor xmm7, xmm0 ; xor the initial crc value
psrldq xmm7, 7
jmp _barrett
section .data
; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
; Each rkN label is one 64-bit constant. The code loads them in adjacent 16-byte
; pairs with movdqa (rk1/rk2, rk3/rk4, ..., rk19/rk20), so the order and the 16-byte
; alignment of this table matter.
; rk9..rk20 are the constants used to fold xmm0..xmm5 into xmm7 after the 128-byte
; loop; no derivation is given for them here.
rk1:
DQ 0x2d56000000000000
rk2:
DQ 0x06df000000000000
rk3:
DQ 0x9d9d000000000000
rk4:
DQ 0x7cf5000000000000
rk5:
DQ 0x2d56000000000000
rk6:
DQ 0x1368000000000000
rk7:
DQ 0x00000001f65a57f8
rk8:
DQ 0x000000018bb70000
rk9:
DQ 0xceae000000000000
rk10:
DQ 0xbfd6000000000000
rk11:
DQ 0x1e16000000000000
rk12:
DQ 0x713c000000000000
rk13:
DQ 0xf7f9000000000000
rk14:
DQ 0x80a6000000000000
rk15:
DQ 0x044c000000000000
rk16:
DQ 0xe658000000000000
rk17:
DQ 0xad18000000000000
rk18:
DQ 0xa497000000000000
rk19:
DQ 0x6ee3000000000000
rk20:
DQ 0xe7b5000000000000
; 0x80 in every byte; XORed into a pshufb shift mask to flip the top bit of each selector byte
mask1:
dq 0x8080808080808080, 0x8080808080808080
; keeps the low 96 bits, clears the top 32 bits (used in the 32-bit fold)
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; pshufb mask that reverses the order of the 16 bytes
SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607
pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; The 32 bytes below are the actual table; the code reads an unaligned 16-byte
; window starting at pshufb_shf_table + 16 - n, where n is the byte count.
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
;;; func core, ver, snum
; NOTE(review): slversion is not defined in this file; presumably a macro from
; reg_sizes.asm that emits a version record for the function -- confirm.
slversion crc16_t10dif_01, 01, 06, 0010

558
crc/crc16_t10dif_by4.asm Normal file
View File

@ -0,0 +1,558 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Function API:
; UINT16 crc16_t10dif_by4(
; UINT16 init_crc, //initial CRC value, 16 bits
; const unsigned char *buf, //buffer pointer to calculate CRC on
; UINT64 len //buffer length in bytes (64-bit data)
; );
;
; Authors:
; Erdinc Ozturk
; Vinodh Gopal
; James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
; URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
%include "reg_sizes.asm"
[bits 64]
default rel
section .text
%ifidn __OUTPUT_FORMAT__, win64
%xdefine arg1 rcx
%xdefine arg2 rdx
%xdefine arg3 r8
%xdefine arg1_low32 ecx
%else
%xdefine arg1 rdi
%xdefine arg2 rsi
%xdefine arg3 rdx
%xdefine arg1_low32 edi
%endif
align 16
global crc16_t10dif_by4:function
crc16_t10dif_by4:
; adjust the 16-bit initial_crc value, scale it to 32 bits
shl arg1_low32, 16
; After this point, code flow is exactly same as a 32-bit CRC.
; The only difference is before returning eax, we will shift
; it right 16 bits, to scale back to 16 bits.
sub rsp,16*4+8
; save xmm6 and xmm7 on the stack; they are restored in _cleanup before returning
movdqa [rsp+16*2],xmm6
movdqa [rsp+16*3],xmm7
; check if smaller than 128B
cmp arg3, 128
; for sizes less than 128, we can't fold 64B at a time...
jl _less_than_128
; load the initial crc value
movd xmm6, arg1_low32 ; initial crc
; crc value does not need to be byte-reflected, but it needs to
; be moved to the high part of the register.
; because data will be byte-reflected and will align with
; initial crc at correct place.
pslldq xmm6, 12
movdqa xmm7, [SHUF_MASK]
; receive the initial 64B data, xor the initial crc value
movdqu xmm0, [arg2]
movdqu xmm1, [arg2+16]
movdqu xmm2, [arg2+32]
movdqu xmm3, [arg2+48]
pshufb xmm0, xmm7
; XOR the initial_crc value
pxor xmm0, xmm6
pshufb xmm1, xmm7
pshufb xmm2, xmm7
pshufb xmm3, xmm7
movdqa xmm6, [rk3] ;xmm6 has rk3 and rk4
;imm value of pclmulqdq instruction
;will determine which constant to use
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; we subtract 128 instead of 64 to save one instruction from the loop
sub arg3, 128
; at this section of the code, there is 64*x+y (0<=y<64) bytes of
; buffer. The _fold_64_B_loop
; loop will fold 64B at a time until we have 64+y Bytes of buffer
; fold 64B at a time. This section of the code folds 4 xmm
; registers in parallel
_fold_64_B_loop:
; update the buffer pointer
add arg2, 64 ; buf += 64;
movdqu xmm4, xmm0
movdqu xmm5, xmm1
pclmulqdq xmm0, xmm6 , 0x11
pclmulqdq xmm1, xmm6 , 0x11
pclmulqdq xmm4, xmm6, 0x0
pclmulqdq xmm5, xmm6, 0x0
pxor xmm0, xmm4
pxor xmm1, xmm5
movdqu xmm4, xmm2
movdqu xmm5, xmm3
pclmulqdq xmm2, xmm6, 0x11
pclmulqdq xmm3, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pclmulqdq xmm5, xmm6, 0x0
pxor xmm2, xmm4
pxor xmm3, xmm5
movdqu xmm4, [arg2]
movdqu xmm5, [arg2+16]
pshufb xmm4, xmm7
pshufb xmm5, xmm7
pxor xmm0, xmm4
pxor xmm1, xmm5
movdqu xmm4, [arg2+32]
movdqu xmm5, [arg2+48]
pshufb xmm4, xmm7
pshufb xmm5, xmm7
pxor xmm2, xmm4
pxor xmm3, xmm5
sub arg3, 64
; check if there is another 64B in the buffer to be able to fold
jge _fold_64_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
add arg2, 64
; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
; fold the 4 xmm registers to 1 xmm register with different constants
movdqa xmm6, [rk1] ;xmm6 has rk1 and rk2
;imm value of pclmulqdq instruction will
;determine which constant to use
movdqa xmm4, xmm0
pclmulqdq xmm0, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pxor xmm1, xmm4
pxor xmm1, xmm0
movdqa xmm4, xmm1
pclmulqdq xmm1, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pxor xmm2, xmm4
pxor xmm2, xmm1
movdqa xmm4, xmm2
pclmulqdq xmm2, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pxor xmm3, xmm4
pxor xmm3, xmm2
; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
; instead of a cmp instruction, we use the negative flag with the jl instruction
add arg3, 64-16
jl _final_reduction_for_128
; now we have 16+y bytes left to reduce. 16 Bytes
; is in register xmm3 and the rest is in memory
; we can fold 16 bytes at a time if y>=16
; continue folding 16B at a time
_16B_reduction_loop:
movdqa xmm4, xmm3
pclmulqdq xmm3, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pxor xmm3, xmm4
movdqu xmm0, [arg2]
pshufb xmm0, xmm7
pxor xmm3, xmm0
add arg2, 16
sub arg3, 16
; instead of a cmp instruction, we utilize the flags with the jge instruction
; equivalent of: cmp arg3, 16-16
; check if there is any more 16B in the buffer to be able to fold
jge _16B_reduction_loop
;now we have 16+z bytes left to reduce, where 0<= z < 16.
;first, we reduce the data in the xmm3 register
_final_reduction_for_128:
; check if any more data to fold. If not, compute the CRC of the final 128 bits
add arg3, 16
je _128_done
; here we are getting data that is less than 16 bytes.
; since we know that there was data before the pointer,
; we can offset the input pointer before the actual point,
; to receive exactly 16 bytes.
; after that the registers need to be adjusted.
_get_last_two_xmms:
movdqa xmm2, xmm3
movdqu xmm1, [arg2 - 16 + arg3]
pshufb xmm1, xmm7
; get rid of the extra data that was loaded before
; load the shift constant
lea rax, [pshufb_shf_table + 16]
sub rax, arg3
movdqu xmm0, [rax]
; shift xmm2 to the left by arg3 bytes
pshufb xmm2, xmm0
; shift xmm3 to the right by 16-arg3 bytes
pxor xmm0, [mask1]
pshufb xmm3, xmm0
pblendvb xmm1, xmm2 ;xmm0 is implicit
; fold 16 Bytes
movdqa xmm2, xmm1
movdqa xmm4, xmm3
pclmulqdq xmm3, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pxor xmm3, xmm4
pxor xmm3, xmm2
_128_done:
; compute crc of a 128-bit value
movdqa xmm6, [rk5] ; rk5 and rk6 in xmm6
movdqa xmm0, xmm3
;64b fold
pclmulqdq xmm3, xmm6, 0x1
pslldq xmm0, 8
pxor xmm3, xmm0
;32b fold
movdqa xmm0, xmm3
pand xmm0, [mask2]
psrldq xmm3, 12
pclmulqdq xmm3, xmm6, 0x10
pxor xmm3, xmm0
;barrett reduction
_barrett:
movdqa xmm6, [rk7] ; rk7 and rk8 in xmm6
movdqa xmm0, xmm3
pclmulqdq xmm3, xmm6, 0x01
pslldq xmm3, 4
pclmulqdq xmm3, xmm6, 0x11
pslldq xmm3, 4
pxor xmm3, xmm0
pextrd eax, xmm3,1
_cleanup:
; scale the result back to 16 bits
shr eax, 16
movdqa xmm6, [rsp+16*2]
movdqa xmm7, [rsp+16*3]
add rsp,16*4+8
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
_less_than_128:
; check if there is enough buffer to be able to fold 16B at a time
cmp arg3, 32
jl _less_than_32
movdqa xmm7, [SHUF_MASK]
; if there is, load the constants
movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
movd xmm0, arg1_low32 ; get the initial crc value
pslldq xmm0, 12 ; align it to its correct place
movdqu xmm3, [arg2] ; load the plaintext
pshufb xmm3, xmm7 ; byte-reflect the plaintext
pxor xmm3, xmm0
; update the buffer pointer
add arg2, 16
; update the counter. subtract 32 instead of 16 to save one instruction from the loop
sub arg3, 32
jmp _16B_reduction_loop
align 16
_less_than_32:
; mov initial crc to the return value. this is necessary for zero-length buffers.
mov eax, arg1_low32
test arg3, arg3
je _cleanup
movdqa xmm7, [SHUF_MASK]
movd xmm0, arg1_low32 ; get the initial crc value
pslldq xmm0, 12 ; align it to its correct place
cmp arg3, 16
je _exact_16_left
jl _less_than_16_left
movdqu xmm3, [arg2] ; load the plaintext
pshufb xmm3, xmm7 ; byte-reflect the plaintext
pxor xmm3, xmm0 ; xor the initial crc value
add arg2, 16
sub arg3, 16
movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
jmp _get_last_two_xmms
align 16
_less_than_16_left:
; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
pxor xmm1, xmm1
mov r11, rsp
movdqa [r11], xmm1
cmp arg3, 4
jl _only_less_than_4
; backup the counter value
mov r9, arg3
cmp arg3, 8
jl _less_than_8_left
; load 8 Bytes
mov rax, [arg2]
mov [r11], rax
add r11, 8
sub arg3, 8
add arg2, 8
_less_than_8_left:
cmp arg3, 4
jl _less_than_4_left
; load 4 Bytes
mov eax, [arg2]
mov [r11], eax
add r11, 4
sub arg3, 4
add arg2, 4
_less_than_4_left:
cmp arg3, 2
jl _less_than_2_left
; load 2 Bytes
mov ax, [arg2]
mov [r11], ax
add r11, 2
sub arg3, 2
add arg2, 2
_less_than_2_left:
cmp arg3, 1
jl _zero_left
; load 1 Byte
mov al, [arg2]
mov [r11], al
_zero_left:
movdqa xmm3, [rsp]
pshufb xmm3, xmm7
pxor xmm3, xmm0 ; xor the initial crc value
; shl r9, 4
lea rax, [pshufb_shf_table + 16]
sub rax, r9
movdqu xmm0, [rax]
pxor xmm0, [mask1]
pshufb xmm3, xmm0
jmp _128_done
align 16
_exact_16_left:
movdqu xmm3, [arg2]
pshufb xmm3, xmm7
pxor xmm3, xmm0 ; xor the initial crc value
jmp _128_done
_only_less_than_4:
cmp arg3, 3
jl _only_less_than_3
; load 3 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
mov al, [arg2+2]
mov [r11+2], al
movdqa xmm3, [rsp]
pshufb xmm3, xmm7
pxor xmm3, xmm0 ; xor the initial crc value
psrldq xmm3, 5
jmp _barrett
_only_less_than_3:
cmp arg3, 2
jl _only_less_than_2
; load 2 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
movdqa xmm3, [rsp]
pshufb xmm3, xmm7
pxor xmm3, xmm0 ; xor the initial crc value
psrldq xmm3, 6
jmp _barrett
_only_less_than_2:
; load 1 Byte
mov al, [arg2]
mov [r11], al
movdqa xmm3, [rsp]
pshufb xmm3, xmm7
pxor xmm3, xmm0 ; xor the initial crc value
psrldq xmm3, 7
jmp _barrett
section .data
; Read-only tables used by the CRC16 T10DIF routine above: folding
; constants (rk1..rk8), byte masks, the byte-reversal shuffle mask and the
; variable byte-shift table for pshufb.
; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
; The constants are consumed in 16-byte pairs (movdqa from rk1, rk5, rk7),
; so each odd-numbered label must stay 16-byte aligned and be immediately
; followed by its even-numbered partner.
rk1:
DQ 0x2d56000000000000
rk2:
DQ 0x06df000000000000
rk3:
DQ 0x044c000000000000
rk4:
DQ 0xe658000000000000
rk5:
DQ 0x2d56000000000000
rk6:
DQ 0x1368000000000000
rk7:
DQ 0x00000001f65a57f8
rk8:
DQ 0x000000018bb70000
; mask1: XORed into a pshufb control vector to flip bit 7 of every control
; byte, which turns a "shift left" control into the matching "shift right" one.
mask1:
dq 0x8080808080808080, 0x8080808080808080
; mask2: keeps the low 96 bits of a register (clears the top 32 bits).
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; SHUF_MASK: pshufb control that reverses the order of the 16 bytes.
SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607
pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; The code reads 16 bytes at (pshufb_shf_table + 16 - n) with an unaligned
; load, so the 32 bytes below act as a sliding window of control masks.
; Control bytes with bit 7 set make pshufb write zero to that lane.
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
;;; func core, ver, snum
slversion crc16_t10dif_by4, 05, 02, 0016

86
crc/crc16_t10dif_perf.c Normal file
View File

@ -0,0 +1,86 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/time.h>
#include "crc.h"
#include "test.h"
/*
 * Benchmark sizing.
 *
 * Define CACHED_TEST (uncomment the line below or pass -DCACHED_TEST) to
 * loop many times over a small, cache-resident buffer; otherwise a buffer
 * larger than the last-level cache is used so every pass streams from memory.
 *
 * Every size macro is fully parenthesized so that it expands safely inside
 * larger expressions such as `x % TEST_LEN` or `n / GT_L3_CACHE`.
 */
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_LEN     (8*1024)
# define TEST_LOOPS   400000
# define TEST_TYPE_STR "_warm"
#else
// Uncached test. Pull from large mem base.
# define GT_L3_CACHE  (32*1024*1024)	/* some number > last level cache */
# define TEST_LEN     (2 * GT_L3_CACHE)
# define TEST_LOOPS   100
# define TEST_TYPE_STR "_cold"
#endif

/* Initial CRC value used by the benchmark; may be overridden with -DTEST_SEED=... */
#ifndef TEST_SEED
# define TEST_SEED 0x1234
#endif

/* Bytes of memory touched by the test (same as the buffer length). */
#define TEST_MEM TEST_LEN
/*
 * Throughput benchmark for crc16_t10dif().
 *
 * Allocates a TEST_LEN-byte buffer, runs one untimed warm-up pass, then
 * times TEST_LOOPS passes and prints the result through perf_print().
 *
 * Returns 0 on success, -1 if the buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
	int i;
	void *buf;
	uint16_t crc;
	struct perf start, stop;

	printf("crc16_t10dif_perf:\n");

	if (posix_memalign(&buf, 1024, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}

	printf("Start timed tests\n");
	fflush(0);

	// posix_memalign() does not initialize the memory it returns. Fill the
	// buffer so the checksum printed at the end is reproducible and so every
	// page has been touched before the timed region starts.
	memset(buf, 0, TEST_LEN);

	// Untimed warm-up pass
	crc = crc16_t10dif(TEST_SEED, buf, TEST_LEN);

	perf_start(&start);
	for (i = 0; i < TEST_LOOPS; i++) {
		crc = crc16_t10dif(TEST_SEED, buf, TEST_LEN);
	}
	perf_stop(&stop);

	printf("crc16_t10dif" TEST_TYPE_STR ": ");
	// i == TEST_LOOPS here, so this is the total number of bytes processed
	perf_print(stop, start, (long long)TEST_LEN * i);

	// Printing the CRC also keeps the timed calls from being optimized away
	printf("finish 0x%x\n", crc);

	free(buf);
	return 0;
}

167
crc/crc16_t10dif_test.c Normal file
View File

@ -0,0 +1,167 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include "crc.h"
#include "types.h"
#ifndef TEST_SEED
# define TEST_SEED 0x1234
#endif
#define MAX_BUF 512
#define TEST_SIZE 20
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
/*
 * Fill the first buffer_size bytes of buf with pseudo-random data.
 *
 * Each byte is the value of one rand() call converted to unsigned char, so
 * the contents are fully determined by the preceding srand() seed.
 * Nothing is written when buffer_size is zero or negative.
 */
void rand_buffer(unsigned char *buf, long buffer_size)
{
	unsigned char *cursor = buf;
	long remaining = buffer_size;

	while (remaining > 0) {
		*cursor++ = rand();
		remaining--;
	}
}
/*
 * Functional test for crc16_t10dif(): every result of the optimized routine
 * is compared against the reference crc16_t10dif_base() over fixed patterns,
 * random data, every length from MAX_BUF down to 0, several seeds, and
 * buffers that end exactly at the end of the allocation.
 *
 * Any command-line argument enables verbose output.
 *
 * Returns 0 when every comparison matched, 1 when at least one mismatched,
 * and -1 if the test buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
	int fail = 0;
	u32 r = 0;
	int verbose = argc - 1;
	int i, s;
	void *buf_raw;
	unsigned char *buf;

	printf("Test crc16_t10dif_test ");
	if (posix_memalign(&buf_raw, MAX_BUF, MAX_BUF * TEST_SIZE)) {
		printf("alloc error: Fail");
		return -1;
	}
	buf = (unsigned char *)buf_raw;

	srand(TEST_SEED);

	// Test of all zeros
	memset(buf, 0, MAX_BUF * 10);
	u16 crc = crc16_t10dif(TEST_SEED, buf, MAX_BUF);
	u16 crc_ref = crc16_t10dif_base(TEST_SEED, buf, MAX_BUF);
	if (crc != crc_ref) {
		fail++;
		printf("\n opt ref\n");
		printf(" ------ ------\n");
		printf("crc zero = 0x%4x 0x%4x \n", crc, crc_ref);
	} else
		printf(".");

	// Another simple test pattern
	memset(buf, 0x8a, MAX_BUF);
	crc = crc16_t10dif(TEST_SEED, buf, MAX_BUF);
	crc_ref = crc16_t10dif_base(TEST_SEED, buf, MAX_BUF);
	if (crc != crc_ref) {
		fail++;
		printf("crc all 8a = 0x%4x 0x%4x\n", crc, crc_ref);
	} else
		printf(".");

	// Do a few random tests
	rand_buffer(buf, MAX_BUF * TEST_SIZE);

	for (i = 0; i < TEST_SIZE; i++) {
		crc = crc16_t10dif(TEST_SEED, buf, MAX_BUF);
		crc_ref = crc16_t10dif_base(TEST_SEED, buf, MAX_BUF);
		if (crc != crc_ref)
			fail++;
		if (verbose)
			printf("crc rand%3d = 0x%4x 0x%4x\n", i, crc, crc_ref);
		else
			printf(".");
		buf += MAX_BUF;
	}

	// Do a few random sizes
	buf = (unsigned char *)buf_raw;	//reset buf
	r = rand();

	// Covers every length from MAX_BUF down to and including 0
	for (i = MAX_BUF; i >= 0; i--) {
		crc = crc16_t10dif(r, buf, i);
		crc_ref = crc16_t10dif_base(r, buf, i);
		if (crc != crc_ref) {
			fail++;
			printf("fail random size%i 0x%8x 0x%8x\n", i, crc, crc_ref);
		} else
			printf(".");
	}

	// Try different seeds
	for (s = 0; s < 20; s++) {
		buf = (unsigned char *)buf_raw;	//reset buf

		r = rand();	// just to get a new seed
		rand_buffer(buf, MAX_BUF * TEST_SIZE);	// new pseudo-rand data

		if (verbose)
			printf("seed = 0x%x\n", r);

		for (i = 0; i < TEST_SIZE; i++) {
			crc = crc16_t10dif(r, buf, MAX_BUF);
			crc_ref = crc16_t10dif_base(r, buf, MAX_BUF);
			if (crc != crc_ref)
				fail++;
			if (verbose)
				printf("crc rand%3d = 0x%4x 0x%4x\n", i, crc, crc_ref);
			else
				printf(".");
			buf += MAX_BUF;
		}
	}

	// Run tests at end of buffer
	buf = (unsigned char *)buf_raw;	//reset buf
	// (MAX_BUF - 1) * TEST_SIZE == MAX_BUF * TEST_SIZE - TEST_SIZE, so buf
	// now points TEST_SIZE bytes before the end of the allocation and each
	// region checked below ends exactly on the last allocated byte.
	buf = buf + ((MAX_BUF - 1) * TEST_SIZE);	//Line up TEST_SIZE from end
	for (i = 0; i < TEST_SIZE; i++) {
		crc = crc16_t10dif(TEST_SEED, buf + i, TEST_SIZE - i);
		crc_ref = crc16_t10dif_base(TEST_SEED, buf + i, TEST_SIZE - i);
		if (crc != crc_ref)
			fail++;
		if (verbose)
			printf("crc eob rand%3d = 0x%4x 0x%4x\n", i, crc, crc_ref);
		else
			printf(".");
	}

	printf("Test done: %s\n", fail ? "Fail" : "Pass");
	if (fail)
		printf("\nFailed %d tests\n", fail);

	free(buf_raw);

	// Do not return the raw count: the process exit status is truncated to
	// 8 bits, so a count that is a multiple of 256 would read as success.
	return fail ? 1 : 0;
}

650
crc/crc32_ieee_01.asm Normal file
View File

@ -0,0 +1,650 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Function API:
; UINT32 crc32_ieee_01(
; UINT32 init_crc, //initial CRC value, 32 bits
; const unsigned char *buf, //buffer pointer to calculate CRC on
; UINT64 len //buffer length in bytes (64-bit data)
; );
;
; Authors:
; Erdinc Ozturk
; Vinodh Gopal
; James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
%include "reg_sizes.asm"
[bits 64]
default rel
section .text
; Map the three function arguments (init_crc, buf, len) onto registers for
; the two output formats this file distinguishes: win64 uses rcx/rdx/r8,
; every other format uses rdi/rsi/rdx.
%ifidn __OUTPUT_FORMAT__, win64
%xdefine arg1 rcx
%xdefine arg2 rdx
%xdefine arg3 r8
%xdefine arg1_low32 ecx
%else
%xdefine arg1 rdi
%xdefine arg2 rsi
%xdefine arg3 rdx
%xdefine arg1_low32 edi
%endif
; Stack frame layout, as offsets from rsp after the prologue:
;   TMP      - start of the scratch area used to assemble short inputs
;   XMM_SAVE - (win64 only) save area for xmm6-xmm13, 8 * 16 bytes
; VARIABLE_OFFSET is the total number of bytes reserved by the prologue.
; NOTE(review): the extra 8 bytes presumably re-align rsp to 16 after the
; return address was pushed, as needed by the movdqa accesses to the frame.
%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
%define XMM_SAVE 16*2
%define VARIABLE_OFFSET 16*10+8
%else
%define VARIABLE_OFFSET 16*2+8
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; crc32_ieee_01(init_crc, buf, len)
;
; In:  arg1_low32 = initial CRC, arg2 = buffer pointer, arg3 = length in bytes
; Out: eax = CRC
;
; The initial CRC is bit-inverted on entry and the result is bit-inverted
; again in _cleanup. For len == 0 the inverted value is passed straight to
; _cleanup, so init_crc is returned unchanged.
;
; Path selection by length:
;   len >= 256       fold 128 bytes per iteration in xmm0-xmm7, then reduce
;   32 <= len < 256  fold 16 bytes per iteration in xmm7
;   16 <  len < 32   one 16-byte block plus an overlapping tail load
;   len == 16        straight to the 128-bit reduction
;   4 <= len < 16    data is copied to the stack scratch area first
;   1 <= len < 4     data is copied to the stack, then straight to _barrett
;
; Scratch registers used besides xmm0-xmm13: rax, r9, r11.
; NOTE(review): the length checks use signed jumps (jl/jge), so len is
; effectively treated as a signed 64-bit value.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
global crc32_ieee_01:function
crc32_ieee_01:
not arg1_low32 ;~init_crc
; reserve the stack frame (scratch area, plus the xmm save area on win64)
sub rsp,VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
; save xmm6-xmm13 on the stack; they are restored in _cleanup
movdqa [rsp + XMM_SAVE + 16*0], xmm6
movdqa [rsp + XMM_SAVE + 16*1], xmm7
movdqa [rsp + XMM_SAVE + 16*2], xmm8
movdqa [rsp + XMM_SAVE + 16*3], xmm9
movdqa [rsp + XMM_SAVE + 16*4], xmm10
movdqa [rsp + XMM_SAVE + 16*5], xmm11
movdqa [rsp + XMM_SAVE + 16*6], xmm12
movdqa [rsp + XMM_SAVE + 16*7], xmm13
%endif
; check if smaller than 256
cmp arg3, 256
; for sizes less than 256, we can't fold 128B at a time...
jl _less_than_256
; load the initial crc value
movd xmm10, arg1_low32 ; initial crc
; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
; because data will be byte-reflected and will align with initial crc at correct place.
pslldq xmm10, 12
; xmm11 holds the byte-reversal shuffle mask for the rest of the function
movdqa xmm11, [SHUF_MASK]
; receive the initial 128B data, xor the initial crc value
movdqu xmm0, [arg2+16*0]
movdqu xmm1, [arg2+16*1]
movdqu xmm2, [arg2+16*2]
movdqu xmm3, [arg2+16*3]
movdqu xmm4, [arg2+16*4]
movdqu xmm5, [arg2+16*5]
movdqu xmm6, [arg2+16*6]
movdqu xmm7, [arg2+16*7]
pshufb xmm0, xmm11
; XOR the initial_crc value
pxor xmm0, xmm10
pshufb xmm1, xmm11
pshufb xmm2, xmm11
pshufb xmm3, xmm11
pshufb xmm4, xmm11
pshufb xmm5, xmm11
pshufb xmm6, xmm11
pshufb xmm7, xmm11
movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
;imm value of pclmulqdq instruction will determine which constant to use
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; we subtract 256 instead of 128 to save one instruction from the loop
sub arg3, 256
; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
; loop will fold 128B at a time until we have 128+y Bytes of buffer
; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
; Each accumulator is multiplied by rk3/rk4 (low and high halves separately)
; and XORed with the next 16 bytes of byte-reversed input. pxor and xorps are
; both used for the XORs; they produce the same bitwise result.
_fold_128_B_loop:
; update the buffer pointer
add arg2, 128 ; buf += 128;
; accumulators xmm0, xmm1
movdqu xmm9, [arg2+16*0]
movdqu xmm12, [arg2+16*1]
pshufb xmm9, xmm11
pshufb xmm12, xmm11
movdqa xmm8, xmm0
movdqa xmm13, xmm1
pclmulqdq xmm0, xmm10, 0x0
pclmulqdq xmm8, xmm10 , 0x11
pclmulqdq xmm1, xmm10, 0x0
pclmulqdq xmm13, xmm10 , 0x11
pxor xmm0, xmm9
xorps xmm0, xmm8
pxor xmm1, xmm12
xorps xmm1, xmm13
; accumulators xmm2, xmm3
movdqu xmm9, [arg2+16*2]
movdqu xmm12, [arg2+16*3]
pshufb xmm9, xmm11
pshufb xmm12, xmm11
movdqa xmm8, xmm2
movdqa xmm13, xmm3
pclmulqdq xmm2, xmm10, 0x0
pclmulqdq xmm8, xmm10 , 0x11
pclmulqdq xmm3, xmm10, 0x0
pclmulqdq xmm13, xmm10 , 0x11
pxor xmm2, xmm9
xorps xmm2, xmm8
pxor xmm3, xmm12
xorps xmm3, xmm13
; accumulators xmm4, xmm5
movdqu xmm9, [arg2+16*4]
movdqu xmm12, [arg2+16*5]
pshufb xmm9, xmm11
pshufb xmm12, xmm11
movdqa xmm8, xmm4
movdqa xmm13, xmm5
pclmulqdq xmm4, xmm10, 0x0
pclmulqdq xmm8, xmm10 , 0x11
pclmulqdq xmm5, xmm10, 0x0
pclmulqdq xmm13, xmm10 , 0x11
pxor xmm4, xmm9
xorps xmm4, xmm8
pxor xmm5, xmm12
xorps xmm5, xmm13
; accumulators xmm6, xmm7
movdqu xmm9, [arg2+16*6]
movdqu xmm12, [arg2+16*7]
pshufb xmm9, xmm11
pshufb xmm12, xmm11
movdqa xmm8, xmm6
movdqa xmm13, xmm7
pclmulqdq xmm6, xmm10, 0x0
pclmulqdq xmm8, xmm10 , 0x11
pclmulqdq xmm7, xmm10, 0x0
pclmulqdq xmm13, xmm10 , 0x11
pxor xmm6, xmm9
xorps xmm6, xmm8
pxor xmm7, xmm12
xorps xmm7, xmm13
sub arg3, 128
; check if there is another 128B in the buffer to be able to fold
jge _fold_128_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
add arg2, 128
; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
; the 128 bytes of folded data are in the 8 xmm registers xmm0 - xmm7
; fold the 8 xmm registers to 1 xmm register with different constants
; xmm0..xmm6 are each multiplied by their own constant pair and XORed into xmm7
; xmm0 -> xmm7 (rk9, rk10)
movdqa xmm10, [rk9]
movdqa xmm8, xmm0
pclmulqdq xmm0, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
xorps xmm7, xmm0
; xmm1 -> xmm7 (rk11, rk12)
movdqa xmm10, [rk11]
movdqa xmm8, xmm1
pclmulqdq xmm1, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
xorps xmm7, xmm1
; xmm2 -> xmm7 (rk13, rk14)
movdqa xmm10, [rk13]
movdqa xmm8, xmm2
pclmulqdq xmm2, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
pxor xmm7, xmm2
; xmm3 -> xmm7 (rk15, rk16)
movdqa xmm10, [rk15]
movdqa xmm8, xmm3
pclmulqdq xmm3, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
xorps xmm7, xmm3
; xmm4 -> xmm7 (rk17, rk18)
movdqa xmm10, [rk17]
movdqa xmm8, xmm4
pclmulqdq xmm4, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
pxor xmm7, xmm4
; xmm5 -> xmm7 (rk19, rk20)
movdqa xmm10, [rk19]
movdqa xmm8, xmm5
pclmulqdq xmm5, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
xorps xmm7, xmm5
; xmm6 -> xmm7 (rk1, rk2); xmm10 keeps rk1/rk2 for the 16-byte loop below
movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
;imm value of pclmulqdq instruction will determine which constant to use
movdqa xmm8, xmm6
pclmulqdq xmm6, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
pxor xmm7, xmm6
; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
; instead of a cmp instruction, we use the negative flag with the jl instruction
add arg3, 128-16
jl _final_reduction_for_128
; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
; we can fold 16 bytes at a time if y>=16
; continue folding 16B at a time
; (also entered from _less_than_256 with xmm7, xmm10 and xmm11 already set up)
_16B_reduction_loop:
movdqa xmm8, xmm7
pclmulqdq xmm7, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
movdqu xmm0, [arg2]
pshufb xmm0, xmm11
pxor xmm7, xmm0
add arg2, 16
sub arg3, 16
; instead of a cmp instruction, we utilize the flags with the jge instruction
; equivalent of: cmp arg3, 16-16
; check if there is any more 16B in the buffer to be able to fold
jge _16B_reduction_loop
;now we have 16+z bytes left to reduce, where 0<= z < 16.
;first, we reduce the data in the xmm7 register
_final_reduction_for_128:
; check if any more data to fold. If not, compute the CRC of the final 128 bits
; (arg3 becomes the number of remaining tail bytes, 0..15)
add arg3, 16
je _128_done
; here we are getting data that is less than 16 bytes.
; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
; after that the registers need to be adjusted.
; (also entered from _less_than_32 for 16 < len < 32)
_get_last_two_xmms:
movdqa xmm2, xmm7
; load the last 16 bytes of the buffer; this overlaps data already consumed
movdqu xmm1, [arg2 - 16 + arg3]
pshufb xmm1, xmm11
; get rid of the extra data that was loaded before
; load the shift constant
lea rax, [pshufb_shf_table + 16]
sub rax, arg3
movdqu xmm0, [rax]
; shift xmm2 to the left by arg3 bytes
pshufb xmm2, xmm0
; shift xmm7 to the right by 16-arg3 bytes
pxor xmm0, [mask1]
pshufb xmm7, xmm0
; merge the shifted accumulator bytes (xmm2) into the tail block (xmm1),
; selecting per byte on the sign bit of the control mask in xmm0
pblendvb xmm1, xmm2 ;xmm0 is implicit
; fold 16 Bytes
movdqa xmm2, xmm1
movdqa xmm8, xmm7
pclmulqdq xmm7, xmm10, 0x11
pclmulqdq xmm8, xmm10, 0x0
pxor xmm7, xmm8
pxor xmm7, xmm2
_128_done:
; compute crc of a 128-bit value
movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
movdqa xmm0, xmm7
;64b fold
pclmulqdq xmm7, xmm10, 0x1
pslldq xmm0, 8
pxor xmm7, xmm0
;32b fold
movdqa xmm0, xmm7
; mask2 clears the top 32 bits of the copy in xmm0
pand xmm0, [mask2]
psrldq xmm7, 12
pclmulqdq xmm7, xmm10, 0x10
pxor xmm7, xmm0
;barrett reduction
; (the 1..3 byte paths jump here directly with the value already in xmm7)
_barrett:
movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
movdqa xmm0, xmm7
pclmulqdq xmm7, xmm10, 0x01
pslldq xmm7, 4
pclmulqdq xmm7, xmm10, 0x11
pslldq xmm7, 4
pxor xmm7, xmm0
; the result is taken from dword 1 (bits 63:32) of xmm7
pextrd eax, xmm7,1
; common exit: eax holds the CRC before the final inversion
_cleanup:
not eax
%ifidn __OUTPUT_FORMAT__, win64
; restore the xmm registers saved in the prologue
movdqa xmm6, [rsp + XMM_SAVE + 16*0]
movdqa xmm7, [rsp + XMM_SAVE + 16*1]
movdqa xmm8, [rsp + XMM_SAVE + 16*2]
movdqa xmm9, [rsp + XMM_SAVE + 16*3]
movdqa xmm10, [rsp + XMM_SAVE + 16*4]
movdqa xmm11, [rsp + XMM_SAVE + 16*5]
movdqa xmm12, [rsp + XMM_SAVE + 16*6]
movdqa xmm13, [rsp + XMM_SAVE + 16*7]
%endif
add rsp,VARIABLE_OFFSET
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
_less_than_256:
; check if there is enough buffer to be able to fold 16B at a time
cmp arg3, 32
jl _less_than_32
movdqa xmm11, [SHUF_MASK]
; if there is, load the constants
movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
movd xmm0, arg1_low32 ; get the initial crc value
pslldq xmm0, 12 ; align it to its correct place
movdqu xmm7, [arg2] ; load the plaintext
pshufb xmm7, xmm11 ; byte-reflect the plaintext
pxor xmm7, xmm0
; update the buffer pointer
add arg2, 16
; update the counter. subtract 32 instead of 16 to save one instruction from the loop
sub arg3, 32
jmp _16B_reduction_loop
align 16
_less_than_32:
; mov initial crc to the return value. this is necessary for zero-length buffers.
; (arg1_low32 was inverted on entry and _cleanup inverts eax again)
mov eax, arg1_low32
test arg3, arg3
je _cleanup
movdqa xmm11, [SHUF_MASK]
movd xmm0, arg1_low32 ; get the initial crc value
pslldq xmm0, 12 ; align it to its correct place
cmp arg3, 16
je _exact_16_left
jl _less_than_16_left
; 16 < len < 32: take the first 16 bytes, the rest is handled as a tail
movdqu xmm7, [arg2] ; load the plaintext
pshufb xmm7, xmm11 ; byte-reflect the plaintext
pxor xmm7, xmm0 ; xor the initial crc value
add arg2, 16
sub arg3, 16
movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
jmp _get_last_two_xmms
align 16
_less_than_16_left:
; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
; r11 is the write cursor into the 16-byte scratch area at [rsp]
pxor xmm1, xmm1
mov r11, rsp
movdqa [r11], xmm1
cmp arg3, 4
jl _only_less_than_4
; backup the counter value
mov r9, arg3
; copy the input in chunks of 8, 4, 2 and 1 bytes so that no byte beyond
; the end of the buffer is read
cmp arg3, 8
jl _less_than_8_left
; load 8 Bytes
mov rax, [arg2]
mov [r11], rax
add r11, 8
sub arg3, 8
add arg2, 8
_less_than_8_left:
cmp arg3, 4
jl _less_than_4_left
; load 4 Bytes
mov eax, [arg2]
mov [r11], eax
add r11, 4
sub arg3, 4
add arg2, 4
_less_than_4_left:
cmp arg3, 2
jl _less_than_2_left
; load 2 Bytes
mov ax, [arg2]
mov [r11], ax
add r11, 2
sub arg3, 2
add arg2, 2
_less_than_2_left:
cmp arg3, 1
jl _zero_left
; load 1 Byte
mov al, [arg2]
mov [r11], al
_zero_left:
movdqa xmm7, [rsp]
pshufb xmm7, xmm11
pxor xmm7, xmm0 ; xor the initial crc value
; shl r9, 4
; pick the shuffle control for the original length saved in r9 and flip it
; with mask1 before applying it to xmm7
lea rax, [pshufb_shf_table + 16]
sub rax, r9
movdqu xmm0, [rax]
pxor xmm0, [mask1]
pshufb xmm7, xmm0
jmp _128_done
align 16
_exact_16_left:
movdqu xmm7, [arg2]
pshufb xmm7, xmm11
pxor xmm7, xmm0 ; xor the initial crc value
jmp _128_done
; 1..3 byte inputs: copy byte by byte into the zeroed scratch area, apply a
; fixed byte shift for the length and go straight to the Barrett reduction
_only_less_than_4:
cmp arg3, 3
jl _only_less_than_3
; load 3 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
mov al, [arg2+2]
mov [r11+2], al
movdqa xmm7, [rsp]
pshufb xmm7, xmm11
pxor xmm7, xmm0 ; xor the initial crc value
psrldq xmm7, 5
jmp _barrett
_only_less_than_3:
cmp arg3, 2
jl _only_less_than_2
; load 2 Bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
movdqa xmm7, [rsp]
pshufb xmm7, xmm11
pxor xmm7, xmm0 ; xor the initial crc value
psrldq xmm7, 6
jmp _barrett
_only_less_than_2:
; load 1 Byte
mov al, [arg2]
mov [r11], al
movdqa xmm7, [rsp]
pshufb xmm7, xmm11
pxor xmm7, xmm0 ; xor the initial crc value
psrldq xmm7, 7
jmp _barrett
section .data
; Read-only tables used by crc32_ieee_01: folding constants (rk1..rk20),
; byte masks, the byte-reversal shuffle mask and the pshufb shift table.
; precomputed constants
; The constants are loaded in 16-byte pairs with movdqa (rk1, rk3, rk5, rk7,
; rk9, ... rk19), so each odd-numbered label must stay 16-byte aligned and
; be immediately followed by its even-numbered partner.
;   rk1/rk2   - 16-byte fold, and the last step of the 8-to-1 register fold
;   rk3/rk4   - 128-byte fold loop
;   rk5/rk6   - reduction of the final 128-bit value
;   rk7/rk8   - Barrett reduction
;   rk9..rk20 - folding xmm0..xmm5 into xmm7 after the 128-byte loop
; NOTE(review): rk8 matches the CRC-32 IEEE generator polynomial 0x04C11DB7
; with its leading x^32 term; the derivation of the other values is not
; shown in this file.
align 16
rk1 :
DQ 0xf200aa6600000000
rk2 :
DQ 0x17d3315d00000000
rk3 :
DQ 0x022ffca500000000
rk4 :
DQ 0x9d9ee22f00000000
rk5 :
DQ 0xf200aa6600000000
rk6 :
DQ 0x490d678d00000000
rk7 :
DQ 0x0000000104d101df
rk8 :
DQ 0x0000000104c11db7
rk9 :
DQ 0x6ac7e7d700000000
rk10 :
DQ 0xfcd922af00000000
rk11 :
DQ 0x34e45a6300000000
rk12 :
DQ 0x8762c1f600000000
rk13 :
DQ 0x5395a0ea00000000
rk14 :
DQ 0x54f2d5c700000000
rk15 :
DQ 0xd3504ec700000000
rk16 :
DQ 0x57a8445500000000
rk17 :
DQ 0xc053585d00000000
rk18 :
DQ 0x766f1b7800000000
rk19 :
DQ 0xcd8c54b500000000
rk20 :
DQ 0xab40b71e00000000
; mask1: XORed into a pshufb control vector to flip bit 7 of every control
; byte, which turns a "shift left" control into the matching "shift right" one.
mask1:
dq 0x8080808080808080, 0x8080808080808080
; mask2: keeps the low 96 bits of a register (clears the top 32 bits).
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; SHUF_MASK: pshufb control that reverses the order of the 16 bytes.
SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607
pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; The code reads 16 bytes at (pshufb_shf_table + 16 - n) with an unaligned
; load, so the 32 bytes below act as a sliding window of control masks.
; Control bytes with bit 7 set make pshufb write zero to that lane.
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
;;; func core, ver, snum
slversion crc32_ieee_01, 01, 06, 0011

561
crc/crc32_ieee_by4.asm Normal file
View File

@ -0,0 +1,561 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Function API:
; UINT32 crc32_ieee_by4(
; UINT32 init_crc, //initial CRC value, 32 bits
; const unsigned char *buf, //buffer pointer to calculate CRC on
; UINT64 len //buffer length in bytes (64-bit data)
; );
;
; Authors:
; Erdinc Ozturk
; Vinodh Gopal
; James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
; URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
%include "reg_sizes.asm"
[bits 64]
default rel
section .text
; Map the three function arguments onto registers for the current output
; format: win64 builds use rcx/rdx/r8, every other format uses rdi/rsi/rdx.
; arg1_low32 is the 32-bit view of arg1 (the initial CRC value).
%ifidn __OUTPUT_FORMAT__, win64
%xdefine arg1 rcx
%xdefine arg2 rdx
%xdefine arg3 r8
%xdefine arg1_low32 ecx
%else
%xdefine arg1 rdi
%xdefine arg2 rsi
%xdefine arg3 rdx
%xdefine arg1_low32 edi
%endif
; Stack frame. VARIABLE_OFFSET is the number of bytes subtracted from rsp on
; entry; the 16 bytes at [rsp] are used as scratch for short inputs.
; XMM_SAVE (win64 only) is the offset from rsp at which xmm6 and xmm7 are
; saved, which is why the win64 frame is two 16-byte slots larger.
%ifidn __OUTPUT_FORMAT__, win64
%define XMM_SAVE 16*2
%define VARIABLE_OFFSET 16*4+8
%else
%define VARIABLE_OFFSET 16*2+8
%endif
; crc32_ieee_by4 -- main path (buffers of 128 bytes or more) and common tail.
;   arg1_low32 = initial crc, arg2 = buffer pointer, arg3 = length in bytes.
;   Result is returned in eax.
; The initial crc is bit-inverted on entry and the result is bit-inverted
; again at _cleanup.
; Register roles on this path: xmm0-xmm3 hold the data being folded,
; xmm4/xmm5 are scratch, xmm6 holds the current 16-byte pair of fold
; constants and xmm7 holds SHUF_MASK.
align 16
global crc32_ieee_by4:function
crc32_ieee_by4:
not arg1_low32
sub rsp,VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
; save xmm6 and xmm7 on the stack; they are restored at _cleanup
movdqa [rsp + XMM_SAVE + 16*0],xmm6
movdqa [rsp + XMM_SAVE + 16*1],xmm7
%endif
; check if smaller than 128B (signed compare); shorter buffers are handled
; by the _less_than_128 family of paths
cmp arg3, 128
jl _less_than_128
; load the initial crc value
movd xmm6, arg1_low32 ; initial crc
; crc value does not need to be byte-reflected, but it needs to be
; moved to the high part of the register.
; because data will be byte-reflected and will align with initial
; crc at correct place.
pslldq xmm6, 12
; SHUF_MASK reverses the byte order of a 16-byte block when used with pshufb
movdqa xmm7, [SHUF_MASK]
; receive the initial 64B data, xor the initial crc value
movdqu xmm0, [arg2]
movdqu xmm1, [arg2+16]
movdqu xmm2, [arg2+32]
movdqu xmm3, [arg2+48]
pshufb xmm0, xmm7
; XOR the initial_crc value
pxor xmm0, xmm6
pshufb xmm1, xmm7
pshufb xmm2, xmm7
pshufb xmm3, xmm7
; the 16-byte load picks up rk3 (low qword) and rk4 (high qword) together
movdqa xmm6, [rk3] ; k3=2^480 mod POLY << 32
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;we subtract 128 instead of 64 to save one instruction from the loop
sub arg3, 128
; at this section of the code, there is 64*x+y (0<=y<64) bytes of
; buffer. The _fold_64_B_loop loop will fold 64B at a time until we
; have 64+y Bytes of buffer
; fold 64B at a time. This section of the code folds 4 xmm registers in parallel
; Each register is multiplied twice by xmm6: immediate 0x11 multiplies the two
; high qwords, 0x0 the two low qwords; both products are XORed together and
; then XORed with the next (byte-reversed) 16 bytes of input.
_fold_64_B_loop:
;update the buffer pointer
add arg2, 64
movdqa xmm4, xmm0
movdqa xmm5, xmm1
pclmulqdq xmm0, xmm6 , 0x11
pclmulqdq xmm1, xmm6 , 0x11
pclmulqdq xmm4, xmm6, 0x0
pclmulqdq xmm5, xmm6, 0x0
pxor xmm0, xmm4
pxor xmm1, xmm5
movdqa xmm4, xmm2
movdqa xmm5, xmm3
pclmulqdq xmm2, xmm6, 0x11
pclmulqdq xmm3, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pclmulqdq xmm5, xmm6, 0x0
pxor xmm2, xmm4
pxor xmm3, xmm5
movdqu xmm4, [arg2]
movdqu xmm5, [arg2+16]
pshufb xmm4, xmm7
pshufb xmm5, xmm7
pxor xmm0, xmm4
pxor xmm1, xmm5
movdqu xmm4, [arg2+32]
movdqu xmm5, [arg2+48]
pshufb xmm4, xmm7
pshufb xmm5, xmm7
pxor xmm2, xmm4
pxor xmm3, xmm5
sub arg3, 64
; check if there is another 64B in the buffer to be able to fold
jge _fold_64_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
add arg2, 64
;at this point, the arg2 is pointing at the last y Bytes of the buffer
; the 64B of data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
; the 16-byte load picks up rk1 (low qword) and rk2 (high qword) together
movdqa xmm6, [rk1] ;k1
; fold the 4 xmm registers to 1 xmm register with different constants
; xmm0 is folded into xmm1, xmm1 into xmm2 and xmm2 into xmm3
movdqa xmm4, xmm0
pclmulqdq xmm0, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pxor xmm1, xmm4
xorps xmm1, xmm0
movdqa xmm4, xmm1
pclmulqdq xmm1, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pxor xmm2, xmm4
xorps xmm2, xmm1
movdqa xmm4, xmm2
pclmulqdq xmm2, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pxor xmm3, xmm4
pxor xmm3, xmm2
;instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
; instead of a cmp instruction, we use the negative flag with the jl instruction
add arg3, 64-16
jl _final_reduction_for_128
; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm3 and the rest is in memory
; we can fold 16 bytes at a time if y>=16
; continue folding 16B at a time
; (this label is also entered from _less_than_128 with xmm3, xmm6 and xmm7
; already set up)
_16B_reduction_loop:
movdqa xmm4, xmm3
pclmulqdq xmm3, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pxor xmm3, xmm4
movdqu xmm0, [arg2]
pshufb xmm0, xmm7
pxor xmm3, xmm0
add arg2, 16
sub arg3, 16
; instead of a cmp instruction, we utilize the flags with the jge instruction
; equivalent of: cmp arg3, 16-16
; check if there is any more 16B in the buffer to be able to fold
jge _16B_reduction_loop
;now we have 16+z bytes left to reduce, where 0<= z < 16.
;first, we reduce the data in the xmm3 register
_final_reduction_for_128:
; check if any more data to fold. If not, compute the CRC of the final 128 bits
; after this add, arg3 = z (the number of trailing bytes still in memory)
add arg3, 16
je _128_done
; here we are getting data that is less than 16 bytes.
; since we know that there was data before the pointer, we can offset
; the input pointer before the actual point, to receive exactly 16 bytes.
; after that the registers need to be adjusted.
; (this label is also entered from _less_than_32 for lengths 17..31)
_get_last_two_xmms:
movdqa xmm2, xmm3
; load the last 16 bytes of the buffer; the first 16-z of them were already consumed
movdqu xmm1, [arg2 - 16 + arg3]
pshufb xmm1, xmm7
; select row (15 - z) of pshufb_shf_table: each row is 16 bytes, so z is
; scaled by 16 and subtracted from the address one row past the table end
shl arg3, 4
lea rax, [pshufb_shf_table + 15*16]
sub rax, arg3
movdqu xmm0, [rax]
; xmm2 = accumulator shuffled with the table row as-is
pshufb xmm2, xmm0
; flipping bit 7 of every control byte (mask3 is all 0x80) swaps which
; lanes pshufb zeroes, giving the complementary shift for xmm3
pxor xmm0, [mask3]
pshufb xmm3, xmm0
pblendvb xmm1, xmm2 ;xmm0 is implicit
movdqa xmm2, xmm1
movdqa xmm4, xmm3
pclmulqdq xmm3, xmm6, 0x11
pclmulqdq xmm4, xmm6, 0x0
pxor xmm3, xmm4
pxor xmm3, xmm2
; final reduction of the 128-bit value in xmm3
_128_done:
; the 16-byte load picks up rk5 (low qword) and rk6 (high qword) together
movdqa xmm6, [rk5]
movdqa xmm0, xmm3
;64b fold
; immediate 0x1: high qword of xmm3 times low qword of xmm6 (rk5)
pclmulqdq xmm3, xmm6, 0x1
pslldq xmm0, 8
pxor xmm3, xmm0
;32b fold
movdqa xmm0, xmm3
; mask4 clears the top 32 bits and keeps the low 96
pand xmm0, [mask4]
psrldq xmm3, 12
; immediate 0x10: low qword of xmm3 times high qword of xmm6 (rk6)
pclmulqdq xmm3, xmm6, 0x10
pxor xmm3, xmm0
;barrett reduction
; (this label is also entered directly by the 1..3 byte paths)
_barrett:
; the 16-byte load picks up rk7 (low qword) and rk8 (high qword) together
movdqa xmm6, [rk7]
movdqa xmm0, xmm3
pclmulqdq xmm3, xmm6, 0x01
pslldq xmm3, 4
pclmulqdq xmm3, xmm6, 0x11
pslldq xmm3, 4
pxor xmm3, xmm0
; the 32-bit result is dword 1 of xmm3
pextrd eax, xmm3,1
; common exit: invert the result, restore saved registers and the stack
_cleanup:
not eax
%ifidn __OUTPUT_FORMAT__, win64
movdqa xmm6, [rsp + XMM_SAVE + 16*0]
movdqa xmm7, [rsp + XMM_SAVE + 16*1]
%endif
add rsp,VARIABLE_OFFSET
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Entry for buffers shorter than 128 bytes. Lengths of 32..127 bytes skip the
; 64-byte folding stage: the first 16 bytes are loaded into xmm3 (with the
; initial crc XORed into the top 32 bits) and control joins
; _16B_reduction_loop. Lengths below 32 go to _less_than_32.
align 16
_less_than_128:
;check if there is enough buffer to be able to fold 16B at a time
cmp arg3, 32
jl _less_than_32
movdqa xmm7, [SHUF_MASK]
;if there is, load the constants
movdqa xmm6, [rk1] ;k1
; place the initial crc in the top 32 bits of xmm0
movd xmm0, arg1_low32
pslldq xmm0, 12
movdqu xmm3, [arg2]
pshufb xmm3, xmm7
pxor xmm3, xmm0
;update the buffer pointer
add arg2, 16
;update the counter. subtract 32 instead of 16 to save one instruction from the loop
sub arg3, 32
jmp _16B_reduction_loop
; Buffers of 0..31 bytes.
;   len == 0     : return the (re-inverted) initial crc via _cleanup
;   len == 16    : _exact_16_left
;   len <  16    : _less_than_16_left
;   len 17..31   : load the first 16 bytes here, then let _get_last_two_xmms
;                  merge in the remaining len-16 bytes
align 16
_less_than_32:
; eax holds the inverted initial crc; _cleanup inverts it back when len is 0
mov eax, arg1_low32
test arg3, arg3
je _cleanup
movdqa xmm7, [SHUF_MASK]
; xmm0 = initial crc in the top 32 bits; used by all the short paths below
movd xmm0, arg1_low32
pslldq xmm0, 12
cmp arg3, 16
je _exact_16_left
jl _less_than_16_left
; NOTE(review): the next two instructions repeat the xmm0 setup done just above
movd xmm0, arg1_low32
pslldq xmm0, 12
movdqu xmm3, [arg2]
pshufb xmm3, xmm7
pxor xmm3, xmm0
add arg2, 16
; arg3 = number of bytes beyond the first 16 (1..15)
sub arg3, 16
movdqa xmm6, [rk1] ;k1
jmp _get_last_two_xmms
; Buffers of 1..15 bytes. The input is copied into a zeroed 16-byte scratch
; area at [rsp] in chunks of 8, 4, 2 and 1 bytes, so that no load touches
; memory outside the caller's buffer. Lengths 1..3 branch off to
; _only_less_than_4. On entry xmm0 holds the initial crc in its top 32 bits
; and xmm7 holds SHUF_MASK.
align 16
_less_than_16_left:
; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
pxor xmm1, xmm1
; r11 = write cursor into the scratch area
mov r11, rsp
movdqa [r11], xmm1
cmp arg3, 4
jl _only_less_than_4
; r9 keeps the original length; arg3 is consumed by the copy below
mov r9, arg3
cmp arg3, 8
jl _less_than_8_left
mov rax, [arg2]
mov [r11], rax
add r11, 8
sub arg3, 8
add arg2, 8
_less_than_8_left:
cmp arg3, 4
jl _less_than_4_left
mov eax, [arg2]
mov [r11], eax
add r11, 4
sub arg3, 4
add arg2, 4
_less_than_4_left:
cmp arg3, 2
jl _less_than_2_left
mov ax, [arg2]
mov [r11], ax
add r11, 2
sub arg3, 2
add arg2, 2
_less_than_2_left:
cmp arg3, 1
jl _zero_left
mov al, [arg2]
mov [r11], al
_zero_left:
movdqa xmm3, [rsp]
pshufb xmm3, xmm7
pxor xmm3, xmm0
; select row (15 - len) of pshufb_shf_table (16 bytes per row)
shl r9, 4
lea rax, [pshufb_shf_table + 15*16]
sub rax, r9
movdqu xmm0, [rax]
; flipping bit 7 of every control byte (mask3 is all 0x80) turns the row
; into its "shr" form as labelled in the table comments
pxor xmm0, [mask3]
pshufb xmm3, xmm0
jmp _128_done
; Buffer of exactly 16 bytes: load it, reverse the byte order, XOR in the
; initial crc (xmm0, top 32 bits) and go straight to the final reduction.
align 16
_exact_16_left:
movdqu xmm3, [arg2]
pshufb xmm3, xmm7
pxor xmm3, xmm0
jmp _128_done
; Buffers of 1..3 bytes. Reached from _less_than_16_left with r11 = rsp
; pointing at a zeroed 16-byte scratch area, xmm0 = initial crc in the top
; 32 bits and xmm7 = SHUF_MASK. The bytes are copied one at a time, the
; block is byte-reversed, the crc is XORed in, and the value is shifted
; right by 5, 6 or 7 bytes (for 3, 2 or 1 input bytes) before jumping
; directly to _barrett, skipping the 64-bit and 32-bit folds.
_only_less_than_4:
cmp arg3, 3
jl _only_less_than_3
; exactly 3 bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
mov al, [arg2+2]
mov [r11+2], al
movdqa xmm3, [rsp]
pshufb xmm3, xmm7
pxor xmm3, xmm0
psrldq xmm3, 5
jmp _barrett
_only_less_than_3:
cmp arg3, 2
jl _only_less_than_2
; exactly 2 bytes
mov al, [arg2]
mov [r11], al
mov al, [arg2+1]
mov [r11+1], al
movdqa xmm3, [rsp]
pshufb xmm3, xmm7
pxor xmm3, xmm0
psrldq xmm3, 6
jmp _barrett
_only_less_than_2:
; exactly 1 byte (length 0 was already handled in _less_than_32)
mov al, [arg2]
mov [r11], al
movdqa xmm3, [rsp]
pshufb xmm3, xmm7
pxor xmm3, xmm0
psrldq xmm3, 7
jmp _barrett
; precomputed constants
;
; The rk constants are always loaded 16 bytes at a time with movdqa from
; [rk1], [rk3], [rk5] and [rk7], so each odd/even pair (rk1:rk2, rk3:rk4,
; rk5:rk6, rk7:rk8) must stay adjacent and 16-byte aligned. The odd constant
; lands in the low qword of the register, the even one in the high qword.
section .data
align 16
; rk1:rk2 -- used for the 16-byte folds (4-to-1 fold and _16B_reduction_loop)
rk1:
DQ 0xf200aa6600000000
rk2:
DQ 0x17d3315d00000000
; rk3:rk4 -- used by _fold_64_B_loop (the code comments rk3 as
; "2^480 mod POLY << 32")
rk3:
DQ 0xd3504ec700000000
rk4:
DQ 0x57a8445500000000
; rk5:rk6 -- used by the 64-bit and 32-bit folds in _128_done
; (rk5 has the same value as rk1)
rk5:
DQ 0xf200aa6600000000
rk6:
DQ 0x490d678d00000000
; rk7:rk8 -- used by the Barrett reduction in _barrett
; (rk8 matches the 33-bit IEEE CRC-32 generator polynomial 0x104c11db7;
; presumably rk7 is the corresponding Barrett quotient constant -- confirm
; against the referenced paper)
rk7:
DQ 0x0000000104d101df
rk8:
DQ 0x0000000104c11db7
; mask and mask2 are not referenced by the code in this file
mask:
dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2:
dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
; mask3: XORed into a pshufb control vector to flip bit 7 of every byte
mask3:
dq 0x8080808080808080, 0x8080808080808080
; mask4: keeps the low 96 bits and clears the top 32 bits
mask4:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
align 32
; pshufb control vectors for byte shifts, one 16-byte row per shift amount.
; The code addresses a row as [pshufb_shf_table + 15*16 - 16*n]. Used as-is a
; row performs the "shl" shift named in its comment; XORed with mask3 it
; performs the "shr" shift instead.
pshufb_shf_table:
dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; pshufb control vector that reverses the byte order of a 16-byte block.
; It is loaded with movdqa, so it relies on the 15 rows above (240 bytes)
; keeping it 16-byte aligned.
SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607
;;; func core, ver, snum
slversion crc32_ieee_by4, 05, 02, 0017

86
crc/crc32_ieee_perf.c Normal file
View File

@ -0,0 +1,86 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/time.h>
#include "crc.h"
#include "test.h"
// Benchmark configuration.
// Define CACHED_TEST to time many passes over a small buffer ("warm" run);
// leave it undefined to time fewer passes over a large buffer ("cold" run).
// Expression macros are parenthesised so they expand safely inside larger
// expressions (e.g. x / TEST_LEN, or a cast applied to TEST_LEN).
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_LEN     (8*1024)
# define TEST_LOOPS   400000
# define TEST_TYPE_STR "_warm"
#else
// Uncached test.  Pull from large mem base.
# define GT_L3_CACHE  (32*1024*1024)	/* some number > last level cache */
# define TEST_LEN     (2 * GT_L3_CACHE)
# define TEST_LOOPS   100
# define TEST_TYPE_STR "_cold"
#endif
// Initial CRC value passed to every call; may be overridden at build time
#ifndef TEST_SEED
# define TEST_SEED 0x1234
#endif
#define TEST_MEM TEST_LEN
// Times TEST_LOOPS calls of crc32_ieee() over a TEST_LEN-byte buffer and
// prints the result via perf_print(). Returns 0 on success, -1 if the
// buffer cannot be allocated.
int main(int argc, char *argv[])
{
	int i;
	void *buf;
	uint32_t crc;
	struct perf start, stop;

	printf("crc32_ieee_perf:\n");

	if (posix_memalign(&buf, 1024, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}

	printf("Start timed tests\n");
	fflush(0);

	// posix_memalign() returns uninitialised memory; give the buffer
	// defined contents so the CRC printed at the end is reproducible.
	memset(buf, 0, TEST_LEN);

	// One untimed call before the measurement starts
	crc = crc32_ieee(TEST_SEED, buf, TEST_LEN);

	perf_start(&start);
	for (i = 0; i < TEST_LOOPS; i++) {
		crc = crc32_ieee(TEST_SEED, buf, TEST_LEN);
	}
	perf_stop(&stop);

	printf("crc32_ieee" TEST_TYPE_STR ": ");
	// i == TEST_LOOPS here, so this is the total number of bytes processed
	perf_print(stop, start, (long long)TEST_LEN * i);

	// Printing the result keeps the timed calls from being optimised away
	printf("finish 0x%x\n", crc);

	free(buf);
	return 0;
}

174
crc/crc32_ieee_test.c Normal file
View File

@ -0,0 +1,174 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "crc.h"
#include "types.h"
// Seed for srand() and initial CRC value for the fixed-pattern tests;
// may be overridden at build time.
#ifndef TEST_SEED
# define TEST_SEED 0x1234
#endif
// MAX_BUF is the size in bytes of one test block (and the alignment of the
// allocation); TEST_SIZE is the number of such blocks in the test buffer.
#define MAX_BUF 512
#define TEST_SIZE 20
// Short fixed-width aliases; only u32 is used in this file.
typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
// Fill the first buffer_size bytes of buf with values drawn from rand(),
// one rand() call per byte, in ascending address order. A buffer_size of
// zero or less leaves buf untouched.
void rand_buffer(unsigned char *buf, long buffer_size)
{
	unsigned char *cursor = buf;
	long remaining = buffer_size;

	while (remaining > 0) {
		*cursor = rand();
		cursor++;
		remaining--;
	}
}
// Compares crc32_ieee() against the reference crc32_ieee_base() over fixed
// patterns, random data, every length from MAX_BUF down to 0, several seeds,
// and short buffers that end exactly at the end of the allocation.
// Any command-line argument enables verbose output.
// Returns 0 if every comparison matched, 1 if any failed, -1 on allocation
// failure.
int main(int argc, char *argv[])
{
	int fail = 0;
	u32 r;
	int verbose = argc - 1;
	int i, s, ret;
	void *buf_alloc;
	unsigned char *buf;

	printf("Test crc32_ieee ");

	// Align to MAX_BUF boundary
	ret = posix_memalign(&buf_alloc, MAX_BUF, MAX_BUF * TEST_SIZE);
	if (ret) {
		printf("alloc error: Fail");
		return -1;
	}
	buf = (unsigned char *)buf_alloc;
	srand(TEST_SEED);

	// Test of all zeros
	memset(buf, 0, MAX_BUF * 10);
	u32 crc = crc32_ieee(TEST_SEED, buf, MAX_BUF);
	u32 crc_ref = crc32_ieee_base(TEST_SEED, buf, MAX_BUF);
	if (crc != crc_ref) {
		fail++;
		printf("\n opt ref\n");
		printf(" ------ ------\n");
		printf("crc zero = 0x%8x 0x%8x \n", crc, crc_ref);
	} else
		printf(".");

	// Another simple test pattern
	memset(buf, 0x8a, MAX_BUF);
	crc = crc32_ieee(TEST_SEED, buf, MAX_BUF);
	crc_ref = crc32_ieee_base(TEST_SEED, buf, MAX_BUF);
	if (crc != crc_ref)
		fail++;
	if (verbose)
		printf("crc all 8a = 0x%8x 0x%8x\n", crc, crc_ref);
	else
		printf(".");

	// Do a few random tests
	r = rand();
	rand_buffer(buf, MAX_BUF * TEST_SIZE);

	for (i = 0; i < TEST_SIZE; i++) {
		crc = crc32_ieee(r, buf, MAX_BUF);
		crc_ref = crc32_ieee_base(r, buf, MAX_BUF);
		if (crc != crc_ref)
			fail++;
		if (verbose)
			printf("crc rand%3d = 0x%8x 0x%8x\n", i, crc, crc_ref);
		else
			printf(".");
		buf += MAX_BUF;
	}

	// Do a few random sizes
	buf = (unsigned char *)buf_alloc;	//reset buf
	r = rand();

	for (i = MAX_BUF; i >= 0; i--) {
		crc = crc32_ieee(r, buf, i);
		crc_ref = crc32_ieee_base(r, buf, i);
		if (crc != crc_ref) {
			fail++;
			printf("fail random size%i 0x%8x 0x%8x\n", i, crc, crc_ref);
		} else
			printf(".");
	}

	// Try different seeds
	for (s = 0; s < 20; s++) {
		buf = (unsigned char *)buf_alloc;	//reset buf

		r = rand();	// just to get a new seed
		rand_buffer(buf, MAX_BUF * TEST_SIZE);	// new pseudo-rand data

		if (verbose)
			printf("seed = 0x%x\n", r);

		for (i = 0; i < TEST_SIZE; i++) {
			crc = crc32_ieee(r, buf, MAX_BUF);
			crc_ref = crc32_ieee_base(r, buf, MAX_BUF);
			if (crc != crc_ref)
				fail++;
			if (verbose)
				printf("crc rand%3d = 0x%8x 0x%8x\n", i, crc, crc_ref);
			else
				printf(".");
			buf += MAX_BUF;
		}
	}

	// Run tests at end of buffer
	buf = (unsigned char *)buf_alloc;	//reset buf
	// (MAX_BUF - 1) * TEST_SIZE == MAX_BUF * TEST_SIZE - TEST_SIZE, i.e. the
	// last TEST_SIZE bytes of the allocation
	buf = buf + ((MAX_BUF - 1) * TEST_SIZE);	//Line up TEST_SIZE from end
	for (i = 0; i < TEST_SIZE; i++) {
		crc = crc32_ieee(TEST_SEED, buf + i, TEST_SIZE - i);
		crc_ref = crc32_ieee_base(TEST_SEED, buf + i, TEST_SIZE - i);
		if (crc != crc_ref)
			fail++;
		if (verbose)
			printf("crc eob rand%3d = 0x%4x 0x%4x\n", i, crc, crc_ref);
		else
			printf(".");
	}

	printf("Test done: %s\n", fail ? "Fail" : "Pass");
	if (fail)
		printf("\nFailed %d tests\n", fail);

	free(buf_alloc);

	// Only the low 8 bits of the exit status reach the parent process, so
	// returning the raw count would report success for exactly 256 (or any
	// multiple of 256) failures. Collapse it to 0/1 instead.
	return fail != 0;
}

653
crc/crc32_iscsi_00.asm Normal file
View File

@ -0,0 +1,653 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Function to compute iscsi CRC32 with table-based recombination
; crc done "by 3" with block sizes 1920, 960, 480, 240
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "reg_sizes.asm"
default rel
; crcB3 MACRO to implement crc32 on 3 %%bSize-byte blocks
;
; Parameters:
;   %1  block size in bytes; one pass consumes 3*%1 bytes
;   %2  byte offset from the table base (held in crc_init) of the lookup
;       table used to merge crc0
;   %3  byte offset of the lookup table used to merge crc1
;
; Expects the caller's defines len, bufptmp, bufp, bufp_dw and crc_init, and
; the running crc in rax. Uses rbx, r9, r10 and r11 as scratch and reuses
; bufp/bufp_dw as a table index. On fall-through rax holds the updated crc
; and bufptmp/len have been advanced. Jumps to do_return when no data
; remains.
;
; The three blocks are CRC'd independently (crc0 continues from rax, crc1
; and crc2 start from 0). crc0 and crc1 are then each translated through
; four byte-indexed table lookups, the results are XORed with the last
; qword of the third block, and a final crc32 step on that value folds
; everything into crc2.
;
; Only the %1 = 640 instantiation loops; for every other size the body runs
; at most once.
%macro crcB3 3
%define %%bSize %1 ; 1/3 of buffer size
%define %%td2 %2 ; table offset for crc0 (2/3 of buffer)
%define %%td1 %3 ; table offset for crc1 (1/3 of buffer)
%IF %%bSize=640
; len is pre-decremented so the loop can test the sign flag; it is restored
; at %%crcB3_end
sub len, %%bSize*3
js %%crcB3_end ;; jump to next level if 3*blockSize > len
%ELSE
cmp len, %%bSize*3
jnae %%crcB3_end ;; jump to next level if 3*blockSize > len
%ENDIF
;;;;;; Calculate CRC of 3 blocks of the buffer ;;;;;;
%%crcB3_loop:
;; rax = crc0 = initial crc
xor rbx, rbx ;; rbx = crc1 = 0;
xor r10, r10 ;; r10 = crc2 = 0;
%assign i 0
%rep %%bSize/8 - 1
crc32 rax, qword [bufptmp+i + 0*%%bSize] ;; update crc0
crc32 rbx, qword [bufptmp+i + 1*%%bSize] ;; update crc1
crc32 r10, qword [bufptmp+i + 2*%%bSize] ;; update crc2
%assign i (i+8)
%endrep
; last qword of each block: crc0 and crc1 are completed here; the last qword
; of the third block is folded in together with the merge value below
crc32 rax, qword [bufptmp+i + 0*%%bSize] ;; update crc0
crc32 rbx, qword [bufptmp+i + 1*%%bSize] ;; update crc1
; SKIP ;crc32 r10, [bufptmp+i + 2*%%bSize] ;; update crc2
; merge in crc0
; each of the four bytes of eax indexes the %%td2 table; the four dword
; results are shifted left by 0, 8, 16 and 24 bits and XORed into r9
movzx bufp_dw, al
mov r9d, [crc_init + bufp*4 + %%td2]
movzx bufp_dw, ah
shr eax, 16
mov r11d, [crc_init + bufp*4 + %%td2]
shl r11, 8
xor r9, r11
movzx bufp_dw, al
mov r11d, [crc_init + bufp*4 + %%td2]
movzx bufp_dw, ah
shl r11, 16
xor r9, r11
mov r11d, [crc_init + bufp*4 + %%td2]
shl r11, 24
xor r9, r11
; merge in crc1
; same scheme with the bytes of ebx and the %%td1 table
movzx bufp_dw, bl
mov r11d, [crc_init + bufp*4 + %%td1]
movzx bufp_dw, bh
shr ebx, 16
xor r9, r11
mov r11d, [crc_init + bufp*4 + %%td1]
shl r11, 8
xor r9, r11
movzx bufp_dw, bl
mov r11d, [crc_init + bufp*4 + %%td1]
movzx bufp_dw, bh
shl r11, 16
xor r9, r11
mov r11d, [crc_init + bufp*4 + %%td1]
shl r11, 24
xor r9, r11
; combine with the last qword of the third block and finish crc2
xor r9, [bufptmp+i + 2*%%bSize]
crc32 r10, r9
mov rax, r10
add bufptmp, %%bSize*3 ;; move to next block
sub len, %%bSize*3
%IF %%bSize=640
jns %%crcB3_loop
%ENDIF
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%%crcB3_end:
%IF %%bSize=640
add len, %%bSize*3
%ENDIF
; ZF here comes from the add above (640 case), or otherwise from the cmp at
; the top (block skipped) or the sub after the block (block executed)
je do_return ;; return if remaining data is zero
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; ISCSI CRC 32 Implementation with crc32 Instruction
;;; unsigned int crc32_iscsi_00(unsigned char * buffer, int len, unsigned int crc_init);
;;;
;;; Argument registers (see the %define block below):
;;;               elf64    other output formats
;;; *buf        = rdi      rcx
;;; len         = rsi      rdx
;;; crc_init    = rdx      r8
;;;
;;; The result is returned in eax. rbx and rdi are saved on entry and
;;; restored on exit; r9, r10 and r11 are used as scratch by crcB3.
;;; NOTE(review): len is declared int in the prototype above but is used as a
;;; full 64-bit register throughout -- confirm callers pass a clean value.
global crc32_iscsi_00:function
crc32_iscsi_00:
%ifidn __OUTPUT_FORMAT__, elf64
%define bufp rdi
%define bufp_dw edi
%define bufp_w di
%define bufp_b dil
%define bufptmp rcx
%define block_0 rcx
%define block_1 r8
%define block_2 r11
%define len rsi
%define len_dw esi
%define len_w si
%define len_b sil
%define crc_init rdx
%define crc_init_dw edx
%else
%define bufp rcx
%define bufp_dw ecx
%define bufp_w cx
%define bufp_b cl
%define bufptmp rdi
%define block_0 rdi
%define block_1 rsi
%define block_2 r11
%define len rdx
%define len_dw edx
%define len_w dx
%define len_b dl
%define crc_init r8
%define crc_init_dw r8d
%endif
push rdi
push rbx
mov rax, crc_init ;; rax = crc_init;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; bufptmp takes over as the working buffer pointer; bufp is reused as the
;; count of bytes needed to reach 8-byte alignment and later as a table index
mov bufptmp, bufp ;; bufptmp = *buf
neg bufp
and bufp, 7 ;; calculate the unalignment
;; amount of the address
je proc_block ;; Skip if aligned
;; an unaligned buffer shorter than 8 bytes is handled byte-exactly in
;; less_than_8, so the 8-byte load below never reads past a short buffer
cmp len, 8
jb less_than_8
;;;; Calculate CRC of unaligned bytes of the buffer (if any) ;;;;
mov rbx, [bufptmp] ;; load a quadword from the buffer
add bufptmp, bufp ;; align buffer pointer for
;; quadword processing
sub len, bufp ;; update buffer length
align_loop:
crc32 eax, bl ;; compute crc32 of 1-byte
shr rbx, 8 ;; get next byte
dec bufp
jne align_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 2) BLOCK LEVEL: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; bufptmp is 8-byte aligned from here on: every later advance is a
;; multiple of 8
proc_block:
cmp len, 240
jb bit8
;; crc_init is no longer needed as a value (it was copied to rax); it is
;; reused as the base address for the crcB3 table lookups
lea crc_init, [mul_table_72] ;; load table base address
crcB3 640, 0x1000, 0x0c00 ; 640*3 = 1920 (Tables 1280, 640)
crcB3 320, 0x0c00, 0x0800 ; 320*3 = 960 (Tables 640, 320)
crcB3 160, 0x0800, 0x0400 ; 160*3 = 480 (Tables 320, 160)
crcB3 80, 0x0400, 0x0000 ; 80*3 = 240 (Tables 160, 80)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;4) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
;; The remaining length is consumed one bit at a time, most significant
;; first: each "shl len_b, 1" moves the next bit into CF (jnc skips the chunk
;; if it is clear) and leaves ZF set when no lower bits remain. The crc32
;; instruction does not modify flags, so the "je do_return" after each chunk
;; still tests the ZF produced by that shl.
bit8:
shl len_b, 1 ;; shift-out MSB (bit-8)
jnc bit7 ;; jump to bit-7 if bit-8 == 0
%assign i 0
%rep 16
crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
%assign i (i+8)
%endrep
je do_return ;; return if remaining data is zero
add bufptmp, 128 ;; buf +=128; (next 128 bytes)
bit7:
shl len_b, 1 ;; shift-out MSB (bit-7)
jnc bit6 ;; jump to bit-6 if bit-7 == 0
%assign i 0
%rep 8
crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
%assign i (i+8)
%endrep
je do_return ;; return if remaining data is zero
add bufptmp, 64 ;; buf +=64; (next 64 bytes)
bit6:
shl len_b, 1 ;; shift-out MSB (bit-6)
jnc bit5 ;; jump to bit-5 if bit-6 == 0
%assign i 0
%rep 4
crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
%assign i (i+8)
%endrep
je do_return ;; return if remaining data is zero
add bufptmp, 32 ;; buf +=32; (next 32 bytes)
bit5:
shl len_b, 1 ;; shift-out MSB (bit-5)
jnc bit4 ;; jump to bit-4 if bit-5 == 0
%assign i 0
%rep 2
crc32 rax, qword [bufptmp+i] ;; compute crc32 of 8-byte data
%assign i (i+8)
%endrep
je do_return ;; return if remaining data is zero
add bufptmp, 16 ;; buf +=16; (next 16 bytes)
bit4:
shl len_b, 1 ;; shift-out MSB (bit-4)
jnc bit3 ;; jump to bit-3 if bit-4 == 0
crc32 rax, qword [bufptmp] ;; compute crc32 of 8-byte data
je do_return ;; return if remaining data is zero
add bufptmp, 8 ;; buf +=8; (next 8 bytes)
bit3:
;; NOTE(review): this loads a full 8 bytes even though only 0..7 bytes
;; remain, so it can read past the logical end of the buffer. bufptmp is
;; 8-byte aligned here, so the load stays inside one aligned quadword.
mov rbx, qword [bufptmp] ;; load a 8-bytes from the buffer:
shl len_b, 1 ;; shift-out MSB (bit-3)
jnc bit2 ;; jump to bit-2 if bit-3 == 0
crc32 eax, ebx ;; compute crc32 of 4-byte data
je do_return ;; return if remaining data is zero
shr rbx, 32 ;; get next 3 bytes
bit2:
shl len_b, 1 ;; shift-out MSB (bit-2)
jnc bit1 ;; jump to bit-1 if bit-2 == 0
crc32 eax, bx ;; compute crc32 of 2-byte data
je do_return ;; return if remaining data is zero
shr rbx, 16 ;; next byte
bit1:
;; after the shifts above only the original bit 0 can still be set in len_b
test len_b,len_b
je do_return
crc32 eax, bl ;; compute crc32 of 1-byte data
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
do_return:
pop rbx
pop rdi
ret
;; Unaligned buffer of fewer than 8 bytes: process 4, 2 and 1 bytes according
;; to the low three bits of len, reading exactly len bytes.
less_than_8:
test len,4
jz less_than_4
crc32 eax, dword[bufptmp]
add bufptmp,4
less_than_4:
test len,2
jz less_than_2
crc32 eax, word[bufptmp]
add bufptmp,2
less_than_2:
test len,1
jz do_return
crc32 rax, byte[bufptmp]
pop rbx
;; NOTE(review): unlike do_return this pops into bufptmp. That is rdi on
;; non-elf64 builds (same effect as do_return); on elf64 bufptmp is rcx, so
;; the value pushed from rdi ends up in rcx and rdi is left modified.
pop bufptmp
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; global mul_table_72, mul_table_152, mul_table_312, mul_table_632, mul_table_1272
section .data
align 8
; Base of the lookup tables used by the crcB3 macro. crc32_iscsi_00 loads
; this address with "lea crc_init, [mul_table_72]" and crcB3 indexes it as
; [crc_init + byte*4 + offset] with offsets 0x0000..0x1000 in steps of 0x400.
; This table has 256 dword entries (0x400 bytes) and is the one selected by
; offset 0x0000, which the crcB3 invocations label as the "80" table; the
; tables for the larger offsets are expected to follow it contiguously.
mul_table_72:
DD 0x00000000,0x39d3b296,0x73a7652c,0x4a74d7ba
DD 0xe74eca58,0xde9d78ce,0x94e9af74,0xad3a1de2
DD 0xcb71e241,0xf2a250d7,0xb8d6876d,0x810535fb
DD 0x2c3f2819,0x15ec9a8f,0x5f984d35,0x664bffa3
DD 0x930fb273,0xaadc00e5,0xe0a8d75f,0xd97b65c9
DD 0x7441782b,0x4d92cabd,0x07e61d07,0x3e35af91
DD 0x587e5032,0x61ade2a4,0x2bd9351e,0x120a8788
DD 0xbf309a6a,0x86e328fc,0xcc97ff46,0xf5444dd0
DD 0x23f31217,0x1a20a081,0x5054773b,0x6987c5ad
DD 0xc4bdd84f,0xfd6e6ad9,0xb71abd63,0x8ec90ff5
DD 0xe882f056,0xd15142c0,0x9b25957a,0xa2f627ec
DD 0x0fcc3a0e,0x361f8898,0x7c6b5f22,0x45b8edb4
DD 0xb0fca064,0x892f12f2,0xc35bc548,0xfa8877de
DD 0x57b26a3c,0x6e61d8aa,0x24150f10,0x1dc6bd86
DD 0x7b8d4225,0x425ef0b3,0x082a2709,0x31f9959f
DD 0x9cc3887d,0xa5103aeb,0xef64ed51,0xd6b75fc7
DD 0x47e6242e,0x7e3596b8,0x34414102,0x0d92f394
DD 0xa0a8ee76,0x997b5ce0,0xd30f8b5a,0xeadc39cc
DD 0x8c97c66f,0xb54474f9,0xff30a343,0xc6e311d5
DD 0x6bd90c37,0x520abea1,0x187e691b,0x21addb8d
DD 0xd4e9965d,0xed3a24cb,0xa74ef371,0x9e9d41e7
DD 0x33a75c05,0x0a74ee93,0x40003929,0x79d38bbf
DD 0x1f98741c,0x264bc68a,0x6c3f1130,0x55eca3a6
DD 0xf8d6be44,0xc1050cd2,0x8b71db68,0xb2a269fe
DD 0x64153639,0x5dc684af,0x17b25315,0x2e61e183
DD 0x835bfc61,0xba884ef7,0xf0fc994d,0xc92f2bdb
DD 0xaf64d478,0x96b766ee,0xdcc3b154,0xe51003c2
DD 0x482a1e20,0x71f9acb6,0x3b8d7b0c,0x025ec99a
DD 0xf71a844a,0xcec936dc,0x84bde166,0xbd6e53f0
DD 0x10544e12,0x2987fc84,0x63f32b3e,0x5a2099a8
DD 0x3c6b660b,0x05b8d49d,0x4fcc0327,0x761fb1b1
DD 0xdb25ac53,0xe2f61ec5,0xa882c97f,0x91517be9
DD 0x8fcc485c,0xb61ffaca,0xfc6b2d70,0xc5b89fe6
DD 0x68828204,0x51513092,0x1b25e728,0x22f655be
DD 0x44bdaa1d,0x7d6e188b,0x371acf31,0x0ec97da7
DD 0xa3f36045,0x9a20d2d3,0xd0540569,0xe987b7ff
DD 0x1cc3fa2f,0x251048b9,0x6f649f03,0x56b72d95
DD 0xfb8d3077,0xc25e82e1,0x882a555b,0xb1f9e7cd
DD 0xd7b2186e,0xee61aaf8,0xa4157d42,0x9dc6cfd4
DD 0x30fcd236,0x092f60a0,0x435bb71a,0x7a88058c
DD 0xac3f5a4b,0x95ece8dd,0xdf983f67,0xe64b8df1
DD 0x4b719013,0x72a22285,0x38d6f53f,0x010547a9
DD 0x674eb80a,0x5e9d0a9c,0x14e9dd26,0x2d3a6fb0
DD 0x80007252,0xb9d3c0c4,0xf3a7177e,0xca74a5e8
DD 0x3f30e838,0x06e35aae,0x4c978d14,0x75443f82
DD 0xd87e2260,0xe1ad90f6,0xabd9474c,0x920af5da
DD 0xf4410a79,0xcd92b8ef,0x87e66f55,0xbe35ddc3
DD 0x130fc021,0x2adc72b7,0x60a8a50d,0x597b179b
DD 0xc82a6c72,0xf1f9dee4,0xbb8d095e,0x825ebbc8
DD 0x2f64a62a,0x16b714bc,0x5cc3c306,0x65107190
DD 0x035b8e33,0x3a883ca5,0x70fceb1f,0x492f5989
DD 0xe415446b,0xddc6f6fd,0x97b22147,0xae6193d1
DD 0x5b25de01,0x62f66c97,0x2882bb2d,0x115109bb
DD 0xbc6b1459,0x85b8a6cf,0xcfcc7175,0xf61fc3e3
DD 0x90543c40,0xa9878ed6,0xe3f3596c,0xda20ebfa
DD 0x771af618,0x4ec9448e,0x04bd9334,0x3d6e21a2
DD 0xebd97e65,0xd20accf3,0x987e1b49,0xa1ada9df
DD 0x0c97b43d,0x354406ab,0x7f30d111,0x46e36387
DD 0x20a89c24,0x197b2eb2,0x530ff908,0x6adc4b9e
DD 0xc7e6567c,0xfe35e4ea,0xb4413350,0x8d9281c6
DD 0x78d6cc16,0x41057e80,0x0b71a93a,0x32a21bac
DD 0x9f98064e,0xa64bb4d8,0xec3f6362,0xd5ecd1f4
DD 0xb3a72e57,0x8a749cc1,0xc0004b7b,0xf9d3f9ed
DD 0x54e9e40f,0x6d3a5699,0x274e8123,0x1e9d33b5
mul_table_152:
DD 0x00000000,0x878a92a7,0x0af953bf,0x8d73c118
DD 0x15f2a77e,0x927835d9,0x1f0bf4c1,0x98816666
DD 0x2be54efc,0xac6fdc5b,0x211c1d43,0xa6968fe4
DD 0x3e17e982,0xb99d7b25,0x34eeba3d,0xb364289a
DD 0x57ca9df8,0xd0400f5f,0x5d33ce47,0xdab95ce0
DD 0x42383a86,0xc5b2a821,0x48c16939,0xcf4bfb9e
DD 0x7c2fd304,0xfba541a3,0x76d680bb,0xf15c121c
DD 0x69dd747a,0xee57e6dd,0x632427c5,0xe4aeb562
DD 0xaf953bf0,0x281fa957,0xa56c684f,0x22e6fae8
DD 0xba679c8e,0x3ded0e29,0xb09ecf31,0x37145d96
DD 0x8470750c,0x03fae7ab,0x8e8926b3,0x0903b414
DD 0x9182d272,0x160840d5,0x9b7b81cd,0x1cf1136a
DD 0xf85fa608,0x7fd534af,0xf2a6f5b7,0x752c6710
DD 0xedad0176,0x6a2793d1,0xe75452c9,0x60dec06e
DD 0xd3bae8f4,0x54307a53,0xd943bb4b,0x5ec929ec
DD 0xc6484f8a,0x41c2dd2d,0xccb11c35,0x4b3b8e92
DD 0x5ac60111,0xdd4c93b6,0x503f52ae,0xd7b5c009
DD 0x4f34a66f,0xc8be34c8,0x45cdf5d0,0xc2476777
DD 0x71234fed,0xf6a9dd4a,0x7bda1c52,0xfc508ef5
DD 0x64d1e893,0xe35b7a34,0x6e28bb2c,0xe9a2298b
DD 0x0d0c9ce9,0x8a860e4e,0x07f5cf56,0x807f5df1
DD 0x18fe3b97,0x9f74a930,0x12076828,0x958dfa8f
DD 0x26e9d215,0xa16340b2,0x2c1081aa,0xab9a130d
DD 0x331b756b,0xb491e7cc,0x39e226d4,0xbe68b473
DD 0xf5533ae1,0x72d9a846,0xffaa695e,0x7820fbf9
DD 0xe0a19d9f,0x672b0f38,0xea58ce20,0x6dd25c87
DD 0xdeb6741d,0x593ce6ba,0xd44f27a2,0x53c5b505
DD 0xcb44d363,0x4cce41c4,0xc1bd80dc,0x4637127b
DD 0xa299a719,0x251335be,0xa860f4a6,0x2fea6601
DD 0xb76b0067,0x30e192c0,0xbd9253d8,0x3a18c17f
DD 0x897ce9e5,0x0ef67b42,0x8385ba5a,0x040f28fd
DD 0x9c8e4e9b,0x1b04dc3c,0x96771d24,0x11fd8f83
DD 0xb58c0222,0x32069085,0xbf75519d,0x38ffc33a
DD 0xa07ea55c,0x27f437fb,0xaa87f6e3,0x2d0d6444
DD 0x9e694cde,0x19e3de79,0x94901f61,0x131a8dc6
DD 0x8b9beba0,0x0c117907,0x8162b81f,0x06e82ab8
DD 0xe2469fda,0x65cc0d7d,0xe8bfcc65,0x6f355ec2
DD 0xf7b438a4,0x703eaa03,0xfd4d6b1b,0x7ac7f9bc
DD 0xc9a3d126,0x4e294381,0xc35a8299,0x44d0103e
DD 0xdc517658,0x5bdbe4ff,0xd6a825e7,0x5122b740
DD 0x1a1939d2,0x9d93ab75,0x10e06a6d,0x976af8ca
DD 0x0feb9eac,0x88610c0b,0x0512cd13,0x82985fb4
DD 0x31fc772e,0xb676e589,0x3b052491,0xbc8fb636
DD 0x240ed050,0xa38442f7,0x2ef783ef,0xa97d1148
DD 0x4dd3a42a,0xca59368d,0x472af795,0xc0a06532
DD 0x58210354,0xdfab91f3,0x52d850eb,0xd552c24c
DD 0x6636ead6,0xe1bc7871,0x6ccfb969,0xeb452bce
DD 0x73c44da8,0xf44edf0f,0x793d1e17,0xfeb78cb0
DD 0xef4a0333,0x68c09194,0xe5b3508c,0x6239c22b
DD 0xfab8a44d,0x7d3236ea,0xf041f7f2,0x77cb6555
DD 0xc4af4dcf,0x4325df68,0xce561e70,0x49dc8cd7
DD 0xd15deab1,0x56d77816,0xdba4b90e,0x5c2e2ba9
DD 0xb8809ecb,0x3f0a0c6c,0xb279cd74,0x35f35fd3
DD 0xad7239b5,0x2af8ab12,0xa78b6a0a,0x2001f8ad
DD 0x9365d037,0x14ef4290,0x999c8388,0x1e16112f
DD 0x86977749,0x011de5ee,0x8c6e24f6,0x0be4b651
DD 0x40df38c3,0xc755aa64,0x4a266b7c,0xcdacf9db
DD 0x552d9fbd,0xd2a70d1a,0x5fd4cc02,0xd85e5ea5
DD 0x6b3a763f,0xecb0e498,0x61c32580,0xe649b727
DD 0x7ec8d141,0xf94243e6,0x743182fe,0xf3bb1059
DD 0x1715a53b,0x909f379c,0x1decf684,0x9a666423
DD 0x02e70245,0x856d90e2,0x081e51fa,0x8f94c35d
DD 0x3cf0ebc7,0xbb7a7960,0x3609b878,0xb1832adf
DD 0x29024cb9,0xae88de1e,0x23fb1f06,0xa4718da1
;; mul_table_312: lookup table of 256 dword entries (64 DD rows x 4 values);
;; entry 0 is zero. The code that indexes it is outside this part of the
;; file -- presumably one entry per byte value; TODO confirm against the
;; crc32_iscsi_00 routine before changing the layout.
mul_table_312:
DD 0x00000000,0xbac2fd7b,0x70698c07,0xcaab717c
DD 0xe0d3180e,0x5a11e575,0x90ba9409,0x2a786972
DD 0xc44a46ed,0x7e88bb96,0xb423caea,0x0ee13791
DD 0x24995ee3,0x9e5ba398,0x54f0d2e4,0xee322f9f
DD 0x8d78fb2b,0x37ba0650,0xfd11772c,0x47d38a57
DD 0x6dabe325,0xd7691e5e,0x1dc26f22,0xa7009259
DD 0x4932bdc6,0xf3f040bd,0x395b31c1,0x8399ccba
DD 0xa9e1a5c8,0x132358b3,0xd98829cf,0x634ad4b4
DD 0x1f1d80a7,0xa5df7ddc,0x6f740ca0,0xd5b6f1db
DD 0xffce98a9,0x450c65d2,0x8fa714ae,0x3565e9d5
DD 0xdb57c64a,0x61953b31,0xab3e4a4d,0x11fcb736
DD 0x3b84de44,0x8146233f,0x4bed5243,0xf12faf38
DD 0x92657b8c,0x28a786f7,0xe20cf78b,0x58ce0af0
DD 0x72b66382,0xc8749ef9,0x02dfef85,0xb81d12fe
DD 0x562f3d61,0xecedc01a,0x2646b166,0x9c844c1d
DD 0xb6fc256f,0x0c3ed814,0xc695a968,0x7c575413
DD 0x3e3b014e,0x84f9fc35,0x4e528d49,0xf4907032
DD 0xdee81940,0x642ae43b,0xae819547,0x1443683c
DD 0xfa7147a3,0x40b3bad8,0x8a18cba4,0x30da36df
DD 0x1aa25fad,0xa060a2d6,0x6acbd3aa,0xd0092ed1
DD 0xb343fa65,0x0981071e,0xc32a7662,0x79e88b19
DD 0x5390e26b,0xe9521f10,0x23f96e6c,0x993b9317
DD 0x7709bc88,0xcdcb41f3,0x0760308f,0xbda2cdf4
DD 0x97daa486,0x2d1859fd,0xe7b32881,0x5d71d5fa
DD 0x212681e9,0x9be47c92,0x514f0dee,0xeb8df095
DD 0xc1f599e7,0x7b37649c,0xb19c15e0,0x0b5ee89b
DD 0xe56cc704,0x5fae3a7f,0x95054b03,0x2fc7b678
DD 0x05bfdf0a,0xbf7d2271,0x75d6530d,0xcf14ae76
DD 0xac5e7ac2,0x169c87b9,0xdc37f6c5,0x66f50bbe
DD 0x4c8d62cc,0xf64f9fb7,0x3ce4eecb,0x862613b0
DD 0x68143c2f,0xd2d6c154,0x187db028,0xa2bf4d53
DD 0x88c72421,0x3205d95a,0xf8aea826,0x426c555d
DD 0x7c76029c,0xc6b4ffe7,0x0c1f8e9b,0xb6dd73e0
DD 0x9ca51a92,0x2667e7e9,0xeccc9695,0x560e6bee
DD 0xb83c4471,0x02feb90a,0xc855c876,0x7297350d
DD 0x58ef5c7f,0xe22da104,0x2886d078,0x92442d03
DD 0xf10ef9b7,0x4bcc04cc,0x816775b0,0x3ba588cb
DD 0x11dde1b9,0xab1f1cc2,0x61b46dbe,0xdb7690c5
DD 0x3544bf5a,0x8f864221,0x452d335d,0xffefce26
DD 0xd597a754,0x6f555a2f,0xa5fe2b53,0x1f3cd628
DD 0x636b823b,0xd9a97f40,0x13020e3c,0xa9c0f347
DD 0x83b89a35,0x397a674e,0xf3d11632,0x4913eb49
DD 0xa721c4d6,0x1de339ad,0xd74848d1,0x6d8ab5aa
DD 0x47f2dcd8,0xfd3021a3,0x379b50df,0x8d59ada4
DD 0xee137910,0x54d1846b,0x9e7af517,0x24b8086c
DD 0x0ec0611e,0xb4029c65,0x7ea9ed19,0xc46b1062
DD 0x2a593ffd,0x909bc286,0x5a30b3fa,0xe0f24e81
DD 0xca8a27f3,0x7048da88,0xbae3abf4,0x0021568f
DD 0x424d03d2,0xf88ffea9,0x32248fd5,0x88e672ae
DD 0xa29e1bdc,0x185ce6a7,0xd2f797db,0x68356aa0
DD 0x8607453f,0x3cc5b844,0xf66ec938,0x4cac3443
DD 0x66d45d31,0xdc16a04a,0x16bdd136,0xac7f2c4d
DD 0xcf35f8f9,0x75f70582,0xbf5c74fe,0x059e8985
DD 0x2fe6e0f7,0x95241d8c,0x5f8f6cf0,0xe54d918b
DD 0x0b7fbe14,0xb1bd436f,0x7b163213,0xc1d4cf68
DD 0xebaca61a,0x516e5b61,0x9bc52a1d,0x2107d766
DD 0x5d508375,0xe7927e0e,0x2d390f72,0x97fbf209
DD 0xbd839b7b,0x07416600,0xcdea177c,0x7728ea07
DD 0x991ac598,0x23d838e3,0xe973499f,0x53b1b4e4
DD 0x79c9dd96,0xc30b20ed,0x09a05191,0xb362acea
DD 0xd028785e,0x6aea8525,0xa041f459,0x1a830922
DD 0x30fb6050,0x8a399d2b,0x4092ec57,0xfa50112c
DD 0x14623eb3,0xaea0c3c8,0x640bb2b4,0xdec94fcf
DD 0xf4b126bd,0x4e73dbc6,0x84d8aaba,0x3e1a57c1
;; mul_table_632: lookup table of 256 dword entries (64 DD rows x 4 values);
;; entry 0 is zero. The code that indexes it is outside this part of the
;; file -- presumably one entry per byte value; TODO confirm against the
;; crc32_iscsi_00 routine before changing the layout.
mul_table_632:
DD 0x00000000,0x6b749fb2,0xd6e93f64,0xbd9da0d6
DD 0xa83e0839,0xc34a978b,0x7ed7375d,0x15a3a8ef
DD 0x55906683,0x3ee4f931,0x837959e7,0xe80dc655
DD 0xfdae6eba,0x96daf108,0x2b4751de,0x4033ce6c
DD 0xab20cd06,0xc05452b4,0x7dc9f262,0x16bd6dd0
DD 0x031ec53f,0x686a5a8d,0xd5f7fa5b,0xbe8365e9
DD 0xfeb0ab85,0x95c43437,0x285994e1,0x432d0b53
DD 0x568ea3bc,0x3dfa3c0e,0x80679cd8,0xeb13036a
DD 0x53adecfd,0x38d9734f,0x8544d399,0xee304c2b
DD 0xfb93e4c4,0x90e77b76,0x2d7adba0,0x460e4412
DD 0x063d8a7e,0x6d4915cc,0xd0d4b51a,0xbba02aa8
DD 0xae038247,0xc5771df5,0x78eabd23,0x139e2291
DD 0xf88d21fb,0x93f9be49,0x2e641e9f,0x4510812d
DD 0x50b329c2,0x3bc7b670,0x865a16a6,0xed2e8914
DD 0xad1d4778,0xc669d8ca,0x7bf4781c,0x1080e7ae
DD 0x05234f41,0x6e57d0f3,0xd3ca7025,0xb8beef97
DD 0xa75bd9fa,0xcc2f4648,0x71b2e69e,0x1ac6792c
DD 0x0f65d1c3,0x64114e71,0xd98ceea7,0xb2f87115
DD 0xf2cbbf79,0x99bf20cb,0x2422801d,0x4f561faf
DD 0x5af5b740,0x318128f2,0x8c1c8824,0xe7681796
DD 0x0c7b14fc,0x670f8b4e,0xda922b98,0xb1e6b42a
DD 0xa4451cc5,0xcf318377,0x72ac23a1,0x19d8bc13
DD 0x59eb727f,0x329fedcd,0x8f024d1b,0xe476d2a9
DD 0xf1d57a46,0x9aa1e5f4,0x273c4522,0x4c48da90
DD 0xf4f63507,0x9f82aab5,0x221f0a63,0x496b95d1
DD 0x5cc83d3e,0x37bca28c,0x8a21025a,0xe1559de8
DD 0xa1665384,0xca12cc36,0x778f6ce0,0x1cfbf352
DD 0x09585bbd,0x622cc40f,0xdfb164d9,0xb4c5fb6b
DD 0x5fd6f801,0x34a267b3,0x893fc765,0xe24b58d7
DD 0xf7e8f038,0x9c9c6f8a,0x2101cf5c,0x4a7550ee
DD 0x0a469e82,0x61320130,0xdcafa1e6,0xb7db3e54
DD 0xa27896bb,0xc90c0909,0x7491a9df,0x1fe5366d
DD 0x4b5bc505,0x202f5ab7,0x9db2fa61,0xf6c665d3
DD 0xe365cd3c,0x8811528e,0x358cf258,0x5ef86dea
DD 0x1ecba386,0x75bf3c34,0xc8229ce2,0xa3560350
DD 0xb6f5abbf,0xdd81340d,0x601c94db,0x0b680b69
DD 0xe07b0803,0x8b0f97b1,0x36923767,0x5de6a8d5
DD 0x4845003a,0x23319f88,0x9eac3f5e,0xf5d8a0ec
DD 0xb5eb6e80,0xde9ff132,0x630251e4,0x0876ce56
DD 0x1dd566b9,0x76a1f90b,0xcb3c59dd,0xa048c66f
DD 0x18f629f8,0x7382b64a,0xce1f169c,0xa56b892e
DD 0xb0c821c1,0xdbbcbe73,0x66211ea5,0x0d558117
DD 0x4d664f7b,0x2612d0c9,0x9b8f701f,0xf0fbefad
DD 0xe5584742,0x8e2cd8f0,0x33b17826,0x58c5e794
DD 0xb3d6e4fe,0xd8a27b4c,0x653fdb9a,0x0e4b4428
DD 0x1be8ecc7,0x709c7375,0xcd01d3a3,0xa6754c11
DD 0xe646827d,0x8d321dcf,0x30afbd19,0x5bdb22ab
DD 0x4e788a44,0x250c15f6,0x9891b520,0xf3e52a92
DD 0xec001cff,0x8774834d,0x3ae9239b,0x519dbc29
DD 0x443e14c6,0x2f4a8b74,0x92d72ba2,0xf9a3b410
DD 0xb9907a7c,0xd2e4e5ce,0x6f794518,0x040ddaaa
DD 0x11ae7245,0x7adaedf7,0xc7474d21,0xac33d293
DD 0x4720d1f9,0x2c544e4b,0x91c9ee9d,0xfabd712f
DD 0xef1ed9c0,0x846a4672,0x39f7e6a4,0x52837916
DD 0x12b0b77a,0x79c428c8,0xc459881e,0xaf2d17ac
DD 0xba8ebf43,0xd1fa20f1,0x6c678027,0x07131f95
DD 0xbfadf002,0xd4d96fb0,0x6944cf66,0x023050d4
DD 0x1793f83b,0x7ce76789,0xc17ac75f,0xaa0e58ed
DD 0xea3d9681,0x81490933,0x3cd4a9e5,0x57a03657
DD 0x42039eb8,0x2977010a,0x94eaa1dc,0xff9e3e6e
DD 0x148d3d04,0x7ff9a2b6,0xc2640260,0xa9109dd2
DD 0xbcb3353d,0xd7c7aa8f,0x6a5a0a59,0x012e95eb
DD 0x411d5b87,0x2a69c435,0x97f464e3,0xfc80fb51
DD 0xe92353be,0x8257cc0c,0x3fca6cda,0x54bef368
;; mul_table_1272: lookup table of 256 dword entries (64 DD rows x 4 values);
;; entry 0 is zero. The code that indexes it is outside this part of the
;; file -- presumably one entry per byte value; TODO confirm against the
;; crc32_iscsi_00 routine before changing the layout.
mul_table_1272:
DD 0x00000000,0xdd66cbbb,0xbf21e187,0x62472a3c
DD 0x7bafb5ff,0xa6c97e44,0xc48e5478,0x19e89fc3
DD 0xf75f6bfe,0x2a39a045,0x487e8a79,0x951841c2
DD 0x8cf0de01,0x519615ba,0x33d13f86,0xeeb7f43d
DD 0xeb52a10d,0x36346ab6,0x5473408a,0x89158b31
DD 0x90fd14f2,0x4d9bdf49,0x2fdcf575,0xf2ba3ece
DD 0x1c0dcaf3,0xc16b0148,0xa32c2b74,0x7e4ae0cf
DD 0x67a27f0c,0xbac4b4b7,0xd8839e8b,0x05e55530
DD 0xd34934eb,0x0e2fff50,0x6c68d56c,0xb10e1ed7
DD 0xa8e68114,0x75804aaf,0x17c76093,0xcaa1ab28
DD 0x24165f15,0xf97094ae,0x9b37be92,0x46517529
DD 0x5fb9eaea,0x82df2151,0xe0980b6d,0x3dfec0d6
DD 0x381b95e6,0xe57d5e5d,0x873a7461,0x5a5cbfda
DD 0x43b42019,0x9ed2eba2,0xfc95c19e,0x21f30a25
DD 0xcf44fe18,0x122235a3,0x70651f9f,0xad03d424
DD 0xb4eb4be7,0x698d805c,0x0bcaaa60,0xd6ac61db
DD 0xa37e1f27,0x7e18d49c,0x1c5ffea0,0xc139351b
DD 0xd8d1aad8,0x05b76163,0x67f04b5f,0xba9680e4
DD 0x542174d9,0x8947bf62,0xeb00955e,0x36665ee5
DD 0x2f8ec126,0xf2e80a9d,0x90af20a1,0x4dc9eb1a
DD 0x482cbe2a,0x954a7591,0xf70d5fad,0x2a6b9416
DD 0x33830bd5,0xeee5c06e,0x8ca2ea52,0x51c421e9
DD 0xbf73d5d4,0x62151e6f,0x00523453,0xdd34ffe8
DD 0xc4dc602b,0x19baab90,0x7bfd81ac,0xa69b4a17
DD 0x70372bcc,0xad51e077,0xcf16ca4b,0x127001f0
DD 0x0b989e33,0xd6fe5588,0xb4b97fb4,0x69dfb40f
DD 0x87684032,0x5a0e8b89,0x3849a1b5,0xe52f6a0e
DD 0xfcc7f5cd,0x21a13e76,0x43e6144a,0x9e80dff1
DD 0x9b658ac1,0x4603417a,0x24446b46,0xf922a0fd
DD 0xe0ca3f3e,0x3dacf485,0x5febdeb9,0x828d1502
DD 0x6c3ae13f,0xb15c2a84,0xd31b00b8,0x0e7dcb03
DD 0x179554c0,0xcaf39f7b,0xa8b4b547,0x75d27efc
DD 0x431048bf,0x9e768304,0xfc31a938,0x21576283
DD 0x38bffd40,0xe5d936fb,0x879e1cc7,0x5af8d77c
DD 0xb44f2341,0x6929e8fa,0x0b6ec2c6,0xd608097d
DD 0xcfe096be,0x12865d05,0x70c17739,0xada7bc82
DD 0xa842e9b2,0x75242209,0x17630835,0xca05c38e
DD 0xd3ed5c4d,0x0e8b97f6,0x6cccbdca,0xb1aa7671
DD 0x5f1d824c,0x827b49f7,0xe03c63cb,0x3d5aa870
DD 0x24b237b3,0xf9d4fc08,0x9b93d634,0x46f51d8f
DD 0x90597c54,0x4d3fb7ef,0x2f789dd3,0xf21e5668
DD 0xebf6c9ab,0x36900210,0x54d7282c,0x89b1e397
DD 0x670617aa,0xba60dc11,0xd827f62d,0x05413d96
DD 0x1ca9a255,0xc1cf69ee,0xa38843d2,0x7eee8869
DD 0x7b0bdd59,0xa66d16e2,0xc42a3cde,0x194cf765
DD 0x00a468a6,0xddc2a31d,0xbf858921,0x62e3429a
DD 0x8c54b6a7,0x51327d1c,0x33755720,0xee139c9b
DD 0xf7fb0358,0x2a9dc8e3,0x48dae2df,0x95bc2964
DD 0xe06e5798,0x3d089c23,0x5f4fb61f,0x82297da4
DD 0x9bc1e267,0x46a729dc,0x24e003e0,0xf986c85b
DD 0x17313c66,0xca57f7dd,0xa810dde1,0x7576165a
DD 0x6c9e8999,0xb1f84222,0xd3bf681e,0x0ed9a3a5
DD 0x0b3cf695,0xd65a3d2e,0xb41d1712,0x697bdca9
DD 0x7093436a,0xadf588d1,0xcfb2a2ed,0x12d46956
DD 0xfc639d6b,0x210556d0,0x43427cec,0x9e24b757
DD 0x87cc2894,0x5aaae32f,0x38edc913,0xe58b02a8
DD 0x33276373,0xee41a8c8,0x8c0682f4,0x5160494f
DD 0x4888d68c,0x95ee1d37,0xf7a9370b,0x2acffcb0
DD 0xc478088d,0x191ec336,0x7b59e90a,0xa63f22b1
DD 0xbfd7bd72,0x62b176c9,0x00f65cf5,0xdd90974e
DD 0xd875c27e,0x051309c5,0x675423f9,0xba32e842
DD 0xa3da7781,0x7ebcbc3a,0x1cfb9606,0xc19d5dbd
DD 0x2f2aa980,0xf24c623b,0x900b4807,0x4d6d83bc
DD 0x54851c7f,0x89e3d7c4,0xeba4fdf8,0x36c23643
;;; func core, ver, snum
slversion crc32_iscsi_00, 00, 03, 0014

568
crc/crc32_iscsi_01.asm Normal file
View File

@ -0,0 +1,568 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
; Shared project include; presumably supplies the 'slversion' macro used at
; the end of this file -- confirm in reg_sizes.asm.
%include "reg_sizes.asm"
; Make memory operands RIP-relative by default (used for the jump_table and
; K_table references below).
default rel
; Paste three tokens together; used to generate the crc_<i> labels and the
; references to them in the jump table.
%define CONCAT(a,b,c) a %+ b %+ c
; Define threshold where buffers are considered "small" and routed to more
; efficient "by-1" code. This "by-1" code only handles up to 255 bytes
; (it keeps the remaining length in an 8-bit register), so SMALL_SIZE can be
; no larger than 256.
%define SMALL_SIZE 200
; Build-time guard for the limit above.
; NOTE(review): the space in "SMALL_ SIZE" looks deliberate -- unquoted
; %error text is macro-expanded, so the intact name would print as 200.
; Confirm before "fixing" the spelling.
%if (SMALL_SIZE > 256)
%error SMALL_ SIZE must be <= 256
% error ; needed because '%error' actually generates only a warning
%endif
;;; unsigned int crc32_iscsi_01(unsigned char * buffer, int len, unsigned int crc_init);
;;;
;;; iSCSI CRC32 of buffer[0 .. len) starting from crc_init. Large inputs are
;;; split into three streams that are fed to the crc32 instruction in
;;; parallel and merged with pclmulqdq; short inputs and tails are handled
;;; 8/4/2/1 bytes at a time.
;;;
;;; Argument registers (see the %define blocks below):
;;; elf64 output format: *buf = rdi, len = rsi, crc_init = rdx (copied to r8)
;;; any other format: *buf = rcx, len = rdx, crc_init = r8
;;; Result is returned in rax.
;;; rbx, rdi and rsi are pushed on entry and popped before every ret.
global crc32_iscsi_01:function
crc32_iscsi_01:
%ifidn __OUTPUT_FORMAT__, elf64
;; elf64 register assignment.
;; bufp holds the incoming pointer and is reused as scratch afterwards.
%define bufp rdi
%define bufp_dw edi
%define bufp_w di
%define bufp_b dil
;; bufptmp (working data pointer) and block_0 are the same register.
%define bufptmp rcx
%define block_0 rcx
;; block_1 shares rdx with crc_init_arg; crc_init is moved to r8 first.
%define block_1 rdx
%define block_2 r11
%define len rsi
%define len_dw esi
%define len_w si
%define len_b sil
%define crc_init_arg rdx
%else
;; Register assignment for every other output format. crc_init already
;; arrives in r8 here, so no crc_init_arg alias is needed.
%define bufp rcx
%define bufp_dw ecx
%define bufp_w cx
%define bufp_b cl
;; bufptmp (working data pointer) and block_0 are the same register.
%define bufptmp rdi
%define block_0 rdi
%define block_1 rsi
%define block_2 r11
%define len rdx
%define len_dw edx
%define len_w dx
%define len_b dl
%endif
;; Registers common to both variants.
;; tmp (rbx) carries the remaining byte count while blocks are processed and
;; doubles as a data register in the byte-wise paths.
%define tmp rbx
;; crc_init accumulates the CRC of stream 0 and, after merging, the running CRC.
%define crc_init r8
%define crc_init_dw r8d
;; crc1 / crc2 accumulate the CRCs of streams 1 and 2.
%define crc1 r9
%define crc2 r10
;; Saved here, restored (in reverse order) at each return site.
push rbx
push rdi
push rsi
;; Move crc_init for Linux to a different reg
%ifidn __OUTPUT_FORMAT__, elf64
mov crc_init, crc_init_arg
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Consume the 1..7 leading bytes needed to bring the data pointer to an
;; 8-byte boundary, one byte per crc32 instruction.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov bufptmp, bufp ;; bufptmp = working copy of *buf
neg bufp
and bufp, 7 ;; bufp = (-buf) & 7 = number of bytes up to the
;; next 8-byte boundary; the 'and' also sets ZF
je proc_block ;; Skip if aligned
;; If len is less than 8 and we're unaligned, we need to jump
;; to special code to avoid reading beyond the end of the buffer
cmp len, 8
jb less_than_8
;;;; Calculate CRC of unaligned bytes of the buffer (if any) ;;;
mov tmp, [bufptmp] ;; load a quadword from the buffer (len >= 8 here)
add bufptmp, bufp ;; align buffer pointer for quadword
;; processing
sub len, bufp ;; update buffer length
align_loop:
crc32 crc_init_dw, bl ;; compute crc32 of 1-byte (bl = low byte of tmp)
shr tmp, 8 ;; get next byte
dec bufp ;; one fewer unaligned byte to go
jne align_loop
proc_block:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 2) PROCESS BLOCKS: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Dispatch on the remaining length:
;; >= 128*24 bytes -> full_block (three streams of 128 qwords each)
;; < SMALL_SIZE bytes -> small (single stream)
;; otherwise -> three equal streams of floor(len/24) qwords each
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; compute num of bytes to be processed
mov tmp, len ;; save num bytes in tmp
cmp len, 128*24
jae full_block
continue_block:
cmp len, SMALL_SIZE
jb small
;; len < 128*24
mov rax, 2731 ;; 2731 = ceil(2^16 / 24)
;; 32-bit multiply: writes edx:eax. With the non-elf64 register assignment
;; len_dw is edx, so len is overwritten here; the byte count stays in tmp.
;; With the elf64 assignment rdx is block_1, which is set just below.
mul len_dw
shr rax, 16
;; eax contains floor(bytes / 24) = num 24-byte chunks to do
;; process rax 24-byte chunks (128 >= rax >= 0)
;; compute end address of each block
;; block_0 -> end of block 0 (base addr + RAX * 8)
;; block_1 -> end of block 1 (base addr + RAX * 16)
;; block_2 -> end of block 2 (base addr + RAX * 24)
lea block_0, [bufptmp + rax * 8]
lea block_1, [block_0 + rax * 8]
lea block_2, [block_1 + rax * 8]
;; streams 1 and 2 start from a zero CRC; stream 0 continues crc_init
xor crc1,crc1
xor crc2,crc2
;; branch into array
lea bufp, [jump_table]
movzx len, word [bufp + rax * 2] ;; len is offset from crc_array
lea bufp, [bufp + len + crc_array - jump_table] ;; bufp = crc_array + offset
jmp bufp ;; enter the unrolled array at crc_<rax>
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 2a) PROCESS FULL BLOCKS: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Exactly 128 qwords per stream (128*24 bytes in total).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
full_block:
mov rax, 128
;; block_0 is the same register as bufptmp, so these are the end addresses
;; of streams 1 and 2 relative to the current data pointer ...
lea block_1, [block_0 + 128*8*2]
lea block_2, [block_0 + 128*8*3]
;; ... and block_0 itself becomes the end address of stream 0.
add block_0, 128*8*1
xor crc1,crc1
xor crc2,crc2
; ;; branch into array
; jmp CONCAT(crc_,128,)
; Fall through into top of crc array (crc_128)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 3) CRC Array: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Fully unrolled: label crc_<i> is the entry point for "i qwords left per
;; stream". Each step consumes one qword from each of the three streams,
;; addressed backwards from the stream end pointers. The %rep emits
;; crc_128 .. crc_2; crc_1 follows separately.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
crc_array:
%assign i 128
%rep 128-1
CONCAT(crc_,i,:)
crc32 crc_init, qword [block_0 - i*8]
crc32 crc1, qword [block_1 - i*8]
crc32 crc2, qword [block_2 - i*8]
%assign i (i-1)
%endrep
;; i == 1 here: last step. The final qword of stream 2 is held back so the
;; merged CRC of streams 0 and 1 can be xor-ed into it below.
CONCAT(crc_,i,:)
crc32 crc_init, qword [block_0 - i*8]
crc32 crc1, qword [block_1 - i*8]
; SKIP ;crc32 crc2, [block_2 - i*8] ; Don't do this one yet
mov block_0, block_2 ;; data pointer (bufptmp) = end of stream 2 = next unread byte
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 4) Combine three results: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rax still holds the number of qwords per stream. It selects one 16-byte
;; row of K_table, and at the same time tmp is reduced by the rax*24 bytes
;; that were just consumed.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
lea bufp, [K_table - 16] ; first entry is for idx 1
shl rax, 3 ; rax *= 8
sub tmp, rax ; tmp -= count*8
shl rax, 1 ; rax = count*16
sub tmp, rax ; tmp -= count*16 (total tmp -= count*24)
add bufp, rax ; bufp = &K_table[count - 1]
movdqa xmm0, [bufp] ; 2 consts: K1:K2
movq xmm1, crc_init ; CRC for block 1
pclmulqdq xmm1, xmm0, 0x00 ; Multiply by K2 (low qword of xmm0)
movq xmm2, crc1 ; CRC for block 2
pclmulqdq xmm2, xmm0, 0x10 ; Multiply by K1 (high qword of xmm0)
pxor xmm1, xmm2
movq rax, xmm1
xor rax, [block_2 - i*8] ; i is still 1: the held-back last qword of stream 2
mov crc_init, crc2
crc32 crc_init, rax ; finish stream 2 over (merged CRC xor last qword)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 5) Check for end: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; tmp holds the number of bytes still to process. Loop back for another
;; round of blocks, or finish the last 0..23 bytes here.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; crc_0 is also entry 0 of the jump table.
CONCAT(crc_,0,:)
mov len, tmp
cmp tmp, 128*24
jae full_block
cmp tmp, 24
jae continue_block
fewer_than_24:
;; now fewer than 24 bytes remain
cmp tmp, 16
jae do_16
cmp tmp, 8
jae do_8
;; 0 <= tmp <= 7
;; tmp is rbx, so the shifts on ebx below work on the remaining count.
shl ebx, 29 ; size now in bits 31:29
jz do_return ; nothing left
check_4:
;; 1..7 bytes remain, but a whole qword is loaded.
;; NOTE(review): on the paths visible here bufptmp has only been advanced
;; in multiples of 8 since it was aligned, so the load should stay inside
;; one aligned qword -- confirm.
mov bufp, [bufptmp]
shl ebx, 1 ; shift out into carry MSB (orig size & 4)
jnc check_2
crc32 crc_init_dw, bufp_dw
jz do_return ; ZF is still from the shl (crc32 leaves flags alone): no bytes left
shr bufp, 32 ; shift data down by 4 bytes
check_2:
shl ebx, 1 ; shift out into carry MSB (orig size & 2)
jnc check_1
crc32 crc_init_dw, bufp_w
jz do_return ; ZF from the shl: no bytes left
shr bufp, 16 ; shift data down by 2 bytes
check_1:
;; Only reached while the remaining count is non-zero, i.e. (orig size & 1) is set.
crc32 crc_init_dw, bufp_b
do_return:
mov rax, crc_init
pop rsi
pop rdi
pop rbx
ret
;; 8..15 bytes remain: one qword, then the 0..7 byte tail.
do_8:
crc32 crc_init, qword [bufptmp]
add bufptmp, 8
shl ebx, 29 ; size (0...7) in bits 31:29
jnz check_4
mov rax, crc_init
pop rsi
pop rdi
pop rbx
ret
;; 16..23 bytes remain: two qwords, then the 0..7 byte tail.
do_16:
crc32 crc_init, qword [bufptmp]
crc32 crc_init, qword [bufptmp+8]
add bufptmp, 16
shl ebx, 29 ; size (0...7) in bits 31:29
jnz check_4
mov rax, crc_init
pop rsi
pop rdi
pop rbx
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Handle the case of fewer than 8 bytes, unaligned. In this case
;; we can't read 8 bytes, as this might go beyond the end of the buffer
;; Instead read exactly len bytes as an optional dword, word and byte,
;; selected by bits 2, 1 and 0 of len.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
less_than_8:
test len,4
jz less_than_4
crc32 crc_init_dw, dword[bufptmp]
add bufptmp,4
less_than_4:
test len,2
jz less_than_2
crc32 crc_init_dw, word[bufptmp]
add bufptmp,2
less_than_2:
test len,1
jz do_return ;; no odd byte: use the shared return sequence
crc32 crc_init_dw, byte[bufptmp]
;; same epilogue as do_return
mov rax, crc_init
pop rsi
pop rdi
pop rbx
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; SMALL BUFFERS: fewer than SMALL_SIZE (<= 256) bytes remain, so the whole
;; count fits in the 8-bit register len_b.
;;
;; The count is consumed one bit at a time, most significant bit first:
;; each 'shl len_b, 1' moves the next bit into CF (process that many bytes
;; or skip) and sets ZF when no lower bits are left (return early).
;; The crc32 instruction does not modify flags, so the 'je do_return2' after
;; each run of crc32 still tests the ZF produced by the preceding shl.
;;
;; The CRC is accumulated in rax, which is also the return register.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
small:
	mov	rax, crc_init
bit8:
	shl	len_b, 1		;; shift-out MSB (bit-7)
	jnc	bit7			;; jump to bit-6 handling if bit-7 == 0
%assign i 0
%rep 16
	crc32	rax, qword [bufptmp+i]	;; compute crc32 of 8-byte data
%assign i (i+8)
%endrep
	je	do_return2		;; return if remaining data is zero
	add	bufptmp, 128		;; buf += 128; (next 128 bytes)
bit7:
	shl	len_b, 1		;; shift-out MSB (bit-6)
	jnc	bit6			;; jump to bit-5 handling if bit-6 == 0
%assign i 0
%rep 8
	crc32	rax, qword [bufptmp+i]	;; compute crc32 of 8-byte data
%assign i (i+8)
%endrep
	je	do_return2		;; return if remaining data is zero
	add	bufptmp, 64		;; buf += 64; (next 64 bytes)
bit6:
	shl	len_b, 1		;; shift-out MSB (bit-5)
	jnc	bit5			;; jump to bit-4 handling if bit-5 == 0
%assign i 0
%rep 4
	crc32	rax, qword [bufptmp+i]	;; compute crc32 of 8-byte data
%assign i (i+8)
%endrep
	je	do_return2		;; return if remaining data is zero
	add	bufptmp, 32		;; buf += 32; (next 32 bytes)
bit5:
	shl	len_b, 1		;; shift-out MSB (bit-4)
	jnc	bit4			;; jump to bit-3 handling if bit-4 == 0
%assign i 0
%rep 2
	crc32	rax, qword [bufptmp+i]	;; compute crc32 of 8-byte data
%assign i (i+8)
%endrep
	je	do_return2		;; return if remaining data is zero
	add	bufptmp, 16		;; buf += 16; (next 16 bytes)
bit4:
	shl	len_b, 1		;; shift-out MSB (bit-3)
	jnc	bit3			;; jump to bit-2 handling if bit-3 == 0
	crc32	rax, qword [bufptmp]	;; compute crc32 of 8-byte data
	je	do_return2		;; return if remaining data is zero
	add	bufptmp, 8		;; buf += 8; (next 8 bytes)
bit3:
	;; Only the low three bits of the original count are left in len_b.
	;; If they are all zero there is nothing more to read, so return
	;; before the 8-byte load below instead of touching memory beyond the
	;; end of the buffer (this is also the len == 0 case). The result is
	;; unchanged: without this check every remaining test would fall
	;; through to do_return2 anyway.
	test	len_b, len_b
	je	do_return2
	;; 1..7 bytes remain. A whole qword is loaded; bufptmp is 8-byte
	;; aligned when this path is entered from continue_block, so the load
	;; stays inside one aligned qword.
	mov	rbx, qword [bufptmp]	;; load 8 bytes from the buffer
	shl	len_b, 1		;; shift-out MSB (bit-2)
	jnc	bit2			;; jump to bit-1 handling if bit-2 == 0
	crc32	eax, ebx		;; compute crc32 of 4-byte data
	je	do_return2		;; return if remaining data is zero
	shr	rbx, 32			;; get next 3 bytes
bit2:
	shl	len_b, 1		;; shift-out MSB (bit-1)
	jnc	bit1			;; jump to bit-0 handling if bit-1 == 0
	crc32	eax, bx			;; compute crc32 of 2-byte data
	je	do_return2		;; return if remaining data is zero
	shr	rbx, 16			;; next byte
bit1:
	test	len_b,len_b		;; only bit-0 of the count is left
	je	do_return2
	crc32	eax, bl			;; compute crc32 of 1-byte data
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Return site for the small path: the result is already in rax.
do_return2:
	pop	rsi
	pop	rdi
	pop	rbx
	ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; jump table ;; Table is 129 entries x 2 bytes each
;; Entry i (i = 0..128) is the 16-bit distance of label crc_<i> from
;; crc_array. continue_block indexes the table with the number of qwords
;; per stream and adds the entry to crc_array to get the jump target.
;; No section switch is visible between the code and this table, so it is
;; emitted alongside the routine.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 4
jump_table:
%assign i 0
%rep 129
dw CONCAT(crc_,i,) - crc_array
%assign i (i+1)
%endrep
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; PCLMULQDQ tables
;; Table is 128 entries x 2 quad words each
;; Row n-1 (16 bytes) is the constant pair used when each of the three
;; streams was n qwords long; the combine step addresses it as
;; K_table - 16 + n*16 and loads the whole row with movdqa, which needs the
;; 16-byte alignment provided by the 'align' below.
;; Within a row, the first qword multiplies the CRC of stream 0 and the
;; second qword multiplies the CRC of stream 1.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section .data
align 64
K_table:
dq 0x14cd00bd6, 0x105ec76f0
dq 0x0ba4fc28e, 0x14cd00bd6
dq 0x1d82c63da, 0x0f20c0dfe
dq 0x09e4addf8, 0x0ba4fc28e
dq 0x039d3b296, 0x1384aa63a
dq 0x102f9b8a2, 0x1d82c63da
dq 0x14237f5e6, 0x01c291d04
dq 0x00d3b6092, 0x09e4addf8
dq 0x0c96cfdc0, 0x0740eef02
dq 0x18266e456, 0x039d3b296
dq 0x0daece73e, 0x0083a6eec
dq 0x0ab7aff2a, 0x102f9b8a2
dq 0x1248ea574, 0x1c1733996
dq 0x083348832, 0x14237f5e6
dq 0x12c743124, 0x02ad91c30
dq 0x0b9e02b86, 0x00d3b6092
dq 0x018b33a4e, 0x06992cea2
dq 0x1b331e26a, 0x0c96cfdc0
dq 0x17d35ba46, 0x07e908048
dq 0x1bf2e8b8a, 0x18266e456
dq 0x1a3e0968a, 0x11ed1f9d8
dq 0x0ce7f39f4, 0x0daece73e
dq 0x061d82e56, 0x0f1d0f55e
dq 0x0d270f1a2, 0x0ab7aff2a
dq 0x1c3f5f66c, 0x0a87ab8a8
dq 0x12ed0daac, 0x1248ea574
dq 0x065863b64, 0x08462d800
dq 0x11eef4f8e, 0x083348832
dq 0x1ee54f54c, 0x071d111a8
dq 0x0b3e32c28, 0x12c743124
dq 0x0064f7f26, 0x0ffd852c6
dq 0x0dd7e3b0c, 0x0b9e02b86
dq 0x0f285651c, 0x0dcb17aa4
dq 0x010746f3c, 0x018b33a4e
dq 0x1c24afea4, 0x0f37c5aee
dq 0x0271d9844, 0x1b331e26a
dq 0x08e766a0c, 0x06051d5a2
dq 0x093a5f730, 0x17d35ba46
dq 0x06cb08e5c, 0x11d5ca20e
dq 0x06b749fb2, 0x1bf2e8b8a
dq 0x1167f94f2, 0x021f3d99c
dq 0x0cec3662e, 0x1a3e0968a
dq 0x19329634a, 0x08f158014
dq 0x0e6fc4e6a, 0x0ce7f39f4
dq 0x08227bb8a, 0x1a5e82106
dq 0x0b0cd4768, 0x061d82e56
dq 0x13c2b89c4, 0x188815ab2
dq 0x0d7a4825c, 0x0d270f1a2
dq 0x10f5ff2ba, 0x105405f3e
dq 0x00167d312, 0x1c3f5f66c
dq 0x0f6076544, 0x0e9adf796
dq 0x026f6a60a, 0x12ed0daac
dq 0x1a2adb74e, 0x096638b34
dq 0x19d34af3a, 0x065863b64
dq 0x049c3cc9c, 0x1e50585a0
dq 0x068bce87a, 0x11eef4f8e
dq 0x1524fa6c6, 0x19f1c69dc
dq 0x16cba8aca, 0x1ee54f54c
dq 0x042d98888, 0x12913343e
dq 0x1329d9f7e, 0x0b3e32c28
dq 0x1b1c69528, 0x088f25a3a
dq 0x02178513a, 0x0064f7f26
dq 0x0e0ac139e, 0x04e36f0b0
dq 0x0170076fa, 0x0dd7e3b0c
dq 0x141a1a2e2, 0x0bd6f81f8
dq 0x16ad828b4, 0x0f285651c
dq 0x041d17b64, 0x19425cbba
dq 0x1fae1cc66, 0x010746f3c
dq 0x1a75b4b00, 0x18db37e8a
dq 0x0f872e54c, 0x1c24afea4
dq 0x01e41e9fc, 0x04c144932
dq 0x086d8e4d2, 0x0271d9844
dq 0x160f7af7a, 0x052148f02
dq 0x05bb8f1bc, 0x08e766a0c
dq 0x0a90fd27a, 0x0a3c6f37a
dq 0x0b3af077a, 0x093a5f730
dq 0x04984d782, 0x1d22c238e
dq 0x0ca6ef3ac, 0x06cb08e5c
dq 0x0234e0b26, 0x063ded06a
dq 0x1d88abd4a, 0x06b749fb2
dq 0x04597456a, 0x04d56973c
dq 0x0e9e28eb4, 0x1167f94f2
dq 0x07b3ff57a, 0x19385bf2e
dq 0x0c9c8b782, 0x0cec3662e
dq 0x13a9cba9e, 0x0e417f38a
dq 0x093e106a4, 0x19329634a
dq 0x167001a9c, 0x14e727980
dq 0x1ddffc5d4, 0x0e6fc4e6a
dq 0x00df04680, 0x0d104b8fc
dq 0x02342001e, 0x08227bb8a
dq 0x00a2a8d7e, 0x05b397730
dq 0x168763fa6, 0x0b0cd4768
dq 0x1ed5a407a, 0x0e78eb416
dq 0x0d2c3ed1a, 0x13c2b89c4
dq 0x0995a5724, 0x1641378f0
dq 0x19b1afbc4, 0x0d7a4825c
dq 0x109ffedc0, 0x08d96551c
dq 0x0f2271e60, 0x10f5ff2ba
dq 0x00b0bf8ca, 0x00bf80dd2
dq 0x123888b7a, 0x00167d312
dq 0x1e888f7dc, 0x18dcddd1c
dq 0x002ee03b2, 0x0f6076544
dq 0x183e8d8fe, 0x06a45d2b2
dq 0x133d7a042, 0x026f6a60a
dq 0x116b0f50c, 0x1dd3e10e8
dq 0x05fabe670, 0x1a2adb74e
dq 0x130004488, 0x0de87806c
dq 0x000bcf5f6, 0x19d34af3a
dq 0x18f0c7078, 0x014338754
dq 0x017f27698, 0x049c3cc9c
dq 0x058ca5f00, 0x15e3e77ee
dq 0x1af900c24, 0x068bce87a
dq 0x0b5cfca28, 0x0dd07448e
dq 0x0ded288f8, 0x1524fa6c6
dq 0x059f229bc, 0x1d8048348
dq 0x06d390dec, 0x16cba8aca
dq 0x037170390, 0x0a3e3e02c
dq 0x06353c1cc, 0x042d98888
dq 0x0c4584f5c, 0x0d73c7bea
dq 0x1f16a3418, 0x1329d9f7e
dq 0x0531377e2, 0x185137662
dq 0x1d8d9ca7c, 0x1b1c69528
dq 0x0b25b29f2, 0x18a08b5bc
dq 0x19fb2a8b0, 0x02178513a
dq 0x1a08fe6ac, 0x1da758ae0
dq 0x045cddf4e, 0x0e0ac139e
dq 0x1a91647f2, 0x169cf9eb0
dq 0x1a0f717c4, 0x0170076fa
;;; func core, ver, snum
slversion crc32_iscsi_01, 01, 03, 0015

86
crc/crc32_iscsi_perf.c Normal file
View File

@ -0,0 +1,86 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/time.h>
#include "crc.h"
#include "test.h"
/*
 * Benchmark configuration.
 *
 * Define CACHED_TEST to loop many times over a small buffer ("_warm");
 * otherwise a buffer larger than the last level cache is used ("_cold").
 * All size macros are fully parenthesised so that they expand safely next
 * to any operator (cast, '/', '%', ...).
 */
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_LEN      (8 * 1024)
# define TEST_LOOPS    1000000
# define TEST_TYPE_STR "_warm"
#else
// Uncached test. Pull from large mem base.
# define GT_L3_CACHE   (32 * 1024 * 1024)	/* some number > last level cache */
# define TEST_LEN      (2 * GT_L3_CACHE)
# define TEST_LOOPS    500
# define TEST_TYPE_STR "_cold"
#endif
// Initial CRC value passed to every call; can be overridden at build time.
#ifndef TEST_SEED
# define TEST_SEED 0x1234
#endif
#define TEST_MEM TEST_LEN
/*
 * Time crc32_iscsi() over a TEST_LEN-byte buffer.
 *
 * Allocates a 1024-byte aligned buffer, makes one untimed call, then times
 * TEST_LOOPS calls and prints the result through perf_print() together with
 * the final CRC value.
 *
 * Returns 0 on success, -1 if the buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
	int i;
	void *buf;
	uint32_t crc;
	struct perf start, stop;

	printf("crc32_iscsi_perf:\n");

	if (posix_memalign(&buf, 1024, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	// posix_memalign() does not initialise the memory. Write the whole
	// buffer once so the CRC is computed over defined data and every page
	// has been touched before the timed loop starts.
	memset(buf, 0, TEST_LEN);

	printf("Start timed tests\n");
	fflush(0);

	// One untimed call before the measurement.
	crc = crc32_iscsi(buf, TEST_LEN, TEST_SEED);

	perf_start(&start);
	for (i = 0; i < TEST_LOOPS; i++) {
		crc = crc32_iscsi(buf, TEST_LEN, TEST_SEED);
	}
	perf_stop(&stop);

	printf("crc32_iscsi" TEST_TYPE_STR ": ");
	// i == TEST_LOOPS here; widen before multiplying so the byte count
	// does not overflow int.
	perf_print(stop, start, (long long)TEST_LEN * i);

	printf("finish 0x%x\n", crc);

	free(buf);
	return 0;
}

171
crc/crc32_iscsi_test.c Normal file
View File

@ -0,0 +1,171 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "crc.h"
#include "types.h"
// 256-entry table of 32-bit CRC constants, one per possible byte value.
// Entry 0x80 is 0x82F63B78, the bit-reflected form of the CRC-32C
// (Castagnoli) polynomial 0x1EDC6F41.
// The contents appear to be identical to crc32_table_iscsi_base in
// crc/crc_base.c.
// NOTE(review): main() in this file never references this table -- confirm
// whether it is still needed here.
unsigned long crc32_table_iscsi[256] = {
0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351,
};
#define PAGESIZE 10240
// Functional test for crc32_iscsi().
//
// Compares the dispatched crc32_iscsi() against crc32_iscsi_base() over a
// range of lengths, buffer offsets and initial CRC values.
//
// Returns 0 when every comparison matches; returns -1 if the buffer cannot be
// allocated or on the first mismatch (after printing the failing case).
int main(void)
{
	unsigned int i, j, good, test, init_crc = 1;
	unsigned char *q_buf;
	int ret = -1;		// stays -1 unless every case below passes

	printf("crc32_iscsi_test: ");

	q_buf = malloc(PAGESIZE);
	if (q_buf == NULL) {
		printf("alloc of q_buf failed\n");
		return -1;
	}
	// fill q_buf with semi-random data
	for (i = 0; i < PAGESIZE; i++)
		q_buf[i] = (unsigned char)(i ^ (13 + (i >> 8)) ^ ((i >> 16) - 13));

	// Test case 1: Compare against base/simple crc32 implementation and
	// try all offsets/alignments of buffer.
	for (j = 0; j < 128; j++) {
		for (i = 0; i < PAGESIZE - j; i++) {
			good = crc32_iscsi_base(q_buf + j, i, -1);
			test = crc32_iscsi(q_buf + j, i, -1);
			if (good != test) {
				// i and j are unsigned, so print them with %u
				printf("Error for size %u offset %u, %08X should be %08X\n",
				       i, j, test, good);
				goto done;
			}
		}		// end for i
		putchar('.');
		fflush(0);
	}			// end for j

	// Test case 2: Also vary initial CRC
	for (j = 0; j < 128; j++) {	// do all offsets
		for (i = 0; i < PAGESIZE - j; i++) {
			good = crc32_iscsi_base(q_buf + j, i, init_crc);
			test = crc32_iscsi(q_buf + j, i, init_crc);
			if (good != test) {
				printf("Error for size %u offset %u, %08X should be %08X\n",
				       i, j, test, good);
				goto done;
			}
			// modify init_crc semi-randomly; the shift count can
			// reach 31, so the shifted constant must be unsigned
			// (1 << 31 on a signed int is undefined behaviour)
			init_crc ^= 1u << ((i * 3 + j * 5) & 31);
		}		// end for i
		putchar('.');
		fflush(0);
	}			// end for j

	// Test case 3: do end of buffer
	for (i = 0; i < PAGESIZE; i++) {
		good = crc32_iscsi_base(q_buf + i, PAGESIZE - i, -1);
		test = crc32_iscsi(q_buf + i, PAGESIZE - i, -1);
		if (good != test) {
			// report the length that was actually hashed, not the
			// starting offset
			printf("Error for size %u at eob, %08X should be %08X\n",
			       PAGESIZE - i, test, good);
			goto done;
		}
	}			// end for i
	putchar('.');
	fflush(0);

	printf("Pass\n");
	ret = 0;

      done:
	free(q_buf);		// release the test buffer on every exit path
	return ret;
}

170
crc/crc_base.c Normal file
View File

@ -0,0 +1,170 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdlib.h>
#include "crc.h"
#define MAX_ITER 8
// Lookup table consumed by crc32_iscsi_base(): it is indexed with
// (low byte of the running CRC) XOR (next input byte).
// Entry 0x80 is 0x82F63B78, the bit-reflected form of the CRC-32C
// (Castagnoli) polynomial 0x1EDC6F41.
uint32_t crc32_table_iscsi_base[256] = {
0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351,
};
// Reference (table-driven) iSCSI CRC.
//
// buffer   - bytes to fold into the CRC
// len      - number of bytes to read from buffer
// crc_init - starting CRC value; it is used as given and the result is
//            returned as computed (no inversion on entry or exit)
//
// Consumes one byte per step: the low CRC byte XOR the input byte selects an
// entry of crc32_table_iscsi_base, which is XORed with the CRC shifted right
// by eight bits.
unsigned int crc32_iscsi_base(unsigned char *buffer, int len, unsigned int crc_init)
{
	unsigned int running = crc_init;
	const unsigned char *cursor = buffer;
	const unsigned char *const stop = buffer + len;

	for (; cursor < stop; cursor++)
		running = crc32_table_iscsi_base[(running ^ *cursor) & 0xFF] ^ (running >> 8);

	return running;
}
// crc16_t10dif baseline function
// Slow crc16 from the definition. Can be sped up with a lookup table.
//
// seed - initial CRC value (used as given, no inversion)
// buf  - bytes to process
// len  - number of bytes in buf
//
// Returns the 16-bit remainder for polynomial 0x8bb7, processing each byte
// most-significant bit first. With len == 0 the seed is returned unchanged.
uint16_t crc16_t10dif_base(uint16_t seed, uint8_t * buf, uint64_t len)
{
	const uint32_t poly = 0x8bb7;	// t10dif standard
	uint32_t rem = seed;	// kept within 16 bits at all times
	uint64_t i;		// same width as len, so lengths above 4 GiB terminate
	unsigned int bit;

	for (i = 0; i < len; i++) {
		rem ^= (uint32_t) buf[i] << 8;
		for (bit = 0; bit < 8; bit++) {	// one step per bit of the byte
			// Bit 15 is about to be shifted out; when it is set
			// the polynomial is folded into the shifted value.
			if (rem & 0x8000)
				rem = ((rem << 1) ^ poly) & 0xFFFF;
			else
				rem = (rem << 1) & 0xFFFF;
		}
	}
	return (uint16_t) rem;
}
// crc32_ieee baseline function
// Slow crc32 from the definition. Can be sped up with a lookup table.
//
// seed - initial CRC value; it is bitwise inverted before use
// buf  - bytes to process
// len  - number of bytes in buf
//
// Returns the bitwise inverse of the 32-bit remainder for polynomial
// 0x04C11DB7, processing each byte most-significant bit first. With len == 0
// the two inversions cancel and the seed is returned unchanged.
uint32_t crc32_ieee_base(uint32_t seed, uint8_t * buf, uint64_t len)
{
	const uint32_t poly = 0x04C11DB7;	// IEEE standard
	uint32_t rem = ~seed;
	uint64_t i;		// same width as len, so lengths above 4 GiB terminate
	unsigned int bit;

	for (i = 0; i < len; i++) {
		// Widen to unsigned before shifting: a byte >= 0x80 shifted
		// left by 24 as a signed int overflows.
		rem ^= (uint32_t) buf[i] << 24;
		for (bit = 0; bit < 8; bit++) {	// one step per bit of the byte
			// Bit 31 is about to be shifted out; when it is set
			// the polynomial is folded into the shifted value.
			if (rem & 0x80000000u)
				rem = (rem << 1) ^ poly;
			else
				rem = rem << 1;
		}
	}
	return ~rem;
}
// Version record emitted alongside each base function in this file.
// Layout: a 16-bit snum followed by 8-bit ver and core fields.
struct slver {
unsigned short snum;
unsigned char ver;
unsigned char core;
};
// Each base function gets two symbols: an uninitialized one whose name ends in
// an 8-hex-digit suffix, and an initialized one holding { snum, ver, core }.
// NOTE(review): the suffix looks like core/ver/snum (e.g. 00 01 011d), yet the
// initializers below carry ver 0x02 -- confirm which value is intended.
struct slver crc32_iscsi_base_slver_0001011d;
struct slver crc32_iscsi_base_slver = { 0x011d, 0x02, 0x00 };
struct slver crc16_t10dif_base_slver_0001011e;
struct slver crc16_t10dif_base_slver = { 0x011e, 0x02, 0x00 };
struct slver crc32_ieee_base_slver_0001011f;
struct slver crc32_ieee_base_slver = { 0x011f, 0x02, 0x00 };

180
crc/crc_multibinary.asm Normal file
View File

@ -0,0 +1,180 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
default rel
[bits 64]
;; WRT_OPT is appended to every external symbol reference in this file.
;; For elf64 output it expands to "wrt ..plt", so the reference goes through
;; the PLT; for any other output format it expands to nothing.
%ifidn __OUTPUT_FORMAT__, elf64
%define WRT_OPT wrt ..plt
%else
%define WRT_OPT
%endif
%include "reg_sizes.asm"
extern crc32_iscsi_00
extern crc32_iscsi_01
extern crc32_iscsi_base
extern crc32_ieee_01
extern crc32_ieee_by4 ;; Optimized for SLM
extern crc32_ieee_base
extern crc16_t10dif_01
extern crc16_t10dif_by4 ;; Optimized for SLM
extern crc16_t10dif_base
section .data
;;; One 64-bit function pointer per multibinary entry point.
;;; Each *_dispatched pointer initially holds the address of *_mbinit and is
;;; overwritten by *_dispatch_init on the first call. Therefore,
;;; *_dispatch_init is only executed on the first call.
crc32_iscsi_dispatched:
dq crc32_iscsi_mbinit
crc32_ieee_dispatched:
dq crc32_ieee_mbinit
crc16_t10dif_dispatched:
dq crc16_t10dif_mbinit
section .text
;;;;
; crc32_iscsi multibinary function
;
; crc32_iscsi is the exported entry point. It jumps through the pointer stored
; at crc32_iscsi_dispatched. That pointer starts out targeting
; crc32_iscsi_mbinit, so the first call runs crc32_iscsi_dispatch_init (which
; overwrites the pointer) and then falls through into crc32_iscsi to jump
; again, this time to the selected implementation.
;;;;
global crc32_iscsi:function
crc32_iscsi_mbinit:
call crc32_iscsi_dispatch_init
crc32_iscsi:
jmp qword [crc32_iscsi_dispatched]
; Picks the crc32_iscsi implementation from CPUID leaf 1 and stores its
; address in crc32_iscsi_dispatched.
; rax, rbx, rcx, rdx and rsi are the only registers modified here; all five
; are pushed on entry and popped before returning.
; Selection, later matches overriding earlier ones:
;   default                         -> crc32_iscsi_base
;   ECX & FLAG_CPUID1_ECX_SSE4_2    -> crc32_iscsi_00
;   ECX & FLAG_CPUID1_ECX_CLMUL     -> crc32_iscsi_01
crc32_iscsi_dispatch_init:
push rax
push rbx
push rcx
push rdx
push rsi
lea rsi, [crc32_iscsi_base WRT_OPT] ; Default
; CPUID leaf 1: the feature flags tested below come back in ECX
mov eax, 1
cpuid
; rbx/rax are reused as scratch for the candidate addresses; the CPUID
; results in EBX/EAX are not needed by this routine
lea rbx, [crc32_iscsi_00 WRT_OPT]
lea rax, [crc32_iscsi_01 WRT_OPT]
; cmovne fires when the tested flag bit is set (test result non-zero)
test ecx, FLAG_CPUID1_ECX_SSE4_2
cmovne rsi, rbx
test ecx, FLAG_CPUID1_ECX_CLMUL
cmovne rsi, rax
mov [crc32_iscsi_dispatched], rsi
pop rsi
pop rdx
pop rcx
pop rbx
pop rax
ret
;;;;
; crc32_ieee multibinary function
;
; crc32_ieee is the exported entry point. It jumps through the pointer stored
; at crc32_ieee_dispatched. That pointer starts out targeting
; crc32_ieee_mbinit, so the first call runs crc32_ieee_dispatch_init (which
; overwrites the pointer) and then falls through into crc32_ieee to jump
; again, this time to the selected implementation.
;;;;
global crc32_ieee:function
crc32_ieee_mbinit:
call crc32_ieee_dispatch_init
crc32_ieee:
jmp qword [crc32_ieee_dispatched]
; Picks the crc32_ieee implementation from CPUID leaf 1 and stores its address
; in crc32_ieee_dispatched.
; rax, rbx, rcx, rdx and rsi are the only registers modified here; all five
; are pushed on entry and popped before returning.
; Selection:
;   FLAG_CPUID1_ECX_SSE3 clear      -> crc32_ieee_base (nothing else is checked)
;   ECX & FLAG_CPUID1_ECX_CLMUL     -> crc32_ieee_01
;   masked EAX == FLAG_CPUID1_EAX_AVOTON -> crc32_ieee_by4 (overrides the above)
;   otherwise                       -> crc32_ieee_base
crc32_ieee_dispatch_init:
push rax
push rbx
push rcx
push rdx
push rsi
lea rsi, [crc32_ieee_base WRT_OPT] ; Default
; CPUID leaf 1: feature flags come back in ECX, the version value in EAX
mov eax, 1
cpuid
; Only rbx and rdx are loaded with candidates, so EAX still holds the CPUID
; version value for the comparison further down
lea rbx, [crc32_ieee_01 WRT_OPT]
lea rdx, [crc32_ieee_by4 WRT_OPT]
test ecx, FLAG_CPUID1_ECX_SSE3
jz use_ieee_base
; cmovne fires when the CLMUL flag bit is set (test result non-zero)
test ecx, FLAG_CPUID1_ECX_CLMUL
cmovne rsi, rbx
; Keep only the EAX bits selected by FLAG_CPUID1_EAX_STEP_MASK, then compare
; with the Avoton signature; a match selects the by4 variant
and eax, FLAG_CPUID1_EAX_STEP_MASK
cmp eax, FLAG_CPUID1_EAX_AVOTON
cmove rsi, rdx
use_ieee_base:
mov [crc32_ieee_dispatched], rsi
pop rsi
pop rdx
pop rcx
pop rbx
pop rax
ret
;;;;
; crc16_t10dif multibinary function
;
; crc16_t10dif is the exported entry point. It jumps through the pointer stored
; at crc16_t10dif_dispatched. That pointer starts out targeting
; crc16_t10dif_mbinit, so the first call runs crc16_t10dif_dispatch_init (which
; overwrites the pointer) and then falls through into crc16_t10dif to jump
; again, this time to the selected implementation.
;;;;
global crc16_t10dif:function
crc16_t10dif_mbinit:
call crc16_t10dif_dispatch_init
crc16_t10dif:
jmp qword [crc16_t10dif_dispatched]
; Picks the crc16_t10dif implementation from CPUID leaf 1 and stores its
; address in crc16_t10dif_dispatched.
; rax, rbx, rcx, rdx and rsi are the only registers modified here; all five
; are pushed on entry and popped before returning.
; Selection:
;   FLAG_CPUID1_ECX_SSE3 clear      -> crc16_t10dif_base (nothing else is checked)
;   ECX & FLAG_CPUID1_ECX_CLMUL     -> crc16_t10dif_01
;   masked EAX == FLAG_CPUID1_EAX_AVOTON -> crc16_t10dif_by4 (overrides the above)
;   otherwise                       -> crc16_t10dif_base
crc16_t10dif_dispatch_init:
push rax
push rbx
push rcx
push rdx
push rsi
lea rsi, [crc16_t10dif_base WRT_OPT] ; Default
; CPUID leaf 1: feature flags come back in ECX, the version value in EAX
mov eax, 1
cpuid
; Only rbx and rdx are loaded with candidates, so EAX still holds the CPUID
; version value for the comparison further down
lea rbx, [crc16_t10dif_01 WRT_OPT]
lea rdx, [crc16_t10dif_by4 WRT_OPT]
test ecx, FLAG_CPUID1_ECX_SSE3
jz use_t10dif_base
; cmovne fires when the CLMUL flag bit is set (test result non-zero)
test ecx, FLAG_CPUID1_ECX_CLMUL
cmovne rsi, rbx
; Keep only the EAX bits selected by FLAG_CPUID1_EAX_STEP_MASK, then compare
; with the Avoton signature; a match selects the by4 variant
and eax, FLAG_CPUID1_EAX_STEP_MASK
cmp eax, FLAG_CPUID1_EAX_AVOTON
cmove rsi, rdx
use_t10dif_base:
mov [crc16_t10dif_dispatched], rsi
pop rsi
pop rdx
pop rcx
pop rbx
pop rax
ret
;;; Version records for the three multibinary entry points.
;;; slversion is not defined in this file; presumably it comes from
;;; reg_sizes.asm, which is included above -- confirm.
;;; Argument order:
;;; func core, ver, snum
slversion crc16_t10dif, 00, 03, 011a
slversion crc32_ieee, 00, 03, 011b
slversion crc32_iscsi, 00, 03, 011c

63
crc/crc_simple_test.c Normal file
View File

@ -0,0 +1,63 @@
/**********************************************************************
Copyright(c) 2011-2013 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdint.h>
#include "crc.h"
const uint16_t init_crc_16 = 0x1234;
const uint16_t t10_dif_expected = 0x60b3;
const uint32_t init_crc_32 = 0x12345678;
const uint32_t ieee_expected = 0x2ceadbe3;
// Minimal known-answer check for crc16_t10dif() and crc32_ieee().
//
// Hashes the 48 bytes 0x00..0x2f with fixed initial values and compares the
// results with the expected constants defined at the top of this file.
// Prints CORRECT/WRONG for each algorithm.
//
// Returns 0 when both values match and -1 when either one is wrong, so that a
// mismatch is visible to whatever runs this test.
int main(void)
{
	unsigned char p_buf[48];
	uint16_t t10_dif_computed;
	uint32_t ieee_computed;
	int i;
	int ret = 0;

	for (i = 0; i < (int)sizeof(p_buf); i++)
		p_buf[i] = (unsigned char)i;

	t10_dif_computed = crc16_t10dif(init_crc_16, p_buf, sizeof(p_buf));
	if (t10_dif_computed != t10_dif_expected) {
		printf("WRONG CRC-16(T10 DIF) value\n");
		ret = -1;
	} else {
		printf("CORRECT CRC-16(T10 DIF) value\n");
	}

	ieee_computed = crc32_ieee(init_crc_32, p_buf, sizeof(p_buf));
	if (ieee_computed != ieee_expected) {
		printf("WRONG CRC-32(IEEE) value\n");
		ret = -1;
	} else {
		printf("CORRECT CRC-32(IEEE) value\n");
	}

	return ret;
}

134
include/crc.h Normal file
View File

@ -0,0 +1,134 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
/**
* @file crc.h
* @brief CRC functions.
*/
#ifndef _CRC_H_
#define _CRC_H_
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Multi-binary functions */
/**
* @brief Generate CRC from the T10 standard, runs appropriate version.
*
* This function determines what instruction sets are enabled and
* selects the appropriate version at runtime.
*
* The selection is made on the first call by the dispatcher in
* crc/crc_multibinary.asm; crc16_t10dif_base() is the fallback.
*
* @returns 16 bit CRC
*/
uint16_t crc16_t10dif(
uint16_t init_crc, //!< initial CRC value, 16 bits
const unsigned char *buf, //!< buffer to calculate CRC on
uint64_t len //!< buffer length in bytes (64-bit data)
);
/**
* @brief Generate CRC from the IEEE standard, runs appropriate version.
*
* This function determines what instruction sets are enabled and
* selects the appropriate version at runtime.
*
* The selection is made on the first call by the dispatcher in
* crc/crc_multibinary.asm; crc32_ieee_base() is the fallback.
*
* @returns 32 bit CRC
*/
uint32_t crc32_ieee(
uint32_t init_crc, //!< initial CRC value, 32 bits
const unsigned char *buf, //!< buffer to calculate CRC on
uint64_t len //!< buffer length in bytes (64-bit data)
);
/**
* @brief ISCSI CRC function, runs appropriate version.
*
* This function determines what instruction sets are enabled and
* selects the appropriate version at runtime.
*
* The selection is made on the first call by the dispatcher in
* crc/crc_multibinary.asm; crc32_iscsi_base() is the fallback.
*
* @note Argument order differs from crc16_t10dif() and crc32_ieee(): the
* buffer comes first and the initial CRC last, and len is an int.
*
* @returns 32 bit CRC
*/
unsigned int crc32_iscsi(
unsigned char *buffer, //!< buffer to calculate CRC on
int len, //!< buffer length in bytes
unsigned int init_crc //!< initial CRC value
);
/* Base functions */
/**
* @brief ISCSI CRC function, baseline version
*
* Table-driven, one input byte per step. crc_init is used as given and the
* result is returned as computed; no inversion is applied on entry or exit.
*
* @returns 32 bit CRC
*/
unsigned int crc32_iscsi_base(
unsigned char *buffer, //!< buffer to calculate CRC on
int len, //!< buffer length in bytes
unsigned int crc_init //!< initial CRC value
);
/**
* @brief Generate CRC from the T10 standard, runs baseline version
*
* Bit-by-bit implementation using polynomial 0x8bb7, most-significant bit
* first. The seed is used as given; no inversion is applied on entry or exit.
*
* @returns 16 bit CRC
*/
uint16_t crc16_t10dif_base(
uint16_t seed, //!< initial CRC value, 16 bits
uint8_t *buf, //!< buffer to calculate CRC on
uint64_t len //!< buffer length in bytes (64-bit data)
);
/**
* @brief Generate CRC from the IEEE standard, runs baseline version
*
* Bit-by-bit implementation using polynomial 0x04C11DB7, most-significant bit
* first. The seed is bitwise inverted before use and the remainder is
* bitwise inverted again before it is returned.
*
* @returns 32 bit CRC
*/
uint32_t crc32_ieee_base(
uint32_t seed, //!< initial CRC value, 32 bits
uint8_t *buf, //!< buffer to calculate CRC on
uint64_t len //!< buffer length in bytes (64-bit data)
);
#ifdef __cplusplus
}
#endif
#endif // _CRC_H_

View File

@ -68,7 +68,8 @@
%define FLAG_XGETBV_EAX_XMM_YMM 0x6
%define FLAG_XGETBV_EAX_ZMM_OPM 0xe0
%define FLAG_CPUID1_EAX_AVOTON 0x000406d0
%define FLAG_CPUID1_EAX_AVOTON 0x000406d0
%define FLAG_CPUID1_EAX_STEP_MASK 0xfffffff0
; define d and w variants for registers

View File

@ -69,3 +69,9 @@ pq_gen_base @65
xor_gen_base @66
xor_check_base @67
pq_check_base @68
crc16_t10dif @69
crc32_ieee @70
crc32_iscsi @71
crc16_t10dif_base @72
crc32_ieee_base @73
crc32_iscsi_base @74