mirror of
https://github.com/intel/isa-l.git
synced 2024-12-12 09:23:50 +01:00
crc: use k-mask to load final bytes of data
Change-Id: Ibd8d2144bc6942e11911e25a6365c1cb108af477
Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
This commit is contained in:
parent
9f2b68f057
commit
22d33cf795
@@ -61,32 +61,25 @@ section .text
|
|||||||
%xdefine arg3 rdx
|
%xdefine arg3 rdx
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
%ifidn __OUTPUT_FORMAT__, win64
|
|
||||||
%define XMM_SAVE 16*2
|
|
||||||
%define VARIABLE_OFFSET 16*12+8
|
|
||||||
%else
|
|
||||||
%define VARIABLE_OFFSET 16*2+8
|
|
||||||
%endif
|
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
mk_global FUNCTION_NAME, function
|
mk_global FUNCTION_NAME, function
|
||||||
FUNCTION_NAME:
|
FUNCTION_NAME:
|
||||||
endbranch
|
endbranch
|
||||||
not arg1
|
not arg1
|
||||||
sub rsp, VARIABLE_OFFSET
|
|
||||||
|
|
||||||
%ifidn __OUTPUT_FORMAT__, win64
|
%ifidn __OUTPUT_FORMAT__, win64
|
||||||
|
sub rsp, 16*10 + 8
|
||||||
|
|
||||||
; push the xmm registers into the stack to maintain
|
; push the xmm registers into the stack to maintain
|
||||||
vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
|
vmovdqa [rsp + 16*0], xmm6
|
||||||
vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
|
vmovdqa [rsp + 16*1], xmm7
|
||||||
vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
|
vmovdqa [rsp + 16*2], xmm8
|
||||||
vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
|
vmovdqa [rsp + 16*3], xmm9
|
||||||
vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
|
vmovdqa [rsp + 16*4], xmm10
|
||||||
vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
|
vmovdqa [rsp + 16*5], xmm11
|
||||||
vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
|
vmovdqa [rsp + 16*6], xmm12
|
||||||
vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
|
vmovdqa [rsp + 16*7], xmm13
|
||||||
vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
|
vmovdqa [rsp + 16*8], xmm14
|
||||||
vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
|
vmovdqa [rsp + 16*9], xmm15
|
||||||
%endif
|
%endif
|
||||||
vbroadcasti32x4 zmm18, [SHUF_MASK]
|
vbroadcasti32x4 zmm18, [SHUF_MASK]
|
||||||
cmp arg3, 256
|
cmp arg3, 256
|
||||||
@@ -296,18 +289,19 @@ _cleanup:
|
|||||||
|
|
||||||
|
|
||||||
%ifidn __OUTPUT_FORMAT__, win64
|
%ifidn __OUTPUT_FORMAT__, win64
|
||||||
vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
|
vmovdqa xmm6, [rsp + 16*0]
|
||||||
vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
|
vmovdqa xmm7, [rsp + 16*1]
|
||||||
vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
|
vmovdqa xmm8, [rsp + 16*2]
|
||||||
vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
|
vmovdqa xmm9, [rsp + 16*3]
|
||||||
vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
|
vmovdqa xmm10, [rsp + 16*4]
|
||||||
vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
|
vmovdqa xmm11, [rsp + 16*5]
|
||||||
vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
|
vmovdqa xmm12, [rsp + 16*6]
|
||||||
vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
|
vmovdqa xmm13, [rsp + 16*7]
|
||||||
vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
|
vmovdqa xmm14, [rsp + 16*8]
|
||||||
vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
|
vmovdqa xmm15, [rsp + 16*9]
|
||||||
|
|
||||||
|
add rsp, 16*10 + 8
|
||||||
%endif
|
%endif
|
||||||
add rsp, VARIABLE_OFFSET
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
@@ -364,62 +358,16 @@ _less_than_32:
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
_less_than_16_left:
|
_less_than_16_left:
|
||||||
; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
|
lea rax, [rel byte_len_to_mask_table]
|
||||||
|
kmovw k1, [rax + arg3*2]
|
||||||
vpxor xmm1, xmm1
|
vmovdqu8 xmm7{k1}{z}, [arg2]
|
||||||
mov r11, rsp
|
|
||||||
vmovdqa [r11], xmm1
|
|
||||||
|
|
||||||
; backup the counter value
|
|
||||||
mov r9, arg3
|
|
||||||
cmp arg3, 8
|
|
||||||
jl _less_than_8_left
|
|
||||||
|
|
||||||
; load 8 Bytes
|
|
||||||
mov rax, [arg2]
|
|
||||||
mov [r11], rax
|
|
||||||
add r11, 8
|
|
||||||
sub arg3, 8
|
|
||||||
add arg2, 8
|
|
||||||
_less_than_8_left:
|
|
||||||
|
|
||||||
cmp arg3, 4
|
|
||||||
jl _less_than_4_left
|
|
||||||
|
|
||||||
; load 4 Bytes
|
|
||||||
mov eax, [arg2]
|
|
||||||
mov [r11], eax
|
|
||||||
add r11, 4
|
|
||||||
sub arg3, 4
|
|
||||||
add arg2, 4
|
|
||||||
_less_than_4_left:
|
|
||||||
|
|
||||||
cmp arg3, 2
|
|
||||||
jl _less_than_2_left
|
|
||||||
|
|
||||||
; load 2 Bytes
|
|
||||||
mov ax, [arg2]
|
|
||||||
mov [r11], ax
|
|
||||||
add r11, 2
|
|
||||||
sub arg3, 2
|
|
||||||
add arg2, 2
|
|
||||||
_less_than_2_left:
|
|
||||||
cmp arg3, 1
|
|
||||||
jl _zero_left
|
|
||||||
|
|
||||||
; load 1 Byte
|
|
||||||
mov al, [arg2]
|
|
||||||
mov [r11], al
|
|
||||||
|
|
||||||
_zero_left:
|
|
||||||
vmovdqa xmm7, [rsp]
|
|
||||||
vpshufb xmm7, xmm18
|
vpshufb xmm7, xmm18
|
||||||
vpxor xmm7, xmm0 ; xor the initial crc value
|
vpxor xmm7, xmm0 ; xor the initial crc value
|
||||||
|
|
||||||
lea rax, [pshufb_shf_table + 16]
|
lea rax, [pshufb_shf_table + 16]
|
||||||
sub rax, r9
|
sub rax, arg3
|
||||||
|
|
||||||
cmp r9, 8
|
cmp arg3, 8
|
||||||
jl _end_1to7
|
jl _end_1to7
|
||||||
|
|
||||||
_end_8to15:
|
_end_8to15:
|
||||||
@@ -482,7 +430,6 @@ INCLUDE_CONSTS
|
|||||||
%endif
|
%endif
|
||||||
|
|
||||||
mask1: dq 0x8080808080808080, 0x8080808080808080
|
mask1: dq 0x8080808080808080, 0x8080808080808080
|
||||||
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
|
|
||||||
mask3: dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
|
mask3: dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
|
||||||
|
|
||||||
SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
|
SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
|
||||||
@@ -510,6 +457,13 @@ dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
|
|||||||
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
|
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
|
||||||
dq 0x8080808080808080, 0x8080808080808080
|
dq 0x8080808080808080, 0x8080808080808080
|
||||||
|
|
||||||
|
align 16
|
||||||
|
byte_len_to_mask_table:
|
||||||
|
dw 0x0000, 0x0001, 0x0003, 0x0007,
|
||||||
|
dw 0x000f, 0x001f, 0x003f, 0x007f,
|
||||||
|
dw 0x00ff, 0x01ff, 0x03ff, 0x07ff,
|
||||||
|
dw 0x0fff, 0x1fff, 0x3fff, 0x7fff,
|
||||||
|
|
||||||
|
|
||||||
%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
|
%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
|
||||||
%ifidn __OUTPUT_FORMAT__, win64
|
%ifidn __OUTPUT_FORMAT__, win64
|
||||||
|
Loading…
Reference in New Issue
Block a user