;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifndef BUFFER_UTILS
%define BUFFER_UTILS

%include "options.asm"

extern pshufb_shf_table
extern mask3

%ifdef FIX_CACHE_READ
%define vmovntdqa vmovdqa
%else
%macro prefetchnta 1
%endm
%endif
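
; Note: when FIX_CACHE_READ is defined, the non-temporal loads (vmovntdqa)
; used below are demoted to ordinary aligned loads; otherwise prefetchnta is
; turned into an empty macro so any prefetch hints expand to nothing.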

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; code for doing the CRC calculation as part of copy-in, using pclmulqdq
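
; The FOLD* macros below implement CRC folding with carry-less multiplication:
; each 128-bit state register is multiplied by a precomputed fold constant
; (two vpclmulqdq products, one per 64-bit half, selected by the 0x01 and 0x10
; immediates) and the products are XORed together; the caller then XORs the
; folded state with the newly copied data.  The fold constant itself is
; supplied by the caller in the "const"/"xfold" argument; it is not defined
; in this file.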
; "shift" 4 input registers down 4 places
|
|
; macro FOLD4 xmm0, xmm1, xmm2, xmm3, const, tmp0, tmp1
|
|
%macro FOLD4 7
|
|
%define %%xmm0 %1 ; xmm reg, in/out
|
|
%define %%xmm1 %2 ; xmm reg, in/out
|
|
%define %%xmm2 %3 ; xmm reg, in/out
|
|
%define %%xmm3 %4 ; xmm reg, in/out
|
|
%define %%const %5 ; xmm reg, in
|
|
%define %%tmp0 %6 ; xmm reg, tmp
|
|
%define %%tmp1 %7 ; xmm reg, tmp
|
|
|
|
vmovaps %%tmp0, %%xmm0
|
|
vmovaps %%tmp1, %%xmm1
|
|
|
|
vpclmulqdq %%xmm0, %%const, 0x01
|
|
vpclmulqdq %%xmm1, %%const, 0x01
|
|
|
|
vpclmulqdq %%tmp0, %%const, 0x10
|
|
vpclmulqdq %%tmp1, %%const, 0x10
|
|
|
|
vxorps %%xmm0, %%tmp0
|
|
vxorps %%xmm1, %%tmp1
|
|
|
|
|
|
vmovaps %%tmp0, %%xmm2
|
|
vmovaps %%tmp1, %%xmm3
|
|
|
|
vpclmulqdq %%xmm2, %%const, 0x01
|
|
vpclmulqdq %%xmm3, %%const, 0x01
|
|
|
|
vpclmulqdq %%tmp0, %%const, 0x10
|
|
vpclmulqdq %%tmp1, %%const, 0x10
|
|
|
|
vxorps %%xmm2, %%tmp0
|
|
vxorps %%xmm3, %%tmp1
|
|
%endm
|
|
|
|
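
; Usage sketch (illustrative only; the register assignments below are
; hypothetical, not fixed by this file).  With the running CRC state in
; xmm0-xmm3 and the fold-by-4 constant already loaded into xmm15, one round
; of a 64-bytes-per-iteration loop folds the state and then XORs in the
; freshly copied data, as COPY_IN_CRC does below:
;
;	FOLD4	xmm0, xmm1, xmm2, xmm3, xmm15, xmm8, xmm9
;	vpxor	xmm0, xmm4		; xmm4-xmm7 = newly loaded 64 bytes
;	vpxor	xmm1, xmm5
;	vpxor	xmm2, xmm6
;	vpxor	xmm3, xmm7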

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 3 input registers down 4 places
; macro FOLD3	x0, x1, x2, x3, const, tmp0
;      x0  x1  x2  x3
; In    A   B   C   D
; Out   D  A'  B'  C'
%macro FOLD3	6
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%const	%5	; xmm reg, in
%define	%%tmp0	%6	; xmm reg, tmp

	vmovdqa		%%tmp0, %%x3

	vmovaps		%%x3, %%x2
	vpclmulqdq	%%x2, %%const, 0x01
	vpclmulqdq	%%x3, %%const, 0x10
	vxorps		%%x3, %%x2

	vmovaps		%%x2, %%x1
	vpclmulqdq	%%x1, %%const, 0x01
	vpclmulqdq	%%x2, %%const, 0x10
	vxorps		%%x2, %%x1

	vmovaps		%%x1, %%x0
	vpclmulqdq	%%x0, %%const, 0x01
	vpclmulqdq	%%x1, %%const, 0x10
	vxorps		%%x1, %%x0

	vmovdqa		%%x0, %%tmp0
%endm
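
; Note: FOLD3 is the three-block variant used by COPY_IN_CRC when 48 bytes
; (three full 16-byte blocks) remain: A, B and C are folded into x1-x3, where
; the caller XORs them with the three newly read blocks, while D rotates down
; to x0 unchanged.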

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 2 input registers down 4 places
; macro FOLD2	x0, x1, x2, x3, const, tmp0
;      x0  x1  x2  x3
; In    A   B   C   D
; Out   C   D  A'  B'
%macro FOLD2	6
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%const	%5	; xmm reg, in
%define	%%tmp0	%6	; xmm reg, tmp

	vmovdqa		%%tmp0, %%x3

	vmovaps		%%x3, %%x1
	vpclmulqdq	%%x1, %%const, 0x01
	vpclmulqdq	%%x3, %%const, 0x10
	vxorps		%%x3, %%x1

	vmovdqa		%%x1, %%tmp0
	vmovdqa		%%tmp0, %%x2

	vmovaps		%%x2, %%x0
	vpclmulqdq	%%x0, %%const, 0x01
	vpclmulqdq	%%x2, %%const, 0x10
	vxorps		%%x2, %%x0

	vmovdqa		%%x0, %%tmp0
%endm
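
; Note: FOLD2 is the two-block variant used when 32 bytes remain: A and B are
; folded into x2/x3 (and then XORed with the two newly read blocks by the
; caller), while C and D rotate down to x0/x1 unchanged.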

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 1 input register down 4 places
; macro FOLD1	x0, x1, x2, x3, const, tmp0
;      x0  x1  x2  x3
; In    A   B   C   D
; Out   B   C   D  A'
%macro FOLD1	6
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%const	%5	; xmm reg, in
%define	%%tmp0	%6	; xmm reg, tmp

	vmovdqa		%%tmp0, %%x3

	vmovaps		%%x3, %%x0
	vpclmulqdq	%%x0, %%const, 0x01
	vpclmulqdq	%%x3, %%const, 0x10
	vxorps		%%x3, %%x0

	vmovdqa		%%x0, %%x1
	vmovdqa		%%x1, %%x2
	vmovdqa		%%x2, %%tmp0
%endm
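
; Note: FOLD1 is the single-block variant used when 16 bytes remain: only A
; is folded (into x3, where the caller XORs it with the newly read block);
; B, C and D rotate down one position unchanged.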

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; macro PARTIAL_FOLD	x0, x1, x2, x3, xp, size, const, shl, shr, tmp2, tmp3, gtmp

;                  XP    X3    X2    X1    X0    tmp2
; Initial state    xI    HG    FE    DC    BA
; after shift            IH    GF    ED    CB    A0
; after fold             ff    GF    ED    CB         ff = merge(IH, A0)
;
%macro PARTIAL_FOLD 12
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%xp	%5	; xmm partial reg, in/clobbered
%define	%%size	%6	; GPR, in/clobbered (1...15)
%define	%%const	%7	; xmm reg, in
%define	%%shl	%8	; xmm reg, tmp
%define	%%shr	%9	; xmm reg, tmp
%define	%%tmp2	%10	; xmm reg, tmp
%define	%%tmp3	%11	; xmm reg, tmp
%define	%%gtmp	%12	; GPR, tmp

	; {XP X3 X2 X1 X0} = {xI HG FE DC BA}
	shl	%%size, 4				; size *= 16
	lea	%%gtmp, [pshufb_shf_table - 16 WRT_OPT]
	vmovdqa	%%shl, [%%gtmp + %%size]		; shl constant
	vmovdqa	%%shr, %%shl
	vpxor	%%shr, [mask3 WRT_OPT]			; shr constant

	vmovdqa	%%tmp2, %%x0		; tmp2 = BA
	vpshufb	%%tmp2, %%shl		; tmp2 = A0

	vpshufb	%%x0, %%shr		; x0 = 0B
	vmovdqa	%%tmp3, %%x1		; tmp3 = DC
	vpshufb	%%tmp3, %%shl		; tmp3 = C0
	vpor	%%x0, %%tmp3		; x0 = CB

	vpshufb	%%x1, %%shr		; x1 = 0D
	vmovdqa	%%tmp3, %%x2		; tmp3 = FE
	vpshufb	%%tmp3, %%shl		; tmp3 = E0
	vpor	%%x1, %%tmp3		; x1 = ED

	vpshufb	%%x2, %%shr		; x2 = 0F
	vmovdqa	%%tmp3, %%x3		; tmp3 = HG
	vpshufb	%%tmp3, %%shl		; tm3 = G0
	vpor	%%x2, %%tmp3		; x2 = GF

	vpshufb	%%x3, %%shr		; x3 = 0H
	vpshufb	%%xp, %%shl		; xp = I0
	vpor	%%x3, %%xp		; x3 = IH

	; fold tmp2 into X3
	vmovaps		%%tmp3, %%tmp2
	vpclmulqdq	%%tmp2, %%const, 0x01
	vpclmulqdq	%%tmp3, %%const, 0x10
	vxorps		%%x3, %%tmp2
	vxorps		%%x3, %%tmp3
%endm
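
; Usage sketch (illustrative; the register choices below are hypothetical).
; PARTIAL_FOLD advances the CRC state by "size" bytes (1...15) that have just
; been copied: xp holds the 16-byte block containing those bytes, and the
; pshufb_shf_table / mask3 constants are used to build the byte-shift masks.
; A call after copying a short tail might look like:
;
;	mov	rax, rdx		; rdx = tail length, 1...15
;	PARTIAL_FOLD	xmm0, xmm1, xmm2, xmm3, xmm4, rax, xmm15, \
;			xmm8, xmm9, xmm10, xmm11, rbx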

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LOAD_FRACTIONAL_XMM: packs an xmm register with data when the input is less than 16 bytes.
; Returns a zeroed xmm_out if size is 0.
; Input: the input data (src) and that data's length in bytes (size).
; Output: the packed xmm register (xmm_out).
; size is clobbered.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro LOAD_FRACTIONAL_XMM 3
%define	%%xmm_out	%1	; %%xmm_out is an xmm register
%define	%%src		%2
%define	%%size		%3

	vpxor	%%xmm_out, %%xmm_out

	cmp	%%size, 0
	je	%%_done

	add	%%src, %%size

	cmp	%%size, 8
	jl	%%_byte_loop

	sub	%%src, 8
	vpinsrq	%%xmm_out, [%%src], 0		; read in 8 bytes if they exist
	sub	%%size, 8

	je	%%_done

%%_byte_loop:					; read in data 1 byte at a time while data is left
	vpslldq	%%xmm_out, 1

	dec	%%src
	vpinsrb	%%xmm_out, BYTE [%%src], 0
	dec	%%size

	jg	%%_byte_loop

%%_done:

%endmacro ; LOAD_FRACTIONAL_XMM
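
; Usage sketch (illustrative; the register choices are hypothetical).  Load a
; trailing fragment of fewer than 16 bytes into xmm1, with the byte count in
; rax and the source pointer in rsi:
;
;	mov	rax, rdx		; rdx = remaining length, 0...15
;	LOAD_FRACTIONAL_XMM	xmm1, rsi, rax
;
; rax is clobbered, as noted in the macro header.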

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; copy x bytes (rounded up to 16 bytes) from src to dst
; src & dst are unaligned
; macro COPY_IN_CRC	dst, src, size_in_bytes, tmp, x0, x1, x2, x3, xfold,
;			xt0, xt1, xt2, xt3, xt4
%macro COPY_IN_CRC 14
%define	%%dst	%1	; reg, in/clobbered
%define	%%src	%2	; reg, in/clobbered
%define	%%size	%3	; reg, in/clobbered
%define	%%tmp	%4	; reg, tmp
%define	%%x0	%5	; xmm, in/out: crc state
%define	%%x1	%6	; xmm, in/out: crc state
%define	%%x2	%7	; xmm, in/out: crc state
%define	%%x3	%8	; xmm, in/out: crc state
%define	%%xfold	%9	; xmm, in: (loaded from fold4)
%define	%%xtmp0	%10	; xmm, tmp
%define	%%xtmp1	%11	; xmm, tmp
%define	%%xtmp2	%12	; xmm, tmp
%define	%%xtmp3	%13	; xmm, tmp
%define	%%xtmp4	%14	; xmm, tmp

	cmp	%%size, 16
	jl	%%lt_16

	; align source
	xor	%%tmp, %%tmp
	sub	%%tmp, %%src
	and	%%tmp, 15
	jz	%%already_aligned

	; need to align, tmp contains number of bytes to transfer
	vmovdqu	%%xtmp0, [%%src]
	vmovdqu	[%%dst], %%xtmp0
	add	%%dst, %%tmp
	add	%%src, %%tmp
	sub	%%size, %%tmp

%ifndef DEFLATE
	push	%%dst

	PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
			%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
	pop	%%dst
%endif

%%already_aligned:
	sub	%%size, 64
	jl	%%end_loop
	jmp	%%loop
align 16
%%loop:
	vmovntdqa	%%xtmp0, [%%src+0*16]
	vmovntdqa	%%xtmp1, [%%src+1*16]
	vmovntdqa	%%xtmp2, [%%src+2*16]

%ifndef DEFLATE
	FOLD4	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3, %%xtmp4
%endif
	vmovntdqa	%%xtmp3, [%%src+3*16]

	vmovdqu	[%%dst+0*16], %%xtmp0
	vmovdqu	[%%dst+1*16], %%xtmp1
	vmovdqu	[%%dst+2*16], %%xtmp2
	vmovdqu	[%%dst+3*16], %%xtmp3

%ifndef DEFLATE
	vpxor	%%x0, %%xtmp0
	vpxor	%%x1, %%xtmp1
	vpxor	%%x2, %%xtmp2
	vpxor	%%x3, %%xtmp3
%endif
	add	%%src,  4*16
	add	%%dst,  4*16
	sub	%%size, 4*16
	jge	%%loop

%%end_loop:
	; %%size contains (num bytes left - 64)
	add	%%size, 16
	jge	%%three_full_regs
	add	%%size, 16
	jge	%%two_full_regs
	add	%%size, 16
	jge	%%one_full_reg
	add	%%size, 16

%%no_full_regs:		; 0 <= %%size < 16, no full regs
	jz	%%done	; if no bytes left, we're done
	jmp	%%partial

	;; Handle case where input is < 16 bytes
%%lt_16:
	test	%%size, %%size
	jz	%%done	; if no bytes left, we're done
	jmp	%%partial


%%one_full_reg:
	vmovntdqa	%%xtmp0, [%%src+0*16]

%ifndef DEFLATE
	FOLD1	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
	vmovdqu	[%%dst+0*16], %%xtmp0

%ifndef DEFLATE
	vpxor	%%x3, %%xtmp0
%endif
	test	%%size, %%size
	jz	%%done	; if no bytes left, we're done

	add	%%dst, 1*16
	add	%%src, 1*16
	jmp	%%partial


%%two_full_regs:
	vmovntdqa	%%xtmp0, [%%src+0*16]
	vmovntdqa	%%xtmp1, [%%src+1*16]

%ifndef DEFLATE
	FOLD2	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
	vmovdqu	[%%dst+0*16], %%xtmp0
	vmovdqu	[%%dst+1*16], %%xtmp1

%ifndef DEFLATE
	vpxor	%%x2, %%xtmp0
	vpxor	%%x3, %%xtmp1
%endif
	test	%%size, %%size
	jz	%%done	; if no bytes left, we're done

	add	%%dst, 2*16
	add	%%src, 2*16
	jmp	%%partial


%%three_full_regs:
	vmovntdqa	%%xtmp0, [%%src+0*16]
	vmovntdqa	%%xtmp1, [%%src+1*16]
	vmovntdqa	%%xtmp2, [%%src+2*16]

%ifndef DEFLATE
	FOLD3	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
	vmovdqu	[%%dst+0*16], %%xtmp0
	vmovdqu	[%%dst+1*16], %%xtmp1
	vmovdqu	[%%dst+2*16], %%xtmp2

%ifndef DEFLATE
	vpxor	%%x1, %%xtmp0
	vpxor	%%x2, %%xtmp1
	vpxor	%%x3, %%xtmp2
%endif
	test	%%size, %%size
	jz	%%done	; if no bytes left, we're done

	add	%%dst, 3*16
	add	%%src, 3*16

	; fall through to %%partial
%%partial:		; 0 <= %%size < 16

%ifndef DEFLATE
	mov	%%tmp, %%size
%endif

	LOAD_FRACTIONAL_XMM %%xtmp0, %%src, %%size

	vmovdqu	[%%dst], %%xtmp0

%ifndef DEFLATE
	PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
			%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
%endif

%%done:
%endm
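
; Usage note (illustrative): COPY_IN_CRC copies %%size bytes from src to dst
; (with stores rounded up to 16-byte units, as noted above) and, unless
; DEFLATE is defined, folds every copied byte into the four-register CRC
; state x0-x3 using the fold constant in xfold.  A hypothetical invocation
; might look like:
;
;	COPY_IN_CRC	rcx, rdx, r8, r9, xmm0, xmm1, xmm2, xmm3, xmm15, \
;			xmm4, xmm5, xmm6, xmm7, xmm8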

;%assign K   1024;
;%assign D   8 * K;	; Amount of history
;%assign LA  17 * 16;	; Max look-ahead, rounded up to 32 byte boundary

; copy D + LA bytes from src to dst
; dst is aligned
;void copy_D_LA(uint8_t *dst, uint8_t *src);
; arg 1: rcx : dst
; arg 2: rdx : src
; copy_D_LA	dst, src, tmp, xtmp0, xtmp1, xtmp2, xtmp3
%macro copy_D_LA 7
%define	%%dst	%1	; reg, clobbered
%define	%%src	%2	; reg, clobbered
%define	%%tmp	%3
%define	%%ytmp0	%4
%define	%%ytmp1	%5
%define	%%ytmp2	%6
%define	%%ytmp3	%7

%define	%%xtmp0	%4x

%assign %%SIZE	(D + LA) / 32	; number of 32-byte blocks to be copied
%assign %%SIZE4	%%SIZE/4
%assign %%MOD16	((D + LA) - 32 * %%SIZE) / 16

	lea	%%tmp, [%%dst + 4 * 32 * %%SIZE4]
	jmp	%%copy_D_LA_1
align 16
%%copy_D_LA_1:
	vmovdqu	%%ytmp0, [%%src]
	vmovdqu	%%ytmp1, [%%src + 1 * 32]
	vmovdqu	%%ytmp2, [%%src + 2 * 32]
	vmovdqu	%%ytmp3, [%%src + 3 * 32]
	vmovdqa	[%%dst], %%ytmp0
	vmovdqa	[%%dst + 1 * 32], %%ytmp1
	vmovdqa	[%%dst + 2 * 32], %%ytmp2
	vmovdqa	[%%dst + 3 * 32], %%ytmp3
	add	%%src, 4*32
	add	%%dst, 4*32
	cmp	%%dst, %%tmp
	jne	%%copy_D_LA_1
%assign %%i 0
%rep (%%SIZE - 4 * %%SIZE4)

	%if (%%i == 0)
	vmovdqu	%%ytmp0, [%%src + %%i*32]
	%elif (%%i == 1)
	vmovdqu	%%ytmp1, [%%src + %%i*32]
	%elif (%%i == 2)
	vmovdqu	%%ytmp2, [%%src + %%i*32]
	%elif (%%i == 3)
	vmovdqu	%%ytmp3, [%%src + %%i*32]
	%else
	%error too many i
	%endif

%assign %%i %%i+1
%endrep
%assign %%i 0
%rep (%%SIZE - 4 * %%SIZE4)

	%if (%%i == 0)
	vmovdqa	[%%dst + %%i*32], %%ytmp0
	%elif (%%i == 1)
	vmovdqa	[%%dst + %%i*32], %%ytmp1
	%elif (%%i == 2)
	vmovdqa	[%%dst + %%i*32], %%ytmp2
	%elif (%%i == 3)
	vmovdqa	[%%dst + %%i*32], %%ytmp3
	%else
	%error too many i
	%endif

%assign %%i %%i+1
%endrep

%rep %%MOD16
	vmovdqu	%%xtmp0, [%%src + (%%SIZE - 4 * %%SIZE4)*32]
	vmovdqa	[%%dst + (%%SIZE - 4 * %%SIZE4)*32], %%xtmp0
%endrep

%endm
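
; Usage note (illustrative): copy_D_LA expects D (history size) and LA
; (look-ahead size) to be %assign'd by the including file before the macro is
; expanded, and copies D + LA bytes in 32-byte chunks.  A hypothetical
; expansion could be:
;
;	copy_D_LA	rcx, rdx, rax, y0, y1, y2, y3
;
; where y0-y3 name ymm temporaries and y0x etc. are defined as the matching
; xmm registers, since the 16-byte tail copy uses %%xtmp0 = %4x.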
%endif