mirror of
https://github.com/intel/isa-l.git
synced 2025-02-21 06:37:40 +01:00
igzip: Improve igzip stateful on large buffers
Change-Id: I3e9e56958e8ea3e636df3369b29c3d4b94dce9d8 Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
This commit is contained in:
parent
547e8e1893
commit
d941b66162
@ -28,8 +28,6 @@
|
||||
########################################################################
|
||||
|
||||
lsrc += igzip/igzip.c igzip/hufftables_c.c \
|
||||
igzip/crc_utils_01.asm \
|
||||
igzip/crc_utils_04.asm \
|
||||
igzip/igzip_body_01.asm \
|
||||
igzip/igzip_body_02.asm \
|
||||
igzip/igzip_body_04.asm \
|
||||
@ -64,8 +62,6 @@ other_tests += igzip/igzip_file_perf igzip/igzip_sync_flush_file_perf igzip/igz
|
||||
|
||||
other_src += igzip/bitbuf2.asm igzip/data_struct2.asm \
|
||||
igzip/inflate_data_structs.asm \
|
||||
igzip/igzip_buffer_utils_01.asm \
|
||||
igzip/igzip_buffer_utils_04.asm \
|
||||
igzip/igzip_body.asm igzip/igzip_finish.asm \
|
||||
igzip/lz0a_const.asm igzip/options.asm igzip/stdmac.asm igzip/igzip_compare_types.asm \
|
||||
igzip/bitbuf2.h igzip/repeated_char_result.h \
|
||||
|
@ -84,8 +84,8 @@ section .text
|
||||
%endif
|
||||
|
||||
align 16
|
||||
global crc32_gzip
|
||||
crc32_gzip:
|
||||
global crc32_gzip_01
|
||||
crc32_gzip_01:
|
||||
|
||||
; unsigned long c = crc ^ 0xffffffffL;
|
||||
not arg1_low32 ;
|
||||
|
@ -8,43 +8,6 @@ section .data
|
||||
|
||||
align 32
|
||||
|
||||
global pshufb_shf_table:data internal
|
||||
pshufb_shf_table:
|
||||
dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
|
||||
dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
|
||||
dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
|
||||
dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
|
||||
dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
|
||||
dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
|
||||
dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
|
||||
dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
|
||||
dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
|
||||
dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
|
||||
dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
|
||||
dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
|
||||
dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
|
||||
dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
|
||||
dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
|
||||
|
||||
;; ; MAGIC value, which when folded 4 times gives FFFFFF00000...0000
|
||||
;; global crc_init_4
|
||||
;; crc_init_4:
|
||||
;; dq 0x9db42487
|
||||
;; dq 0x0
|
||||
;; dq 0x0
|
||||
;; dq 0x0
|
||||
|
||||
; constant used to shift/fold one XMM reg down by 4 XMM widths
|
||||
global fold_4:data internal
|
||||
fold_4:
|
||||
dq 0x00000001c6e41596
|
||||
dq 0x0000000154442bd4
|
||||
|
||||
|
||||
;value, which when xored with pshufb_shf_table entry gives shr value
|
||||
global mask3:data internal
|
||||
mask3: dq 0x8080808080808080, 0x8080808080808080
|
||||
|
||||
%ifndef CRC_TABLE
|
||||
%define CRC_TABLE
|
||||
; Place marker in library to avoid linker warning
|
||||
|
@ -1,195 +0,0 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%include "options.asm"
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
; Functional versions of CRC macros
|
||||
|
||||
%include "igzip_buffer_utils_01.asm"
|
||||
|
||||
extern fold_4
|
||||
|
||||
%define crc_0 xmm0 ; in/out: crc state
|
||||
%define crc_1 xmm1 ; in/out: crc state
|
||||
%define crc_2 xmm2 ; in/out: crc state
|
||||
%define crc_3 xmm3 ; in/out: crc state
|
||||
%define crc_fold xmm4 ; in: (loaded from fold_4)
|
||||
%define crc_tmp0 xmm5 ; tmp
|
||||
%define crc_tmp1 xmm6 ; tmp
|
||||
%define crc_tmp2 xmm7 ; tmp
|
||||
%define crc_tmp3 xmm8 ; tmp
|
||||
%define crc_tmp4 xmm9 ; tmp
|
||||
%define tmp4 rax
|
||||
|
||||
; copy x bytes (rounded up to 16 bytes) from src to dst with crc
|
||||
; src & dst are unaligned
|
||||
; void copy_in_crc(uint8_t *dst, uint8_t *src, uint32_t size, uint32_t *crc)
|
||||
; arg 1: rcx: pointer to dst
|
||||
; arg 2: rdx: pointer to src
|
||||
; arg 3: r8: size (in bytes)
|
||||
; arg 4: r9: pointer to CRC
|
||||
;; %if 0
|
||||
global copy_in_crc_01
|
||||
copy_in_crc_01:
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
mov r9, rcx
|
||||
mov r8, rdx
|
||||
mov rdx, rsi
|
||||
mov rcx, rdi
|
||||
%endif
|
||||
|
||||
; Save xmm registers that need to be preserved.
|
||||
sub rsp, 8 + 4*16
|
||||
movdqa [rsp+0*16], xmm6
|
||||
movdqa [rsp+1*16], xmm7
|
||||
movdqa [rsp+2*16], xmm8
|
||||
movdqa [rsp+3*16], xmm9
|
||||
|
||||
movdqa crc_0, [r9 + 0*16]
|
||||
movdqa crc_1, [r9 + 1*16]
|
||||
movdqa crc_2, [r9 + 2*16]
|
||||
movdqa crc_3, [r9 + 3*16]
|
||||
|
||||
movdqa crc_fold, [fold_4 WRT_OPT]
|
||||
COPY_IN_CRC rcx, rdx, r8, tmp4, crc_0, crc_1, crc_2, crc_3, \
|
||||
crc_fold, \
|
||||
crc_tmp0, crc_tmp1, crc_tmp2, crc_tmp3, crc_tmp4
|
||||
|
||||
movdqa [r9 + 0*16], crc_0
|
||||
movdqa [r9 + 1*16], crc_1
|
||||
movdqa [r9 + 2*16], crc_2
|
||||
movdqa [r9 + 3*16], crc_3
|
||||
|
||||
movdqa xmm9, [rsp+3*16]
|
||||
movdqa xmm8, [rsp+2*16]
|
||||
movdqa xmm7, [rsp+1*16]
|
||||
movdqa xmm6, [rsp+0*16]
|
||||
add rsp, 8 + 4*16
|
||||
ret
|
||||
|
||||
; Convert 512-bit CRC data to real 32-bit value
|
||||
; uint32_t crc_512to32(uint32_t *crc)
|
||||
; arg 1: rcx: pointer to CRC
|
||||
; returns: eax: 32 bit crc
|
||||
global crc_512to32_01
|
||||
crc_512to32_01:
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
mov rcx, rdi
|
||||
%endif
|
||||
|
||||
movdqa crc_0, [rcx + 0*16]
|
||||
movdqa crc_1, [rcx + 1*16]
|
||||
movdqa crc_2, [rcx + 2*16]
|
||||
movdqa crc_3, [rcx + 3*16]
|
||||
|
||||
movdqa crc_fold, [rk1 WRT_OPT] ;k1
|
||||
|
||||
; fold the 4 xmm registers to 1 xmm register with different constants
|
||||
movdqa crc_tmp0, crc_0
|
||||
pclmulqdq crc_0, crc_fold, 0x1
|
||||
pclmulqdq crc_tmp0, crc_fold, 0x10
|
||||
pxor crc_1, crc_tmp0
|
||||
pxor crc_1, crc_0
|
||||
|
||||
movdqa crc_tmp0, crc_1
|
||||
pclmulqdq crc_1, crc_fold, 0x1
|
||||
pclmulqdq crc_tmp0, crc_fold, 0x10
|
||||
pxor crc_2, crc_tmp0
|
||||
pxor crc_2, crc_1
|
||||
|
||||
movdqa crc_tmp0, crc_2
|
||||
pclmulqdq crc_2, crc_fold, 0x1
|
||||
pclmulqdq crc_tmp0, crc_fold, 0x10
|
||||
pxor crc_3, crc_tmp0
|
||||
pxor crc_3, crc_2
|
||||
|
||||
|
||||
movdqa crc_fold, [rk5 WRT_OPT]
|
||||
movdqa crc_0, crc_3
|
||||
|
||||
pclmulqdq crc_3, crc_fold, 0
|
||||
|
||||
psrldq crc_0, 8
|
||||
|
||||
pxor crc_3, crc_0
|
||||
|
||||
movdqa crc_0, crc_3
|
||||
|
||||
|
||||
pslldq crc_3, 4
|
||||
|
||||
pclmulqdq crc_3, crc_fold, 0x10
|
||||
|
||||
|
||||
pxor crc_3, crc_0
|
||||
|
||||
pand crc_3, [mask2 WRT_OPT]
|
||||
|
||||
movdqa crc_1, crc_3
|
||||
|
||||
movdqa crc_2, crc_3
|
||||
|
||||
movdqa crc_fold, [rk7 WRT_OPT]
|
||||
|
||||
|
||||
pclmulqdq crc_3, crc_fold, 0
|
||||
pxor crc_3, crc_2
|
||||
|
||||
pand crc_3, [mask WRT_OPT]
|
||||
|
||||
movdqa crc_2, crc_3
|
||||
|
||||
pclmulqdq crc_3, crc_fold, 0x10
|
||||
|
||||
pxor crc_3, crc_2
|
||||
|
||||
pxor crc_3, crc_1
|
||||
|
||||
pextrd eax, crc_3, 2
|
||||
|
||||
not eax
|
||||
|
||||
ret
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
|
||||
rk1: dq 0x00000000ccaa009e
|
||||
rk2: dq 0x00000001751997d0
|
||||
rk5: dq 0x00000000ccaa009e
|
||||
rk6: dq 0x0000000163cd6124
|
||||
rk7: dq 0x00000001f7011640
|
||||
rk8: dq 0x00000001db710640
|
||||
|
||||
mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
|
||||
mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
|
@ -1,194 +0,0 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%include "options.asm"
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
; Functional versions of CRC macros
|
||||
|
||||
%include "igzip_buffer_utils_04.asm"
|
||||
|
||||
extern fold_4
|
||||
|
||||
%define crc_0 xmm0 ; in/out: crc state
|
||||
%define crc_1 xmm1 ; in/out: crc state
|
||||
%define crc_2 xmm2 ; in/out: crc state
|
||||
%define crc_3 xmm3 ; in/out: crc state
|
||||
%define crc_fold xmm4 ; in: (loaded from fold_4)
|
||||
%define crc_tmp0 xmm5 ; tmp
|
||||
%define crc_tmp1 xmm6 ; tmp
|
||||
%define crc_tmp2 xmm7 ; tmp
|
||||
%define crc_tmp3 xmm8 ; tmp
|
||||
%define crc_tmp4 xmm9 ; tmp
|
||||
%define tmp4 rax
|
||||
|
||||
; copy x bytes (rounded up to 16 bytes) from src to dst with crc
|
||||
; src & dst are unaligned
|
||||
; void copy_in_crc(uint8_t *dst, uint8_t *src, uint32_t size, uint32_t *crc)
|
||||
; arg 1: rcx: pointer to dst
|
||||
; arg 2: rdx: pointer to src
|
||||
; arg 3: r8: size (in bytes)
|
||||
; arg 4: r9: pointer to CRC
|
||||
;; %if 0
|
||||
global copy_in_crc_04
|
||||
copy_in_crc_04:
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
mov r9, rcx
|
||||
mov r8, rdx
|
||||
mov rdx, rsi
|
||||
mov rcx, rdi
|
||||
%endif
|
||||
|
||||
; Save xmm registers that need to be preserved.
|
||||
sub rsp, 8 + 4*16
|
||||
vmovdqa [rsp+0*16], xmm6
|
||||
vmovdqa [rsp+1*16], xmm7
|
||||
vmovdqa [rsp+2*16], xmm8
|
||||
vmovdqa [rsp+3*16], xmm9
|
||||
|
||||
vmovdqa crc_0, [r9 + 0*16]
|
||||
vmovdqa crc_1, [r9 + 1*16]
|
||||
vmovdqa crc_2, [r9 + 2*16]
|
||||
vmovdqa crc_3, [r9 + 3*16]
|
||||
|
||||
vmovdqa crc_fold, [fold_4 WRT_OPT]
|
||||
COPY_IN_CRC rcx, rdx, r8, tmp4, crc_0, crc_1, crc_2, crc_3, \
|
||||
crc_fold, \
|
||||
crc_tmp0, crc_tmp1, crc_tmp2, crc_tmp3, crc_tmp4
|
||||
|
||||
vmovdqa [r9 + 0*16], crc_0
|
||||
vmovdqa [r9 + 1*16], crc_1
|
||||
vmovdqa [r9 + 2*16], crc_2
|
||||
vmovdqa [r9 + 3*16], crc_3
|
||||
|
||||
vmovdqa xmm9, [rsp+3*16]
|
||||
vmovdqa xmm8, [rsp+2*16]
|
||||
vmovdqa xmm7, [rsp+1*16]
|
||||
vmovdqa xmm6, [rsp+0*16]
|
||||
add rsp, 8 + 4*16
|
||||
ret
|
||||
|
||||
; Convert 512-bit CRC data to real 32-bit value
|
||||
; uint32_t crc_512to32(uint32_t *crc)
|
||||
; arg 1: rcx: pointer to CRC
|
||||
; returns: eax: 32 bit crc
|
||||
global crc_512to32_04
|
||||
crc_512to32_04:
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
mov rcx, rdi
|
||||
%endif
|
||||
|
||||
vmovdqa crc_0, [rcx + 0*16]
|
||||
vmovdqa crc_1, [rcx + 1*16]
|
||||
vmovdqa crc_2, [rcx + 2*16]
|
||||
vmovdqa crc_3, [rcx + 3*16]
|
||||
|
||||
vmovdqa crc_fold, [rk1 WRT_OPT] ;k1
|
||||
|
||||
; fold the 4 xmm registers to 1 xmm register with different constants
|
||||
vmovdqa crc_tmp0, crc_0
|
||||
vpclmulqdq crc_0, crc_fold, 0x1
|
||||
vpclmulqdq crc_tmp0, crc_fold, 0x10
|
||||
vpxor crc_1, crc_tmp0
|
||||
vpxor crc_1, crc_0
|
||||
|
||||
vmovdqa crc_tmp0, crc_1
|
||||
vpclmulqdq crc_1, crc_fold, 0x1
|
||||
vpclmulqdq crc_tmp0, crc_fold, 0x10
|
||||
vpxor crc_2, crc_tmp0
|
||||
vpxor crc_2, crc_1
|
||||
|
||||
vmovdqa crc_tmp0, crc_2
|
||||
vpclmulqdq crc_2, crc_fold, 0x1
|
||||
vpclmulqdq crc_tmp0, crc_fold, 0x10
|
||||
vpxor crc_3, crc_tmp0
|
||||
vpxor crc_3, crc_2
|
||||
|
||||
|
||||
vmovdqa crc_fold, [rk5 WRT_OPT]
|
||||
vmovdqa crc_0, crc_3
|
||||
|
||||
vpclmulqdq crc_3, crc_fold, 0
|
||||
|
||||
vpsrldq crc_0, 8
|
||||
|
||||
vpxor crc_3, crc_0
|
||||
|
||||
vmovdqa crc_0, crc_3
|
||||
|
||||
|
||||
vpslldq crc_3, 4
|
||||
|
||||
vpclmulqdq crc_3, crc_fold, 0x10
|
||||
|
||||
|
||||
vpxor crc_3, crc_0
|
||||
|
||||
vpand crc_3, [mask2 WRT_OPT]
|
||||
|
||||
vmovdqa crc_1, crc_3
|
||||
|
||||
vmovdqa crc_2, crc_3
|
||||
|
||||
vmovdqa crc_fold, [rk7 WRT_OPT]
|
||||
|
||||
vpclmulqdq crc_3, crc_fold, 0
|
||||
vpxor crc_3, crc_2
|
||||
|
||||
vpand crc_3, [mask WRT_OPT]
|
||||
|
||||
vmovdqa crc_2, crc_3
|
||||
|
||||
vpclmulqdq crc_3, crc_fold, 0x10
|
||||
|
||||
vpxor crc_3, crc_2
|
||||
|
||||
vpxor crc_3, crc_1
|
||||
|
||||
vpextrd eax, crc_3, 2
|
||||
|
||||
not eax
|
||||
|
||||
ret
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
|
||||
rk1: dq 0x00000000ccaa009e
|
||||
rk2: dq 0x00000001751997d0
|
||||
rk5: dq 0x00000000ccaa009e
|
||||
rk6: dq 0x0000000163cd6124
|
||||
rk7: dq 0x00000001f7011640
|
||||
rk8: dq 0x00000001db710640
|
||||
|
||||
mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
|
||||
mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
|
@ -73,14 +73,13 @@ START_FIELDS ;; isal_zstate
|
||||
FIELD _b_bytes_valid, 4, 4
|
||||
FIELD _b_bytes_processed, 4, 4
|
||||
FIELD _file_start, 8, 8
|
||||
FIELD _crc, 64, 16
|
||||
FIELD _crc, 4, 4
|
||||
FIELD _bitbuf, _BitBuf2_size, _BitBuf2_align
|
||||
FIELD _state, 4, 4
|
||||
FIELD _count, 4, 4
|
||||
FIELD _tmp_out_buff, 16, 1
|
||||
FIELD _tmp_out_start, 4, 4
|
||||
FIELD _tmp_out_end, 4, 4
|
||||
FIELD _last_flush, 4, 4
|
||||
FIELD _has_gzip_hdr, 4, 4
|
||||
FIELD _has_eob, 4, 4
|
||||
FIELD _has_eob_hdr, 4, 4
|
||||
@ -128,7 +127,6 @@ _internal_state_count equ _internal_state+_count
|
||||
_internal_state_tmp_out_buff equ _internal_state+_tmp_out_buff
|
||||
_internal_state_tmp_out_start equ _internal_state+_tmp_out_start
|
||||
_internal_state_tmp_out_end equ _internal_state+_tmp_out_end
|
||||
_internal_state_last_flush equ _internal_state+_last_flush
|
||||
_internal_state_has_gzip_hdr equ _internal_state+_has_gzip_hdr
|
||||
_internal_state_has_eob equ _internal_state+_has_eob
|
||||
_internal_state_has_eob_hdr equ _internal_state+_has_eob_hdr
|
||||
|
@ -43,9 +43,6 @@
|
||||
# define DEFLATE 1
|
||||
#endif
|
||||
|
||||
|
||||
extern uint32_t CrcTable[256];
|
||||
|
||||
static inline uint32_t bsr(uint32_t val)
|
||||
{
|
||||
uint32_t msb;
|
||||
@ -210,16 +207,3 @@ static inline int compare258(uint8_t * str1, uint8_t * str2, uint32_t max_length
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
/*
 * Fold `length` bytes beginning at `start` into the running CRC stored in
 * *crc, one byte per iteration through the CrcTable lookup table.
 *
 * The value in *crc is used and written back as-is; no bit inversion is
 * applied before or after the loop.
 *
 * When DEFLATE is defined the body is just `return`, so *crc is untouched.
 */
static inline void update_crc(uint32_t* crc, uint8_t * start, uint32_t length)
|
||||
{
|
||||
#ifndef DEFLATE
|
||||
uint8_t *end = start + length;
|
||||
|
||||
while (start < end)
|
||||
*crc = (*crc >> 8) ^ CrcTable[(*crc & 0x000000FF) ^ *start++];
|
||||
#else
|
||||
return;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
244
igzip/igzip.c
244
igzip/igzip.c
@ -54,6 +54,7 @@ extern const uint8_t gzip_hdr[];
|
||||
extern const uint32_t gzip_hdr_bytes;
|
||||
extern const uint32_t gzip_trl_bytes;
|
||||
extern const struct isal_hufftables hufftables_default;
|
||||
extern uint32_t CrcTable[256];
|
||||
|
||||
extern uint32_t crc32_gzip(uint32_t init_crc, const unsigned char *buf, uint64_t len);
|
||||
|
||||
@ -76,8 +77,6 @@ unsigned int detect_repeated_char(uint8_t * buf, uint32_t size);
|
||||
|
||||
void isal_deflate_body(struct isal_zstream *stream);
|
||||
void isal_deflate_finish(struct isal_zstream *stream);
|
||||
uint32_t crc_512to32_01(uint32_t * crc);
|
||||
uint32_t get_crc(uint32_t * crc);
|
||||
|
||||
/*****************************************************************/
|
||||
|
||||
@ -107,12 +106,6 @@ struct slver isal_deflate_stateless_slver_01010083;
|
||||
struct slver isal_deflate_stateless_slver = { 0x0083, 0x01, 0x01 };
|
||||
|
||||
/*****************************************************************/
|
||||
|
||||
/*
 * Return state->b_bytes_valid plus the pointer distance from
 * state->file_start to the start of state->buffer.
 * The pointer difference is cast to uint32_t before the addition, so the
 * result is a 32-bit value.
 */
uint32_t file_size(struct isal_zstate *state)
|
||||
{
|
||||
return state->b_bytes_valid + (uint32_t) (state->buffer - state->file_start);
|
||||
}
|
||||
|
||||
static
|
||||
void sync_flush(struct isal_zstream *stream)
|
||||
{
|
||||
@ -213,9 +206,14 @@ static void flush_write_buffer(struct isal_zstream *stream)
|
||||
}
|
||||
}
|
||||
|
||||
static void isal_deflate_int(struct isal_zstream *stream)
|
||||
static void isal_deflate_pass(struct isal_zstream *stream)
|
||||
{
|
||||
struct isal_zstate *state = &stream->internal_state;
|
||||
|
||||
#ifndef DEFLATE
|
||||
uint8_t *start_in = stream->next_in;
|
||||
#endif
|
||||
|
||||
if (state->state == ZSTATE_NEW_HDR || state->state == ZSTATE_HDR)
|
||||
write_header(stream);
|
||||
|
||||
@ -231,10 +229,79 @@ static void isal_deflate_int(struct isal_zstream *stream)
|
||||
if (state->state == ZSTATE_FLUSH_WRITE_BUFFER)
|
||||
flush_write_buffer(stream);
|
||||
|
||||
#ifndef DEFLATE
|
||||
state->crc = crc32_gzip(state->crc, start_in, stream->next_in - start_in);
|
||||
#endif
|
||||
if (state->state == ZSTATE_TRL)
|
||||
write_trailer(stream);
|
||||
}
|
||||
|
||||
/*
 * Run one compression step for `stream`, staging output through
 * state->tmp_out_buff when the caller's output buffer is almost full.
 *
 * Sequence:
 *  1. If state->state >= ZSTATE_TMP_OFFSET, bytes left in tmp_out_buff
 *     (between tmp_out_start and tmp_out_end) are copied to next_out first.
 *     Once the staging buffer is drained, ZSTATE_TMP_OFFSET is subtracted
 *     from state->state. The function returns early if the output buffer
 *     is now full or the state is ZSTATE_END or ZSTATE_NEW_HDR.
 *  2. isal_deflate_pass() runs against the caller's output buffer.
 *  3. If 1..7 bytes of output space remain and the state is not
 *     ZSTATE_NEW_HDR, isal_deflate_pass() runs again with output
 *     redirected into tmp_out_buff. As much of that output as fits is
 *     copied to the caller's buffer; if any is left over,
 *     ZSTATE_TMP_OFFSET is added to state->state so step 1 drains it on
 *     the next call.
 */
static void isal_deflate_int(struct isal_zstream *stream)
|
||||
{
|
||||
struct isal_zstate *state = &stream->internal_state;
|
||||
uint32_t size;
|
||||
|
||||
/* Move data from temporary output buffer to output buffer */
|
||||
if (state->state >= ZSTATE_TMP_OFFSET) {
|
||||
size = state->tmp_out_end - state->tmp_out_start;
|
||||
if (size > stream->avail_out)
|
||||
size = stream->avail_out;
|
||||
memcpy(stream->next_out, state->tmp_out_buff + state->tmp_out_start, size);
|
||||
stream->next_out += size;
|
||||
stream->avail_out -= size;
|
||||
stream->total_out += size;
|
||||
state->tmp_out_start += size;
|
||||
|
||||
/* Staging buffer fully drained: drop the offset to recover the underlying state */
if (state->tmp_out_start == state->tmp_out_end)
|
||||
state->state -= ZSTATE_TMP_OFFSET;
|
||||
|
||||
if (stream->avail_out == 0 || state->state == ZSTATE_END
|
||||
|| state->state == ZSTATE_NEW_HDR)
|
||||
return;
|
||||
}
|
||||
assert(state->tmp_out_start == state->tmp_out_end);
|
||||
|
||||
isal_deflate_pass(stream);
|
||||
|
||||
/* Fill temporary output buffer then complete filling output buffer */
|
||||
if (stream->avail_out > 0 && stream->avail_out < 8 && state->state != ZSTATE_NEW_HDR) {
|
||||
uint8_t *next_out;
|
||||
uint32_t avail_out;
|
||||
uint32_t total_out;
|
||||
|
||||
/* Remember the caller's output cursor; it is restored after the redirected pass */
next_out = stream->next_out;
|
||||
avail_out = stream->avail_out;
|
||||
total_out = stream->total_out;
|
||||
|
||||
/* Redirect output into the staging buffer; total_out starts at 0 so that
 * after the pass it is read back as the staged byte count (tmp_out_end) */
stream->next_out = state->tmp_out_buff;
|
||||
stream->avail_out = sizeof(state->tmp_out_buff);
|
||||
stream->total_out = 0;
|
||||
|
||||
isal_deflate_pass(stream);
|
||||
|
||||
state->tmp_out_start = 0;
|
||||
state->tmp_out_end = stream->total_out;
|
||||
|
||||
stream->next_out = next_out;
|
||||
stream->avail_out = avail_out;
|
||||
stream->total_out = total_out;
|
||||
if (state->tmp_out_end) {
|
||||
size = state->tmp_out_end;
|
||||
if (size > stream->avail_out)
|
||||
size = stream->avail_out;
|
||||
memcpy(stream->next_out, state->tmp_out_buff, size);
|
||||
stream->next_out += size;
|
||||
stream->avail_out -= size;
|
||||
stream->total_out += size;
|
||||
state->tmp_out_start += size;
|
||||
/* Bytes remain staged: flag the state so the next call drains them first */
if (state->tmp_out_start != state->tmp_out_end)
|
||||
state->state += ZSTATE_TMP_OFFSET;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static uint32_t write_constant_compressed_stateless(struct isal_zstream *stream,
|
||||
uint32_t repeated_char,
|
||||
uint32_t repeated_length,
|
||||
@ -396,6 +463,7 @@ static int isal_deflate_int_stateless(struct isal_zstream *stream, uint8_t * nex
|
||||
return STATELESS_OVERFLOW;
|
||||
|
||||
memset(stream->internal_state.head, 0, sizeof(stream->internal_state.head));
|
||||
stream->internal_state.file_start = stream->next_in;
|
||||
isal_deflate_body_stateless(stream);
|
||||
|
||||
if (!stream->internal_state.has_eob)
|
||||
@ -494,13 +562,11 @@ static inline void reset_match_history(struct isal_zstream *stream)
|
||||
int i = 0;
|
||||
|
||||
for (i = 0; i < sizeof(state->head) / 2; i++) {
|
||||
head[i] =
|
||||
(uint16_t) (state->b_bytes_processed + state->buffer - state->file_start -
|
||||
IGZIP_HIST_SIZE);
|
||||
head[i] = (uint16_t) (stream->total_in);
|
||||
}
|
||||
}
|
||||
|
||||
void isal_deflate_init_01(struct isal_zstream *stream)
|
||||
void isal_deflate_init(struct isal_zstream *stream)
|
||||
{
|
||||
struct isal_zstate *state = &stream->internal_state;
|
||||
|
||||
@ -514,7 +580,6 @@ void isal_deflate_init_01(struct isal_zstream *stream)
|
||||
state->has_eob = 0;
|
||||
state->has_eob_hdr = 0;
|
||||
state->left_over = 0;
|
||||
state->last_flush = 0;
|
||||
state->has_gzip_hdr = 0;
|
||||
state->state = ZSTATE_NEW_HDR;
|
||||
state->count = 0;
|
||||
@ -522,12 +587,11 @@ void isal_deflate_init_01(struct isal_zstream *stream)
|
||||
state->tmp_out_start = 0;
|
||||
state->tmp_out_end = 0;
|
||||
|
||||
state->file_start = state->buffer;
|
||||
state->file_start = stream->next_in;
|
||||
|
||||
init(&state->bitbuf);
|
||||
|
||||
memset(state->crc, 0, sizeof(state->crc));
|
||||
*state->crc = 0x9db42487;
|
||||
state->crc = 0;
|
||||
|
||||
memset(state->head, 0, sizeof(state->head));
|
||||
|
||||
@ -544,6 +608,15 @@ void isal_deflate_stateless_init(struct isal_zstream *stream)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Table-driven CRC over `length` bytes starting at `start`, using CrcTable.
 *
 * `crc` is bit-inverted before the byte loop and the result is inverted
 * again on return, so the value returned by one call can be passed as `crc`
 * to a following call to continue over further data.
 *
 * NOTE(review): `length` is uint32_t and `start` is non-const here, while the
 * crc32_gzip prototype declared in this file takes a const buffer and a
 * uint64_t length - confirm the two signatures are meant to match.
 */
uint32_t crc32_gzip_base(uint32_t crc, uint8_t * start, uint32_t length)
|
||||
{
|
||||
uint8_t *end = start + length;
|
||||
crc = ~crc;
|
||||
while (start < end)
|
||||
crc = (crc >> 8) ^ CrcTable[(crc & 0x000000FF) ^ *start++];
|
||||
return ~crc;
|
||||
}
|
||||
|
||||
int isal_deflate_stateless(struct isal_zstream *stream)
|
||||
{
|
||||
uint8_t *next_in = stream->next_in;
|
||||
@ -634,74 +707,96 @@ int isal_deflate_stateless(struct isal_zstream *stream)
|
||||
int isal_deflate(struct isal_zstream *stream)
|
||||
{
|
||||
struct isal_zstate *state = &stream->internal_state;
|
||||
uint32_t size;
|
||||
int ret = COMP_OK;
|
||||
uint8_t *next_in;
|
||||
uint32_t avail_in, avail_in_start;
|
||||
uint32_t flush_type = stream->flush;
|
||||
uint32_t end_of_stream = stream->end_of_stream;
|
||||
int size = 0;
|
||||
uint8_t *copy_down_src = NULL;
|
||||
uint64_t copy_down_size = 0;
|
||||
uint32_t processed = 0;
|
||||
|
||||
if (stream->flush < 3) {
|
||||
if (stream->flush >= 3)
|
||||
return INVALID_FLUSH;
|
||||
|
||||
state->last_flush = stream->flush;
|
||||
next_in = stream->next_in;
|
||||
avail_in = stream->avail_in;
|
||||
stream->total_in -= state->b_bytes_valid - state->b_bytes_processed;
|
||||
|
||||
if (state->state >= ZSTATE_TMP_OFFSET) {
|
||||
size = state->tmp_out_end - state->tmp_out_start;
|
||||
if (size > stream->avail_out)
|
||||
size = stream->avail_out;
|
||||
memcpy(stream->next_out, state->tmp_out_buff + state->tmp_out_start,
|
||||
size);
|
||||
stream->next_out += size;
|
||||
stream->avail_out -= size;
|
||||
stream->total_out += size;
|
||||
state->tmp_out_start += size;
|
||||
|
||||
if (state->tmp_out_start == state->tmp_out_end)
|
||||
state->state -= ZSTATE_TMP_OFFSET;
|
||||
|
||||
if (stream->avail_out == 0 || state->state == ZSTATE_END)
|
||||
return ret;
|
||||
while (processed < IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD) {
|
||||
size = avail_in;
|
||||
if (size > sizeof(state->buffer) - state->b_bytes_valid) {
|
||||
size = sizeof(state->buffer) - state->b_bytes_valid;
|
||||
stream->flush = NO_FLUSH;
|
||||
stream->end_of_stream = 0;
|
||||
}
|
||||
assert(state->tmp_out_start == state->tmp_out_end);
|
||||
memcpy(&state->buffer[state->b_bytes_valid], next_in, size);
|
||||
|
||||
isal_deflate_int(stream);
|
||||
next_in += size;
|
||||
avail_in -= size;
|
||||
state->b_bytes_valid += size;
|
||||
|
||||
if (stream->avail_out == 0)
|
||||
return ret;
|
||||
stream->next_in = &state->buffer[state->b_bytes_processed];
|
||||
stream->avail_in = state->b_bytes_valid - state->b_bytes_processed;
|
||||
state->file_start = stream->next_in - stream->total_in;
|
||||
|
||||
else if (stream->avail_out < 8) {
|
||||
uint8_t *next_out;
|
||||
uint32_t avail_out;
|
||||
uint32_t total_out;
|
||||
if (stream->avail_in > IGZIP_HIST_SIZE
|
||||
|| stream->end_of_stream || stream->flush != NO_FLUSH) {
|
||||
avail_in_start = stream->avail_in;
|
||||
isal_deflate_int(stream);
|
||||
state->b_bytes_processed += avail_in_start - stream->avail_in;
|
||||
|
||||
next_out = stream->next_out;
|
||||
avail_out = stream->avail_out;
|
||||
total_out = stream->total_out;
|
||||
if (state->b_bytes_processed > IGZIP_HIST_SIZE) {
|
||||
copy_down_src =
|
||||
&state->buffer[state->b_bytes_processed - IGZIP_HIST_SIZE];
|
||||
copy_down_size =
|
||||
state->b_bytes_valid - state->b_bytes_processed +
|
||||
IGZIP_HIST_SIZE;
|
||||
memmove(state->buffer, copy_down_src, copy_down_size);
|
||||
|
||||
stream->next_out = state->tmp_out_buff;
|
||||
stream->avail_out = sizeof(state->tmp_out_buff);
|
||||
stream->total_out = 0;
|
||||
state->b_bytes_valid -= copy_down_src - state->buffer;
|
||||
state->b_bytes_processed -= copy_down_src - state->buffer;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
stream->flush = flush_type;
|
||||
stream->end_of_stream = end_of_stream;
|
||||
if (avail_in <= 0 || stream->avail_out <= 0)
|
||||
break;
|
||||
processed += size;
|
||||
}
|
||||
|
||||
if (processed >= IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD) {
|
||||
stream->next_in = next_in - stream->avail_in;
|
||||
stream->avail_in = avail_in + stream->avail_in;
|
||||
|
||||
state->file_start = stream->next_in - stream->total_in;
|
||||
|
||||
if (stream->avail_in > 0 && stream->avail_out > 0)
|
||||
isal_deflate_int(stream);
|
||||
|
||||
state->tmp_out_start = 0;
|
||||
state->tmp_out_end = stream->total_out;
|
||||
size = stream->avail_in;
|
||||
if (stream->avail_in > IGZIP_HIST_SIZE)
|
||||
size = 0;
|
||||
|
||||
stream->next_out = next_out;
|
||||
stream->avail_out = avail_out;
|
||||
stream->total_out = total_out;
|
||||
if (state->tmp_out_end) {
|
||||
size = state->tmp_out_end;
|
||||
if (size > stream->avail_out)
|
||||
size = stream->avail_out;
|
||||
memcpy(stream->next_out, state->tmp_out_buff, size);
|
||||
stream->next_out += size;
|
||||
stream->avail_out -= size;
|
||||
stream->total_out += size;
|
||||
state->tmp_out_start += size;
|
||||
if (state->tmp_out_start != state->tmp_out_end)
|
||||
state->state += ZSTATE_TMP_OFFSET;
|
||||
memmove(state->buffer, stream->next_in - IGZIP_HIST_SIZE,
|
||||
size + IGZIP_HIST_SIZE);
|
||||
state->b_bytes_processed = IGZIP_HIST_SIZE;
|
||||
state->b_bytes_valid = size + IGZIP_HIST_SIZE;
|
||||
|
||||
}
|
||||
}
|
||||
} else
|
||||
ret = INVALID_FLUSH;
|
||||
stream->next_in += size;
|
||||
stream->avail_in -= size;
|
||||
stream->total_in += size;
|
||||
|
||||
} else {
|
||||
stream->total_in += state->b_bytes_valid - state->b_bytes_processed;
|
||||
stream->next_in = next_in;
|
||||
stream->avail_in = avail_in;
|
||||
state->file_start = stream->next_in - stream->total_in;
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -912,11 +1007,6 @@ void write_header(struct isal_zstream *stream)
|
||||
|
||||
}
|
||||
|
||||
/*
 * Return the 32-bit CRC for the state pointed to by `crc` by delegating
 * directly to crc_512to32_01().
 */
uint32_t get_crc_01(uint32_t * crc)
|
||||
{
|
||||
return crc_512to32_01(crc);
|
||||
}
|
||||
|
||||
void write_trailer(struct isal_zstream *stream)
|
||||
{
|
||||
struct isal_zstate *state = &stream->internal_state;
|
||||
@ -950,11 +1040,11 @@ void write_trailer(struct isal_zstream *stream)
|
||||
bytes = buffer_used(&state->bitbuf);
|
||||
|
||||
#ifndef DEFLATE
|
||||
uint32_t *crc = state->crc;
|
||||
uint32_t crc = state->crc;
|
||||
|
||||
if (!is_full(&state->bitbuf)) {
|
||||
*(uint64_t *) stream->next_out =
|
||||
((uint64_t) file_size(state) << 32) | get_crc(crc);
|
||||
((uint64_t) stream->total_in << 32) | crc;
|
||||
stream->next_out += 8;
|
||||
bytes += 8;
|
||||
state->state = ZSTATE_END;
|
||||
|
@ -6,52 +6,15 @@
|
||||
|
||||
extern const struct isal_hufftables hufftables_default;
|
||||
|
||||
void isal_deflate_init_base(struct isal_zstream *stream)
|
||||
static inline void update_state(struct isal_zstream *stream, uint8_t * start_in,
|
||||
uint8_t * next_in, uint8_t * end_in)
|
||||
{
|
||||
struct isal_zstate *state = &stream->internal_state;
|
||||
int i;
|
||||
|
||||
uint32_t *crc = state->crc;
|
||||
|
||||
stream->total_in = 0;
|
||||
stream->total_out = 0;
|
||||
stream->hufftables = (struct isal_hufftables *)&hufftables_default;
|
||||
stream->flush = 0;
|
||||
state->b_bytes_valid = 0;
|
||||
state->b_bytes_processed = 0;
|
||||
state->has_eob = 0;
|
||||
state->has_eob_hdr = 0;
|
||||
state->left_over = 0;
|
||||
state->last_flush = 0;
|
||||
state->has_gzip_hdr = 0;
|
||||
state->state = ZSTATE_NEW_HDR;
|
||||
state->count = 0;
|
||||
|
||||
state->tmp_out_start = 0;
|
||||
state->tmp_out_end = 0;
|
||||
|
||||
state->file_start = state->buffer;
|
||||
|
||||
init(&state->bitbuf);
|
||||
|
||||
*crc = ~0;
|
||||
|
||||
for (i = 0; i < IGZIP_HASH_SIZE; i++)
|
||||
state->head[i] = (uint16_t) - (IGZIP_HIST_SIZE + 1);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Return the bitwise complement of the 32-bit word pointed to by `crc`.
 * The pointed-to value itself is not modified.
 */
uint32_t get_crc_base(uint32_t * crc)
|
||||
{
|
||||
return ~*crc;
|
||||
}
|
||||
|
||||
static inline void update_state(struct isal_zstream *stream, struct isal_zstate *state,
|
||||
uint8_t * start_in)
|
||||
{
|
||||
uint32_t bytes_written;
|
||||
|
||||
stream->total_in += stream->next_in - start_in;
|
||||
stream->next_in = next_in;
|
||||
stream->total_in += next_in - start_in;
|
||||
stream->avail_in = end_in - next_in;
|
||||
|
||||
bytes_written = buffer_used(&state->bitbuf);
|
||||
stream->total_out += bytes_written;
|
||||
@ -65,163 +28,27 @@ void isal_deflate_body_base(struct isal_zstream *stream)
|
||||
uint32_t literal, hash;
|
||||
uint8_t *start_in, *next_in, *end_in, *end, *next_hash;
|
||||
uint16_t match_length;
|
||||
uint32_t dist, bytes_to_buffer, offset;
|
||||
uint64_t code, code_len, code2, code_len2;
|
||||
struct isal_zstate *state = &stream->internal_state;
|
||||
uint16_t *last_seen = state->head;
|
||||
uint32_t *crc = state->crc;
|
||||
|
||||
if (stream->avail_in == 0) {
|
||||
if (stream->end_of_stream || stream->flush != NO_FLUSH)
|
||||
state->state = ZSTATE_FLUSH_READ_BUFFER;
|
||||
return;
|
||||
}
|
||||
|
||||
set_buf(&state->bitbuf, stream->next_out, stream->avail_out);
|
||||
start_in = stream->next_in;
|
||||
|
||||
while (stream->avail_in != 0) {
|
||||
bytes_to_buffer =
|
||||
IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD - (state->b_bytes_valid -
|
||||
state->b_bytes_processed);
|
||||
|
||||
if (bytes_to_buffer > IGZIP_HIST_SIZE)
|
||||
bytes_to_buffer = IGZIP_HIST_SIZE;
|
||||
|
||||
if (stream->avail_in < IGZIP_HIST_SIZE)
|
||||
bytes_to_buffer = stream->avail_in;
|
||||
|
||||
if (bytes_to_buffer > sizeof(state->buffer) - state->b_bytes_valid) {
|
||||
if (state->b_bytes_valid - state->b_bytes_processed > ISAL_LOOK_AHEAD) {
|
||||
/* There was an out buffer overflow last round,
|
||||
* complete the processing of data */
|
||||
bytes_to_buffer = 0;
|
||||
|
||||
} else {
|
||||
/* Not enough room in the buffer, shift the
|
||||
* buffer down to make space for the new data */
|
||||
offset = state->b_bytes_processed - IGZIP_HIST_SIZE; // state->b_bytes_valid - (IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD);
|
||||
memmove(state->buffer, state->buffer + offset,
|
||||
IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD);
|
||||
|
||||
state->b_bytes_processed -= offset;
|
||||
state->b_bytes_valid -= offset;
|
||||
state->file_start -= offset;
|
||||
|
||||
stream->avail_in -= bytes_to_buffer;
|
||||
memcpy(state->buffer + state->b_bytes_valid, stream->next_in,
|
||||
bytes_to_buffer);
|
||||
update_crc(crc, stream->next_in, bytes_to_buffer);
|
||||
stream->next_in += bytes_to_buffer;
|
||||
}
|
||||
} else {
|
||||
/* There is enough space in the buffer, copy in the new data */
|
||||
stream->avail_in -= bytes_to_buffer;
|
||||
memcpy(state->buffer + state->b_bytes_valid, stream->next_in,
|
||||
bytes_to_buffer);
|
||||
update_crc(crc, stream->next_in, bytes_to_buffer);
|
||||
stream->next_in += bytes_to_buffer;
|
||||
}
|
||||
|
||||
state->b_bytes_valid += bytes_to_buffer;
|
||||
|
||||
end_in = state->buffer + state->b_bytes_valid - ISAL_LOOK_AHEAD;
|
||||
|
||||
next_in = state->b_bytes_processed + state->buffer;
|
||||
|
||||
while (next_in < end_in) {
|
||||
|
||||
if (is_full(&state->bitbuf)) {
|
||||
state->b_bytes_processed = next_in - state->buffer;
|
||||
update_state(stream, state, start_in);
|
||||
return;
|
||||
}
|
||||
|
||||
literal = *(uint32_t *) next_in;
|
||||
hash = compute_hash(literal) & HASH_MASK;
|
||||
dist = (next_in - state->file_start - last_seen[hash]) & 0xFFFF;
|
||||
last_seen[hash] = (uint64_t) (next_in - state->file_start);
|
||||
|
||||
if (dist - 1 < IGZIP_HIST_SIZE - 1) { /* The -1 are to handle the case when dist = 0 */
|
||||
assert(next_in - dist >= state->buffer);
|
||||
assert(dist != 0);
|
||||
|
||||
match_length = compare258(next_in - dist, next_in, 258);
|
||||
|
||||
if (match_length >= SHORTEST_MATCH) {
|
||||
next_hash = next_in;
|
||||
#ifdef ISAL_LIMIT_HASH_UPDATE
|
||||
end = next_hash + 3;
|
||||
#else
|
||||
end = next_hash + match_length;
|
||||
#endif
|
||||
next_hash++;
|
||||
|
||||
for (; next_hash < end; next_hash++) {
|
||||
literal = *(uint32_t *) next_hash;
|
||||
hash = compute_hash(literal) & HASH_MASK;
|
||||
last_seen[hash] =
|
||||
(uint64_t) (next_hash - state->file_start);
|
||||
}
|
||||
|
||||
get_len_code(stream->hufftables, match_length, &code,
|
||||
&code_len);
|
||||
get_dist_code(stream->hufftables, dist, &code2,
|
||||
&code_len2);
|
||||
|
||||
code |= code2 << code_len;
|
||||
code_len += code_len2;
|
||||
|
||||
write_bits(&state->bitbuf, code, code_len);
|
||||
|
||||
next_in += match_length;
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
get_lit_code(stream->hufftables, literal & 0xFF, &code, &code_len);
|
||||
write_bits(&state->bitbuf, code, code_len);
|
||||
next_in++;
|
||||
}
|
||||
|
||||
state->b_bytes_processed = next_in - state->buffer;
|
||||
|
||||
}
|
||||
|
||||
update_state(stream, state, start_in);
|
||||
|
||||
if (stream->avail_in == 0) {
|
||||
if (stream->end_of_stream || stream->flush != NO_FLUSH)
|
||||
state->state = ZSTATE_FLUSH_READ_BUFFER;
|
||||
return;
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
void isal_deflate_finish_base(struct isal_zstream *stream)
|
||||
{
|
||||
uint32_t literal = 0, hash;
|
||||
uint8_t *next_in, *end_in, *end, *next_hash;
|
||||
uint16_t match_length;
|
||||
uint32_t dist;
|
||||
uint64_t code, code_len, code2, code_len2;
|
||||
struct isal_zstate *state = &stream->internal_state;
|
||||
uint16_t *last_seen = state->head;
|
||||
|
||||
if (stream->avail_in == 0) {
|
||||
if (stream->end_of_stream || stream->flush != NO_FLUSH)
|
||||
state->state = ZSTATE_FLUSH_READ_BUFFER;
|
||||
return;
|
||||
}
|
||||
|
||||
set_buf(&state->bitbuf, stream->next_out, stream->avail_out);
|
||||
|
||||
end_in = state->b_bytes_valid + (uint8_t *) state->buffer;
|
||||
start_in = stream->next_in;
|
||||
end_in = start_in + stream->avail_in;
|
||||
next_in = start_in;
|
||||
|
||||
next_in = state->b_bytes_processed + state->buffer;
|
||||
|
||||
while (next_in < end_in) {
|
||||
while (next_in < end_in - ISAL_LOOK_AHEAD) {
|
||||
|
||||
if (is_full(&state->bitbuf)) {
|
||||
state->b_bytes_processed = next_in - state->buffer;
|
||||
update_state(stream, state, stream->next_in);
|
||||
update_state(stream, start_in, next_in, end_in);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -230,9 +57,11 @@ void isal_deflate_finish_base(struct isal_zstream *stream)
|
||||
dist = (next_in - state->file_start - last_seen[hash]) & 0xFFFF;
|
||||
last_seen[hash] = (uint64_t) (next_in - state->file_start);
|
||||
|
||||
if (dist - 1 < IGZIP_HIST_SIZE - 1) { /* The -1 are to handle the case when dist = 0 */
|
||||
assert(next_in - dist >= state->buffer);
|
||||
match_length = compare258(next_in - dist, next_in, end_in - next_in);
|
||||
/* The -1 are to handle the case when dist = 0 */
|
||||
if (dist - 1 < IGZIP_HIST_SIZE - 1) {
|
||||
assert(dist != 0);
|
||||
|
||||
match_length = compare258(next_in - dist, next_in, 258);
|
||||
|
||||
if (match_length >= SHORTEST_MATCH) {
|
||||
next_hash = next_in;
|
||||
@ -265,29 +94,113 @@ void isal_deflate_finish_base(struct isal_zstream *stream)
|
||||
}
|
||||
}
|
||||
|
||||
get_lit_code(stream->hufftables, literal & 0xFF, &code, &code_len);
|
||||
write_bits(&state->bitbuf, code, code_len);
|
||||
next_in++;
|
||||
}
|
||||
|
||||
update_state(stream, start_in, next_in, end_in);
|
||||
|
||||
assert(stream->avail_in <= ISAL_LOOK_AHEAD);
|
||||
if (stream->end_of_stream || stream->flush != NO_FLUSH)
|
||||
state->state = ZSTATE_FLUSH_READ_BUFFER;
|
||||
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
void isal_deflate_finish_base(struct isal_zstream *stream)
|
||||
{
|
||||
uint32_t literal = 0, hash;
|
||||
uint8_t *start_in, *next_in, *end_in, *end, *next_hash;
|
||||
uint16_t match_length;
|
||||
uint32_t dist;
|
||||
uint64_t code, code_len, code2, code_len2;
|
||||
struct isal_zstate *state = &stream->internal_state;
|
||||
uint16_t *last_seen = state->head;
|
||||
|
||||
set_buf(&state->bitbuf, stream->next_out, stream->avail_out);
|
||||
|
||||
start_in = stream->next_in;
|
||||
end_in = start_in + stream->avail_in;
|
||||
next_in = start_in;
|
||||
|
||||
while (next_in < end_in - 3) {
|
||||
if (is_full(&state->bitbuf)) {
|
||||
update_state(stream, start_in, next_in, end_in);
|
||||
return;
|
||||
}
|
||||
|
||||
literal = *(uint32_t *) next_in;
|
||||
hash = compute_hash(literal) & HASH_MASK;
|
||||
dist = (next_in - state->file_start - last_seen[hash]) & 0xFFFF;
|
||||
last_seen[hash] = (uint64_t) (next_in - state->file_start);
|
||||
|
||||
if (dist - 1 < IGZIP_HIST_SIZE - 1) { /* The -1 are to handle the case when dist = 0 */
|
||||
match_length = compare258(next_in - dist, next_in, end_in - next_in);
|
||||
|
||||
if (match_length >= SHORTEST_MATCH) {
|
||||
next_hash = next_in;
|
||||
#ifdef ISAL_LIMIT_HASH_UPDATE
|
||||
end = next_hash + 3;
|
||||
#else
|
||||
end = next_hash + match_length;
|
||||
#endif
|
||||
next_hash++;
|
||||
|
||||
for (; next_hash < end - 3; next_hash++) {
|
||||
literal = *(uint32_t *) next_hash;
|
||||
hash = compute_hash(literal) & HASH_MASK;
|
||||
last_seen[hash] =
|
||||
(uint64_t) (next_hash - state->file_start);
|
||||
}
|
||||
|
||||
get_len_code(stream->hufftables, match_length, &code,
|
||||
&code_len);
|
||||
get_dist_code(stream->hufftables, dist, &code2, &code_len2);
|
||||
|
||||
code |= code2 << code_len;
|
||||
code_len += code_len2;
|
||||
|
||||
write_bits(&state->bitbuf, code, code_len);
|
||||
|
||||
next_in += match_length;
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
get_lit_code(stream->hufftables, literal & 0xFF, &code, &code_len);
|
||||
write_bits(&state->bitbuf, code, code_len);
|
||||
next_in++;
|
||||
|
||||
}
|
||||
|
||||
state->b_bytes_processed = next_in - state->buffer;
|
||||
while (next_in < end_in) {
|
||||
if (is_full(&state->bitbuf)) {
|
||||
update_state(stream, start_in, next_in, end_in);
|
||||
return;
|
||||
}
|
||||
|
||||
literal = *next_in;
|
||||
get_lit_code(stream->hufftables, literal & 0xFF, &code, &code_len);
|
||||
write_bits(&state->bitbuf, code, code_len);
|
||||
next_in++;
|
||||
|
||||
if (is_full(&state->bitbuf) || state->left_over > 0) {
|
||||
update_state(stream, state, stream->next_in);
|
||||
return;
|
||||
}
|
||||
|
||||
get_lit_code(stream->hufftables, 256, &code, &code_len);
|
||||
write_bits(&state->bitbuf, code, code_len);
|
||||
state->has_eob = 1;
|
||||
if (!is_full(&state->bitbuf)) {
|
||||
get_lit_code(stream->hufftables, 256, &code, &code_len);
|
||||
write_bits(&state->bitbuf, code, code_len);
|
||||
state->has_eob = 1;
|
||||
|
||||
update_state(stream, state, stream->next_in);
|
||||
if (stream->end_of_stream == 1)
|
||||
state->state = ZSTATE_TRL;
|
||||
else
|
||||
state->state = ZSTATE_SYNC_FLUSH;
|
||||
}
|
||||
|
||||
if (stream->end_of_stream == 1)
|
||||
state->state = ZSTATE_TRL;
|
||||
else
|
||||
state->state = ZSTATE_SYNC_FLUSH;
|
||||
update_state(stream, start_in, next_in, end_in);
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -28,26 +28,16 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%include "options.asm"
|
||||
%ifndef TEST
|
||||
|
||||
extern fold_4
|
||||
|
||||
%include "lz0a_const.asm"
|
||||
%include "data_struct2.asm"
|
||||
%include "bitbuf2.asm"
|
||||
%include "huffman.asm"
|
||||
%include "igzip_compare_types.asm"
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%include "stdmac.asm"
|
||||
|
||||
%if (ARCH == 04)
|
||||
%define MOVDQA vmovdqa
|
||||
%else
|
||||
%define MOVDQA movdqa
|
||||
%endif
|
||||
|
||||
%ifdef DEBUG
|
||||
%macro MARK 1
|
||||
global %1
|
||||
@ -61,99 +51,74 @@ global %1
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
%define tmp2 rcx
|
||||
%define hash2 rcx
|
||||
|
||||
%define b_bytes_valid rax
|
||||
%define curr_data rax
|
||||
%define code rax
|
||||
%define tmp5 rax
|
||||
%define tmp2 rcx
|
||||
%define hash2 rcx
|
||||
|
||||
%define tmp4 rbx
|
||||
%define dist rbx
|
||||
%define code2 rbx
|
||||
%define curr_data rax
|
||||
%define code rax
|
||||
%define tmp5 rax
|
||||
|
||||
%define x rdx
|
||||
%define len rdx
|
||||
%define hash rdx
|
||||
%define code_len3 rdx
|
||||
%define tmp8 rdx
|
||||
%define tmp4 rbx
|
||||
%define dist rbx
|
||||
%define code2 rbx
|
||||
|
||||
%define tmp1 rsi
|
||||
%define code_len2 rsi
|
||||
%define hash rdx
|
||||
%define len rdx
|
||||
%define code_len3 rdx
|
||||
%define tmp8 rdx
|
||||
|
||||
%define blen rdi
|
||||
%define file_start rdi
|
||||
%define tmp1 rsi
|
||||
%define code_len2 rsi
|
||||
|
||||
%define m_bit_count rbp
|
||||
%define file_start rdi
|
||||
|
||||
%define in_buf r8
|
||||
%define curr_data2 r8
|
||||
%define len2 r8
|
||||
%define tmp6 r8
|
||||
%define m_bit_count rbp
|
||||
|
||||
%define m_bits r9
|
||||
%define curr_data2 r8
|
||||
%define len2 r8
|
||||
%define tmp6 r8
|
||||
|
||||
%define f_i r10
|
||||
%define m_bits r9
|
||||
|
||||
%define m_out_buf r11
|
||||
%define f_i r10
|
||||
|
||||
%define f_end_i r12
|
||||
%define dist2 r12
|
||||
%define tmp7 r12
|
||||
%define code4 r12
|
||||
%define m_out_buf r11
|
||||
|
||||
%define tmp3 r13
|
||||
%define code3 r13
|
||||
%define f_end_i r12
|
||||
%define dist2 r12
|
||||
%define tmp7 r12
|
||||
%define code4 r12
|
||||
|
||||
%define stream r14
|
||||
%define tmp3 r13
|
||||
%define code3 r13
|
||||
|
||||
%define hufftables r15
|
||||
%define stream r14
|
||||
|
||||
%define crc_0 xmm0 ; in/out: crc state
|
||||
%define crc_1 xmm1 ; in/out: crc state
|
||||
%define crc_2 xmm2 ; in/out: crc state
|
||||
%define crc_3 xmm3 ; in/out: crc state
|
||||
%define crc_fold xmm4 ; in: (loaded from fold_4)
|
||||
%define hufftables r15
|
||||
|
||||
%define xtmp0 xmm5 ; tmp
|
||||
%define xtmp1 xmm6 ; tmp
|
||||
%define xtmp2 xmm7 ; tmp
|
||||
%define xtmp3 xmm8 ; tmp
|
||||
%define xtmp4 xmm9 ; tmp
|
||||
%define xhash xmm10
|
||||
%define xmask xmm11
|
||||
%define xdata xmm12
|
||||
;; GPR r8 & r15 can be used
|
||||
|
||||
%define xtmp0 xmm0 ; tmp
|
||||
%define xtmp1 xmm1 ; tmp
|
||||
%define xhash xmm2
|
||||
%define xmask xmm3
|
||||
%define xdata xmm4
|
||||
|
||||
%define ytmp0 ymm0 ; tmp
|
||||
%define ytmp1 ymm1 ; tmp
|
||||
|
||||
%define ytmp0 ymm5 ; tmp
|
||||
%define ytmp1 ymm6 ; tmp
|
||||
|
||||
%if ( ARCH == 02 || ARCH == 04)
|
||||
%define vtmp0 ymm5 ; tmp
|
||||
%define vtmp1 ymm6 ; tmp
|
||||
%define vtmp2 ymm7 ; tmp
|
||||
%define vtmp3 ymm8 ; tmp
|
||||
%define vtmp4 ymm9 ; tmp
|
||||
%else
|
||||
%define vtmp0 xmm5 ; tmp
|
||||
%define vtmp1 xmm6 ; tmp
|
||||
%define vtmp2 xmm7 ; tmp
|
||||
%define vtmp3 xmm8 ; tmp
|
||||
%define vtmp4 xmm9 ; tmp
|
||||
%endif
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define b_bytes_processed f_i
|
||||
|
||||
blen_mem_offset equ 0 ; local variable (8 bytes)
|
||||
in_buf_mem_offset equ 8
|
||||
f_end_i_mem_offset equ 16
|
||||
empty_buffer_flag equ 24
|
||||
gpr_save_mem_offset equ 32 ; gpr save area (8*8 bytes)
|
||||
xmm_save_mem_offset equ 32 + 8*8 ; xmm save area (8*16 bytes) (16 byte aligned)
|
||||
stack_size equ 4*8 + 8*8 + 8*16 + 8
|
||||
f_end_i_mem_offset equ 8
|
||||
gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
|
||||
xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
|
||||
stack_size equ 2*8 + 8*8 + 4*16 + 8
|
||||
;;; 8 because stack address is odd multiple of 8 after a function call and
|
||||
;;; we want it aligned to 16 bytes
|
||||
|
||||
@ -197,173 +162,74 @@ skip1:
|
||||
mov [rsp + gpr_save_mem_offset + 5*8], r13
|
||||
mov [rsp + gpr_save_mem_offset + 6*8], r14
|
||||
mov [rsp + gpr_save_mem_offset + 7*8], r15
|
||||
MOVDQA [rsp + xmm_save_mem_offset + 0*16], xmm6
|
||||
MOVDQA [rsp + xmm_save_mem_offset + 1*16], xmm7
|
||||
MOVDQA [rsp + xmm_save_mem_offset + 2*16], xmm8
|
||||
MOVDQA [rsp + xmm_save_mem_offset + 3*16], xmm9
|
||||
MOVDQA [rsp + xmm_save_mem_offset + 4*16], xmm10
|
||||
MOVDQA [rsp + xmm_save_mem_offset + 5*16], xmm11
|
||||
MOVDQA [rsp + xmm_save_mem_offset + 6*16], xmm12
|
||||
|
||||
mov stream, rcx
|
||||
MOVDQU xmask, [mask]
|
||||
|
||||
MOVDQA crc_0, [stream + _internal_state_crc + 0*16]
|
||||
MOVDQA crc_1, [stream + _internal_state_crc + 1*16]
|
||||
MOVDQA crc_2, [stream + _internal_state_crc + 2*16]
|
||||
MOVDQA crc_3, [stream + _internal_state_crc + 3*16]
|
||||
MOVDQA crc_fold, [fold_4]
|
||||
mov dword [stream + _internal_state_has_eob], 0
|
||||
|
||||
MOVDQU xmask, [mask]
|
||||
|
||||
; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
|
||||
mov m_out_buf, [stream + _next_out]
|
||||
mov [stream + _internal_state_bitbuf_m_out_start], m_out_buf
|
||||
mov tmp1 %+ d, [stream + _avail_out]
|
||||
add tmp1, m_out_buf
|
||||
sub tmp1, SLOP
|
||||
skip_SLOP:
|
||||
|
||||
mov [stream + _internal_state_bitbuf_m_out_end], tmp1
|
||||
|
||||
mov m_bits, [stream + _internal_state_bitbuf_m_bits]
|
||||
mov m_bit_count %+ d, [stream + _internal_state_bitbuf_m_bit_count]
|
||||
|
||||
mov hufftables, [stream + _hufftables]
|
||||
; in_buf = stream->next_in
|
||||
mov in_buf, [stream + _next_in]
|
||||
mov blen %+ d, [stream + _avail_in]
|
||||
|
||||
mov dword [rsp + empty_buffer_flag], 0
|
||||
cmp dword [stream + _internal_state_b_bytes_processed], 0
|
||||
sete byte [rsp + empty_buffer_flag]
|
||||
mov file_start, [stream + _next_in]
|
||||
|
||||
; while (blen != 0)
|
||||
MARK __Compute_X_ %+ ARCH
|
||||
loop1:
|
||||
; x = D + LA - (state->b_bytes_valid - state->b_bytes_processed);
|
||||
mov b_bytes_valid %+ d, [stream + _internal_state_b_bytes_valid]
|
||||
mov b_bytes_processed %+ d, [stream + _internal_state_b_bytes_processed]
|
||||
lea x, [b_bytes_processed + D + LA]
|
||||
sub x, b_bytes_valid
|
||||
mov f_i %+ d, dword [stream + _total_in]
|
||||
sub file_start, f_i
|
||||
|
||||
; if (x > D) x = D;
|
||||
cmp x, D
|
||||
cmova x, [const_D]
|
||||
mov f_end_i %+ d, [stream + _avail_in]
|
||||
add f_end_i, f_i
|
||||
|
||||
; if (blen < D) x = blen;
|
||||
cmp blen, D
|
||||
cmovb x, blen
|
||||
|
||||
;; process x bytes starting at in_buf
|
||||
|
||||
;; If there isn't enough room, shift buffer down
|
||||
; if (x > BSIZE - state->b_bytes_valid) {
|
||||
mov tmp1, BSIZE
|
||||
sub tmp1, b_bytes_valid
|
||||
cmp x, tmp1
|
||||
jbe skip_move
|
||||
|
||||
; if (state->b_bytes_processed < state->b_bytes_valid - LA) {
|
||||
mov tmp1, b_bytes_valid
|
||||
sub tmp1, LA
|
||||
cmp b_bytes_processed, tmp1
|
||||
jae do_move
|
||||
|
||||
;; We need to move an odd amount, skip move for this copy of loop
|
||||
xor x,x
|
||||
mov [rsp + blen_mem_offset], blen
|
||||
jmp skip_move_zero
|
||||
|
||||
MARK __shift_data_down_ %+ ARCH
|
||||
do_move:
|
||||
; offset = state->b_bytes_valid - (D + LA);
|
||||
mov tmp4, b_bytes_valid
|
||||
sub tmp4, D + LA
|
||||
; copy_D_LA(state->buffer, state->buffer + offset);
|
||||
lea tmp1, [stream + _internal_state_buffer]
|
||||
lea tmp2, [tmp1 + tmp4]
|
||||
copy_D_LA tmp1, tmp2, tmp3, vtmp0, vtmp1, vtmp2, vtmp3
|
||||
; tmp1 clobbered
|
||||
|
||||
; state->file_start -= offset;
|
||||
sub [stream + _internal_state_file_start], tmp4
|
||||
; state->b_bytes_processed -= offset;
|
||||
sub b_bytes_processed, tmp4
|
||||
mov b_bytes_valid, D + LA
|
||||
|
||||
MARK __copy_in_ %+ ARCH
|
||||
skip_move:
|
||||
sub blen, x
|
||||
|
||||
mov [rsp + blen_mem_offset], blen
|
||||
|
||||
; copy_in(state->buffer + state->b_bytes_valid, in_buf, x);
|
||||
lea tmp1, [stream + _internal_state_buffer + b_bytes_valid]
|
||||
mov tmp2, in_buf
|
||||
mov tmp3, x
|
||||
|
||||
|
||||
COPY_IN_CRC tmp1, tmp2, tmp3, tmp4, crc_0, crc_1, crc_2, crc_3, crc_fold, \
|
||||
xtmp0, xtmp1, xtmp2, xtmp3, xtmp4
|
||||
|
||||
; in_buf += x;
|
||||
add in_buf, x
|
||||
MARK __prepare_loop_ %+ ARCH
|
||||
skip_move_zero:
|
||||
mov [rsp + in_buf_mem_offset], in_buf
|
||||
; state->b_bytes_valid += x;
|
||||
add b_bytes_valid, x
|
||||
mov [stream + _internal_state_b_bytes_valid], b_bytes_valid %+ d
|
||||
|
||||
; f_end_i = state->b_bytes_valid - LA;
|
||||
%ifnidn f_end_i, b_bytes_valid
|
||||
mov f_end_i, b_bytes_valid
|
||||
%endif
|
||||
; f_end_i -= LA;
|
||||
sub f_end_i, LA
|
||||
; if (f_end_i <= 0) continue;
|
||||
cmp f_end_i, 0
|
||||
jle continue_while
|
||||
|
||||
; f_start_i = state->b_bytes_processed;
|
||||
;; f_i and b_bytes_processed are same register, just store b_bytes_proc
|
||||
mov [stream + _internal_state_b_bytes_processed], b_bytes_processed %+ d
|
||||
|
||||
; f_start_i += (uint32_t)(state->buffer - state->file_start);
|
||||
mov file_start, [stream + _internal_state_file_start]
|
||||
lea tmp1, [stream + _internal_state_buffer]
|
||||
sub tmp1, file_start
|
||||
add f_i, tmp1
|
||||
add f_end_i, tmp1
|
||||
mov [rsp + f_end_i_mem_offset], f_end_i
|
||||
; if (f_end_i <= 0) continue;
|
||||
|
||||
cmp f_end_i, f_i
|
||||
jle input_end
|
||||
|
||||
; for (f_i = f_start_i; f_i < f_end_i; f_i++) {
|
||||
cmp f_i, f_end_i
|
||||
jge end_loop_2
|
||||
|
||||
MARK __misc_compute_hash_lookup_ %+ ARCH
|
||||
MOVDQU xdata, [file_start + f_i]
|
||||
MARK __body_compute_hash_ %+ ARCH
|
||||
mov curr_data, [file_start + f_i]
|
||||
mov tmp3, curr_data
|
||||
mov tmp6, curr_data
|
||||
|
||||
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
|
||||
ja output_end
|
||||
|
||||
;; Encode first byte in the stream as a literal
|
||||
compute_hash hash, curr_data
|
||||
and hash %+ d, HASH_MASK
|
||||
mov [stream + _internal_state_head + 2 * hash], f_i %+ w
|
||||
and curr_data, 0xff
|
||||
get_lit_code curr_data, code2, code_len2, hufftables
|
||||
|
||||
shr tmp3, 8
|
||||
compute_hash hash2, tmp3
|
||||
mov tmp3, [file_start + f_i + 1]
|
||||
mov tmp6, tmp3
|
||||
compute_hash hash, tmp3
|
||||
|
||||
and hash, HASH_MASK
|
||||
and hash2, HASH_MASK
|
||||
shr tmp6, 8
|
||||
compute_hash hash2, tmp6
|
||||
|
||||
cmp dword [rsp + empty_buffer_flag], 0
|
||||
jne write_first_byte
|
||||
MOVD xhash, hash %+ d
|
||||
PINSRD xhash, hash2 %+ d, 1
|
||||
PAND xhash, xhash, xmask
|
||||
|
||||
jmp loop2
|
||||
jmp write_lit_bits
|
||||
|
||||
align 16
|
||||
|
||||
loop2:
|
||||
; if (state->bitbuf.is_full()) {
|
||||
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
|
||||
ja bitbuf_full
|
||||
ja output_end
|
||||
|
||||
xor dist, dist
|
||||
xor dist2, dist2
|
||||
@ -398,7 +264,7 @@ loop2:
|
||||
and dist2 %+ d, (D-1)
|
||||
neg dist2
|
||||
|
||||
MARK __compare_ %+ ARCH
|
||||
MARK __body_compare_ %+ ARCH
|
||||
;; Check for long len/dist match (>7) with first literal
|
||||
MOVQ len, xdata
|
||||
mov curr_data, len
|
||||
@ -436,7 +302,7 @@ MARK __compare_ %+ ARCH
|
||||
test len2 %+ d, 0xFFFFFFFF
|
||||
jnz write_lit_bits
|
||||
|
||||
MARK __len_dist_lit_huffman_ %+ ARCH
|
||||
MARK __body_len_dist_lit_huffman_ %+ ARCH
|
||||
len_dist_lit_huffman_pre:
|
||||
mov code_len3, rcx
|
||||
bsf len2, len2
|
||||
@ -494,17 +360,17 @@ len_dist_lit_huffman:
|
||||
shr curr_data2, 8
|
||||
compute_hash hash2, curr_data2
|
||||
|
||||
%ifdef NO_LIMIT_HASH_UPDATE
|
||||
%ifdef NO_LIMIT_HASH_UPDATE
|
||||
loop3:
|
||||
add tmp3,1
|
||||
add tmp3,1
|
||||
cmp tmp3, f_i
|
||||
jae loop3_done
|
||||
mov tmp6, [file_start + tmp3]
|
||||
compute_hash tmp4, tmp6
|
||||
and tmp4 %+ d, HASH_MASK
|
||||
mov tmp6, [file_start + tmp3]
|
||||
compute_hash tmp4, tmp6
|
||||
and tmp4 %+ d, HASH_MASK
|
||||
; state->head[hash] = k;
|
||||
mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
|
||||
jmp loop3
|
||||
mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
|
||||
jmp loop3
|
||||
loop3_done:
|
||||
%endif
|
||||
; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
|
||||
@ -514,13 +380,14 @@ loop3_done:
|
||||
; continue
|
||||
cmp f_i, f_end_i
|
||||
jl loop2
|
||||
jmp end_loop_2
|
||||
jmp input_end
|
||||
;; encode as dist/len
|
||||
|
||||
MARK __len_dist_huffman_ %+ ARCH
|
||||
MARK __body_len_dist_huffman_ %+ ARCH
|
||||
len_dist_huffman_pre:
|
||||
bsf len, len
|
||||
shr len, 3
|
||||
|
||||
len_dist_huffman:
|
||||
dec f_i
|
||||
neg dist
|
||||
@ -563,29 +430,29 @@ len_dist_huffman:
|
||||
shr curr_data2, 8
|
||||
compute_hash hash2, curr_data2
|
||||
|
||||
%ifdef NO_LIMIT_HASH_UPDATE
|
||||
%ifdef NO_LIMIT_HASH_UPDATE
|
||||
loop4:
|
||||
add tmp3,1
|
||||
|
||||
add tmp3,1
|
||||
cmp tmp3, f_i
|
||||
jae loop4_done
|
||||
mov tmp6, [file_start + tmp3]
|
||||
compute_hash tmp4, tmp6
|
||||
and tmp4, HASH_MASK
|
||||
mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
|
||||
jmp loop4
|
||||
mov tmp6, [file_start + tmp3]
|
||||
compute_hash tmp4, tmp6
|
||||
and tmp4, HASH_MASK
|
||||
mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
|
||||
jmp loop4
|
||||
loop4_done:
|
||||
%endif
|
||||
|
||||
and hash, HASH_MASK
|
||||
and hash2, HASH_MASK
|
||||
; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
|
||||
and hash %+ d, HASH_MASK
|
||||
and hash2 %+ d, HASH_MASK
|
||||
|
||||
; continue
|
||||
cmp f_i, f_end_i
|
||||
jl loop2
|
||||
jmp end_loop_2
|
||||
jmp input_end
|
||||
|
||||
MARK __write_lit_bits_ %+ ARCH
|
||||
MARK __body_write_lit_bits_ %+ ARCH
|
||||
write_lit_bits:
|
||||
MOVDQU xdata, [file_start + f_i + 1]
|
||||
mov f_end_i, [rsp + f_end_i_mem_offset]
|
||||
@ -602,39 +469,7 @@ write_lit_bits:
|
||||
cmp f_i, f_end_i
|
||||
jl loop2
|
||||
|
||||
|
||||
MARK __end_loops_ %+ ARCH
|
||||
end_loop_2:
|
||||
|
||||
; state->b_bytes_processed = f_i - (state->buffer - state->file_start);
|
||||
add f_i, [stream + _internal_state_file_start]
|
||||
sub f_i, stream
|
||||
sub f_i, _internal_state_buffer
|
||||
mov [stream + _internal_state_b_bytes_processed], f_i %+ d
|
||||
|
||||
; continue
|
||||
continue_while:
|
||||
mov blen, [rsp + blen_mem_offset]
|
||||
mov in_buf, [rsp + in_buf_mem_offset]
|
||||
cmp blen, 0
|
||||
jnz loop1
|
||||
|
||||
end:
|
||||
;; update input buffer
|
||||
; stream->total_in += (uint32_t)(in_buf - stream->next_in); // bytes copied
|
||||
mov tmp1 %+ d, [stream + _total_in]
|
||||
mov in_buf, [rsp + in_buf_mem_offset]
|
||||
add tmp1, in_buf
|
||||
sub tmp1, [stream + _next_in]
|
||||
mov [stream + _total_in], tmp1 %+ d
|
||||
|
||||
mov [stream + _next_in], in_buf
|
||||
mov [stream + _avail_in], blen %+ d
|
||||
|
||||
cmp blen, 0
|
||||
jne skip2
|
||||
|
||||
;; Set stream's next state
|
||||
input_end:
|
||||
mov tmp1, ZSTATE_FLUSH_READ_BUFFER
|
||||
mov tmp5, ZSTATE_BODY
|
||||
cmp dword [stream + _end_of_stream], 0
|
||||
@ -642,9 +477,18 @@ end:
|
||||
cmp dword [stream + _flush], _NO_FLUSH
|
||||
cmovne tmp5, tmp1
|
||||
mov dword [stream + _internal_state_state], tmp5 %+ d
|
||||
skip2:
|
||||
|
||||
output_end:
|
||||
;; update input buffer
|
||||
add f_end_i, LA
|
||||
mov [stream + _total_in], f_i %+ d
|
||||
add file_start, f_i
|
||||
mov [stream + _next_in], file_start
|
||||
sub f_end_i, f_i
|
||||
mov [stream + _avail_in], f_end_i %+ d
|
||||
|
||||
;; update output buffer
|
||||
mov [stream + _next_out], m_out_buf
|
||||
; offset = state->bitbuf.buffer_used();
|
||||
sub m_out_buf, [stream + _internal_state_bitbuf_m_out_start]
|
||||
sub [stream + _avail_out], m_out_buf %+ d
|
||||
add [stream + _total_out], m_out_buf %+ d
|
||||
@ -652,12 +496,6 @@ skip2:
|
||||
mov [stream + _internal_state_bitbuf_m_bits], m_bits
|
||||
mov [stream + _internal_state_bitbuf_m_bit_count], m_bit_count %+ d
|
||||
|
||||
|
||||
MOVDQA [stream + _internal_state_crc + 0*16], crc_0
|
||||
MOVDQA [stream + _internal_state_crc + 1*16], crc_1
|
||||
MOVDQA [stream + _internal_state_crc + 2*16], crc_2
|
||||
MOVDQA [stream + _internal_state_crc + 3*16], crc_3
|
||||
|
||||
mov rbx, [rsp + gpr_save_mem_offset + 0*8]
|
||||
mov rsi, [rsp + gpr_save_mem_offset + 1*8]
|
||||
mov rdi, [rsp + gpr_save_mem_offset + 2*8]
|
||||
@ -666,13 +504,6 @@ skip2:
|
||||
mov r13, [rsp + gpr_save_mem_offset + 5*8]
|
||||
mov r14, [rsp + gpr_save_mem_offset + 6*8]
|
||||
mov r15, [rsp + gpr_save_mem_offset + 7*8]
|
||||
MOVDQA xmm6, [rsp + xmm_save_mem_offset + 0*16]
|
||||
MOVDQA xmm7, [rsp + xmm_save_mem_offset + 1*16]
|
||||
MOVDQA xmm8, [rsp + xmm_save_mem_offset + 2*16]
|
||||
MOVDQA xmm9, [rsp + xmm_save_mem_offset + 3*16]
|
||||
MOVDQA xmm10, [rsp + xmm_save_mem_offset + 4*16]
|
||||
MOVDQA xmm11, [rsp + xmm_save_mem_offset + 5*16]
|
||||
MOVDQA xmm12, [rsp + xmm_save_mem_offset + 6*16]
|
||||
|
||||
%ifndef ALIGN_STACK
|
||||
add rsp, stack_size
|
||||
@ -682,17 +513,7 @@ skip2:
|
||||
%endif
|
||||
ret
|
||||
|
||||
MARK __bitbuf_full_ %+ ARCH
|
||||
bitbuf_full:
|
||||
mov blen, [rsp + blen_mem_offset]
|
||||
; state->b_bytes_processed = f_i - (state->buffer - state->file_start);
|
||||
add f_i, [stream + _internal_state_file_start]
|
||||
sub f_i, stream
|
||||
sub f_i, _internal_state_buffer
|
||||
mov [stream + _internal_state_b_bytes_processed], f_i %+ d
|
||||
jmp end
|
||||
|
||||
MARK __compare_loops_ %+ ARCH
|
||||
MARK __body_compare_loops_ %+ ARCH
|
||||
compare_loop:
|
||||
MOVD xhash, tmp6 %+ d
|
||||
PINSRD xhash, tmp2 %+ d, 1
|
||||
@ -711,8 +532,8 @@ compare_loop:
|
||||
jmp len_dist_huffman
|
||||
|
||||
compare_loop2:
|
||||
lea tmp2, [tmp1 + dist2]
|
||||
add tmp1, 1
|
||||
lea tmp2, [tmp1 + dist2]
|
||||
add tmp1, 1
|
||||
%if (COMPARE_TYPE == 1)
|
||||
compare250 tmp1, tmp2, len2, tmp3
|
||||
%elif (COMPARE_TYPE == 2)
|
||||
@ -727,30 +548,7 @@ compare_loop2:
|
||||
get_lit_code curr_data, code3, code_len3, hufftables
|
||||
jmp len_dist_lit_huffman
|
||||
|
||||
MARK __write_first_byte_ %+ ARCH
|
||||
write_first_byte:
|
||||
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
|
||||
ja bitbuf_full
|
||||
|
||||
mov dword [rsp + empty_buffer_flag], 0
|
||||
|
||||
mov [stream + _internal_state_head + 2 * hash], f_i %+ w
|
||||
|
||||
mov hash, hash2
|
||||
shr tmp6, 16
|
||||
compute_hash hash2, tmp6
|
||||
|
||||
MOVD xhash, hash %+ d
|
||||
PINSRD xhash, hash2 %+ d, 1
|
||||
PAND xhash, xhash, xmask
|
||||
|
||||
and curr_data, 0xff
|
||||
get_lit_code curr_data, code2, code_len2, hufftables
|
||||
jmp write_lit_bits
|
||||
|
||||
section .data
|
||||
align 16
|
||||
mask: dd HASH_MASK, HASH_MASK, HASH_MASK, HASH_MASK
|
||||
const_D: dq D
|
||||
|
||||
%endif ;; ifndef TEST
|
||||
|
@ -4,5 +4,4 @@
|
||||
%define COMPARE_TYPE 2
|
||||
%endif
|
||||
|
||||
%include "igzip_buffer_utils_01.asm"
|
||||
%include "igzip_body.asm"
|
||||
|
@ -4,5 +4,4 @@
|
||||
%define COMPARE_TYPE 2
|
||||
%endif
|
||||
|
||||
%include "igzip_buffer_utils_04.asm"
|
||||
%include "igzip_body.asm"
|
||||
|
@ -5,5 +5,4 @@
|
||||
%define COMPARE_TYPE 3
|
||||
%endif
|
||||
|
||||
%include "igzip_buffer_utils_04.asm"
|
||||
%include "igzip_body.asm"
|
||||
|
@ -1,543 +0,0 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%ifndef BUFFER_UTILS
|
||||
%define BUFFER_UTILS
|
||||
|
||||
%include "options.asm"
|
||||
|
||||
extern pshufb_shf_table
|
||||
extern mask3
|
||||
|
||||
; Build-time switch for the streaming-load path.
;  - FIX_CACHE_READ defined:   the token movntdqa is replaced by movdqa.
;  - FIX_CACHE_READ undefined: prefetchnta is redefined as a one-parameter
;    macro with an empty body, so any use of it expands to nothing.
%ifdef FIX_CACHE_READ
|
||||
%define movntdqa movdqa
|
||||
%else
|
||||
%macro prefetchnta 1
|
||||
%endm
|
||||
%endif
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; code for doing the CRC calculation as part of copy-in, using pclmulqdq
|
||||
|
||||
; "shift" 4 input registers down 4 places
|
||||
; macro FOLD4 xmm0, xmm1, xmm2, xmm3, const, tmp0, tmp1
|
||||
; FOLD4: fold all four accumulators in place.
; Each of %1..%4 is replaced by
;     pclmulqdq(reg, const, 0x01) XOR pclmulqdq(reg, const, 0x10)
; (imm 0x01 = high qword of reg times low qword of const,
;  imm 0x10 = low qword of reg times high qword of const).
; const is only read; tmp0 and tmp1 are overwritten.
%macro FOLD4 7
|
||||
%define %%xmm0 %1 ; xmm reg, in/out
|
||||
%define %%xmm1 %2 ; xmm reg, in/out
|
||||
%define %%xmm2 %3 ; xmm reg, in/out
|
||||
%define %%xmm3 %4 ; xmm reg, in/out
|
||||
%define %%const %5 ; xmm reg, in
|
||||
%define %%tmp0 %6 ; xmm reg, tmp
|
||||
%define %%tmp1 %7 ; xmm reg, tmp
|
||||
|
||||
; first pair: xmm0 and xmm1 (copies are needed because pclmulqdq overwrites its destination)
movaps %%tmp0, %%xmm0
|
||||
movaps %%tmp1, %%xmm1
|
||||
|
||||
pclmulqdq %%xmm0, %%const, 0x01
|
||||
pclmulqdq %%xmm1, %%const, 0x01
|
||||
|
||||
pclmulqdq %%tmp0, %%const, 0x10
|
||||
pclmulqdq %%tmp1, %%const, 0x10
|
||||
|
||||
xorps %%xmm0, %%tmp0
|
||||
xorps %%xmm1, %%tmp1
|
||||
|
||||
|
||||
; second pair: xmm2 and xmm3, same sequence reusing tmp0/tmp1
movaps %%tmp0, %%xmm2
|
||||
movaps %%tmp1, %%xmm3
|
||||
|
||||
pclmulqdq %%xmm2, %%const, 0x01
|
||||
pclmulqdq %%xmm3, %%const, 0x01
|
||||
|
||||
pclmulqdq %%tmp0, %%const, 0x10
|
||||
pclmulqdq %%tmp1, %%const, 0x10
|
||||
|
||||
xorps %%xmm2, %%tmp0
|
||||
xorps %%xmm3, %%tmp1
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; "shift" 3 input registers down 4 places
|
||||
; macro FOLD3 x0, x1, x2, x3, const, tmp0
|
||||
; x0 x1 x2 x3
|
||||
; In A B C D
|
||||
; Out D A' B' C'
|
||||
; FOLD3: fold three accumulators and rotate the register set.
; With inputs x0..x3 = A, B, C, D the outputs are x0..x3 = D, A', B', C',
; where X' = pclmulqdq(X, const, 0x01) XOR pclmulqdq(X, const, 0x10).
; D passes through unchanged via tmp0; tmp0 is overwritten.
%macro FOLD3 6
|
||||
%define %%x0 %1 ; xmm reg, in/out
|
||||
%define %%x1 %2 ; xmm reg, in/out
|
||||
%define %%x2 %3 ; xmm reg, in/out
|
||||
%define %%x3 %4 ; xmm reg, in/out
|
||||
%define %%const %5 ; xmm reg, in
|
||||
%define %%tmp0 %6 ; xmm reg, tmp
|
||||
|
||||
; save D before x3 is reused as a destination
movdqa %%tmp0, %%x3
|
||||
|
||||
; x3 = C'
movaps %%x3, %%x2
|
||||
pclmulqdq %%x2, %%const, 0x01
|
||||
pclmulqdq %%x3, %%const, 0x10
|
||||
xorps %%x3, %%x2
|
||||
|
||||
; x2 = B'
movaps %%x2, %%x1
|
||||
pclmulqdq %%x1, %%const, 0x01
|
||||
pclmulqdq %%x2, %%const, 0x10
|
||||
xorps %%x2, %%x1
|
||||
|
||||
; x1 = A'
movaps %%x1, %%x0
|
||||
pclmulqdq %%x0, %%const, 0x01
|
||||
pclmulqdq %%x1, %%const, 0x10
|
||||
xorps %%x1, %%x0
|
||||
|
||||
; x0 = D (unfolded)
movdqa %%x0, %%tmp0
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; "shift" 2 input registers down 4 places
|
||||
; macro FOLD2 x0, x1, x2, x3, const, tmp0
|
||||
; x0 x1 x2 x3
|
||||
; In A B C D
|
||||
; Out C D A' B'
|
||||
; FOLD2: fold two accumulators and rotate the register set.
; With inputs x0..x3 = A, B, C, D the outputs are x0..x3 = C, D, A', B',
; where X' = pclmulqdq(X, const, 0x01) XOR pclmulqdq(X, const, 0x10).
; C and D pass through unchanged via tmp0; tmp0 is overwritten.
%macro FOLD2 6
|
||||
%define %%x0 %1 ; xmm reg, in/out
|
||||
%define %%x1 %2 ; xmm reg, in/out
|
||||
%define %%x2 %3 ; xmm reg, in/out
|
||||
%define %%x3 %4 ; xmm reg, in/out
|
||||
%define %%const %5 ; xmm reg, in
|
||||
%define %%tmp0 %6 ; xmm reg, tmp
|
||||
|
||||
; save D
movdqa %%tmp0, %%x3
|
||||
|
||||
; x3 = B'
movaps %%x3, %%x1
|
||||
pclmulqdq %%x1, %%const, 0x01
|
||||
pclmulqdq %%x3, %%const, 0x10
|
||||
xorps %%x3, %%x1
|
||||
|
||||
; x1 = D, then reuse tmp0 to save C
movdqa %%x1, %%tmp0
|
||||
movdqa %%tmp0, %%x2
|
||||
|
||||
; x2 = A'
movaps %%x2, %%x0
|
||||
pclmulqdq %%x0, %%const, 0x01
|
||||
pclmulqdq %%x2, %%const, 0x10
|
||||
xorps %%x2, %%x0
|
||||
|
||||
; x0 = C (unfolded)
movdqa %%x0, %%tmp0
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; "shift" 1 input registers down 4 places
|
||||
; macro FOLD1 x0, x1, x2, x3, const, tmp0
|
||||
; x0 x1 x2 x3
|
||||
; In A B C D
|
||||
; Out B C D A'
|
||||
; FOLD1: fold one accumulator and rotate the register set.
; With inputs x0..x3 = A, B, C, D the outputs are x0..x3 = B, C, D, A',
; where A' = pclmulqdq(A, const, 0x01) XOR pclmulqdq(A, const, 0x10).
; tmp0 is overwritten.
%macro FOLD1 6
|
||||
%define %%x0 %1 ; xmm reg, in/out
|
||||
%define %%x1 %2 ; xmm reg, in/out
|
||||
%define %%x2 %3 ; xmm reg, in/out
|
||||
%define %%x3 %4 ; xmm reg, in/out
|
||||
%define %%const %5 ; xmm reg, in
|
||||
%define %%tmp0 %6 ; xmm reg, tmp
|
||||
|
||||
; save D
movdqa %%tmp0, %%x3
|
||||
|
||||
; x3 = A'
movaps %%x3, %%x0
|
||||
pclmulqdq %%x0, %%const, 0x01
|
||||
pclmulqdq %%x3, %%const, 0x10
|
||||
xorps %%x3, %%x0
|
||||
|
||||
; shift the untouched values down one register: x0 = B, x1 = C, x2 = D
movdqa %%x0, %%x1
|
||||
movdqa %%x1, %%x2
|
||||
|
||||
movdqa %%x2, %%tmp0
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; macro PARTIAL_FOLD x0, x1, x2, x3, xp, size, xfold, xt0, xt1, xt2, xt3, gtmp
|
||||
|
||||
;                 XP   X3   X2   X1   X0   tmp2
|
||||
; Initial state   xI   HG   FE   DC   BA
|
||||
; after shift          IH   GF   ED   CB   A0
|
||||
; after fold           ff   GF   ED   CB          ff = merge(IH, A0)
|
||||
;
; Shifts the chain {xp, x3, x2, x1, x0} by a byte count taken from %%size using
; two pshufb masks, then folds the part shifted out of x0 (tmp2) into x3.
; Side effects visible in the body: %%size is left multiplied by 16, %%xp and
; %%gtmp are overwritten, and the four xmm temporaries are overwritten.
|
||||
%macro PARTIAL_FOLD 12
|
||||
%define %%x0 %1 ; xmm reg, in/out
|
||||
%define %%x1 %2 ; xmm reg, in/out
|
||||
%define %%x2 %3 ; xmm reg, in/out
|
||||
%define %%x3 %4 ; xmm reg, in/out
|
||||
%define %%xp %5 ; xmm partial reg, in/clobbered
|
||||
%define %%size %6 ; GPR, in/clobbered (1...15)
|
||||
%define %%const %7 ; xmm reg, in
|
||||
%define %%shl %8 ; xmm reg, tmp
|
||||
%define %%shr %9 ; xmm reg, tmp
|
||||
%define %%tmp2 %10 ; xmm reg, tmp
|
||||
%define %%tmp3 %11 ; xmm reg, tmp
|
||||
%define %%gtmp %12 ; GPR, tmp
|
||||
|
||||
; {XP X3 X2 X1 X0} = {xI HG FE DC BA}
|
||||
; Table entries are 16 bytes each; the base is biased by -16 so that size == 1
; selects the first entry of pshufb_shf_table.
shl %%size, 4 ; size *= 16
|
||||
lea %%gtmp, [pshufb_shf_table - 16 WRT_OPT]
|
||||
movdqa %%shl, [%%gtmp + %%size] ; shl constant
|
||||
movdqa %%shr, %%shl
|
||||
; the complementary mask is derived by XOR with mask3 rather than a second table load
pxor %%shr, [mask3 WRT_OPT] ; shr constant
|
||||
|
||||
movdqa %%tmp2, %%x0 ; tmp2 = BA
|
||||
pshufb %%tmp2, %%shl ; tmp2 = A0
|
||||
|
||||
pshufb %%x0, %%shr ; x0 = 0B
|
||||
movdqa %%tmp3, %%x1 ; tmp3 = DC
|
||||
pshufb %%tmp3, %%shl ; tmp3 = C0
|
||||
por %%x0, %%tmp3 ; x0 = CB
|
||||
|
||||
pshufb %%x1, %%shr ; x1 = 0D
|
||||
movdqa %%tmp3, %%x2 ; tmp3 = FE
|
||||
pshufb %%tmp3, %%shl ; tmp3 = E0
|
||||
por %%x1, %%tmp3 ; x1 = ED
|
||||
|
||||
pshufb %%x2, %%shr ; x2 = 0F
|
||||
movdqa %%tmp3, %%x3 ; tmp3 = HG
|
||||
pshufb %%tmp3, %%shl ; tmp3 = G0
|
||||
por %%x2, %%tmp3 ; x2 = GF
|
||||
|
||||
pshufb %%x3, %%shr ; x3 = 0H
|
||||
pshufb %%xp, %%shl ; xp = I0
|
||||
por %%x3, %%xp ; x3 = IH
|
||||
|
||||
; fold tmp2 into X3
|
||||
; x3 ^= pclmulqdq(tmp2, const, 0x01) ^ pclmulqdq(tmp2, const, 0x10)
movaps %%tmp3, %%tmp2
|
||||
pclmulqdq %%tmp2, %%const, 0x01
|
||||
pclmulqdq %%tmp3, %%const, 0x10
|
||||
xorps %%x3, %%tmp2
|
||||
xorps %%x3, %%tmp3
|
||||
%endm
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; LOAD_FRACTIONAL_XMM: Packs xmm register with data when data input is less than 16 bytes.
|
||||
; Returns 0 if data has length 0.
|
||||
; Input: The input data (src), that data's length (size).
|
||||
; Output: The packed xmm register (xmm_out).
|
||||
; size is clobbered.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Builds xmm_out from the `size` bytes at `src`, working backwards from the end:
; an optional 8-byte load of the last 8 bytes, then one byte at a time, each
; step shifting xmm_out left by a byte and inserting the next lower byte at
; position 0. src is moved during the macro but every path that advances it
; also walks it back by the same amount; size counts down to 0.
%macro LOAD_FRACTIONAL_XMM 3
|
||||
%define %%xmm_out %1 ; %%xmm_out is an xmm register
|
||||
%define %%src %2
|
||||
%define %%size %3
|
||||
|
||||
; start from all zeroes so bytes beyond `size` are 0
pxor %%xmm_out, %%xmm_out
|
||||
|
||||
cmp %%size, 0
|
||||
je %%_done
|
||||
|
||||
; point src one past the last valid byte
add %%src, %%size
|
||||
|
||||
cmp %%size, 8
|
||||
jl %%_byte_loop
|
||||
|
||||
sub %%src, 8
|
||||
pinsrq %%xmm_out, [%%src], 0 ;Read in 8 bytes if they exist
|
||||
sub %%size, 8
|
||||
|
||||
; ZF comes from the sub above: exactly 8 bytes means nothing is left
je %%_done
|
||||
|
||||
%%_byte_loop: ;Read in data 1 byte at a time while data is left
|
||||
pslldq %%xmm_out, 1
|
||||
|
||||
dec %%src
|
||||
pinsrb %%xmm_out, BYTE [%%src], 0
|
||||
dec %%size
|
||||
|
||||
; flags come from dec %%size; loop while bytes remain
jg %%_byte_loop
|
||||
|
||||
%%_done:
|
||||
|
||||
%endmacro ; LOAD_FRACTIONAL_XMM
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; copy x bytes (rounded up to 16 bytes) from src to dst
|
||||
; src & dst are unaligned
|
||||
; macro COPY_IN_CRC dst, src, size_in_bytes, tmp, x0, x1, x2, x3, xfold,
|
||||
; xt0, xt1, xt2, xt3, xt4
|
||||
; COPY_IN_CRC: copy `size` bytes from src to dst and, unless DEFLATE is defined,
; fold the copied data into the four accumulators x0..x3 using xfold.
; Flow: (1) inputs under 16 bytes go straight to %%partial; (2) otherwise src is
; brought to a 16-byte boundary; (3) a main loop moves 64 bytes per iteration;
; (4) 0-3 remaining full 16-byte registers are handled by dedicated paths;
; (5) a final 1-15 byte remainder is handled at %%partial.
; dst, src, size and tmp are all modified.
%macro COPY_IN_CRC 14
|
||||
%define %%dst %1 ; reg, in/clobbered
|
||||
%define %%src %2 ; reg, in/clobbered
|
||||
%define %%size %3 ; reg, in/clobbered
|
||||
%define %%tmp %4 ; reg, tmp
|
||||
%define %%x0 %5 ; xmm, in/out: crc state
|
||||
%define %%x1 %6 ; xmm, in/out: crc state
|
||||
%define %%x2 %7 ; xmm, in/out: crc state
|
||||
%define %%x3 %8 ; xmm, in/out: crc state
|
||||
%define %%xfold %9 ; xmm, in: (loaded from fold4)
|
||||
%define %%xtmp0 %10 ; xmm, tmp
|
||||
%define %%xtmp1 %11 ; xmm, tmp
|
||||
%define %%xtmp2 %12 ; xmm, tmp
|
||||
%define %%xtmp3 %13 ; xmm, tmp
|
||||
%define %%xtmp4 %14 ; xmm, tmp
|
||||
|
||||
cmp %%size, 16
|
||||
jl %%lt_16
|
||||
|
||||
; align source
|
||||
; tmp = (0 - src) & 15 = distance from src to the next 16-byte boundary
xor %%tmp, %%tmp
|
||||
sub %%tmp, %%src
|
||||
and %%tmp, 15
|
||||
jz %%already_aligned
|
||||
|
||||
; need to align, tmp contains number of bytes to transfer
|
||||
; a full 16 bytes are copied, but pointers and size move by tmp only
movdqu %%xtmp0, [%%src]
|
||||
movdqu [%%dst], %%xtmp0
|
||||
add %%dst, %%tmp
|
||||
add %%src, %%tmp
|
||||
sub %%size, %%tmp
|
||||
|
||||
%ifndef DEFLATE
|
||||
; dst is lent to PARTIAL_FOLD as its GPR scratch, so preserve it around the call
push %%dst
|
||||
|
||||
PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
|
||||
%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
|
||||
pop %%dst
|
||||
|
||||
%endif
|
||||
|
||||
%%already_aligned:
|
||||
; size is kept biased by -64 while the 64-byte loop runs
sub %%size, 64
|
||||
jl %%end_loop
|
||||
jmp %%loop
|
||||
align 16
|
||||
%%loop:
|
||||
movntdqa %%xtmp0, [%%src+0*16]
|
||||
movntdqa %%xtmp1, [%%src+1*16]
|
||||
movntdqa %%xtmp2, [%%src+2*16]
|
||||
|
||||
%ifndef DEFLATE
|
||||
; xtmp3/xtmp4 serve as FOLD4 scratch here; xtmp3 is loaded with data only afterwards
FOLD4 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3, %%xtmp4
|
||||
%endif
|
||||
movntdqa %%xtmp3, [%%src+3*16]
|
||||
|
||||
movdqu [%%dst+0*16], %%xtmp0
|
||||
movdqu [%%dst+1*16], %%xtmp1
|
||||
movdqu [%%dst+2*16], %%xtmp2
|
||||
movdqu [%%dst+3*16], %%xtmp3
|
||||
|
||||
%ifndef DEFLATE
|
||||
; XOR the new 64 bytes into the folded accumulators
pxor %%x0, %%xtmp0
|
||||
pxor %%x1, %%xtmp1
|
||||
pxor %%x2, %%xtmp2
|
||||
pxor %%x3, %%xtmp3
|
||||
%endif
|
||||
add %%src, 4*16
|
||||
add %%dst, 4*16
|
||||
sub %%size, 4*16
|
||||
jge %%loop
|
||||
|
||||
%%end_loop:
|
||||
; %%size contains (num bytes left - 64)
|
||||
; each add of 16 that reaches >= 0 identifies how many full 16-byte registers
; remain; at the taken branch size holds the leftover byte count (0..15)
add %%size, 16
|
||||
jge %%three_full_regs
|
||||
add %%size, 16
|
||||
jge %%two_full_regs
|
||||
add %%size, 16
|
||||
jge %%one_full_reg
|
||||
add %%size, 16
|
||||
|
||||
; the jz below tests ZF left by the final add above
%%no_full_regs: ; 0 <= %%size < 16, no full regs
|
||||
jz %%done ; if no bytes left, we're done
|
||||
jmp %%partial
|
||||
|
||||
;; Handle case where input is <16 bytes
|
||||
%%lt_16:
|
||||
test %%size, %%size
|
||||
jz %%done ; if no bytes left, we're done
|
||||
jmp %%partial
|
||||
|
||||
|
||||
%%one_full_reg:
|
||||
movntdqa %%xtmp0, [%%src+0*16]
|
||||
|
||||
%ifndef DEFLATE
|
||||
FOLD1 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
|
||||
%endif
|
||||
movdqu [%%dst+0*16], %%xtmp0
|
||||
|
||||
%ifndef DEFLATE
|
||||
pxor %%x3, %%xtmp0
|
||||
%endif
|
||||
test %%size, %%size
|
||||
jz %%done ; if no bytes left, we're done
|
||||
|
||||
add %%dst, 1*16
|
||||
add %%src, 1*16
|
||||
jmp %%partial
|
||||
|
||||
|
||||
%%two_full_regs:
|
||||
movntdqa %%xtmp0, [%%src+0*16]
|
||||
movntdqa %%xtmp1, [%%src+1*16]
|
||||
|
||||
%ifndef DEFLATE
|
||||
FOLD2 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
|
||||
%endif
|
||||
movdqu [%%dst+0*16], %%xtmp0
|
||||
movdqu [%%dst+1*16], %%xtmp1
|
||||
|
||||
%ifndef DEFLATE
|
||||
pxor %%x2, %%xtmp0
|
||||
pxor %%x3, %%xtmp1
|
||||
%endif
|
||||
test %%size, %%size
|
||||
jz %%done ; if no bytes left, we're done
|
||||
|
||||
add %%dst, 2*16
|
||||
add %%src, 2*16
|
||||
jmp %%partial
|
||||
|
||||
|
||||
%%three_full_regs:
|
||||
movntdqa %%xtmp0, [%%src+0*16]
|
||||
movntdqa %%xtmp1, [%%src+1*16]
|
||||
movntdqa %%xtmp2, [%%src+2*16]
|
||||
|
||||
%ifndef DEFLATE
|
||||
FOLD3 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
|
||||
%endif
|
||||
movdqu [%%dst+0*16], %%xtmp0
|
||||
movdqu [%%dst+1*16], %%xtmp1
|
||||
movdqu [%%dst+2*16], %%xtmp2
|
||||
|
||||
%ifndef DEFLATE
|
||||
pxor %%x1, %%xtmp0
|
||||
pxor %%x2, %%xtmp1
|
||||
pxor %%x3, %%xtmp2
|
||||
%endif
|
||||
test %%size, %%size
|
||||
jz %%done ; if no bytes left, we're done
|
||||
|
||||
add %%dst, 3*16
|
||||
add %%src, 3*16
|
||||
|
||||
; fall through to %%partial
|
||||
%%partial: ; 0 <= %%size < 16
|
||||
|
||||
%ifndef DEFLATE
|
||||
; keep the byte count for PARTIAL_FOLD; LOAD_FRACTIONAL_XMM counts size down
mov %%tmp, %%size
|
||||
%endif
|
||||
|
||||
LOAD_FRACTIONAL_XMM %%xtmp0, %%src, %%size
|
||||
|
||||
; a full 16-byte store, even though fewer bytes are valid
movdqu [%%dst], %%xtmp0
|
||||
|
||||
%ifndef DEFLATE
|
||||
; dst is passed as the GPR scratch and is not restored on this path
PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
|
||||
%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
|
||||
%endif
|
||||
|
||||
%%done:
|
||||
%endm
|
||||
|
||||
|
||||
;%assign K 1024;
|
||||
;%assign D 8 * K; ; Amount of history
|
||||
;%assign LA 18 * 16; ; Max look-ahead, rounded up to 32 byte boundary
|
||||
|
||||
; copy D + LA bytes from src to dst
|
||||
; dst is aligned
|
||||
;void copy_D_LA(uint8_t *dst, uint8_t *src);
|
||||
; arg 1: rcx : dst
|
||||
; arg 2: rdx : src
|
||||
; copy_D_LA dst, src, tmp, xtmp0, xtmp1, xtmp2, xtmp3
|
||||
; copy_D_LA: copy (D + LA) / 16 sixteen-byte words from src to dst.
; Loads are unaligned (movdqu); stores are aligned (movdqa), so dst must be
; 16-byte aligned. A run-time loop moves 64 bytes per iteration; the remaining
; 0-3 words are emitted at assembly time by the two %rep blocks.
; NOTE(review): the loop tests its exit condition only after the first
; iteration, so it presumably relies on (D + LA) >= 64 - confirm against options.asm.
%macro copy_D_LA 7
|
||||
%define %%dst %1 ; reg, clobbered
|
||||
%define %%src %2 ; reg, clobbered
|
||||
%define %%tmp %3
|
||||
%define %%xtmp0 %4
|
||||
%define %%xtmp1 %5
|
||||
%define %%xtmp2 %6
|
||||
%define %%xtmp3 %7
|
||||
|
||||
%assign %%SIZE (D + LA) / 16 ; number of DQ words to be copied
|
||||
; number of whole 4-word (64-byte) groups
%assign %%SIZE4 %%SIZE/4
|
||||
|
||||
; tmp = dst address at which the 64-byte loop stops
lea %%tmp, [%%dst + 4 * 16 * %%SIZE4]
|
||||
jmp %%copy_D_LA_1
|
||||
align 16
|
||||
%%copy_D_LA_1:
|
||||
movdqu %%xtmp0, [%%src]
|
||||
movdqu %%xtmp1, [%%src+16]
|
||||
movdqu %%xtmp2, [%%src+32]
|
||||
movdqu %%xtmp3, [%%src+48]
|
||||
movdqa [%%dst], %%xtmp0
|
||||
movdqa [%%dst+16], %%xtmp1
|
||||
movdqa [%%dst+32], %%xtmp2
|
||||
movdqa [%%dst+48], %%xtmp3
|
||||
add %%src, 4*16
|
||||
add %%dst, 4*16
|
||||
cmp %%dst, %%tmp
|
||||
jne %%copy_D_LA_1
|
||||
; tail, step 1: load the SIZE mod 4 leftover words, one register per word
%assign %%i 0
|
||||
%rep (%%SIZE - 4 * %%SIZE4)
|
||||
|
||||
%if (%%i == 0)
|
||||
movdqu %%xtmp0, [%%src + %%i*16]
|
||||
%elif (%%i == 1)
|
||||
movdqu %%xtmp1, [%%src + %%i*16]
|
||||
%elif (%%i == 2)
|
||||
movdqu %%xtmp2, [%%src + %%i*16]
|
||||
%elif (%%i == 3)
|
||||
movdqu %%xtmp3, [%%src + %%i*16]
|
||||
%else
|
||||
%error too many i
|
||||
% error
|
||||
%endif
|
||||
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
; tail, step 2: store the same registers to the matching dst offsets
%assign %%i 0
|
||||
%rep (%%SIZE - 4 * %%SIZE4)
|
||||
|
||||
%if (%%i == 0)
|
||||
movdqa [%%dst + %%i*16], %%xtmp0
|
||||
%elif (%%i == 1)
|
||||
movdqa [%%dst + %%i*16], %%xtmp1
|
||||
%elif (%%i == 2)
|
||||
movdqa [%%dst + %%i*16], %%xtmp2
|
||||
%elif (%%i == 3)
|
||||
movdqa [%%dst + %%i*16], %%xtmp3
|
||||
%else
|
||||
%error too many i
|
||||
% error
|
||||
%endif
|
||||
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
%endm
|
||||
%endif
|
@ -1,552 +0,0 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%ifndef BUFFER_UTILS
|
||||
%define BUFFER_UTILS
|
||||
|
||||
%include "options.asm"
|
||||
|
||||
extern pshufb_shf_table
|
||||
extern mask3
|
||||
|
||||
; Build-time switch for the streaming-load path (VEX-encoded variant).
;  - FIX_CACHE_READ defined:   the token vmovntdqa is replaced by vmovdqa.
;  - FIX_CACHE_READ undefined: prefetchnta is redefined as a one-parameter
;    macro with an empty body, so any use of it expands to nothing.
%ifdef FIX_CACHE_READ
|
||||
%define vmovntdqa vmovdqa
|
||||
%else
|
||||
%macro prefetchnta 1
|
||||
%endm
|
||||
%endif
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; code for doing the CRC calculation as part of copy-in, using pclmulqdq
|
||||
|
||||
; "shift" 4 input registers down 4 places
|
||||
; macro FOLD4 xmm0, xmm1, xmm2, xmm3, const, tmp0, tmp1
|
||||
; FOLD4 (VEX-encoded variant): fold all four accumulators in place.
; Each of %1..%4 is replaced by
;     vpclmulqdq(reg, const, 0x01) XOR vpclmulqdq(reg, const, 0x10)
; (imm 0x01 = high qword of reg times low qword of const,
;  imm 0x10 = low qword of reg times high qword of const).
; The instructions are written in two-operand form, with the destination also
; acting as the first source. const is only read; tmp0 and tmp1 are overwritten.
%macro FOLD4 7
|
||||
%define %%xmm0 %1 ; xmm reg, in/out
|
||||
%define %%xmm1 %2 ; xmm reg, in/out
|
||||
%define %%xmm2 %3 ; xmm reg, in/out
|
||||
%define %%xmm3 %4 ; xmm reg, in/out
|
||||
%define %%const %5 ; xmm reg, in
|
||||
%define %%tmp0 %6 ; xmm reg, tmp
|
||||
%define %%tmp1 %7 ; xmm reg, tmp
|
||||
|
||||
; first pair: xmm0 and xmm1
vmovaps %%tmp0, %%xmm0
|
||||
vmovaps %%tmp1, %%xmm1
|
||||
|
||||
vpclmulqdq %%xmm0, %%const, 0x01
|
||||
vpclmulqdq %%xmm1, %%const, 0x01
|
||||
|
||||
vpclmulqdq %%tmp0, %%const, 0x10
|
||||
vpclmulqdq %%tmp1, %%const, 0x10
|
||||
|
||||
vxorps %%xmm0, %%tmp0
|
||||
vxorps %%xmm1, %%tmp1
|
||||
|
||||
|
||||
; second pair: xmm2 and xmm3, same sequence reusing tmp0/tmp1
vmovaps %%tmp0, %%xmm2
|
||||
vmovaps %%tmp1, %%xmm3
|
||||
|
||||
vpclmulqdq %%xmm2, %%const, 0x01
|
||||
vpclmulqdq %%xmm3, %%const, 0x01
|
||||
|
||||
vpclmulqdq %%tmp0, %%const, 0x10
|
||||
vpclmulqdq %%tmp1, %%const, 0x10
|
||||
|
||||
vxorps %%xmm2, %%tmp0
|
||||
vxorps %%xmm3, %%tmp1
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; "shift" 3 input registers down 4 places
|
||||
; macro FOLD3 x0, x1, x2, x3, const, tmp0
|
||||
; x0 x1 x2 x3
|
||||
; In A B C D
|
||||
; Out D A' B' C'
|
||||
; FOLD3 (VEX-encoded variant): fold three accumulators and rotate the register set.
; With inputs x0..x3 = A, B, C, D the outputs are x0..x3 = D, A', B', C',
; where X' = vpclmulqdq(X, const, 0x01) XOR vpclmulqdq(X, const, 0x10).
; D passes through unchanged via tmp0; tmp0 is overwritten.
%macro FOLD3 6
|
||||
%define %%x0 %1 ; xmm reg, in/out
|
||||
%define %%x1 %2 ; xmm reg, in/out
|
||||
%define %%x2 %3 ; xmm reg, in/out
|
||||
%define %%x3 %4 ; xmm reg, in/out
|
||||
%define %%const %5 ; xmm reg, in
|
||||
%define %%tmp0 %6 ; xmm reg, tmp
|
||||
|
||||
; save D before x3 is reused as a destination
vmovdqa %%tmp0, %%x3
|
||||
|
||||
; x3 = C'
vmovaps %%x3, %%x2
|
||||
vpclmulqdq %%x2, %%const, 0x01
|
||||
vpclmulqdq %%x3, %%const, 0x10
|
||||
vxorps %%x3, %%x2
|
||||
|
||||
; x2 = B'
vmovaps %%x2, %%x1
|
||||
vpclmulqdq %%x1, %%const, 0x01
|
||||
vpclmulqdq %%x2, %%const, 0x10
|
||||
vxorps %%x2, %%x1
|
||||
|
||||
; x1 = A'
vmovaps %%x1, %%x0
|
||||
vpclmulqdq %%x0, %%const, 0x01
|
||||
vpclmulqdq %%x1, %%const, 0x10
|
||||
vxorps %%x1, %%x0
|
||||
|
||||
; x0 = D (unfolded)
vmovdqa %%x0, %%tmp0
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; "shift" 2 input registers down 4 places
|
||||
; macro FOLD2 x0, x1, x2, x3, const, tmp0
|
||||
; x0 x1 x2 x3
|
||||
; In A B C D
|
||||
; Out C D A' B'
|
||||
; FOLD2 (VEX-encoded variant): fold two accumulators and rotate the register set.
; With inputs x0..x3 = A, B, C, D the outputs are x0..x3 = C, D, A', B',
; where X' = vpclmulqdq(X, const, 0x01) XOR vpclmulqdq(X, const, 0x10).
; C and D pass through unchanged via tmp0; tmp0 is overwritten.
%macro FOLD2 6
|
||||
%define %%x0 %1 ; xmm reg, in/out
|
||||
%define %%x1 %2 ; xmm reg, in/out
|
||||
%define %%x2 %3 ; xmm reg, in/out
|
||||
%define %%x3 %4 ; xmm reg, in/out
|
||||
%define %%const %5 ; xmm reg, in
|
||||
%define %%tmp0 %6 ; xmm reg, tmp
|
||||
|
||||
; save D
vmovdqa %%tmp0, %%x3
|
||||
|
||||
; x3 = B'
vmovaps %%x3, %%x1
|
||||
vpclmulqdq %%x1, %%const, 0x01
|
||||
vpclmulqdq %%x3, %%const, 0x10
|
||||
vxorps %%x3, %%x1
|
||||
|
||||
; x1 = D, then reuse tmp0 to save C
vmovdqa %%x1, %%tmp0
|
||||
vmovdqa %%tmp0, %%x2
|
||||
|
||||
; x2 = A'
vmovaps %%x2, %%x0
|
||||
vpclmulqdq %%x0, %%const, 0x01
|
||||
vpclmulqdq %%x2, %%const, 0x10
|
||||
vxorps %%x2, %%x0
|
||||
|
||||
; x0 = C (unfolded)
vmovdqa %%x0, %%tmp0
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; "shift" 1 input registers down 4 places
|
||||
; macro FOLD1 x0, x1, x2, x3, const, tmp0
|
||||
; x0 x1 x2 x3
|
||||
; In A B C D
|
||||
; Out B C D A'
|
||||
; FOLD1 (VEX-encoded variant): fold one accumulator and rotate the register set.
; With inputs x0..x3 = A, B, C, D the outputs are x0..x3 = B, C, D, A',
; where A' = vpclmulqdq(A, const, 0x01) XOR vpclmulqdq(A, const, 0x10).
; tmp0 is overwritten.
%macro FOLD1 6
|
||||
%define %%x0 %1 ; xmm reg, in/out
|
||||
%define %%x1 %2 ; xmm reg, in/out
|
||||
%define %%x2 %3 ; xmm reg, in/out
|
||||
%define %%x3 %4 ; xmm reg, in/out
|
||||
%define %%const %5 ; xmm reg, in
|
||||
%define %%tmp0 %6 ; xmm reg, tmp
|
||||
|
||||
; save D
vmovdqa %%tmp0, %%x3
|
||||
|
||||
; x3 = A'
vmovaps %%x3, %%x0
|
||||
vpclmulqdq %%x0, %%const, 0x01
|
||||
vpclmulqdq %%x3, %%const, 0x10
|
||||
vxorps %%x3, %%x0
|
||||
|
||||
; shift the untouched values down one register: x0 = B, x1 = C, x2 = D
vmovdqa %%x0, %%x1
|
||||
vmovdqa %%x1, %%x2
|
||||
|
||||
vmovdqa %%x2, %%tmp0
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; macro PARTIAL_FOLD x0, x1, x2, x3, xp, size, xfold, xt0, xt1, xt2, xt3, gtmp
|
||||
|
||||
;                 XP   X3   X2   X1   X0   tmp2
|
||||
; Initial state   xI   HG   FE   DC   BA
|
||||
; after shift          IH   GF   ED   CB   A0
|
||||
; after fold           ff   GF   ED   CB          ff = merge(IH, A0)
|
||||
;
; VEX-encoded variant. Shifts the chain {xp, x3, x2, x1, x0} by a byte count
; taken from %%size using two vpshufb masks, then folds the part shifted out of
; x0 (tmp2) into x3.
; Side effects visible in the body: %%size is left multiplied by 16, %%xp and
; %%gtmp are overwritten, and the four xmm temporaries are overwritten.
|
||||
%macro PARTIAL_FOLD 12
|
||||
%define %%x0 %1 ; xmm reg, in/out
|
||||
%define %%x1 %2 ; xmm reg, in/out
|
||||
%define %%x2 %3 ; xmm reg, in/out
|
||||
%define %%x3 %4 ; xmm reg, in/out
|
||||
%define %%xp %5 ; xmm partial reg, in/clobbered
|
||||
%define %%size %6 ; GPR, in/clobbered (1...15)
|
||||
%define %%const %7 ; xmm reg, in
|
||||
%define %%shl %8 ; xmm reg, tmp
|
||||
%define %%shr %9 ; xmm reg, tmp
|
||||
%define %%tmp2 %10 ; xmm reg, tmp
|
||||
%define %%tmp3 %11 ; xmm reg, tmp
|
||||
%define %%gtmp %12 ; GPR, tmp
|
||||
|
||||
; {XP X3 X2 X1 X0} = {xI HG FE DC BA}
|
||||
; Table entries are 16 bytes each; the base is biased by -16 so that size == 1
; selects the first entry of pshufb_shf_table.
shl %%size, 4 ; size *= 16
|
||||
lea %%gtmp, [pshufb_shf_table - 16 WRT_OPT]
|
||||
vmovdqa %%shl, [%%gtmp + %%size] ; shl constant
|
||||
vmovdqa %%shr, %%shl
|
||||
; the complementary mask is derived by XOR with mask3 rather than a second table load
vpxor %%shr, [mask3 WRT_OPT] ; shr constant
|
||||
|
||||
vmovdqa %%tmp2, %%x0 ; tmp2 = BA
|
||||
vpshufb %%tmp2, %%shl ; tmp2 = A0
|
||||
|
||||
vpshufb %%x0, %%shr ; x0 = 0B
|
||||
vmovdqa %%tmp3, %%x1 ; tmp3 = DC
|
||||
vpshufb %%tmp3, %%shl ; tmp3 = C0
|
||||
vpor %%x0, %%tmp3 ; x0 = CB
|
||||
|
||||
vpshufb %%x1, %%shr ; x1 = 0D
|
||||
vmovdqa %%tmp3, %%x2 ; tmp3 = FE
|
||||
vpshufb %%tmp3, %%shl ; tmp3 = E0
|
||||
vpor %%x1, %%tmp3 ; x1 = ED
|
||||
|
||||
vpshufb %%x2, %%shr ; x2 = 0F
|
||||
vmovdqa %%tmp3, %%x3 ; tmp3 = HG
|
||||
vpshufb %%tmp3, %%shl ; tmp3 = G0
|
||||
vpor %%x2, %%tmp3 ; x2 = GF
|
||||
|
||||
vpshufb %%x3, %%shr ; x3 = 0H
|
||||
vpshufb %%xp, %%shl ; xp = I0
|
||||
vpor %%x3, %%xp ; x3 = IH
|
||||
|
||||
; fold tmp2 into X3
|
||||
; x3 ^= vpclmulqdq(tmp2, const, 0x01) ^ vpclmulqdq(tmp2, const, 0x10)
vmovaps %%tmp3, %%tmp2
|
||||
vpclmulqdq %%tmp2, %%const, 0x01
|
||||
vpclmulqdq %%tmp3, %%const, 0x10
|
||||
vxorps %%x3, %%tmp2
|
||||
vxorps %%x3, %%tmp3
|
||||
%endm
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; LOAD_FRACTIONAL_XMM: Packs xmm register with data when data input is less than 16 bytes.
|
||||
; Returns 0 if data has length 0.
|
||||
; Input: The input data (src), that data's length (size).
|
||||
; Output: The packed xmm register (xmm_out).
|
||||
; size is clobbered.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; VEX-encoded variant. Builds xmm_out from the `size` bytes at `src`, working
; backwards from the end: an optional 8-byte load of the last 8 bytes, then one
; byte at a time, each step shifting xmm_out left by a byte and inserting the
; next lower byte at position 0. src is moved during the macro but every path
; that advances it also walks it back by the same amount; size counts down to 0.
%macro LOAD_FRACTIONAL_XMM 3
|
||||
%define %%xmm_out %1 ; %%xmm_out is an xmm register
|
||||
%define %%src %2
|
||||
%define %%size %3
|
||||
|
||||
; start from all zeroes so bytes beyond `size` are 0
vpxor %%xmm_out, %%xmm_out
|
||||
|
||||
cmp %%size, 0
|
||||
je %%_done
|
||||
|
||||
; point src one past the last valid byte
add %%src, %%size
|
||||
|
||||
cmp %%size, 8
|
||||
jl %%_byte_loop
|
||||
|
||||
sub %%src, 8
|
||||
vpinsrq %%xmm_out, [%%src], 0 ;Read in 8 bytes if they exist
|
||||
sub %%size, 8
|
||||
|
||||
; ZF comes from the sub above: exactly 8 bytes means nothing is left
je %%_done
|
||||
|
||||
%%_byte_loop: ;Read in data 1 byte at a time while data is left
|
||||
vpslldq %%xmm_out, 1
|
||||
|
||||
dec %%src
|
||||
vpinsrb %%xmm_out, BYTE [%%src], 0
|
||||
dec %%size
|
||||
|
||||
; flags come from dec %%size; loop while bytes remain
jg %%_byte_loop
|
||||
|
||||
%%_done:
|
||||
|
||||
%endmacro ; LOAD_FRACTIONAL_XMM
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; copy x bytes (rounded up to 16 bytes) from src to dst
|
||||
; src & dst are unaligned
|
||||
; macro COPY_IN_CRC dst, src, size_in_bytes, tmp, x0, x1, x2, x3, xfold,
|
||||
; xt0, xt1, xt2, xt3, xt4
|
||||
; COPY_IN_CRC (VEX-encoded variant): copy `size` bytes from src to dst and,
; unless DEFLATE is defined, fold the copied data into the four accumulators
; x0..x3 using xfold.
; Flow: (1) inputs under 16 bytes go straight to %%partial; (2) otherwise src is
; brought to a 16-byte boundary; (3) a main loop moves 64 bytes per iteration;
; (4) 0-3 remaining full 16-byte registers are handled by dedicated paths;
; (5) a final 1-15 byte remainder is handled at %%partial.
; dst, src, size and tmp are all modified.
%macro COPY_IN_CRC 14
|
||||
%define %%dst %1 ; reg, in/clobbered
|
||||
%define %%src %2 ; reg, in/clobbered
|
||||
%define %%size %3 ; reg, in/clobbered
|
||||
%define %%tmp %4 ; reg, tmp
|
||||
%define %%x0 %5 ; xmm, in/out: crc state
|
||||
%define %%x1 %6 ; xmm, in/out: crc state
|
||||
%define %%x2 %7 ; xmm, in/out: crc state
|
||||
%define %%x3 %8 ; xmm, in/out: crc state
|
||||
%define %%xfold %9 ; xmm, in: (loaded from fold4)
|
||||
%define %%xtmp0 %10 ; xmm, tmp
|
||||
%define %%xtmp1 %11 ; xmm, tmp
|
||||
%define %%xtmp2 %12 ; xmm, tmp
|
||||
%define %%xtmp3 %13 ; xmm, tmp
|
||||
%define %%xtmp4 %14 ; xmm, tmp
|
||||
|
||||
cmp %%size, 16
|
||||
jl %%lt_16
|
||||
|
||||
; align source
|
||||
; tmp = (0 - src) & 15 = distance from src to the next 16-byte boundary
xor %%tmp, %%tmp
|
||||
sub %%tmp, %%src
|
||||
and %%tmp, 15
|
||||
jz %%already_aligned
|
||||
|
||||
; need to align, tmp contains number of bytes to transfer
|
||||
; a full 16 bytes are copied, but pointers and size move by tmp only
vmovdqu %%xtmp0, [%%src]
|
||||
vmovdqu [%%dst], %%xtmp0
|
||||
add %%dst, %%tmp
|
||||
add %%src, %%tmp
|
||||
sub %%size, %%tmp
|
||||
|
||||
%ifndef DEFLATE
|
||||
; dst is lent to PARTIAL_FOLD as its GPR scratch, so preserve it around the call
push %%dst
|
||||
|
||||
PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
|
||||
%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
|
||||
pop %%dst
|
||||
|
||||
%endif
|
||||
|
||||
%%already_aligned:
|
||||
; size is kept biased by -64 while the 64-byte loop runs
sub %%size, 64
|
||||
jl %%end_loop
|
||||
jmp %%loop
|
||||
align 16
|
||||
%%loop:
|
||||
vmovntdqa %%xtmp0, [%%src+0*16]
|
||||
vmovntdqa %%xtmp1, [%%src+1*16]
|
||||
vmovntdqa %%xtmp2, [%%src+2*16]
|
||||
|
||||
%ifndef DEFLATE
|
||||
; xtmp3/xtmp4 serve as FOLD4 scratch here; xtmp3 is loaded with data only afterwards
FOLD4 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3, %%xtmp4
|
||||
%endif
|
||||
vmovntdqa %%xtmp3, [%%src+3*16]
|
||||
|
||||
vmovdqu [%%dst+0*16], %%xtmp0
|
||||
vmovdqu [%%dst+1*16], %%xtmp1
|
||||
vmovdqu [%%dst+2*16], %%xtmp2
|
||||
vmovdqu [%%dst+3*16], %%xtmp3
|
||||
|
||||
%ifndef DEFLATE
|
||||
; XOR the new 64 bytes into the folded accumulators
vpxor %%x0, %%xtmp0
|
||||
vpxor %%x1, %%xtmp1
|
||||
vpxor %%x2, %%xtmp2
|
||||
vpxor %%x3, %%xtmp3
|
||||
%endif
|
||||
add %%src, 4*16
|
||||
add %%dst, 4*16
|
||||
sub %%size, 4*16
|
||||
jge %%loop
|
||||
|
||||
%%end_loop:
|
||||
; %%size contains (num bytes left - 64)
|
||||
; each add of 16 that reaches >= 0 identifies how many full 16-byte registers
; remain; at the taken branch size holds the leftover byte count (0..15)
add %%size, 16
|
||||
jge %%three_full_regs
|
||||
add %%size, 16
|
||||
jge %%two_full_regs
|
||||
add %%size, 16
|
||||
jge %%one_full_reg
|
||||
add %%size, 16
|
||||
|
||||
; the jz below tests ZF left by the final add above
%%no_full_regs: ; 0 <= %%size < 16, no full regs
|
||||
jz %%done ; if no bytes left, we're done
|
||||
jmp %%partial
|
||||
|
||||
;; Handle case where input is <16 bytes
|
||||
%%lt_16:
|
||||
test %%size, %%size
|
||||
jz %%done ; if no bytes left, we're done
|
||||
jmp %%partial
|
||||
|
||||
|
||||
%%one_full_reg:
|
||||
vmovntdqa %%xtmp0, [%%src+0*16]
|
||||
|
||||
%ifndef DEFLATE
|
||||
FOLD1 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
|
||||
%endif
|
||||
vmovdqu [%%dst+0*16], %%xtmp0
|
||||
|
||||
%ifndef DEFLATE
|
||||
vpxor %%x3, %%xtmp0
|
||||
%endif
|
||||
test %%size, %%size
|
||||
jz %%done ; if no bytes left, we're done
|
||||
|
||||
add %%dst, 1*16
|
||||
add %%src, 1*16
|
||||
jmp %%partial
|
||||
|
||||
|
||||
%%two_full_regs:
|
||||
vmovntdqa %%xtmp0, [%%src+0*16]
|
||||
vmovntdqa %%xtmp1, [%%src+1*16]
|
||||
|
||||
%ifndef DEFLATE
|
||||
FOLD2 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
|
||||
%endif
|
||||
vmovdqu [%%dst+0*16], %%xtmp0
|
||||
vmovdqu [%%dst+1*16], %%xtmp1
|
||||
|
||||
%ifndef DEFLATE
|
||||
vpxor %%x2, %%xtmp0
|
||||
vpxor %%x3, %%xtmp1
|
||||
%endif
|
||||
test %%size, %%size
|
||||
jz %%done ; if no bytes left, we're done
|
||||
|
||||
add %%dst, 2*16
|
||||
add %%src, 2*16
|
||||
jmp %%partial
|
||||
|
||||
|
||||
%%three_full_regs:
|
||||
vmovntdqa %%xtmp0, [%%src+0*16]
|
||||
vmovntdqa %%xtmp1, [%%src+1*16]
|
||||
vmovntdqa %%xtmp2, [%%src+2*16]
|
||||
|
||||
%ifndef DEFLATE
|
||||
FOLD3 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
|
||||
%endif
|
||||
vmovdqu [%%dst+0*16], %%xtmp0
|
||||
vmovdqu [%%dst+1*16], %%xtmp1
|
||||
vmovdqu [%%dst+2*16], %%xtmp2
|
||||
|
||||
%ifndef DEFLATE
|
||||
vpxor %%x1, %%xtmp0
|
||||
vpxor %%x2, %%xtmp1
|
||||
vpxor %%x3, %%xtmp2
|
||||
%endif
|
||||
test %%size, %%size
|
||||
jz %%done ; if no bytes left, we're done
|
||||
|
||||
add %%dst, 3*16
|
||||
add %%src, 3*16
|
||||
|
||||
; fall through to %%partial
|
||||
%%partial: ; 0 <= %%size < 16
|
||||
|
||||
%ifndef DEFLATE
|
||||
; keep the byte count for PARTIAL_FOLD; LOAD_FRACTIONAL_XMM counts size down
mov %%tmp, %%size
|
||||
%endif
|
||||
|
||||
LOAD_FRACTIONAL_XMM %%xtmp0, %%src, %%size
|
||||
|
||||
; a full 16-byte store, even though fewer bytes are valid
vmovdqu [%%dst], %%xtmp0
|
||||
|
||||
%ifndef DEFLATE
|
||||
; dst is passed as the GPR scratch and is not restored on this path
PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
|
||||
%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
|
||||
%endif
|
||||
|
||||
%%done:
|
||||
%endm
|
||||
|
||||
|
||||
;%assign K 1024;
|
||||
;%assign D 8 * K; ; Amount of history
|
||||
;%assign LA 17 * 16; ; Max look-ahead, rounded up to 32 byte boundary
|
||||
|
||||
; copy D + LA bytes from src to dst
|
||||
; dst is aligned
|
||||
;void copy_D_LA(uint8_t *dst, uint8_t *src);
|
||||
; arg 1: rcx : dst
|
||||
; arg 2: rdx : src
|
||||
; copy_D_LA dst, src, tmp, xtmp0, xtmp1, xtmp2, xtmp3
|
||||
; copy_D_LA (32-byte register variant): copy D + LA bytes, rounded down to a
; multiple of 16, from src to dst.
; Loads are unaligned (vmovdqu); stores are aligned (vmovdqa), so dst must be
; suitably aligned. A run-time loop moves 128 bytes per iteration; 0-3 leftover
; 32-byte words and then at most one 16-byte word are emitted at assembly time.
; NOTE(review): the loop tests its exit condition only after the first
; iteration, so it presumably relies on (D + LA) >= 128 - confirm against options.asm.
%macro copy_D_LA 7
|
||||
%define %%dst %1 ; reg, clobbered
|
||||
%define %%src %2 ; reg, clobbered
|
||||
%define %%tmp %3
|
||||
%define %%ytmp0 %4
|
||||
%define %%ytmp1 %5
|
||||
%define %%ytmp2 %6
|
||||
%define %%ytmp3 %7
|
||||
|
||||
; NOTE(review): expands to the caller's 4th argument with an "x" suffix;
; presumably that name resolves to the 16-byte alias of the same register - confirm.
%define %%xtmp0 %4x
|
||||
|
||||
%assign %%SIZE (D + LA) / 32 ; number of 32-byte words to be copied
|
||||
; number of whole 4-word (128-byte) groups
%assign %%SIZE4 %%SIZE/4
|
||||
; 16-byte words left after the 32-byte words: 0 or 1
%assign %%MOD16 ((D + LA) - 32 * %%SIZE) / 16
|
||||
|
||||
; tmp = dst address at which the 128-byte loop stops
lea %%tmp, [%%dst + 4 * 32 * %%SIZE4]
|
||||
jmp %%copy_D_LA_1
|
||||
align 16
|
||||
%%copy_D_LA_1:
|
||||
vmovdqu %%ytmp0, [%%src]
|
||||
vmovdqu %%ytmp1, [%%src + 1 * 32]
|
||||
vmovdqu %%ytmp2, [%%src + 2 * 32]
|
||||
vmovdqu %%ytmp3, [%%src + 3 * 32]
|
||||
vmovdqa [%%dst], %%ytmp0
|
||||
vmovdqa [%%dst + 1 * 32], %%ytmp1
|
||||
vmovdqa [%%dst + 2 * 32], %%ytmp2
|
||||
vmovdqa [%%dst + 3 * 32], %%ytmp3
|
||||
add %%src, 4*32
|
||||
add %%dst, 4*32
|
||||
cmp %%dst, %%tmp
|
||||
jne %%copy_D_LA_1
|
||||
; tail, step 1: load the SIZE mod 4 leftover 32-byte words, one register per word
%assign %%i 0
|
||||
%rep (%%SIZE - 4 * %%SIZE4)
|
||||
|
||||
%if (%%i == 0)
|
||||
vmovdqu %%ytmp0, [%%src + %%i*32]
|
||||
%elif (%%i == 1)
|
||||
vmovdqu %%ytmp1, [%%src + %%i*32]
|
||||
%elif (%%i == 2)
|
||||
vmovdqu %%ytmp2, [%%src + %%i*32]
|
||||
%elif (%%i == 3)
|
||||
vmovdqu %%ytmp3, [%%src + %%i*32]
|
||||
%else
|
||||
%error too many i
|
||||
% error
|
||||
%endif
|
||||
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
; tail, step 2: store the same registers to the matching dst offsets
%assign %%i 0
|
||||
%rep (%%SIZE - 4 * %%SIZE4)
|
||||
|
||||
%if (%%i == 0)
|
||||
vmovdqa [%%dst + %%i*32], %%ytmp0
|
||||
%elif (%%i == 1)
|
||||
vmovdqa [%%dst + %%i*32], %%ytmp1
|
||||
%elif (%%i == 2)
|
||||
vmovdqa [%%dst + %%i*32], %%ytmp2
|
||||
%elif (%%i == 3)
|
||||
vmovdqa [%%dst + %%i*32], %%ytmp3
|
||||
%else
|
||||
%error too many i
|
||||
% error
|
||||
%endif
|
||||
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
|
||||
; tail, step 3: one final 16-byte word, placed right after the leftover 32-byte words
%rep %%MOD16
|
||||
vmovdqu %%xtmp0, [%%src + (%%SIZE - 4 * %%SIZE4)*32]
|
||||
vmovdqa [%%dst + (%%SIZE - 4 * %%SIZE4)*32], %%xtmp0
|
||||
%endrep
|
||||
|
||||
%endm
|
||||
%endif
|
@ -41,6 +41,7 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define curr_data rax
|
||||
%define tmp1 rax
|
||||
|
||||
%define f_index rbx
|
||||
@ -69,7 +70,6 @@
|
||||
%define m_bit_count r11
|
||||
|
||||
%define code2 r12
|
||||
|
||||
%define f_end_i r12
|
||||
|
||||
%define file_start r13
|
||||
@ -110,32 +110,29 @@ skip_SLOP:
|
||||
|
||||
mov hufftables, [stream + _hufftables]
|
||||
|
||||
; f_i = state->b_bytes_processed;
|
||||
; f_end_i = state->b_bytes_valid;
|
||||
mov f_i %+ d, [stream + _internal_state_b_bytes_processed]
|
||||
mov f_end_i %+ d, [stream + _internal_state_b_bytes_valid]
|
||||
mov file_start, [stream + _next_in]
|
||||
|
||||
; f_i += (uint32_t)(state->buffer - state->file_start);
|
||||
; f_end_i += (uint32_t)(state->buffer - state->file_start);
|
||||
mov file_start, [stream + _internal_state_file_start]
|
||||
lea tmp1, [stream + _internal_state_buffer]
|
||||
sub tmp1, file_start
|
||||
add f_i, tmp1
|
||||
add f_end_i, tmp1
|
||||
mov f_i %+ d, dword [stream + _total_in]
|
||||
sub file_start, f_i
|
||||
|
||||
mov f_end_i %+ d, dword [stream + _avail_in]
|
||||
add f_end_i, f_i
|
||||
|
||||
sub f_end_i, LAST_BYTES_COUNT
|
||||
mov [rsp + f_end_i_mem_offset], f_end_i
|
||||
; for (f_i = f_start_i; f_i < f_end_i; f_i++) {
|
||||
cmp f_i, f_end_i
|
||||
jge end_loop_2
|
||||
|
||||
mov tmp1, [file_start + f_i]
|
||||
mov curr_data %+ d, [file_start + f_i]
|
||||
|
||||
cmp dword [stream + _internal_state_b_bytes_processed], 0
|
||||
cmp dword [stream + _internal_state_b_bytes_processed], 0 ;TODO fixz
|
||||
jne skip_write_first_byte
|
||||
|
||||
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
|
||||
ja end_loop_2
|
||||
|
||||
compute_hash hash, tmp1
|
||||
compute_hash hash, curr_data
|
||||
and hash %+ d, HASH_MASK
|
||||
mov [stream + _internal_state_head + 2 * hash], f_i %+ w
|
||||
jmp encode_literal
|
||||
@ -148,7 +145,8 @@ loop2:
|
||||
ja end_loop_2
|
||||
|
||||
; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
|
||||
compute_hash hash, tmp1
|
||||
mov curr_data %+ d, [file_start + f_i]
|
||||
compute_hash hash, curr_data
|
||||
and hash %+ d, HASH_MASK
|
||||
|
||||
; f_index = state->head[hash];
|
||||
@ -171,6 +169,7 @@ loop2:
|
||||
; len = f_end_i - f_i;
|
||||
mov tmp4, [rsp + f_end_i_mem_offset]
|
||||
sub tmp4, f_i
|
||||
add tmp4, LAST_BYTES_COUNT
|
||||
|
||||
; if (len > 258) len = 258;
|
||||
cmp tmp4, 258
|
||||
@ -206,11 +205,13 @@ loop2:
|
||||
; for (k = f_i+1, f_i += len-1; k <= f_i; k++) {
|
||||
lea tmp3, [f_i + 1] ; tmp3 <= k
|
||||
add f_i, len
|
||||
%ifdef LIMIT_HASH_UPDATE
|
||||
cmp f_i, [rsp + f_end_i_mem_offset]
|
||||
jae skip_hash_update
|
||||
|
||||
; only update hash twice
|
||||
|
||||
; hash = compute_hash(state->file_start + k) & HASH_MASK;
|
||||
mov tmp6, [file_start + tmp3]
|
||||
mov tmp6 %+ d, dword [file_start + tmp3]
|
||||
compute_hash hash, tmp6
|
||||
and hash %+ d, HASH_MASK
|
||||
; state->head[hash] = k;
|
||||
@ -219,27 +220,13 @@ loop2:
|
||||
add tmp3, 1
|
||||
|
||||
; hash = compute_hash(state->file_start + k) & HASH_MASK;
|
||||
mov tmp6, [file_start + tmp3]
|
||||
mov tmp6 %+ d, dword [file_start + tmp3]
|
||||
compute_hash hash, tmp6
|
||||
and hash %+ d, HASH_MASK
|
||||
; state->head[hash] = k;
|
||||
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
|
||||
|
||||
%else
|
||||
loop3:
|
||||
; hash = compute_hash(state->file_start + k) & HASH_MASK;
|
||||
mov tmp6, [file_start + tmp3]
|
||||
compute_hash hash, tmp6
|
||||
and hash %+ d, HASH_MASK
|
||||
; state->head[hash] = k;
|
||||
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
|
||||
inc tmp3
|
||||
cmp tmp3, f_i
|
||||
jl loop3
|
||||
%endif
|
||||
|
||||
mov tmp1 %+ d, [file_start + f_i]
|
||||
|
||||
skip_hash_update:
|
||||
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp5
|
||||
|
||||
; continue
|
||||
@ -248,8 +235,6 @@ loop3:
|
||||
jmp end_loop_2
|
||||
|
||||
encode_literal:
|
||||
mov tmp1 %+ d, [file_start + f_i + 1]
|
||||
|
||||
; get_lit_code(state->file_start[f_i], &code2, &code_len2);
|
||||
movzx tmp5, byte [file_start + f_i]
|
||||
get_lit_code tmp5, code2, code_len2, hufftables
|
||||
@ -262,19 +247,29 @@ encode_literal:
|
||||
jl loop2
|
||||
|
||||
end_loop_2:
|
||||
|
||||
mov f_end_i, [rsp + f_end_i_mem_offset]
|
||||
add f_end_i, LAST_BYTES_COUNT
|
||||
mov [rsp + f_end_i_mem_offset], f_end_i
|
||||
; if ((f_i >= f_end_i) && ! state->bitbuf.is_full()) {
|
||||
cmp f_i, f_end_i
|
||||
jge write_eob
|
||||
|
||||
xor tmp5, tmp5
|
||||
final_bytes:
|
||||
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
|
||||
ja not_end
|
||||
movzx tmp5, byte [file_start + f_i]
|
||||
get_lit_code tmp5, code2, code_len2, hufftables
|
||||
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
|
||||
|
||||
inc f_i
|
||||
cmp f_i, [rsp + f_end_i_mem_offset]
|
||||
jl not_end
|
||||
jl final_bytes
|
||||
|
||||
write_eob:
|
||||
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
|
||||
ja not_end
|
||||
|
||||
cmp dword [stream + _end_of_stream], 1
|
||||
jne cont
|
||||
cmp dword [stream + _internal_state_left_over], 0
|
||||
jg not_end
|
||||
|
||||
cont:
|
||||
; get_lit_code(256, &code2, &code_len2);
|
||||
get_lit_code 256, code2, code_len2, hufftables
|
||||
|
||||
@ -293,14 +288,16 @@ sync_flush:
|
||||
; }
|
||||
not_end:
|
||||
|
||||
; state->b_bytes_processed = f_i - (state->buffer - state->file_start);
|
||||
add f_i, [stream + _internal_state_file_start]
|
||||
sub f_i, stream
|
||||
sub f_i, _internal_state_buffer
|
||||
mov [stream + _internal_state_b_bytes_processed], f_i %+ d
|
||||
|
||||
; // update output buffer
|
||||
; stream->next_out = state->bitbuf.buffer_ptr();
|
||||
;; Update input buffer
|
||||
mov f_end_i, [rsp + f_end_i_mem_offset]
|
||||
mov [stream + _total_in], f_i %+ d
|
||||
add file_start, f_i
|
||||
mov [stream + _next_in], file_start
|
||||
sub f_end_i, f_i
|
||||
mov [stream + _avail_in], f_end_i %+ d
|
||||
|
||||
;; Update output buffer
|
||||
mov [stream + _next_out], m_out_buf
|
||||
; len = state->bitbuf.buffer_used();
|
||||
sub m_out_buf, [stream + _internal_state_bitbuf_m_out_start]
|
||||
|
@ -50,23 +50,17 @@ extern isal_deflate_body_04
|
||||
extern isal_deflate_finish_base
|
||||
extern isal_deflate_finish_01
|
||||
|
||||
extern get_crc_base
|
||||
extern get_crc_01
|
||||
|
||||
extern isal_update_histogram_base
|
||||
extern isal_update_histogram_01
|
||||
extern isal_update_histogram_04
|
||||
|
||||
extern isal_deflate_init_base
|
||||
extern isal_deflate_init_01
|
||||
extern crc32_gzip_base
|
||||
extern crc32_gzip_01
|
||||
|
||||
section .text
|
||||
|
||||
%include "multibinary.asm"
|
||||
|
||||
mbin_interface isal_deflate_init
|
||||
mbin_dispatch_init5 isal_deflate_init, isal_deflate_init_base, isal_deflate_init_01, isal_deflate_init_01, isal_deflate_init_01
|
||||
|
||||
mbin_interface isal_deflate_body_stateless
|
||||
mbin_dispatch_init5 isal_deflate_body_stateless, isal_deflate_body_stateless_base, isal_deflate_body_stateless_01, isal_deflate_body_stateless_02, isal_deflate_body_stateless_04
|
||||
|
||||
@ -75,8 +69,8 @@ mbin_dispatch_init5 isal_deflate_body, isal_deflate_body_base, isal_deflate_body
|
||||
mbin_interface isal_deflate_finish
|
||||
mbin_dispatch_init5 isal_deflate_finish, isal_deflate_finish_base, isal_deflate_finish_01, isal_deflate_finish_01, isal_deflate_finish_01
|
||||
|
||||
mbin_interface get_crc
|
||||
mbin_dispatch_init5 get_crc, get_crc_base, get_crc_01, get_crc_01, get_crc_01
|
||||
|
||||
mbin_interface isal_update_histogram
|
||||
mbin_dispatch_init5 isal_update_histogram, isal_update_histogram_base, isal_update_histogram_01, isal_update_histogram_01, isal_update_histogram_04
|
||||
|
||||
mbin_interface crc32_gzip
|
||||
mbin_dispatch_init5 crc32_gzip, crc32_gzip_base, crc32_gzip_01, crc32_gzip_01, crc32_gzip_01
|
||||
|
@ -37,7 +37,6 @@
|
||||
|
||||
%include "stdmac.asm"
|
||||
|
||||
%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
|
||||
%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
|
||||
|
||||
%ifdef DEBUG
|
||||
@ -179,8 +178,7 @@ skip_SLOP:
|
||||
mov [stream + _internal_state_b_bytes_valid], f_end_i %+ d
|
||||
|
||||
mov f_i, 0
|
||||
mov file_start, [stream + _next_in]
|
||||
mov [stream + _internal_state_file_start], file_start
|
||||
mov file_start, [stream + _internal_state_file_start]
|
||||
|
||||
; f_end_i -= LA;
|
||||
sub f_end_i, LA_STATELESS
|
||||
|
@ -231,14 +231,13 @@ struct isal_zstate {
|
||||
uint32_t b_bytes_valid; //!< number of bytes of valid data in buffer
|
||||
uint32_t b_bytes_processed; //!< keeps track of the number of bytes processed in isal_zstate.buffer
|
||||
uint8_t *file_start; //!< pointer to where file would logically start
|
||||
DECLARE_ALIGNED(uint32_t crc[16], 16); //!< actually 4 128-bit integers
|
||||
uint32_t crc; //!< Current crc
|
||||
struct BitBuf2 bitbuf; //!< Bit Buffer
|
||||
enum isal_zstate_state state; //!< Current state in processing the data stream
|
||||
uint32_t count; //!< used for partial header/trailer writes
|
||||
uint8_t tmp_out_buff[16]; //!< temporary array
|
||||
uint32_t tmp_out_start; //!< temporary variable
|
||||
uint32_t tmp_out_end; //!< temporary variable
|
||||
uint32_t last_flush; //!< keeps track of last submitted flush
|
||||
uint32_t has_gzip_hdr; //!< keeps track of if the gzip header has been written.
|
||||
uint32_t has_eob; //!< keeps track of eob on the last deflate block
|
||||
uint32_t has_eob_hdr; //!< keeps track of eob hdr (with BFINAL set)
|
||||
|
Loading…
x
Reference in New Issue
Block a user