igzip: Improve igzip stateful on large buffers

Change-Id: I3e9e56958e8ea3e636df3369b29c3d4b94dce9d8
Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
Author: Roy Oursler 2016-09-12 13:15:32 -07:00, committed by Greg Tucker
parent 547e8e1893
commit d941b66162
19 changed files with 457 additions and 2214 deletions

View File

@ -28,8 +28,6 @@
########################################################################
lsrc += igzip/igzip.c igzip/hufftables_c.c \
igzip/crc_utils_01.asm \
igzip/crc_utils_04.asm \
igzip/igzip_body_01.asm \
igzip/igzip_body_02.asm \
igzip/igzip_body_04.asm \
@ -64,8 +62,6 @@ other_tests += igzip/igzip_file_perf igzip/igzip_sync_flush_file_perf igzip/igz
other_src += igzip/bitbuf2.asm igzip/data_struct2.asm \
igzip/inflate_data_structs.asm \
igzip/igzip_buffer_utils_01.asm \
igzip/igzip_buffer_utils_04.asm \
igzip/igzip_body.asm igzip/igzip_finish.asm \
igzip/lz0a_const.asm igzip/options.asm igzip/stdmac.asm igzip/igzip_compare_types.asm \
igzip/bitbuf2.h igzip/repeated_char_result.h \

View File

@ -84,8 +84,8 @@ section .text
%endif
align 16
global crc32_gzip
crc32_gzip:
global crc32_gzip_01
crc32_gzip_01:
; unsigned long c = crc ^ 0xffffffffL;
not arg1_low32 ;

View File

@ -8,43 +8,6 @@ section .data
align 32
global pshufb_shf_table:data internal
pshufb_shf_table:
dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
;; ; MAGIC value, which when folded 4 times gives FFFFFF00000...0000
;; global crc_init_4
;; crc_init_4:
;; dq 0x9db42487
;; dq 0x0
;; dq 0x0
;; dq 0x0
; constant used to shift/fold one XMM reg down by 4 XMM widths
global fold_4:data internal
fold_4:
dq 0x00000001c6e41596
dq 0x0000000154442bd4
; value, which when xored with the pshufb_shf_table entry, gives the shr value
global mask3:data internal
mask3: dq 0x8080808080808080, 0x8080808080808080
%ifndef CRC_TABLE
%define CRC_TABLE
; Place marker in library to avoid linker warning

View File

@ -1,195 +0,0 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "options.asm"
%include "reg_sizes.asm"
; Functional versions of CRC macros
%include "igzip_buffer_utils_01.asm"
extern fold_4
%define crc_0 xmm0 ; in/out: crc state
%define crc_1 xmm1 ; in/out: crc state
%define crc_2 xmm2 ; in/out: crc state
%define crc_3 xmm3 ; in/out: crc state
%define crc_fold xmm4 ; in: (loaded from fold_4)
%define crc_tmp0 xmm5 ; tmp
%define crc_tmp1 xmm6 ; tmp
%define crc_tmp2 xmm7 ; tmp
%define crc_tmp3 xmm8 ; tmp
%define crc_tmp4 xmm9 ; tmp
%define tmp4 rax
; copy x bytes (rounded up to 16 bytes) from src to dst with crc
; src & dst are unaligned
; void copy_in_crc(uint8_t *dst, uint8_t *src, uint32_t size, uint32_t *crc)
; arg 1: rcx: pointer to dst
; arg 2: rdx: pointer to src
; arg 3: r8: size (in bytes)
; arg 4: r9: pointer to CRC
;; %if 0
global copy_in_crc_01
copy_in_crc_01:
%ifidn __OUTPUT_FORMAT__, elf64
mov r9, rcx
mov r8, rdx
mov rdx, rsi
mov rcx, rdi
%endif
; Save xmm registers that need to be preserved.
sub rsp, 8 + 4*16
movdqa [rsp+0*16], xmm6
movdqa [rsp+1*16], xmm7
movdqa [rsp+2*16], xmm8
movdqa [rsp+3*16], xmm9
movdqa crc_0, [r9 + 0*16]
movdqa crc_1, [r9 + 1*16]
movdqa crc_2, [r9 + 2*16]
movdqa crc_3, [r9 + 3*16]
movdqa crc_fold, [fold_4 WRT_OPT]
COPY_IN_CRC rcx, rdx, r8, tmp4, crc_0, crc_1, crc_2, crc_3, \
crc_fold, \
crc_tmp0, crc_tmp1, crc_tmp2, crc_tmp3, crc_tmp4
movdqa [r9 + 0*16], crc_0
movdqa [r9 + 1*16], crc_1
movdqa [r9 + 2*16], crc_2
movdqa [r9 + 3*16], crc_3
movdqa xmm9, [rsp+3*16]
movdqa xmm8, [rsp+2*16]
movdqa xmm7, [rsp+1*16]
movdqa xmm6, [rsp+0*16]
add rsp, 8 + 4*16
ret
; Convert 512-bit CRC data to real 32-bit value
; uint32_t crc_512to32(uint32_t *crc)
; arg 1: rcx: pointer to CRC
; returns: eax: 32 bit crc
global crc_512to32_01
crc_512to32_01:
%ifidn __OUTPUT_FORMAT__, elf64
mov rcx, rdi
%endif
movdqa crc_0, [rcx + 0*16]
movdqa crc_1, [rcx + 1*16]
movdqa crc_2, [rcx + 2*16]
movdqa crc_3, [rcx + 3*16]
movdqa crc_fold, [rk1 WRT_OPT] ;k1
; fold the 4 xmm registers to 1 xmm register with different constants
movdqa crc_tmp0, crc_0
pclmulqdq crc_0, crc_fold, 0x1
pclmulqdq crc_tmp0, crc_fold, 0x10
pxor crc_1, crc_tmp0
pxor crc_1, crc_0
movdqa crc_tmp0, crc_1
pclmulqdq crc_1, crc_fold, 0x1
pclmulqdq crc_tmp0, crc_fold, 0x10
pxor crc_2, crc_tmp0
pxor crc_2, crc_1
movdqa crc_tmp0, crc_2
pclmulqdq crc_2, crc_fold, 0x1
pclmulqdq crc_tmp0, crc_fold, 0x10
pxor crc_3, crc_tmp0
pxor crc_3, crc_2
movdqa crc_fold, [rk5 WRT_OPT]
movdqa crc_0, crc_3
pclmulqdq crc_3, crc_fold, 0
psrldq crc_0, 8
pxor crc_3, crc_0
movdqa crc_0, crc_3
pslldq crc_3, 4
pclmulqdq crc_3, crc_fold, 0x10
pxor crc_3, crc_0
pand crc_3, [mask2 WRT_OPT]
movdqa crc_1, crc_3
movdqa crc_2, crc_3
movdqa crc_fold, [rk7 WRT_OPT]
pclmulqdq crc_3, crc_fold, 0
pxor crc_3, crc_2
pand crc_3, [mask WRT_OPT]
movdqa crc_2, crc_3
pclmulqdq crc_3, crc_fold, 0x10
pxor crc_3, crc_2
pxor crc_3, crc_1
pextrd eax, crc_3, 2
not eax
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section .data
align 16
rk1: dq 0x00000000ccaa009e
rk2: dq 0x00000001751997d0
rk5: dq 0x00000000ccaa009e
rk6: dq 0x0000000163cd6124
rk7: dq 0x00000001f7011640
rk8: dq 0x00000001db710640
mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF

View File

@ -1,194 +0,0 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "options.asm"
%include "reg_sizes.asm"
; Functional versions of CRC macros
%include "igzip_buffer_utils_04.asm"
extern fold_4
%define crc_0 xmm0 ; in/out: crc state
%define crc_1 xmm1 ; in/out: crc state
%define crc_2 xmm2 ; in/out: crc state
%define crc_3 xmm3 ; in/out: crc state
%define crc_fold xmm4 ; in: (loaded from fold_4)
%define crc_tmp0 xmm5 ; tmp
%define crc_tmp1 xmm6 ; tmp
%define crc_tmp2 xmm7 ; tmp
%define crc_tmp3 xmm8 ; tmp
%define crc_tmp4 xmm9 ; tmp
%define tmp4 rax
; copy x bytes (rounded up to 16 bytes) from src to dst with crc
; src & dst are unaligned
; void copy_in_crc(uint8_t *dst, uint8_t *src, uint32_t size, uint32_t *crc)
; arg 1: rcx: pointer to dst
; arg 2: rdx: pointer to src
; arg 3: r8: size (in bytes)
; arg 4: r9: pointer to CRC
;; %if 0
global copy_in_crc_04
copy_in_crc_04:
%ifidn __OUTPUT_FORMAT__, elf64
mov r9, rcx
mov r8, rdx
mov rdx, rsi
mov rcx, rdi
%endif
; Save xmm registers that need to be preserved.
sub rsp, 8 + 4*16
vmovdqa [rsp+0*16], xmm6
vmovdqa [rsp+1*16], xmm7
vmovdqa [rsp+2*16], xmm8
vmovdqa [rsp+3*16], xmm9
vmovdqa crc_0, [r9 + 0*16]
vmovdqa crc_1, [r9 + 1*16]
vmovdqa crc_2, [r9 + 2*16]
vmovdqa crc_3, [r9 + 3*16]
vmovdqa crc_fold, [fold_4 WRT_OPT]
COPY_IN_CRC rcx, rdx, r8, tmp4, crc_0, crc_1, crc_2, crc_3, \
crc_fold, \
crc_tmp0, crc_tmp1, crc_tmp2, crc_tmp3, crc_tmp4
vmovdqa [r9 + 0*16], crc_0
vmovdqa [r9 + 1*16], crc_1
vmovdqa [r9 + 2*16], crc_2
vmovdqa [r9 + 3*16], crc_3
vmovdqa xmm9, [rsp+3*16]
vmovdqa xmm8, [rsp+2*16]
vmovdqa xmm7, [rsp+1*16]
vmovdqa xmm6, [rsp+0*16]
add rsp, 8 + 4*16
ret
; Convert 512-bit CRC data to real 32-bit value
; uint32_t crc_512to32(uint32_t *crc)
; arg 1: rcx: pointer to CRC
; returns: eax: 32 bit crc
global crc_512to32_04
crc_512to32_04:
%ifidn __OUTPUT_FORMAT__, elf64
mov rcx, rdi
%endif
vmovdqa crc_0, [rcx + 0*16]
vmovdqa crc_1, [rcx + 1*16]
vmovdqa crc_2, [rcx + 2*16]
vmovdqa crc_3, [rcx + 3*16]
vmovdqa crc_fold, [rk1 WRT_OPT] ;k1
; fold the 4 xmm registers to 1 xmm register with different constants
vmovdqa crc_tmp0, crc_0
vpclmulqdq crc_0, crc_fold, 0x1
vpclmulqdq crc_tmp0, crc_fold, 0x10
vpxor crc_1, crc_tmp0
vpxor crc_1, crc_0
vmovdqa crc_tmp0, crc_1
vpclmulqdq crc_1, crc_fold, 0x1
vpclmulqdq crc_tmp0, crc_fold, 0x10
vpxor crc_2, crc_tmp0
vpxor crc_2, crc_1
vmovdqa crc_tmp0, crc_2
vpclmulqdq crc_2, crc_fold, 0x1
vpclmulqdq crc_tmp0, crc_fold, 0x10
vpxor crc_3, crc_tmp0
vpxor crc_3, crc_2
vmovdqa crc_fold, [rk5 WRT_OPT]
vmovdqa crc_0, crc_3
vpclmulqdq crc_3, crc_fold, 0
vpsrldq crc_0, 8
vpxor crc_3, crc_0
vmovdqa crc_0, crc_3
vpslldq crc_3, 4
vpclmulqdq crc_3, crc_fold, 0x10
vpxor crc_3, crc_0
vpand crc_3, [mask2 WRT_OPT]
vmovdqa crc_1, crc_3
vmovdqa crc_2, crc_3
vmovdqa crc_fold, [rk7 WRT_OPT]
vpclmulqdq crc_3, crc_fold, 0
vpxor crc_3, crc_2
vpand crc_3, [mask WRT_OPT]
vmovdqa crc_2, crc_3
vpclmulqdq crc_3, crc_fold, 0x10
vpxor crc_3, crc_2
vpxor crc_3, crc_1
vpextrd eax, crc_3, 2
not eax
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section .data
align 16
rk1: dq 0x00000000ccaa009e
rk2: dq 0x00000001751997d0
rk5: dq 0x00000000ccaa009e
rk6: dq 0x0000000163cd6124
rk7: dq 0x00000001f7011640
rk8: dq 0x00000001db710640
mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF

View File

@ -73,14 +73,13 @@ START_FIELDS ;; isal_zstate
FIELD _b_bytes_valid, 4, 4
FIELD _b_bytes_processed, 4, 4
FIELD _file_start, 8, 8
FIELD _crc, 64, 16
FIELD _crc, 4, 4
FIELD _bitbuf, _BitBuf2_size, _BitBuf2_align
FIELD _state, 4, 4
FIELD _count, 4, 4
FIELD _tmp_out_buff, 16, 1
FIELD _tmp_out_start, 4, 4
FIELD _tmp_out_end, 4, 4
FIELD _last_flush, 4, 4
FIELD _has_gzip_hdr, 4, 4
FIELD _has_eob, 4, 4
FIELD _has_eob_hdr, 4, 4
@ -128,7 +127,6 @@ _internal_state_count equ _internal_state+_count
_internal_state_tmp_out_buff equ _internal_state+_tmp_out_buff
_internal_state_tmp_out_start equ _internal_state+_tmp_out_start
_internal_state_tmp_out_end equ _internal_state+_tmp_out_end
_internal_state_last_flush equ _internal_state+_last_flush
_internal_state_has_gzip_hdr equ _internal_state+_has_gzip_hdr
_internal_state_has_eob equ _internal_state+_has_eob
_internal_state_has_eob_hdr equ _internal_state+_has_eob_hdr
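
For orientation, here is a hedged C view of the isal_zstate fields this hunk touches, inferred from the FIELD size/alignment pairs above (a sketch only; the authoritative layout lives in the library's C headers):

#include <stdint.h>

/* Sketch inferred from the FIELD entries above; not the real definition.
 * Key change: _crc shrinks from a 64-byte, 16-aligned block of folded
 * CRC state to a single 32-bit value, and last_flush is dropped. */
struct isal_zstate_sketch {
	uint32_t b_bytes_valid;
	uint32_t b_bytes_processed;
	uint8_t *file_start;
	uint32_t crc;             /* was a 64-byte, 16-aligned folded-CRC block */
	/* struct BitBuf2 bitbuf;   size/alignment taken from _BitBuf2_* */
	uint32_t state;
	uint32_t count;
	uint8_t  tmp_out_buff[16];
	uint32_t tmp_out_start;
	uint32_t tmp_out_end;
	/* uint32_t last_flush;     removed by this commit */
	uint32_t has_gzip_hdr;
	uint32_t has_eob;
	uint32_t has_eob_hdr;
};

The net effect: the 512-bit pclmulqdq CRC folding state is gone, replaced by one running uint32_t CRC.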

View File

@ -43,9 +43,6 @@
# define DEFLATE 1
#endif
extern uint32_t CrcTable[256];
static inline uint32_t bsr(uint32_t val)
{
uint32_t msb;
@ -210,16 +207,3 @@ static inline int compare258(uint8_t * str1, uint8_t * str2, uint32_t max_length
return count;
}
static inline void update_crc(uint32_t* crc, uint8_t * start, uint32_t length)
{
#ifndef DEFLATE
uint8_t *end = start + length;
while (start < end)
*crc = (*crc >> 8) ^ CrcTable[(*crc & 0x000000FF) ^ *start++];
#else
return;
#endif
}

View File

@ -54,6 +54,7 @@ extern const uint8_t gzip_hdr[];
extern const uint32_t gzip_hdr_bytes;
extern const uint32_t gzip_trl_bytes;
extern const struct isal_hufftables hufftables_default;
extern uint32_t CrcTable[256];
extern uint32_t crc32_gzip(uint32_t init_crc, const unsigned char *buf, uint64_t len);
@ -76,8 +77,6 @@ unsigned int detect_repeated_char(uint8_t * buf, uint32_t size);
void isal_deflate_body(struct isal_zstream *stream);
void isal_deflate_finish(struct isal_zstream *stream);
uint32_t crc_512to32_01(uint32_t * crc);
uint32_t get_crc(uint32_t * crc);
/*****************************************************************/
@ -107,12 +106,6 @@ struct slver isal_deflate_stateless_slver_01010083;
struct slver isal_deflate_stateless_slver = { 0x0083, 0x01, 0x01 };
/*****************************************************************/
uint32_t file_size(struct isal_zstate *state)
{
return state->b_bytes_valid + (uint32_t) (state->buffer - state->file_start);
}
static
void sync_flush(struct isal_zstream *stream)
{
@ -213,9 +206,14 @@ static void flush_write_buffer(struct isal_zstream *stream)
}
}
static void isal_deflate_int(struct isal_zstream *stream)
static void isal_deflate_pass(struct isal_zstream *stream)
{
struct isal_zstate *state = &stream->internal_state;
#ifndef DEFLATE
uint8_t *start_in = stream->next_in;
#endif
if (state->state == ZSTATE_NEW_HDR || state->state == ZSTATE_HDR)
write_header(stream);
@ -231,10 +229,79 @@ static void isal_deflate_int(struct isal_zstream *stream)
if (state->state == ZSTATE_FLUSH_WRITE_BUFFER)
flush_write_buffer(stream);
#ifndef DEFLATE
state->crc = crc32_gzip(state->crc, start_in, stream->next_in - start_in);
#endif
if (state->state == ZSTATE_TRL)
write_trailer(stream);
}
static void isal_deflate_int(struct isal_zstream *stream)
{
struct isal_zstate *state = &stream->internal_state;
uint32_t size;
/* Move data from temporary output buffer to output buffer */
if (state->state >= ZSTATE_TMP_OFFSET) {
size = state->tmp_out_end - state->tmp_out_start;
if (size > stream->avail_out)
size = stream->avail_out;
memcpy(stream->next_out, state->tmp_out_buff + state->tmp_out_start, size);
stream->next_out += size;
stream->avail_out -= size;
stream->total_out += size;
state->tmp_out_start += size;
if (state->tmp_out_start == state->tmp_out_end)
state->state -= ZSTATE_TMP_OFFSET;
if (stream->avail_out == 0 || state->state == ZSTATE_END
|| state->state == ZSTATE_NEW_HDR)
return;
}
assert(state->tmp_out_start == state->tmp_out_end);
isal_deflate_pass(stream);
/* Fill temporary output buffer then complete filling output buffer */
if (stream->avail_out > 0 && stream->avail_out < 8 && state->state != ZSTATE_NEW_HDR) {
uint8_t *next_out;
uint32_t avail_out;
uint32_t total_out;
next_out = stream->next_out;
avail_out = stream->avail_out;
total_out = stream->total_out;
stream->next_out = state->tmp_out_buff;
stream->avail_out = sizeof(state->tmp_out_buff);
stream->total_out = 0;
isal_deflate_pass(stream);
state->tmp_out_start = 0;
state->tmp_out_end = stream->total_out;
stream->next_out = next_out;
stream->avail_out = avail_out;
stream->total_out = total_out;
if (state->tmp_out_end) {
size = state->tmp_out_end;
if (size > stream->avail_out)
size = stream->avail_out;
memcpy(stream->next_out, state->tmp_out_buff, size);
stream->next_out += size;
stream->avail_out -= size;
stream->total_out += size;
state->tmp_out_start += size;
if (state->tmp_out_start != state->tmp_out_end)
state->state += ZSTATE_TMP_OFFSET;
}
}
}
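
Both tmp_out_buff drains in isal_deflate_int above follow the same copy-and-advance pattern. A hedged sketch of that pattern as a helper (helper name hypothetical; struct definitions assumed from the library header):

#include <string.h>

/* Move as much of tmp_out_buff as fits into the caller's output buffer;
 * returns the number of bytes moved. Mirrors the two drains above. */
static uint32_t drain_tmp_out(struct isal_zstream *stream)
{
	struct isal_zstate *state = &stream->internal_state;
	uint32_t size = state->tmp_out_end - state->tmp_out_start;

	if (size > stream->avail_out)
		size = stream->avail_out;

	memcpy(stream->next_out, state->tmp_out_buff + state->tmp_out_start, size);
	stream->next_out += size;
	stream->avail_out -= size;
	stream->total_out += size;
	state->tmp_out_start += size;

	return size;
}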
static uint32_t write_constant_compressed_stateless(struct isal_zstream *stream,
uint32_t repeated_char,
uint32_t repeated_length,
@ -396,6 +463,7 @@ static int isal_deflate_int_stateless(struct isal_zstream *stream, uint8_t * nex
return STATELESS_OVERFLOW;
memset(stream->internal_state.head, 0, sizeof(stream->internal_state.head));
stream->internal_state.file_start = stream->next_in;
isal_deflate_body_stateless(stream);
if (!stream->internal_state.has_eob)
@ -494,13 +562,11 @@ static inline void reset_match_history(struct isal_zstream *stream)
int i = 0;
for (i = 0; i < sizeof(state->head) / 2; i++) {
head[i] =
(uint16_t) (state->b_bytes_processed + state->buffer - state->file_start -
IGZIP_HIST_SIZE);
head[i] = (uint16_t) (stream->total_in);
}
}
void isal_deflate_init_01(struct isal_zstream *stream)
void isal_deflate_init(struct isal_zstream *stream)
{
struct isal_zstate *state = &stream->internal_state;
@ -514,7 +580,6 @@ void isal_deflate_init_01(struct isal_zstream *stream)
state->has_eob = 0;
state->has_eob_hdr = 0;
state->left_over = 0;
state->last_flush = 0;
state->has_gzip_hdr = 0;
state->state = ZSTATE_NEW_HDR;
state->count = 0;
@ -522,12 +587,11 @@ void isal_deflate_init_01(struct isal_zstream *stream)
state->tmp_out_start = 0;
state->tmp_out_end = 0;
state->file_start = state->buffer;
state->file_start = stream->next_in;
init(&state->bitbuf);
memset(state->crc, 0, sizeof(state->crc));
*state->crc = 0x9db42487;
state->crc = 0;
memset(state->head, 0, sizeof(state->head));
@ -544,6 +608,15 @@ void isal_deflate_stateless_init(struct isal_zstream *stream)
return;
}
uint32_t crc32_gzip_base(uint32_t crc, uint8_t * start, uint32_t length)
{
uint8_t *end = start + length;
crc = ~crc;
while (start < end)
crc = (crc >> 8) ^ CrcTable[(crc & 0x000000FF) ^ *start++];
return ~crc;
}
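
crc32_gzip_base above is the standard byte-at-a-time, reflected table-driven CRC-32. For reference, a table like CrcTable[] can be generated as follows (a sketch, assuming the usual bit-reversed gzip/zlib polynomial 0xEDB88320; the library's actual table is precomputed elsewhere):

#include <stdint.h>

/* Build the 256-entry reflected CRC-32 table consumed by the
 * byte-at-a-time loop above. */
static void build_crc32_table(uint32_t table[256])
{
	for (uint32_t n = 0; n < 256; n++) {
		uint32_t c = n;
		for (int k = 0; k < 8; k++)
			c = (c & 1) ? 0xEDB88320 ^ (c >> 1) : (c >> 1);
		table[n] = c;
	}
}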
int isal_deflate_stateless(struct isal_zstream *stream)
{
uint8_t *next_in = stream->next_in;
@ -634,74 +707,96 @@ int isal_deflate_stateless(struct isal_zstream *stream)
int isal_deflate(struct isal_zstream *stream)
{
struct isal_zstate *state = &stream->internal_state;
uint32_t size;
int ret = COMP_OK;
uint8_t *next_in;
uint32_t avail_in, avail_in_start;
uint32_t flush_type = stream->flush;
uint32_t end_of_stream = stream->end_of_stream;
int size = 0;
uint8_t *copy_down_src = NULL;
uint64_t copy_down_size = 0;
uint32_t processed = 0;
if (stream->flush < 3) {
if (stream->flush >= 3)
return INVALID_FLUSH;
state->last_flush = stream->flush;
next_in = stream->next_in;
avail_in = stream->avail_in;
stream->total_in -= state->b_bytes_valid - state->b_bytes_processed;
if (state->state >= ZSTATE_TMP_OFFSET) {
size = state->tmp_out_end - state->tmp_out_start;
if (size > stream->avail_out)
size = stream->avail_out;
memcpy(stream->next_out, state->tmp_out_buff + state->tmp_out_start,
size);
stream->next_out += size;
stream->avail_out -= size;
stream->total_out += size;
state->tmp_out_start += size;
if (state->tmp_out_start == state->tmp_out_end)
state->state -= ZSTATE_TMP_OFFSET;
if (stream->avail_out == 0 || state->state == ZSTATE_END)
return ret;
while (processed < IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD) {
size = avail_in;
if (size > sizeof(state->buffer) - state->b_bytes_valid) {
size = sizeof(state->buffer) - state->b_bytes_valid;
stream->flush = NO_FLUSH;
stream->end_of_stream = 0;
}
assert(state->tmp_out_start == state->tmp_out_end);
memcpy(&state->buffer[state->b_bytes_valid], next_in, size);
isal_deflate_int(stream);
next_in += size;
avail_in -= size;
state->b_bytes_valid += size;
if (stream->avail_out == 0)
return ret;
stream->next_in = &state->buffer[state->b_bytes_processed];
stream->avail_in = state->b_bytes_valid - state->b_bytes_processed;
state->file_start = stream->next_in - stream->total_in;
else if (stream->avail_out < 8) {
uint8_t *next_out;
uint32_t avail_out;
uint32_t total_out;
if (stream->avail_in > IGZIP_HIST_SIZE
|| stream->end_of_stream || stream->flush != NO_FLUSH) {
avail_in_start = stream->avail_in;
isal_deflate_int(stream);
state->b_bytes_processed += avail_in_start - stream->avail_in;
next_out = stream->next_out;
avail_out = stream->avail_out;
total_out = stream->total_out;
if (state->b_bytes_processed > IGZIP_HIST_SIZE) {
copy_down_src =
&state->buffer[state->b_bytes_processed - IGZIP_HIST_SIZE];
copy_down_size =
state->b_bytes_valid - state->b_bytes_processed +
IGZIP_HIST_SIZE;
memmove(state->buffer, copy_down_src, copy_down_size);
stream->next_out = state->tmp_out_buff;
stream->avail_out = sizeof(state->tmp_out_buff);
stream->total_out = 0;
state->b_bytes_valid -= copy_down_src - state->buffer;
state->b_bytes_processed -= copy_down_src - state->buffer;
}
}
stream->flush = flush_type;
stream->end_of_stream = end_of_stream;
if (avail_in <= 0 || stream->avail_out <= 0)
break;
processed += size;
}
if (processed >= IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD) {
stream->next_in = next_in - stream->avail_in;
stream->avail_in = avail_in + stream->avail_in;
state->file_start = stream->next_in - stream->total_in;
if (stream->avail_in > 0 && stream->avail_out > 0)
isal_deflate_int(stream);
state->tmp_out_start = 0;
state->tmp_out_end = stream->total_out;
size = stream->avail_in;
if (stream->avail_in > IGZIP_HIST_SIZE)
size = 0;
stream->next_out = next_out;
stream->avail_out = avail_out;
stream->total_out = total_out;
if (state->tmp_out_end) {
size = state->tmp_out_end;
if (size > stream->avail_out)
size = stream->avail_out;
memcpy(stream->next_out, state->tmp_out_buff, size);
stream->next_out += size;
stream->avail_out -= size;
stream->total_out += size;
state->tmp_out_start += size;
if (state->tmp_out_start != state->tmp_out_end)
state->state += ZSTATE_TMP_OFFSET;
memmove(state->buffer, stream->next_in - IGZIP_HIST_SIZE,
size + IGZIP_HIST_SIZE);
state->b_bytes_processed = IGZIP_HIST_SIZE;
state->b_bytes_valid = size + IGZIP_HIST_SIZE;
}
}
} else
ret = INVALID_FLUSH;
stream->next_in += size;
stream->avail_in -= size;
stream->total_in += size;
} else {
stream->total_in += state->b_bytes_valid - state->b_bytes_processed;
stream->next_in = next_in;
stream->avail_in = avail_in;
state->file_start = stream->next_in - stream->total_in;
}
return ret;
}
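
Taken together, the reworked isal_deflate keeps at most IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD bytes buffered internally and can be fed arbitrarily large inputs. A hedged usage sketch of the stateful API (header name and error handling assumed, not taken from this diff):

#include <stdint.h>
#include "igzip_lib.h"	/* assumed public header declaring isal_zstream */

/* One-shot compression with the stateful API; returns 0 on success.
 * Large inputs are fine: isal_deflate stages input through its
 * internal history buffer as needed. */
int deflate_buffer(uint8_t *in, uint32_t in_len, uint8_t *out, uint32_t out_len)
{
	struct isal_zstream stream;

	isal_deflate_init(&stream);
	stream.flush = NO_FLUSH;
	stream.end_of_stream = 1;	/* this is the final input slice */
	stream.next_in = in;
	stream.avail_in = in_len;
	stream.next_out = out;
	stream.avail_out = out_len;

	while (stream.internal_state.state != ZSTATE_END) {
		if (isal_deflate(&stream) != COMP_OK)
			return -1;
		if (stream.avail_out == 0)
			return -1;	/* output buffer too small */
	}
	return 0;	/* stream.total_out bytes written */
}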
@ -912,11 +1007,6 @@ void write_header(struct isal_zstream *stream)
}
uint32_t get_crc_01(uint32_t * crc)
{
return crc_512to32_01(crc);
}
void write_trailer(struct isal_zstream *stream)
{
struct isal_zstate *state = &stream->internal_state;
@ -950,11 +1040,11 @@ void write_trailer(struct isal_zstream *stream)
bytes = buffer_used(&state->bitbuf);
#ifndef DEFLATE
uint32_t *crc = state->crc;
uint32_t crc = state->crc;
if (!is_full(&state->bitbuf)) {
*(uint64_t *) stream->next_out =
((uint64_t) file_size(state) << 32) | get_crc(crc);
((uint64_t) stream->total_in << 32) | crc;
stream->next_out += 8;
bytes += 8;
state->state = ZSTATE_END;

View File

@ -6,52 +6,15 @@
extern const struct isal_hufftables hufftables_default;
void isal_deflate_init_base(struct isal_zstream *stream)
static inline void update_state(struct isal_zstream *stream, uint8_t * start_in,
uint8_t * next_in, uint8_t * end_in)
{
struct isal_zstate *state = &stream->internal_state;
int i;
uint32_t *crc = state->crc;
stream->total_in = 0;
stream->total_out = 0;
stream->hufftables = (struct isal_hufftables *)&hufftables_default;
stream->flush = 0;
state->b_bytes_valid = 0;
state->b_bytes_processed = 0;
state->has_eob = 0;
state->has_eob_hdr = 0;
state->left_over = 0;
state->last_flush = 0;
state->has_gzip_hdr = 0;
state->state = ZSTATE_NEW_HDR;
state->count = 0;
state->tmp_out_start = 0;
state->tmp_out_end = 0;
state->file_start = state->buffer;
init(&state->bitbuf);
*crc = ~0;
for (i = 0; i < IGZIP_HASH_SIZE; i++)
state->head[i] = (uint16_t) - (IGZIP_HIST_SIZE + 1);
return;
}
uint32_t get_crc_base(uint32_t * crc)
{
return ~*crc;
}
static inline void update_state(struct isal_zstream *stream, struct isal_zstate *state,
uint8_t * start_in)
{
uint32_t bytes_written;
stream->total_in += stream->next_in - start_in;
stream->next_in = next_in;
stream->total_in += next_in - start_in;
stream->avail_in = end_in - next_in;
bytes_written = buffer_used(&state->bitbuf);
stream->total_out += bytes_written;
@ -65,163 +28,27 @@ void isal_deflate_body_base(struct isal_zstream *stream)
uint32_t literal, hash;
uint8_t *start_in, *next_in, *end_in, *end, *next_hash;
uint16_t match_length;
uint32_t dist, bytes_to_buffer, offset;
uint64_t code, code_len, code2, code_len2;
struct isal_zstate *state = &stream->internal_state;
uint16_t *last_seen = state->head;
uint32_t *crc = state->crc;
if (stream->avail_in == 0) {
if (stream->end_of_stream || stream->flush != NO_FLUSH)
state->state = ZSTATE_FLUSH_READ_BUFFER;
return;
}
set_buf(&state->bitbuf, stream->next_out, stream->avail_out);
start_in = stream->next_in;
while (stream->avail_in != 0) {
bytes_to_buffer =
IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD - (state->b_bytes_valid -
state->b_bytes_processed);
if (bytes_to_buffer > IGZIP_HIST_SIZE)
bytes_to_buffer = IGZIP_HIST_SIZE;
if (stream->avail_in < IGZIP_HIST_SIZE)
bytes_to_buffer = stream->avail_in;
if (bytes_to_buffer > sizeof(state->buffer) - state->b_bytes_valid) {
if (state->b_bytes_valid - state->b_bytes_processed > ISAL_LOOK_AHEAD) {
/* There was an out buffer overflow last round,
* complete the processing of data */
bytes_to_buffer = 0;
} else {
/* Not enough room in the buffer, shift the
* buffer down to make space for the new data */
offset = state->b_bytes_processed - IGZIP_HIST_SIZE; // state->b_bytes_valid - (IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD);
memmove(state->buffer, state->buffer + offset,
IGZIP_HIST_SIZE + ISAL_LOOK_AHEAD);
state->b_bytes_processed -= offset;
state->b_bytes_valid -= offset;
state->file_start -= offset;
stream->avail_in -= bytes_to_buffer;
memcpy(state->buffer + state->b_bytes_valid, stream->next_in,
bytes_to_buffer);
update_crc(crc, stream->next_in, bytes_to_buffer);
stream->next_in += bytes_to_buffer;
}
} else {
/* There is enough space in the buffer, copy in the new data */
stream->avail_in -= bytes_to_buffer;
memcpy(state->buffer + state->b_bytes_valid, stream->next_in,
bytes_to_buffer);
update_crc(crc, stream->next_in, bytes_to_buffer);
stream->next_in += bytes_to_buffer;
}
state->b_bytes_valid += bytes_to_buffer;
end_in = state->buffer + state->b_bytes_valid - ISAL_LOOK_AHEAD;
next_in = state->b_bytes_processed + state->buffer;
while (next_in < end_in) {
if (is_full(&state->bitbuf)) {
state->b_bytes_processed = next_in - state->buffer;
update_state(stream, state, start_in);
return;
}
literal = *(uint32_t *) next_in;
hash = compute_hash(literal) & HASH_MASK;
dist = (next_in - state->file_start - last_seen[hash]) & 0xFFFF;
last_seen[hash] = (uint64_t) (next_in - state->file_start);
if (dist - 1 < IGZIP_HIST_SIZE - 1) { /* The -1 are to handle the case when dist = 0 */
assert(next_in - dist >= state->buffer);
assert(dist != 0);
match_length = compare258(next_in - dist, next_in, 258);
if (match_length >= SHORTEST_MATCH) {
next_hash = next_in;
#ifdef ISAL_LIMIT_HASH_UPDATE
end = next_hash + 3;
#else
end = next_hash + match_length;
#endif
next_hash++;
for (; next_hash < end; next_hash++) {
literal = *(uint32_t *) next_hash;
hash = compute_hash(literal) & HASH_MASK;
last_seen[hash] =
(uint64_t) (next_hash - state->file_start);
}
get_len_code(stream->hufftables, match_length, &code,
&code_len);
get_dist_code(stream->hufftables, dist, &code2,
&code_len2);
code |= code2 << code_len;
code_len += code_len2;
write_bits(&state->bitbuf, code, code_len);
next_in += match_length;
continue;
}
}
get_lit_code(stream->hufftables, literal & 0xFF, &code, &code_len);
write_bits(&state->bitbuf, code, code_len);
next_in++;
}
state->b_bytes_processed = next_in - state->buffer;
}
update_state(stream, state, start_in);
if (stream->avail_in == 0) {
if (stream->end_of_stream || stream->flush != NO_FLUSH)
state->state = ZSTATE_FLUSH_READ_BUFFER;
return;
}
return;
}
void isal_deflate_finish_base(struct isal_zstream *stream)
{
uint32_t literal = 0, hash;
uint8_t *next_in, *end_in, *end, *next_hash;
uint16_t match_length;
uint32_t dist;
uint64_t code, code_len, code2, code_len2;
struct isal_zstate *state = &stream->internal_state;
uint16_t *last_seen = state->head;
if (stream->avail_in == 0) {
if (stream->end_of_stream || stream->flush != NO_FLUSH)
state->state = ZSTATE_FLUSH_READ_BUFFER;
return;
}
set_buf(&state->bitbuf, stream->next_out, stream->avail_out);
end_in = state->b_bytes_valid + (uint8_t *) state->buffer;
start_in = stream->next_in;
end_in = start_in + stream->avail_in;
next_in = start_in;
next_in = state->b_bytes_processed + state->buffer;
while (next_in < end_in) {
while (next_in < end_in - ISAL_LOOK_AHEAD) {
if (is_full(&state->bitbuf)) {
state->b_bytes_processed = next_in - state->buffer;
update_state(stream, state, stream->next_in);
update_state(stream, start_in, next_in, end_in);
return;
}
@ -230,9 +57,11 @@ void isal_deflate_finish_base(struct isal_zstream *stream)
dist = (next_in - state->file_start - last_seen[hash]) & 0xFFFF;
last_seen[hash] = (uint64_t) (next_in - state->file_start);
if (dist - 1 < IGZIP_HIST_SIZE - 1) { /* The -1 are to handle the case when dist = 0 */
assert(next_in - dist >= state->buffer);
match_length = compare258(next_in - dist, next_in, end_in - next_in);
/* The -1 are to handle the case when dist = 0 */
if (dist - 1 < IGZIP_HIST_SIZE - 1) {
assert(dist != 0);
match_length = compare258(next_in - dist, next_in, 258);
if (match_length >= SHORTEST_MATCH) {
next_hash = next_in;
@ -265,29 +94,113 @@ void isal_deflate_finish_base(struct isal_zstream *stream)
}
}
get_lit_code(stream->hufftables, literal & 0xFF, &code, &code_len);
write_bits(&state->bitbuf, code, code_len);
next_in++;
}
update_state(stream, start_in, next_in, end_in);
assert(stream->avail_in <= ISAL_LOOK_AHEAD);
if (stream->end_of_stream || stream->flush != NO_FLUSH)
state->state = ZSTATE_FLUSH_READ_BUFFER;
return;
}
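
The range check "if (dist - 1 < IGZIP_HIST_SIZE - 1)" used in the loops above leans on unsigned wraparound to fold the dist == 0 case (a hash bucket pointing at the current position) into the window check, as the "/* The -1 are to handle the case when dist = 0 */" comments note. A minimal demonstration (history size assumed for illustration):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint32_t hist_size = 32 * 1024;	/* assumed IGZIP_HIST_SIZE */
	uint32_t dist;

	dist = 0;				/* stale/self hash entry */
	assert(!(dist - 1 < hist_size - 1));	/* 0 - 1 wraps to UINT32_MAX: rejected */

	dist = 1;
	assert(dist - 1 < hist_size - 1);	/* shortest valid back-reference */

	dist = hist_size;
	assert(!(dist - 1 < hist_size - 1));	/* outside the window: rejected */
	return 0;
}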
void isal_deflate_finish_base(struct isal_zstream *stream)
{
uint32_t literal = 0, hash;
uint8_t *start_in, *next_in, *end_in, *end, *next_hash;
uint16_t match_length;
uint32_t dist;
uint64_t code, code_len, code2, code_len2;
struct isal_zstate *state = &stream->internal_state;
uint16_t *last_seen = state->head;
set_buf(&state->bitbuf, stream->next_out, stream->avail_out);
start_in = stream->next_in;
end_in = start_in + stream->avail_in;
next_in = start_in;
while (next_in < end_in - 3) {
if (is_full(&state->bitbuf)) {
update_state(stream, start_in, next_in, end_in);
return;
}
literal = *(uint32_t *) next_in;
hash = compute_hash(literal) & HASH_MASK;
dist = (next_in - state->file_start - last_seen[hash]) & 0xFFFF;
last_seen[hash] = (uint64_t) (next_in - state->file_start);
if (dist - 1 < IGZIP_HIST_SIZE - 1) { /* The -1 are to handle the case when dist = 0 */
match_length = compare258(next_in - dist, next_in, end_in - next_in);
if (match_length >= SHORTEST_MATCH) {
next_hash = next_in;
#ifdef ISAL_LIMIT_HASH_UPDATE
end = next_hash + 3;
#else
end = next_hash + match_length;
#endif
next_hash++;
for (; next_hash < end - 3; next_hash++) {
literal = *(uint32_t *) next_hash;
hash = compute_hash(literal) & HASH_MASK;
last_seen[hash] =
(uint64_t) (next_hash - state->file_start);
}
get_len_code(stream->hufftables, match_length, &code,
&code_len);
get_dist_code(stream->hufftables, dist, &code2, &code_len2);
code |= code2 << code_len;
code_len += code_len2;
write_bits(&state->bitbuf, code, code_len);
next_in += match_length;
continue;
}
}
get_lit_code(stream->hufftables, literal & 0xFF, &code, &code_len);
write_bits(&state->bitbuf, code, code_len);
next_in++;
}
state->b_bytes_processed = next_in - state->buffer;
while (next_in < end_in) {
if (is_full(&state->bitbuf)) {
update_state(stream, start_in, next_in, end_in);
return;
}
literal = *next_in;
get_lit_code(stream->hufftables, literal & 0xFF, &code, &code_len);
write_bits(&state->bitbuf, code, code_len);
next_in++;
if (is_full(&state->bitbuf) || state->left_over > 0) {
update_state(stream, state, stream->next_in);
return;
}
get_lit_code(stream->hufftables, 256, &code, &code_len);
write_bits(&state->bitbuf, code, code_len);
state->has_eob = 1;
if (!is_full(&state->bitbuf)) {
get_lit_code(stream->hufftables, 256, &code, &code_len);
write_bits(&state->bitbuf, code, code_len);
state->has_eob = 1;
update_state(stream, state, stream->next_in);
if (stream->end_of_stream == 1)
state->state = ZSTATE_TRL;
else
state->state = ZSTATE_SYNC_FLUSH;
}
if (stream->end_of_stream == 1)
state->state = ZSTATE_TRL;
else
state->state = ZSTATE_SYNC_FLUSH;
update_state(stream, start_in, next_in, end_in);
return;
}

View File

@ -28,26 +28,16 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "options.asm"
%ifndef TEST
extern fold_4
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "bitbuf2.asm"
%include "huffman.asm"
%include "igzip_compare_types.asm"
%include "reg_sizes.asm"
%include "stdmac.asm"
%if (ARCH == 04)
%define MOVDQA vmovdqa
%else
%define MOVDQA movdqa
%endif
%ifdef DEBUG
%macro MARK 1
global %1
@ -61,99 +51,74 @@ global %1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define tmp2 rcx
%define hash2 rcx
%define b_bytes_valid rax
%define curr_data rax
%define code rax
%define tmp5 rax
%define tmp2 rcx
%define hash2 rcx
%define tmp4 rbx
%define dist rbx
%define code2 rbx
%define curr_data rax
%define code rax
%define tmp5 rax
%define x rdx
%define len rdx
%define hash rdx
%define code_len3 rdx
%define tmp8 rdx
%define tmp4 rbx
%define dist rbx
%define code2 rbx
%define tmp1 rsi
%define code_len2 rsi
%define hash rdx
%define len rdx
%define code_len3 rdx
%define tmp8 rdx
%define blen rdi
%define file_start rdi
%define tmp1 rsi
%define code_len2 rsi
%define m_bit_count rbp
%define file_start rdi
%define in_buf r8
%define curr_data2 r8
%define len2 r8
%define tmp6 r8
%define m_bit_count rbp
%define m_bits r9
%define curr_data2 r8
%define len2 r8
%define tmp6 r8
%define f_i r10
%define m_bits r9
%define m_out_buf r11
%define f_i r10
%define f_end_i r12
%define dist2 r12
%define tmp7 r12
%define code4 r12
%define m_out_buf r11
%define tmp3 r13
%define code3 r13
%define f_end_i r12
%define dist2 r12
%define tmp7 r12
%define code4 r12
%define stream r14
%define tmp3 r13
%define code3 r13
%define hufftables r15
%define stream r14
%define crc_0 xmm0 ; in/out: crc state
%define crc_1 xmm1 ; in/out: crc state
%define crc_2 xmm2 ; in/out: crc state
%define crc_3 xmm3 ; in/out: crc state
%define crc_fold xmm4 ; in: (loaded from fold_4)
%define hufftables r15
%define xtmp0 xmm5 ; tmp
%define xtmp1 xmm6 ; tmp
%define xtmp2 xmm7 ; tmp
%define xtmp3 xmm8 ; tmp
%define xtmp4 xmm9 ; tmp
%define xhash xmm10
%define xmask xmm11
%define xdata xmm12
;; GPR r8 & r15 can be used
%define xtmp0 xmm0 ; tmp
%define xtmp1 xmm1 ; tmp
%define xhash xmm2
%define xmask xmm3
%define xdata xmm4
%define ytmp0 ymm0 ; tmp
%define ytmp1 ymm1 ; tmp
%define ytmp0 ymm5 ; tmp
%define ytmp1 ymm6 ; tmp
%if ( ARCH == 02 || ARCH == 04)
%define vtmp0 ymm5 ; tmp
%define vtmp1 ymm6 ; tmp
%define vtmp2 ymm7 ; tmp
%define vtmp3 ymm8 ; tmp
%define vtmp4 ymm9 ; tmp
%else
%define vtmp0 xmm5 ; tmp
%define vtmp1 xmm6 ; tmp
%define vtmp2 xmm7 ; tmp
%define vtmp3 xmm8 ; tmp
%define vtmp4 xmm9 ; tmp
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define b_bytes_processed f_i
blen_mem_offset equ 0 ; local variable (8 bytes)
in_buf_mem_offset equ 8
f_end_i_mem_offset equ 16
empty_buffer_flag equ 24
gpr_save_mem_offset equ 32 ; gpr save area (8*8 bytes)
xmm_save_mem_offset equ 32 + 8*8 ; xmm save area (8*16 bytes) (16 byte aligned)
stack_size equ 4*8 + 8*8 + 8*16 + 8
f_end_i_mem_offset equ 8
gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
stack_size equ 2*8 + 8*8 + 4*16 + 8
;;; 8 because stack address is odd multiple of 8 after a function call and
;;; we want it aligned to 16 bytes
@ -197,173 +162,74 @@ skip1:
mov [rsp + gpr_save_mem_offset + 5*8], r13
mov [rsp + gpr_save_mem_offset + 6*8], r14
mov [rsp + gpr_save_mem_offset + 7*8], r15
MOVDQA [rsp + xmm_save_mem_offset + 0*16], xmm6
MOVDQA [rsp + xmm_save_mem_offset + 1*16], xmm7
MOVDQA [rsp + xmm_save_mem_offset + 2*16], xmm8
MOVDQA [rsp + xmm_save_mem_offset + 3*16], xmm9
MOVDQA [rsp + xmm_save_mem_offset + 4*16], xmm10
MOVDQA [rsp + xmm_save_mem_offset + 5*16], xmm11
MOVDQA [rsp + xmm_save_mem_offset + 6*16], xmm12
mov stream, rcx
MOVDQU xmask, [mask]
MOVDQA crc_0, [stream + _internal_state_crc + 0*16]
MOVDQA crc_1, [stream + _internal_state_crc + 1*16]
MOVDQA crc_2, [stream + _internal_state_crc + 2*16]
MOVDQA crc_3, [stream + _internal_state_crc + 3*16]
MOVDQA crc_fold, [fold_4]
mov dword [stream + _internal_state_has_eob], 0
MOVDQU xmask, [mask]
; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
mov m_out_buf, [stream + _next_out]
mov [stream + _internal_state_bitbuf_m_out_start], m_out_buf
mov tmp1 %+ d, [stream + _avail_out]
add tmp1, m_out_buf
sub tmp1, SLOP
skip_SLOP:
mov [stream + _internal_state_bitbuf_m_out_end], tmp1
mov m_bits, [stream + _internal_state_bitbuf_m_bits]
mov m_bit_count %+ d, [stream + _internal_state_bitbuf_m_bit_count]
mov hufftables, [stream + _hufftables]
; in_buf = stream->next_in
mov in_buf, [stream + _next_in]
mov blen %+ d, [stream + _avail_in]
mov dword [rsp + empty_buffer_flag], 0
cmp dword [stream + _internal_state_b_bytes_processed], 0
sete byte [rsp + empty_buffer_flag]
mov file_start, [stream + _next_in]
; while (blen != 0)
MARK __Compute_X_ %+ ARCH
loop1:
; x = D + LA - (state->b_bytes_valid - state->b_bytes_processed);
mov b_bytes_valid %+ d, [stream + _internal_state_b_bytes_valid]
mov b_bytes_processed %+ d, [stream + _internal_state_b_bytes_processed]
lea x, [b_bytes_processed + D + LA]
sub x, b_bytes_valid
mov f_i %+ d, dword [stream + _total_in]
sub file_start, f_i
; if (x > D) x = D;
cmp x, D
cmova x, [const_D]
mov f_end_i %+ d, [stream + _avail_in]
add f_end_i, f_i
; if (blen < D) x = blen;
cmp blen, D
cmovb x, blen
;; process x bytes starting at in_buf
;; If there isn't enough room, shift buffer down
; if (x > BSIZE - state->b_bytes_valid) {
mov tmp1, BSIZE
sub tmp1, b_bytes_valid
cmp x, tmp1
jbe skip_move
; if (state->b_bytes_processed < state->b_bytes_valid - LA) {
mov tmp1, b_bytes_valid
sub tmp1, LA
cmp b_bytes_processed, tmp1
jae do_move
;; We need to move an odd amount, skip move for this copy of loop
xor x,x
mov [rsp + blen_mem_offset], blen
jmp skip_move_zero
MARK __shift_data_down_ %+ ARCH
do_move:
; offset = state->b_bytes_valid - (D + LA);
mov tmp4, b_bytes_valid
sub tmp4, D + LA
; copy_D_LA(state->buffer, state->buffer + offset);
lea tmp1, [stream + _internal_state_buffer]
lea tmp2, [tmp1 + tmp4]
copy_D_LA tmp1, tmp2, tmp3, vtmp0, vtmp1, vtmp2, vtmp3
; tmp1 clobbered
; state->file_start -= offset;
sub [stream + _internal_state_file_start], tmp4
; state->b_bytes_processed -= offset;
sub b_bytes_processed, tmp4
mov b_bytes_valid, D + LA
MARK __copy_in_ %+ ARCH
skip_move:
sub blen, x
mov [rsp + blen_mem_offset], blen
; copy_in(state->buffer + state->b_bytes_valid, in_buf, x);
lea tmp1, [stream + _internal_state_buffer + b_bytes_valid]
mov tmp2, in_buf
mov tmp3, x
COPY_IN_CRC tmp1, tmp2, tmp3, tmp4, crc_0, crc_1, crc_2, crc_3, crc_fold, \
xtmp0, xtmp1, xtmp2, xtmp3, xtmp4
; in_buf += x;
add in_buf, x
MARK __prepare_loop_ %+ ARCH
skip_move_zero:
mov [rsp + in_buf_mem_offset], in_buf
; state->b_bytes_valid += x;
add b_bytes_valid, x
mov [stream + _internal_state_b_bytes_valid], b_bytes_valid %+ d
; f_end_i = state->b_bytes_valid - LA;
%ifnidn f_end_i, b_bytes_valid
mov f_end_i, b_bytes_valid
%endif
; f_end_i -= LA;
sub f_end_i, LA
; if (f_end_i <= 0) continue;
cmp f_end_i, 0
jle continue_while
; f_start_i = state->b_bytes_processed;
;; f_i and b_bytes_processed are same register, just store b_bytes_proc
mov [stream + _internal_state_b_bytes_processed], b_bytes_processed %+ d
; f_start_i += (uint32_t)(state->buffer - state->file_start);
mov file_start, [stream + _internal_state_file_start]
lea tmp1, [stream + _internal_state_buffer]
sub tmp1, file_start
add f_i, tmp1
add f_end_i, tmp1
mov [rsp + f_end_i_mem_offset], f_end_i
; if (f_end_i <= 0) continue;
cmp f_end_i, f_i
jle input_end
; for (f_i = f_start_i; f_i < f_end_i; f_i++) {
cmp f_i, f_end_i
jge end_loop_2
MARK __misc_compute_hash_lookup_ %+ ARCH
MOVDQU xdata, [file_start + f_i]
MARK __body_compute_hash_ %+ ARCH
mov curr_data, [file_start + f_i]
mov tmp3, curr_data
mov tmp6, curr_data
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
ja output_end
;; Encode first byte in the stream as a literal
compute_hash hash, curr_data
and hash %+ d, HASH_MASK
mov [stream + _internal_state_head + 2 * hash], f_i %+ w
and curr_data, 0xff
get_lit_code curr_data, code2, code_len2, hufftables
shr tmp3, 8
compute_hash hash2, tmp3
mov tmp3, [file_start + f_i + 1]
mov tmp6, tmp3
compute_hash hash, tmp3
and hash, HASH_MASK
and hash2, HASH_MASK
shr tmp6, 8
compute_hash hash2, tmp6
cmp dword [rsp + empty_buffer_flag], 0
jne write_first_byte
MOVD xhash, hash %+ d
PINSRD xhash, hash2 %+ d, 1
PAND xhash, xhash, xmask
jmp loop2
jmp write_lit_bits
align 16
loop2:
; if (state->bitbuf.is_full()) {
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
ja bitbuf_full
ja output_end
xor dist, dist
xor dist2, dist2
@ -398,7 +264,7 @@ loop2:
and dist2 %+ d, (D-1)
neg dist2
MARK __compare_ %+ ARCH
MARK __body_compare_ %+ ARCH
;; Check for long len/dist match (>7) with first literal
MOVQ len, xdata
mov curr_data, len
@ -436,7 +302,7 @@ MARK __compare_ %+ ARCH
test len2 %+ d, 0xFFFFFFFF
jnz write_lit_bits
MARK __len_dist_lit_huffman_ %+ ARCH
MARK __body_len_dist_lit_huffman_ %+ ARCH
len_dist_lit_huffman_pre:
mov code_len3, rcx
bsf len2, len2
@ -494,17 +360,17 @@ len_dist_lit_huffman:
shr curr_data2, 8
compute_hash hash2, curr_data2
%ifdef NO_LIMIT_HASH_UPDATE
%ifdef NO_LIMIT_HASH_UPDATE
loop3:
add tmp3,1
add tmp3,1
cmp tmp3, f_i
jae loop3_done
mov tmp6, [file_start + tmp3]
compute_hash tmp4, tmp6
and tmp4 %+ d, HASH_MASK
mov tmp6, [file_start + tmp3]
compute_hash tmp4, tmp6
and tmp4 %+ d, HASH_MASK
; state->head[hash] = k;
mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
jmp loop3
mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
jmp loop3
loop3_done:
%endif
; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
@ -514,13 +380,14 @@ loop3_done:
; continue
cmp f_i, f_end_i
jl loop2
jmp end_loop_2
jmp input_end
;; encode as dist/len
MARK __len_dist_huffman_ %+ ARCH
MARK __body_len_dist_huffman_ %+ ARCH
len_dist_huffman_pre:
bsf len, len
shr len, 3
len_dist_huffman:
dec f_i
neg dist
@ -563,29 +430,29 @@ len_dist_huffman:
shr curr_data2, 8
compute_hash hash2, curr_data2
%ifdef NO_LIMIT_HASH_UPDATE
%ifdef NO_LIMIT_HASH_UPDATE
loop4:
add tmp3,1
add tmp3,1
cmp tmp3, f_i
jae loop4_done
mov tmp6, [file_start + tmp3]
compute_hash tmp4, tmp6
and tmp4, HASH_MASK
mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
jmp loop4
mov tmp6, [file_start + tmp3]
compute_hash tmp4, tmp6
and tmp4, HASH_MASK
mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
jmp loop4
loop4_done:
%endif
and hash, HASH_MASK
and hash2, HASH_MASK
; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
and hash %+ d, HASH_MASK
and hash2 %+ d, HASH_MASK
; continue
cmp f_i, f_end_i
jl loop2
jmp end_loop_2
jmp input_end
MARK __write_lit_bits_ %+ ARCH
MARK __body_write_lit_bits_ %+ ARCH
write_lit_bits:
MOVDQU xdata, [file_start + f_i + 1]
mov f_end_i, [rsp + f_end_i_mem_offset]
@ -602,39 +469,7 @@ write_lit_bits:
cmp f_i, f_end_i
jl loop2
MARK __end_loops_ %+ ARCH
end_loop_2:
; state->b_bytes_processed = f_i - (state->buffer - state->file_start);
add f_i, [stream + _internal_state_file_start]
sub f_i, stream
sub f_i, _internal_state_buffer
mov [stream + _internal_state_b_bytes_processed], f_i %+ d
; continue
continue_while:
mov blen, [rsp + blen_mem_offset]
mov in_buf, [rsp + in_buf_mem_offset]
cmp blen, 0
jnz loop1
end:
;; update input buffer
; stream->total_in += (uint32_t)(in_buf - stream->next_in); // bytes copied
mov tmp1 %+ d, [stream + _total_in]
mov in_buf, [rsp + in_buf_mem_offset]
add tmp1, in_buf
sub tmp1, [stream + _next_in]
mov [stream + _total_in], tmp1 %+ d
mov [stream + _next_in], in_buf
mov [stream + _avail_in], blen %+ d
cmp blen, 0
jne skip2
;; Set stream's next state
input_end:
mov tmp1, ZSTATE_FLUSH_READ_BUFFER
mov tmp5, ZSTATE_BODY
cmp dword [stream + _end_of_stream], 0
@ -642,9 +477,18 @@ end:
cmp dword [stream + _flush], _NO_FLUSH
cmovne tmp5, tmp1
mov dword [stream + _internal_state_state], tmp5 %+ d
skip2:
output_end:
;; update input buffer
add f_end_i, LA
mov [stream + _total_in], f_i %+ d
add file_start, f_i
mov [stream + _next_in], file_start
sub f_end_i, f_i
mov [stream + _avail_in], f_end_i %+ d
;; update output buffer
mov [stream + _next_out], m_out_buf
; offset = state->bitbuf.buffer_used();
sub m_out_buf, [stream + _internal_state_bitbuf_m_out_start]
sub [stream + _avail_out], m_out_buf %+ d
add [stream + _total_out], m_out_buf %+ d
@ -652,12 +496,6 @@ skip2:
mov [stream + _internal_state_bitbuf_m_bits], m_bits
mov [stream + _internal_state_bitbuf_m_bit_count], m_bit_count %+ d
MOVDQA [stream + _internal_state_crc + 0*16], crc_0
MOVDQA [stream + _internal_state_crc + 1*16], crc_1
MOVDQA [stream + _internal_state_crc + 2*16], crc_2
MOVDQA [stream + _internal_state_crc + 3*16], crc_3
mov rbx, [rsp + gpr_save_mem_offset + 0*8]
mov rsi, [rsp + gpr_save_mem_offset + 1*8]
mov rdi, [rsp + gpr_save_mem_offset + 2*8]
@ -666,13 +504,6 @@ skip2:
mov r13, [rsp + gpr_save_mem_offset + 5*8]
mov r14, [rsp + gpr_save_mem_offset + 6*8]
mov r15, [rsp + gpr_save_mem_offset + 7*8]
MOVDQA xmm6, [rsp + xmm_save_mem_offset + 0*16]
MOVDQA xmm7, [rsp + xmm_save_mem_offset + 1*16]
MOVDQA xmm8, [rsp + xmm_save_mem_offset + 2*16]
MOVDQA xmm9, [rsp + xmm_save_mem_offset + 3*16]
MOVDQA xmm10, [rsp + xmm_save_mem_offset + 4*16]
MOVDQA xmm11, [rsp + xmm_save_mem_offset + 5*16]
MOVDQA xmm12, [rsp + xmm_save_mem_offset + 6*16]
%ifndef ALIGN_STACK
add rsp, stack_size
@ -682,17 +513,7 @@ skip2:
%endif
ret
MARK __bitbuf_full_ %+ ARCH
bitbuf_full:
mov blen, [rsp + blen_mem_offset]
; state->b_bytes_processed = f_i - (state->buffer - state->file_start);
add f_i, [stream + _internal_state_file_start]
sub f_i, stream
sub f_i, _internal_state_buffer
mov [stream + _internal_state_b_bytes_processed], f_i %+ d
jmp end
MARK __compare_loops_ %+ ARCH
MARK __body_compare_loops_ %+ ARCH
compare_loop:
MOVD xhash, tmp6 %+ d
PINSRD xhash, tmp2 %+ d, 1
@ -711,8 +532,8 @@ compare_loop:
jmp len_dist_huffman
compare_loop2:
lea tmp2, [tmp1 + dist2]
add tmp1, 1
lea tmp2, [tmp1 + dist2]
add tmp1, 1
%if (COMPARE_TYPE == 1)
compare250 tmp1, tmp2, len2, tmp3
%elif (COMPARE_TYPE == 2)
@ -727,30 +548,7 @@ compare_loop2:
get_lit_code curr_data, code3, code_len3, hufftables
jmp len_dist_lit_huffman
MARK __write_first_byte_ %+ ARCH
write_first_byte:
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
ja bitbuf_full
mov dword [rsp + empty_buffer_flag], 0
mov [stream + _internal_state_head + 2 * hash], f_i %+ w
mov hash, hash2
shr tmp6, 16
compute_hash hash2, tmp6
MOVD xhash, hash %+ d
PINSRD xhash, hash2 %+ d, 1
PAND xhash, xhash, xmask
and curr_data, 0xff
get_lit_code curr_data, code2, code_len2, hufftables
jmp write_lit_bits
section .data
align 16
mask: dd HASH_MASK, HASH_MASK, HASH_MASK, HASH_MASK
const_D: dq D
%endif ;; ifndef TEST

View File

@ -4,5 +4,4 @@
%define COMPARE_TYPE 2
%endif
%include "igzip_buffer_utils_01.asm"
%include "igzip_body.asm"

View File

@ -4,5 +4,4 @@
%define COMPARE_TYPE 2
%endif
%include "igzip_buffer_utils_04.asm"
%include "igzip_body.asm"

View File

@ -5,5 +5,4 @@
%define COMPARE_TYPE 3
%endif
%include "igzip_buffer_utils_04.asm"
%include "igzip_body.asm"

View File

@ -1,543 +0,0 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifndef BUFFER_UTILS
%define BUFFER_UTILS
%include "options.asm"
extern pshufb_shf_table
extern mask3
%ifdef FIX_CACHE_READ
%define movntdqa movdqa
%else
%macro prefetchnta 1
%endm
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; code for doing the CRC calculation as part of copy-in, using pclmulqdq
; "shift" 4 input registers down 4 places
; macro FOLD4 xmm0, xmm1, xmm2, xmm3, const, tmp0, tmp1
%macro FOLD4 7
%define %%xmm0 %1 ; xmm reg, in/out
%define %%xmm1 %2 ; xmm reg, in/out
%define %%xmm2 %3 ; xmm reg, in/out
%define %%xmm3 %4 ; xmm reg, in/out
%define %%const %5 ; xmm reg, in
%define %%tmp0 %6 ; xmm reg, tmp
%define %%tmp1 %7 ; xmm reg, tmp
movaps %%tmp0, %%xmm0
movaps %%tmp1, %%xmm1
pclmulqdq %%xmm0, %%const, 0x01
pclmulqdq %%xmm1, %%const, 0x01
pclmulqdq %%tmp0, %%const, 0x10
pclmulqdq %%tmp1, %%const, 0x10
xorps %%xmm0, %%tmp0
xorps %%xmm1, %%tmp1
movaps %%tmp0, %%xmm2
movaps %%tmp1, %%xmm3
pclmulqdq %%xmm2, %%const, 0x01
pclmulqdq %%xmm3, %%const, 0x01
pclmulqdq %%tmp0, %%const, 0x10
pclmulqdq %%tmp1, %%const, 0x10
xorps %%xmm2, %%tmp0
xorps %%xmm3, %%tmp1
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; "shift" 3 input registers down 4 places
; macro FOLD3 x0, x1, x2, x3, const, tmp0
; x0 x1 x2 x3
; In A B C D
; Out D A' B' C'
%macro FOLD3 6
%define %%x0 %1 ; xmm reg, in/out
%define %%x1 %2 ; xmm reg, in/out
%define %%x2 %3 ; xmm reg, in/out
%define %%x3 %4 ; xmm reg, in/out
%define %%const %5 ; xmm reg, in
%define %%tmp0 %6 ; xmm reg, tmp
movdqa %%tmp0, %%x3
movaps %%x3, %%x2
pclmulqdq %%x2, %%const, 0x01
pclmulqdq %%x3, %%const, 0x10
xorps %%x3, %%x2
movaps %%x2, %%x1
pclmulqdq %%x1, %%const, 0x01
pclmulqdq %%x2, %%const, 0x10
xorps %%x2, %%x1
movaps %%x1, %%x0
pclmulqdq %%x0, %%const, 0x01
pclmulqdq %%x1, %%const, 0x10
xorps %%x1, %%x0
movdqa %%x0, %%tmp0
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; "shift" 2 input registers down 4 places
; macro FOLD2 x0, x1, x2, x3, const, tmp0
; x0 x1 x2 x3
; In A B C D
; Out C D A' B'
%macro FOLD2 6
%define %%x0 %1 ; xmm reg, in/out
%define %%x1 %2 ; xmm reg, in/out
%define %%x2 %3 ; xmm reg, in/out
%define %%x3 %4 ; xmm reg, in/out
%define %%const %5 ; xmm reg, in
%define %%tmp0 %6 ; xmm reg, tmp
movdqa %%tmp0, %%x3
movaps %%x3, %%x1
pclmulqdq %%x1, %%const, 0x01
pclmulqdq %%x3, %%const, 0x10
xorps %%x3, %%x1
movdqa %%x1, %%tmp0
movdqa %%tmp0, %%x2
movaps %%x2, %%x0
pclmulqdq %%x0, %%const, 0x01
pclmulqdq %%x2, %%const, 0x10
xorps %%x2, %%x0
movdqa %%x0, %%tmp0
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; "shift" 1 input registers down 4 places
; macro FOLD1 x0, x1, x2, x3, const, tmp0
; x0 x1 x2 x3
; In A B C D
; Out B C D A'
%macro FOLD1 6
%define %%x0 %1 ; xmm reg, in/out
%define %%x1 %2 ; xmm reg, in/out
%define %%x2 %3 ; xmm reg, in/out
%define %%x3 %4 ; xmm reg, in/out
%define %%const %5 ; xmm reg, in
%define %%tmp0 %6 ; xmm reg, tmp
movdqa %%tmp0, %%x3
movaps %%x3, %%x0
pclmulqdq %%x0, %%const, 0x01
pclmulqdq %%x3, %%const, 0x10
xorps %%x3, %%x0
movdqa %%x0, %%x1
movdqa %%x1, %%x2
movdqa %%x2, %%tmp0
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; macro PARTIAL_FOLD x0, x1, x2, x3, xp, size, xfold, xt0, xt1, xt2, xt3
; XP X3 X2 X1 X0 tmp2
; Initial state xI HG FE DC BA
; after shift IH GF ED CB A0
; after fold ff GF ED CB ff = merge(IH, A0)
;
%macro PARTIAL_FOLD 12
%define %%x0 %1 ; xmm reg, in/out
%define %%x1 %2 ; xmm reg, in/out
%define %%x2 %3 ; xmm reg, in/out
%define %%x3 %4 ; xmm reg, in/out
%define %%xp %5 ; xmm partial reg, in/clobbered
%define %%size %6 ; GPR, in/clobbered (1...15)
%define %%const %7 ; xmm reg, in
%define %%shl %8 ; xmm reg, tmp
%define %%shr %9 ; xmm reg, tmp
%define %%tmp2 %10 ; xmm reg, tmp
%define %%tmp3 %11 ; xmm reg, tmp
%define %%gtmp %12 ; GPR, tmp
; {XP X3 X2 X1 X0} = {xI HG FE DC BA}
shl %%size, 4 ; size *= 16
lea %%gtmp, [pshufb_shf_table - 16 WRT_OPT]
movdqa %%shl, [%%gtmp + %%size] ; shl constant
movdqa %%shr, %%shl
pxor %%shr, [mask3 WRT_OPT] ; shr constant
movdqa %%tmp2, %%x0 ; tmp2 = BA
pshufb %%tmp2, %%shl ; tmp2 = A0
pshufb %%x0, %%shr ; x0 = 0B
movdqa %%tmp3, %%x1 ; tmp3 = DC
pshufb %%tmp3, %%shl ; tmp3 = C0
por %%x0, %%tmp3 ; x0 = CB
pshufb %%x1, %%shr ; x1 = 0D
movdqa %%tmp3, %%x2 ; tmp3 = FE
pshufb %%tmp3, %%shl ; tmp3 = E0
por %%x1, %%tmp3 ; x1 = ED
pshufb %%x2, %%shr ; x2 = 0F
movdqa %%tmp3, %%x3 ; tmp3 = HG
pshufb %%tmp3, %%shl ; tmp3 = G0
por %%x2, %%tmp3 ; x2 = GF
pshufb %%x3, %%shr ; x3 = 0H
pshufb %%xp, %%shl ; xp = I0
por %%x3, %%xp ; x3 = IH
; fold tmp2 into X3
movaps %%tmp3, %%tmp2
pclmulqdq %%tmp2, %%const, 0x01
pclmulqdq %%tmp3, %%const, 0x10
xorps %%x3, %%tmp2
xorps %%x3, %%tmp3
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LOAD_FRACTIONAL_XMM: Packs xmm register with data when data input is less than 16 bytes.
; Returns 0 if data has length 0.
; Input: The input data (src), that data's length (size).
; Output: The packed xmm register (xmm_out).
; size is clobbered.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro LOAD_FRACTIONAL_XMM 3
%define %%xmm_out %1 ; %%xmm_out is an xmm register
%define %%src %2
%define %%size %3
pxor %%xmm_out, %%xmm_out
cmp %%size, 0
je %%_done
add %%src, %%size
cmp %%size, 8
jl %%_byte_loop
sub %%src, 8
pinsrq %%xmm_out, [%%src], 0 ;Read in 8 bytes if they exist
sub %%size, 8
je %%_done
%%_byte_loop: ;Read in data 1 byte at a time while data is left
pslldq %%xmm_out, 1
dec %%src
pinsrb %%xmm_out, BYTE [%%src], 0
dec %%size
jg %%_byte_loop
%%_done:
%endmacro ; LOAD_FRACTIONAL_XMM
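;; Illustrative use (hypothetical register choices; size must already be
;; less than 16, and both src and size are modified):
;;      LOAD_FRACTIONAL_XMM xmm0, rsi, rdx ; pack rdx tail bytes at [rsi]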
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; copy x bytes (rounded up to 16 bytes) from src to dst
; src & dst are unaligned
; macro COPY_IN_CRC dst, src, size_in_bytes, tmp, x0, x1, x2, x3, xfold,
; xt0, xt1, xt2, xt3, xt4
%macro COPY_IN_CRC 14
%define %%dst %1 ; reg, in/clobbered
%define %%src %2 ; reg, in/clobbered
%define %%size %3 ; reg, in/clobbered
%define %%tmp %4 ; reg, tmp
%define %%x0 %5 ; xmm, in/out: crc state
%define %%x1 %6 ; xmm, in/out: crc state
%define %%x2 %7 ; xmm, in/out: crc state
%define %%x3 %8 ; xmm, in/out: crc state
%define %%xfold %9 ; xmm, in: (loaded from fold_4)
%define %%xtmp0 %10 ; xmm, tmp
%define %%xtmp1 %11 ; xmm, tmp
%define %%xtmp2 %12 ; xmm, tmp
%define %%xtmp3 %13 ; xmm, tmp
%define %%xtmp4 %14 ; xmm, tmp
cmp %%size, 16
jl %%lt_16
; align source
xor %%tmp, %%tmp
sub %%tmp, %%src
and %%tmp, 15
jz %%already_aligned
; need to align, tmp contains number of bytes to transfer
movdqu %%xtmp0, [%%src]
movdqu [%%dst], %%xtmp0
add %%dst, %%tmp
add %%src, %%tmp
sub %%size, %%tmp
%ifndef DEFLATE
push %%dst
PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
pop %%dst
%endif
%%already_aligned:
sub %%size, 64
jl %%end_loop
jmp %%loop
align 16
%%loop:
movntdqa %%xtmp0, [%%src+0*16]
movntdqa %%xtmp1, [%%src+1*16]
movntdqa %%xtmp2, [%%src+2*16]
%ifndef DEFLATE
FOLD4 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3, %%xtmp4
%endif
movntdqa %%xtmp3, [%%src+3*16]
movdqu [%%dst+0*16], %%xtmp0
movdqu [%%dst+1*16], %%xtmp1
movdqu [%%dst+2*16], %%xtmp2
movdqu [%%dst+3*16], %%xtmp3
%ifndef DEFLATE
pxor %%x0, %%xtmp0
pxor %%x1, %%xtmp1
pxor %%x2, %%xtmp2
pxor %%x3, %%xtmp3
%endif
add %%src, 4*16
add %%dst, 4*16
sub %%size, 4*16
jge %%loop
%%end_loop:
; %%size contains (num bytes left - 64)
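; each add of 16 below classifies the remainder: bytes left >= 48 means
; three full regs remain, >= 32 two, >= 16 one, else only the <16B tail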
add %%size, 16
jge %%three_full_regs
add %%size, 16
jge %%two_full_regs
add %%size, 16
jge %%one_full_reg
add %%size, 16
%%no_full_regs: ; 0 <= %%size < 16, no full regs
jz %%done ; if no bytes left, we're done
jmp %%partial
;; Handle case where input is <16 bytes
%%lt_16:
test %%size, %%size
jz %%done ; if no bytes left, we're done
jmp %%partial
%%one_full_reg:
movntdqa %%xtmp0, [%%src+0*16]
%ifndef DEFLATE
FOLD1 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
movdqu [%%dst+0*16], %%xtmp0
%ifndef DEFLATE
pxor %%x3, %%xtmp0
%endif
test %%size, %%size
jz %%done ; if no bytes left, we're done
add %%dst, 1*16
add %%src, 1*16
jmp %%partial
%%two_full_regs:
movntdqa %%xtmp0, [%%src+0*16]
movntdqa %%xtmp1, [%%src+1*16]
%ifndef DEFLATE
FOLD2 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
movdqu [%%dst+0*16], %%xtmp0
movdqu [%%dst+1*16], %%xtmp1
%ifndef DEFLATE
pxor %%x2, %%xtmp0
pxor %%x3, %%xtmp1
%endif
test %%size, %%size
jz %%done ; if no bytes left, we're done
add %%dst, 2*16
add %%src, 2*16
jmp %%partial
%%three_full_regs:
movntdqa %%xtmp0, [%%src+0*16]
movntdqa %%xtmp1, [%%src+1*16]
movntdqa %%xtmp2, [%%src+2*16]
%ifndef DEFLATE
FOLD3 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
movdqu [%%dst+0*16], %%xtmp0
movdqu [%%dst+1*16], %%xtmp1
movdqu [%%dst+2*16], %%xtmp2
%ifndef DEFLATE
pxor %%x1, %%xtmp0
pxor %%x2, %%xtmp1
pxor %%x3, %%xtmp2
%endif
test %%size, %%size
jz %%done ; if no bytes left, we're done
add %%dst, 3*16
add %%src, 3*16
; fall through to %%partial
%%partial: ; 0 <= %%size < 16
%ifndef DEFLATE
mov %%tmp, %%size
%endif
LOAD_FRACTIONAL_XMM %%xtmp0, %%src, %%size
movdqu [%%dst], %%xtmp0
%ifndef DEFLATE
PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
%endif
%%done:
%endm
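;; In rough C terms, COPY_IN_CRC is (a sketch; fold_step() stands in for
;; the FOLD4 + pxor sequence and is not a real function in this code):
;;      while (size >= 64) {
;;              memcpy(dst, src, 64); fold_step(x0..x3, src);
;;              src += 64; dst += 64; size -= 64;
;;      }
;;      // then 0-3 full 16-byte blocks and a <16-byte tail, each
;;      // copied and folded into the crc state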
;%assign K 1024;
;%assign D 8 * K; ; Amount of history
;%assign LA 18 * 16; ; Max look-ahead, rounded up to 32 byte boundary
; copy D + LA bytes from src to dst
; dst is aligned
;void copy_D_LA(uint8_t *dst, uint8_t *src);
; arg 1: rcx : dst
; arg 2: rdx : src
; copy_D_LA dst, src, tmp, xtmp0, xtmp1, xtmp2, xtmp3
%macro copy_D_LA 7
%define %%dst %1 ; reg, clobbered
%define %%src %2 ; reg, clobbered
%define %%tmp %3
%define %%xtmp0 %4
%define %%xtmp1 %5
%define %%xtmp2 %6
%define %%xtmp3 %7
%assign %%SIZE (D + LA) / 16 ; number of DQ words to be copied
%assign %%SIZE4 %%SIZE/4
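; e.g. with D = 8 * 1024 and LA = 18 * 16 (the commented-out defaults
; above), SIZE = 8480 / 16 = 530 and SIZE4 = 132, so the loop copies
; 528 DQ words and the %rep blocks below move the remaining 2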
lea %%tmp, [%%dst + 4 * 16 * %%SIZE4]
jmp %%copy_D_LA_1
align 16
%%copy_D_LA_1:
movdqu %%xtmp0, [%%src]
movdqu %%xtmp1, [%%src+16]
movdqu %%xtmp2, [%%src+32]
movdqu %%xtmp3, [%%src+48]
movdqa [%%dst], %%xtmp0
movdqa [%%dst+16], %%xtmp1
movdqa [%%dst+32], %%xtmp2
movdqa [%%dst+48], %%xtmp3
add %%src, 4*16
add %%dst, 4*16
cmp %%dst, %%tmp
jne %%copy_D_LA_1
%assign %%i 0
%rep (%%SIZE - 4 * %%SIZE4)
%if (%%i == 0)
movdqu %%xtmp0, [%%src + %%i*16]
%elif (%%i == 1)
movdqu %%xtmp1, [%%src + %%i*16]
%elif (%%i == 2)
movdqu %%xtmp2, [%%src + %%i*16]
%elif (%%i == 3)
movdqu %%xtmp3, [%%src + %%i*16]
%else
%error too many i
% error
%endif
%assign %%i %%i+1
%endrep
%assign %%i 0
%rep (%%SIZE - 4 * %%SIZE4)
%if (%%i == 0)
movdqa [%%dst + %%i*16], %%xtmp0
%elif (%%i == 1)
movdqa [%%dst + %%i*16], %%xtmp1
%elif (%%i == 2)
movdqa [%%dst + %%i*16], %%xtmp2
%elif (%%i == 3)
movdqa [%%dst + %%i*16], %%xtmp3
%else
%error too many i
% error
%endif
%assign %%i %%i+1
%endrep
%endm
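;; Typical invocation (hypothetical register choices, matching the
;; prototype comment above):
;;      copy_D_LA rcx, rdx, r8, xmm0, xmm1, xmm2, xmm3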
%endif

View File

@ -1,552 +0,0 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifndef BUFFER_UTILS
%define BUFFER_UTILS
%include "options.asm"
extern pshufb_shf_table
extern mask3
%ifdef FIX_CACHE_READ
%define vmovntdqa vmovdqa
%else
%macro prefetchnta 1
%endm
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; code for doing the CRC calculation as part of copy-in, using pclmulqdq
; "shift" 4 input registers down 4 places
; macro FOLD4 xmm0, xmm1, xmm2, xmm3, const, tmp0, tmp1
%macro FOLD4 7
%define %%xmm0 %1 ; xmm reg, in/out
%define %%xmm1 %2 ; xmm reg, in/out
%define %%xmm2 %3 ; xmm reg, in/out
%define %%xmm3 %4 ; xmm reg, in/out
%define %%const %5 ; xmm reg, in
%define %%tmp0 %6 ; xmm reg, tmp
%define %%tmp1 %7 ; xmm reg, tmp
vmovaps %%tmp0, %%xmm0
vmovaps %%tmp1, %%xmm1
vpclmulqdq %%xmm0, %%const, 0x01
vpclmulqdq %%xmm1, %%const, 0x01
vpclmulqdq %%tmp0, %%const, 0x10
vpclmulqdq %%tmp1, %%const, 0x10
vxorps %%xmm0, %%tmp0
vxorps %%xmm1, %%tmp1
vmovaps %%tmp0, %%xmm2
vmovaps %%tmp1, %%xmm3
vpclmulqdq %%xmm2, %%const, 0x01
vpclmulqdq %%xmm3, %%const, 0x01
vpclmulqdq %%tmp0, %%const, 0x10
vpclmulqdq %%tmp1, %%const, 0x10
vxorps %%xmm2, %%tmp0
vxorps %%xmm3, %%tmp1
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; "shift" 3 input registers down 4 places
; macro FOLD3 x0, x1, x2, x3, const, tmp0
; x0 x1 x2 x3
; In A B C D
; Out D A' B' C'
%macro FOLD3 6
%define %%x0 %1 ; xmm reg, in/out
%define %%x1 %2 ; xmm reg, in/out
%define %%x2 %3 ; xmm reg, in/out
%define %%x3 %4 ; xmm reg, in/out
%define %%const %5 ; xmm reg, in
%define %%tmp0 %6 ; xmm reg, tmp
vmovdqa %%tmp0, %%x3
vmovaps %%x3, %%x2
vpclmulqdq %%x2, %%const, 0x01
vpclmulqdq %%x3, %%const, 0x10
vxorps %%x3, %%x2
vmovaps %%x2, %%x1
vpclmulqdq %%x1, %%const, 0x01
vpclmulqdq %%x2, %%const, 0x10
vxorps %%x2, %%x1
vmovaps %%x1, %%x0
vpclmulqdq %%x0, %%const, 0x01
vpclmulqdq %%x1, %%const, 0x10
vxorps %%x1, %%x0
vmovdqa %%x0, %%tmp0
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; "shift" 2 input registers down 4 places
; macro FOLD2 x0, x1, x2, x3, const, tmp0
; x0 x1 x2 x3
; In A B C D
; Out C D A' B'
%macro FOLD2 6
%define %%x0 %1 ; xmm reg, in/out
%define %%x1 %2 ; xmm reg, in/out
%define %%x2 %3 ; xmm reg, in/out
%define %%x3 %4 ; xmm reg, in/out
%define %%const %5 ; xmm reg, in
%define %%tmp0 %6 ; xmm reg, tmp
vmovdqa %%tmp0, %%x3
vmovaps %%x3, %%x1
vpclmulqdq %%x1, %%const, 0x01
vpclmulqdq %%x3, %%const, 0x10
vxorps %%x3, %%x1
vmovdqa %%x1, %%tmp0
vmovdqa %%tmp0, %%x2
vmovaps %%x2, %%x0
vpclmulqdq %%x0, %%const, 0x01
vpclmulqdq %%x2, %%const, 0x10
vxorps %%x2, %%x0
vmovdqa %%x0, %%tmp0
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; "shift" 1 input registers down 4 places
; macro FOLD1 x0, x1, x2, x3, const, tmp0
; x0 x1 x2 x3
; In A B C D
; Out B C D A'
%macro FOLD1 6
%define %%x0 %1 ; xmm reg, in/out
%define %%x1 %2 ; xmm reg, in/out
%define %%x2 %3 ; xmm reg, in/out
%define %%x3 %4 ; xmm reg, in/out
%define %%const %5 ; xmm reg, in
%define %%tmp0 %6 ; xmm reg, tmp
vmovdqa %%tmp0, %%x3
vmovaps %%x3, %%x0
vpclmulqdq %%x0, %%const, 0x01
vpclmulqdq %%x3, %%const, 0x10
vxorps %%x3, %%x0
vmovdqa %%x0, %%x1
vmovdqa %%x1, %%x2
vmovdqa %%x2, %%tmp0
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; macro PARTIAL_FOLD x0, x1, x2, x3, xp, size, xfold, xt0, xt1, xt2, xt3
; XP X3 X2 X1 X0 tmp2
; Initial state xI HG FE DC BA
; after shift IH GF ED CB A0
; after fold ff GF ED CB ff = merge(IH, A0)
;
%macro PARTIAL_FOLD 12
%define %%x0 %1 ; xmm reg, in/out
%define %%x1 %2 ; xmm reg, in/out
%define %%x2 %3 ; xmm reg, in/out
%define %%x3 %4 ; xmm reg, in/out
%define %%xp %5 ; xmm partial reg, in/clobbered
%define %%size %6 ; GPR, in/clobbered (1...15)
%define %%const %7 ; xmm reg, in
%define %%shl %8 ; xmm reg, tmp
%define %%shr %9 ; xmm reg, tmp
%define %%tmp2 %10 ; xmm reg, tmp
%define %%tmp3 %11 ; xmm reg, tmp
%define %%gtmp %12 ; GPR, tmp
; {XP X3 X2 X1 X0} = {xI HG FE DC BA}
shl %%size, 4 ; size *= 16
lea %%gtmp, [pshufb_shf_table - 16 WRT_OPT]
vmovdqa %%shl, [%%gtmp + %%size] ; shl constant
vmovdqa %%shr, %%shl
vpxor %%shr, [mask3 WRT_OPT] ; shr constant
vmovdqa %%tmp2, %%x0 ; tmp2 = BA
vpshufb %%tmp2, %%shl ; tmp2 = A0
vpshufb %%x0, %%shr ; x0 = 0B
vmovdqa %%tmp3, %%x1 ; tmp3 = DC
vpshufb %%tmp3, %%shl ; tmp3 = C0
vpor %%x0, %%tmp3 ; x0 = CB
vpshufb %%x1, %%shr ; x1 = 0D
vmovdqa %%tmp3, %%x2 ; tmp3 = FE
vpshufb %%tmp3, %%shl ; tmp3 = E0
vpor %%x1, %%tmp3 ; x1 = ED
vpshufb %%x2, %%shr ; x2 = 0F
vmovdqa %%tmp3, %%x3 ; tmp3 = HG
vpshufb %%tmp3, %%shl ; tmp3 = G0
vpor %%x2, %%tmp3 ; x2 = GF
vpshufb %%x3, %%shr ; x3 = 0H
vpshufb %%xp, %%shl ; xp = I0
vpor %%x3, %%xp ; x3 = IH
; fold tmp2 into X3
vmovaps %%tmp3, %%tmp2
vpclmulqdq %%tmp2, %%const, 0x01
vpclmulqdq %%tmp3, %%const, 0x10
vxorps %%x3, %%tmp2
vxorps %%x3, %%tmp3
%endm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LOAD_FRACTIONAL_XMM: Packs xmm register with data when data input is less than 16 bytes.
; Returns a zeroed xmm_out if data has length 0.
; Input: The input data (src), that data's length (size).
; Output: The packed xmm register (xmm_out).
; size is clobbered.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro LOAD_FRACTIONAL_XMM 3
%define %%xmm_out %1 ; %%xmm_out is an xmm register
%define %%src %2
%define %%size %3
vpxor %%xmm_out, %%xmm_out
cmp %%size, 0
je %%_done
add %%src, %%size
cmp %%size, 8
jl %%_byte_loop
sub %%src, 8
vpinsrq %%xmm_out, [%%src], 0 ;Read in 8 bytes if they exist
sub %%size, 8
je %%_done
%%_byte_loop: ;Read in data 1 byte at a time while data is left
vpslldq %%xmm_out, 1
dec %%src
vpinsrb %%xmm_out, BYTE [%%src], 0
dec %%size
jg %%_byte_loop
%%_done:
%endmacro ; LOAD_FRACTIONAL_XMM
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; copy x bytes (rounded up to 16 bytes) from src to dst
; src & dst are unaligned
; macro COPY_IN_CRC dst, src, size_in_bytes, tmp, x0, x1, x2, x3, xfold,
; xt0, xt1, xt2, xt3, xt4
%macro COPY_IN_CRC 14
%define %%dst %1 ; reg, in/clobbered
%define %%src %2 ; reg, in/clobbered
%define %%size %3 ; reg, in/clobbered
%define %%tmp %4 ; reg, tmp
%define %%x0 %5 ; xmm, in/out: crc state
%define %%x1 %6 ; xmm, in/out: crc state
%define %%x2 %7 ; xmm, in/out: crc state
%define %%x3 %8 ; xmm, in/out: crc state
%define %%xfold %9 ; xmm, in: (loaded from fold_4)
%define %%xtmp0 %10 ; xmm, tmp
%define %%xtmp1 %11 ; xmm, tmp
%define %%xtmp2 %12 ; xmm, tmp
%define %%xtmp3 %13 ; xmm, tmp
%define %%xtmp4 %14 ; xmm, tmp
cmp %%size, 16
jl %%lt_16
; align source
xor %%tmp, %%tmp
sub %%tmp, %%src
and %%tmp, 15
jz %%already_aligned
; need to align, tmp contains number of bytes to transfer
vmovdqu %%xtmp0, [%%src]
vmovdqu [%%dst], %%xtmp0
add %%dst, %%tmp
add %%src, %%tmp
sub %%size, %%tmp
%ifndef DEFLATE
push %%dst
PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
pop %%dst
%endif
%%already_aligned:
sub %%size, 64
jl %%end_loop
jmp %%loop
align 16
%%loop:
vmovntdqa %%xtmp0, [%%src+0*16]
vmovntdqa %%xtmp1, [%%src+1*16]
vmovntdqa %%xtmp2, [%%src+2*16]
%ifndef DEFLATE
FOLD4 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3, %%xtmp4
%endif
vmovntdqa %%xtmp3, [%%src+3*16]
vmovdqu [%%dst+0*16], %%xtmp0
vmovdqu [%%dst+1*16], %%xtmp1
vmovdqu [%%dst+2*16], %%xtmp2
vmovdqu [%%dst+3*16], %%xtmp3
%ifndef DEFLATE
vpxor %%x0, %%xtmp0
vpxor %%x1, %%xtmp1
vpxor %%x2, %%xtmp2
vpxor %%x3, %%xtmp3
%endif
add %%src, 4*16
add %%dst, 4*16
sub %%size, 4*16
jge %%loop
%%end_loop:
; %%size contains (num bytes left - 64)
add %%size, 16
jge %%three_full_regs
add %%size, 16
jge %%two_full_regs
add %%size, 16
jge %%one_full_reg
add %%size, 16
%%no_full_regs: ; 0 <= %%size < 16, no full regs
jz %%done ; if no bytes left, we're done
jmp %%partial
;; Handle case where input is <16 bytes
%%lt_16:
test %%size, %%size
jz %%done ; if no bytes left, we're done
jmp %%partial
%%one_full_reg:
vmovntdqa %%xtmp0, [%%src+0*16]
%ifndef DEFLATE
FOLD1 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
vmovdqu [%%dst+0*16], %%xtmp0
%ifndef DEFLATE
vpxor %%x3, %%xtmp0
%endif
test %%size, %%size
jz %%done ; if no bytes left, we're done
add %%dst, 1*16
add %%src, 1*16
jmp %%partial
%%two_full_regs:
vmovntdqa %%xtmp0, [%%src+0*16]
vmovntdqa %%xtmp1, [%%src+1*16]
%ifndef DEFLATE
FOLD2 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
vmovdqu [%%dst+0*16], %%xtmp0
vmovdqu [%%dst+1*16], %%xtmp1
%ifndef DEFLATE
vpxor %%x2, %%xtmp0
vpxor %%x3, %%xtmp1
%endif
test %%size, %%size
jz %%done ; if no bytes left, we're done
add %%dst, 2*16
add %%src, 2*16
jmp %%partial
%%three_full_regs:
vmovntdqa %%xtmp0, [%%src+0*16]
vmovntdqa %%xtmp1, [%%src+1*16]
vmovntdqa %%xtmp2, [%%src+2*16]
%ifndef DEFLATE
FOLD3 %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
vmovdqu [%%dst+0*16], %%xtmp0
vmovdqu [%%dst+1*16], %%xtmp1
vmovdqu [%%dst+2*16], %%xtmp2
%ifndef DEFLATE
vpxor %%x1, %%xtmp0
vpxor %%x2, %%xtmp1
vpxor %%x3, %%xtmp2
%endif
test %%size, %%size
jz %%done ; if no bytes left, we're done
add %%dst, 3*16
add %%src, 3*16
; fall through to %%partial
%%partial: ; 0 <= %%size < 16
%ifndef DEFLATE
mov %%tmp, %%size
%endif
LOAD_FRACTIONAL_XMM %%xtmp0, %%src, %%size
vmovdqu [%%dst], %%xtmp0
%ifndef DEFLATE
PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
%endif
%%done:
%endm
;%assign K 1024;
;%assign D 8 * K; ; Amount of history
;%assign LA 17 * 16; ; Max look-ahead, rounded up to 16 byte boundary
; copy D + LA bytes from src to dst
; dst is aligned
;void copy_D_LA(uint8_t *dst, uint8_t *src);
; arg 1: rcx : dst
; arg 2: rdx : src
; copy_D_LA dst, src, tmp, xtmp0, xtmp1, xtmp2, xtmp3
%macro copy_D_LA 7
%define %%dst %1 ; reg, clobbered
%define %%src %2 ; reg, clobbered
%define %%tmp %3
%define %%ytmp0 %4
%define %%ytmp1 %5
%define %%ytmp2 %6
%define %%ytmp3 %7
%define %%xtmp0 %4x
%assign %%SIZE (D + LA) / 32 ; number of 32-byte chunks to be copied
%assign %%SIZE4 %%SIZE/4
%assign %%MOD16 ((D + LA) - 32 * %%SIZE) / 16
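; e.g. with D = 8 * 1024 and LA = 17 * 16 (the commented-out defaults
; above), SIZE = 8464 / 32 = 264, SIZE4 = 66 and MOD16 = 1: the loop
; copies all 264 YMM words and the final %rep moves one 16-byte tail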
lea %%tmp, [%%dst + 4 * 32 * %%SIZE4]
jmp %%copy_D_LA_1
align 16
%%copy_D_LA_1:
vmovdqu %%ytmp0, [%%src]
vmovdqu %%ytmp1, [%%src + 1 * 32]
vmovdqu %%ytmp2, [%%src + 2 * 32]
vmovdqu %%ytmp3, [%%src + 3 * 32]
vmovdqa [%%dst], %%ytmp0
vmovdqa [%%dst + 1 * 32], %%ytmp1
vmovdqa [%%dst + 2 * 32], %%ytmp2
vmovdqa [%%dst + 3 * 32], %%ytmp3
add %%src, 4*32
add %%dst, 4*32
cmp %%dst, %%tmp
jne %%copy_D_LA_1
%assign %%i 0
%rep (%%SIZE - 4 * %%SIZE4)
%if (%%i == 0)
vmovdqu %%ytmp0, [%%src + %%i*32]
%elif (%%i == 1)
vmovdqu %%ytmp1, [%%src + %%i*32]
%elif (%%i == 2)
vmovdqu %%ytmp2, [%%src + %%i*32]
%elif (%%i == 3)
vmovdqu %%ytmp3, [%%src + %%i*32]
%else
%error too many i
% error
%endif
%assign %%i %%i+1
%endrep
%assign %%i 0
%rep (%%SIZE - 4 * %%SIZE4)
%if (%%i == 0)
vmovdqa [%%dst + %%i*32], %%ytmp0
%elif (%%i == 1)
vmovdqa [%%dst + %%i*32], %%ytmp1
%elif (%%i == 2)
vmovdqa [%%dst + %%i*32], %%ytmp2
%elif (%%i == 3)
vmovdqa [%%dst + %%i*32], %%ytmp3
%else
%error too many i
% error
%endif
%assign %%i %%i+1
%endrep
%rep %%MOD16
vmovdqu %%xtmp0, [%%src + (%%SIZE - 4 * %%SIZE4)*32]
vmovdqa [%%dst + (%%SIZE - 4 * %%SIZE4)*32], %%xtmp0
%endrep
%endm
%endif

View File

@ -41,6 +41,7 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define curr_data rax
%define tmp1 rax
%define f_index rbx
@ -69,7 +70,6 @@
%define m_bit_count r11
%define code2 r12
%define f_end_i r12
%define file_start r13
@ -110,32 +110,29 @@ skip_SLOP:
mov hufftables, [stream + _hufftables]
; f_i = state->b_bytes_processed;
; f_end_i = state->b_bytes_valid;
mov f_i %+ d, [stream + _internal_state_b_bytes_processed]
mov f_end_i %+ d, [stream + _internal_state_b_bytes_valid]
mov file_start, [stream + _next_in]
; f_i += (uint32_t)(state->buffer - state->file_start);
; f_end_i += (uint32_t)(state->buffer - state->file_start);
mov file_start, [stream + _internal_state_file_start]
lea tmp1, [stream + _internal_state_buffer]
sub tmp1, file_start
add f_i, tmp1
add f_end_i, tmp1
mov f_i %+ d, dword [stream + _total_in]
sub file_start, f_i
mov f_end_i %+ d, dword [stream + _avail_in]
add f_end_i, f_i
sub f_end_i, LAST_BYTES_COUNT
mov [rsp + f_end_i_mem_offset], f_end_i
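; equivalently, in C (names as used in this file):
;       file_start = stream->next_in - stream->total_in;
;       f_i = stream->total_in;
;       f_end_i = f_i + stream->avail_in - LAST_BYTES_COUNT;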
; for (f_i = f_start_i; f_i < f_end_i; f_i++) {
cmp f_i, f_end_i
jge end_loop_2
mov tmp1, [file_start + f_i]
mov curr_data %+ d, [file_start + f_i]
cmp dword [stream + _internal_state_b_bytes_processed], 0
cmp dword [stream + _internal_state_b_bytes_processed], 0 ;TODO fix
jne skip_write_first_byte
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
ja end_loop_2
compute_hash hash, tmp1
compute_hash hash, curr_data
and hash %+ d, HASH_MASK
mov [stream + _internal_state_head + 2 * hash], f_i %+ w
jmp encode_literal
@ -148,7 +145,8 @@ loop2:
ja end_loop_2
; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
compute_hash hash, tmp1
mov curr_data %+ d, [file_start + f_i]
compute_hash hash, curr_data
and hash %+ d, HASH_MASK
; f_index = state->head[hash];
@ -171,6 +169,7 @@ loop2:
; len = f_end_i - f_i;
mov tmp4, [rsp + f_end_i_mem_offset]
sub tmp4, f_i
add tmp4, LAST_BYTES_COUNT
; if (len > 258) len = 258;
cmp tmp4, 258
@ -206,11 +205,13 @@ loop2:
; for (k = f_i+1, f_i += len-1; k <= f_i; k++) {
lea tmp3, [f_i + 1] ; tmp3 <= k
add f_i, len
%ifdef LIMIT_HASH_UPDATE
cmp f_i, [rsp + f_end_i_mem_offset]
jae skip_hash_update
; only update hash twice
; hash = compute_hash(state->file_start + k) & HASH_MASK;
mov tmp6, [file_start + tmp3]
mov tmp6 %+ d, dword [file_start + tmp3]
compute_hash hash, tmp6
and hash %+ d, HASH_MASK
; state->head[hash] = k;
@ -219,27 +220,13 @@ loop2:
add tmp3, 1
; hash = compute_hash(state->file_start + k) & HASH_MASK;
mov tmp6, [file_start + tmp3]
mov tmp6 %+ d, dword [file_start + tmp3]
compute_hash hash, tmp6
and hash %+ d, HASH_MASK
; state->head[hash] = k;
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
%else
loop3:
; hash = compute_hash(state->file_start + k) & HASH_MASK;
mov tmp6, [file_start + tmp3]
compute_hash hash, tmp6
and hash %+ d, HASH_MASK
; state->head[hash] = k;
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
inc tmp3
cmp tmp3, f_i
jl loop3
%endif
mov tmp1 %+ d, [file_start + f_i]
skip_hash_update:
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp5
; continue
@ -248,8 +235,6 @@ loop3:
jmp end_loop_2
encode_literal:
mov tmp1 %+ d, [file_start + f_i + 1]
; get_lit_code(state->file_start[f_i], &code2, &code_len2);
movzx tmp5, byte [file_start + f_i]
get_lit_code tmp5, code2, code_len2, hufftables
@ -262,19 +247,29 @@ encode_literal:
jl loop2
end_loop_2:
mov f_end_i, [rsp + f_end_i_mem_offset]
add f_end_i, LAST_BYTES_COUNT
mov [rsp + f_end_i_mem_offset], f_end_i
; if ((f_i >= f_end_i) && ! state->bitbuf.is_full()) {
cmp f_i, f_end_i
jge write_eob
xor tmp5, tmp5
final_bytes:
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
ja not_end
movzx tmp5, byte [file_start + f_i]
get_lit_code tmp5, code2, code_len2, hufftables
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
inc f_i
cmp f_i, [rsp + f_end_i_mem_offset]
jl not_end
jl final_bytes
write_eob:
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
ja not_end
cmp dword [stream + _end_of_stream], 1
jne cont
cmp dword [stream + _internal_state_left_over], 0
jg not_end
cont:
; get_lit_code(256, &code2, &code_len2);
get_lit_code 256, code2, code_len2, hufftables
@ -293,14 +288,16 @@ sync_flush:
; }
not_end:
; state->b_bytes_processed = f_i - (state->buffer - state->file_start);
add f_i, [stream + _internal_state_file_start]
sub f_i, stream
sub f_i, _internal_state_buffer
mov [stream + _internal_state_b_bytes_processed], f_i %+ d
; // update output buffer
; stream->next_out = state->bitbuf.buffer_ptr();
;; Update input buffer
mov f_end_i, [rsp + f_end_i_mem_offset]
mov [stream + _total_in], f_i %+ d
add file_start, f_i
mov [stream + _next_in], file_start
sub f_end_i, f_i
mov [stream + _avail_in], f_end_i %+ d
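;; i.e., in C terms (a sketch of the bookkeeping above):
;;      stream->total_in = f_i;
;;      stream->next_in = file_start + f_i;
;;      stream->avail_in = f_end_i - f_i;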
;; Update output buffer
mov [stream + _next_out], m_out_buf
; len = state->bitbuf.buffer_used();
sub m_out_buf, [stream + _internal_state_bitbuf_m_out_start]

View File

@ -50,23 +50,17 @@ extern isal_deflate_body_04
extern isal_deflate_finish_base
extern isal_deflate_finish_01
extern get_crc_base
extern get_crc_01
extern isal_update_histogram_base
extern isal_update_histogram_01
extern isal_update_histogram_04
extern isal_deflate_init_base
extern isal_deflate_init_01
extern crc32_gzip_base
extern crc32_gzip_01
section .text
%include "multibinary.asm"
mbin_interface isal_deflate_init
mbin_dispatch_init5 isal_deflate_init, isal_deflate_init_base, isal_deflate_init_01, isal_deflate_init_01, isal_deflate_init_01
mbin_interface isal_deflate_body_stateless
mbin_dispatch_init5 isal_deflate_body_stateless, isal_deflate_body_stateless_base, isal_deflate_body_stateless_01, isal_deflate_body_stateless_02, isal_deflate_body_stateless_04
@ -75,8 +69,8 @@ mbin_dispatch_init5 isal_deflate_body, isal_deflate_body_base, isal_deflate_body
mbin_interface isal_deflate_finish
mbin_dispatch_init5 isal_deflate_finish, isal_deflate_finish_base, isal_deflate_finish_01, isal_deflate_finish_01, isal_deflate_finish_01
mbin_interface get_crc
mbin_dispatch_init5 get_crc, get_crc_base, get_crc_01, get_crc_01, get_crc_01
mbin_interface isal_update_histogram
mbin_dispatch_init5 isal_update_histogram, isal_update_histogram_base, isal_update_histogram_01, isal_update_histogram_01, isal_update_histogram_04
mbin_interface crc32_gzip
mbin_dispatch_init5 crc32_gzip, crc32_gzip_base, crc32_gzip_01, crc32_gzip_01, crc32_gzip_01
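;; For reference, a hypothetical C call through the dispatcher, which
;; binds to crc32_gzip_base or crc32_gzip_01 on first use depending on
;; detected CPU features:
;;      uint32_t crc = crc32_gzip(0, buf, len);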

View File

@ -37,7 +37,6 @@
%include "stdmac.asm"
%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
%ifdef DEBUG
@ -179,8 +178,7 @@ skip_SLOP:
mov [stream + _internal_state_b_bytes_valid], f_end_i %+ d
mov f_i, 0
mov file_start, [stream + _next_in]
mov [stream + _internal_state_file_start], file_start
mov file_start, [stream + _internal_state_file_start]
; f_end_i -= LA;
sub f_end_i, LA_STATELESS

View File

@ -231,14 +231,13 @@ struct isal_zstate {
uint32_t b_bytes_valid; //!< number of bytes of valid data in buffer
uint32_t b_bytes_processed; //!< keeps track of the number of bytes processed in isal_zstate.buffer
uint8_t *file_start; //!< pointer to where file would logically start
DECLARE_ALIGNED(uint32_t crc[16], 16); //!< actually 4 128-bit integers
uint32_t crc; //!< Current crc
struct BitBuf2 bitbuf; //!< Bit Buffer
enum isal_zstate_state state; //!< Current state in processing the data stream
uint32_t count; //!< used for partial header/trailer writes
uint8_t tmp_out_buff[16]; //!< temporary array
uint32_t tmp_out_start; //!< temporary variable
uint32_t tmp_out_end; //!< temporary variable
uint32_t last_flush; //!< keeps track of last submitted flush
uint32_t has_gzip_hdr; //!< keeps track of whether the gzip header has been written
uint32_t has_eob; //!< keeps track of eob on the last deflate block
uint32_t has_eob_hdr; //!< keeps track of eob hdr (with BFINAL set)