From f97de75fc6bc2f96ce148e81f89d01fd135ccd9d Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Tue, 21 Jun 2016 10:53:44 -0700 Subject: [PATCH] igzip: Port improvements to stateless compress to stateful compress Signed-off-by: Roy Oursler Reviewed-by: Greg Tucker --- igzip/Makefile.am | 5 +- igzip/igzip_body.asm | 248 +++++++++++++++++++------------- igzip/igzip_body_02.asm | 8 ++ igzip/igzip_buffer_utils_01.asm | 2 +- igzip/igzip_compare_types.asm | 50 +++---- igzip/igzip_file_perf.c | 2 +- igzip/igzip_multibinary.asm | 3 +- igzip/lz0a_const.asm | 2 +- igzip/stdmac.asm | 29 +++- include/igzip_lib.h | 2 +- 10 files changed, 219 insertions(+), 132 deletions(-) create mode 100644 igzip/igzip_body_02.asm diff --git a/igzip/Makefile.am b/igzip/Makefile.am index a704753..fb723c3 100644 --- a/igzip/Makefile.am +++ b/igzip/Makefile.am @@ -30,7 +30,10 @@ lsrc += igzip/igzip.c igzip/hufftables_c.c \ igzip/crc_utils_01.asm \ igzip/crc_utils_04.asm \ - igzip/igzip_body_01.asm igzip/igzip_body_04.asm igzip/igzip_finish.asm \ + igzip/igzip_body_01.asm \ + igzip/igzip_body_02.asm \ + igzip/igzip_body_04.asm \ + igzip/igzip_finish.asm \ igzip/igzip_stateless_01.asm \ igzip/igzip_stateless_02.asm \ igzip/igzip_stateless_04.asm \ diff --git a/igzip/igzip_body.asm b/igzip/igzip_body.asm index b2f5c23..23b0047 100644 --- a/igzip/igzip_body.asm +++ b/igzip/igzip_body.asm @@ -77,6 +77,7 @@ global %1 %define len rdx %define hash rdx %define code_len3 rdx +%define tmp8 rdx %define tmp1 rsi %define code_len2 rsi @@ -120,11 +121,14 @@ global %1 %define xtmp2 xmm7 ; tmp %define xtmp3 xmm8 ; tmp %define xtmp4 xmm9 ; tmp +%define xhash xmm10 +%define xmask xmm11 +%define xdata xmm12 %define ytmp0 ymm5 ; tmp %define ytmp1 ymm6 ; tmp -%if (ARCH == 04) +%if ( ARCH == 02 || ARCH == 04) %define vtmp0 ymm5 ; tmp %define vtmp1 ymm6 ; tmp %define vtmp2 ymm7 ; tmp @@ -148,8 +152,8 @@ in_buf_mem_offset equ 8 f_end_i_mem_offset equ 16 empty_buffer_flag equ 24 gpr_save_mem_offset equ 32 ; gpr save area 
(8*8 bytes) -xmm_save_mem_offset equ 32 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned) -stack_size equ 4*8 + 8*8 + 4*16 + 8 +xmm_save_mem_offset equ 32 + 8*8 ; xmm save area (8*16 bytes) (16 byte aligned) +stack_size equ 4*8 + 8*8 + 8*16 + 8 ;;; 8 because stack address is odd multiple of 8 after a function call and ;;; we want it aligned to 16 bytes @@ -197,8 +201,12 @@ skip1: MOVDQA [rsp + xmm_save_mem_offset + 1*16], xmm7 MOVDQA [rsp + xmm_save_mem_offset + 2*16], xmm8 MOVDQA [rsp + xmm_save_mem_offset + 3*16], xmm9 + MOVDQA [rsp + xmm_save_mem_offset + 4*16], xmm10 + MOVDQA [rsp + xmm_save_mem_offset + 5*16], xmm11 + MOVDQA [rsp + xmm_save_mem_offset + 6*16], xmm12 mov stream, rcx + MOVDQU xmask, [mask] MOVDQA crc_0, [stream + _internal_state_crc + 0*16] MOVDQA crc_1, [stream + _internal_state_crc + 1*16] @@ -334,26 +342,27 @@ skip_move_zero: jge end_loop_2 MARK __misc_compute_hash_lookup_ %+ ARCH + MOVDQU xdata, [file_start + f_i] mov curr_data, [file_start + f_i] + mov tmp3, curr_data + mov tmp6, curr_data + + compute_hash hash, curr_data + + shr tmp3, 8 + compute_hash hash2, tmp3 + + and hash, HASH_MASK + and hash2, HASH_MASK cmp dword [rsp + empty_buffer_flag], 0 jne write_first_byte - mov curr_data2, curr_data - - compute_hash hash, curr_data jmp loop2 align 16 loop2: - shr curr_data2, 8 - compute_hash hash2, curr_data2 - - ; hash = compute_hash(state->file_start + f_i) & HASH_MASK; - and hash %+ d, HASH_MASK - and hash2 %+ d, HASH_MASK - ; if (state->bitbuf.is_full()) { cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end] ja bitbuf_full @@ -363,59 +372,53 @@ loop2: xor tmp3, tmp3 lea tmp1, [file_start + f_i] - lea tmp6, [tmp1 - 1] mov dist %+ w, f_i %+ w sub dist %+ w, word [stream + _internal_state_head + 2 * hash] - - ; state->head[hash] = (uint16_t) f_i; mov [stream + _internal_state_head + 2 * hash], f_i %+ w + dec dist inc f_i + MOVQ tmp6, xdata + shr tmp5, 16 + mov tmp8, tmp5 + compute_hash tmp6, tmp5 + mov dist2 %+ w, f_i %+ w sub dist2 
%+ w, word [stream + _internal_state_head + 2 * hash2] - dec dist2 - - ; state->head[hash2] = (uint16_t) f_i; mov [stream + _internal_state_head + 2 * hash2], f_i %+ w - - mov tmp2, tmp1 - sub tmp2, dist - dec dist + dec dist2 ; if ((dist-1) < (D-1)) { cmp dist %+ d, (D-1) - cmovae tmp2, tmp6 cmovae dist, tmp3 - inc dist + add dist, 1 + neg dist + + shr tmp8, 8 + compute_hash tmp2, tmp8 cmp dist2 %+ d, (D-1) cmovae dist2, tmp3 - inc dist2 + add dist2, 1 + neg dist2 MARK __compare_ %+ ARCH - ; len = compare258(state->file_start + f_i, - ; state->file_start + f_i - dist); - - ;; Specutively load distance code (except for when large windows are used) - get_packed_dist_code dist, code2, hufftables - ;; Check for long len/dist match (>7) with first literal - mov len, [tmp1] - xor len, [tmp2] + MOVQ len, xdata + mov curr_data, len + PSRLDQ xdata, 1 + xor len, [tmp1 + dist] jz compare_loop - lea tmp1, [file_start + f_i] - mov tmp2, tmp1 - sub tmp2, dist2 - - ;; Specutively load distance code (except for when large windows are used) - get_packed_dist_code dist2, code4, hufftables + MOVD xhash, tmp6 %+ d + PINSRD xhash, tmp2 %+ d, 1 + PAND xhash, xhash, xmask ;; Check for len/dist match (>7) with second literal - mov len2, [tmp1] - xor len2, [tmp2] + MOVQ len2, xdata + xor len2, [tmp1 + dist2 + 1] jz compare_loop2 ;; Specutively load the code for the first literal @@ -446,51 +449,77 @@ len_dist_lit_huffman_pre: shr len2, 3 len_dist_lit_huffman: + neg dist2 %ifndef LONGER_HUFFTABLE mov tmp4, dist2 get_dist_code tmp4, code4, code_len2, hufftables ;; clobbers dist, rcx %else - unpack_dist_code code4, code_len2 + get_dist_code dist2, code4, code_len2, hufftables %endif get_len_code len2, code, rcx, hufftables ;; rcx is code_len -%ifdef USE_HSWNI - shlx code4, code4, rcx -%else - shl code4, cl -%endif + SHLX code4, code4, rcx or code4, code add code_len2, rcx mov rcx, code_len3 -%ifdef USE_HSWNI - shlx code4, code4, rcx -%else - shl code4, cl -%endif + MOVQ tmp5, xdata + shr 
tmp5, 24 + compute_hash tmp4, tmp5 + and tmp4, HASH_MASK + + SHLX code4, code4, rcx or code4, code3 add code_len2, rcx - mov code2, code4 ;; Setup for updating hash lea tmp3, [f_i + 1] ; tmp3 <= k + add f_i, len2 + MOVDQU xdata, [file_start + f_i] + mov curr_data, [file_start + f_i] + mov curr_data2, curr_data - ; hash = compute_hash(state->file_start + k) & HASH_MASK; - mov tmp5, [file_start + tmp3] - mov tmp7, tmp5 - shr tmp7, 8 - - compute_hash hash, tmp5 - and hash %+ d, HASH_MASK - - ; state->head[hash] = k; + MOVD hash %+ d, xhash + PEXTRD hash2 %+ d, xhash, 1 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w - add tmp3,1 + compute_hash hash, curr_data - jmp update_hash_for_symbol + add tmp3,1 + mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w + + add tmp3, 1 + mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w + + write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf, tmp4 + mov f_end_i, [rsp + f_end_i_mem_offset] + + shr curr_data2, 8 + compute_hash hash2, curr_data2 + +%ifdef NO_LIMIT_HASH_UPDATE +loop3: + add tmp3,1 + cmp tmp3, f_i + jae loop3_done + mov tmp6, [file_start + tmp3] + compute_hash tmp4, tmp6 + and tmp4 %+ d, HASH_MASK + ; state->head[hash] = k; + mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w + jmp loop3 +loop3_done: +%endif + ; hash = compute_hash(state->file_start + f_i) & HASH_MASK; + and hash %+ d, HASH_MASK + and hash2 %+ d, HASH_MASK + + ; continue + cmp f_i, f_end_i + jl loop2 + jmp end_loop_2 ;; encode as dist/len MARK __len_dist_huffman_ %+ ARCH @@ -498,14 +527,15 @@ len_dist_huffman_pre: bsf len, len shr len, 3 len_dist_huffman: - dec f_i + dec f_i + neg dist ; get_dist_code(dist, &code2, &code_len2); %ifndef LONGER_HUFFTABLE mov tmp3, dist ; since code2 and dist are rbx get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx %else - unpack_dist_code code2, code_len2 + get_dist_code dist, code2, code_len2, hufftables %endif ; get_len_code(len, &code, &code_len); get_len_code 
len, code, rcx, hufftables ;; rcx is code_len @@ -513,68 +543,66 @@ len_dist_huffman: ; code2 <<= code_len ; code2 |= code ; code_len2 += code_len -%ifdef USE_HSWNI - shlx code2, code2, rcx -%else - shl code2, cl -%endif + SHLX code2, code2, rcx or code2, code add code_len2, rcx ;; Setup for updateing hash lea tmp3, [f_i + 2] ; tmp3 <= k add f_i, len - mov tmp7, [file_start + tmp3] -MARK __update_hash_for_symbol_ %+ ARCH -update_hash_for_symbol: + MOVD hash %+ d, xhash + PEXTRD hash2 %+ d, xhash, 1 + mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w + add tmp3,1 + mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w + + MOVDQU xdata, [file_start + f_i] mov curr_data, [file_start + f_i] mov curr_data2, curr_data compute_hash hash, curr_data -%ifdef LIMIT_HASH_UPDATE - ; only update hash twice, first hash was already calculated. - ; hash = compute_hash(state->file_start + k) & HASH_MASK; - compute_hash hash2, tmp7 - and hash2 %+ d, HASH_MASK - ; state->head[hash] = k; - mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w + write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp7 + mov f_end_i, [rsp + f_end_i_mem_offset] -%else -loop3: - ; hash = compute_hash(state->file_start + k) & HASH_MASK; - mov tmp7, [file_start + tmp3] - compute_hash hash2, tmp7 - and hash2 %+ d, HASH_MASK - ; state->head[hash] = k; - mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w + shr curr_data2, 8 + compute_hash hash2, curr_data2 + +%ifdef NO_LIMIT_HASH_UPDATE +loop4: add tmp3,1 + cmp tmp3, f_i - jl loop3 + jae loop4_done + mov tmp6, [file_start + tmp3] + compute_hash tmp4, tmp6 + and tmp4, HASH_MASK + mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w + jmp loop4 +loop4_done: %endif - -MARK __write_len_dist_bits_ %+ ARCH - mov f_end_i, [rsp + f_end_i_mem_offset] - write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 + and hash, HASH_MASK + and hash2, HASH_MASK ; continue cmp f_i, f_end_i jl loop2 jmp end_loop_2 - MARK 
__write_lit_bits_ %+ ARCH write_lit_bits: + MOVDQU xdata, [file_start + f_i + 1] mov f_end_i, [rsp + f_end_i_mem_offset] add f_i, 1 mov curr_data, [file_start + f_i] - mov curr_data2, curr_data - compute_hash hash, curr_data + MOVD hash %+ d, xhash write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 + PEXTRD hash2 %+ d, xhash, 1 + ; continue cmp f_i, f_end_i jl loop2 @@ -647,6 +675,9 @@ skip2: MOVDQA xmm7, [rsp + xmm_save_mem_offset + 1*16] MOVDQA xmm8, [rsp + xmm_save_mem_offset + 2*16] MOVDQA xmm9, [rsp + xmm_save_mem_offset + 3*16] + MOVDQA xmm10, [rsp + xmm_save_mem_offset + 4*16] + MOVDQA xmm11, [rsp + xmm_save_mem_offset + 5*16] + MOVDQA xmm12, [rsp + xmm_save_mem_offset + 6*16] %ifndef ALIGN_STACK add rsp, stack_size @@ -668,6 +699,10 @@ bitbuf_full: MARK __compare_loops_ %+ ARCH compare_loop: + MOVD xhash, tmp6 %+ d + PINSRD xhash, tmp2 %+ d, 1 + PAND xhash, xhash, xmask + lea tmp2, [tmp1 + dist] %if (COMPARE_TYPE == 1) compare250 tmp1, tmp2, len, tmp3 %elif (COMPARE_TYPE == 2) @@ -681,6 +716,8 @@ compare_loop: jmp len_dist_huffman compare_loop2: + add tmp1, 1 + lea tmp2, [tmp1 + dist2] %if (COMPARE_TYPE == 1) compare250 tmp1, tmp2, len2, tmp3 %elif (COMPARE_TYPE == 2) @@ -701,15 +738,24 @@ write_first_byte: ja bitbuf_full mov dword [rsp + empty_buffer_flag], 0 - compute_hash hash, curr_data - and hash %+ d, HASH_MASK + mov [stream + _internal_state_head + 2 * hash], f_i %+ w + + mov hash, hash2 + shr tmp6, 16 + compute_hash hash2, tmp6 + + MOVD xhash, hash %+ d + PINSRD xhash, hash2 %+ d, 1 + PAND xhash, xhash, xmask + and curr_data, 0xff get_lit_code curr_data, code2, code_len2, hufftables jmp write_lit_bits section .data - align 4 + align 16 +mask: dd HASH_MASK, HASH_MASK, HASH_MASK, HASH_MASK const_D: dq D %endif ;; ifndef TEST diff --git a/igzip/igzip_body_02.asm b/igzip/igzip_body_02.asm new file mode 100644 index 0000000..4c25278 --- /dev/null +++ b/igzip/igzip_body_02.asm @@ -0,0 +1,8 @@ +%define ARCH 02 + +%ifndef COMPARE_TYPE 
+%define COMPARE_TYPE 2 +%endif + +%include "igzip_buffer_utils_04.asm" +%include "igzip_body.asm" diff --git a/igzip/igzip_buffer_utils_01.asm b/igzip/igzip_buffer_utils_01.asm index c6cb834..672b042 100644 --- a/igzip/igzip_buffer_utils_01.asm +++ b/igzip/igzip_buffer_utils_01.asm @@ -467,7 +467,7 @@ align 16 ;%assign K 1024; ;%assign D 8 * K; ; Amount of history -;%assign LA 17 * 16; ; Max look-ahead, rounded up to 32 byte boundary +;%assign LA 18 * 16; ; Max look-ahead, rounded up to 32 byte boundary ; copy D + LA bytes from src to dst ; dst is aligned diff --git a/igzip/igzip_compare_types.asm b/igzip/igzip_compare_types.asm index 6a49424..a5fb6cd 100644 --- a/igzip/igzip_compare_types.asm +++ b/igzip/igzip_compare_types.asm @@ -28,6 +28,8 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %include "options.asm" +%include "stdmac.asm" + %ifndef UTILS_ASM %define UTILS_ASM ; compare macro @@ -141,18 +143,18 @@ xor %%result, %%result %%loop1: - movdqu %%xtmp, [%%src1 + %%result] - movdqu %%xtmp2, [%%src2 + %%result] - pcmpeqb %%xtmp, %%xtmp2 - pmovmskb %%tmp32, %%xtmp + MOVDQU %%xtmp, [%%src1 + %%result] + MOVDQU %%xtmp2, [%%src2 + %%result] + PCMPEQB %%xtmp, %%xtmp, %%xtmp2 + PMOVMSKB %%tmp32, %%xtmp xor %%tmp, 0xFFFF jnz %%miscompare add %%result, 16 - movdqu %%xtmp, [%%src1 + %%result] - movdqu %%xtmp2, [%%src2 + %%result] - pcmpeqb %%xtmp, %%xtmp2 - pmovmskb %%tmp32, %%xtmp + MOVDQU %%xtmp, [%%src1 + %%result] + MOVDQU %%xtmp2, [%%src2 + %%result] + PCMPEQB %%xtmp, %%xtmp, %%xtmp2 + PMOVMSKB %%tmp32, %%xtmp xor %%tmp, 0xFFFF jnz %%miscompare add %%result, 16 @@ -195,26 +197,26 @@ %define %%xtmp2 %6 mov %%result, 8 - movdqu %%xtmp, [%%src1 + 8] - movdqu %%xtmp2, [%%src2 + 8] - pcmpeqb %%xtmp, %%xtmp2 - pmovmskb %%tmp32, %%xtmp + MOVDQU %%xtmp, [%%src1 + 8] + MOVDQU %%xtmp2, [%%src2 + 8] + PCMPEQB %%xtmp, %%xtmp, %%xtmp2 + PMOVMSKB %%tmp32, %%xtmp xor %%tmp, 0xFFFF jnz %%miscompare add %%result, 16 %%loop1: - movdqu %%xtmp, [%%src1 + 
%%result] - movdqu %%xtmp2, [%%src2 + %%result] - pcmpeqb %%xtmp, %%xtmp2 - pmovmskb %%tmp32, %%xtmp + MOVDQU %%xtmp, [%%src1 + %%result] + MOVDQU %%xtmp2, [%%src2 + %%result] + PCMPEQB %%xtmp, %%xtmp, %%xtmp2 + PMOVMSKB %%tmp32, %%xtmp xor %%tmp, 0xFFFF jnz %%miscompare add %%result, 16 - movdqu %%xtmp, [%%src1 + %%result] - movdqu %%xtmp2, [%%src2 + %%result] - pcmpeqb %%xtmp, %%xtmp2 - pmovmskb %%tmp32, %%xtmp + MOVDQU %%xtmp, [%%src1 + %%result] + MOVDQU %%xtmp2, [%%src2 + %%result] + PCMPEQB %%xtmp, %%xtmp, %%xtmp2 + PMOVMSKB %%tmp32, %%xtmp xor %%tmp, 0xFFFF jnz %%miscompare add %%result, 16 @@ -222,10 +224,10 @@ cmp %%result, 258 - 16 jb %%loop1 - movdqu %%xtmp, [%%src1 + %%result] - movdqu %%xtmp2, [%%src2 + %%result] - pcmpeqb %%xtmp, %%xtmp2 - pmovmskb %%tmp32, %%xtmp + MOVDQU %%xtmp, [%%src1 + %%result] + MOVDQU %%xtmp2, [%%src2 + %%result] + PCMPEQB %%xtmp, %%xtmp, %%xtmp2 + PMOVMSKB %%tmp32, %%xtmp xor %%tmp, 0xFFFF jnz %%miscompare_last ; no miscompares, return 258 diff --git a/igzip/igzip_file_perf.c b/igzip/igzip_file_perf.c index 4f74faa..4e56f8e 100644 --- a/igzip/igzip_file_perf.c +++ b/igzip/igzip_file_perf.c @@ -35,7 +35,7 @@ #include "test.h" #define BUF_SIZE 1024 -#define MIN_TEST_LOOPS 100 +#define MIN_TEST_LOOPS 8 #ifndef RUN_MEM_SIZE # define RUN_MEM_SIZE 500000000 #endif diff --git a/igzip/igzip_multibinary.asm b/igzip/igzip_multibinary.asm index 8743aea..0b3ebeb 100644 --- a/igzip/igzip_multibinary.asm +++ b/igzip/igzip_multibinary.asm @@ -45,6 +45,7 @@ extern isal_deflate_body_stateless_04 extern isal_deflate_body_base extern isal_deflate_body_01 +extern isal_deflate_body_02 extern isal_deflate_body_04 extern isal_deflate_finish_base extern isal_deflate_finish_01 @@ -70,7 +71,7 @@ mbin_interface isal_deflate_body_stateless mbin_dispatch_init5 isal_deflate_body_stateless, isal_deflate_body_stateless_base, isal_deflate_body_stateless_01, isal_deflate_body_stateless_02, isal_deflate_body_stateless_04 mbin_interface isal_deflate_body 
-mbin_dispatch_init5 isal_deflate_body, isal_deflate_body_base, isal_deflate_body_01, isal_deflate_body_01, isal_deflate_body_04 +mbin_dispatch_init5 isal_deflate_body, isal_deflate_body_base, isal_deflate_body_01, isal_deflate_body_02, isal_deflate_body_04 mbin_interface isal_deflate_finish mbin_dispatch_init5 isal_deflate_finish, isal_deflate_finish_base, isal_deflate_finish_01, isal_deflate_finish_01, isal_deflate_finish_01 diff --git a/igzip/lz0a_const.asm b/igzip/lz0a_const.asm index e788c79..ea12372 100644 --- a/igzip/lz0a_const.asm +++ b/igzip/lz0a_const.asm @@ -29,7 +29,7 @@ %assign K 1024 %assign D HIST_SIZE * K ;; Amount of history -%assign LA 17 * 16 ;; Max look-ahead, rounded up to 32 byte boundary +%assign LA 18 * 16 ;; Max look-ahead, rounded up to 32 byte boundary %assign BSIZE 2*HIST_SIZE*K + LA ;; Nominal buffer size ;; Constants for stateless compression diff --git a/igzip/stdmac.asm b/igzip/stdmac.asm index 4ee1a71..719765c 100644 --- a/igzip/stdmac.asm +++ b/igzip/stdmac.asm @@ -27,7 +27,8 @@ ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-
+%ifndef STDMAC_ASM
+%define STDMAC_ASM
 ;; internal macro used by push_all
 ;; push args L to R
 %macro push_all_ 1-*
@@ -347,3 +348,42 @@ ssc:
 	pand	%%dest, %%src2
 %endif
 %endm
+
+;; PCMPEQB dest, src1, src2
+;; Three-operand packed byte compare for equality (dest = src1 == src2, per byte).
+;; Emits vpcmpeqb when ARCH is 02, 03 or 04; otherwise the two-operand
+;; pcmpeqb, copying src1 into dest first when they are different operands.
+%macro PCMPEQB 3
+%define %%dest %1
+%define %%src1 %2
+%define %%src2 %3
+%if ((ARCH == 02) || (ARCH == 03) || (ARCH == 04))
+	vpcmpeqb	%%dest, %%src1, %%src2
+%else
+%ifidn %%dest, %%src2
+	;; dest is the same operand as src2: copying src1 into dest would
+	;; destroy src2 before it is read.  The compare is commutative, so swap.
+	pcmpeqb	%%dest, %%src1
+%else
+%ifnidn %%dest, %%src1
+	movdqa	%%dest, %%src1
+%endif
+	pcmpeqb	%%dest, %%src2
+%endif
+%endif
+%endm
+
+;; PMOVMSKB dest, src
+;; Byte mask extract: emits vpmovmskb when ARCH is 02, 03 or 04,
+;; otherwise pmovmskb.
+%macro PMOVMSKB 2
+%define %%dest %1
+%define %%src %2
+%if ((ARCH == 02) || (ARCH == 03) || (ARCH == 04))
+	vpmovmskb	%%dest, %%src
+%else
+	pmovmskb	%%dest, %%src
+%endif
+%endm
+
+%endif ;; ifndef STDMAC_ASM
diff --git a/include/igzip_lib.h b/include/igzip_lib.h
index fe3ff3a..4c617e3 100644
--- a/include/igzip_lib.h
+++ b/include/igzip_lib.h
@@ -118,7 +118,7 @@ extern "C" {

 #define IGZIP_K 1024
 #define IGZIP_D (HIST_SIZE * IGZIP_K) /* Amount of history */
-#define IGZIP_LA (17 * 16) /* Max look-ahead, rounded up to 32 byte boundary */
+#define IGZIP_LA (18 * 16) /* Max look-ahead, rounded up to 32 byte boundary */
 #define BSIZE (2*IGZIP_D + IGZIP_LA) /* Nominal buffer size */

 #define HASH_SIZE IGZIP_D