mirror of
https://github.com/intel/isa-l.git
synced 2024-12-13 09:52:56 +01:00
igzip: Modify igzip_body assembly to run to last 16 bytes.
Change-Id: Ib2c688d0b2d7ff5d4fd7b14bb6eea72a7f689cd3 Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
This commit is contained in:
parent
52d974762b
commit
7345490999
@ -38,6 +38,9 @@
|
||||
|
||||
%include "stdmac.asm"
|
||||
|
||||
%define LARGE_MATCH_HASH_REP 1 ; Hash 4 * LARGE_MATCH_HASH_REP elements
|
||||
%define LARGE_MATCH_MIN 264 ; Minimum match size to enter large match emit loop
|
||||
%define MIN_INBUF_PADDING 16
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@ -69,6 +72,7 @@
|
||||
%define curr_data2 r8
|
||||
%define len2 r8
|
||||
%define tmp6 r8
|
||||
%define f_end_i r8
|
||||
|
||||
%define m_bits r9
|
||||
|
||||
@ -76,7 +80,6 @@
|
||||
|
||||
%define m_out_buf r11
|
||||
|
||||
%define f_end_i r12
|
||||
%define dist2 r12
|
||||
%define tmp7 r12
|
||||
%define code4 r12
|
||||
@ -107,9 +110,10 @@
|
||||
|
||||
blen_mem_offset equ 0 ; local variable (8 bytes)
|
||||
f_end_i_mem_offset equ 8
|
||||
gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
|
||||
xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
|
||||
stack_size equ 2*8 + 8*8 + 4*16 + 8
|
||||
inbuf_slop_offset equ 16
|
||||
gpr_save_mem_offset equ 32 ; gpr save area (8*8 bytes)
|
||||
xmm_save_mem_offset equ 32 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
|
||||
stack_size equ 4*8 + 8*8 + 4*16 + 8
|
||||
;;; 8 because stack address is odd multiple of 8 after a function call and
|
||||
;;; we want it aligned to 16 bytes
|
||||
|
||||
@ -197,8 +201,16 @@ isal_deflate_body_ %+ ARCH %+ :
|
||||
mov f_end_i %+ d, [stream + _avail_in]
|
||||
add f_end_i, f_i
|
||||
|
||||
; f_end_i -= LA;
|
||||
sub f_end_i, LA
|
||||
mov qword [rsp + inbuf_slop_offset], MIN_INBUF_PADDING
|
||||
cmp byte [stream + _end_of_stream], 0
|
||||
jnz .default_inbuf_padding
|
||||
cmp byte [stream + _flush], 0
|
||||
jnz .default_inbuf_padding
|
||||
mov qword [rsp + inbuf_slop_offset], LA
|
||||
.default_inbuf_padding:
|
||||
|
||||
; f_end_i -= INBUF_PADDING;
|
||||
sub f_end_i, [rsp + inbuf_slop_offset]
|
||||
mov [rsp + f_end_i_mem_offset], f_end_i
|
||||
; if (f_end_i <= 0) continue;
|
||||
|
||||
@ -340,6 +352,10 @@ isal_deflate_body_ %+ ARCH %+ :
|
||||
;; Setup for updating hash
|
||||
lea tmp3, [f_i + len2 + 1] ; tmp3 <= k
|
||||
|
||||
mov tmp6, [rsp + f_end_i_mem_offset]
|
||||
cmp f_i, tmp6
|
||||
jge .len_dist_lit_huffman_finish
|
||||
|
||||
MOVDQU xdata, [file_start + f_i]
|
||||
mov curr_data, [file_start + f_i]
|
||||
|
||||
@ -356,7 +372,6 @@ isal_deflate_body_ %+ ARCH %+ :
|
||||
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
|
||||
|
||||
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
|
||||
mov f_end_i, [rsp + f_end_i_mem_offset]
|
||||
|
||||
mov curr_data2, curr_data
|
||||
shr curr_data2, 8
|
||||
@ -380,11 +395,21 @@ isal_deflate_body_ %+ ARCH %+ :
|
||||
and hash2 %+ d, hmask1 %+ d
|
||||
|
||||
; continue
|
||||
cmp f_i, f_end_i
|
||||
jl .loop2
|
||||
jmp .input_end
|
||||
jmp .loop2
|
||||
;; encode as dist/len
|
||||
.len_dist_lit_huffman_finish:
|
||||
MOVD hash %+ d, xhash
|
||||
PEXTRD tmp6 %+ d, xhash, 1
|
||||
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
|
||||
add tmp3,1
|
||||
mov [stream + _internal_state_head + 2 * tmp6], tmp3 %+ w
|
||||
add tmp3, 1
|
||||
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
|
||||
|
||||
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
|
||||
jmp .input_end
|
||||
|
||||
align 16
|
||||
.len_dist_huffman_pre:
|
||||
bsf len, len
|
||||
shr len, 3
|
||||
@ -421,12 +446,15 @@ isal_deflate_body_ %+ ARCH %+ :
|
||||
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
|
||||
|
||||
MOVD hmask1 %+ d, xmask
|
||||
|
||||
cmp f_i, [rsp + f_end_i_mem_offset]
|
||||
jge .len_dist_huffman_finish
|
||||
|
||||
MOVDQU xdata, [file_start + f_i]
|
||||
mov curr_data, [file_start + f_i]
|
||||
compute_hash hash, curr_data
|
||||
|
||||
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
|
||||
mov f_end_i, [rsp + f_end_i_mem_offset]
|
||||
|
||||
mov curr_data2, curr_data
|
||||
shr curr_data2, 8
|
||||
@ -450,25 +478,32 @@ isal_deflate_body_ %+ ARCH %+ :
|
||||
and hash2 %+ d, hmask1 %+ d
|
||||
|
||||
; continue
|
||||
cmp f_i, f_end_i
|
||||
jl .loop2
|
||||
jmp .loop2
|
||||
|
||||
.len_dist_huffman_finish:
|
||||
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
|
||||
jmp .input_end
|
||||
|
||||
align 16
|
||||
.write_lit_bits:
|
||||
MOVDQU xdata, [file_start + f_i + 1]
|
||||
mov f_end_i, [rsp + f_end_i_mem_offset]
|
||||
PSRLDQ xdata, 1
|
||||
|
||||
add f_i, 1
|
||||
mov curr_data, [file_start + f_i]
|
||||
cmp f_i, [rsp + f_end_i_mem_offset]
|
||||
jge .write_lit_bits_finish
|
||||
|
||||
MOVQ curr_data, xdata
|
||||
MOVDQU xdata, [file_start + f_i]
|
||||
|
||||
MOVD hash %+ d, xhash
|
||||
|
||||
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf
|
||||
|
||||
PEXTRD hash2 %+ d, xhash, 1
|
||||
jmp .loop2
|
||||
|
||||
; continue
|
||||
cmp f_i, f_end_i
|
||||
jl .loop2
|
||||
.write_lit_bits_finish:
|
||||
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf
|
||||
|
||||
.input_end:
|
||||
mov tmp1, ZSTATE_FLUSH_READ_BUFFER
|
||||
@ -481,7 +516,8 @@ isal_deflate_body_ %+ ARCH %+ :
|
||||
|
||||
.output_end:
|
||||
;; update input buffer
|
||||
add f_end_i, LA
|
||||
mov f_end_i, [rsp + f_end_i_mem_offset]
|
||||
add f_end_i, [rsp + inbuf_slop_offset]
|
||||
mov [stream + _total_in], f_i %+ d
|
||||
add file_start, f_i
|
||||
mov [stream + _next_in], file_start
|
||||
@ -514,26 +550,196 @@ isal_deflate_body_ %+ ARCH %+ :
|
||||
%endif
|
||||
ret
|
||||
|
||||
align 16
|
||||
.compare_loop:
|
||||
MOVD xhash, tmp6 %+ d
|
||||
PINSRD xhash, tmp2 %+ d, 1
|
||||
PAND xhash, xhash, xmask
|
||||
lea tmp2, [tmp1 + dist - 1]
|
||||
|
||||
compare250 tmp1, tmp2, len, tmp3, ytmp0, ytmp1
|
||||
mov len2, [rsp + f_end_i_mem_offset]
|
||||
sub len2, f_i
|
||||
add len2, [rsp + inbuf_slop_offset]
|
||||
add len2, 1
|
||||
|
||||
mov len, 8
|
||||
compare_large tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
|
||||
|
||||
cmp len, 258
|
||||
jle .len_dist_huffman
|
||||
cmp len, LARGE_MATCH_MIN
|
||||
jge .do_emit
|
||||
mov len, 258
|
||||
jmp .len_dist_huffman
|
||||
|
||||
align 16
|
||||
.compare_loop2:
|
||||
lea tmp2, [tmp1 + dist2]
|
||||
add tmp1, 1
|
||||
|
||||
compare250 tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
|
||||
mov len, [rsp + f_end_i_mem_offset]
|
||||
sub len, f_i
|
||||
add len, [rsp + inbuf_slop_offset]
|
||||
|
||||
mov len2, 8
|
||||
compare_large tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
|
||||
|
||||
and curr_data, 0xff
|
||||
get_lit_code curr_data, code3, code_len3, hufftables
|
||||
cmp len2, 258
|
||||
jle .len_dist_lit_huffman
|
||||
cmp len2, LARGE_MATCH_MIN
|
||||
jge .do_emit2
|
||||
mov len2, 258
|
||||
jmp .len_dist_lit_huffman
|
||||
|
||||
align 16
|
||||
.do_emit2:
|
||||
neg dist2
|
||||
|
||||
; get_dist_code(dist2, &code2, &code_len2);
|
||||
get_dist_code dist2, code2, code_len2, hufftables
|
||||
|
||||
; get_len_code(len, &code, &code_len);
|
||||
get_len_code 258, code, rcx, hufftables ;; rcx is code_len
|
||||
|
||||
; code2 <<= code_len
|
||||
; code2 |= code
|
||||
; code_len2 += code_len
|
||||
SHLX code4, code2, rcx
|
||||
or code4, code
|
||||
add code_len2, rcx
|
||||
mov tmp5, rcx
|
||||
|
||||
mov rcx, code_len3
|
||||
SHLX tmp8, code4, rcx
|
||||
or code3, tmp8
|
||||
add rcx, code_len2
|
||||
mov code_len3, rcx
|
||||
|
||||
write_bits m_bits, m_bit_count, code3, code_len3, m_out_buf
|
||||
|
||||
lea tmp3, [f_i + 2] ; tmp3 <= k
|
||||
MOVD tmp2 %+ d, xhash
|
||||
mov [stream + _internal_state_head + 2 * tmp2], tmp3 %+ w
|
||||
add tmp3,1
|
||||
PEXTRD tmp2 %+ d, xhash, 1
|
||||
mov [stream + _internal_state_head + 2 * tmp2], tmp3 %+ w
|
||||
|
||||
add f_i, 258
|
||||
lea len, [len2 - 258]
|
||||
|
||||
jmp .emit_loop
|
||||
|
||||
.do_emit:
|
||||
dec f_i
|
||||
neg dist
|
||||
|
||||
; get_dist_code(dist, &code2, &code_len2);
|
||||
%ifndef LONGER_HUFFTABLE
|
||||
mov tmp3, dist ; since code2 and dist are rbx
|
||||
get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
|
||||
%else
|
||||
get_dist_code dist, code2, code_len2, hufftables
|
||||
%endif
|
||||
; get_len_code(len, &code, &code_len);
|
||||
get_len_code 258, code, rcx, hufftables ;; rcx is code_len
|
||||
|
||||
; code2 <<= code_len
|
||||
; code2 |= code
|
||||
; code_len2 += code_len
|
||||
SHLX code4, code2, rcx
|
||||
or code4, code
|
||||
add code_len2, rcx
|
||||
|
||||
lea tmp3, [f_i + 2] ; tmp3 <= k
|
||||
MOVD tmp6 %+ d, xhash
|
||||
PEXTRD tmp5 %+ d, xhash, 1
|
||||
mov [stream + _internal_state_head + 2 * tmp6], tmp3 %+ w
|
||||
add tmp3,1
|
||||
mov [stream + _internal_state_head + 2 * tmp5], tmp3 %+ w
|
||||
mov tmp5, rcx
|
||||
|
||||
.emit:
|
||||
add f_i, 258
|
||||
sub len, 258
|
||||
mov code3, code4
|
||||
|
||||
write_bits m_bits, m_bit_count, code3, code_len2, m_out_buf
|
||||
|
||||
.emit_loop:
|
||||
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
|
||||
ja .output_end
|
||||
cmp len, LARGE_MATCH_MIN
|
||||
jge .emit
|
||||
|
||||
mov len2, 258
|
||||
cmp len, len2
|
||||
cmovg len, len2
|
||||
|
||||
add f_i, len
|
||||
|
||||
sub code_len2, tmp5
|
||||
get_len_code len, code, rcx, hufftables
|
||||
SHLX code4, code2, rcx
|
||||
or code4, code
|
||||
add code_len2, rcx
|
||||
|
||||
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
|
||||
|
||||
cmp f_i, [rsp + f_end_i_mem_offset]
|
||||
jge .input_end
|
||||
|
||||
lea tmp7, [f_i - 4 * LARGE_MATCH_HASH_REP]
|
||||
MOVD hmask1 %+ d, xmask
|
||||
%rep LARGE_MATCH_HASH_REP
|
||||
mov curr_data %+ d, dword [file_start + tmp7]
|
||||
mov curr_data2 %+ d, dword [file_start + tmp7 + 1]
|
||||
|
||||
compute_hash hash, curr_data
|
||||
compute_hash hash2, curr_data2
|
||||
|
||||
and hash %+ d, hmask1 %+ d
|
||||
and hash2 %+ d, hmask1 %+ d
|
||||
|
||||
mov [stream + _internal_state_head + 2 * hash], tmp7 %+ w
|
||||
add tmp7, 1
|
||||
mov [stream + _internal_state_head + 2 * hash2], tmp7 %+ w
|
||||
add tmp7, 1
|
||||
|
||||
mov curr_data %+ d, dword [file_start + tmp7]
|
||||
mov curr_data2 %+ d, dword [file_start + tmp7 + 1]
|
||||
|
||||
compute_hash hash, curr_data
|
||||
compute_hash hash2, curr_data2
|
||||
|
||||
and hash %+ d, hmask1 %+ d
|
||||
and hash2 %+ d, hmask1 %+ d
|
||||
|
||||
mov [stream + _internal_state_head + 2 * hash], tmp7 %+ w
|
||||
add tmp7, 1
|
||||
mov [stream + _internal_state_head + 2 * hash2], tmp7 %+ w
|
||||
%if (LARGE_MATCH_HASH_REP > 1)
|
||||
add tmp7, 1
|
||||
%endif
|
||||
%endrep
|
||||
|
||||
MOVDQU xdata, [file_start + f_i]
|
||||
mov curr_data, [file_start + f_i]
|
||||
compute_hash hash, curr_data
|
||||
|
||||
|
||||
mov curr_data2, curr_data
|
||||
shr curr_data2, 8
|
||||
compute_hash hash2, curr_data2
|
||||
|
||||
; hash = compute_hash(state->file_start + f_i) & hash_mask;
|
||||
and hash %+ d, hmask1 %+ d
|
||||
and hash2 %+ d, hmask1 %+ d
|
||||
|
||||
; continue
|
||||
jmp .loop2
|
||||
|
||||
.write_first_byte:
|
||||
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
|
||||
ja .output_end
|
||||
|
@ -37,118 +37,90 @@
|
||||
;; sttni2 is faster, but it can't be debugged
|
||||
;; so following code is based on "mine5"
|
||||
|
||||
;; compare 258 bytes = 8 * 32 + 2
|
||||
;; tmp16 is a 16-bit version of tmp
|
||||
;; compare258 src1, src2, result, tmp
|
||||
%macro compare258 4
|
||||
;; compares 8 bytes at a time, using xor
|
||||
;; assumes the input buffer has size at least 8
|
||||
;; compare_r src1, src2, result, result_max, tmp
|
||||
%macro compare_r 5
|
||||
%define %%src1 %1
|
||||
%define %%src2 %2
|
||||
%define %%result %3
|
||||
%define %%tmp %4
|
||||
%define %%tmp16 %4w ; tmp as a 16-bit register
|
||||
%define %%result_max %4
|
||||
%define %%tmp %5
|
||||
%define %%tmp16 %5w ; tmp as a 16-bit register
|
||||
|
||||
sub %%result_max, 16
|
||||
cmp %%result, %%result_max
|
||||
jg %%_by_8
|
||||
|
||||
xor %%result, %%result
|
||||
%%loop1:
|
||||
mov %%tmp, [%%src1 + %%result]
|
||||
xor %%tmp, [%%src2 + %%result]
|
||||
jnz %%miscompare
|
||||
jnz %%miscompare_reg
|
||||
add %%result, 8
|
||||
|
||||
mov %%tmp, [%%src1 + %%result]
|
||||
xor %%tmp, [%%src2 + %%result]
|
||||
jnz %%miscompare
|
||||
jnz %%miscompare_reg
|
||||
add %%result, 8
|
||||
cmp %%result, %%result_max
|
||||
jle %%loop1
|
||||
|
||||
cmp %%result, 256
|
||||
jb %%loop1
|
||||
%%_by_8:
|
||||
add %%result_max, 8
|
||||
cmp %%result, %%result_max
|
||||
jg %%_cmp_last
|
||||
|
||||
; compare last two bytes
|
||||
mov %%tmp16, [%%src1 + %%result]
|
||||
xor %%tmp16, [%%src2 + %%result]
|
||||
jnz %%miscompare16
|
||||
mov %%tmp, [%%src1 + %%result]
|
||||
xor %%tmp, [%%src2 + %%result]
|
||||
jnz %%miscompare_reg
|
||||
add %%result, 8
|
||||
|
||||
; no miscompares, return 258
|
||||
add %%result, 2
|
||||
%%_cmp_last:
|
||||
add %%result_max, 8
|
||||
cmp %%result, %%result_max
|
||||
je %%end
|
||||
|
||||
lea %%result, [%%result_max - 8]
|
||||
|
||||
mov %%tmp, [%%src1 + %%result]
|
||||
xor %%tmp, [%%src2 + %%result]
|
||||
jnz %%miscompare_reg
|
||||
add %%result, 8
|
||||
jmp %%end
|
||||
|
||||
%%miscompare16:
|
||||
and %%tmp, 0xFFFF
|
||||
%%miscompare:
|
||||
%%miscompare_reg:
|
||||
bsf %%tmp, %%tmp
|
||||
shr %%tmp, 3
|
||||
add %%result, %%tmp
|
||||
%%end:
|
||||
%endm
|
||||
|
||||
;; compare up to 258 bytes = 8 * 32 + 2; bytes 0..7 are not compared here
;; (result starts at 8), so the caller must already know they match
|
||||
;; tmp16 is a 16-bit version of tmp
|
||||
;; compare250_r src1, src2, result, tmp
;; on exit result = offset of the first differing byte, or 258 if none differ
;; tmp is clobbered
|
||||
%macro compare250_r 4
|
||||
%define %%src1 %1
|
||||
%define %%src2 %2
|
||||
%define %%result %3
|
||||
%define %%tmp %4
|
||||
%define %%tmp16 %4w ; tmp as a 16-bit register
|
||||
|
||||
mov %%result, 8 ; start at offset 8
|
||||
mov %%tmp, [%%src1 + 8]
|
||||
xor %%tmp, [%%src2 + 8] ; nonzero iff the 8-byte words differ
|
||||
jnz %%miscompare
|
||||
add %%result, 8
|
||||
|
||||
;; main loop: two 8-byte compares per iteration, until result reaches 256
%%loop1:
|
||||
mov %%tmp, [%%src1 + %%result]
|
||||
xor %%tmp, [%%src2 + %%result]
|
||||
jnz %%miscompare
|
||||
add %%result, 8
|
||||
|
||||
mov %%tmp, [%%src1 + %%result]
|
||||
xor %%tmp, [%%src2 + %%result]
|
||||
jnz %%miscompare
|
||||
add %%result, 8
|
||||
|
||||
cmp %%result, 256
|
||||
jb %%loop1
|
||||
|
||||
; compare last two bytes
|
||||
mov %%tmp16, [%%src1 + %%result]
|
||||
xor %%tmp16, [%%src2 + %%result]
|
||||
jnz %%miscompare16
|
||||
|
||||
; no miscompares, return 258
|
||||
add %%result, 2
|
||||
jmp %%end
|
||||
|
||||
%%miscompare16:
|
||||
and %%tmp, 0xFFFF ; keep only the 16 bits that were just compared
|
||||
%%miscompare:
|
||||
bsf %%tmp, %%tmp ; index of the lowest set (differing) bit
|
||||
shr %%tmp, 3 ; bit index -> byte index
|
||||
add %%result, %%tmp ; offset of the first differing byte
|
||||
%%end:
|
||||
%endm
|
||||
|
||||
;; compare 258 bytes = 8 * 32 + 2
|
||||
;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
|
||||
;; compare258_x src1, src2, result, tmp, xtmp1, xtmp2
|
||||
%macro compare258_x 6
|
||||
;; assumes the input buffer has size at least 8
|
||||
;; compare_x src1, src2, result, result_max, tmp, xtmp1, xtmp2
|
||||
%macro compare_x 7
|
||||
%define %%src1 %1
|
||||
%define %%src2 %2
|
||||
%define %%result %3
|
||||
%define %%tmp %4
|
||||
%define %%tmp32 %4d
|
||||
%define %%tmp16 %4w ; tmp as a 16-bit register
|
||||
%define %%xtmp %5
|
||||
%define %%xtmp2 %6
|
||||
%define %%result %3 ; Accumulator for match_length
|
||||
%define %%result_max %4
|
||||
%define %%tmp %5
|
||||
%define %%tmp16 %5w ; tmp as a 16-bit register
|
||||
%define %%tmp32 %5d ; tmp as a 32-bit register
|
||||
%define %%xtmp %6
|
||||
%define %%xtmp2 %7
|
||||
|
||||
sub %%result_max, 32
|
||||
cmp %%result, %%result_max
|
||||
jg %%_by_16
|
||||
|
||||
xor %%result, %%result
|
||||
%%loop1:
|
||||
MOVDQU %%xtmp, [%%src1 + %%result]
|
||||
MOVDQU %%xtmp2, [%%src2 + %%result]
|
||||
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
|
||||
PMOVMSKB %%tmp32, %%xtmp
|
||||
xor %%tmp, 0xFFFF
|
||||
jnz %%miscompare
|
||||
jnz %%miscompare_vect
|
||||
add %%result, 16
|
||||
|
||||
MOVDQU %%xtmp, [%%src1 + %%result]
|
||||
@ -156,120 +128,86 @@
|
||||
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
|
||||
PMOVMSKB %%tmp32, %%xtmp
|
||||
xor %%tmp, 0xFFFF
|
||||
jnz %%miscompare
|
||||
jnz %%miscompare_vect
|
||||
add %%result, 16
|
||||
|
||||
cmp %%result, 256
|
||||
jb %%loop1
|
||||
cmp %%result, %%result_max
|
||||
jle %%loop1
|
||||
|
||||
%%_by_16:
|
||||
add %%result_max, 16
|
||||
cmp %%result, %%result_max
|
||||
jg %%_by_8
|
||||
|
||||
MOVDQU %%xtmp, [%%src1 + %%result]
|
||||
MOVDQU %%xtmp2, [%%src2 + %%result]
|
||||
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
|
||||
PMOVMSKB %%tmp32, %%xtmp
|
||||
xor %%tmp, 0xFFFF
|
||||
jnz %%miscompare_vect
|
||||
add %%result, 16
|
||||
|
||||
%%_by_8:
|
||||
add %%result_max, 8
|
||||
cmp %%result, %%result_max
|
||||
jg %%_cmp_last
|
||||
|
||||
; compare last two bytes
|
||||
mov %%tmp16, [%%src1 + %%result]
|
||||
xor %%tmp16, [%%src2 + %%result]
|
||||
jnz %%miscompare16
|
||||
mov %%tmp, [%%src1 + %%result]
|
||||
xor %%tmp, [%%src2 + %%result]
|
||||
jnz %%miscompare_reg
|
||||
add %%result, 8
|
||||
|
||||
; no miscompares, return 258
|
||||
add %%result, 2
|
||||
%%_cmp_last:
|
||||
add %%result_max, 8
|
||||
cmp %%result, %%result_max
|
||||
je %%end
|
||||
|
||||
lea %%result, [%%result_max - 8]
|
||||
|
||||
mov %%tmp, [%%src1 + %%result]
|
||||
xor %%tmp, [%%src2 + %%result]
|
||||
jnz %%miscompare_reg
|
||||
add %%result, 8
|
||||
jmp %%end
|
||||
|
||||
%%miscompare16:
|
||||
and %%tmp, 0xFFFF
|
||||
%%miscompare_reg:
|
||||
bsf %%tmp, %%tmp
|
||||
shr %%tmp, 3
|
||||
add %%result, %%tmp
|
||||
jmp %%end
|
||||
%%miscompare:
|
||||
|
||||
%%miscompare_vect:
|
||||
bsf %%tmp, %%tmp
|
||||
add %%result, %%tmp
|
||||
%%end:
|
||||
%endm
|
||||
|
||||
;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
|
||||
;; were already checked
|
||||
;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
|
||||
;; compare250_x src1, src2, result, tmp, xtmp1, xtmp2
;; on exit result = offset of the first differing byte, capped at 258
;; tmp, xtmp1 and xtmp2 are clobbered
|
||||
%macro compare250_x 6
|
||||
%define %%src1 %1
|
||||
%define %%src2 %2
|
||||
%define %%result %3
|
||||
%define %%tmp %4
|
||||
%define %%tmp32 %4d ; tmp as a 32-bit register
|
||||
%define %%xtmp %5
|
||||
%define %%xtmp2 %6
|
||||
|
||||
mov %%result, 8 ; start at offset 8
|
||||
MOVDQU %%xtmp, [%%src1 + 8]
|
||||
MOVDQU %%xtmp2, [%%src2 + 8]
|
||||
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
|
||||
PMOVMSKB %%tmp32, %%xtmp ; one mask bit per byte lane
|
||||
xor %%tmp, 0xFFFF ; flip the mask: set bits now mark differing bytes
|
||||
jnz %%miscompare
|
||||
add %%result, 16
|
||||
;; main loop: two 16-byte compares per iteration
%%loop1:
|
||||
MOVDQU %%xtmp, [%%src1 + %%result]
|
||||
MOVDQU %%xtmp2, [%%src2 + %%result]
|
||||
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
|
||||
PMOVMSKB %%tmp32, %%xtmp
|
||||
xor %%tmp, 0xFFFF
|
||||
jnz %%miscompare
|
||||
add %%result, 16
|
||||
|
||||
MOVDQU %%xtmp, [%%src1 + %%result]
|
||||
MOVDQU %%xtmp2, [%%src2 + %%result]
|
||||
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
|
||||
PMOVMSKB %%tmp32, %%xtmp
|
||||
xor %%tmp, 0xFFFF
|
||||
jnz %%miscompare
|
||||
add %%result, 16
|
||||
|
||||
cmp %%result, 258 - 16
|
||||
jb %%loop1
|
||||
|
||||
;; final 16-byte compare; its window can extend past byte 258,
;; which is why a mismatch found here is clamped below
MOVDQU %%xtmp, [%%src1 + %%result]
|
||||
MOVDQU %%xtmp2, [%%src2 + %%result]
|
||||
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
|
||||
PMOVMSKB %%tmp32, %%xtmp
|
||||
xor %%tmp, 0xFFFF
|
||||
jnz %%miscompare_last
|
||||
; no miscompares, return 258
|
||||
mov %%result, 258
|
||||
jmp %%end
|
||||
|
||||
%%miscompare_last:
|
||||
bsf %%tmp, %%tmp ; lowest set bit = index of first differing byte
|
||||
add %%result, %%tmp
|
||||
|
||||
;; Guarantee the result has length at most 258.
|
||||
mov %%tmp, 258
|
||||
cmp %%result, 258
|
||||
cmova %%result, %%tmp
|
||||
jmp %%end
|
||||
%%miscompare:
|
||||
bsf %%tmp, %%tmp ; lowest set bit = index of first differing byte
|
||||
add %%result, %%tmp
|
||||
%%end:
|
||||
%endm
|
||||
|
||||
;; compare 258 bytes = 8 * 32 + 2
|
||||
;; compares 32 bytes at a time, using pcmpeqb/pmovmskb
|
||||
;; compare258_y src1, src2, result, tmp, xtmp1, xtmp2
|
||||
%macro compare258_y 6
|
||||
;; assumes the input buffer has size at least 8
|
||||
;; compare_y src1, src2, result, result_max, tmp, xtmp1, xtmp2
|
||||
%macro compare_y 7
|
||||
%define %%src1 %1
|
||||
%define %%src2 %2
|
||||
%define %%result %3
|
||||
%define %%tmp %4
|
||||
%define %%tmp16 %4w ; tmp as a 16-bit register
|
||||
%define %%tmp32 %4d ; tmp as a 32-bit register
|
||||
%define %%ytmp %5
|
||||
%define %%ytmp2 %6
|
||||
%define %%result %3 ; Accumulator for match_length
|
||||
%define %%result_max %4
|
||||
%define %%tmp %5
|
||||
%define %%tmp16 %5w ; tmp as a 16-bit register
|
||||
%define %%tmp32 %5d ; tmp as a 32-bit register
|
||||
%define %%ytmp %6
|
||||
%define %%ytmp2 %7
|
||||
|
||||
sub %%result_max, 64
|
||||
cmp %%result, %%result_max
|
||||
jg %%_by_32
|
||||
|
||||
xor %%result, %%result
|
||||
%%loop1:
|
||||
vmovdqu %%ytmp, [%%src1 + %%result]
|
||||
vmovdqu %%ytmp2, [%%src2 + %%result]
|
||||
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
|
||||
vpmovmskb %%tmp, %%ytmp
|
||||
xor %%tmp32, 0xFFFFFFFF
|
||||
jnz %%miscompare
|
||||
jnz %%miscompare_vect
|
||||
add %%result, 32
|
||||
|
||||
vmovdqu %%ytmp, [%%src1 + %%result]
|
||||
@ -277,123 +215,125 @@
|
||||
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
|
||||
vpmovmskb %%tmp, %%ytmp
|
||||
xor %%tmp32, 0xFFFFFFFF
|
||||
jnz %%miscompare
|
||||
jnz %%miscompare_vect
|
||||
add %%result, 32
|
||||
|
||||
cmp %%result, 256
|
||||
jb %%loop1
|
||||
cmp %%result, %%result_max
|
||||
jle %%loop1
|
||||
|
||||
%%_by_32:
|
||||
add %%result_max, 32
|
||||
cmp %%result, %%result_max
|
||||
jg %%_by_16
|
||||
|
||||
vmovdqu %%ytmp, [%%src1 + %%result]
|
||||
vmovdqu %%ytmp2, [%%src2 + %%result]
|
||||
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
|
||||
vpmovmskb %%tmp, %%ytmp
|
||||
xor %%tmp32, 0xFFFFFFFF
|
||||
jnz %%miscompare_vect
|
||||
add %%result, 32
|
||||
|
||||
%%_by_16:
|
||||
add %%result_max, 16
|
||||
cmp %%result, %%result_max
|
||||
jg %%_by_8
|
||||
|
||||
vmovdqu %%ytmp %+ x, [%%src1 + %%result]
|
||||
vmovdqu %%ytmp2 %+ x, [%%src2 + %%result]
|
||||
vpcmpeqb %%ytmp %+ x, %%ytmp %+ x, %%ytmp2 %+ x
|
||||
vpmovmskb %%tmp, %%ytmp %+ x
|
||||
xor %%tmp32, 0xFFFF
|
||||
jnz %%miscompare_vect
|
||||
add %%result, 16
|
||||
|
||||
%%_by_8:
|
||||
add %%result_max, 8
|
||||
cmp %%result, %%result_max
|
||||
jg %%_cmp_last
|
||||
|
||||
mov %%tmp, [%%src1 + %%result]
|
||||
xor %%tmp, [%%src2 + %%result]
|
||||
jnz %%miscompare_reg
|
||||
add %%result, 8
|
||||
|
||||
%%_cmp_last:
|
||||
add %%result_max, 8
|
||||
cmp %%result, %%result_max
|
||||
je %%end
|
||||
|
||||
lea %%result, [%%result_max - 8]
|
||||
|
||||
; compare last two bytes
|
||||
mov %%tmp16, [%%src1 + %%result]
|
||||
xor %%tmp16, [%%src2 + %%result]
|
||||
jnz %%miscompare16
|
||||
|
||||
; no miscompares, return 258
|
||||
add %%result, 2
|
||||
mov %%tmp, [%%src1 + %%result]
|
||||
xor %%tmp, [%%src2 + %%result]
|
||||
jnz %%miscompare_reg
|
||||
add %%result, 8
|
||||
jmp %%end
|
||||
|
||||
%%miscompare16:
|
||||
and %%tmp, 0xFFFF
|
||||
%%miscompare_reg:
|
||||
bsf %%tmp, %%tmp
|
||||
shr %%tmp, 3
|
||||
add %%result, %%tmp
|
||||
jmp %%end
|
||||
%%miscompare:
|
||||
|
||||
%%miscompare_vect:
|
||||
bsf %%tmp, %%tmp
|
||||
add %%result, %%tmp
|
||||
%%end:
|
||||
%endm
|
||||
|
||||
|
||||
;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
|
||||
;; were already checked
|
||||
;; compares 32 bytes at a time, using pcmpeqb/pmovmskb
|
||||
;; compare250_y src1, src2, result, tmp, xtmp1, xtmp2
|
||||
%macro compare250_y 6
|
||||
%macro compare250 7
|
||||
%define %%src1 %1
|
||||
%define %%src2 %2
|
||||
%define %%result %3
|
||||
%define %%tmp %4
|
||||
%define %%tmp16 %4w ; tmp as a 16-bit register
|
||||
%define %%tmp32 %4d ; tmp as a 32-bit register
|
||||
%define %%ytmp %5
|
||||
%define %%ytmp2 %6
|
||||
%define %%result_max %4
|
||||
%define %%tmp %5
|
||||
%define %%xtmp0 %6x
|
||||
%define %%xtmp1 %7x
|
||||
%define %%ytmp0 %6
|
||||
%define %%ytmp1 %7
|
||||
|
||||
mov %%result, 8
|
||||
vmovdqu %%ytmp, [%%src1 + 8]
|
||||
vmovdqu %%ytmp2, [%%src2 + 8]
|
||||
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
|
||||
vpmovmskb %%tmp, %%ytmp
|
||||
xor %%tmp32, 0xFFFFFFFF
|
||||
jnz %%miscompare
|
||||
add %%result, 32
|
||||
%%loop1:
|
||||
vmovdqu %%ytmp, [%%src1 + %%result]
|
||||
vmovdqu %%ytmp2, [%%src2 + %%result]
|
||||
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
|
||||
vpmovmskb %%tmp, %%ytmp
|
||||
xor %%tmp32, 0xFFFFFFFF
|
||||
jnz %%miscompare
|
||||
add %%result, 32
|
||||
|
||||
vmovdqu %%ytmp, [%%src1 + %%result]
|
||||
vmovdqu %%ytmp2, [%%src2 + %%result]
|
||||
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
|
||||
vpmovmskb %%tmp, %%ytmp
|
||||
xor %%tmp32, 0xFFFFFFFF
|
||||
jnz %%miscompare
|
||||
add %%result, 32
|
||||
|
||||
cmp %%result, 258 - 32
|
||||
jb %%loop1
|
||||
|
||||
vmovdqu %%ytmp, [%%src1 + %%result]
|
||||
vmovdqu %%ytmp2, [%%src2 + %%result]
|
||||
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
|
||||
vpmovmskb %%tmp, %%ytmp
|
||||
xor %%tmp32, 0xFFFFFFFF
|
||||
jnz %%miscompare_last
|
||||
mov %%result, 258
|
||||
jmp %%end
|
||||
|
||||
%%miscompare_last:
|
||||
bsf %%tmp, %%tmp
|
||||
add %%result, %%tmp
|
||||
|
||||
;; Guarantee the result has length at most 258.
|
||||
mov %%tmp, 258
|
||||
cmp %%result, 258
|
||||
cmova %%result, %%tmp
|
||||
jmp %%end
|
||||
|
||||
%%miscompare:
|
||||
bsf %%tmp, %%tmp
|
||||
add %%result, %%tmp
|
||||
%%end:
|
||||
%endm
|
||||
|
||||
%macro compare250 6
|
||||
%define %%src1 %1
|
||||
%define %%src2 %2
|
||||
%define %%result %3
|
||||
%define %%tmp %4
|
||||
%define %%xtmp0 %5x
|
||||
%define %%xtmp1 %6x
|
||||
%define %%ytmp0 %5
|
||||
%define %%ytmp1 %6
|
||||
mov %%tmp, 250
|
||||
cmp %%result_max, 250
|
||||
cmovg %%result_max, %%tmp
|
||||
|
||||
%if (COMPARE_TYPE == 1)
|
||||
compare250_r %%src1, %%src2, %%result, %%tmp
|
||||
compare_r %%src1, %%src2, %%result, %%result_max, %%tmp
|
||||
%elif (COMPARE_TYPE == 2)
|
||||
compare250_x %%src1, %%src2, %%result, %%tmp, %%xtmp0, %%xtmp1
|
||||
compare_x %%src1, %%src2, %%result, %%result_max, %%tmp, %%xtmp0, %%xtmp1
|
||||
%elif (COMPARE_TYPE == 3)
|
||||
compare250_y %%src1, %%src2, %%result, %%tmp, %%ytmp0, %%ytmp1
|
||||
compare_y %%src1, %%src2, %%result, %%result_max, %%tmp, %%ytmp0, %%ytmp1
|
||||
%else
|
||||
%error Unknown Compare type COMPARE_TYPE
|
||||
% error
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; Assumes the buffer has at least 8 bytes
|
||||
; Accumulates match length onto result
|
||||
%macro compare_large 7
|
||||
%define %%src1 %1
|
||||
%define %%src2 %2
|
||||
%define %%result %3
|
||||
%define %%result_max %4
|
||||
%define %%tmp %5
|
||||
%define %%xtmp0 %6x
|
||||
%define %%xtmp1 %7x
|
||||
%define %%ytmp0 %6
|
||||
%define %%ytmp1 %7
|
||||
|
||||
%if (COMPARE_TYPE == 1)
|
||||
compare_r %%src1, %%src2, %%result, %%result_max, %%tmp
|
||||
%elif (COMPARE_TYPE == 2)
|
||||
compare_x %%src1, %%src2, %%result, %%result_max, %%tmp, %%xtmp0, %%xtmp1
|
||||
%elif (COMPARE_TYPE == 3)
|
||||
compare_y %%src1, %%src2, %%result, %%result_max, %%tmp, %%ytmp0, %%ytmp1
|
||||
%else
|
||||
%error Unknown Compare type COMPARE_TYPE
|
||||
% error
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
@ -47,6 +47,9 @@ global %1
|
||||
%endm
|
||||
%endif
|
||||
|
||||
%define LARGE_MATCH_HASH_REP 1 ; Hash 4 * LARGE_MATCH_HASH_REP elements
|
||||
%define LARGE_MATCH_MIN 264 ; Minimum match size to enter large match emit loop
|
||||
%define MIN_INBUF_PADDING 16
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@ -76,6 +79,7 @@ global %1
|
||||
%define len2 r8
|
||||
%define tmp4 r8
|
||||
%define hmask1 r8
|
||||
%define len_code2 r8
|
||||
|
||||
%define len rdx
|
||||
%define len_code rdx
|
||||
@ -110,9 +114,10 @@ dist_mask_offset equ 16
|
||||
hash_mask_offset equ 24
|
||||
f_end_i_mem_offset equ 32
|
||||
stream_offset equ 40
|
||||
gpr_save_mem_offset equ 48 ; gpr save area (8*8 bytes)
|
||||
inbuf_slop_offset equ 48
|
||||
gpr_save_mem_offset equ 64 ; gpr save area (8*8 bytes)
|
||||
xmm_save_mem_offset equ gpr_save_mem_offset + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
|
||||
stack_size equ 7*8 + 8*8 + 4*16
|
||||
stack_size equ 9*8 + 8*8 + 4*16
|
||||
|
||||
;;; 8 because stack address is odd multiple of 8 after a function call and
|
||||
;;; we want it aligned to 16 bytes
|
||||
@ -208,8 +213,16 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
mov file_length %+ d, [stream + _avail_in]
|
||||
add file_length, f_i
|
||||
|
||||
; file_length -= LA;
|
||||
sub file_length, LA
|
||||
mov qword [rsp + inbuf_slop_offset], MIN_INBUF_PADDING
|
||||
cmp byte [stream + _end_of_stream], 0
|
||||
jnz .default_inbuf_padding
|
||||
cmp byte [stream + _flush], 0
|
||||
jnz .default_inbuf_padding
|
||||
mov qword [rsp + inbuf_slop_offset], LA
|
||||
.default_inbuf_padding:
|
||||
|
||||
; file_length -= INBUF_PADDING;
|
||||
sub file_length, [rsp + inbuf_slop_offset]
|
||||
; if (file_length <= 0) continue;
|
||||
mov hmask1 %+ d, [rsp + hash_mask_offset]
|
||||
|
||||
@ -220,7 +233,6 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
MOVDQU xdata, [file_start + f_i]
|
||||
mov curr_data, [file_start + f_i]
|
||||
mov tmp1, curr_data
|
||||
mov tmp2, curr_data
|
||||
|
||||
compute_hash hash, curr_data
|
||||
|
||||
@ -295,6 +307,7 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
test len %+ d, 0xFFFFFFFF
|
||||
jz .len_dist_huffman_pre
|
||||
|
||||
PSRLDQ xdata, 1
|
||||
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code]
|
||||
movzx lit_code2, curr_data %+ b
|
||||
;; Check for len/dist match for second literal
|
||||
@ -318,9 +331,15 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
;; Setup for updating hash
|
||||
lea tmp3, [f_i + 1] ; tmp3 <= k
|
||||
|
||||
mov tmp2, f_i
|
||||
add file_start, f_i
|
||||
add f_i, len2
|
||||
cmp f_i, file_length
|
||||
jg .len_dist_lit_huffman_finish
|
||||
|
||||
MOVDQU xdata, [file_start + len2]
|
||||
mov tmp1, [file_start + len2]
|
||||
sub file_start, tmp2
|
||||
|
||||
shr curr_data, 24
|
||||
compute_hash hash3, curr_data
|
||||
@ -329,9 +348,6 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
mov curr_data, tmp1
|
||||
shr tmp1, 8
|
||||
|
||||
sub file_start, f_i
|
||||
add f_i, len2
|
||||
|
||||
mov [hash_table + 2 * hash], tmp3 %+ w
|
||||
|
||||
compute_hash hash, curr_data
|
||||
@ -361,10 +377,28 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
and hash2 %+ d, hmask3 %+ d
|
||||
|
||||
; continue
|
||||
cmp f_i, file_length
|
||||
jl .loop2
|
||||
jmp .loop2
|
||||
|
||||
.len_dist_lit_huffman_finish:
|
||||
sub file_start, tmp2
|
||||
|
||||
mov [hash_table + 2 * hash], tmp3 %+ w
|
||||
add tmp3,1
|
||||
mov [hash_table + 2 * hash2], tmp3 %+ w
|
||||
|
||||
add dist_code2, 254
|
||||
add dist_code2, len2
|
||||
|
||||
inc dword [lit_len_hist + HIST_ELEM_SIZE*(len2 + 254)]
|
||||
|
||||
movnti dword [m_out_buf + 4], dist_code2 %+ d
|
||||
add m_out_buf, 8
|
||||
|
||||
shr dist_code2, DIST_OFFSET
|
||||
and dist_code2, 0x1F
|
||||
inc dword [dist_hist + HIST_ELEM_SIZE*dist_code2]
|
||||
|
||||
jmp .input_end
|
||||
;; encode as dist/len
|
||||
|
||||
.len_dist_huffman_pre:
|
||||
bsf len, len
|
||||
@ -380,14 +414,21 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
; get_dist_code(dist, &code2, &code_len2);
|
||||
get_dist_icf_code dist, dist_code, tmp1
|
||||
|
||||
.len_dist_huffman_skip:
|
||||
|
||||
mov hmask2 %+ d, [rsp + hash_mask_offset]
|
||||
|
||||
mov tmp1, f_i
|
||||
add file_start, f_i
|
||||
|
||||
add f_i, len
|
||||
cmp f_i, file_length
|
||||
jg .len_dist_huffman_finish
|
||||
|
||||
MOVDQU xdata, [file_start + len]
|
||||
mov curr_data2, [file_start + len]
|
||||
mov curr_data, curr_data2
|
||||
sub file_start, f_i
|
||||
add f_i, len
|
||||
sub file_start, tmp1
|
||||
; get_len_code(len, &code, &code_len);
|
||||
lea len_code, [len + 254]
|
||||
or dist_code, len_code
|
||||
@ -415,15 +456,39 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
and hash2 %+ d, hmask2 %+ d
|
||||
|
||||
; continue
|
||||
cmp f_i, file_length
|
||||
jl .loop2
|
||||
jmp .loop2
|
||||
|
||||
.len_dist_huffman_finish:
|
||||
sub file_start, tmp1
|
||||
|
||||
; get_len_code(len, &code, &code_len);
|
||||
lea len_code, [len + 254]
|
||||
or dist_code, len_code
|
||||
|
||||
mov [hash_table + 2 * hash], tmp3 %+ w
|
||||
add tmp3,1
|
||||
mov [hash_table + 2 * hash2], tmp3 %+ w
|
||||
|
||||
inc dword [lit_len_hist + HIST_ELEM_SIZE*len_code]
|
||||
|
||||
movnti dword [m_out_buf], dist_code %+ d
|
||||
add m_out_buf, 4
|
||||
|
||||
shr dist_code, DIST_OFFSET
|
||||
and dist_code, 0x1F
|
||||
inc dword [dist_hist + HIST_ELEM_SIZE*dist_code]
|
||||
|
||||
jmp .input_end
|
||||
|
||||
.write_lit_bits:
|
||||
MOVDQU xdata, [file_start + f_i + 1]
|
||||
add f_i, 1
|
||||
MOVQ curr_data, xdata
|
||||
|
||||
add f_i, 1
|
||||
cmp f_i, file_length
|
||||
jg .write_lit_bits_finish
|
||||
|
||||
MOVDQU xdata, [file_start + f_i]
|
||||
|
||||
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2]
|
||||
|
||||
shl lit_code2, DIST_OFFSET
|
||||
@ -432,9 +497,16 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
movnti dword [m_out_buf], lit_code %+ d
|
||||
add m_out_buf, 4
|
||||
|
||||
; continue
|
||||
cmp f_i, file_length
|
||||
jl .loop2
|
||||
jmp .loop2
|
||||
|
||||
.write_lit_bits_finish:
|
||||
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2]
|
||||
|
||||
shl lit_code2, DIST_OFFSET
|
||||
lea lit_code, [lit_code + lit_code2 + (31 << DIST_OFFSET)]
|
||||
|
||||
movnti dword [m_out_buf], lit_code %+ d
|
||||
add m_out_buf, 4
|
||||
|
||||
.input_end:
|
||||
mov stream, [rsp + stream_offset]
|
||||
@ -454,7 +526,7 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
|
||||
.end:
|
||||
;; update input buffer
|
||||
add file_length, LA
|
||||
add file_length, [rsp + inbuf_slop_offset]
|
||||
mov [stream + _total_in], f_i %+ d
|
||||
mov [stream + _internal_state_block_end], f_i %+ d
|
||||
add file_start, f_i
|
||||
@ -487,21 +559,143 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
|
||||
.compare_loop:
|
||||
lea tmp2, [tmp1 + dist - 1]
|
||||
|
||||
compare250 tmp1, tmp2, len, tmp3, ytmp0, ytmp1
|
||||
mov len2, file_length
|
||||
sub len2, f_i
|
||||
add len2, [rsp + inbuf_slop_offset]
|
||||
add len2, 1
|
||||
|
||||
mov len, 8
|
||||
compare_large tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
|
||||
|
||||
cmp len, 258
|
||||
jle .len_dist_huffman
|
||||
cmp len, LARGE_MATCH_MIN
|
||||
jge .do_emit
|
||||
mov len, 258
|
||||
jmp .len_dist_huffman
|
||||
|
||||
.compare_loop2:
|
||||
lea tmp2, [tmp1 + dist2]
|
||||
add tmp1, 1
|
||||
|
||||
compare250 tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
|
||||
mov len, file_length
|
||||
sub len, f_i
|
||||
add len, [rsp + inbuf_slop_offset]
|
||||
mov len2, 8
|
||||
compare_large tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
|
||||
|
||||
movzx lit_code, curr_data %+ b
|
||||
shr curr_data, 8
|
||||
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code]
|
||||
cmp len2, 258
|
||||
jle .len_dist_lit_huffman
|
||||
cmp len2, LARGE_MATCH_MIN
|
||||
jge .do_emit2
|
||||
mov len2, 258
|
||||
jmp .len_dist_lit_huffman
|
||||
|
||||
.do_emit2:
|
||||
or lit_code, LIT
|
||||
movnti dword [m_out_buf], lit_code %+ d
|
||||
add m_out_buf, 4
|
||||
|
||||
inc f_i
|
||||
mov dist, dist2
|
||||
mov len, len2
|
||||
|
||||
.do_emit:
|
||||
neg dist
|
||||
get_dist_icf_code dist, dist_code, tmp1
|
||||
|
||||
mov len_code2, 258 + 254
|
||||
or len_code2, dist_code
|
||||
mov tmp1, dist_code
|
||||
shr tmp1, DIST_OFFSET
|
||||
and tmp1, 0x1F
|
||||
lea tmp3, [f_i + 1]
|
||||
dec f_i
|
||||
|
||||
mov [hash_table + 2 * hash], tmp3 %+ w
|
||||
add tmp3,1
|
||||
mov [hash_table + 2 * hash2], tmp3 %+ w
|
||||
.emit:
|
||||
sub len, 258
|
||||
add f_i, 258
|
||||
|
||||
inc dword [lit_len_hist + HIST_ELEM_SIZE*(258 + 254)]
|
||||
inc dword [dist_hist + HIST_ELEM_SIZE*tmp1]
|
||||
movnti dword [m_out_buf], len_code2 %+ d
|
||||
add m_out_buf, 4
|
||||
|
||||
cmp m_out_buf, [rsp + m_out_end]
|
||||
ja .output_end
|
||||
|
||||
cmp len, LARGE_MATCH_MIN
|
||||
jge .emit
|
||||
|
||||
mov len2, 258
|
||||
cmp len, len2
|
||||
cmovg len, len2
|
||||
|
||||
; get_len_code(len, &code, &code_len);
|
||||
add f_i, len
|
||||
lea len_code, [len + 254]
|
||||
or dist_code, len_code
|
||||
|
||||
inc dword [lit_len_hist + HIST_ELEM_SIZE*len_code]
|
||||
inc dword [dist_hist + HIST_ELEM_SIZE*tmp1]
|
||||
|
||||
movnti dword [m_out_buf], dist_code %+ d
|
||||
add m_out_buf, 4
|
||||
|
||||
cmp file_length, f_i
|
||||
jle .input_end
|
||||
|
||||
lea tmp2, [f_i - 4 * LARGE_MATCH_HASH_REP]
|
||||
mov hmask2 %+ d, [rsp + hash_mask_offset]
|
||||
|
||||
%rep LARGE_MATCH_HASH_REP
|
||||
mov curr_data %+ d, dword [file_start + tmp2]
|
||||
mov curr_data2 %+ d, dword [file_start + tmp2 + 1]
|
||||
mov tmp3 %+ d, dword [file_start + tmp2 + 2]
|
||||
mov tmp1 %+ d, dword [file_start + tmp2 + 3]
|
||||
|
||||
compute_hash hash, curr_data
|
||||
compute_hash hash2, curr_data2
|
||||
compute_hash hash3, tmp3
|
||||
compute_hash hmask3, tmp1
|
||||
|
||||
and hash %+ d, hmask2 %+ d
|
||||
and hash2 %+ d, hmask2 %+ d
|
||||
and hash3 %+ d, hmask2 %+ d
|
||||
and hmask3 %+ d, hmask2 %+ d
|
||||
|
||||
mov [hash_table + 2 * hash], tmp2 %+ w
|
||||
add tmp2, 1
|
||||
mov [hash_table + 2 * hash2], tmp2 %+ w
|
||||
add tmp2, 1
|
||||
mov [hash_table + 2 * hash3], tmp2 %+ w
|
||||
add tmp2, 1
|
||||
mov [hash_table + 2 * hmask3], tmp2 %+ w
|
||||
%if (LARGE_MATCH_HASH_REP > 1)
|
||||
add tmp2, 1
|
||||
%endif
|
||||
%endrep
|
||||
; for (f_i = f_start_i; f_i < file_length; f_i++) {
|
||||
MOVDQU xdata, [file_start + f_i]
|
||||
mov curr_data, [file_start + f_i]
|
||||
mov tmp1, curr_data
|
||||
|
||||
compute_hash hash, curr_data
|
||||
|
||||
shr tmp1, 8
|
||||
compute_hash hash2, tmp1
|
||||
|
||||
and hash, hmask2
|
||||
and hash2, hmask2
|
||||
|
||||
jmp .loop2
|
||||
|
||||
.write_first_byte:
|
||||
mov hmask1 %+ d, [rsp + hash_mask_offset]
|
||||
cmp m_out_buf, [rsp + m_out_end]
|
||||
|
@ -545,7 +545,9 @@ compare_loop:
|
||||
and hash2 %+ d, LVL0_HASH_MASK
|
||||
lea tmp2, [tmp1 + dist - 1]
|
||||
|
||||
compare250 tmp1, tmp2, len, tmp3, ytmp0, ytmp1
|
||||
mov len2, 250
|
||||
mov len, 8
|
||||
compare250 tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
|
||||
|
||||
lea tmp3, [f_i + 1]
|
||||
jmp len_dist_huffman
|
||||
@ -554,7 +556,9 @@ compare_loop2:
|
||||
add tmp1, 1
|
||||
lea tmp2, [tmp1 + dist2 - 1]
|
||||
|
||||
compare250 tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
|
||||
mov len, 250
|
||||
mov len2, 8
|
||||
compare250 tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
|
||||
|
||||
and curr_data, 0xff
|
||||
inc qword [histogram + _lit_len_offset + 8 * curr_data]
|
||||
|
Loading…
Reference in New Issue
Block a user