igzip: Modify igzip_body assembly to run to last 16 bytes.

Change-Id: Ib2c688d0b2d7ff5d4fd7b14bb6eea72a7f689cd3
Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
Roy Oursler 2018-07-27 17:26:13 -04:00
parent 52d974762b
commit 7345490999
4 changed files with 645 additions and 301 deletions

View File

@ -38,6 +38,9 @@
%include "stdmac.asm"
%define LARGE_MATCH_HASH_REP 1 ; Hash 4 * LARGE_MATCH_HASH_REP elements
%define LARGE_MATCH_MIN 264 ; Minimum match size to enter large match emit loop
%define MIN_INBUF_PADDING 16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -69,6 +72,7 @@
%define curr_data2 r8
%define len2 r8
%define tmp6 r8
%define f_end_i r8
%define m_bits r9
@ -76,7 +80,6 @@
%define m_out_buf r11
%define f_end_i r12
%define dist2 r12
%define tmp7 r12
%define code4 r12
@ -107,9 +110,10 @@
blen_mem_offset equ 0 ; local variable (8 bytes)
f_end_i_mem_offset equ 8
gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
stack_size equ 2*8 + 8*8 + 4*16 + 8
inbuf_slop_offset equ 16
gpr_save_mem_offset equ 32 ; gpr save area (8*8 bytes)
xmm_save_mem_offset equ 32 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
stack_size equ 4*8 + 8*8 + 4*16 + 8
;;; 8 because stack address is odd multiple of 8 after a function call and
;;; we want it aligned to 16 bytes
@ -197,8 +201,16 @@ isal_deflate_body_ %+ ARCH %+ :
mov f_end_i %+ d, [stream + _avail_in]
add f_end_i, f_i
; f_end_i -= LA;
sub f_end_i, LA
mov qword [rsp + inbuf_slop_offset], MIN_INBUF_PADDING
cmp byte [stream + _end_of_stream], 0
jnz .default_inbuf_padding
cmp byte [stream + _flush], 0
jnz .default_inbuf_padding
mov qword [rsp + inbuf_slop_offset], LA
.default_inbuf_padding:
; f_end_i -= INBUF_PADDING;
sub f_end_i, [rsp + inbuf_slop_offset]
mov [rsp + f_end_i_mem_offset], f_end_i
; if (f_end_i <= 0) continue;
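(For reference, a minimal C sketch of the padding selection above; the field and constant names are illustrative and LA's value is a placeholder taken from the igzip options, not this diff. When end_of_stream or flush is set the loop may run up to the last MIN_INBUF_PADDING = 16 bytes of the input; otherwise the full look-ahead LA is held back because more input is still expected.)

    #include <stdint.h>

    #define MIN_INBUF_PADDING 16
    #define LA 288   /* placeholder value; the real LA is defined in the igzip options */

    /* returns how many trailing input bytes the main loop must not consume */
    static uint64_t choose_inbuf_slop(int end_of_stream, int flush)
    {
            uint64_t slop = MIN_INBUF_PADDING;   /* default: stop 16 bytes early */

            if (!end_of_stream && !flush)
                    slop = LA;                   /* more data coming: keep full look-ahead */
            return slop;
    }

    /* f_end_i = f_i + avail_in - choose_inbuf_slop(...); the loop runs while f_i < f_end_i */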
@ -340,6 +352,10 @@ isal_deflate_body_ %+ ARCH %+ :
;; Setup for updating hash
lea tmp3, [f_i + len2 + 1] ; tmp3 <= k
mov tmp6, [rsp + f_end_i_mem_offset]
cmp f_i, tmp6
jge .len_dist_lit_huffman_finish
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
@ -356,7 +372,6 @@ isal_deflate_body_ %+ ARCH %+ :
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
mov f_end_i, [rsp + f_end_i_mem_offset]
mov curr_data2, curr_data
shr curr_data2, 8
@ -380,11 +395,21 @@ isal_deflate_body_ %+ ARCH %+ :
and hash2 %+ d, hmask1 %+ d
; continue
cmp f_i, f_end_i
jl .loop2
jmp .input_end
jmp .loop2
;; encode as dist/len
.len_dist_lit_huffman_finish:
MOVD hash %+ d, xhash
PEXTRD tmp6 %+ d, xhash, 1
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
add tmp3,1
mov [stream + _internal_state_head + 2 * tmp6], tmp3 %+ w
add tmp3, 1
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
jmp .input_end
align 16
.len_dist_huffman_pre:
bsf len, len
shr len, 3
@ -421,12 +446,15 @@ isal_deflate_body_ %+ ARCH %+ :
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
MOVD hmask1 %+ d, xmask
cmp f_i, [rsp + f_end_i_mem_offset]
jge .len_dist_huffman_finish
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
compute_hash hash, curr_data
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
mov f_end_i, [rsp + f_end_i_mem_offset]
mov curr_data2, curr_data
shr curr_data2, 8
@ -450,25 +478,32 @@ isal_deflate_body_ %+ ARCH %+ :
and hash2 %+ d, hmask1 %+ d
; continue
cmp f_i, f_end_i
jl .loop2
jmp .loop2
.len_dist_huffman_finish:
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
jmp .input_end
align 16
.write_lit_bits:
MOVDQU xdata, [file_start + f_i + 1]
mov f_end_i, [rsp + f_end_i_mem_offset]
PSRLDQ xdata, 1
add f_i, 1
mov curr_data, [file_start + f_i]
cmp f_i, [rsp + f_end_i_mem_offset]
jge .write_lit_bits_finish
MOVQ curr_data, xdata
MOVDQU xdata, [file_start + f_i]
MOVD hash %+ d, xhash
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf
PEXTRD hash2 %+ d, xhash, 1
jmp .loop2
; continue
cmp f_i, f_end_i
jl .loop2
.write_lit_bits_finish:
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf
.input_end:
mov tmp1, ZSTATE_FLUSH_READ_BUFFER
@ -481,7 +516,8 @@ isal_deflate_body_ %+ ARCH %+ :
.output_end:
;; update input buffer
add f_end_i, LA
mov f_end_i, [rsp + f_end_i_mem_offset]
add f_end_i, [rsp + inbuf_slop_offset]
mov [stream + _total_in], f_i %+ d
add file_start, f_i
mov [stream + _next_in], file_start
@ -514,26 +550,196 @@ isal_deflate_body_ %+ ARCH %+ :
%endif
ret
align 16
.compare_loop:
MOVD xhash, tmp6 %+ d
PINSRD xhash, tmp2 %+ d, 1
PAND xhash, xhash, xmask
lea tmp2, [tmp1 + dist - 1]
compare250 tmp1, tmp2, len, tmp3, ytmp0, ytmp1
mov len2, [rsp + f_end_i_mem_offset]
sub len2, f_i
add len2, [rsp + inbuf_slop_offset]
add len2, 1
mov len, 8
compare_large tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
cmp len, 258
jle .len_dist_huffman
cmp len, LARGE_MATCH_MIN
jge .do_emit
mov len, 258
jmp .len_dist_huffman
align 16
.compare_loop2:
lea tmp2, [tmp1 + dist2]
add tmp1, 1
compare250 tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
mov len, [rsp + f_end_i_mem_offset]
sub len, f_i
add len, [rsp + inbuf_slop_offset]
mov len2, 8
compare_large tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
and curr_data, 0xff
get_lit_code curr_data, code3, code_len3, hufftables
cmp len2, 258
jle .len_dist_lit_huffman
cmp len2, LARGE_MATCH_MIN
jge .do_emit2
mov len2, 258
jmp .len_dist_lit_huffman
align 16
.do_emit2:
neg dist2
; get_dist_code(dist2, &code2, &code_len2);
get_dist_code dist2, code2, code_len2, hufftables
; get_len_code(len, &code, &code_len);
get_len_code 258, code, rcx, hufftables ;; rcx is code_len
; code2 <<= code_len
; code2 |= code
; code_len2 += code_len
SHLX code4, code2, rcx
or code4, code
add code_len2, rcx
mov tmp5, rcx
mov rcx, code_len3
SHLX tmp8, code4, rcx
or code3, tmp8
add rcx, code_len2
mov code_len3, rcx
write_bits m_bits, m_bit_count, code3, code_len3, m_out_buf
lea tmp3, [f_i + 2] ; tmp3 <= k
MOVD tmp2 %+ d, xhash
mov [stream + _internal_state_head + 2 * tmp2], tmp3 %+ w
add tmp3,1
PEXTRD tmp2 %+ d, xhash, 1
mov [stream + _internal_state_head + 2 * tmp2], tmp3 %+ w
add f_i, 258
lea len, [len2 - 258]
jmp .emit_loop
.do_emit:
dec f_i
neg dist
; get_dist_code(dist, &code2, &code_len2);
%ifndef LONGER_HUFFTABLE
mov tmp3, dist ; since code2 and dist are rbx
get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
%else
get_dist_code dist, code2, code_len2, hufftables
%endif
; get_len_code(len, &code, &code_len);
get_len_code 258, code, rcx, hufftables ;; rcx is code_len
; code2 <<= code_len
; code2 |= code
; code_len2 += code_len
SHLX code4, code2, rcx
or code4, code
add code_len2, rcx
lea tmp3, [f_i + 2] ; tmp3 <= k
MOVD tmp6 %+ d, xhash
PEXTRD tmp5 %+ d, xhash, 1
mov [stream + _internal_state_head + 2 * tmp6], tmp3 %+ w
add tmp3,1
mov [stream + _internal_state_head + 2 * tmp5], tmp3 %+ w
mov tmp5, rcx
.emit:
add f_i, 258
sub len, 258
mov code3, code4
write_bits m_bits, m_bit_count, code3, code_len2, m_out_buf
.emit_loop:
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
ja .output_end
cmp len, LARGE_MATCH_MIN
jge .emit
mov len2, 258
cmp len, len2
cmovg len, len2
add f_i, len
sub code_len2, tmp5
get_len_code len, code, rcx, hufftables
SHLX code4, code2, rcx
or code4, code
add code_len2, rcx
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
cmp f_i, [rsp + f_end_i_mem_offset]
jge .input_end
lea tmp7, [f_i - 4 * LARGE_MATCH_HASH_REP]
MOVD hmask1 %+ d, xmask
%rep LARGE_MATCH_HASH_REP
mov curr_data %+ d, dword [file_start + tmp7]
mov curr_data2 %+ d, dword [file_start + tmp7 + 1]
compute_hash hash, curr_data
compute_hash hash2, curr_data2
and hash %+ d, hmask1 %+ d
and hash2 %+ d, hmask1 %+ d
mov [stream + _internal_state_head + 2 * hash], tmp7 %+ w
add tmp7, 1
mov [stream + _internal_state_head + 2 * hash2], tmp7 %+ w
add tmp7, 1
mov curr_data %+ d, dword [file_start + tmp7]
mov curr_data2 %+ d, dword [file_start + tmp7 + 1]
compute_hash hash, curr_data
compute_hash hash2, curr_data2
and hash %+ d, hmask1 %+ d
and hash2 %+ d, hmask1 %+ d
mov [stream + _internal_state_head + 2 * hash], tmp7 %+ w
add tmp7, 1
mov [stream + _internal_state_head + 2 * hash2], tmp7 %+ w
%if (LARGE_MATCH_HASH_REP > 1)
add tmp7, 1
%endif
%endrep
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
compute_hash hash, curr_data
mov curr_data2, curr_data
shr curr_data2, 8
compute_hash hash2, curr_data2
; hash = compute_hash(state->file_start + f_i) & hash_mask;
and hash %+ d, hmask1 %+ d
and hash2 %+ d, hmask1 %+ d
; continue
jmp .loop2
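(The .do_emit/.emit_loop path above can be modelled roughly in C as below. write_len_dist_code is a hypothetical stand-in for the get_len_code/get_dist_code + write_bits sequence, and the per-iteration output-space check against m_out_end is omitted. A match of LARGE_MATCH_MIN bytes or more is emitted as repeated maximal 258-byte len/dist codes, then one final code of at most 258 bytes; anything left beyond that is picked up again by the main loop, and the positions just behind f_i are re-hashed before re-entering it.)

    #include <stdint.h>

    #define LARGE_MATCH_MIN 264

    /* stand-in for get_len_code/get_dist_code + write_bits */
    static void write_len_dist_code(uint32_t len, uint32_t dist) { (void)len; (void)dist; }

    static void emit_large_match(uint32_t dist, uint64_t len, uint64_t *f_i)
    {
            while (len >= LARGE_MATCH_MIN) {     /* emit full 258-byte matches */
                    write_len_dist_code(258, dist);
                    *f_i += 258;
                    len  -= 258;
            }
            if (len > 258)                       /* final piece, capped at the DEFLATE max */
                    len = 258;
            write_len_dist_code((uint32_t)len, dist);
            *f_i += len;
            /* the assembly then re-hashes the last 4 * LARGE_MATCH_HASH_REP positions */
    }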
.write_first_byte:
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
ja .output_end

View File

@ -37,118 +37,90 @@
;; sttni2 is faster, but it can't be debugged
;; so following code is based on "mine5"
;; compare 258 bytes = 8 * 32 + 2
;; tmp16 is a 16-bit version of tmp
;; compare258 src1, src2, result, tmp
%macro compare258 4
;; compares 8 bytes at a time, using xor
;; assumes the input buffer has size at least 8
;; compare_r src1, src2, result, result_max, tmp
%macro compare_r 5
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp16 %4w ; tmp as a 16-bit register
%define %%result_max %4
%define %%tmp %5
%define %%tmp16 %5w ; tmp as a 16-bit register
sub %%result_max, 16
cmp %%result, %%result_max
jg %%_by_8
xor %%result, %%result
%%loop1:
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare
jnz %%miscompare_reg
add %%result, 8
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare
jnz %%miscompare_reg
add %%result, 8
cmp %%result, %%result_max
jle %%loop1
cmp %%result, 256
jb %%loop1
%%_by_8:
add %%result_max, 8
cmp %%result, %%result_max
jg %%_cmp_last
; compare last two bytes
mov %%tmp16, [%%src1 + %%result]
xor %%tmp16, [%%src2 + %%result]
jnz %%miscompare16
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
; no miscompares, return 258
add %%result, 2
%%_cmp_last:
add %%result_max, 8
cmp %%result, %%result_max
je %%end
lea %%result, [%%result_max - 8]
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
jmp %%end
%%miscompare16:
and %%tmp, 0xFFFF
%%miscompare:
%%miscompare_reg:
bsf %%tmp, %%tmp
shr %%tmp, 3
add %%result, %%tmp
%%end:
%endm
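(A simplified C model of the compare_r idea; the macro itself accumulates onto a caller-supplied starting offset and unrolls the loop by two. Words are compared with a 64-bit XOR and the first differing byte is located by counting trailing zero bits, which matches little-endian x86; the last word is re-checked at result_max - 8 so the tail never reads past the limit. Assumes result_max >= 8, per the macro's stated buffer requirement.)

    #include <stdint.h>
    #include <string.h>

    /* length of the common prefix of src1/src2, checked 8 bytes at a time;
     * result_max must be at least 8 */
    static uint64_t match_len_by_8(const uint8_t *src1, const uint8_t *src2,
                                   uint64_t result_max)
    {
            uint64_t i = 0, a, b, x;

            while (i + 8 <= result_max) {
                    memcpy(&a, src1 + i, 8);
                    memcpy(&b, src2 + i, 8);
                    x = a ^ b;
                    if (x)                                  /* first mismatching word */
                            return i + (__builtin_ctzll(x) >> 3);
                    i += 8;
            }
            /* overlapping final word ending exactly at result_max */
            memcpy(&a, src1 + result_max - 8, 8);
            memcpy(&b, src2 + result_max - 8, 8);
            x = a ^ b;
            if (x)
                    return result_max - 8 + (__builtin_ctzll(x) >> 3);
            return result_max;
    }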
;; compare 258 bytes = 8 * 32 + 2
;; tmp16 is a 16-bit version of tmp
;; compare258 src1, src2, result, tmp
%macro compare250_r 4
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp16 %4w ; tmp as a 16-bit register
mov %%result, 8
mov %%tmp, [%%src1 + 8]
xor %%tmp, [%%src2 + 8]
jnz %%miscompare
add %%result, 8
%%loop1:
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare
add %%result, 8
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare
add %%result, 8
cmp %%result, 256
jb %%loop1
; compare last two bytes
mov %%tmp16, [%%src1 + %%result]
xor %%tmp16, [%%src2 + %%result]
jnz %%miscompare16
; no miscompares, return 258
add %%result, 2
jmp %%end
%%miscompare16:
and %%tmp, 0xFFFF
%%miscompare:
bsf %%tmp, %%tmp
shr %%tmp, 3
add %%result, %%tmp
%%end:
%endm
;; compare 258 bytes = 8 * 32 + 2
;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
;; compare258_x src1, src2, result, tmp, xtmp1, xtmp2
%macro compare258_x 6
;; assumes the input buffer has size at least 8
;; compare_x src1, src2, result, result_max, tmp, xtmp1, xtmp2
%macro compare_x 7
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp32 %4d
%define %%tmp16 %4w ; tmp as a 16-bit register
%define %%xtmp %5
%define %%xtmp2 %6
%define %%result %3 ; Accumulator for match_length
%define %%result_max %4
%define %%tmp %5
%define %%tmp16 %5w ; tmp as a 16-bit register
%define %%tmp32 %5d ; tmp as a 32-bit register
%define %%xtmp %6
%define %%xtmp2 %7
sub %%result_max, 32
cmp %%result, %%result_max
jg %%_by_16
xor %%result, %%result
%%loop1:
MOVDQU %%xtmp, [%%src1 + %%result]
MOVDQU %%xtmp2, [%%src2 + %%result]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare
jnz %%miscompare_vect
add %%result, 16
MOVDQU %%xtmp, [%%src1 + %%result]
@ -156,120 +128,86 @@
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare
jnz %%miscompare_vect
add %%result, 16
cmp %%result, 256
jb %%loop1
cmp %%result, %%result_max
jle %%loop1
%%_by_16:
add %%result_max, 16
cmp %%result, %%result_max
jg %%_by_8
MOVDQU %%xtmp, [%%src1 + %%result]
MOVDQU %%xtmp2, [%%src2 + %%result]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare_vect
add %%result, 16
%%_by_8:
add %%result_max, 8
cmp %%result, %%result_max
jg %%_cmp_last
; compare last two bytes
mov %%tmp16, [%%src1 + %%result]
xor %%tmp16, [%%src2 + %%result]
jnz %%miscompare16
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
; no miscompares, return 258
add %%result, 2
%%_cmp_last:
add %%result_max, 8
cmp %%result, %%result_max
je %%end
lea %%result, [%%result_max - 8]
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
jmp %%end
%%miscompare16:
and %%tmp, 0xFFFF
%%miscompare_reg:
bsf %%tmp, %%tmp
shr %%tmp, 3
add %%result, %%tmp
jmp %%end
%%miscompare:
%%miscompare_vect:
bsf %%tmp, %%tmp
add %%result, %%tmp
%%end:
%endm
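(The same search done 16 bytes at a time, as compare_x does with PCMPEQB/PMOVMSKB, looks roughly like this with SSE2 intrinsics; the real macro falls through to 16-, 8- and overlapping last-word steps for the tail, which are omitted here.)

    #include <emmintrin.h>
    #include <stdint.h>

    /* length of the common prefix checked in 16-byte blocks; the remainder
     * (fewer than 16 bytes) is left to the caller, as in the macro's tail code */
    static uint64_t match_len_by_16(const uint8_t *src1, const uint8_t *src2,
                                    uint64_t result_max)
    {
            uint64_t i = 0;

            while (i + 16 <= result_max) {
                    __m128i a = _mm_loadu_si128((const __m128i *)(src1 + i));
                    __m128i b = _mm_loadu_si128((const __m128i *)(src2 + i));
                    unsigned mask = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
                    if (mask != 0xFFFF)                      /* some byte differs */
                            return i + __builtin_ctz(~mask & 0xFFFF);
                    i += 16;
            }
            return i;      /* caller finishes the tail with narrower compares */
    }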
;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
;; were already checked
;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
;; compare250_x src1, src2, result, tmp, xtmp1, xtmp2
%macro compare250_x 6
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp32 %4d ; tmp as a 32-bit register
%define %%xtmp %5
%define %%xtmp2 %6
mov %%result, 8
MOVDQU %%xtmp, [%%src1 + 8]
MOVDQU %%xtmp2, [%%src2 + 8]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare
add %%result, 16
%%loop1:
MOVDQU %%xtmp, [%%src1 + %%result]
MOVDQU %%xtmp2, [%%src2 + %%result]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare
add %%result, 16
MOVDQU %%xtmp, [%%src1 + %%result]
MOVDQU %%xtmp2, [%%src2 + %%result]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare
add %%result, 16
cmp %%result, 258 - 16
jb %%loop1
MOVDQU %%xtmp, [%%src1 + %%result]
MOVDQU %%xtmp2, [%%src2 + %%result]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare_last
; no miscompares, return 258
mov %%result, 258
jmp %%end
%%miscompare_last:
bsf %%tmp, %%tmp
add %%result, %%tmp
;; Guarantee the result has length at most 258.
mov %%tmp, 258
cmp %%result, 258
cmova %%result, %%tmp
jmp %%end
%%miscompare:
bsf %%tmp, %%tmp
add %%result, %%tmp
%%end:
%endm
;; compare 258 bytes = 8 * 32 + 2
;; compares 32 bytes at a time, using pcmpeqb/pmovmskb
;; compare258_y src1, src2, result, tmp, xtmp1, xtmp2
%macro compare258_y 6
;; assumes the input buffer has size at least 8
;; compare_y src1, src2, result, result_max, tmp, xtmp1, xtmp2
%macro compare_y 7
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp16 %4w ; tmp as a 16-bit register
%define %%tmp32 %4d ; tmp as a 32-bit register
%define %%ytmp %5
%define %%ytmp2 %6
%define %%result %3 ; Accumulator for match_length
%define %%result_max %4
%define %%tmp %5
%define %%tmp16 %5w ; tmp as a 16-bit register
%define %%tmp32 %5d ; tmp as a 32-bit register
%define %%ytmp %6
%define %%ytmp2 %7
sub %%result_max, 64
cmp %%result, %%result_max
jg %%_by_32
xor %%result, %%result
%%loop1:
vmovdqu %%ytmp, [%%src1 + %%result]
vmovdqu %%ytmp2, [%%src2 + %%result]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare
jnz %%miscompare_vect
add %%result, 32
vmovdqu %%ytmp, [%%src1 + %%result]
@ -277,123 +215,125 @@
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare
jnz %%miscompare_vect
add %%result, 32
cmp %%result, 256
jb %%loop1
cmp %%result, %%result_max
jle %%loop1
%%_by_32:
add %%result_max, 32
cmp %%result, %%result_max
jg %%_by_16
vmovdqu %%ytmp, [%%src1 + %%result]
vmovdqu %%ytmp2, [%%src2 + %%result]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare_vect
add %%result, 32
%%_by_16:
add %%result_max, 16
cmp %%result, %%result_max
jg %%_by_8
vmovdqu %%ytmp %+ x, [%%src1 + %%result]
vmovdqu %%ytmp2 %+ x, [%%src2 + %%result]
vpcmpeqb %%ytmp %+ x, %%ytmp %+ x, %%ytmp2 %+ x
vpmovmskb %%tmp, %%ytmp %+ x
xor %%tmp32, 0xFFFF
jnz %%miscompare_vect
add %%result, 16
%%_by_8:
add %%result_max, 8
cmp %%result, %%result_max
jg %%_cmp_last
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
%%_cmp_last:
add %%result_max, 8
cmp %%result, %%result_max
je %%end
lea %%result, [%%result_max - 8]
; compare last two bytes
mov %%tmp16, [%%src1 + %%result]
xor %%tmp16, [%%src2 + %%result]
jnz %%miscompare16
; no miscompares, return 258
add %%result, 2
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
jmp %%end
%%miscompare16:
and %%tmp, 0xFFFF
%%miscompare_reg:
bsf %%tmp, %%tmp
shr %%tmp, 3
add %%result, %%tmp
jmp %%end
%%miscompare:
%%miscompare_vect:
bsf %%tmp, %%tmp
add %%result, %%tmp
%%end:
%endm
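(compare_y is the same pattern widened to 32 bytes with AVX2, using VPCMPEQB/VPMOVMSKB; one step of it corresponds to roughly the following intrinsics.)

    #include <immintrin.h>
    #include <stdint.h>

    /* one 32-byte step: returns 32 if all bytes match, otherwise the index
     * of the first differing byte */
    static int first_mismatch_32(const uint8_t *src1, const uint8_t *src2)
    {
            __m256i a = _mm256_loadu_si256((const __m256i *)src1);
            __m256i b = _mm256_loadu_si256((const __m256i *)src2);
            uint32_t mask = (uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b));

            if (mask == 0xFFFFFFFFu)
                    return 32;
            return __builtin_ctz(~mask);
    }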
;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
;; were already checked
;; compares 32 bytes at a time, using pcmpeqb/pmovmskb
;; compare258_y src1, src2, result, tmp, xtmp1, xtmp2
%macro compare250_y 6
%macro compare250 7
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp16 %4w ; tmp as a 16-bit register
%define %%tmp32 %4d ; tmp as a 32-bit register
%define %%ytmp %5
%define %%ytmp2 %6
%define %%result_max %4
%define %%tmp %5
%define %%xtmp0 %6x
%define %%xtmp1 %7x
%define %%ytmp0 %6
%define %%ytmp1 %7
mov %%result, 8
vmovdqu %%ytmp, [%%src1 + 8]
vmovdqu %%ytmp2, [%%src2 + 8]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare
add %%result, 32
%%loop1:
vmovdqu %%ytmp, [%%src1 + %%result]
vmovdqu %%ytmp2, [%%src2 + %%result]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare
add %%result, 32
vmovdqu %%ytmp, [%%src1 + %%result]
vmovdqu %%ytmp2, [%%src2 + %%result]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare
add %%result, 32
cmp %%result, 258 - 32
jb %%loop1
vmovdqu %%ytmp, [%%src1 + %%result]
vmovdqu %%ytmp2, [%%src2 + %%result]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare_last
mov %%result, 258
jmp %%end
%%miscompare_last:
bsf %%tmp, %%tmp
add %%result, %%tmp
;; Guarantee the result has length at most 258.
mov %%tmp, 258
cmp %%result, 258
cmova %%result, %%tmp
jmp %%end
%%miscompare:
bsf %%tmp, %%tmp
add %%result, %%tmp
%%end:
%endm
%macro compare250 6
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%xtmp0 %5x
%define %%xtmp1 %6x
%define %%ytmp0 %5
%define %%ytmp1 %6
mov %%tmp, 250
cmp %%result_max, 250
cmovg %%result_max, %%tmp
%if (COMPARE_TYPE == 1)
compare250_r %%src1, %%src2, %%result, %%tmp
compare_r %%src1, %%src2, %%result, %%result_max, %%tmp
%elif (COMPARE_TYPE == 2)
compare250_x %%src1, %%src2, %%result, %%tmp, %%xtmp0, %%xtmp1
compare_x %%src1, %%src2, %%result, %%result_max, %%tmp, %%xtmp0, %%xtmp1
%elif (COMPARE_TYPE == 3)
compare250_y %%src1, %%src2, %%result, %%tmp, %%ytmp0, %%ytmp1
compare_y %%src1, %%src2, %%result, %%result_max, %%tmp, %%ytmp0, %%ytmp1
%else
%error Unknown Compare type COMPARE_TYPE
% error
%endif
%endmacro
; Assumes the buffer has at least 8 bytes
; Accumulates match length onto result
%macro compare_large 7
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%result_max %4
%define %%tmp %5
%define %%xtmp0 %6x
%define %%xtmp1 %7x
%define %%ytmp0 %6
%define %%ytmp1 %7
%if (COMPARE_TYPE == 1)
compare_r %%src1, %%src2, %%result, %%result_max, %%tmp
%elif (COMPARE_TYPE == 2)
compare_x %%src1, %%src2, %%result, %%result_max, %%tmp, %%xtmp0, %%xtmp1
%elif (COMPARE_TYPE == 3)
compare_y %%src1, %%src2, %%result, %%result_max, %%tmp, %%ytmp0, %%ytmp1
%else
%error Unknown Compare type COMPARE_TYPE
% error
%endif
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@ -47,6 +47,9 @@ global %1
%endm
%endif
%define LARGE_MATCH_HASH_REP 1 ; Hash 4 * LARGE_MATCH_HASH_REP elements
%define LARGE_MATCH_MIN 264 ; Minimum match size to enter large match emit loop
%define MIN_INBUF_PADDING 16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -76,6 +79,7 @@ global %1
%define len2 r8
%define tmp4 r8
%define hmask1 r8
%define len_code2 r8
%define len rdx
%define len_code rdx
@ -110,9 +114,10 @@ dist_mask_offset equ 16
hash_mask_offset equ 24
f_end_i_mem_offset equ 32
stream_offset equ 40
gpr_save_mem_offset equ 48 ; gpr save area (8*8 bytes)
inbuf_slop_offset equ 48
gpr_save_mem_offset equ 64 ; gpr save area (8*8 bytes)
xmm_save_mem_offset equ gpr_save_mem_offset + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
stack_size equ 7*8 + 8*8 + 4*16
stack_size equ 9*8 + 8*8 + 4*16
;;; 8 because stack address is odd multiple of 8 after a function call and
;;; we want it aligned to 16 bytes
@ -208,8 +213,16 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
mov file_length %+ d, [stream + _avail_in]
add file_length, f_i
; file_length -= LA;
sub file_length, LA
mov qword [rsp + inbuf_slop_offset], MIN_INBUF_PADDING
cmp byte [stream + _end_of_stream], 0
jnz .default_inbuf_padding
cmp byte [stream + _flush], 0
jnz .default_inbuf_padding
mov qword [rsp + inbuf_slop_offset], LA
.default_inbuf_padding:
; file_length -= INBUF_PADDING;
sub file_length, [rsp + inbuf_slop_offset]
; if (file_length <= 0) continue;
mov hmask1 %+ d, [rsp + hash_mask_offset]
@ -220,7 +233,6 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
mov tmp1, curr_data
mov tmp2, curr_data
compute_hash hash, curr_data
@ -295,6 +307,7 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
test len %+ d, 0xFFFFFFFF
jz .len_dist_huffman_pre
PSRLDQ xdata, 1
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code]
movzx lit_code2, curr_data %+ b
;; Check for len/dist match for second literal
@ -318,9 +331,15 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
;; Setup for updating hash
lea tmp3, [f_i + 1] ; tmp3 <= k
mov tmp2, f_i
add file_start, f_i
add f_i, len2
cmp f_i, file_length
jg .len_dist_lit_huffman_finish
MOVDQU xdata, [file_start + len2]
mov tmp1, [file_start + len2]
sub file_start, tmp2
shr curr_data, 24
compute_hash hash3, curr_data
@ -329,9 +348,6 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
mov curr_data, tmp1
shr tmp1, 8
sub file_start, f_i
add f_i, len2
mov [hash_table + 2 * hash], tmp3 %+ w
compute_hash hash, curr_data
@ -361,10 +377,28 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
and hash2 %+ d, hmask3 %+ d
; continue
cmp f_i, file_length
jl .loop2
jmp .loop2
.len_dist_lit_huffman_finish:
sub file_start, tmp2
mov [hash_table + 2 * hash], tmp3 %+ w
add tmp3,1
mov [hash_table + 2 * hash2], tmp3 %+ w
add dist_code2, 254
add dist_code2, len2
inc dword [lit_len_hist + HIST_ELEM_SIZE*(len2 + 254)]
movnti dword [m_out_buf + 4], dist_code2 %+ d
add m_out_buf, 8
shr dist_code2, DIST_OFFSET
and dist_code2, 0x1F
inc dword [dist_hist + HIST_ELEM_SIZE*dist_code2]
jmp .input_end
;; encode as dist/len
.len_dist_huffman_pre:
bsf len, len
@ -380,14 +414,21 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
; get_dist_code(dist, &code2, &code_len2);
get_dist_icf_code dist, dist_code, tmp1
.len_dist_huffman_skip:
mov hmask2 %+ d, [rsp + hash_mask_offset]
mov tmp1, f_i
add file_start, f_i
add f_i, len
cmp f_i, file_length
jg .len_dist_huffman_finish
MOVDQU xdata, [file_start + len]
mov curr_data2, [file_start + len]
mov curr_data, curr_data2
sub file_start, f_i
add f_i, len
sub file_start, tmp1
; get_len_code(len, &code, &code_len);
lea len_code, [len + 254]
or dist_code, len_code
@ -415,15 +456,39 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
and hash2 %+ d, hmask2 %+ d
; continue
cmp f_i, file_length
jl .loop2
jmp .loop2
.len_dist_huffman_finish:
sub file_start, tmp1
; get_len_code(len, &code, &code_len);
lea len_code, [len + 254]
or dist_code, len_code
mov [hash_table + 2 * hash], tmp3 %+ w
add tmp3,1
mov [hash_table + 2 * hash2], tmp3 %+ w
inc dword [lit_len_hist + HIST_ELEM_SIZE*len_code]
movnti dword [m_out_buf], dist_code %+ d
add m_out_buf, 4
shr dist_code, DIST_OFFSET
and dist_code, 0x1F
inc dword [dist_hist + HIST_ELEM_SIZE*dist_code]
jmp .input_end
.write_lit_bits:
MOVDQU xdata, [file_start + f_i + 1]
add f_i, 1
MOVQ curr_data, xdata
add f_i, 1
cmp f_i, file_length
jg .write_lit_bits_finish
MOVDQU xdata, [file_start + f_i]
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2]
shl lit_code2, DIST_OFFSET
@ -432,9 +497,16 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
movnti dword [m_out_buf], lit_code %+ d
add m_out_buf, 4
; continue
cmp f_i, file_length
jl .loop2
jmp .loop2
.write_lit_bits_finish:
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2]
shl lit_code2, DIST_OFFSET
lea lit_code, [lit_code + lit_code2 + (31 << DIST_OFFSET)]
movnti dword [m_out_buf], lit_code %+ d
add m_out_buf, 4
.input_end:
mov stream, [rsp + stream_offset]
@ -454,7 +526,7 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
.end:
;; update input buffer
add file_length, LA
add file_length, [rsp + inbuf_slop_offset]
mov [stream + _total_in], f_i %+ d
mov [stream + _internal_state_block_end], f_i %+ d
add file_start, f_i
@ -487,21 +559,143 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
.compare_loop:
lea tmp2, [tmp1 + dist - 1]
compare250 tmp1, tmp2, len, tmp3, ytmp0, ytmp1
mov len2, file_length
sub len2, f_i
add len2, [rsp + inbuf_slop_offset]
add len2, 1
mov len, 8
compare_large tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
cmp len, 258
jle .len_dist_huffman
cmp len, LARGE_MATCH_MIN
jge .do_emit
mov len, 258
jmp .len_dist_huffman
.compare_loop2:
lea tmp2, [tmp1 + dist2]
add tmp1, 1
compare250 tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
mov len, file_length
sub len, f_i
add len, [rsp + inbuf_slop_offset]
mov len2, 8
compare_large tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
movzx lit_code, curr_data %+ b
shr curr_data, 8
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code]
cmp len2, 258
jle .len_dist_lit_huffman
cmp len2, LARGE_MATCH_MIN
jge .do_emit2
mov len2, 258
jmp .len_dist_lit_huffman
.do_emit2:
or lit_code, LIT
movnti dword [m_out_buf], lit_code %+ d
add m_out_buf, 4
inc f_i
mov dist, dist2
mov len, len2
.do_emit:
neg dist
get_dist_icf_code dist, dist_code, tmp1
mov len_code2, 258 + 254
or len_code2, dist_code
mov tmp1, dist_code
shr tmp1, DIST_OFFSET
and tmp1, 0x1F
lea tmp3, [f_i + 1]
dec f_i
mov [hash_table + 2 * hash], tmp3 %+ w
add tmp3,1
mov [hash_table + 2 * hash2], tmp3 %+ w
.emit:
sub len, 258
add f_i, 258
inc dword [lit_len_hist + HIST_ELEM_SIZE*(258 + 254)]
inc dword [dist_hist + HIST_ELEM_SIZE*tmp1]
movnti dword [m_out_buf], len_code2 %+ d
add m_out_buf, 4
cmp m_out_buf, [rsp + m_out_end]
ja .output_end
cmp len, LARGE_MATCH_MIN
jge .emit
mov len2, 258
cmp len, len2
cmovg len, len2
; get_len_code(len, &code, &code_len);
add f_i, len
lea len_code, [len + 254]
or dist_code, len_code
inc dword [lit_len_hist + HIST_ELEM_SIZE*len_code]
inc dword [dist_hist + HIST_ELEM_SIZE*tmp1]
movnti dword [m_out_buf], dist_code %+ d
add m_out_buf, 4
cmp file_length, f_i
jle .input_end
lea tmp2, [f_i - 4 * LARGE_MATCH_HASH_REP]
mov hmask2 %+ d, [rsp + hash_mask_offset]
%rep LARGE_MATCH_HASH_REP
mov curr_data %+ d, dword [file_start + tmp2]
mov curr_data2 %+ d, dword [file_start + tmp2 + 1]
mov tmp3 %+ d, dword [file_start + tmp2 + 2]
mov tmp1 %+ d, dword [file_start + tmp2 + 3]
compute_hash hash, curr_data
compute_hash hash2, curr_data2
compute_hash hash3, tmp3
compute_hash hmask3, tmp1
and hash %+ d, hmask2 %+ d
and hash2 %+ d, hmask2 %+ d
and hash3 %+ d, hmask2 %+ d
and hmask3 %+ d, hmask2 %+ d
mov [hash_table + 2 * hash], tmp2 %+ w
add tmp2, 1
mov [hash_table + 2 * hash2], tmp2 %+ w
add tmp2, 1
mov [hash_table + 2 * hash3], tmp2 %+ w
add tmp2, 1
mov [hash_table + 2 * hmask3], tmp2 %+ w
%if (LARGE_MATCH_HASH_REP > 1)
add tmp2, 1
%endif
%endrep
; for (f_i = f_start_i; f_i < file_length; f_i++) {
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
mov tmp1, curr_data
compute_hash hash, curr_data
shr tmp1, 8
compute_hash hash2, tmp1
and hash, hmask2
and hash2, hmask2
jmp .loop2
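(The ICF variant of the large-match emit above writes 32-bit ICF tokens instead of Huffman bits. A rough C model follows; the token layout constant DIST_OFFSET and the histogram types are assumptions for illustration, dist_code is taken to be the packed value produced by get_dist_icf_code, and the f_i advancement and output-space checks are omitted. Histograms are indexed as the assembly does: lit_len_hist[len + 254] and dist_hist[(dist_code >> DIST_OFFSET) & 0x1F].)

    #include <stdint.h>

    #define LARGE_MATCH_MIN 264
    #define DIST_OFFSET 10   /* assumed position of the dist-code field in an ICF token */

    static void emit_large_match_icf(uint32_t dist_code, uint64_t len,
                                     uint32_t **m_out_buf,
                                     uint32_t *lit_len_hist, uint32_t *dist_hist)
    {
            uint32_t dcode_idx = (dist_code >> DIST_OFFSET) & 0x1F;

            while (len >= LARGE_MATCH_MIN) {
                    *(*m_out_buf)++ = dist_code | (258 + 254);   /* maximal-length token */
                    lit_len_hist[258 + 254]++;
                    dist_hist[dcode_idx]++;
                    len -= 258;
            }
            if (len > 258)                                       /* final token, capped at 258 */
                    len = 258;
            *(*m_out_buf)++ = dist_code | (uint32_t)(len + 254);
            lit_len_hist[len + 254]++;
            dist_hist[dcode_idx]++;
    }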
.write_first_byte:
mov hmask1 %+ d, [rsp + hash_mask_offset]
cmp m_out_buf, [rsp + m_out_end]

View File

@ -545,7 +545,9 @@ compare_loop:
and hash2 %+ d, LVL0_HASH_MASK
lea tmp2, [tmp1 + dist - 1]
compare250 tmp1, tmp2, len, tmp3, ytmp0, ytmp1
mov len2, 250
mov len, 8
compare250 tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
lea tmp3, [f_i + 1]
jmp len_dist_huffman
@ -554,7 +556,9 @@ compare_loop2:
add tmp1, 1
lea tmp2, [tmp1 + dist2 - 1]
compare250 tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
mov len, 250
mov len2, 8
compare250 tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
and curr_data, 0xff
inc qword [histogram + _lit_len_offset + 8 * curr_data]