igzip: Modify igzip_body assembly to run to last 16 bytes.

Change-Id: Ib2c688d0b2d7ff5d4fd7b14bb6eea72a7f689cd3
Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
Roy Oursler 2018-07-27 17:26:13 -04:00
parent 52d974762b
commit 7345490999
4 changed files with 645 additions and 301 deletions

View File

@ -38,6 +38,9 @@
%include "stdmac.asm"
%define LARGE_MATCH_HASH_REP 1 ; Hash 4 * LARGE_MATCH_HASH_REP elements
%define LARGE_MATCH_MIN 264 ; Minimum match size to enter large match emit loop
%define MIN_INBUF_PADDING 16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -69,6 +72,7 @@
%define curr_data2 r8
%define len2 r8
%define tmp6 r8
%define f_end_i r8
%define m_bits r9
@ -76,7 +80,6 @@
%define m_out_buf r11
%define f_end_i r12
%define dist2 r12
%define tmp7 r12
%define code4 r12
@ -107,9 +110,10 @@
blen_mem_offset equ 0 ; local variable (8 bytes)
f_end_i_mem_offset equ 8
gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
stack_size equ 2*8 + 8*8 + 4*16 + 8
inbuf_slop_offset equ 16
gpr_save_mem_offset equ 32 ; gpr save area (8*8 bytes)
xmm_save_mem_offset equ 32 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
stack_size equ 4*8 + 8*8 + 4*16 + 8
;;; 8 because stack address is odd multiple of 8 after a function call and
;;; we want it aligned to 16 bytes
@ -197,8 +201,16 @@ isal_deflate_body_ %+ ARCH %+ :
mov f_end_i %+ d, [stream + _avail_in]
add f_end_i, f_i
; f_end_i -= LA;
sub f_end_i, LA
mov qword [rsp + inbuf_slop_offset], MIN_INBUF_PADDING
cmp byte [stream + _end_of_stream], 0
jnz .default_inbuf_padding
cmp byte [stream + _flush], 0
jnz .default_inbuf_padding
mov qword [rsp + inbuf_slop_offset], LA
.default_inbuf_padding:
; f_end_i -= INBUF_PADDING;
sub f_end_i, [rsp + inbuf_slop_offset]
mov [rsp + f_end_i_mem_offset], f_end_i
; if (f_end_i <= 0) continue;
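(For reference, a minimal C sketch of the padding selection above; the field and constant names are illustrative and LA's value is a placeholder taken from the igzip options, not this diff. When end_of_stream or flush is set the loop may run up to the last MIN_INBUF_PADDING = 16 bytes of the input; otherwise the full look-ahead LA is held back because more input is still expected.)

    #include <stdint.h>

    #define MIN_INBUF_PADDING 16
    #define LA 288   /* placeholder value; the real LA is defined in the igzip options */

    /* returns how many trailing input bytes the main loop must not consume */
    static uint64_t choose_inbuf_slop(int end_of_stream, int flush)
    {
            uint64_t slop = MIN_INBUF_PADDING;   /* default: stop 16 bytes early */

            if (!end_of_stream && !flush)
                    slop = LA;                   /* more data coming: keep full look-ahead */
            return slop;
    }

    /* f_end_i = f_i + avail_in - choose_inbuf_slop(...); the loop runs while f_i < f_end_i */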
@ -340,6 +352,10 @@ isal_deflate_body_ %+ ARCH %+ :
;; Setup for updating hash
lea tmp3, [f_i + len2 + 1] ; tmp3 <= k
mov tmp6, [rsp + f_end_i_mem_offset]
cmp f_i, tmp6
jge .len_dist_lit_huffman_finish
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
@ -356,7 +372,6 @@ isal_deflate_body_ %+ ARCH %+ :
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
mov f_end_i, [rsp + f_end_i_mem_offset]
mov curr_data2, curr_data
shr curr_data2, 8
@ -380,11 +395,21 @@ isal_deflate_body_ %+ ARCH %+ :
and hash2 %+ d, hmask1 %+ d
; continue
cmp f_i, f_end_i
jl .loop2
jmp .input_end
jmp .loop2
;; encode as dist/len
.len_dist_lit_huffman_finish:
MOVD hash %+ d, xhash
PEXTRD tmp6 %+ d, xhash, 1
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
add tmp3,1
mov [stream + _internal_state_head + 2 * tmp6], tmp3 %+ w
add tmp3, 1
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
jmp .input_end
align 16
.len_dist_huffman_pre:
bsf len, len
shr len, 3
@ -421,12 +446,15 @@ isal_deflate_body_ %+ ARCH %+ :
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
MOVD hmask1 %+ d, xmask
cmp f_i, [rsp + f_end_i_mem_offset]
jge .len_dist_huffman_finish
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
compute_hash hash, curr_data
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
mov f_end_i, [rsp + f_end_i_mem_offset]
mov curr_data2, curr_data
shr curr_data2, 8
@ -450,25 +478,32 @@ isal_deflate_body_ %+ ARCH %+ :
and hash2 %+ d, hmask1 %+ d
; continue
cmp f_i, f_end_i
jl .loop2
jmp .loop2
.len_dist_huffman_finish:
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
jmp .input_end
align 16
.write_lit_bits:
MOVDQU xdata, [file_start + f_i + 1]
mov f_end_i, [rsp + f_end_i_mem_offset]
PSRLDQ xdata, 1
add f_i, 1
mov curr_data, [file_start + f_i]
cmp f_i, [rsp + f_end_i_mem_offset]
jge .write_lit_bits_finish
MOVQ curr_data, xdata
MOVDQU xdata, [file_start + f_i]
MOVD hash %+ d, xhash
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf
PEXTRD hash2 %+ d, xhash, 1
jmp .loop2
; continue
cmp f_i, f_end_i
jl .loop2
.write_lit_bits_finish:
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf
.input_end:
mov tmp1, ZSTATE_FLUSH_READ_BUFFER
@ -481,7 +516,8 @@ isal_deflate_body_ %+ ARCH %+ :
.output_end:
;; update input buffer
add f_end_i, LA
mov f_end_i, [rsp + f_end_i_mem_offset]
add f_end_i, [rsp + inbuf_slop_offset]
mov [stream + _total_in], f_i %+ d
add file_start, f_i
mov [stream + _next_in], file_start
@ -514,26 +550,196 @@ isal_deflate_body_ %+ ARCH %+ :
%endif
ret
align 16
.compare_loop:
MOVD xhash, tmp6 %+ d
PINSRD xhash, tmp2 %+ d, 1
PAND xhash, xhash, xmask
lea tmp2, [tmp1 + dist - 1]
compare250 tmp1, tmp2, len, tmp3, ytmp0, ytmp1
mov len2, [rsp + f_end_i_mem_offset]
sub len2, f_i
add len2, [rsp + inbuf_slop_offset]
add len2, 1
mov len, 8
compare_large tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
cmp len, 258
jle .len_dist_huffman
cmp len, LARGE_MATCH_MIN
jge .do_emit
mov len, 258
jmp .len_dist_huffman
align 16
.compare_loop2:
lea tmp2, [tmp1 + dist2]
add tmp1, 1
compare250 tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
mov len, [rsp + f_end_i_mem_offset]
sub len, f_i
add len, [rsp + inbuf_slop_offset]
mov len2, 8
compare_large tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
and curr_data, 0xff
get_lit_code curr_data, code3, code_len3, hufftables
cmp len2, 258
jle .len_dist_lit_huffman
cmp len2, LARGE_MATCH_MIN
jge .do_emit2
mov len2, 258
jmp .len_dist_lit_huffman
align 16
.do_emit2:
neg dist2
; get_dist_code(dist2, &code2, &code_len2);
get_dist_code dist2, code2, code_len2, hufftables
; get_len_code(len, &code, &code_len);
get_len_code 258, code, rcx, hufftables ;; rcx is code_len
; code2 <<= code_len
; code2 |= code
; code_len2 += code_len
SHLX code4, code2, rcx
or code4, code
add code_len2, rcx
mov tmp5, rcx
mov rcx, code_len3
SHLX tmp8, code4, rcx
or code3, tmp8
add rcx, code_len2
mov code_len3, rcx
write_bits m_bits, m_bit_count, code3, code_len3, m_out_buf
lea tmp3, [f_i + 2] ; tmp3 <= k
MOVD tmp2 %+ d, xhash
mov [stream + _internal_state_head + 2 * tmp2], tmp3 %+ w
add tmp3,1
PEXTRD tmp2 %+ d, xhash, 1
mov [stream + _internal_state_head + 2 * tmp2], tmp3 %+ w
add f_i, 258
lea len, [len2 - 258]
jmp .emit_loop
.do_emit:
dec f_i
neg dist
; get_dist_code(dist, &code2, &code_len2);
%ifndef LONGER_HUFFTABLE
mov tmp3, dist ; since code2 and dist are rbx
get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
%else
get_dist_code dist, code2, code_len2, hufftables
%endif
; get_len_code(len, &code, &code_len);
get_len_code 258, code, rcx, hufftables ;; rcx is code_len
; code2 <<= code_len
; code2 |= code
; code_len2 += code_len
SHLX code4, code2, rcx
or code4, code
add code_len2, rcx
lea tmp3, [f_i + 2] ; tmp3 <= k
MOVD tmp6 %+ d, xhash
PEXTRD tmp5 %+ d, xhash, 1
mov [stream + _internal_state_head + 2 * tmp6], tmp3 %+ w
add tmp3,1
mov [stream + _internal_state_head + 2 * tmp5], tmp3 %+ w
mov tmp5, rcx
.emit:
add f_i, 258
sub len, 258
mov code3, code4
write_bits m_bits, m_bit_count, code3, code_len2, m_out_buf
.emit_loop:
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
ja .output_end
cmp len, LARGE_MATCH_MIN
jge .emit
mov len2, 258
cmp len, len2
cmovg len, len2
add f_i, len
sub code_len2, tmp5
get_len_code len, code, rcx, hufftables
SHLX code4, code2, rcx
or code4, code
add code_len2, rcx
write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf
cmp f_i, [rsp + f_end_i_mem_offset]
jge .input_end
lea tmp7, [f_i - 4 * LARGE_MATCH_HASH_REP]
MOVD hmask1 %+ d, xmask
%rep LARGE_MATCH_HASH_REP
mov curr_data %+ d, dword [file_start + tmp7]
mov curr_data2 %+ d, dword [file_start + tmp7 + 1]
compute_hash hash, curr_data
compute_hash hash2, curr_data2
and hash %+ d, hmask1 %+ d
and hash2 %+ d, hmask1 %+ d
mov [stream + _internal_state_head + 2 * hash], tmp7 %+ w
add tmp7, 1
mov [stream + _internal_state_head + 2 * hash2], tmp7 %+ w
add tmp7, 1
mov curr_data %+ d, dword [file_start + tmp7]
mov curr_data2 %+ d, dword [file_start + tmp7 + 1]
compute_hash hash, curr_data
compute_hash hash2, curr_data2
and hash %+ d, hmask1 %+ d
and hash2 %+ d, hmask1 %+ d
mov [stream + _internal_state_head + 2 * hash], tmp7 %+ w
add tmp7, 1
mov [stream + _internal_state_head + 2 * hash2], tmp7 %+ w
%if (LARGE_MATCH_HASH_REP > 1)
add tmp7, 1
%endif
%endrep
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
compute_hash hash, curr_data
mov curr_data2, curr_data
shr curr_data2, 8
compute_hash hash2, curr_data2
; hash = compute_hash(state->file_start + f_i) & hash_mask;
and hash %+ d, hmask1 %+ d
and hash2 %+ d, hmask1 %+ d
; continue
jmp .loop2
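(The .do_emit/.emit_loop path above can be modelled roughly in C as below. write_len_dist_code is a hypothetical stand-in for the get_len_code/get_dist_code + write_bits sequence, and the per-iteration output-space check against m_out_end is omitted. A match of LARGE_MATCH_MIN bytes or more is emitted as repeated maximal 258-byte len/dist codes, then one final code of at most 258 bytes; anything left beyond that is picked up again by the main loop, and the positions just behind f_i are re-hashed before re-entering it.)

    #include <stdint.h>

    #define LARGE_MATCH_MIN 264

    /* stand-in for get_len_code/get_dist_code + write_bits */
    static void write_len_dist_code(uint32_t len, uint32_t dist) { (void)len; (void)dist; }

    static void emit_large_match(uint32_t dist, uint64_t len, uint64_t *f_i)
    {
            while (len >= LARGE_MATCH_MIN) {     /* emit full 258-byte matches */
                    write_len_dist_code(258, dist);
                    *f_i += 258;
                    len  -= 258;
            }
            if (len > 258)                       /* final piece, capped at the DEFLATE max */
                    len = 258;
            write_len_dist_code((uint32_t)len, dist);
            *f_i += len;
            /* the assembly then re-hashes the last 4 * LARGE_MATCH_HASH_REP positions */
    }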
.write_first_byte:
cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
ja .output_end

View File

@ -37,118 +37,90 @@
;; sttni2 is faster, but it can't be debugged
;; so following code is based on "mine5"
;; compare 258 bytes = 8 * 32 + 2
;; tmp16 is a 16-bit version of tmp
;; compare258 src1, src2, result, tmp
%macro compare258 4
;; compares 8 bytes at a time, using xor
;; assumes the input buffer has size at least 8
;; compare_r src1, src2, result, result_max, tmp
%macro compare_r 5
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp16 %4w ; tmp as a 16-bit register
%define %%result_max %4
%define %%tmp %5
%define %%tmp16 %5w ; tmp as a 16-bit register
sub %%result_max, 16
cmp %%result, %%result_max
jg %%_by_8
xor %%result, %%result
%%loop1:
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare
jnz %%miscompare_reg
add %%result, 8
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare
jnz %%miscompare_reg
add %%result, 8
cmp %%result, %%result_max
jle %%loop1
cmp %%result, 256
jb %%loop1
%%_by_8:
add %%result_max, 8
cmp %%result, %%result_max
jg %%_cmp_last
; compare last two bytes
mov %%tmp16, [%%src1 + %%result]
xor %%tmp16, [%%src2 + %%result]
jnz %%miscompare16
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
; no miscompares, return 258
add %%result, 2
%%_cmp_last:
add %%result_max, 8
cmp %%result, %%result_max
je %%end
lea %%result, [%%result_max - 8]
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
jmp %%end
%%miscompare16:
and %%tmp, 0xFFFF
%%miscompare:
%%miscompare_reg:
bsf %%tmp, %%tmp
shr %%tmp, 3
add %%result, %%tmp
%%end:
%endm
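(A simplified C model of the compare_r idea; the macro itself accumulates onto a caller-supplied starting offset and unrolls the loop by two. Words are compared with a 64-bit XOR and the first differing byte is located by counting trailing zero bits, which matches little-endian x86; the last word is re-checked at result_max - 8 so the tail never reads past the limit. Assumes result_max >= 8, per the macro's stated buffer requirement.)

    #include <stdint.h>
    #include <string.h>

    /* length of the common prefix of src1/src2, checked 8 bytes at a time;
     * result_max must be at least 8 */
    static uint64_t match_len_by_8(const uint8_t *src1, const uint8_t *src2,
                                   uint64_t result_max)
    {
            uint64_t i = 0, a, b, x;

            while (i + 8 <= result_max) {
                    memcpy(&a, src1 + i, 8);
                    memcpy(&b, src2 + i, 8);
                    x = a ^ b;
                    if (x)                                  /* first mismatching word */
                            return i + (__builtin_ctzll(x) >> 3);
                    i += 8;
            }
            /* overlapping final word ending exactly at result_max */
            memcpy(&a, src1 + result_max - 8, 8);
            memcpy(&b, src2 + result_max - 8, 8);
            x = a ^ b;
            if (x)
                    return result_max - 8 + (__builtin_ctzll(x) >> 3);
            return result_max;
    }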
;; compare 258 bytes = 8 * 32 + 2
;; tmp16 is a 16-bit version of tmp
;; compare258 src1, src2, result, tmp
%macro compare250_r 4
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp16 %4w ; tmp as a 16-bit register
mov %%result, 8
mov %%tmp, [%%src1 + 8]
xor %%tmp, [%%src2 + 8]
jnz %%miscompare
add %%result, 8
%%loop1:
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare
add %%result, 8
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare
add %%result, 8
cmp %%result, 256
jb %%loop1
; compare last two bytes
mov %%tmp16, [%%src1 + %%result]
xor %%tmp16, [%%src2 + %%result]
jnz %%miscompare16
; no miscompares, return 258
add %%result, 2
jmp %%end
%%miscompare16:
and %%tmp, 0xFFFF
%%miscompare:
bsf %%tmp, %%tmp
shr %%tmp, 3
add %%result, %%tmp
%%end:
%endm
;; compare 258 bytes = 8 * 32 + 2
;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
;; compare258_x src1, src2, result, tmp, xtmp1, xtmp2
%macro compare258_x 6
;; assumes the input buffer has size at least 8
;; compare_x src1, src2, result, result_max, tmp, xtmp1, xtmp2
%macro compare_x 7
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp32 %4d
%define %%tmp16 %4w ; tmp as a 16-bit register
%define %%xtmp %5
%define %%xtmp2 %6
%define %%result %3 ; Accumulator for match_length
%define %%result_max %4
%define %%tmp %5
%define %%tmp16 %5w ; tmp as a 16-bit register
%define %%tmp32 %5d ; tmp as a 32-bit register
%define %%xtmp %6
%define %%xtmp2 %7
sub %%result_max, 32
cmp %%result, %%result_max
jg %%_by_16
xor %%result, %%result
%%loop1:
MOVDQU %%xtmp, [%%src1 + %%result]
MOVDQU %%xtmp2, [%%src2 + %%result]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare
jnz %%miscompare_vect
add %%result, 16
MOVDQU %%xtmp, [%%src1 + %%result]
@ -156,120 +128,86 @@
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare
jnz %%miscompare_vect
add %%result, 16
cmp %%result, 256
jb %%loop1
cmp %%result, %%result_max
jle %%loop1
%%_by_16:
add %%result_max, 16
cmp %%result, %%result_max
jg %%_by_8
MOVDQU %%xtmp, [%%src1 + %%result]
MOVDQU %%xtmp2, [%%src2 + %%result]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare_vect
add %%result, 16
%%_by_8:
add %%result_max, 8
cmp %%result, %%result_max
jg %%_cmp_last
; compare last two bytes
mov %%tmp16, [%%src1 + %%result]
xor %%tmp16, [%%src2 + %%result]
jnz %%miscompare16
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
; no miscompares, return 258
add %%result, 2
%%_cmp_last:
add %%result_max, 8
cmp %%result, %%result_max
je %%end
lea %%result, [%%result_max - 8]
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
jmp %%end
%%miscompare16:
and %%tmp, 0xFFFF
%%miscompare_reg:
bsf %%tmp, %%tmp
shr %%tmp, 3
add %%result, %%tmp
jmp %%end
%%miscompare:
%%miscompare_vect:
bsf %%tmp, %%tmp
add %%result, %%tmp
%%end:
%endm
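(The same search done 16 bytes at a time, as compare_x does with PCMPEQB/PMOVMSKB, looks roughly like this with SSE2 intrinsics; the real macro falls through to 16-, 8- and overlapping last-word steps for the tail, which are omitted here.)

    #include <emmintrin.h>
    #include <stdint.h>

    /* length of the common prefix checked in 16-byte blocks; the remainder
     * (fewer than 16 bytes) is left to the caller, as in the macro's tail code */
    static uint64_t match_len_by_16(const uint8_t *src1, const uint8_t *src2,
                                    uint64_t result_max)
    {
            uint64_t i = 0;

            while (i + 16 <= result_max) {
                    __m128i a = _mm_loadu_si128((const __m128i *)(src1 + i));
                    __m128i b = _mm_loadu_si128((const __m128i *)(src2 + i));
                    unsigned mask = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
                    if (mask != 0xFFFF)                      /* some byte differs */
                            return i + __builtin_ctz(~mask & 0xFFFF);
                    i += 16;
            }
            return i;      /* caller finishes the tail with narrower compares */
    }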
;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
;; were already checked
;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
;; compare250_x src1, src2, result, tmp, xtmp1, xtmp2
%macro compare250_x 6
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp32 %4d ; tmp as a 32-bit register
%define %%xtmp %5
%define %%xtmp2 %6
mov %%result, 8
MOVDQU %%xtmp, [%%src1 + 8]
MOVDQU %%xtmp2, [%%src2 + 8]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare
add %%result, 16
%%loop1:
MOVDQU %%xtmp, [%%src1 + %%result]
MOVDQU %%xtmp2, [%%src2 + %%result]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare
add %%result, 16
MOVDQU %%xtmp, [%%src1 + %%result]
MOVDQU %%xtmp2, [%%src2 + %%result]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare
add %%result, 16
cmp %%result, 258 - 16
jb %%loop1
MOVDQU %%xtmp, [%%src1 + %%result]
MOVDQU %%xtmp2, [%%src2 + %%result]
PCMPEQB %%xtmp, %%xtmp, %%xtmp2
PMOVMSKB %%tmp32, %%xtmp
xor %%tmp, 0xFFFF
jnz %%miscompare_last
; no miscompares, return 258
mov %%result, 258
jmp %%end
%%miscompare_last:
bsf %%tmp, %%tmp
add %%result, %%tmp
;; Guarantee the result has length at most 258.
mov %%tmp, 258
cmp %%result, 258
cmova %%result, %%tmp
jmp %%end
%%miscompare:
bsf %%tmp, %%tmp
add %%result, %%tmp
%%end:
%endm
;; compare 258 bytes = 8 * 32 + 2
;; compares 32 bytes at a time, using pcmpeqb/pmovmskb
;; compare258_y src1, src2, result, tmp, xtmp1, xtmp2
%macro compare258_y 6
;; assumes the input buffer has size at least 8
;; compare_y src1, src2, result, result_max, tmp, xtmp1, xtmp2
%macro compare_y 7
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp16 %4w ; tmp as a 16-bit register
%define %%tmp32 %4d ; tmp as a 32-bit register
%define %%ytmp %5
%define %%ytmp2 %6
%define %%result %3 ; Accumulator for match_length
%define %%result_max %4
%define %%tmp %5
%define %%tmp16 %5w ; tmp as a 16-bit register
%define %%tmp32 %5d ; tmp as a 32-bit register
%define %%ytmp %6
%define %%ytmp2 %7
sub %%result_max, 64
cmp %%result, %%result_max
jg %%_by_32
xor %%result, %%result
%%loop1:
vmovdqu %%ytmp, [%%src1 + %%result]
vmovdqu %%ytmp2, [%%src2 + %%result]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare
jnz %%miscompare_vect
add %%result, 32
vmovdqu %%ytmp, [%%src1 + %%result]
@ -277,123 +215,125 @@
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare
jnz %%miscompare_vect
add %%result, 32
cmp %%result, 256
jb %%loop1
cmp %%result, %%result_max
jle %%loop1
%%_by_32:
add %%result_max, 32
cmp %%result, %%result_max
jg %%_by_16
vmovdqu %%ytmp, [%%src1 + %%result]
vmovdqu %%ytmp2, [%%src2 + %%result]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare_vect
add %%result, 32
%%_by_16:
add %%result_max, 16
cmp %%result, %%result_max
jg %%_by_8
vmovdqu %%ytmp %+ x, [%%src1 + %%result]
vmovdqu %%ytmp2 %+ x, [%%src2 + %%result]
vpcmpeqb %%ytmp %+ x, %%ytmp %+ x, %%ytmp2 %+ x
vpmovmskb %%tmp, %%ytmp %+ x
xor %%tmp32, 0xFFFF
jnz %%miscompare_vect
add %%result, 16
%%_by_8:
add %%result_max, 8
cmp %%result, %%result_max
jg %%_cmp_last
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
%%_cmp_last:
add %%result_max, 8
cmp %%result, %%result_max
je %%end
lea %%result, [%%result_max - 8]
; compare last two bytes
mov %%tmp16, [%%src1 + %%result]
xor %%tmp16, [%%src2 + %%result]
jnz %%miscompare16
; no miscompares, return 258
add %%result, 2
mov %%tmp, [%%src1 + %%result]
xor %%tmp, [%%src2 + %%result]
jnz %%miscompare_reg
add %%result, 8
jmp %%end
%%miscompare16:
and %%tmp, 0xFFFF
%%miscompare_reg:
bsf %%tmp, %%tmp
shr %%tmp, 3
add %%result, %%tmp
jmp %%end
%%miscompare:
%%miscompare_vect:
bsf %%tmp, %%tmp
add %%result, %%tmp
%%end:
%endm
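(compare_y is the same pattern widened to 32 bytes with AVX2, using VPCMPEQB/VPMOVMSKB; one step of it corresponds to roughly the following intrinsics.)

    #include <immintrin.h>
    #include <stdint.h>

    /* one 32-byte step: returns 32 if all bytes match, otherwise the index
     * of the first differing byte */
    static int first_mismatch_32(const uint8_t *src1, const uint8_t *src2)
    {
            __m256i a = _mm256_loadu_si256((const __m256i *)src1);
            __m256i b = _mm256_loadu_si256((const __m256i *)src2);
            uint32_t mask = (uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b));

            if (mask == 0xFFFFFFFFu)
                    return 32;
            return __builtin_ctz(~mask);
    }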
;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
;; were already checked
;; compares 32 bytes at a time, using pcmpeqb/pmovmskb
;; compare258_y src1, src2, result, tmp, xtmp1, xtmp2
%macro compare250_y 6
%macro compare250 7
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%tmp16 %4w ; tmp as a 16-bit register
%define %%tmp32 %4d ; tmp as a 32-bit register
%define %%ytmp %5
%define %%ytmp2 %6
%define %%result_max %4
%define %%tmp %5
%define %%xtmp0 %6x
%define %%xtmp1 %7x
%define %%ytmp0 %6
%define %%ytmp1 %7
mov %%result, 8
vmovdqu %%ytmp, [%%src1 + 8]
vmovdqu %%ytmp2, [%%src2 + 8]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare
add %%result, 32
%%loop1:
vmovdqu %%ytmp, [%%src1 + %%result]
vmovdqu %%ytmp2, [%%src2 + %%result]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare
add %%result, 32
vmovdqu %%ytmp, [%%src1 + %%result]
vmovdqu %%ytmp2, [%%src2 + %%result]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare
add %%result, 32
cmp %%result, 258 - 32
jb %%loop1
vmovdqu %%ytmp, [%%src1 + %%result]
vmovdqu %%ytmp2, [%%src2 + %%result]
vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
vpmovmskb %%tmp, %%ytmp
xor %%tmp32, 0xFFFFFFFF
jnz %%miscompare_last
mov %%result, 258
jmp %%end
%%miscompare_last:
bsf %%tmp, %%tmp
add %%result, %%tmp
;; Guarantee the result has length at most 258.
mov %%tmp, 258
cmp %%result, 258
cmova %%result, %%tmp
jmp %%end
%%miscompare:
bsf %%tmp, %%tmp
add %%result, %%tmp
%%end:
%endm
%macro compare250 6
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%tmp %4
%define %%xtmp0 %5x
%define %%xtmp1 %6x
%define %%ytmp0 %5
%define %%ytmp1 %6
mov %%tmp, 250
cmp %%result_max, 250
cmovg %%result_max, %%tmp
%if (COMPARE_TYPE == 1)
compare250_r %%src1, %%src2, %%result, %%tmp
compare_r %%src1, %%src2, %%result, %%result_max, %%tmp
%elif (COMPARE_TYPE == 2)
compare250_x %%src1, %%src2, %%result, %%tmp, %%xtmp0, %%xtmp1
compare_x %%src1, %%src2, %%result, %%result_max, %%tmp, %%xtmp0, %%xtmp1
%elif (COMPARE_TYPE == 3)
compare250_y %%src1, %%src2, %%result, %%tmp, %%ytmp0, %%ytmp1
compare_y %%src1, %%src2, %%result, %%result_max, %%tmp, %%ytmp0, %%ytmp1
%else
%error Unknown Compare type COMPARE_TYPE
% error
%endif
%endmacro
; Assumes the buffer has at least 8 bytes
; Accumulates match length onto result
%macro compare_large 7
%define %%src1 %1
%define %%src2 %2
%define %%result %3
%define %%result_max %4
%define %%tmp %5
%define %%xtmp0 %6x
%define %%xtmp1 %7x
%define %%ytmp0 %6
%define %%ytmp1 %7
%if (COMPARE_TYPE == 1)
compare_r %%src1, %%src2, %%result, %%result_max, %%tmp
%elif (COMPARE_TYPE == 2)
compare_x %%src1, %%src2, %%result, %%result_max, %%tmp, %%xtmp0, %%xtmp1
%elif (COMPARE_TYPE == 3)
compare_y %%src1, %%src2, %%result, %%result_max, %%tmp, %%ytmp0, %%ytmp1
%else
%error Unknown Compare type COMPARE_TYPE
% error
%endif
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@ -47,6 +47,9 @@ global %1
%endm
%endif
%define LARGE_MATCH_HASH_REP 1 ; Hash 4 * LARGE_MATCH_HASH_REP elements
%define LARGE_MATCH_MIN 264 ; Minimum match size to enter large match emit loop
%define MIN_INBUF_PADDING 16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -76,6 +79,7 @@ global %1
%define len2 r8
%define tmp4 r8
%define hmask1 r8
%define len_code2 r8
%define len rdx
%define len_code rdx
@ -110,9 +114,10 @@ dist_mask_offset equ 16
hash_mask_offset equ 24
f_end_i_mem_offset equ 32
stream_offset equ 40
gpr_save_mem_offset equ 48 ; gpr save area (8*8 bytes)
inbuf_slop_offset equ 48
gpr_save_mem_offset equ 64 ; gpr save area (8*8 bytes)
xmm_save_mem_offset equ gpr_save_mem_offset + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
stack_size equ 7*8 + 8*8 + 4*16
stack_size equ 9*8 + 8*8 + 4*16
;;; 8 because stack address is odd multiple of 8 after a function call and
;;; we want it aligned to 16 bytes
@ -208,8 +213,16 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
mov file_length %+ d, [stream + _avail_in]
add file_length, f_i
; file_length -= LA;
sub file_length, LA
mov qword [rsp + inbuf_slop_offset], MIN_INBUF_PADDING
cmp byte [stream + _end_of_stream], 0
jnz .default_inbuf_padding
cmp byte [stream + _flush], 0
jnz .default_inbuf_padding
mov qword [rsp + inbuf_slop_offset], LA
.default_inbuf_padding:
; file_length -= INBUF_PADDING;
sub file_length, [rsp + inbuf_slop_offset]
; if (file_length <= 0) continue;
mov hmask1 %+ d, [rsp + hash_mask_offset]
@ -220,7 +233,6 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
mov tmp1, curr_data
mov tmp2, curr_data
compute_hash hash, curr_data
@ -295,6 +307,7 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
test len %+ d, 0xFFFFFFFF
jz .len_dist_huffman_pre
PSRLDQ xdata, 1
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code]
movzx lit_code2, curr_data %+ b
;; Check for len/dist match for second literal
@ -318,9 +331,15 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
;; Setup for updating hash
lea tmp3, [f_i + 1] ; tmp3 <= k
mov tmp2, f_i
add file_start, f_i
add f_i, len2
cmp f_i, file_length
jg .len_dist_lit_huffman_finish
MOVDQU xdata, [file_start + len2]
mov tmp1, [file_start + len2]
sub file_start, tmp2
shr curr_data, 24
compute_hash hash3, curr_data
@ -329,9 +348,6 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
mov curr_data, tmp1
shr tmp1, 8
sub file_start, f_i
add f_i, len2
mov [hash_table + 2 * hash], tmp3 %+ w
compute_hash hash, curr_data
@ -361,10 +377,28 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
and hash2 %+ d, hmask3 %+ d
; continue
cmp f_i, file_length
jl .loop2
jmp .loop2
.len_dist_lit_huffman_finish:
sub file_start, tmp2
mov [hash_table + 2 * hash], tmp3 %+ w
add tmp3,1
mov [hash_table + 2 * hash2], tmp3 %+ w
add dist_code2, 254
add dist_code2, len2
inc dword [lit_len_hist + HIST_ELEM_SIZE*(len2 + 254)]
movnti dword [m_out_buf + 4], dist_code2 %+ d
add m_out_buf, 8
shr dist_code2, DIST_OFFSET
and dist_code2, 0x1F
inc dword [dist_hist + HIST_ELEM_SIZE*dist_code2]
jmp .input_end
;; encode as dist/len
.len_dist_huffman_pre:
bsf len, len
@ -380,14 +414,21 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
; get_dist_code(dist, &code2, &code_len2);
get_dist_icf_code dist, dist_code, tmp1
.len_dist_huffman_skip:
mov hmask2 %+ d, [rsp + hash_mask_offset]
mov tmp1, f_i
add file_start, f_i
add f_i, len
cmp f_i, file_length
jg .len_dist_huffman_finish
MOVDQU xdata, [file_start + len]
mov curr_data2, [file_start + len]
mov curr_data, curr_data2
sub file_start, f_i
add f_i, len
sub file_start, tmp1
; get_len_code(len, &code, &code_len);
lea len_code, [len + 254]
or dist_code, len_code
@ -415,15 +456,39 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
and hash2 %+ d, hmask2 %+ d
; continue
cmp f_i, file_length
jl .loop2
jmp .loop2
.len_dist_huffman_finish:
sub file_start, tmp1
; get_len_code(len, &code, &code_len);
lea len_code, [len + 254]
or dist_code, len_code
mov [hash_table + 2 * hash], tmp3 %+ w
add tmp3,1
mov [hash_table + 2 * hash2], tmp3 %+ w
inc dword [lit_len_hist + HIST_ELEM_SIZE*len_code]
movnti dword [m_out_buf], dist_code %+ d
add m_out_buf, 4
shr dist_code, DIST_OFFSET
and dist_code, 0x1F
inc dword [dist_hist + HIST_ELEM_SIZE*dist_code]
jmp .input_end
.write_lit_bits:
MOVDQU xdata, [file_start + f_i + 1]
add f_i, 1
MOVQ curr_data, xdata
add f_i, 1
cmp f_i, file_length
jg .write_lit_bits_finish
MOVDQU xdata, [file_start + f_i]
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2]
shl lit_code2, DIST_OFFSET
@ -432,9 +497,16 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
movnti dword [m_out_buf], lit_code %+ d
add m_out_buf, 4
; continue
cmp f_i, file_length
jl .loop2
jmp .loop2
.write_lit_bits_finish:
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2]
shl lit_code2, DIST_OFFSET
lea lit_code, [lit_code + lit_code2 + (31 << DIST_OFFSET)]
movnti dword [m_out_buf], lit_code %+ d
add m_out_buf, 4
.input_end:
mov stream, [rsp + stream_offset]
@ -454,7 +526,7 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
.end:
;; update input buffer
add file_length, LA
add file_length, [rsp + inbuf_slop_offset]
mov [stream + _total_in], f_i %+ d
mov [stream + _internal_state_block_end], f_i %+ d
add file_start, f_i
@ -487,21 +559,143 @@ isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
.compare_loop:
lea tmp2, [tmp1 + dist - 1]
compare250 tmp1, tmp2, len, tmp3, ytmp0, ytmp1
mov len2, file_length
sub len2, f_i
add len2, [rsp + inbuf_slop_offset]
add len2, 1
mov len, 8
compare_large tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
cmp len, 258
jle .len_dist_huffman
cmp len, LARGE_MATCH_MIN
jge .do_emit
mov len, 258
jmp .len_dist_huffman
.compare_loop2:
lea tmp2, [tmp1 + dist2]
add tmp1, 1
compare250 tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
mov len, file_length
sub len, f_i
add len, [rsp + inbuf_slop_offset]
mov len2, 8
compare_large tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
movzx lit_code, curr_data %+ b
shr curr_data, 8
inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code]
cmp len2, 258
jle .len_dist_lit_huffman
cmp len2, LARGE_MATCH_MIN
jge .do_emit2
mov len2, 258
jmp .len_dist_lit_huffman
.do_emit2:
or lit_code, LIT
movnti dword [m_out_buf], lit_code %+ d
add m_out_buf, 4
inc f_i
mov dist, dist2
mov len, len2
.do_emit:
neg dist
get_dist_icf_code dist, dist_code, tmp1
mov len_code2, 258 + 254
or len_code2, dist_code
mov tmp1, dist_code
shr tmp1, DIST_OFFSET
and tmp1, 0x1F
lea tmp3, [f_i + 1]
dec f_i
mov [hash_table + 2 * hash], tmp3 %+ w
add tmp3,1
mov [hash_table + 2 * hash2], tmp3 %+ w
.emit:
sub len, 258
add f_i, 258
inc dword [lit_len_hist + HIST_ELEM_SIZE*(258 + 254)]
inc dword [dist_hist + HIST_ELEM_SIZE*tmp1]
movnti dword [m_out_buf], len_code2 %+ d
add m_out_buf, 4
cmp m_out_buf, [rsp + m_out_end]
ja .output_end
cmp len, LARGE_MATCH_MIN
jge .emit
mov len2, 258
cmp len, len2
cmovg len, len2
; get_len_code(len, &code, &code_len);
add f_i, len
lea len_code, [len + 254]
or dist_code, len_code
inc dword [lit_len_hist + HIST_ELEM_SIZE*len_code]
inc dword [dist_hist + HIST_ELEM_SIZE*tmp1]
movnti dword [m_out_buf], dist_code %+ d
add m_out_buf, 4
cmp file_length, f_i
jle .input_end
lea tmp2, [f_i - 4 * LARGE_MATCH_HASH_REP]
mov hmask2 %+ d, [rsp + hash_mask_offset]
%rep LARGE_MATCH_HASH_REP
mov curr_data %+ d, dword [file_start + tmp2]
mov curr_data2 %+ d, dword [file_start + tmp2 + 1]
mov tmp3 %+ d, dword [file_start + tmp2 + 2]
mov tmp1 %+ d, dword [file_start + tmp2 + 3]
compute_hash hash, curr_data
compute_hash hash2, curr_data2
compute_hash hash3, tmp3
compute_hash hmask3, tmp1
and hash %+ d, hmask2 %+ d
and hash2 %+ d, hmask2 %+ d
and hash3 %+ d, hmask2 %+ d
and hmask3 %+ d, hmask2 %+ d
mov [hash_table + 2 * hash], tmp2 %+ w
add tmp2, 1
mov [hash_table + 2 * hash2], tmp2 %+ w
add tmp2, 1
mov [hash_table + 2 * hash3], tmp2 %+ w
add tmp2, 1
mov [hash_table + 2 * hmask3], tmp2 %+ w
%if (LARGE_MATCH_HASH_REP > 1)
add tmp2, 1
%endif
%endrep
; for (f_i = f_start_i; f_i < file_length; f_i++) {
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
mov tmp1, curr_data
compute_hash hash, curr_data
shr tmp1, 8
compute_hash hash2, tmp1
and hash, hmask2
and hash2, hmask2
jmp .loop2
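(The ICF variant of the large-match emit above writes 32-bit ICF tokens instead of Huffman bits. A rough C model follows; the token layout constant DIST_OFFSET and the histogram types are assumptions for illustration, dist_code is taken to be the packed value produced by get_dist_icf_code, and the f_i advancement and output-space checks are omitted. Histograms are indexed as the assembly does: lit_len_hist[len + 254] and dist_hist[(dist_code >> DIST_OFFSET) & 0x1F].)

    #include <stdint.h>

    #define LARGE_MATCH_MIN 264
    #define DIST_OFFSET 10   /* assumed position of the dist-code field in an ICF token */

    static void emit_large_match_icf(uint32_t dist_code, uint64_t len,
                                     uint32_t **m_out_buf,
                                     uint32_t *lit_len_hist, uint32_t *dist_hist)
    {
            uint32_t dcode_idx = (dist_code >> DIST_OFFSET) & 0x1F;

            while (len >= LARGE_MATCH_MIN) {
                    *(*m_out_buf)++ = dist_code | (258 + 254);   /* maximal-length token */
                    lit_len_hist[258 + 254]++;
                    dist_hist[dcode_idx]++;
                    len -= 258;
            }
            if (len > 258)                                       /* final token, capped at 258 */
                    len = 258;
            *(*m_out_buf)++ = dist_code | (uint32_t)(len + 254);
            lit_len_hist[len + 254]++;
            dist_hist[dcode_idx]++;
    }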
.write_first_byte:
mov hmask1 %+ d, [rsp + hash_mask_offset]
cmp m_out_buf, [rsp + m_out_end]

View File

@ -545,7 +545,9 @@ compare_loop:
and hash2 %+ d, LVL0_HASH_MASK
lea tmp2, [tmp1 + dist - 1]
compare250 tmp1, tmp2, len, tmp3, ytmp0, ytmp1
mov len2, 250
mov len, 8
compare250 tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
lea tmp3, [f_i + 1]
jmp len_dist_huffman
@ -554,7 +556,9 @@ compare_loop2:
add tmp1, 1
lea tmp2, [tmp1 + dist2 - 1]
compare250 tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
mov len, 250
mov len2, 8
compare250 tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
and curr_data, 0xff
inc qword [histogram + _lit_len_offset + 8 * curr_data]