igzip: Modify encode_df_04 to behave more like encode_df_06 algorithm

Change-Id: I39c5d0d8182efb0fe8aa6bea97d9361df4ee8ddf Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
2024-12-13 09:52:56 +01:00 · 2017-04-25 13:35:53 -07:00 · 2017-04-25 13:35:53 -07:00 · edacadc8fb
commit edacadc8fb
parent 5a55e3096c
1 changed files with 34 additions and 17 deletions
--- a/igzip/encode_df_04.asm
+++ b/igzip/encode_df_04.asm
@ -246,6 +246,13 @@ encode_deflate_icf_ %+ ARCH:
 	vpsrldq	codes1, 8
 	vpsrldq	code_lens1, 8
 	;; Bit align dqwords
 	vpaddq	code_lens1, code_lens1, code_lens2
 	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits
 	vpermq	ybits_count, ybits_count, 0xcf
 	vpaddq	code_lens2, ybits_count
 	vpsllvq	codes2, codes2, ybits_count
 	;; Merge two qwords into dqwords
 	vmovdqa	ytmp, [q_64]
 	vpsubq	code_lens3, ytmp, code_lens2
@ -256,27 +263,30 @@ encode_deflate_icf_ %+ ARCH:
 	vpxor	codes1, codes1, codes3
 	vpxor	codes1, codes1, codes2
 	vpaddq	code_lens1, code_lens1, code_lens2
 	vmovq	tmp, code_lens1 %+ x 	;Number of bytes
 	shr	tmp, 3
-	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits
+
 	;; Extract last bytes
 	vpaddq	code_lens2, code_lens1, ybits_count
 	vpsrlq	code_lens2, code_lens2, 3
 	vpshufb	codes2, codes1, code_lens2
 	vpand	codes2, codes2, [bytes_mask]
 	vextracti128	ybits %+ x, codes2, 1
 	;; Check for short codes
 	vptest code_lens2, [min_write_mask]
 	jz	.short_codes
 .short_codes_next
 	vpermq	codes2, codes2, 0x45
 	vpor	codes1, codes1, codes2
 	;; bit shift upper dqword combined bits to line up with lower dqword
 	vextracti128	codes2 %+ x, codes1, 1
 	vextracti128	code_lens2 %+ x, code_lens1, 1
 	vpbroadcastq	ybits_count, ybits_count %+ x
 	vpsrldq	codes3, codes2, 1
 	vpsllvq	codes2, codes2, ybits_count
 	vpsllvq	codes3, codes3, ybits_count
 	vpslldq	codes3, codes3, 1
 	vpor	codes2, codes2, codes3
 	; Write out lower dqword of combined bits
 	vmovdqu	[out_buf], codes1
 	movzx	bits, byte [out_buf + tmp]
 	vmovq	codes1 %+ x, bits
 	vpaddq	code_lens1, code_lens1, code_lens2
 	vmovq	tmp2, code_lens1 %+ x	;Number of bytes
@ -284,11 +294,8 @@ encode_deflate_icf_ %+ ARCH:
 	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits
 	; Write out upper dqword of combined bits
-	vpor	codes1 %+ x, codes1 %+ x, codes2 %+ x
+	vextracti128	[out_buf + tmp], codes1, 1
 	vmovdqu	[out_buf + tmp], codes1 %+ x
 	add	out_buf, tmp2
 	movzx	bits, byte [out_buf]
 	vmovq	ybits %+ x, bits
 	cmp	ptr, in_buf_end
 	jbe	.main_loop
@ -298,6 +305,11 @@ encode_deflate_icf_ %+ ARCH:
 	vmovq	bits, ybits %+ x
 	jmp	.finish
 .short_codes:
 	;; Merge last bytes when the second dqword contains less than a byte
 	vpor ybits %+ x, codes2 %+ x
 	jmp .short_codes_next
 .long_codes:
 	add	end_ptr, VECTOR_SLOP
 	sub	ptr, VECTOR_SIZE
@ -509,7 +521,9 @@ encode_deflate_icf_ %+ ARCH:
 section .data
 	align 32
 max_write_d:
-	dd	0x1c, 0x1d, 0x20, 0x20, 0x1e, 0x1e, 0x1e, 0x1e
+	dd	0x1c, 0x1d, 0x1f, 0x20, 0x1c, 0x1d, 0x1f, 0x20
 min_write_mask:
 	dq	0x00, 0x00, 0xff, 0x00
 offset_mask:
 	dq	0x0000000000000007, 0x0000000000000000
 	dq	0x0000000000000000, 0x0000000000000000
@ -528,3 +542,6 @@ lit_icr_mask:
 eb_icr_mask:
 	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
 	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
 bytes_mask:
 	dq	0x00000000000000ff, 0x0000000000000000
 	dq	0x00000000000000ff, 0x0000000000000000