mirror of
https://github.com/intel/isa-l.git
synced 2024-12-13 09:52:56 +01:00
igzip: Modify encode_df_04 to behave more like encode_df_06 algorithm
Change-Id: I39c5d0d8182efb0fe8aa6bea97d9361df4ee8ddf
Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
This commit is contained in:
parent
5a55e3096c
commit
edacadc8fb
@ -246,6 +246,13 @@ encode_deflate_icf_ %+ ARCH:
|
|||||||
vpsrldq codes1, 8
|
vpsrldq codes1, 8
|
||||||
vpsrldq code_lens1, 8
|
vpsrldq code_lens1, 8
|
||||||
|
|
||||||
|
;; Bit align dqwords
|
||||||
|
vpaddq code_lens1, code_lens1, code_lens2
|
||||||
|
vpand ybits_count, code_lens1, yoffset_mask ;Extra bits
|
||||||
|
vpermq ybits_count, ybits_count, 0xcf
|
||||||
|
vpaddq code_lens2, ybits_count
|
||||||
|
vpsllvq codes2, codes2, ybits_count
|
||||||
|
|
||||||
;; Merge two qwords into dqwords
|
;; Merge two qwords into dqwords
|
||||||
vmovdqa ytmp, [q_64]
|
vmovdqa ytmp, [q_64]
|
||||||
vpsubq code_lens3, ytmp, code_lens2
|
vpsubq code_lens3, ytmp, code_lens2
|
||||||
@ -256,27 +263,30 @@ encode_deflate_icf_ %+ ARCH:
|
|||||||
|
|
||||||
vpxor codes1, codes1, codes3
|
vpxor codes1, codes1, codes3
|
||||||
vpxor codes1, codes1, codes2
|
vpxor codes1, codes1, codes2
|
||||||
vpaddq code_lens1, code_lens1, code_lens2
|
|
||||||
|
|
||||||
vmovq tmp, code_lens1 %+ x ;Number of bytes
|
vmovq tmp, code_lens1 %+ x ;Number of bytes
|
||||||
shr tmp, 3
|
shr tmp, 3
|
||||||
vpand ybits_count, code_lens1, yoffset_mask ;Extra bits
|
|
||||||
|
;; Extract last bytes
|
||||||
|
vpaddq code_lens2, code_lens1, ybits_count
|
||||||
|
vpsrlq code_lens2, code_lens2, 3
|
||||||
|
vpshufb codes2, codes1, code_lens2
|
||||||
|
vpand codes2, codes2, [bytes_mask]
|
||||||
|
vextracti128 ybits %+ x, codes2, 1
|
||||||
|
|
||||||
|
;; Check for short codes
|
||||||
|
vptest code_lens2, [min_write_mask]
|
||||||
|
jz .short_codes
|
||||||
|
.short_codes_next
|
||||||
|
|
||||||
|
vpermq codes2, codes2, 0x45
|
||||||
|
vpor codes1, codes1, codes2
|
||||||
|
|
||||||
;; bit shift upper dqword combined bits to line up with lower dqword
|
;; bit shift upper dqword combined bits to line up with lower dqword
|
||||||
vextracti128 codes2 %+ x, codes1, 1
|
|
||||||
vextracti128 code_lens2 %+ x, code_lens1, 1
|
vextracti128 code_lens2 %+ x, code_lens1, 1
|
||||||
|
|
||||||
vpbroadcastq ybits_count, ybits_count %+ x
|
|
||||||
vpsrldq codes3, codes2, 1
|
|
||||||
vpsllvq codes2, codes2, ybits_count
|
|
||||||
vpsllvq codes3, codes3, ybits_count
|
|
||||||
vpslldq codes3, codes3, 1
|
|
||||||
vpor codes2, codes2, codes3
|
|
||||||
|
|
||||||
; Write out lower dqword of combined bits
|
; Write out lower dqword of combined bits
|
||||||
vmovdqu [out_buf], codes1
|
vmovdqu [out_buf], codes1
|
||||||
movzx bits, byte [out_buf + tmp]
|
|
||||||
vmovq codes1 %+ x, bits
|
|
||||||
vpaddq code_lens1, code_lens1, code_lens2
|
vpaddq code_lens1, code_lens1, code_lens2
|
||||||
|
|
||||||
vmovq tmp2, code_lens1 %+ x ;Number of bytes
|
vmovq tmp2, code_lens1 %+ x ;Number of bytes
|
||||||
@ -284,11 +294,8 @@ encode_deflate_icf_ %+ ARCH:
|
|||||||
vpand ybits_count, code_lens1, yoffset_mask ;Extra bits
|
vpand ybits_count, code_lens1, yoffset_mask ;Extra bits
|
||||||
|
|
||||||
; Write out upper dqword of combined bits
|
; Write out upper dqword of combined bits
|
||||||
vpor codes1 %+ x, codes1 %+ x, codes2 %+ x
|
vextracti128 [out_buf + tmp], codes1, 1
|
||||||
vmovdqu [out_buf + tmp], codes1 %+ x
|
|
||||||
add out_buf, tmp2
|
add out_buf, tmp2
|
||||||
movzx bits, byte [out_buf]
|
|
||||||
vmovq ybits %+ x, bits
|
|
||||||
|
|
||||||
cmp ptr, in_buf_end
|
cmp ptr, in_buf_end
|
||||||
jbe .main_loop
|
jbe .main_loop
|
||||||
@ -298,6 +305,11 @@ encode_deflate_icf_ %+ ARCH:
|
|||||||
vmovq bits, ybits %+ x
|
vmovq bits, ybits %+ x
|
||||||
jmp .finish
|
jmp .finish
|
||||||
|
|
||||||
|
.short_codes:
|
||||||
|
;; Merge last bytes when the second dqword contains less than a byte
|
||||||
|
vpor ybits %+ x, codes2 %+ x
|
||||||
|
jmp .short_codes_next
|
||||||
|
|
||||||
.long_codes:
|
.long_codes:
|
||||||
add end_ptr, VECTOR_SLOP
|
add end_ptr, VECTOR_SLOP
|
||||||
sub ptr, VECTOR_SIZE
|
sub ptr, VECTOR_SIZE
|
||||||
@ -509,7 +521,9 @@ encode_deflate_icf_ %+ ARCH:
|
|||||||
section .data
|
section .data
|
||||||
align 32
|
align 32
|
||||||
max_write_d:
|
max_write_d:
|
||||||
dd 0x1c, 0x1d, 0x20, 0x20, 0x1e, 0x1e, 0x1e, 0x1e
|
dd 0x1c, 0x1d, 0x1f, 0x20, 0x1c, 0x1d, 0x1f, 0x20
|
||||||
|
min_write_mask:
|
||||||
|
dq 0x00, 0x00, 0xff, 0x00
|
||||||
offset_mask:
|
offset_mask:
|
||||||
dq 0x0000000000000007, 0x0000000000000000
|
dq 0x0000000000000007, 0x0000000000000000
|
||||||
dq 0x0000000000000000, 0x0000000000000000
|
dq 0x0000000000000000, 0x0000000000000000
|
||||||
@ -528,3 +542,6 @@ lit_icr_mask:
|
|||||||
eb_icr_mask:
|
eb_icr_mask:
|
||||||
dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
|
dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
|
||||||
dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
|
dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
|
||||||
|
bytes_mask:
|
||||||
|
dq 0x00000000000000ff, 0x0000000000000000
|
||||||
|
dq 0x00000000000000ff, 0x0000000000000000
|
||||||
|
Loading…
Reference in New Issue
Block a user