mirror of
https://github.com/intel/isa-l.git
synced 2025-02-24 23:34:45 +01:00
igzip: Create AVX2 optimized version of level 3
Change-Id: Icfdb67445ee5afff85441cfee23beb66bfe15d5e Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
This commit is contained in:
parent
52bb322912
commit
9acc3ed2ac
@ -135,6 +135,7 @@ objs = \
|
||||
bin\adler32_sse.obj \
|
||||
bin\adler32_avx2_4.obj \
|
||||
bin\igzip_deflate_hash.obj \
|
||||
bin\igzip_gen_icf_map_lh1_04.obj \
|
||||
bin\igzip_gen_icf_map_lh1_06.obj \
|
||||
bin\igzip_set_long_icf_fg_06.obj \
|
||||
bin\igzip_icf_body.obj
|
||||
|
@ -60,6 +60,7 @@ lsrc_x86_64 += igzip/igzip_body_01.asm \
|
||||
igzip/proc_heap.asm \
|
||||
igzip/igzip_deflate_hash.asm \
|
||||
igzip/igzip_gen_icf_map_lh1_06.asm \
|
||||
igzip/igzip_gen_icf_map_lh1_04.asm \
|
||||
igzip/igzip_set_long_icf_fg_06.asm
|
||||
|
||||
src_include += -I $(srcdir)/igzip
|
||||
|
626
igzip/igzip_gen_icf_map_lh1_04.asm
Normal file
626
igzip/igzip_gen_icf_map_lh1_04.asm
Normal file
@ -0,0 +1,626 @@
|
||||
%include "reg_sizes.asm"
|
||||
%include "lz0a_const.asm"
|
||||
%include "data_struct2.asm"
|
||||
|
||||
;; Integer argument / scratch register assignments, selected per output format.
;; The win64 branch uses rcx, rdx, r8 for arg1..arg3; every other format uses
;; rdi, rsi, rdx. On win64 the scratch aliases hash and next_in map to rsi and
;; rdi, which the win64 FUNC_SAVE/FUNC_RESTORE macros in this file save/restore.
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg1 rcx
|
||||
%define arg2 rdx
|
||||
%define arg3 r8
|
||||
%define hash rsi
|
||||
%define next_in rdi
|
||||
%else
|
||||
%define arg1 rdi
|
||||
%define arg2 rsi
|
||||
%define arg3 rdx
|
||||
%define hash r8
|
||||
%define next_in rcx
|
||||
%endif
|
||||
|
||||
;; stream and level_buf intentionally name the same register (arg1): the
;; function overwrites the stream pointer with [stream + _level_buf].
%define stream arg1
|
||||
%define level_buf arg1
|
||||
;; arg2: destination pointer that the function stores result vectors through.
%define matches_next arg2
|
||||
;; arg3: the function adds f_i to it on entry and then uses it as a loop bound.
%define f_i_end arg3
|
||||
|
||||
;; f_i: current index (loaded from [stream + _total_in]).
%define f_i rax
|
||||
;; file_start: [stream + _next_in] minus f_i, so f_i + file_start addresses the data.
%define file_start rbp
|
||||
%define tmp r9
|
||||
;; NOTE(review): encode_size is not referenced anywhere else in this file.
%define encode_size r10
|
||||
;; prev_len / prev_dist carry one lane of state from one loop iteration to the next.
%define prev_len r11
|
||||
%define prev_dist r12
|
||||
|
||||
;; Address expression for the hash table inside the level buffer; only valid
;; after arg1 has been reloaded with the level buffer pointer.
%define hash_table level_buf + _hash_map_hash_table
|
||||
|
||||
;; Vector register aliases. Several names share one physical register
;; (ymm5, ymm11, ymm15); such aliases must never be live at the same time.
%define datas ymm0
|
||||
%define datas_lookup ymm1
|
||||
%define yhashes ymm2
|
||||
%define ydists ymm3
|
||||
%define ydists_lookup ymm4
|
||||
|
||||
;; ymm5: shared short-lived temporary.
%define ydownconvert_qd ymm5
|
||||
%define ydists2 ymm5
|
||||
%define yscatter ymm5
|
||||
%define ytmp2 ymm5
|
||||
|
||||
%define ylens1 ymm6
|
||||
%define ylens2 ymm7
|
||||
%define ylookup ymm8
|
||||
%define ylookup2 ymm9
|
||||
%define yindex ymm10
|
||||
|
||||
;; ymm11: shared short-lived temporary (constants are reloaded before each use).
%define yrot_left ymm11
|
||||
%define yshift_finish ymm11
|
||||
%define yqword_shuf ymm11
|
||||
%define yhash_prod ymm11
|
||||
%define ycode ymm11
|
||||
%define ytmp3 ymm11
|
||||
|
||||
;; ymm12-ymm14: constants loaded once before the main loop.
%define yones ymm12
|
||||
%define ydatas_perm2 ymm13
|
||||
%define yincrement ymm14
|
||||
|
||||
;; ymm15: shared short-lived temporary.
%define ytmp ymm15
|
||||
%define ydist_extra ymm15
|
||||
|
||||
;; Prologue / epilogue helpers.
;;
;; win64: xmm6-xmm15, rsi, rdi, rbp and r12 are stored in a private stack
;; frame of stack_size bytes (10 xmm slots + 4 GPR slots + 8 bytes of padding).
;; With the 8-byte return address already on the stack, the extra 8 bytes
;; leave rsp 16-byte aligned after alloc_stack, which is what the aligned
;; vmovdqa stores/loads below rely on.
;;
;; other formats: only rbp and r12 are pushed/popped.
%ifidn __OUTPUT_FORMAT__, win64
%define stack_size  10*16 + 4 * 8 + 8
%define func(x) proc_frame x

%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14	; aligned store, same as every other slot and as its restore
	vmovdqa	[rsp + 9*16], xmm15
	save_reg	rsi, 10*16 + 0*8
	save_reg	rdi, 10*16 + 1*8
	save_reg	rbp, 10*16 + 2*8
	save_reg	r12, 10*16 + 3*8
	end_prolog
%endm

%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]

	mov	rsi, [rsp + 10*16 + 0*8]
	mov	rdi, [rsp + 10*16 + 1*8]
	mov	rbp, [rsp + 10*16 + 2*8]
	mov	r12, [rsp + 10*16 + 3*8]
	add	rsp, stack_size
%endm
%else
%define func(x) x:
%macro FUNC_SAVE 0
	push	rbp
	push	r12
%endm

%macro FUNC_RESTORE 0
	pop	r12		; reverse order of FUNC_SAVE
	pop	rbp
%endm
%endif
|
||||
|
||||
;; Number of input positions handled per vector pass: f_i advances by this
;; amount in loop1 and one result entry per position is stored.
%define VECT_SIZE 8
|
||||
;; Scale factor applied to a hash value when addressing hash_table.
%define HASH_BYTES 2
|
||||
|
||||
;;-----------------------------------------------------------------------
;; gen_icf_map_lh1_04(arg1 = stream, arg2 = matches_next, arg3 = length)
;;
;; AVX2 routine that walks the input VECT_SIZE positions at a time. For
;; every position it hashes 4 input bytes, looks up / updates the 16-bit
;; hash table in the level buffer, measures how many of the next 8 bytes
;; match the candidate found through the table, and stores one 32-bit
;; code per position at [matches_next].
;;
;; In:   arg1  stream pointer (reads _next_in, _total_in, _level_buf)
;;       arg2  output pointer, advanced by ICF_CODE_BYTES * VECT_SIZE per pass
;;       arg3  byte count; converted to an end index by adding total_in
;; Out:  rax still holds f_i at return.
;;       NOTE(review): whether callers consume rax is not visible here.
;; Regs: see the %define block above; callee-saved registers are handled
;;       by FUNC_SAVE / FUNC_RESTORE.
;;
;; Results are produced one position behind the data being hashed: the
;; match found at position p is only emitted while processing p + 1, after
;; comparing it with the match length at p + 1 (prev_len / prev_dist carry
;; the last lane across iterations).
;;-----------------------------------------------------------------------
global gen_icf_map_lh1_04
func(gen_icf_map_lh1_04)
	FUNC_SAVE

	mov	file_start, [stream + _next_in]
	mov	f_i %+ d, dword [stream + _total_in]

	;; file_start + f_i == next_in, so data can be addressed by index f_i
	sub	file_start, f_i
	add	f_i_end, f_i
	cmp	f_i, f_i_end
	jge	end_main		; nothing to process

	;; Prep for main loop
	mov	level_buf, [stream + _level_buf]	; stream pointer is dead from here on
	sub	f_i_end, LA
	vmovdqu	yincrement, [increment]
	vmovdqu	yones, [ones]
	vmovdqu	ydatas_perm2, [datas_perm2]

	xor	prev_len, prev_len
	xor	prev_dist, prev_dist

	;; Process first byte: scalar hash of the dword at f_i, then record the
	;; low 16 bits of f_i in the table slot for that hash.
	vmovd	yhashes %+ x, dword [f_i + file_start]
	vmovdqu	yhash_prod, [hash_prod]
	vpmaddwd yhashes, yhashes, yhash_prod
	vpmaddwd yhashes, yhashes, yhash_prod
	vpand	yhashes, yhashes, [hash_mask]
	vmovd	hash %+ d, yhashes %+ x
	mov	word [hash_table + HASH_BYTES * hash], f_i %+ w

	add	f_i, 1
	cmp	f_i, f_i_end
	jg	end_main

	;; hash: vpermq 0x44 copies the low 16 data bytes into both lanes, then
	;; datas_shuf builds the 4-byte window starting at each of 8 positions.
	vmovdqu	datas, [f_i + file_start]
	vpermq	yhashes, datas, 0x44
	vpshufb	yhashes, yhashes, [datas_shuf]
	vmovdqu	yhash_prod, [hash_prod]
	vpmaddwd yhashes, yhashes, yhash_prod
	vpmaddwd yhashes, yhashes, yhash_prod
	vpand	yhashes, yhashes, [hash_mask]

	;; ylookup / ylookup2: the 8-byte windows starting at positions 0-3 and
	;; 4-7, later XORed against the gathered candidate bytes.
	vpermq	ylookup, datas, 0x44
	vmovdqu	yqword_shuf, [qword_shuf]
	vpshufb	ylookup, ylookup, yqword_shuf
	vpermd	ylookup2, ydatas_perm2, datas
	vpshufb	ylookup2, ylookup2, yqword_shuf

	;; gather/scatter hashes
	;; The gather consumes its mask register, so ytmp is set to all ones
	;; before every gather.
	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp

	;; yindex = f_i + {0..7}
	vmovd	yindex %+ x, f_i %+ d
	vpbroadcastd yindex, yindex %+ x
	vpaddd	yindex, yindex, yincrement
	;; Table entries are 16 bits wide but are written with 32-bit stores:
	;; the low word becomes the new index, the high word is written back
	;; with the value that was just gathered for the neighbouring entry.
	vpand	yscatter, ydists_lookup, [upper_word]
	vpand	ytmp, yindex, [low_word]
	vpor	yscatter, yscatter, ytmp

	;; Manual scatter, lanes 0-3
	vmovd	tmp %+ d, yhashes %+ x
	vmovd	[hash_table + HASH_BYTES * tmp], yscatter %+ x
	vpextrd	tmp %+ d, yhashes %+ x, 1
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
	vpextrd	tmp %+ d, yhashes %+ x, 2
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
	vpextrd	tmp %+ d, yhashes %+ x, 3
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 3

	;; Manual scatter, lanes 4-7
	vextracti128 yscatter %+ x, yscatter, 1
	vextracti128 yhashes %+ x, yhashes, 1

	vmovd	tmp %+ d, yhashes %+ x
	vmovd	[hash_table + HASH_BYTES * tmp], yscatter %+ x
	vpextrd	tmp %+ d, yhashes %+ x, 1
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
	vpextrd	tmp %+ d, yhashes %+ x, 2
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
	vpextrd	tmp %+ d, yhashes %+ x, 3
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 3

	;; Compute hash for next loop
	vmovdqu	datas, [f_i + file_start + VECT_SIZE]
	vpermq	yhashes, datas, 0x44
	vpshufb	yhashes, yhashes, [datas_shuf]
	vmovdqu	yhash_prod, [hash_prod]
	vpmaddwd yhashes, yhashes, yhash_prod
	vpmaddwd yhashes, yhashes, yhash_prod
	vpand	yhashes, yhashes, [hash_mask]

	vmovdqu	datas_lookup, [f_i + file_start + 2 * VECT_SIZE]

	sub	f_i_end, VECT_SIZE
	cmp	f_i, f_i_end
	jg	loop1_end		; signed compare: the bound may have gone negative

loop1:
	lea	next_in, [f_i + file_start]

	;; Calculate look back dists
	;; d = (yindex - ydists_lookup - 1) & (D-1); ydists = lane - (d + 1),
	;; i.e. a (non-positive) byte offset from next_in to each candidate.
	vpaddd	ydists, ydists_lookup, yones
	vpsubd	ydists, yindex, ydists
	vpand	ydists, ydists, [dist_mask]
	vpaddd	ydists, ydists, yones
	vpsubd	ydists, yincrement, ydists

	;; gather/scatter hashes (for the NEXT group of positions)
	add	f_i, VECT_SIZE

	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp

	vmovd	yindex %+ x, f_i %+ d
	vpbroadcastd yindex, yindex %+ x
	vpaddd	yindex, yindex, yincrement
	vpand	yscatter, ydists_lookup, [upper_word]
	vpand	ytmp, yindex, [low_word]
	vpor	yscatter, yscatter, ytmp

	vmovd	tmp %+ d, yhashes %+ x
	vmovd	[hash_table + HASH_BYTES * tmp], yscatter %+ x
	vpextrd	tmp %+ d, yhashes %+ x, 1
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
	vpextrd	tmp %+ d, yhashes %+ x, 2
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
	vpextrd	tmp %+ d, yhashes %+ x, 3
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 3

	vextracti128 yscatter %+ x, yscatter, 1
	vextracti128 yhashes %+ x, yhashes, 1

	vmovd	tmp %+ d, yhashes %+ x
	vmovd	[hash_table + HASH_BYTES * tmp], yscatter %+ x
	vpextrd	tmp %+ d, yhashes %+ x, 1
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
	vpextrd	tmp %+ d, yhashes %+ x, 2
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
	vpextrd	tmp %+ d, yhashes %+ x, 3
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 3

	;; Compute hash for next loop
	vpermq	yhashes, datas_lookup, 0x44
	vpshufb	yhashes, yhashes, [datas_shuf]
	vmovdqu	yhash_prod, [hash_prod]
	vpmaddwd yhashes, yhashes, yhash_prod
	vpmaddwd yhashes, yhashes, yhash_prod
	vpand	yhashes, yhashes, [hash_mask]

	;; lookup old codes: fetch 8 bytes at each candidate position
	vextracti128 ydists2 %+ x, ydists, 1

	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdq ylens1, [next_in + ydists %+ x], ytmp
	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp

	;; Calculate dist_icf_code
	;; First undo the offset form: ydists = lane - (ydists + 1) = d.
	vpaddd	ydists, ydists, yones
	vpsubd	ydists, yincrement, ydists

	;; NOTE(review): judging by the table names (bit_index, base_offset,
	;; nibble_order) the sequence below derives a per-lane bit count from the
	;; highest set nibble of d; the exact encoding is defined by the ICF
	;; format constants in the included headers - confirm there.
	vpslld	ydist_extra, ydists, 12
	vpor	ydist_extra, ydists, ydist_extra
	vpand	ydist_extra, ydist_extra, [low_nibble]
	vpshufb	ydist_extra, ydist_extra, [nibble_order]
	vmovdqu	ytmp2, [bit_index]
	vpshufb	ydist_extra, ytmp2, ydist_extra
	vpxor	ytmp2, ytmp2, ytmp2
	vpcmpgtb ytmp2, ydist_extra, ytmp2
	vpsrld	ytmp3, ytmp2, 8
	vpandn	ytmp2, ytmp3, ytmp2
	vpsrld	ytmp3, ytmp2, 16
	vpandn	ytmp2, ytmp3, ytmp2
	vpsrld	ytmp3, ytmp2, 24
	vpandn	ytmp2, ytmp3, ytmp2
	vpaddb	ydist_extra, [base_offset]
	vpand	ydist_extra, ydist_extra, ytmp2
	vpsrlq	ytmp2, ydist_extra, 32
	vpxor	ytmp3, ytmp3, ytmp3
	vpsadbw	ydist_extra, ydist_extra, ytmp3
	vpsadbw	ytmp2, ytmp2, ytmp3
	vpsubd	ydist_extra, ydist_extra, ytmp2
	vpsllq	ytmp2, ytmp2, 32
	vpor	ydist_extra, ydist_extra, ytmp2
	vpcmpgtb ytmp3, ydist_extra, ytmp3
	vpand	ydist_extra, ydist_extra, ytmp3

	;; ycode = d & ((1 << ydist_extra) - 1), kept only where d > 1;
	;; ydists = (d >> ydist_extra) + 2 * ydist_extra, with ycode merged in
	;; at bit (EXTRA_BITS_OFFSET - DIST_OFFSET).
	vpsllvd	ycode, yones, ydist_extra
	vpsubd	ycode, ycode, yones
	vpcmpgtd ytmp2, ydists, yones
	vpand	ycode, ydists, ycode
	vpand	ycode, ycode, ytmp2
	vpsrlvd	ydists, ydists, ydist_extra
	vpslld	ydist_extra, ydist_extra, 1
	vpaddd	ydists, ydists, ydist_extra
	vpslld	ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET
	vpaddd	ydists, ydists, ycode

	;; Setup ydists for combining with ylens
	vpslld	ydists, ydists, DIST_OFFSET

	;; xor current data with lookback dist: equal bytes become zero
	vpxor	ylens1, ylens1, ylookup
	vpxor	ylens2, ylens2, ylookup2

	;; Setup registers for next loop
	vpermq	ylookup, datas, 0x44
	vmovdqu	yqword_shuf, [qword_shuf]
	vpshufb	ylookup, ylookup, yqword_shuf
	vpermd	ylookup2, ydatas_perm2, datas
	vpshufb	ylookup2, ylookup2, yqword_shuf

	;; Compute match length
	;; equal byte -> 0xff -> one bit per byte position (shift_finish);
	;; vpsadbw sums the 8 bytes of each qword into an 8-bit "equal" mask.
	vpxor	ytmp, ytmp, ytmp
	vpcmpeqb ylens1, ylens1, ytmp
	vpcmpeqb ylens2, ylens2, ytmp
	vmovdqu	yshift_finish, [shift_finish]
	vpand	ylens1, ylens1, yshift_finish
	vpand	ylens2, ylens2, yshift_finish
	vpsadbw	ylens1, ylens1, ytmp
	vpsadbw	ylens2, ylens2, ytmp
	;; Pack the qword results into one dword per position (8 lanes).
	vmovdqu	ydownconvert_qd, [downconvert_qd]
	vpshufb	ylens1, ylens1, ydownconvert_qd
	vextracti128 ytmp %+ x, ylens1, 1
	vpor	ylens1, ylens1, ytmp
	vpshufb	ylens2, ylens2, ydownconvert_qd
	vextracti128 ytmp %+ x, ylens2, 1
	vpor	ylens2, ylens2, ytmp
	vinserti128 ylens1, ylens1, ylens2 %+ x, 1
	;; match_cnt_perm maps a nibble to its number of trailing 1 bits; the
	;; high-nibble count is only added when the low nibble is all ones (4).
	vpsrld	ylens2, ylens1, 4
	vpand	ylens1, ylens1, [low_nibble]
	vmovdqu	ytmp, [match_cnt_perm]
	vpshufb	ylens1, ytmp, ylens1
	vpshufb	ylens2, ytmp, ylens2
	vpcmpeqb ytmp, ylens1, [match_cnt_low_max]
	vpand	ylens2, ylens2, ytmp
	vpaddd	ylens1, ylens1, ylens2

	;; Preload for next loops
	vmovdqu	datas, datas_lookup
	vmovdqu	datas_lookup, [f_i + file_start + 2 * VECT_SIZE]

	;; Zero out matches which should not be taken
	;; Rotate by one lane so that lane i holds the result for the previous
	;; position; lane 0 comes from prev_len / prev_dist, which are refreshed
	;; with this pass's last lane for the next iteration.
	vmovdqu	yrot_left, [drot_left]
	vpermd	ylens2, yrot_left, ylens1
	vpermd	ydists, yrot_left, ydists

	vpinsrd	ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0
	vmovd	prev_len %+ d, ylens2 %+ x
	vinserti128 ylens2, ylens2, ytmp %+ x, 0

	vpinsrd	ytmp %+ x, ydists %+ x, prev_dist %+ d, 0
	vmovd	prev_dist %+ d, ydists %+ x
	vinserti128 ydists, ydists, ytmp %+ x, 0

	;; ytmp = reject mask: previous match not longer than MIN_DEF_MATCH, or
	;; the current position has a strictly longer match.
	vpcmpgtd ytmp, ylens2, [shortest_matches]
	vpcmpgtd ytmp2, ylens1, ylens2

	vpcmpeqd ytmp3, ytmp3, ytmp3
	vpxor	ytmp, ytmp, ytmp3
	vpor	ytmp, ytmp, ytmp2

	vpandn	ylens1, ytmp, ylens2

	;; Update ydists to match ylens1
	vpaddd	ydists, ydists, ylens1
	vpaddd	ydists, ydists, [twofiftyfour]

	;; Rejected lanes become literal codes: input byte + null_dist_syms.
	;; f_i was already advanced, hence the - VECT_SIZE - 1 offset.
	vpmovzxbd ytmp3, [f_i + file_start - VECT_SIZE - 1]
	vpaddd	ytmp3, [null_dist_syms]
	vpand	ytmp3, ytmp3, ytmp
	vpandn	ydists, ytmp, ydists
	vpor	ydists, ydists, ytmp3

	;; Store ydists
	vmovdqu	[matches_next], ydists
	add	matches_next, ICF_CODE_BYTES * VECT_SIZE

	cmp	f_i, f_i_end
	jle	loop1

	;; Tail: finish the group whose hashes were already gathered. Same steps
	;; as loop1 but without touching the hash table or preloading more data.
loop1_end:
	lea	next_in, [f_i + file_start]

	;; Calculate look back dists
	vpaddd	ydists, ydists_lookup, yones
	vpsubd	ydists, yindex, ydists
	vpand	ydists, ydists, [dist_mask]
	vpaddd	ydists, ydists, yones
	vpsubd	ydists, yincrement, ydists

	;; lookup old codes
	vextracti128 ydists2 %+ x, ydists, 1
	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdq ylens1, [next_in + ydists %+ x], ytmp
	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp

	;; Calculate dist_icf_code
	vpaddd	ydists, ydists, yones
	vpsubd	ydists, yincrement, ydists

	vpslld	ydist_extra, ydists, 12
	vpor	ydist_extra, ydists, ydist_extra
	vpand	ydist_extra, ydist_extra, [low_nibble]
	vpshufb	ydist_extra, ydist_extra, [nibble_order]
	vmovdqu	ytmp2, [bit_index]
	vpshufb	ydist_extra, ytmp2, ydist_extra
	vpxor	ytmp2, ytmp2, ytmp2
	vpcmpgtb ytmp2, ydist_extra, ytmp2
	vpsrld	ytmp3, ytmp2, 8
	vpandn	ytmp2, ytmp3, ytmp2
	vpsrld	ytmp3, ytmp2, 16
	vpandn	ytmp2, ytmp3, ytmp2
	vpsrld	ytmp3, ytmp2, 24
	vpandn	ytmp2, ytmp3, ytmp2
	vpaddb	ydist_extra, [base_offset]
	vpand	ydist_extra, ydist_extra, ytmp2
	vpsrlq	ytmp2, ydist_extra, 32
	vpxor	ytmp3, ytmp3, ytmp3
	vpsadbw	ydist_extra, ydist_extra, ytmp3
	vpsadbw	ytmp2, ytmp2, ytmp3
	vpsubd	ydist_extra, ydist_extra, ytmp2
	vpsllq	ytmp2, ytmp2, 32
	vpor	ydist_extra, ydist_extra, ytmp2
	vpcmpgtb ytmp3, ydist_extra, ytmp3
	vpand	ydist_extra, ydist_extra, ytmp3

	vpsllvd	ycode, yones, ydist_extra
	vpsubd	ycode, ycode, yones
	vpcmpgtd ytmp2, ydists, yones
	vpand	ycode, ydists, ycode
	vpand	ycode, ycode, ytmp2
	vpsrlvd	ydists, ydists, ydist_extra
	vpslld	ydist_extra, ydist_extra, 1
	vpaddd	ydists, ydists, ydist_extra
	vpslld	ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET
	vpaddd	ydists, ydists, ycode

	;; Setup ydists for combining with ylens
	vpslld	ydists, ydists, DIST_OFFSET

	;; xor current data with lookback dist
	vpxor	ylens1, ylens1, ylookup
	vpxor	ylens2, ylens2, ylookup2

	;; Compute match length
	vpxor	ytmp, ytmp, ytmp
	vpcmpeqb ylens1, ylens1, ytmp
	vpcmpeqb ylens2, ylens2, ytmp
	vmovdqu	yshift_finish, [shift_finish]
	vpand	ylens1, ylens1, yshift_finish
	vpand	ylens2, ylens2, yshift_finish
	vpsadbw	ylens1, ylens1, ytmp
	vpsadbw	ylens2, ylens2, ytmp
	vmovdqu	ydownconvert_qd, [downconvert_qd]
	vpshufb	ylens1, ylens1, ydownconvert_qd
	vextracti128 ytmp %+ x, ylens1, 1
	vpor	ylens1, ylens1, ytmp
	vpshufb	ylens2, ylens2, ydownconvert_qd
	vextracti128 ytmp %+ x, ylens2, 1
	vpor	ylens2, ylens2, ytmp
	vinserti128 ylens1, ylens1, ylens2 %+ x, 1
	vpsrld	ylens2, ylens1, 4
	vpand	ylens1, ylens1, [low_nibble]
	vmovdqu	ytmp, [match_cnt_perm]
	vpshufb	ylens1, ytmp, ylens1
	vpshufb	ylens2, ytmp, ylens2
	vpcmpeqb ytmp, ylens1, [match_cnt_low_max]
	vpand	ylens2, ylens2, ytmp
	vpaddd	ylens1, ylens1, ylens2

	;; Zero out matches which should not be taken
	;; (prev_len / prev_dist are consumed but no longer refreshed here)
	vmovdqu	yrot_left, [drot_left]
	vpermd	ylens2, yrot_left, ylens1
	vpermd	ydists, yrot_left, ydists

	vpinsrd	ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0
	vinserti128 ylens2, ylens2, ytmp %+ x, 0

	vpinsrd	ytmp %+ x, ydists %+ x, prev_dist %+ d, 0
	vinserti128 ydists, ydists, ytmp %+ x, 0

	vpcmpgtd ytmp, ylens2, [shortest_matches]
	vpcmpgtd ytmp2, ylens1, ylens2

	vpcmpeqd ytmp3, ytmp3, ytmp3
	vpxor	ytmp, ytmp, ytmp3
	vpor	ytmp, ytmp, ytmp2

	vpandn	ylens1, ytmp, ylens2

	;; Update ydists to match ylens1
	vpaddd	ydists, ydists, ylens1
	vpaddd	ydists, ydists, [twofiftyfour]

	;; f_i was not advanced in the tail, so the literals start at f_i - 1
	vpmovzxbd ytmp3, [f_i + file_start - 1]
	vpaddd	ytmp3, [null_dist_syms]
	vpand	ytmp3, ytmp3, ytmp
	vpandn	ydists, ytmp, ydists
	vpor	ydists, ydists, ytmp3

	;; Store ydists
	vmovdqu	[matches_next], ydists

end_main:
	FUNC_RESTORE
	ret

endproc_frame
|
||||
|
||||
;; Constant tables for gen_icf_map_lh1_04. All of them are read through
;; vmovdqu or VEX memory operands, so only the section start is aligned.
section .data
align 32
;; vpermd indices: dwords 1-4 of the data in both lanes (bytes 4..19), the
;; source for the 8-byte windows of positions 4-7.
datas_perm2:
	dd 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4
;; vpermd indices: lane i receives lane i-1, lane 0 receives lane 7.
drot_left:
	dd 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6
;; vpshufb control: the 4-byte window starting at each of 8 consecutive bytes.
datas_shuf:
	db 0x0, 0x1, 0x2, 0x3
	db 0x1, 0x2, 0x3, 0x4
	db 0x2, 0x3, 0x4, 0x5
	db 0x3, 0x4, 0x5, 0x6
	db 0x4, 0x5, 0x6, 0x7
	db 0x5, 0x6, 0x7, 0x8
	db 0x6, 0x7, 0x8, 0x9
	db 0x7, 0x8, 0x9, 0xa

;; vpshufb control: the 8-byte window starting at each of 4 consecutive bytes.
qword_shuf:
	db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
	db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9
	db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa

;; 16-bit multipliers used by the two vpmaddwd hash rounds.
%define PROD1 0xE84B
%define PROD2 0x97B1

hash_prod:
	dw PROD1, PROD2, PROD1, PROD2, PROD1, PROD2, PROD1, PROD2
	dw PROD1, PROD2, PROD1, PROD2, PROD1, PROD2, PROD1, PROD2
;; Added to the zero-extended input byte for lanes emitted as literals.
null_dist_syms:
	dd LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT
;; Per-lane position offsets 0..7.
increment:
	dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
ones:
	dd 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1
twofiftyfour:
	dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
dist_mask:
	dd D-1, D-1, D-1, D-1, D-1, D-1, D-1, D-1
hash_mask:
	dd HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK
	dd HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK
;; A previous-position match is only kept when its length exceeds this value.
shortest_matches:
	dd MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH
	dd MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH
;; Masks selecting the upper / lower 16 bits of every dword.
upper_word:
	dw 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff
	dw 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff
	dw 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff
	dw 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff
low_word:
	dw 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000
	dw 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000
	dw 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000
	dw 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000
;; One distinct bit per byte position inside each qword.
shift_finish:
	db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
	db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
	db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
	db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
;; vpshufb control: byte 0 of each qword -> one dword; the low lane fills
;; dwords 0-1, the high lane fills dwords 2-3 (0xff selects zero).
downconvert_qd:
	db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff
	db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff
low_nibble:
	db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
	db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
	db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
	db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
;; Lookup: nibble value -> number of trailing 1 bits in that nibble.
match_cnt_perm:
	db 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x3, 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x4
	db 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x3, 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x4
match_cnt_low_max:
	dd 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4
;; Tables used by the "Calculate dist_icf_code" sequence.
bit_index:
	db 0x0, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3
	db 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4
	db 0x0, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3
	db 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4
base_offset:
	db -0x2, 0x2, 0x6, 0xa, -0x2, 0x2, 0x6, 0xa
	db -0x2, 0x2, 0x6, 0xa, -0x2, 0x2, 0x6, 0xa
	db -0x2, 0x2, 0x6, 0xa, -0x2, 0x2, 0x6, 0xa
	db -0x2, 0x2, 0x6, 0xa, -0x2, 0x2, 0x6, 0xa
nibble_order:
	db 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7
	db 0x8, 0xa, 0x9, 0xb, 0xc, 0xe, 0xd, 0xf
	db 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7
	db 0x8, 0xa, 0x9, 0xb, 0xc, 0xe, 0xd, 0xf
|
@ -405,6 +405,8 @@ end_main:
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
align 64
|
||||
datas_perm:
|
||||
|
@ -333,6 +333,21 @@ void isal_deflate_icf_body_base(struct isal_zstream *stream)
|
||||
}
|
||||
}
|
||||
|
||||
void isal_deflate_icf_body_04(struct isal_zstream *stream)
|
||||
{
|
||||
switch (stream->level) {
|
||||
case 3:
|
||||
icf_body_lazyhash1_fillgreedy_greedy(stream);
|
||||
break;
|
||||
case 2:
|
||||
isal_deflate_icf_body_lvl2(stream);
|
||||
break;
|
||||
case 1:
|
||||
default:
|
||||
isal_deflate_icf_body_lvl1(stream);
|
||||
}
|
||||
}
|
||||
|
||||
void isal_deflate_icf_body_06(struct isal_zstream *stream)
|
||||
{
|
||||
switch (stream->level) {
|
||||
|
@ -60,6 +60,7 @@ extern isal_update_histogram_01
|
||||
extern isal_update_histogram_04
|
||||
|
||||
extern gen_icf_map_h1_base
|
||||
extern gen_icf_map_lh1_04
|
||||
|
||||
extern encode_deflate_icf_base
|
||||
extern encode_deflate_icf_04
|
||||
@ -85,6 +86,7 @@ extern isal_deflate_hash_crc_01
|
||||
extern isal_deflate_hash_mad_base
|
||||
|
||||
extern isal_deflate_icf_body_base
|
||||
extern isal_deflate_icf_body_04
|
||||
extern isal_deflate_icf_body_06
|
||||
|
||||
section .text
|
||||
@ -122,7 +124,7 @@ mbin_interface set_long_icf_fg
|
||||
mbin_dispatch_init6 set_long_icf_fg, set_long_icf_fg_base, set_long_icf_fg_base, set_long_icf_fg_base, set_long_icf_fg_base, set_long_icf_fg_06
|
||||
|
||||
mbin_interface gen_icf_map_lh1
|
||||
mbin_dispatch_init6 gen_icf_map_lh1, gen_icf_map_h1_base, gen_icf_map_h1_base, gen_icf_map_h1_base, gen_icf_map_h1_base, gen_icf_map_lh1_06
|
||||
mbin_dispatch_init6 gen_icf_map_lh1, gen_icf_map_h1_base, gen_icf_map_h1_base, gen_icf_map_h1_base, gen_icf_map_lh1_04, gen_icf_map_lh1_06
|
||||
%else
|
||||
mbin_interface encode_deflate_icf
|
||||
mbin_dispatch_init5 encode_deflate_icf, encode_deflate_icf_base, encode_deflate_icf_base, encode_deflate_icf_base, encode_deflate_icf_04
|
||||
@ -131,7 +133,7 @@ mbin_interface set_long_icf_fg
|
||||
mbin_dispatch_init5 set_long_icf_fg, set_long_icf_fg_base, set_long_icf_fg_base, set_long_icf_fg_base, set_long_icf_fg_base
|
||||
|
||||
mbin_interface gen_icf_map_lh1
|
||||
mbin_dispatch_init5 gen_icf_map_lh1, gen_icf_map_h1_base, gen_icf_map_h1_base, gen_icf_map_h1_base, gen_icf_map_h1_base
|
||||
mbin_dispatch_init5 gen_icf_map_lh1, gen_icf_map_h1_base, gen_icf_map_h1_base, gen_icf_map_h1_base, gen_icf_map_lh1_04
|
||||
%endif
|
||||
|
||||
mbin_interface crc32_gzip
|
||||
@ -154,8 +156,8 @@ mbin_dispatch_init5 isal_deflate_hash_lvl3, isal_deflate_hash_base, isal_deflate
|
||||
|
||||
%ifdef HAVE_AS_KNOWS_AVX512
|
||||
mbin_interface isal_deflate_icf_body
|
||||
mbin_dispatch_init6 isal_deflate_icf_body, isal_deflate_icf_body_base, isal_deflate_icf_body_base, isal_deflate_icf_body_base, isal_deflate_icf_body_base, isal_deflate_icf_body_06
|
||||
mbin_dispatch_init6 isal_deflate_icf_body, isal_deflate_icf_body_base, isal_deflate_icf_body_base, isal_deflate_icf_body_base, isal_deflate_icf_body_04, isal_deflate_icf_body_06
|
||||
%else
|
||||
mbin_interface isal_deflate_icf_body
|
||||
mbin_dispatch_init5 isal_deflate_icf_body, isal_deflate_icf_body_base, isal_deflate_icf_body_base, isal_deflate_icf_body_base, isal_deflate_icf_body_base
|
||||
mbin_dispatch_init5 isal_deflate_icf_body, isal_deflate_icf_body_base, isal_deflate_icf_body_base, isal_deflate_icf_body_base, isal_deflate_icf_body_04
|
||||
%endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user