; Mirror of https://github.com/intel/isa-l.git
; Synced 2025-01-19 04:26:08 +01:00, commit 9acc3ed2ac
; Change-Id: Icfdb67445ee5afff85441cfee23beb66bfe15d5e
; Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
; 627 lines, 17 KiB, NASM
%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"

;; Argument and scratch register assignment per output format.
;; On win64 the scratch aliases hash/next_in use rsi/rdi, which the win64
;; FUNC_SAVE/FUNC_RESTORE macros save and restore.
%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define hash rsi
%define next_in rdi
%else
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define hash r8
%define next_in rcx
%endif

;; stream and level_buf deliberately share arg1: the function replaces the
;; stream pointer with [stream + _level_buf] once it is done reading stream.
%define stream arg1
%define level_buf arg1
%define matches_next arg2	; output pointer, advanced per vector stored
%define f_i_end arg3		; on entry: length; becomes the end index

%define f_i rax			; current input index
%define file_start rbp		; next_in - total_in, so f_i + file_start = data ptr
%define tmp r9
%define encode_size r10		; not referenced by the code in this file
%define prev_len r11		; last-lane length carried between iterations
%define prev_dist r12		; last-lane dist code carried between iterations

;; Address expression of the hash table inside the level buffer.
%define hash_table level_buf + _hash_map_hash_table

;; Vector register roles. Several names alias the same ymm register
;; (ymm5, ymm11, ymm15); only one role per register is live at a time.
%define datas ymm0
%define datas_lookup ymm1
%define yhashes ymm2
%define ydists ymm3
%define ydists_lookup ymm4

%define ydownconvert_qd ymm5
%define ydists2 ymm5
%define yscatter ymm5
%define ytmp2 ymm5

%define ylens1 ymm6
%define ylens2 ymm7
%define ylookup ymm8
%define ylookup2 ymm9
%define yindex ymm10

%define yrot_left ymm11
%define yshift_finish ymm11
%define yqword_shuf ymm11
%define yhash_prod ymm11
%define ycode ymm11
%define ytmp3 ymm11

%define yones ymm12
%define ydatas_perm2 ymm13
%define yincrement ymm14

%define ytmp ymm15
%define ydist_extra ymm15

;; Prologue/epilogue macros.
;;
;; win64: xmm6-xmm15, rsi, rdi, rbp and r12 are saved in a stack frame of
;;   10 XMM slots + 4 GPR slots + 8 bytes. With rsp == 8 (mod 16) at function
;;   entry, subtracting stack_size (200) leaves rsp 16-byte aligned, which
;;   the aligned vmovdqa saves/restores below rely on.
;; other formats: only rbp and r12 are pushed/popped.
%ifidn __OUTPUT_FORMAT__, win64
%define stack_size  10*16 + 4 * 8 + 8
%define func(x) proc_frame x

%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	;; Aligned store like every other slot; FUNC_RESTORE already reloads
	;; this same slot with vmovdqa.
	vmovdqa	[rsp + 8*16], xmm14
	vmovdqa	[rsp + 9*16], xmm15
	save_reg	rsi, 10*16 + 0*8
	save_reg	rdi, 10*16 + 1*8
	save_reg	rbp, 10*16 + 2*8
	save_reg	r12, 10*16 + 3*8
	end_prolog
%endm

%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]

	mov	rsi, [rsp + 10*16 + 0*8]
	mov	rdi, [rsp + 10*16 + 1*8]
	mov	rbp, [rsp + 10*16 + 2*8]
	mov	r12, [rsp + 10*16 + 3*8]
	add	rsp, stack_size
%endm
%else
%define func(x) x:
%macro FUNC_SAVE 0
	push	rbp
	push	r12
%endm

%macro FUNC_RESTORE 0
	pop	r12
	pop	rbp
%endm
%endif

%define VECT_SIZE 8		; input positions processed per iteration
%define HASH_BYTES 2		; size of one hash table entry in bytes

;; Address the constant tables RIP-relative so no absolute relocations are
;; emitted for them (harmless if an included file already set this).
default rel

;-----------------------------------------------------------------------
; gen_icf_map_lh1_04
;
; In:    arg1 = stream        (fields _next_in, _total_in, _level_buf read)
;        arg2 = matches_next  (output; one 32-byte vector stored per
;                              VECT_SIZE input positions)
;        arg3 = byte count, added to the 32-bit _total_in value to form the
;               end index f_i_end
; Out:   rax is never set explicitly; at ret it still holds f_i.
;        NOTE(review): confirm what callers expect as the return value.
; Uses:  rbp, r12 (and on win64 rsi, rdi, xmm6-xmm15) via FUNC_SAVE /
;        FUNC_RESTORE; r9-r11 and hash/next_in as scratch; ymm0-ymm15.
;
; Flow:  hash the first position scalar-style, then hash VECT_SIZE
;        positions at a time. Each iteration gathers the previous index
;        stored for every hash, writes the current indices back, compares
;        8 bytes at the candidate location with 8 bytes at the current
;        location, and emits one dword per position: either a length/dist
;        code or the literal byte plus LIT.
;        Results are emitted one position late: lane 0 of each stored
;        vector comes from prev_len/prev_dist of the previous iteration.
;-----------------------------------------------------------------------
global gen_icf_map_lh1_04
func(gen_icf_map_lh1_04)
	FUNC_SAVE

	mov	file_start, [stream + _next_in]
	mov	f_i %+ d, dword [stream + _total_in]

	;; file_start = next_in - total_in; f_i_end = total_in + length
	sub	file_start, f_i
	add	f_i_end, f_i
	cmp	f_i, f_i_end
	jge	end_main

;; Prep for main loop
	mov	level_buf, [stream + _level_buf]	; arg1 no longer points at stream
	sub	f_i_end, LA
	vmovdqu	yincrement, [increment]
	vmovdqu	yones, [ones]
	vmovdqu	ydatas_perm2, [datas_perm2]

	xor	prev_len, prev_len
	xor	prev_dist, prev_dist

;; Process first byte
	;; Hash = two vpmaddwd passes with hash_prod, masked with hash_mask.
	vmovd	yhashes %+ x, dword [f_i + file_start]
	vmovdqu	yhash_prod, [hash_prod]
	vpmaddwd yhashes, yhashes, yhash_prod
	vpmaddwd yhashes, yhashes, yhash_prod
	vpand	yhashes, yhashes, [hash_mask]
	vmovd	hash %+ d, yhashes %+ x
	mov	word [hash_table + HASH_BYTES * hash], f_i %+ w

	add	f_i, 1
	cmp	f_i, f_i_end
	jg	end_main

;;hash
	;; datas_shuf turns the low 16 bytes into eight overlapping 4-byte
	;; windows (byte offsets 0..7), one per dword lane.
	vmovdqu	datas, [f_i + file_start]
	vpermq	yhashes, datas, 0x44
	vpshufb	yhashes, yhashes, [datas_shuf]
	vmovdqu	yhash_prod, [hash_prod]
	vpmaddwd yhashes, yhashes, yhash_prod
	vpmaddwd yhashes, yhashes, yhash_prod
	vpand	yhashes, yhashes, [hash_mask]

	;; ylookup / ylookup2 = overlapping 8-byte windows of the current data
	;; at byte offsets 0..3 and 4..7, later XORed with the gathered
	;; candidate bytes.
	vpermq	ylookup, datas, 0x44
	vmovdqu	yqword_shuf, [qword_shuf]
	vpshufb	ylookup, ylookup, yqword_shuf
	vpermd	ylookup2, ydatas_perm2, datas
	vpshufb	ylookup2, ylookup2, yqword_shuf

;;gather/scatter hashes
	;; Gather reads a dword per hash although entries are HASH_BYTES wide;
	;; only the low word belongs to the addressed entry.
	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp

	;; yindex = f_i + {0..7}
	vmovd	yindex %+ x, f_i %+ d
	vpbroadcastd yindex, yindex %+ x
	vpaddd	yindex, yindex, yincrement
	;; yscatter = gathered upper word | low word of the current index
	vpand	yscatter, ydists_lookup, [upper_word]
	vpand	ytmp, yindex, [low_word]
	vpor	yscatter, yscatter, ytmp

	;; Manual scatter, one lane at a time (low 128 bits first).
	vmovd	tmp %+ d, yhashes %+ x
	vmovd	[hash_table + HASH_BYTES * tmp], yscatter %+ x
	vpextrd	tmp %+ d, yhashes %+ x, 1
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
	vpextrd	tmp %+ d, yhashes %+ x, 2
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
	vpextrd	tmp %+ d,yhashes %+ x, 3
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 3

	vextracti128 yscatter %+ x, yscatter, 1
	vextracti128 yhashes %+ x, yhashes, 1

	vmovd	tmp %+ d, yhashes %+ x
	vmovd	[hash_table + HASH_BYTES * tmp], yscatter %+ x
	vpextrd	tmp %+ d, yhashes %+ x, 1
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
	vpextrd	tmp %+ d, yhashes %+ x, 2
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
	vpextrd	tmp %+ d,yhashes %+ x, 3
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 3

;; Compute hash for next loop
	vmovdqu	datas, [f_i + file_start + VECT_SIZE]
	vpermq	yhashes, datas, 0x44
	vpshufb	yhashes, yhashes, [datas_shuf]
	vmovdqu	yhash_prod, [hash_prod]
	vpmaddwd yhashes, yhashes, yhash_prod
	vpmaddwd yhashes, yhashes, yhash_prod
	vpand	yhashes, yhashes, [hash_mask]

	vmovdqu	datas_lookup, [f_i + file_start + 2 * VECT_SIZE]

	sub	f_i_end, VECT_SIZE
	cmp	f_i, f_i_end
	jg	loop1_end

loop1:
	lea	next_in, [f_i + file_start]

;; Calculate look back dists
	;; ydists = lane - ((yindex - lookup - 1) & (D-1)) - 1, i.e. a
	;; per-lane byte offset from next_in used by the gathers below.
	vpaddd	ydists, ydists_lookup, yones
	vpsubd	ydists, yindex, ydists
	vpand	ydists, ydists, [dist_mask]
	vpaddd	ydists, ydists, yones
	vpsubd	ydists, yincrement, ydists

;;gather/scatter hashes
	add	f_i, VECT_SIZE

	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp

	vmovd	yindex %+ x, f_i %+ d
	vpbroadcastd yindex, yindex %+ x
	vpaddd	yindex, yindex, yincrement
	vpand	yscatter, ydists_lookup, [upper_word]
	vpand	ytmp, yindex, [low_word]
	vpor	yscatter, yscatter, ytmp

	vmovd	tmp %+ d, yhashes %+ x
	vmovd	[hash_table + HASH_BYTES * tmp], yscatter %+ x
	vpextrd	tmp %+ d, yhashes %+ x, 1
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
	vpextrd	tmp %+ d, yhashes %+ x, 2
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
	vpextrd	tmp %+ d,yhashes %+ x, 3
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 3

	vextracti128 yscatter %+ x, yscatter, 1
	vextracti128 yhashes %+ x, yhashes, 1

	vmovd	tmp %+ d, yhashes %+ x
	vmovd	[hash_table + HASH_BYTES * tmp], yscatter %+ x
	vpextrd	tmp %+ d, yhashes %+ x, 1
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
	vpextrd	tmp %+ d, yhashes %+ x, 2
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
	vpextrd	tmp %+ d,yhashes %+ x, 3
	vpextrd	[hash_table + HASH_BYTES * tmp], yscatter %+ x, 3

;; Compute hash for next loop
	vpermq	yhashes, datas_lookup, 0x44
	vpshufb	yhashes, yhashes, [datas_shuf]
	vmovdqu	yhash_prod, [hash_prod]
	vpmaddwd yhashes, yhashes, yhash_prod
	vpmaddwd yhashes, yhashes, yhash_prod
	vpand	yhashes, yhashes, [hash_mask]

;;lookup old codes
	;; Gather 8 candidate bytes per lane (4 lanes per gather); next_in
	;; still points at the start of the vector being resolved.
	vextracti128 ydists2 %+ x, ydists, 1

	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdq ylens1, [next_in + ydists %+ x], ytmp
	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp

;; Calculate dist_icf_code
	;; Undo the lane offset: ydists = (yindex - lookup - 1) & (D-1)
	vpaddd	ydists, ydists, yones
	vpsubd	ydists, yincrement, ydists

	;; Derive a per-lane shift count in ydist_extra from the nibbles of
	;; ydists via the bit_index / base_offset tables.
	;; NOTE(review): exact encoding should be confirmed against the ICF
	;; format definition; the sequence is kept verbatim.
	vpslld	ydist_extra, ydists, 12
	vpor	ydist_extra, ydists, ydist_extra
	vpand	ydist_extra, ydist_extra, [low_nibble]
	vpshufb	ydist_extra, ydist_extra, [nibble_order]
	vmovdqu	ytmp2, [bit_index]
	vpshufb	ydist_extra, ytmp2, ydist_extra
	vpxor	ytmp2, ytmp2, ytmp2
	vpcmpgtb ytmp2, ydist_extra, ytmp2
	vpsrld	ytmp3, ytmp2, 8
	vpandn	ytmp2, ytmp3, ytmp2
	vpsrld	ytmp3, ytmp2, 16
	vpandn	ytmp2, ytmp3, ytmp2
	vpsrld	ytmp3, ytmp2, 24
	vpandn	ytmp2, ytmp3, ytmp2
	vpaddb	ydist_extra, [base_offset]
	vpand	ydist_extra, ydist_extra, ytmp2
	vpsrlq	ytmp2, ydist_extra, 32
	vpxor	ytmp3, ytmp3, ytmp3
	vpsadbw	ydist_extra, ydist_extra, ytmp3
	vpsadbw	ytmp2, ytmp2, ytmp3
	vpsubd	ydist_extra, ydist_extra, ytmp2
	vpsllq	ytmp2, ytmp2, 32
	vpor	ydist_extra, ydist_extra, ytmp2
	vpcmpgtb ytmp3, ydist_extra, ytmp3
	vpand	ydist_extra, ydist_extra, ytmp3

	;; ycode = ydists & ((1 << ydist_extra) - 1), zeroed where ydists <= 1;
	;; ydists = (ydists >> ydist_extra) + 2 * ydist_extra, then ycode is
	;; added in at bit (EXTRA_BITS_OFFSET - DIST_OFFSET).
	vpsllvd	ycode, yones, ydist_extra
	vpsubd	ycode, ycode, yones
	vpcmpgtd ytmp2, ydists, yones
	vpand	ycode, ydists, ycode
	vpand	ycode, ycode, ytmp2
	vpsrlvd	ydists, ydists, ydist_extra
	vpslld	ydist_extra, ydist_extra, 1
	vpaddd	ydists, ydists, ydist_extra
	vpslld	ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET
	vpaddd	ydists, ydists, ycode

;; Setup ydists for combining with ylens
	vpslld	ydists, ydists, DIST_OFFSET

;; xor current data with lookback dist
	;; Equal bytes become zero.
	vpxor	ylens1, ylens1, ylookup
	vpxor	ylens2, ylens2, ylookup2

;; Setup registers for next loop
	vpermq	ylookup, datas, 0x44
	vmovdqu	yqword_shuf, [qword_shuf]
	vpshufb	ylookup, ylookup, yqword_shuf
	vpermd	ylookup2, ydatas_perm2, datas
	vpshufb	ylookup2, ylookup2, yqword_shuf

;; Compute match length
	;; Build an 8-bit mask of matching bytes per qword (bit i = byte i
	;; equal), pack the four qword results of each gather into dwords,
	;; then count consecutive matching bytes from byte 0 with a nibble
	;; lookup: low nibble first, high nibble only added when the low
	;; nibble count is 4.
	vpxor	ytmp, ytmp, ytmp
	vpcmpeqb ylens1, ylens1, ytmp
	vpcmpeqb ylens2, ylens2, ytmp
	vmovdqu	yshift_finish, [shift_finish]
	vpand	ylens1, ylens1, yshift_finish
	vpand	ylens2, ylens2, yshift_finish
	vpsadbw	ylens1, ylens1, ytmp
	vpsadbw	ylens2, ylens2, ytmp
	vmovdqu	ydownconvert_qd, [downconvert_qd]
	vpshufb	ylens1, ylens1, ydownconvert_qd
	vextracti128 ytmp %+ x, ylens1, 1
	vpor	ylens1, ylens1, ytmp
	vpshufb	ylens2, ylens2, ydownconvert_qd
	vextracti128 ytmp %+ x, ylens2, 1
	vpor	ylens2, ylens2, ytmp
	vinserti128 ylens1, ylens1, ylens2 %+ x, 1
	vpsrld	ylens2, ylens1, 4
	vpand	ylens1, ylens1, [low_nibble]
	vmovdqu	ytmp, [match_cnt_perm]
	vpshufb	ylens1, ytmp, ylens1
	vpshufb	ylens2, ytmp, ylens2
	vpcmpeqb ytmp, ylens1, [match_cnt_low_max]
	vpand	ylens2, ylens2, ytmp
	vpaddd	ylens1, ylens1, ylens2

;; Preload for next loops
	vmovdqu	datas, datas_lookup
	vmovdqu	datas_lookup, [f_i + file_start + 2 * VECT_SIZE]

;; Zero out matches which should not be taken
	;; Rotate lanes up by one so lane i holds position i-1; lane 0 is
	;; filled from prev_len/prev_dist, and the wrapped-around last lane
	;; is saved as the new prev_len/prev_dist.
	vmovdqu	yrot_left, [drot_left]
	vpermd	ylens2, yrot_left, ylens1
	vpermd	ydists, yrot_left, ydists

	vpinsrd	ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0
	vmovd	prev_len %+ d, ylens2 %+ x
	vinserti128 ylens2, ylens2, ytmp %+ x, 0

	vpinsrd	ytmp %+ x, ydists %+ x, prev_dist %+ d, 0
	vmovd	prev_dist %+ d, ydists %+ x
	vinserti128 ydists, ydists, ytmp %+ x, 0

	;; ytmp = lanes to emit as literals:
	;;   NOT(len > MIN_DEF_MATCH) OR (length at the following position is
	;;   greater than this one's)
	vpcmpgtd ytmp, ylens2, [shortest_matches]
	vpcmpgtd ytmp2, ylens1, ylens2

	vpcmpeqd ytmp3, ytmp3, ytmp3
	vpxor	ytmp, ytmp, ytmp3
	vpor	ytmp, ytmp, ytmp2

	vpandn	ylens1, ytmp, ylens2

;; Update ydists to match ylens1
	vpaddd	ydists, ydists, ylens1
	vpaddd	ydists, ydists, [twofiftyfour]

	;; Literal lanes: input byte (one position behind the vector that was
	;; just hashed, f_i already advanced by VECT_SIZE) plus LIT.
	vpmovzxbd ytmp3, [f_i + file_start - VECT_SIZE - 1]
	vpaddd	ytmp3, [null_dist_syms]
	vpand	ytmp3, ytmp3, ytmp
	vpandn	ydists, ytmp, ydists
	vpor	ydists, ydists, ytmp3

;;Store ydists
	vmovdqu	[matches_next], ydists
	add	matches_next, ICF_CODE_BYTES * VECT_SIZE

	cmp	f_i, f_i_end
	jle	loop1

;; Tail: resolve the last hashed vector. Same as the loop body but without
;; the hash table update, the preloads, or the prev_len/prev_dist update.
loop1_end:
	lea	next_in, [f_i + file_start]

;; Calculate look back dists
	vpaddd	ydists, ydists_lookup, yones
	vpsubd	ydists, yindex, ydists
	vpand	ydists, ydists, [dist_mask]
	vpaddd	ydists, ydists, yones
	vpsubd	ydists, yincrement, ydists

;;lookup old codes
	vextracti128 ydists2 %+ x, ydists, 1
	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdq ylens1, [next_in + ydists %+ x], ytmp
	vpcmpeqq ytmp, ytmp, ytmp
	vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp

;; Calculate dist_icf_code
	vpaddd	ydists, ydists, yones
	vpsubd	ydists, yincrement, ydists

	vpslld	ydist_extra, ydists, 12
	vpor	ydist_extra, ydists, ydist_extra
	vpand	ydist_extra, ydist_extra, [low_nibble]
	vpshufb	ydist_extra, ydist_extra, [nibble_order]
	vmovdqu	ytmp2, [bit_index]
	vpshufb	ydist_extra, ytmp2, ydist_extra
	vpxor	ytmp2, ytmp2, ytmp2
	vpcmpgtb ytmp2, ydist_extra, ytmp2
	vpsrld	ytmp3, ytmp2, 8
	vpandn	ytmp2, ytmp3, ytmp2
	vpsrld	ytmp3, ytmp2, 16
	vpandn	ytmp2, ytmp3, ytmp2
	vpsrld	ytmp3, ytmp2, 24
	vpandn	ytmp2, ytmp3, ytmp2
	vpaddb	ydist_extra, [base_offset]
	vpand	ydist_extra, ydist_extra, ytmp2
	vpsrlq	ytmp2, ydist_extra, 32
	vpxor	ytmp3, ytmp3, ytmp3
	vpsadbw	ydist_extra, ydist_extra, ytmp3
	vpsadbw	ytmp2, ytmp2, ytmp3
	vpsubd	ydist_extra, ydist_extra, ytmp2
	vpsllq	ytmp2, ytmp2, 32
	vpor	ydist_extra, ydist_extra, ytmp2
	vpcmpgtb ytmp3, ydist_extra, ytmp3
	vpand	ydist_extra, ydist_extra, ytmp3

	vpsllvd	ycode, yones, ydist_extra
	vpsubd	ycode, ycode, yones
	vpcmpgtd ytmp2, ydists, yones
	vpand	ycode, ydists, ycode
	vpand	ycode, ycode, ytmp2
	vpsrlvd	ydists, ydists, ydist_extra
	vpslld	ydist_extra, ydist_extra, 1
	vpaddd	ydists, ydists, ydist_extra
	vpslld	ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET
	vpaddd	ydists, ydists, ycode

;; Setup ydists for combining with ylens
	vpslld	ydists, ydists, DIST_OFFSET

;; xor current data with lookback dist
	vpxor	ylens1, ylens1, ylookup
	vpxor	ylens2, ylens2, ylookup2

;; Compute match length
	vpxor	ytmp, ytmp, ytmp
	vpcmpeqb ylens1, ylens1, ytmp
	vpcmpeqb ylens2, ylens2, ytmp
	vmovdqu	yshift_finish, [shift_finish]
	vpand	ylens1, ylens1, yshift_finish
	vpand	ylens2, ylens2, yshift_finish
	vpsadbw	ylens1, ylens1, ytmp
	vpsadbw	ylens2, ylens2, ytmp
	vmovdqu	ydownconvert_qd, [downconvert_qd]
	vpshufb	ylens1, ylens1, ydownconvert_qd
	vextracti128 ytmp %+ x, ylens1, 1
	vpor	ylens1, ylens1, ytmp
	vpshufb	ylens2, ylens2, ydownconvert_qd
	vextracti128 ytmp %+ x, ylens2, 1
	vpor	ylens2, ylens2, ytmp
	vinserti128 ylens1, ylens1, ylens2 %+ x, 1
	vpsrld	ylens2, ylens1, 4
	vpand	ylens1, ylens1, [low_nibble]
	vmovdqu	ytmp, [match_cnt_perm]
	vpshufb	ylens1, ytmp, ylens1
	vpshufb	ylens2, ytmp, ylens2
	vpcmpeqb ytmp, ylens1, [match_cnt_low_max]
	vpand	ylens2, ylens2, ytmp
	vpaddd	ylens1, ylens1, ylens2

;; Zero out matches which should not be taken
	vmovdqu	yrot_left, [drot_left]
	vpermd	ylens2, yrot_left, ylens1
	vpermd	ydists, yrot_left, ydists

	vpinsrd	ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0
	vinserti128 ylens2, ylens2, ytmp %+ x, 0

	vpinsrd	ytmp %+ x, ydists %+ x, prev_dist %+ d, 0
	vinserti128 ydists, ydists, ytmp %+ x, 0

	vpcmpgtd ytmp, ylens2, [shortest_matches]
	vpcmpgtd ytmp2, ylens1, ylens2

	vpcmpeqd ytmp3, ytmp3, ytmp3
	vpxor	ytmp, ytmp, ytmp3
	vpor	ytmp, ytmp, ytmp2

	vpandn	ylens1, ytmp, ylens2

;; Update ydists to match ylens1
	vpaddd	ydists, ydists, ylens1
	vpaddd	ydists, ydists, [twofiftyfour]

	;; f_i was not advanced here, so the literal bytes start at f_i - 1.
	vpmovzxbd ytmp3, [f_i + file_start - 1]
	vpaddd	ytmp3, [null_dist_syms]
	vpand	ytmp3, ytmp3, ytmp
	vpandn	ydists, ytmp, ydists
	vpor	ydists, ydists, ytmp3

;;Store ydists
	vmovdqu	[matches_next], ydists

end_main:
	FUNC_RESTORE
	ret

endproc_frame

;; Constant tables used by gen_icf_map_lh1_04. The code only loads from
;; them. Each table is 32 bytes (one ymm register).
section .data
align 32
;; vpermd indices: dwords 1..4 of the source in both 128-bit lanes
datas_perm2:
	dd 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4
;; vpermd indices: rotate dword lanes up by one (lane 0 <- lane 7)
drot_left:
	dd 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6
;; vpshufb control: eight overlapping 4-byte windows at byte offsets 0..7
datas_shuf:
	db 0x0, 0x1, 0x2, 0x3
	db 0x1, 0x2, 0x3, 0x4
	db 0x2, 0x3, 0x4, 0x5
	db 0x3, 0x4, 0x5, 0x6
	db 0x4, 0x5, 0x6, 0x7
	db 0x5, 0x6, 0x7, 0x8
	db 0x6, 0x7, 0x8, 0x9
	db 0x7, 0x8, 0x9, 0xa
;; Byte-reversal shuffle per qword; not referenced by the code in this file
bswap_shuf:
	db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
	db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
	db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08

;; vpshufb control: four overlapping 8-byte windows at byte offsets 0..3
qword_shuf:
	db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
	db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9
	db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa

%define PROD1 0xE84B
%define PROD2 0x97B1

;; Word multipliers for the two vpmaddwd hashing passes
hash_prod:
	dw PROD1, PROD2, PROD1, PROD2, PROD1, PROD2, PROD1, PROD2
	dw PROD1, PROD2, PROD1, PROD2, PROD1, PROD2, PROD1, PROD2
;; Added to the zero-extended input byte for lanes emitted as literals
null_dist_syms:
	dd LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT
;; Per-lane position offset 0..7
increment:
	dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
ones:
	dd 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1
;; Constant added to ydists together with the match length
twofiftyfour:
	dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
dist_mask:
	dd D-1, D-1, D-1, D-1, D-1, D-1, D-1, D-1
hash_mask:
	dd HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK
	dd HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK
;; Lengths must be strictly greater than this to be kept as matches
shortest_matches:
	dd MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH
	dd MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH
;; Masks selecting the upper / lower 16 bits of each dword
;; (both tables are 64 bytes; the ymm loads read the first 32)
upper_word:
	dw 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff
	dw 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff
	dw 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff
	dw 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff
low_word:
	dw 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000
	dw 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000
	dw 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000
	dw 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000
;; One distinct bit per byte position within each qword
shift_finish:
	db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
	db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
	db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
	db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
;; vpshufb control: pack the low byte of each qword into dwords (0xff = zero)
downconvert_qd:
	db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff
	db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff
low_nibble:
	db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
	db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
	db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
	db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
;; Nibble lookup: number of consecutive set bits starting at bit 0
match_cnt_perm:
	db 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x3, 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x4
	db 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x3, 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x4
;; Low-nibble count value that allows the high-nibble count to be added
match_cnt_low_max:
	dd 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4
;; Nibble lookup used while deriving ydist_extra (index of highest set bit + 1)
bit_index:
	db 0x0, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3
	db 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4
	db 0x0, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3
	db 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4
;; Per-byte offsets added (vpaddb) to the bit_index lookups
base_offset:
	db -0x2, 0x2, 0x6, 0xa, -0x2, 0x2, 0x6, 0xa
	db -0x2, 0x2, 0x6, 0xa, -0x2, 0x2, 0x6, 0xa
	db -0x2, 0x2, 0x6, 0xa, -0x2, 0x2, 0x6, 0xa
	db -0x2, 0x2, 0x6, 0xa, -0x2, 0x2, 0x6, 0xa
;; vpshufb control: swap bytes 1 and 2 of every dword
nibble_order:
	db 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7
	db 0x8, 0xa, 0x9, 0xb, 0xc, 0xe, 0xd, 0xf
	db 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7
	db 0x8, 0xa, 0x9, 0xb, 0xc, 0xe, 0xd, 0xf