;-----------------------------------------------------------------------
; gen_icf_map_lh1_06 -- AVX-512 match finder producing one 32-bit
; intermediate code per input byte, VECT_SIZE (16) bytes per iteration.
;
; Syntax: NASM (Intel operand order).
; ABI:    Microsoft x64 when __OUTPUT_FORMAT__ is win64, otherwise the
;         System V argument registers are used.
; In:     arg1 = stream pointer; the fields at _next_in, _total_in and
;                _level_buf are read from it.
;         arg2 = matches_next, output cursor (64 bytes stored per block).
;         arg3 = value added to the starting index to form the end index
;                (presumably the number of input bytes -- confirm against
;                the caller).
; Out:    nothing is returned in a register by this code.
; Saved:  win64: xmm6-xmm15, rsi, rdi, rbp, r12.  Otherwise: rbp, r12.
; NOTE(review): offsets and constants such as _next_in, LA, LIT, D,
; DIST_OFFSET and ICF_CODE_BYTES come from the included files.
;-----------------------------------------------------------------------

%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"

%ifdef HAVE_AS_KNOWS_AVX512

;; Argument registers and scratch GPRs, chosen per calling convention
%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define hash rsi
%define next_in rdi
%else
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define hash r8
%define next_in rcx
%endif

;; stream and level_buf share arg1: stream is only used before
;; level_buf is loaded over it.
%define stream arg1
%define level_buf arg1
%define matches_next arg2
%define f_i_end arg3

%define f_i rax
%define file_start rbp
%define next_byte r9
%define encode_size r10
%define prev_len r11
%define prev_dist r12

%define hash_table level_buf + _hash_map_hash_table

;; Vector register roles
%define datas zmm0
%define datas_lookup zmm1
%define zhashes zmm2
%define zdists zmm3
%define zdists_lookup zmm4
%define zscatter zmm5
%define zdists2 zmm6
%define zlens1 zmm7
%define zlens2 zmm8
%define zlookup zmm9
%define zlookup2 zmm10
%define match_lookups zmm11
%define zindex zmm12
%define zdist_extra zmm13
%define zdists_tmp zmm14
%define znull_dist_syms zmm15
%define zcode zmm16
%define zthirty zmm17
%define zdist_mask zmm18
%define zshortest_matches zmm19
%define zrot_left zmm20
%define zdatas_perm zmm21
%define zdatas_perm2 zmm22
%define zdatas_perm3 zmm23
%define zdatas_shuf zmm24
%define zhash_prod zmm25
%define zhash_mask zmm26
%define zincrement zmm27
%define zqword_shuf zmm28
%define zones zmm29
%define ztwofiftyfour zmm30
%define zbswap zmm31

%ifidn __OUTPUT_FORMAT__, win64
;; 10 xmm slots + 4 GPR slots + 8 bytes of padding = 200 bytes, so rsp is
;; 16-byte aligned after the allocation (rsp % 16 == 8 on entry).
%define stack_size 10*16 + 4 * 8 + 8

;; Spill every callee-saved register this function writes.
%macro FUNC_SAVE 0
        alloc_stack     stack_size
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        vmovdqa [rsp + 4*16], xmm10
        vmovdqa [rsp + 5*16], xmm11
        vmovdqa [rsp + 6*16], xmm12
        vmovdqa [rsp + 7*16], xmm13
        vmovdqu [rsp + 8*16], xmm14
        vmovdqa [rsp + 9*16], xmm15
        save_reg        rsi, 10*16 + 0*8
        save_reg        rdi, 10*16 + 1*8
        save_reg        rbp, 10*16 + 2*8
        save_reg        r12, 10*16 + 3*8
        end_prolog
%endm

;; Reload everything FUNC_SAVE spilled, then release the frame.
%macro FUNC_RESTORE 0
        vmovdqa xmm6, [rsp + 0*16]
        vmovdqa xmm7, [rsp + 1*16]
        vmovdqa xmm8, [rsp + 2*16]
        vmovdqa xmm9, [rsp + 3*16]
        vmovdqa xmm10, [rsp + 4*16]
        vmovdqa xmm11, [rsp + 5*16]
        vmovdqa xmm12, [rsp + 6*16]
        vmovdqa xmm13, [rsp + 7*16]
        vmovdqa xmm14, [rsp + 8*16]
        vmovdqa xmm15, [rsp + 9*16]
        ;; GPRs come back from the same rsp-relative slots that the
        ;; save_reg lines in FUNC_SAVE name.
        mov     rsi, [rsp + 10*16 + 0*8]
        mov     rdi, [rsp + 10*16 + 1*8]
        mov     rbp, [rsp + 10*16 + 2*8]
        mov     r12, [rsp + 10*16 + 3*8]
        add     rsp, stack_size
%endm
%else
;; Only rbp (file_start) and r12 (prev_dist) need preserving here.
%macro FUNC_SAVE 0
        push    rbp
        push    r12
%endm

%macro FUNC_RESTORE 0
        pop     r12
        pop     rbp
%endm
%endif

%define VECT_SIZE 16
%define HASH_BYTES 2

;; Bare [label] operands below address the constant tables; make them
;; RIP-relative so the object links into position-independent images.
default rel
section .text

global gen_icf_map_lh1_06
gen_icf_map_lh1_06:
        FUNC_SAVE

        ;; f_i is the running index; file_start is biased by it so that
        ;; [f_i + file_start] addresses the next input byte.
        mov     file_start, [stream + _next_in]
        mov     f_i %+ d, dword [stream + _total_in]
        sub     file_start, f_i
        add     f_i_end, f_i
        cmp     f_i, f_i_end
        jge     end_main

;; Prep for main loop
        mov     level_buf, [stream + _level_buf]        ; overwrites stream (same register)
        sub     f_i_end, LA
        vmovdqu64 zdatas_perm, [datas_perm]
        vmovdqu64 zdatas_shuf, [datas_shuf]
        vmovdqu64 zhash_prod, [hash_prod]
        vmovdqu64 zhash_mask, [hash_mask]
        vmovdqu64 zincrement, [increment]
        vmovdqu64 zqword_shuf, [qword_shuf]
        vmovdqu64 zdatas_perm2, [datas_perm2]
        vmovdqu64 zdatas_perm3, [datas_perm3]
        vmovdqu64 zones, [ones]
        vmovdqu64 zbswap, [bswap_shuf]
        vmovdqu64 zthirty, [thirty]
        vmovdqu64 zrot_left, [drot_left]
        vmovdqu64 zdist_mask, [dist_mask]
        vmovdqu64 zshortest_matches, [shortest_matches]
        vmovdqu64 ztwofiftyfour, [twofiftyfour]
        vmovdqu64 znull_dist_syms, [null_dist_syms]
        kxorq   k0, k0, k0                      ; k0 = 0, inverted below for an all-ones mask
        kmovq   k1, [k_mask_1]                  ; alternating word-select mask
        kmovq   k2, [k_mask_2]                  ; loaded but not read by the code in this file
        xor     prev_len, prev_len
        xor     prev_dist, prev_dist

;; Process first byte: hash it and record its index in the table
        vmovd   zhashes %+ x, dword [f_i + file_start]
        vpmaddwd zhashes, zhashes, zhash_prod
        vpmaddwd zhashes, zhashes, zhash_prod
        vpandd  zhashes, zhashes, zhash_mask
        vmovd   hash %+ d, zhashes %+ x
        mov     word [hash_table + HASH_BYTES * hash], f_i %+ w
        add     f_i, 1
        cmp     f_i, f_i_end
        jg      end_main

;; Hash the first block of VECT_SIZE positions
        vmovdqu64 datas %+ y, [f_i + file_start]
        vpermq  zhashes, zdatas_perm, datas
        vpshufb zhashes, zhashes, zdatas_shuf   ; one 4-byte window per dword lane
        vpmaddwd zhashes, zhashes, zhash_prod
        vpmaddwd zhashes, zhashes, zhash_prod
        vpandd  zhashes, zhashes, zhash_mask
        ;; 8-byte windows of the current data, used to measure match length
        vpermq  zlookup, zdatas_perm2, datas
        vpshufb zlookup, zlookup, zqword_shuf
        vpermq  zlookup2, zdatas_perm3, datas
        vpshufb zlookup2, zlookup2, zqword_shuf

;; Gather/scatter hashes
        knotq   k6, k0
        vpgatherdd zdists_lookup {k6}, [hash_table + HASH_BYTES * zhashes]
        vpbroadcastd zindex, f_i %+ d
        vpaddd  zindex, zindex, zincrement
        ;; Table entries are HASH_BYTES wide but the scatter writes dwords:
        ;; odd words keep the gathered value, even words take the new index.
        vpblendmw zscatter {k1}, zindex, zdists_lookup
        knotq   k6, k0                          ; the gather cleared k6; rebuild it
        vpscatterdd [hash_table + HASH_BYTES * zhashes] {k6}, zscatter

;; Compute hash for next loop
        vmovdqu64 datas %+ y, [f_i + file_start + VECT_SIZE]
        vpermq  zhashes, zdatas_perm, datas
        vpshufb zhashes, zhashes, zdatas_shuf
        vpmaddwd zhashes, zhashes, zhash_prod
        vpmaddwd zhashes, zhashes, zhash_prod
        vpandd  zhashes, zhashes, zhash_mask
        vmovdqu64 datas_lookup %+ y, [f_i + file_start + 2 * VECT_SIZE]

        sub     f_i_end, VECT_SIZE
        cmp     f_i, f_i_end
        jg      loop1_end

loop1:
        lea     next_in, [f_i + file_start]

        ;; Calculate look back dists: per-lane offset from next_in to the
        ;; position previously recorded for this hash
        vpaddd  zdists, zdists_lookup, zones
        vpsubd  zdists, zindex, zdists
        vpandd  zdists, zdists, zdist_mask
        vpaddd  zdists, zdists, zones
        vpsubd  zdists, zincrement, zdists

        ;; Gather/scatter hashes for the next block
        add     f_i, VECT_SIZE
        kxnorq  k6, k6, k6
        kxnorq  k7, k7, k7
        vpgatherdd zdists_lookup {k6}, [hash_table + HASH_BYTES * zhashes]
        vpbroadcastd zindex, f_i %+ d
        vpaddd  zindex, zindex, zincrement
        vpblendmw zscatter {k1}, zindex, zdists_lookup
        vpscatterdd [hash_table + HASH_BYTES * zhashes] {k7}, zscatter

        ;; Compute hash for next loop
        vpermq  zhashes, zdatas_perm, datas_lookup
        vpshufb zhashes, zhashes, zdatas_shuf
        vpmaddwd zhashes, zhashes, zhash_prod
        vpmaddwd zhashes, zhashes, zhash_prod
        vpandd  zhashes, zhashes, zhash_mask

        ;; Lookup old data: 8 bytes at each look-back position
        vextracti32x8 zdists2 %+ y, zdists, 1
        kxnorq  k6, k6, k6
        kxnorq  k7, k7, k7
        vpgatherdq zlens1 {k6}, [next_in + zdists %+ y]
        vpgatherdq zlens2 {k7}, [next_in + zdists2 %+ y]

        ;; Calculate dist_icf_code
        vpaddd  zdists, zdists, zones
        vpsubd  zdists, zincrement, zdists
        vpcmpgtd k5, zdists, zones
        vplzcntd zdist_extra, zdists
        vpsubd  zdist_extra {k5}{z}, zthirty, zdist_extra
        vpsllvd zcode, zones, zdist_extra
        vpsubd  zcode, zcode, zones
        vpandd  zcode {k5}{z}, zdists, zcode
        vpsrlvd zdists, zdists, zdist_extra
        vpslld  zdist_extra, zdist_extra, 1
        vpaddd  zdists, zdists, zdist_extra
        vpslld  zcode, zcode, EXTRA_BITS_OFFSET - DIST_OFFSET
        vpaddd  zdists, zdists, zcode

        ;; Setup zdists for combining with zlens
        vpslld  zdists, zdists, DIST_OFFSET

        ;; Xor current data with the look-back data
        vpxorq  zlens1, zlens1, zlookup
        vpxorq  zlens2, zlens2, zlookup2

        ;; Setup registers for next loop
        vpermq  zlookup, zdatas_perm2, datas
        vpshufb zlookup, zlookup, zqword_shuf
        vpermq  zlookup2, zdatas_perm3, datas
        vpshufb zlookup2, zlookup2, zqword_shuf

        ;; Compute match length: byte-swap each qword, count leading
        ;; zero bits, divide by 8 => number of leading equal bytes
        vpshufb zlens1, zlens1, zbswap
        vpshufb zlens2, zlens2, zbswap
        vplzcntq zlens1, zlens1
        vplzcntq zlens2, zlens2
        vpmovqd zlens1 %+ y, zlens1
        vpmovqd zlens2 %+ y, zlens2
        vinserti32x8 zlens1, zlens2 %+ y, 1
        vpsrld  zlens1, zlens1, 3

        ;; Preload for next loops
        vmovdqu64 datas, datas_lookup
        vmovdqu64 datas_lookup %+ y, [f_i + file_start + 2 * VECT_SIZE]

        ;; Zero out matches which should not be taken.
        ;; k3 selects lane 0 only.  zlens2/zdists become the values rotated
        ;; up by one lane; lane 0 is filled from the previous block's last
        ;; lane (prev_len/prev_dist), which are then refreshed.
        kshiftrw k3, k1, 15
        vpermd  zlens2, zrot_left, zlens1
        vpermd  zdists, zrot_left, zdists

        vmovd   zdists_tmp %+ x, prev_len %+ d
        vmovd   prev_len %+ d, zlens2 %+ x
        vmovdqu32 zlens2 {k3}, zdists_tmp

        vmovd   zdists_tmp %+ x, prev_dist %+ d
        vmovd   prev_dist %+ d, zdists %+ x
        vmovdqu32 zdists {k3}, zdists_tmp

        ;; k3 = lanes emitted as literals: rotated length not above the
        ;; minimum, or the lane's own length exceeds the rotated one.
        ;; k4 = the remaining lanes, which keep the rotated length.
        vpcmpgtd k3, zlens2, zshortest_matches
        vpcmpgtd k4, zlens1, zlens2
        knotq   k3, k3
        korq    k3, k3, k4
        knotq   k4, k3
        vmovdqu32 zlens1 {k4}{z}, zlens2

        ;; Update zdists to match zlens1
        vpaddd  zdists, zdists, zlens1
        vpaddd  zdists, zdists, ztwofiftyfour
        ;; Literal lanes: zero-extended input byte plus the LIT constant
        vpmovzxbd zdists {k3}, [f_i + file_start - VECT_SIZE - 1]
        vpaddd  zdists {k3}, zdists, znull_dist_syms

        ;; Store zdists
        vmovdqu64 [matches_next], zdists
        add     matches_next, ICF_CODE_BYTES * VECT_SIZE

        cmp     f_i, f_i_end
        jle     loop1

;; Final block: same steps as the loop body, without the hash-table
;; update and without the preloads for a following iteration.
loop1_end:
        lea     next_in, [f_i + file_start]

        ;; Calculate look back dists
        vpaddd  zdists, zdists_lookup, zones
        vpsubd  zdists, zindex, zdists
        vpandd  zdists, zdists, zdist_mask
        vpaddd  zdists, zdists, zones
        vpsubd  zdists, zincrement, zdists

        ;; Lookup old data
        vextracti32x8 zdists2 %+ y, zdists, 1
        kxnorq  k6, k6, k6
        kxnorq  k7, k7, k7
        vpgatherdq zlens1 {k6}, [next_in + zdists %+ y]
        vpgatherdq zlens2 {k7}, [next_in + zdists2 %+ y]

        ;; Calculate dist_icf_code
        vpaddd  zdists, zdists, zones
        vpsubd  zdists, zincrement, zdists
        vpcmpgtd k5, zdists, zones
        vplzcntd zdist_extra, zdists
        vpsubd  zdist_extra {k5}{z}, zthirty, zdist_extra
        vpsllvd zcode, zones, zdist_extra
        vpsubd  zcode, zcode, zones
        vpandd  zcode {k5}{z}, zdists, zcode
        vpsrlvd zdists, zdists, zdist_extra
        vpslld  zdist_extra, zdist_extra, 1
        vpaddd  zdists, zdists, zdist_extra
        vpslld  zcode, zcode, EXTRA_BITS_OFFSET - DIST_OFFSET
        vpaddd  zdists, zdists, zcode

        ;; Setup zdists for combining with zlens
        vpslld  zdists, zdists, DIST_OFFSET

        ;; Xor current data with the look-back data
        vpxorq  zlens1, zlens1, zlookup
        vpxorq  zlens2, zlens2, zlookup2

        ;; Compute match length
        vpshufb zlens1, zlens1, zbswap
        vpshufb zlens2, zlens2, zbswap
        vplzcntq zlens1, zlens1
        vplzcntq zlens2, zlens2
        vpmovqd zlens1 %+ y, zlens1
        vpmovqd zlens2 %+ y, zlens2
        vinserti32x8 zlens1, zlens2 %+ y, 1
        vpsrld  zlens1, zlens1, 3

        ;; Zero out matches which should not be taken
        kshiftrw k3, k1, 15
        vpermd  zlens2, zrot_left, zlens1
        vpermd  zdists, zrot_left, zdists

        vmovd   zdists_tmp %+ x, prev_len %+ d
        vmovd   prev_len %+ d, zlens2 %+ x
        vmovdqu32 zlens2 {k3}, zdists_tmp

        vmovd   zdists_tmp %+ x, prev_dist %+ d
        vmovd   prev_dist %+ d, zdists %+ x
        vmovdqu32 zdists {k3}, zdists_tmp

        vpcmpgtd k3, zlens2, zshortest_matches
        vpcmpgtd k4, zlens1, zlens2
        knotq   k3, k3
        korq    k3, k3, k4
        knotq   k4, k3
        vmovdqu32 zlens1 {k4}{z}, zlens2

        ;; Update zdists to match zlens1
        vpaddd  zdists, zdists, zlens1
        vpaddd  zdists, zdists, ztwofiftyfour
        vpmovzxbd zdists {k3}, [f_i + file_start - 1]
        vpaddd  zdists {k3}, zdists, znull_dist_syms

        ;; Store zdists
        vmovdqu64 [matches_next], zdists

end_main:
        FUNC_RESTORE
        ret

section .data
align 64
;; qword selectors for vpermq: which source qwords feed each lane group
datas_perm:
        dq 0x0, 0x1, 0x0, 0x1, 0x1, 0x2, 0x1, 0x2
datas_perm2:
        dq 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1
datas_perm3:
        dq 0x1, 0x2, 0x1, 0x2, 0x1, 0x2, 0x1, 0x2

;; dword selector for vpermd: lane i takes lane i-1, lane 0 takes lane 15
drot_left:
        dd 0xf, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6
        dd 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe

;; vpshufb pattern: overlapping 4-byte windows at byte offsets 0..7
datas_shuf:
        db 0x0, 0x1, 0x2, 0x3
        db 0x1, 0x2, 0x3, 0x4
        db 0x2, 0x3, 0x4, 0x5
        db 0x3, 0x4, 0x5, 0x6
        db 0x4, 0x5, 0x6, 0x7
        db 0x5, 0x6, 0x7, 0x8
        db 0x6, 0x7, 0x8, 0x9
        db 0x7, 0x8, 0x9, 0xa
        db 0x0, 0x1, 0x2, 0x3
        db 0x1, 0x2, 0x3, 0x4
        db 0x2, 0x3, 0x4, 0x5
        db 0x3, 0x4, 0x5, 0x6
        db 0x4, 0x5, 0x6, 0x7
        db 0x5, 0x6, 0x7, 0x8
        db 0x6, 0x7, 0x8, 0x9
        db 0x7, 0x8, 0x9, 0xa

;; vpshufb pattern: reverse the bytes of every qword
bswap_shuf:
        db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
        db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
        db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
        db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
        db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08

;; vpshufb pattern: overlapping 8-byte windows.  Nine rows are defined;
;; the 64-byte load above reads the first eight.
qword_shuf:
        db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
        db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9
        db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa
        db 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb
        db 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc
        db 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd
        db 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe
        db 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf

;; Word multipliers applied twice with vpmaddwd to form the hash
%define PROD1 0xE84B
%define PROD2 0x97B1
hash_prod:
        dw PROD1, PROD2, PROD1, PROD2, PROD1, PROD2, PROD1, PROD2
        dw PROD1, PROD2, PROD1, PROD2, PROD1, PROD2, PROD1, PROD2
        dw PROD1, PROD2, PROD1, PROD2, PROD1, PROD2, PROD1, PROD2
        dw PROD1, PROD2, PROD1, PROD2, PROD1, PROD2, PROD1, PROD2

;; Added to the literal byte in lanes that carry no match
null_dist_syms:
        dd LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT
        dd LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT

;; Per-lane position offsets 0..15
increment:
        dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        dd 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf

ones:
        dd 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1
        dd 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1

thirty:
        dd 0x1e, 0x1e, 0x1e, 0x1e, 0x1e, 0x1e, 0x1e, 0x1e
        dd 0x1e, 0x1e, 0x1e, 0x1e, 0x1e, 0x1e, 0x1e, 0x1e

twofiftyfour:
        dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
        dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe

dist_mask:
        dd D-1, D-1, D-1, D-1, D-1, D-1, D-1, D-1
        dd D-1, D-1, D-1, D-1, D-1, D-1, D-1, D-1

hash_mask:
        dd HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK
        dd HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK
        dd HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK
        dd HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK, HASH_MAP_HASH_MASK

;; Not referenced by the code in this file
lit_len_mask:
        dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
        dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
        dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
        dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK

shortest_matches:
        dd MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH
        dd MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH
        dd MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH
        dd MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH, MIN_DEF_MATCH

;; k1: every odd bit set (odd word lanes).  k2: low 15 bits set.
k_mask_1:
        dq 0xaaaaaaaaaaaaaaaa
k_mask_2:
        dq 0x7fff

%endif