diff --git a/igzip/igzip_icf_body.c b/igzip/igzip_icf_body.c index 61ba0e9..11b2924 100644 --- a/igzip/igzip_icf_body.c +++ b/igzip/igzip_icf_body.c @@ -4,7 +4,7 @@ #include "igzip_level_buf_structs.h" extern uint64_t gen_icf_map_lh1(struct isal_zstream *, struct deflate_icf *, uint32_t); -extern void set_long_icf_fg(uint8_t *, uint8_t *, struct deflate_icf *, struct level_buf *); +extern void set_long_icf_fg(uint8_t *, uint64_t, uint64_t, struct deflate_icf *); extern void isal_deflate_icf_body_lvl1(struct isal_zstream *); extern void isal_deflate_icf_body_lvl2(struct isal_zstream *); extern void isal_deflate_icf_body_lvl3(struct isal_zstream *); @@ -24,9 +24,11 @@ static inline void write_deflate_icf(struct deflate_icf *icf, uint32_t lit_len, | (extra_bits << (LIT_LEN_BIT_COUNT + DIST_LIT_BIT_COUNT)); } -void set_long_icf_fg_base(uint8_t * next_in, uint8_t * end_in, - struct deflate_icf *match_lookup, struct level_buf *level_buf) +void set_long_icf_fg_base(uint8_t * next_in, uint64_t processed, uint64_t input_size, + struct deflate_icf *match_lookup) { + uint8_t *end_processed = next_in + processed; + uint8_t *end_in = next_in + input_size; uint32_t dist_code, dist_extra, dist, len; uint32_t match_len; uint32_t dist_start[] = { @@ -36,15 +38,17 @@ void set_long_icf_fg_base(uint8_t * next_in, uint8_t * end_in, 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000 }; - while (next_in < end_in - ISAL_LOOK_AHEAD) { + if (end_in > end_processed + ISAL_LOOK_AHEAD) + end_in = end_processed + ISAL_LOOK_AHEAD; + + while (next_in < end_processed) { dist_code = match_lookup->lit_dist; dist_extra = match_lookup->dist_extra; dist = dist_start[dist_code] + dist_extra; len = match_lookup->lit_len; if (len >= 8 + LEN_OFFSET) { - match_len = compare(next_in - dist + 8, next_in + 8, - end_in - next_in + ISAL_DEF_MAX_MATCH) + - LEN_OFFSET + 8; + match_len = compare((next_in + 8) - dist, next_in + 8, + end_in - (next_in + 8)) + LEN_OFFSET + 8; while (match_len > match_lookup->lit_len && match_len >= LEN_OFFSET + SHORTEST_MATCH) { @@ -251,8 +255,7 @@ void icf_body_hash1_fillgreedy_lazy(struct isal_zstream *stream) processed = gen_icf_map_h1_base(stream, matches_icf_lookup, input_size); - set_long_icf_fg(stream->next_in, stream->next_in + processed, - matches_icf_lookup, level_buf); + set_long_icf_fg(stream->next_in, processed, input_size, matches_icf_lookup); stream->next_in += processed; stream->avail_in -= processed; @@ -291,8 +294,7 @@ void icf_body_lazyhash1_fillgreedy_greedy(struct isal_zstream *stream) processed = gen_icf_map_lh1(stream, matches_icf_lookup, input_size); - set_long_icf_fg(stream->next_in, stream->next_in + processed, - matches_icf_lookup, level_buf); + set_long_icf_fg(stream->next_in, processed, input_size, matches_icf_lookup); stream->next_in += processed; stream->avail_in -= processed; diff --git a/igzip/igzip_set_long_icf_fg_04.asm b/igzip/igzip_set_long_icf_fg_04.asm index 50a1460..f5c2b98 100644 --- a/igzip/igzip_set_long_icf_fg_04.asm +++ b/igzip/igzip_set_long_icf_fg_04.asm @@ -39,28 +39,30 @@ default rel %define arg1 rcx %define arg2 rdx %define arg3 r8 -%define dist_code rsi -%define tmp3 rsi +%define arg4 r9 %define len rdi %define tmp2 rdi +%define dist rsi %else %define arg1 rdi %define arg2 rsi %define arg3 rdx -%define dist_code rcx -%define tmp3 rcx +%define arg4 rcx %define len r8 %define tmp2 r8 +%define dist r9 %endif %define next_in arg1 -%define end_in arg2 -%define match_lookup arg3 +%define end_processed arg2 +%define end_in arg3 +%define match_lookup arg4 %define match_in rax -%define dist r9 %define match_offset r10 %define tmp1 r11 -%define end_in_orig r12 +%define end_processed_orig r12 +%define dist_code r13 +%define tmp3 r13 %define ymatch_lookup ymm0 %define ymatch_lookup2 ymm1 @@ -97,6 +99,7 @@ default rel save_reg rsi, 10*16 + 0*8 save_reg rdi, 10*16 + 1*8 save_reg r12, 10*16 + 2*8 + save_reg r13, 10*16 + 3*8 end_prolog %endm @@ -115,15 +118,18 @@ default rel mov rsi, [rsp + 10*16 + 0*8] mov rdi, [rsp + 10*16 + 1*8] mov r12, [rsp + 10*16 + 2*8] + mov r13, [rsp + 10*16 + 3*8] add rsp, stack_size %endm %else %define func(x) x: %macro FUNC_SAVE 0 push r12 + push r13 %endm %macro FUNC_RESTORE 0 + pop r13 pop r12 %endm %endif @@ -133,8 +139,13 @@ global set_long_icf_fg_04 func(set_long_icf_fg_04) FUNC_SAVE - mov end_in_orig, end_in - sub end_in, VECT_SIZE - 1 + lea end_in, [next_in + arg3] + add end_processed, next_in + mov end_processed_orig, end_processed + lea tmp1, [end_processed + LA_STATELESS] + cmp end_in, tmp1 + cmovg end_in, tmp1 + sub end_processed, VECT_SIZE - 1 vmovdqu ylong_lens, [long_len] vmovdqu ylens_mask, [len_mask] vmovdqu ydists_mask, [dists_mask] @@ -148,7 +159,7 @@ func(set_long_icf_fg_04) vmovdqu ymatch_lookup2, ymatch_lookup vmovdqu ymatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE] - cmp next_in, end_in + cmp next_in, end_processed jae .end_fill .finish_entry: @@ -185,7 +196,6 @@ func(set_long_icf_fg_04) mov len, 8 mov tmp3, end_in sub tmp3, next_in - add tmp3, 258 compare_y next_in, match_in, len, tmp3, tmp1, ytmp1, ytmp2 @@ -194,7 +204,7 @@ func(set_long_icf_fg_04) vpsubd ylens1, ylens1, [increment] vpaddd ylens1, ylens1, [twofiftyfour] - mov tmp3, end_in + mov tmp3, end_processed sub tmp3, next_in cmp len, tmp3 cmovg len, tmp3 @@ -235,11 +245,11 @@ func(set_long_icf_fg_04) jmp .update_match_lookup .end_fill: - mov end_in, end_in_orig - cmp next_in, end_in + mov end_processed, end_processed_orig + cmp next_in, end_processed jge .finish - mov tmp1, end_in + mov tmp1, end_processed sub tmp1, next_in vmovd ytmp1 %+ x, tmp1 %+ d vpbroadcastd ytmp1, ytmp1 %+ x diff --git a/igzip/igzip_set_long_icf_fg_06.asm b/igzip/igzip_set_long_icf_fg_06.asm index 89858a4..39708ed 100644 --- a/igzip/igzip_set_long_icf_fg_06.asm +++ b/igzip/igzip_set_long_icf_fg_06.asm @@ -38,26 +38,28 @@ %define arg1 rcx %define arg2 rdx %define arg3 r8 -%define dist_code rsi -%define tmp2 rsi +%define arg4 r9 %define len rdi +%define dist rsi %else %define arg1 rdi %define arg2 rsi %define arg3 rdx -%define dist_code rcx -%define tmp2 rcx +%define arg4 rcx %define len r8 +%define dist r9 %endif %define next_in arg1 -%define end_in arg2 -%define match_lookup arg3 +%define end_processed arg2 +%define end_in arg3 +%define match_lookup arg4 %define match_in rax -%define dist r9 %define match_offset r10 %define tmp1 r11 -%define end_in_orig r12 +%define end_processed_orig r12 +%define dist_code r13 +%define tmp2 r13 %define zmatch_lookup zmm0 %define zmatch_lookup2 zmm1 @@ -106,6 +108,7 @@ save_reg rsi, 8*16 + 0*8 save_reg rdi, 8*16 + 1*8 save_reg r12, 8*16 + 2*8 + save_reg r13, 8*16 + 3*8 end_prolog %endm @@ -122,15 +125,18 @@ mov rsi, [rsp + 8*16 + 0*8] mov rdi, [rsp + 8*16 + 1*8] mov r12, [rsp + 8*16 + 2*8] + mov r13, [rsp + 8*16 + 3*8] add rsp, stack_size %endm %else %define func(x) x: %macro FUNC_SAVE 0 push r12 + push r13 %endm %macro FUNC_RESTORE 0 + pop r13 pop r12 %endm %endif @@ -140,8 +146,13 @@ global set_long_icf_fg_06 func(set_long_icf_fg_06) FUNC_SAVE - mov end_in_orig, end_in - sub end_in, 15 + lea end_in, [next_in + arg3] + add end_processed, next_in + mov end_processed_orig, end_processed + lea tmp1, [end_processed + LA_STATELESS] + cmp end_in, tmp1 + cmovg end_in, tmp1 + sub end_processed, 15 vpbroadcastd zlong_lens, [long_len] vpbroadcastd zlong_lens2, [long_len2] vpbroadcastd zlens_mask, [len_mask] @@ -164,7 +175,7 @@ func(set_long_icf_fg_06) vmovdqu32 zmatch_lookup2, zmatch_lookup vmovdqu32 zmatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE] - cmp next_in, end_in + cmp next_in, end_processed jae .end_fill .finish_entry: @@ -237,7 +248,6 @@ func(set_long_icf_fg_06) mov len, 16 mov tmp2, end_in sub tmp2, next_in - add tmp2, 258 compare_z next_in, match_in, len, tmp2, tmp1, k3, ztmp1, ztmp2 @@ -245,7 +255,7 @@ func(set_long_icf_fg_06) vpsubd zlens1, zlens1, zincrement vpaddd zlens1, zlens1, ztwofiftyfour - mov tmp2, end_in + mov tmp2, end_processed sub tmp2, next_in cmp len, tmp2 cmovg len, tmp2 @@ -283,11 +293,11 @@ func(set_long_icf_fg_06) jmp .update_match_lookup .end_fill: - mov end_in, end_in_orig - cmp next_in, end_in + mov end_processed, end_processed_orig + cmp next_in, end_processed jge .finish - mov tmp1, end_in + mov tmp1, end_processed sub tmp1, next_in vpbroadcastd ztmp1, tmp1 %+ d vpcmpd k3, ztmp1, zincrement, 6