diff --git a/igzip/encode_df.h b/igzip/encode_df.h index 168e02c..f3e4f75 100644 --- a/igzip/encode_df.h +++ b/igzip/encode_df.h @@ -2,6 +2,7 @@ #define ENCODE_DF_H #include +#include "igzip_lib.h" #include "huff_codes.h" /* Deflate Intermediate Compression Format */ @@ -12,8 +13,9 @@ #define ICF_DIST_OFFSET LIT_LEN_BIT_COUNT #define NULL_DIST_SYM 30 -#define LEN_START 257 -#define LEN_OFFSET (LEN_START - 3) +#define LEN_START ISAL_DEF_LIT_SYMBOLS +#define LEN_OFFSET (LEN_START - ISAL_DEF_MIN_MATCH) +#define LEN_MAX (LEN_OFFSET + ISAL_DEF_MAX_MATCH) #define LIT_START (NULL_DIST_SYM + 1) #define ICF_CODE_LEN 32 diff --git a/igzip/huffman.h b/igzip/huffman.h index 505564e..dbb52aa 100644 --- a/igzip/huffman.h +++ b/igzip/huffman.h @@ -282,3 +282,61 @@ static inline int compare258(uint8_t * str1, uint8_t * str2, uint32_t max_length return count; } + +/** + * @brief Returns how long str1 and str2 have the same symbols. + * @param str1: First input string. + * @param str2: Second input string. + * @param max_length: length of the smaller string. 
+ */ +static inline int compare(uint8_t * str1, uint8_t * str2, uint32_t max_length) +{ + uint32_t count; + uint64_t test; + uint64_t loop_length; + + loop_length = max_length & ~0x7; + + for(count = 0; count < loop_length; count += 8){ + test = *(uint64_t *) str1; + test ^= *(uint64_t *) str2; + if(test != 0) + return count + tzbytecnt(test); + str1 += 8; + str2 += 8; + } + + switch(max_length % 8){ + + case 7: + if(*str1++ != *str2++) + return count; + count++; + case 6: + if(*str1++ != *str2++) + return count; + count++; + case 5: + if(*str1++ != *str2++) + return count; + count++; + case 4: + if(*str1++ != *str2++) + return count; + count++; + case 3: + if(*str1++ != *str2++) + return count; + count++; + case 2: + if(*str1++ != *str2++) + return count; + count++; + case 1: + if(*str1 != *str2) + return count; + count++; + } + + return count; +} diff --git a/igzip/igzip_compare_types.asm b/igzip/igzip_compare_types.asm index f525602..c5ab316 100644 --- a/igzip/igzip_compare_types.asm +++ b/igzip/igzip_compare_types.asm @@ -278,7 +278,78 @@ jmp %%end %%miscompare_vect: - bsf %%tmp, %%tmp + tzcnt %%tmp, %%tmp + add %%result, %%tmp +%%end: +%endm + +;; compares 64 bytes at a time +;; compare_z src1, src2, result, result_max, tmp, ktmp, ztmp1, ztmp2 +;; Clobbers result_max +%macro compare_z 8 +%define %%src1 %1 +%define %%src2 %2 +%define %%result %3 ; Accumulator for match_length +%define %%result_max %4 +%define %%tmp %5 ; tmp as a 64-bit register (used with bzhi/kmovq) +%define %%ktmp %6 +%define %%ztmp %7 +%define %%ztmp2 %8 + + sub %%result_max, 128 + cmp %%result, %%result_max + jg %%_by_64 + +%%loop1: + vmovdqu8 %%ztmp, [%%src1 + %%result] + vmovdqu8 %%ztmp2, [%%src2 + %%result] + vpcmpb %%ktmp, %%ztmp, %%ztmp2, NEQ + ktestq %%ktmp, %%ktmp + jnz %%miscompare + add %%result, 64 + + vmovdqu8 %%ztmp, [%%src1 + %%result] + vmovdqu8 %%ztmp2, [%%src2 + %%result] + vpcmpb %%ktmp, %%ztmp, %%ztmp2, NEQ + ktestq %%ktmp, %%ktmp + jnz %%miscompare + add %%result, 64 + + cmp %%result, 
%%result_max + jle %%loop1 + +%%_by_64: + add %%result_max, 64 + cmp %%result, %%result_max + jg %%_less_than_64 + + vmovdqu8 %%ztmp, [%%src1 + %%result] + vmovdqu8 %%ztmp2, [%%src2 + %%result] + vpcmpb %%ktmp, %%ztmp, %%ztmp2, NEQ + ktestq %%ktmp, %%ktmp + jnz %%miscompare + add %%result, 64 + +%%_less_than_64: + add %%result_max, 64 + sub %%result_max, %%result + jle %%end + + mov %%tmp, -1 + bzhi %%tmp, %%tmp, %%result_max + kmovq %%ktmp, %%tmp + + vmovdqu8 %%ztmp {%%ktmp}{z}, [%%src1 + %%result] + vmovdqu8 %%ztmp2 {%%ktmp}{z}, [%%src2 + %%result] + vpcmpb %%ktmp, %%ztmp, %%ztmp2, NEQ + ktestq %%ktmp, %%ktmp + jnz %%miscompare + add %%result, %%result_max + + jmp %%end +%%miscompare: + kmovq %%tmp, %%ktmp + tzcnt %%tmp, %%tmp add %%result, %%tmp %%end: %endm diff --git a/igzip/igzip_icf_body.c b/igzip/igzip_icf_body.c index 28e7b1f..61ba0e9 100644 --- a/igzip/igzip_icf_body.c +++ b/igzip/igzip_icf_body.c @@ -42,13 +42,15 @@ void set_long_icf_fg_base(uint8_t * next_in, uint8_t * end_in, dist = dist_start[dist_code] + dist_extra; len = match_lookup->lit_len; if (len >= 8 + LEN_OFFSET) { - match_len = - compare258(next_in - dist + 8, next_in + 8, 250) + LEN_OFFSET + 8; + match_len = compare(next_in - dist + 8, next_in + 8, + end_in - next_in + ISAL_DEF_MAX_MATCH) + + LEN_OFFSET + 8; while (match_len > match_lookup->lit_len && match_len >= LEN_OFFSET + SHORTEST_MATCH) { - write_deflate_icf(match_lookup, match_len, dist_code, - dist_extra); + write_deflate_icf(match_lookup, + match_len > LEN_MAX ? 
LEN_MAX : match_len, + dist_code, dist_extra); match_lookup++; next_in++; match_len--; diff --git a/igzip/igzip_set_long_icf_fg_04.asm b/igzip/igzip_set_long_icf_fg_04.asm index 8bd0b2d..50a1460 100644 --- a/igzip/igzip_set_long_icf_fg_04.asm +++ b/igzip/igzip_set_long_icf_fg_04.asm @@ -40,6 +40,7 @@ default rel %define arg2 rdx %define arg3 r8 %define dist_code rsi +%define tmp3 rsi %define len rdi %define tmp2 rdi %else @@ -47,6 +48,7 @@ default rel %define arg2 rsi %define arg3 rdx %define dist_code rcx +%define tmp3 rcx %define len r8 %define tmp2 r8 %endif @@ -63,14 +65,14 @@ default rel %define ymatch_lookup ymm0 %define ymatch_lookup2 ymm1 %define ylens ymm2 -%define ydists ymm3 +%define ycmp2 ymm3 %define ylens1 ymm4 %define ylens2 ymm5 %define ycmp ymm6 %define ytmp1 ymm7 %define ytmp2 ymm8 %define yvect_size ymm9 -%define ytwofiftyfour ymm10 +%define ymax_len ymm10 %define ytwofiftysix ymm11 %define ynlen_mask ymm12 %define ydists_mask ymm13 @@ -138,7 +140,7 @@ func(set_long_icf_fg_04) vmovdqu ydists_mask, [dists_mask] vmovdqu ynlen_mask, [nlen_mask] vmovdqu yvect_size, [vect_size] - vmovdqu ytwofiftyfour, [twofiftyfour] + vmovdqu ymax_len, [max_len] vmovdqu ytwofiftysix, [twofiftysix] vmovdqu ymatch_lookup, [match_lookup] @@ -180,27 +182,23 @@ func(set_long_icf_fg_04) mov match_in, next_in sub match_in, dist - mov len, 2 -%rep 7 - vmovdqu ytmp1, [next_in + len] - vmovdqu ytmp2, [match_in + len] - vpcmpeqb ycmp, ytmp1, [match_in + len] - vpmovmskb tmp1, ycmp - cmp tmp1 %+ d, 0xffffffff - jne .miscompare + mov len, 8 + mov tmp3, end_in + sub tmp3, next_in + add tmp3, 258 - add len, 32 -%endrep + compare_y next_in, match_in, len, tmp3, tmp1, ytmp1, ytmp2 - vmovdqu ytmp1, [next_in + len] - vmovdqu ytmp2, [match_in + len] - vpcmpeqb ycmp, ytmp1, [match_in + len] - vpmovmskb tmp1, ycmp + vmovd ylens1 %+ x, len %+ d + vpbroadcastd ylens1, ylens1 %+ x + vpsubd ylens1, ylens1, [increment] + vpaddd ylens1, ylens1, [twofiftyfour] + + mov tmp3, end_in + sub tmp3, 
next_in + cmp len, tmp3 + cmovg len, tmp3 -.miscompare: - not tmp1 %+ d - tzcnt tmp1 %+ d, tmp1 %+ d - add len, tmp1 add next_in, len lea match_lookup, [match_lookup + ICF_CODE_BYTES * len] vmovdqu ymatch_lookup, [match_lookup] @@ -208,10 +206,6 @@ func(set_long_icf_fg_04) vpbroadcastd ymatch_lookup2, ymatch_lookup2 %+ x vpand ymatch_lookup2, ymatch_lookup2, ynlen_mask - vmovd ylens1 %+ x, len %+ d - vpbroadcastd ylens1, ylens1 %+ x - vpsubd ylens1, ylens1, [increment] - vpaddd ylens1, ylens1, ytwofiftyfour neg len .update_match_lookup: @@ -222,7 +216,12 @@ func(set_long_icf_fg_04) vpand ycmp, ycmp, ytmp1 vpmovmskb tmp1, ycmp - vpaddd ylens2, ylens1, ymatch_lookup2 + vpcmpgtd ycmp2, ylens1, ymax_len + vpandn ylens, ycmp2, ylens1 + vpand ycmp2, ymax_len, ycmp2 + vpor ylens, ycmp2 + + vpaddd ylens2, ylens, ymatch_lookup2 vpand ylens2, ylens2, ycmp vpmaskmovd [match_lookup + ICF_CODE_BYTES * len], ycmp, ylens2 @@ -281,3 +280,6 @@ twofiftysix: nlen_mask: dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00 dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00 +max_len: + dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102 + dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102 diff --git a/igzip/igzip_set_long_icf_fg_06.asm b/igzip/igzip_set_long_icf_fg_06.asm index 6a36c4e..89858a4 100644 --- a/igzip/igzip_set_long_icf_fg_06.asm +++ b/igzip/igzip_set_long_icf_fg_06.asm @@ -39,12 +39,14 @@ %define arg2 rdx %define arg3 r8 %define dist_code rsi +%define tmp2 rsi %define len rdi %else %define arg1 rdi %define arg2 rsi %define arg3 rdx %define dist_code rcx +%define tmp2 rcx %define len r8 %endif @@ -71,7 +73,8 @@ %define datas zmm11 %define ztmp1 zmm12 %define ztmp2 zmm13 -%define zvect_size zmm17 +%define zvect_size zmm16 +%define zmax_len zmm17 %define ztwofiftyfour zmm18 %define ztwofiftysix zmm19 %define ztwosixtytwo zmm20 @@ -151,6 +154,7 @@ func(set_long_icf_fg_06) vbroadcasti64x2 zbswap, [bswap_shuf] vpbroadcastd znlen_mask, [nlen_mask] vpbroadcastd zvect_size, 
[vect_size] + vpbroadcastd zmax_len, [max_len] vpbroadcastd ztwofiftyfour, [twofiftyfour] vpbroadcastd ztwofiftysix, [twofiftysix] vpbroadcastd ztwosixtytwo, [twosixtytwo] @@ -230,25 +234,22 @@ func(set_long_icf_fg_06) mov match_in, next_in sub match_in, dist - mov len, 2 -%rep 3 - vmovdqu8 ztmp1, [next_in + len] - vmovdqu8 ztmp2, [match_in + len] - vpcmpb k3, ztmp1, [match_in + len], NEQ - ktestq k3, k3 - jnz .miscompare + mov len, 16 + mov tmp2, end_in + sub tmp2, next_in + add tmp2, 258 - add len, 64 -%endrep + compare_z next_in, match_in, len, tmp2, tmp1, k3, ztmp1, ztmp2 - vmovdqu8 ztmp1, [next_in + len] - vmovdqu8 ztmp2, [match_in + len] - vpcmpb k3, ztmp1, ztmp2, 4 + vpbroadcastd zlens1, len %+ d + vpsubd zlens1, zlens1, zincrement + vpaddd zlens1, zlens1, ztwofiftyfour + + mov tmp2, end_in + sub tmp2, next_in + cmp len, tmp2 + cmovg len, tmp2 -.miscompare: - kmovq tmp1, k3 - tzcnt tmp1, tmp1 - add len, tmp1 add next_in, len lea match_lookup, [match_lookup + ICF_CODE_BYTES * len] vmovdqu32 zmatch_lookup, [match_lookup] @@ -256,9 +257,6 @@ func(set_long_icf_fg_06) vpbroadcastd zmatch_lookup2, zmatch_lookup2 %+ x vpandd zmatch_lookup2, zmatch_lookup2, znlen_mask - vpbroadcastd zlens1, len %+ d - vpsubd zlens1, zlens1, zincrement - vpaddd zlens1, zlens1, ztwofiftyfour neg len .update_match_lookup: @@ -267,7 +265,11 @@ func(set_long_icf_fg_06) vpcmpgtd k4, zlens1, ztwofiftysix kandw k3, k3, k4 - vpaddd zlens2 {k3}{z}, zlens1, zmatch_lookup2 + vpcmpgtd k4, zlens1, zmax_len + vmovdqu32 zlens, zlens1 + vmovdqu32 zlens {k4}, zmax_len + + vpaddd zlens2 {k3}{z}, zlens, zmatch_lookup2 vmovdqu32 [match_lookup + ICF_CODE_BYTES * len] {k3}, zlens2 @@ -340,6 +342,8 @@ long_len: dd 0x105 long_len2: dd 0x7 +max_len: + dd 0xfe + 0x102 vect_size: dd VECT_SIZE twofiftyfour: diff --git a/include/igzip_lib.h b/include/igzip_lib.h index e6e8d81..800fa7b 100644 --- a/include/igzip_lib.h +++ b/include/igzip_lib.h @@ -85,13 +85,16 @@ extern "C" { #define ISAL_DEF_MAX_CODE_LEN 15 
#define ISAL_DEF_HIST_SIZE (32*IGZIP_K) #define ISAL_DEF_MAX_HIST_BITS 15 +#define ISAL_DEF_MAX_MATCH 258 +#define ISAL_DEF_MIN_MATCH 3 #define ISAL_DEF_LIT_SYMBOLS 257 #define ISAL_DEF_LEN_SYMBOLS 29 #define ISAL_DEF_DIST_SYMBOLS 30 #define ISAL_DEF_LIT_LEN_SYMBOLS (ISAL_DEF_LIT_SYMBOLS + ISAL_DEF_LEN_SYMBOLS) -#define ISAL_LOOK_AHEAD (18 * 16) /* Max repeat length, rounded up to 32 byte boundary */ +/* Max repeat length, rounded up to 32 byte boundary */ +#define ISAL_LOOK_AHEAD ((ISAL_DEF_MAX_MATCH + 31) & ~31) /******************************************************************************/ /* Deflate Implementation Specific Defines */