From 31814483c0163d27ab565739c5a0fe177c56f994 Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Mon, 6 Jun 2016 09:23:26 -0700 Subject: [PATCH] igzip: Create assembly version of isal_update_histogram Signed-off-by: Roy Oursler Reviewed-by: Greg Tucker --- igzip/Makefile.am | 10 +- igzip/huff_codes.c | 4 +- igzip/igzip_hist_perf.c | 348 +++++++++++++++++++++ igzip/igzip_multibinary.asm | 7 + igzip/igzip_update_histogram.asm | 467 ++++++++++++++++++++++++++++ igzip/igzip_update_histogram_01.asm | 7 + igzip/igzip_update_histogram_04.asm | 8 + igzip/rfc1951_lookup.asm | 44 +++ include/igzip_lib.h | 1 + 9 files changed, 892 insertions(+), 4 deletions(-) create mode 100644 igzip/igzip_hist_perf.c create mode 100644 igzip/igzip_update_histogram.asm create mode 100644 igzip/igzip_update_histogram_01.asm create mode 100644 igzip/igzip_update_histogram_04.asm create mode 100644 igzip/rfc1951_lookup.asm diff --git a/igzip/Makefile.am b/igzip/Makefile.am index d18353d..cee0a44 100644 --- a/igzip/Makefile.am +++ b/igzip/Makefile.am @@ -36,7 +36,10 @@ lsrc += igzip/igzip.c igzip/hufftables_c.c \ igzip/crc32_gzip.asm igzip/detect_repeated_char.asm \ igzip/igzip_multibinary.asm \ igzip/igzip_stateless_base.c \ - igzip/igzip_base.c + igzip/igzip_base.c \ + igzip/igzip_update_histogram_01.asm \ + igzip/igzip_update_histogram_04.asm \ + igzip/rfc1951_lookup.asm src_include += -I $(srcdir)/igzip extern_hdrs += include/igzip_lib.h @@ -49,7 +52,7 @@ check_tests += igzip/igzip_check perf_tests += igzip/igzip_perf igzip/igzip_sync_flush_perf -other_tests += igzip/igzip_file_perf igzip/igzip_sync_flush_file_perf igzip/igzip_stateless_file_perf +other_tests += igzip/igzip_file_perf igzip/igzip_sync_flush_file_perf igzip/igzip_stateless_file_perf igzip/igzip_hist_perf other_src += igzip/bitbuf2.asm igzip/data_struct2.asm \ igzip/igzip_buffer_utils_01.asm \ @@ -59,6 +62,7 @@ other_src += igzip/bitbuf2.asm igzip/data_struct2.asm \ igzip/bitbuf2.h igzip/repeated_char_result.h \ 
igzip/igzip_body.asm \ igzip/igzip_stateless.asm \ + igzip/igzip_update_histogram.asm \ igzip/huffman.asm \ include/reg_sizes.asm \ include/multibinary.asm \ @@ -94,3 +98,5 @@ igzip_igzip_inflate_test_LDADD = igzip/igzip_inflate_ref.lo libisal.la igzip_igzip_inflate_test_LDFLAGS = -lz igzip_check: igzip_inflate_ref.o igzip_igzip_check_LDADD = igzip/igzip_inflate_ref.lo libisal.la +igzip_hist_perf: igzip_inflate_ref.o +igzip_igzip_hist_perf_LDADD = igzip/igzip_inflate_ref.lo libisal.la diff --git a/igzip/huff_codes.c b/igzip/huff_codes.c index d69c99d..c0820cd 100644 --- a/igzip/huff_codes.c +++ b/igzip/huff_codes.c @@ -142,8 +142,8 @@ void append_to_back(struct linked_list *list, struct linked_list_node *new_eleme return; } -void isal_update_histogram(uint8_t * start_stream, int length, - struct isal_huff_histogram *histogram) +void isal_update_histogram_base(uint8_t * start_stream, int length, + struct isal_huff_histogram *histogram) { uint32_t literal = 0, hash; uint8_t *last_seen[HASH_SIZE]; diff --git a/igzip/igzip_hist_perf.c b/igzip/igzip_hist_perf.c new file mode 100644 index 0000000..03042ba --- /dev/null +++ b/igzip/igzip_hist_perf.c @@ -0,0 +1,348 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include "igzip_lib.h"
+#include "test.h"
+#include "igzip_inflate_ref.h"
+
+#define BUF_SIZE 1024
+#define MIN_TEST_LOOPS 8
+#ifndef RUN_MEM_SIZE
+# define RUN_MEM_SIZE 2000000000
+#endif
+
+/* Decodes the deflate stream described by state without writing any output and
+ * counts each lit/len and dist symbol met in huffman (non-type 0) blocks into
+ * histogram (zeroed first). Returns DECOMPRESSION_FINISHED or an error code. */
+int igzip_inflate_hist(struct inflate_state *state, struct isal_huff_histogram *histogram)
+{
+	/* The following tables are based on the tables in the deflate standard,
+	 * RFC 1951 page 11.
+	 */
+	const uint16_t len_start[29] = {
+		0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
+		0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x17, 0x1b, 0x1f,
+		0x23, 0x2b, 0x33, 0x3b, 0x43, 0x53, 0x63, 0x73,
+		0x83, 0xa3, 0xc3, 0xe3, 0x102
+	};
+	const uint8_t len_extra_bit_count[29] = {
+		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2,
+		0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4,
+		0x5, 0x5, 0x5, 0x5, 0x0
+	};
+	const uint32_t dist_start[30] = {
+		0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d,
+		0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1,
+		0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01,
+		0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001
+	};
+	const uint8_t dist_extra_bit_count[30] = {
+		0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x2, 0x2,
+		0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6,
+		0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa,
+		0xb, 0xb, 0xc, 0xc, 0xd, 0xd
+	};
+
+	uint16_t next_lit, len, nlen;
+	uint8_t next_dist;
+	uint32_t repeat_length;
+	uint32_t look_back_dist;
+	uint32_t tmp;
+
+	memset(histogram, 0, sizeof(struct isal_huff_histogram));
+	while (state->new_block == 0 || state->bfinal == 0) {
+		if (state->new_block != 0) {
+			tmp = read_header(state);
+
+			if (tmp)
+				return tmp;
+		}
+
+		if (state->btype == 0) {
+			/* If the block is uncompressed, update state data accordingly */
+			if (state->in_buffer.avail_in < 4)
+				return END_OF_INPUT;
+
+			len = *(uint16_t *) state->in_buffer.next_in;
+			state->in_buffer.next_in += 2;
+			nlen = *(uint16_t *) state->in_buffer.next_in;
+			state->in_buffer.next_in += 2;
+
+			/* Check if len and nlen match */
+			if (len != (~nlen & 0xffff))
+				return INVALID_NON_COMPRESSED_BLOCK_LENGTH;
+
+			if (state->in_buffer.avail_in < len)
+				len = state->in_buffer.avail_in;
+			else
+				state->new_block = 1;
+
+			state->out_buffer.total_out += len;
+			state->in_buffer.next_in += len;
+			state->in_buffer.avail_in -= len + 4;
+
+			if (state->in_buffer.avail_in == 0 && state->new_block == 0)
+				return END_OF_INPUT;
+
+		} else {
+			/* Else decode a huffman encoded block */
+			while (state->new_block == 0) {
+				/* While not at the end of block, decode the next symbol */
+				next_lit = decode_next(&state->in_buffer, &state->lit_huff_code);
+
+				histogram->lit_len_histogram[next_lit] += 1;	/* NOTE(review): counted before next_lit is range-checked below */
+
+				if (state->in_buffer.read_in_length < 0)
+					return END_OF_INPUT;
+
+				if (next_lit < 256)
+					/* Next symbol is a literal */
+					state->out_buffer.total_out++;
+
+				else if (next_lit == 256)
+					/* Next symbol is end of block */
+					state->new_block = 1;
+
+				else if (next_lit < 286) {
+					/* Next symbol is a repeat length followed by a
+					   lookback distance */
+					repeat_length =
+					    len_start[next_lit - 257] +
+					    inflate_in_read_bits(&state->in_buffer,
+								 len_extra_bit_count[next_lit - 257]);
+
+					next_dist = decode_next(&state->in_buffer,
+								&state->dist_huff_code);
+
+					histogram->dist_histogram[next_dist] += 1;	/* NOTE(review): next_dist is not range-checked before indexing - confirm decode_next() bounds it */
+
+					look_back_dist = dist_start[next_dist] +
+					    inflate_in_read_bits(&state->in_buffer,
+								 dist_extra_bit_count[next_dist]);
+
+					if (state->in_buffer.read_in_length < 0)
+						return END_OF_INPUT;
+
+					if (look_back_dist > state->out_buffer.total_out)
+						return INVALID_LOOK_BACK_DISTANCE;
+
+					state->out_buffer.total_out += repeat_length;
+
+				} else
+					return INVALID_SYMBOL;
+			}
+		}
+	}
+	state->in_buffer.next_in -= state->in_buffer.read_in_length / 8;
+	state->in_buffer.avail_in += state->in_buffer.read_in_length / 8;
+
+	return DECOMPRESSION_FINISHED;
+}
+
+/* Returns the size of f in bytes as reported by ftell(); the file position is restored. */
+int get_filesize(FILE * f)
+{
+	int curr, end;
+
+	curr = ftell(f);	/* Save current position */
+	fseek(f, 0L, SEEK_END);
+	end = ftell(f);
+	fseek(f, curr, SEEK_SET);	/* Restore position */
+	return end;
+}
+
+/* Prints every lit/len counter and then every dist counter, 16 values per row. */
+void print_histogram(struct isal_huff_histogram *histogram)
+{
+	int i;
+	printf("Lit Len histogram");
+	for (i = 0; i < IGZIP_LIT_LEN; i++) {
+		if (i % 16 == 0)
+			printf("\n");
+		else
+			printf(", ");
+		printf("%4llu", (unsigned long long)histogram->lit_len_histogram[i]);
+	}
+	printf("\n");
+
+	printf("Dist histogram");
+	for (i = 0; i < IGZIP_DIST_LEN; i++) {
+		if (i % 16 == 0)
+			printf("\n");
+		else
+			printf(", ");
+		printf("%4llu", (unsigned long long)histogram->dist_histogram[i]);
+	}
+	printf("\n");
+}
+
+/* Prints, per symbol, |h1 - h2| as a percentage of h1; the counters are 64-bit, so llabs() is used rather than abs(). */
+void print_diff_histogram(struct isal_huff_histogram *histogram1,
+			  struct isal_huff_histogram *histogram2)
+{
+	int i;
+	double relative_error;
+	printf("Lit Len histogram relative error");
+	for (i = 0; i < IGZIP_LIT_LEN; i++) {
+		if (i % 16 == 0)
+			printf("\n");
+		else
+			printf(", ");
+
+		if (histogram1->lit_len_histogram[i] == histogram2->lit_len_histogram[i]) {
+			printf(" % 4.0f %%", 0.0);
+		} else {
+			relative_error =
+			    llabs((long long)(histogram1->lit_len_histogram[i] -
+					      histogram2->lit_len_histogram[i]));
+			relative_error = 100.0 * (relative_error / histogram1->lit_len_histogram[i]);
+			printf("~% 4.0f %%", relative_error);
+		}
+	}
+	printf("\n");
+
+	printf("Dist histogram relative error");
+	for (i = 0; i < IGZIP_DIST_LEN; i++) {
+		if (i % 16 == 0)
+			printf("\n");
+		else
+			printf(", ");
+
+		if (histogram1->dist_histogram[i] == histogram2->dist_histogram[i]) {
+			printf(" % 4.0f %%", 0.0);
+		} else {
+			relative_error = llabs((long long)(histogram1->dist_histogram[i] -
+							   histogram2->dist_histogram[i]));
+			relative_error = 100.0 * (relative_error / histogram1->dist_histogram[i]);
+			printf("~% 4.0f %%", relative_error);
+		}
+
+	}
+	printf("\n");
+}
+
+/* Times isal_update_histogram() on argv[1], then prints how far that histogram is from the symbols emitted when deflating with tables built from it. */
+int main(int argc, char *argv[])
+{
+	FILE *in;
+	unsigned char *inbuf, *outbuf;
+	int i, infile_size, outbuf_size, iterations, avail_in;
+	struct isal_huff_histogram histogram1, histogram2;
+	struct isal_hufftables hufftables_custom;
+	struct isal_zstream stream;
+	struct inflate_state gstream;
+
+	memset(&histogram1, 0, sizeof(histogram1));
+	memset(&histogram2, 0, sizeof(histogram2));
+
+	if (argc > 3 || argc < 2) {
+		fprintf(stderr, "Usage: igzip_hist_perf infile\n"
+			"\t - Runs multiple iterations of isal_update_histogram on a file to "
+			"get more accurate time results.\n");
+		exit(0);
+	}
+	in = fopen(argv[1], "rb");
+	if (!in) {
+		fprintf(stderr, "Can't open %s for reading\n", argv[1]);
+		exit(0);
+	}
+
+	/* Allocate space for entire input file and output
+	 * (assuming some possible expansion on output size)
+	 */
+	infile_size = get_filesize(in);
+	outbuf_size = 2 * infile_size;
+
+	if (infile_size != 0)
+		iterations = RUN_MEM_SIZE / infile_size;
+	else
+		iterations = MIN_TEST_LOOPS;
+
+	if (iterations < MIN_TEST_LOOPS)
+		iterations = MIN_TEST_LOOPS;
+
+	inbuf = malloc(infile_size);
+	outbuf = malloc(outbuf_size);
+	if (inbuf == NULL) {
+		fprintf(stderr, "Can't allocate input buffer memory\n");
+		exit(0);
+	}
+
+	if (outbuf == NULL) {
+		fprintf(stderr, "Can't allocate output buffer memory\n");
+		exit(0);
+	}
+
+	avail_in = fread(inbuf, 1, infile_size, in);
+	if (avail_in != infile_size) {
+		fprintf(stderr, "Couldn't fit all of input file into buffer\n");
+		exit(0);
+	}
+
+	struct perf start, stop;
+	perf_start(&start);
+
+	for (i = 0; i < iterations; i++)
+		isal_update_histogram(inbuf, infile_size, &histogram1);
+	perf_stop(&stop);
+
+	printf(" file %s - in_size=%d iter=%d\n", argv[1], infile_size, i);
+	printf("igzip_file: ");
+	perf_print(stop, start, (long long)infile_size * i);
+
+	memset(&histogram1, 0, sizeof(histogram1));	/* drop the counts accumulated by the timing loop */
+
+	isal_update_histogram(inbuf, infile_size, &histogram1);
+
+	isal_create_hufftables(&hufftables_custom, &histogram1);
+
+	isal_deflate_init(&stream);
+	stream.end_of_stream = 1;	/* Do the entire file at once */
+	stream.flush = NO_FLUSH;
+	stream.next_in = inbuf;
+	stream.avail_in = infile_size;
+	stream.next_out = outbuf;
+	stream.avail_out = outbuf_size;
+	stream.hufftables = &hufftables_custom;
+	isal_deflate_stateless(&stream);
+
+	igzip_inflate_init(&gstream, outbuf, stream.total_out, NULL, 0);
+	igzip_inflate_hist(&gstream, &histogram2);	/* histogram of the symbols actually emitted */
+
+	printf("Histogram Error \n");
+	print_diff_histogram(&histogram1, &histogram2);
+
+	fclose(in);
+	fflush(0);
+	return 0;
+}
diff --git a/igzip/igzip_multibinary.asm b/igzip/igzip_multibinary.asm
index 9fb2741..32bcb8e 100644
--- 
a/igzip/igzip_multibinary.asm +++ b/igzip/igzip_multibinary.asm @@ -51,6 +51,10 @@ extern isal_deflate_finish_01 extern get_crc_base extern get_crc_01 +extern isal_update_histogram_base +extern isal_update_histogram_01 +extern isal_update_histogram_04 + extern isal_deflate_init_base extern isal_deflate_init_01 @@ -71,3 +75,6 @@ mbin_dispatch_init5 isal_deflate_finish, isal_deflate_finish_base, isal_deflate_ mbin_interface get_crc mbin_dispatch_init5 get_crc, get_crc_base, get_crc_01, get_crc_01, get_crc_01 + +mbin_interface isal_update_histogram +mbin_dispatch_init5 isal_update_histogram, isal_update_histogram_base, isal_update_histogram_01, isal_update_histogram_01, isal_update_histogram_04 diff --git a/igzip/igzip_update_histogram.asm b/igzip/igzip_update_histogram.asm new file mode 100644 index 0000000..2067a21 --- /dev/null +++ b/igzip/igzip_update_histogram.asm @@ -0,0 +1,467 @@ + +%include "options.asm" + +%include "lz0a_const.asm" +%include "data_struct2.asm" +%include "bitbuf2.asm" +%include "huffman.asm" +%include "igzip_compare_types.asm" +%include "reg_sizes.asm" + +%include "stdmac.asm" + +extern rfc1951_lookup_table +_len_to_code_offset equ 0 + +%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds +%define LA_STATELESS 264 ; Max number of bytes read in loop2 rounded up to 8 byte boundary +%define LIT_LEN 286 +%define DIST_LEN 30 +%define HIST_ELEM_SIZE 8 + +%ifdef DEBUG +%macro MARK 1 +global %1 +%1: +%endm +%else +%macro MARK 1 +%endm +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define file_start rdi +%define file_length rsi +%define histogram rdx +%define rfc_lookup r9 +%define f_i r10 + +%define curr_data rax + +%define tmp2 rcx + +%define dist rbx +%define dist_code2 rbx + +%define dist2 r12 +%define dist_code r12 + +%define len rbp +%define 
len_code rbp +%define hash3 rbp + +%define curr_data2 r8 +%define len2 r8 + +%define tmp1 r11 + +%define tmp3 r13 + +%define hash r14 + +%define hash2 r15 + +%define xtmp0 xmm0 +%define xtmp1 xmm1 + +%define ytmp0 ymm0 +%define ytmp1 ymm1 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +_eob_count_offset equ 0 ; local variable (8 bytes) +f_end_i_mem_offset equ 8 +gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes) +xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned) +stack_size equ 2*8 + 8*8 + 4*16 + 8 +;;; 8 because stack address is odd multiple of 8 after a function call and +;;; we want it aligned to 16 bytes +_lit_len_offset equ 0 +_dist_offset equ (8 * LIT_LEN) +_hash_offset equ (_dist_offset + 8 * DIST_LEN) + +%macro len_to_len_code 3 +%define %%len_code %1 ; Output +%define %%len %2 ; Input +%define %%rfc_lookup %3 + movzx %%len_code, byte [%%rfc_lookup + _len_to_code_offset + %%len] + or %%len_code, 0x100 +%endm + +;;; Clobbers rcx and dist +%macro dist_to_dist_code 2 +%define %%dist_code %1 ; Output code associated with dist +%define %%dist_coded %1d +%define %%dist %2d ; Input dist + dec %%dist + mov %%dist_coded, %%dist + bsr ecx, %%dist_coded + dec ecx + SHRX %%dist_code, %%dist_code, rcx + lea %%dist_coded, [%%dist_coded + 2*ecx] + + cmp %%dist, 1 + cmovle %%dist_coded, %%dist +%endm + +;;; Clobbers rcx and dist +%macro dist_to_dist_code2 2 +%define %%dist_code %1 ; Output code associated with dist +%define %%dist_coded %1d +%define %%dist %2d ; Input -(dist - 1) + neg %%dist + mov %%dist_coded, %%dist + bsr ecx, %%dist_coded + dec ecx + SHRX %%dist_code, %%dist_code, rcx + lea %%dist_coded, [%%dist_coded + 2*ecx] + + cmp %%dist, 1 + cmovle %%dist_coded, %%dist +%endm + +; void isal_update_histogram +global isal_update_histogram_ %+ ARCH 
+isal_update_histogram_ %+ ARCH %+ : + + ;; do nothing if (avail_in == 0) + cmp file_length, 0 + jne skip1 + ret +skip1: + +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, stack_size + and rsp, ~15 +%else + sub rsp, stack_size +%endif + + mov [rsp + gpr_save_mem_offset + 0*8], rbx + mov [rsp + gpr_save_mem_offset + 1*8], rsi + mov [rsp + gpr_save_mem_offset + 2*8], rdi + mov [rsp + gpr_save_mem_offset + 3*8], rbp + mov [rsp + gpr_save_mem_offset + 4*8], r12 + mov [rsp + gpr_save_mem_offset + 5*8], r13 + mov [rsp + gpr_save_mem_offset + 6*8], r14 + mov [rsp + gpr_save_mem_offset + 7*8], r15 + mov f_i, 0 + + mov tmp1, qword [histogram + _lit_len_offset + 8*256] + inc tmp1 + mov [rsp + _eob_count_offset], tmp1 + + lea rfc_lookup, [rfc1951_lookup_table] + + ;; Init hash_table + mov rcx, (HASH_SIZE-1) +init_hash_table: + mov word [histogram + _hash_offset + 2*rcx], -(D+1) + sub rcx, 1 + jge init_hash_table + + sub file_length, LA_STATELESS + cmp file_length, 0 + jle end_loop_2 + + + ;; Load first literal into histogram + mov curr_data, [file_start + f_i] + compute_hash hash, curr_data + and hash %+ d, HASH_MASK + mov [histogram + _hash_offset + 2 * hash], f_i %+ w + and curr_data, 0xff + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] + inc f_i + + ;; Setup to begin loop 2 + mov curr_data, [file_start + f_i] + mov curr_data2, curr_data + compute_hash hash, curr_data + shr curr_data2, 8 + compute_hash hash2, curr_data2 + + and hash2 %+ d, HASH_MASK + and hash, HASH_MASK +loop2: + xor dist, dist + xor dist2, dist2 + xor tmp3, tmp3 + + lea tmp1, [file_start + f_i] + + ;; Load possible look back distances and update hash data + mov dist %+ w, f_i %+ w + sub dist %+ w, word [histogram + _hash_offset + 2 * hash] + mov [histogram + _hash_offset + 2 * hash], f_i %+ w + + add f_i, 1 + + mov dist2 %+ w, f_i %+ w + sub dist2 %+ w, word [histogram + _hash_offset + 2 * hash2] + mov [histogram + _hash_offset + 2 * hash2], f_i %+ w + + ;; Start computing 
hashes to be used in either the next loop or + ;; for updating the hash if a match is found + mov curr_data2, [file_start + f_i + 1] + mov tmp2, curr_data2 + compute_hash hash, curr_data2 + + ;; Check if look back distances are valid. Load a junk distance of 1 + ;; if the look back distance is too long for speculative lookups. + sub dist, 1 + cmp dist %+ d, (D-1) + cmovae dist, tmp3 + neg dist + + sub dist2, 1 + cmp dist2 %+ d, (D-1) + cmovae dist2, tmp3 + neg dist2 + + shr tmp2, 8 + compute_hash hash2, tmp2 + + ;; Check for long len/dist matches (>7) + mov len, [tmp1] + xor len, [tmp1 + dist - 1] + jz compare_loop + + and hash %+ d, HASH_MASK + and hash2 %+ d, HASH_MASK + + mov len2, [tmp1 + 1] + xor len2, [tmp1 + dist2] + jz compare_loop2 + + ;; Speculatively load the code for the first literal + movzx tmp1, curr_data %+ b + shr curr_data, 8 + + lea tmp3, [f_i + 1] + + ;; Check for len/dist match for first literal + test len %+ d, 0xFFFFFFFF + jz len_dist_huffman_pre + + ;; Store first literal + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * tmp1] + + ;; Speculatively load the code for the second literal + and curr_data, 0xff + + ;; Check for len/dist match for second literal + test len2 %+ d, 0xFFFFFFFF + jnz lit_lit_huffman +len_dist_lit_huffman_pre: + ;; Calculate repeat length + tzcnt len2, len2 + shr len2, 3 + +len_dist_lit_huffman: + ;; Store updated hashes + mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w + add tmp3,1 + mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w + + add f_i, len2 + + mov curr_data, [file_start + f_i] + mov tmp1, curr_data + compute_hash hash, curr_data + + dist_to_dist_code2 dist_code2, dist2 + + len_to_len_code len_code, len2, rfc_lookup + + shr tmp1, 8 + compute_hash hash2, tmp1 + + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code] + inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code2] + + and hash2 %+ d, HASH_MASK + and hash, HASH_MASK + + cmp f_i, file_length + jl loop2 + jmp 
end_loop_2 + ;; encode as dist/len + +len_dist_huffman_pre: + tzcnt len, len + shr len, 3 + +len_dist_huffman: + mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w + + dec f_i + add f_i, len + + mov curr_data, [file_start + f_i] + mov tmp1, curr_data + compute_hash hash, curr_data + + dist_to_dist_code2 dist_code, dist + + len_to_len_code len_code, len, rfc_lookup + + shr tmp1, 8 + compute_hash hash2, tmp1 + + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code] + inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code] + + and hash2 %+ d, HASH_MASK + and hash, HASH_MASK + + cmp f_i, file_length + jl loop2 + jmp end_loop_2 + +lit_lit_huffman: + add f_i, 1 + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] + + mov curr_data %+ d, [file_start + f_i] + + cmp f_i, file_length + jl loop2 + +end_loop_2: + add file_length, LA_STATELESS - LAST_BYTES_COUNT + cmp f_i, file_length + jge final_bytes + +loop2_finish: + mov curr_data, [file_start + f_i] + compute_hash hash, curr_data + and hash %+ d, HASH_MASK + + ;; Calculate possible distance for length/dist pair. 
+ xor dist, dist + mov dist %+ w, f_i %+ w + sub dist %+ w, word [histogram + _hash_offset + 2 * hash] + mov [histogram + _hash_offset + 2 * hash], f_i %+ w + + ;; Check if look back distance is valid (the dec is to handle when dist = 0) + dec dist + cmp dist %+ d, (D-1) + jae encode_literal_finish + inc dist + + ;; Check if look back distance is a match + lea tmp3, [file_length + LAST_BYTES_COUNT] + sub tmp3, f_i + lea tmp1, [file_start + f_i] + mov tmp2, tmp1 + sub tmp2, dist + compare tmp3, tmp1, tmp2, len, tmp3 + + ;; Limit len to maximum value of 258 + mov tmp2, 258 + cmp len, 258 + cmova len, tmp2 + cmp len, SHORTEST_MATCH + jb encode_literal_finish + + add f_i, len + + len_to_len_code len_code, len, rfc_lookup + dist_to_dist_code dist_code, dist + + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code] + inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code] + + cmp f_i, file_length + jl loop2_finish + jmp final_bytes + +encode_literal_finish: + ;; Encode literal + and curr_data %+ d, 0xFF + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] + + ;; Setup for next loop + add f_i, 1 + cmp f_i, file_length + jl loop2_finish + +final_bytes: + add file_length, LAST_BYTES_COUNT +final_bytes_loop: + cmp f_i, file_length + jge end + movzx curr_data, byte [file_start + f_i] + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] + inc f_i + jmp final_bytes_loop + +end: + ;; Handle eob at end of stream + mov tmp1, [rsp + _eob_count_offset] + mov qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256], tmp1 + + mov rbx, [rsp + gpr_save_mem_offset + 0*8] + mov rsi, [rsp + gpr_save_mem_offset + 1*8] + mov rdi, [rsp + gpr_save_mem_offset + 2*8] + mov rbp, [rsp + gpr_save_mem_offset + 3*8] + mov r12, [rsp + gpr_save_mem_offset + 4*8] + mov r13, [rsp + gpr_save_mem_offset + 5*8] + mov r14, [rsp + gpr_save_mem_offset + 6*8] + mov r15, [rsp + gpr_save_mem_offset + 7*8] + +%ifndef ALIGN_STACK + add rsp, stack_size 
+%else + mov rsp, rbp + pop rbp +%endif + ret + +compare_loop: + and hash %+ d, HASH_MASK + lea tmp2, [tmp1 + dist - 1] +%if (COMPARE_TYPE == 1) + compare250 tmp1, tmp2, len, tmp3 +%elif (COMPARE_TYPE == 2) + compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1 +%elif (COMPARE_TYPE == 3) + compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1 +%else + %error Unknown Compare type COMPARE_TYPE + % error +%endif + lea tmp3, [f_i + 1] + jmp len_dist_huffman + +compare_loop2: + add tmp1, 1 + lea tmp2, [tmp1 + dist2 - 1] + +%if (COMPARE_TYPE == 1) + compare250 tmp1, tmp2, len2, tmp3 +%elif (COMPARE_TYPE == 2) + compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1 +%elif (COMPARE_TYPE == 3) + compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1 +%else +%error Unknown Compare type COMPARE_TYPE + % error +%endif + and curr_data, 0xff + inc qword [histogram + _lit_len_offset + 8 * curr_data] + lea tmp3, [f_i + 1] + jmp len_dist_lit_huffman + +section .data + align 4 +const_D: dq D +const_30: dq 30 diff --git a/igzip/igzip_update_histogram_01.asm b/igzip/igzip_update_histogram_01.asm new file mode 100644 index 0000000..0705a07 --- /dev/null +++ b/igzip/igzip_update_histogram_01.asm @@ -0,0 +1,7 @@ +%define ARCH 01 + +%ifndef COMPARE_TYPE +%define COMPARE_TYPE 2 +%endif + +%include "igzip_update_histogram.asm" diff --git a/igzip/igzip_update_histogram_04.asm b/igzip/igzip_update_histogram_04.asm new file mode 100644 index 0000000..18945b2 --- /dev/null +++ b/igzip/igzip_update_histogram_04.asm @@ -0,0 +1,8 @@ +%define ARCH 04 +%define USE_HSWNI + +%ifndef COMPARE_TYPE +%define COMPARE_TYPE 3 +%endif + +%include "igzip_update_histogram.asm" diff --git a/igzip/rfc1951_lookup.asm b/igzip/rfc1951_lookup.asm new file mode 100644 index 0000000..cea3c5b --- /dev/null +++ b/igzip/rfc1951_lookup.asm @@ -0,0 +1,44 @@ +%ifndef RFC1951_LOOKUP +%define RFC1951_LOOKUP + +section .data + + align 8 + +global rfc1951_lookup_table:data internal +rfc1951_lookup_table: +len_to_code: + db 0x00, 0x00, 0x00 + db 
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 + db 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c + db 0x0d, 0x0d, 0x0d, 0x0d, 0x0e, 0x0e, 0x0e, 0x0e + db 0x0f, 0x0f, 0x0f, 0x0f, 0x10, 0x10, 0x10, 0x10 + db 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11 + db 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12 + db 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13 + db 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14 + db 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15 + db 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15 + db 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16 + db 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16 + db 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17 + db 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17 + db 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18 + db 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18 + db 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19 + db 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19 + db 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19 + db 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19 + db 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a + db 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a + db 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a + db 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a + db 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b + db 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b + db 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b + db 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b + db 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c + db 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c + db 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c + db 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1d +%endif diff --git a/include/igzip_lib.h b/include/igzip_lib.h index 79d86f5..fe3ff3a 100644 --- a/include/igzip_lib.h +++ b/include/igzip_lib.h @@ -190,6 +190,7 @@ enum isal_zstate_state { struct isal_huff_histogram { uint64_t lit_len_histogram[IGZIP_LIT_LEN]; uint64_t dist_histogram[IGZIP_DIST_LEN]; + uint16_t hash_table[HASH_SIZE]; }; 
/** @brief Holds Bit Buffer information*/