;; isa-l/igzip/igzip_update_histogram.asm
;; NASM source (original listing: 563 lines, 13 KiB)

%include "options.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "bitbuf2.asm"
%include "huffman.asm"
%include "igzip_compare_types.asm"
%include "reg_sizes.asm"
%include "stdmac.asm"
;; Lookup table defined in another module. Its address is loaded into
;; rfc_lookup and indexed by the len_to_len_code macro.
extern rfc1951_lookup_table
;; Byte offset, inside rfc1951_lookup_table, of the length -> code bytes.
_len_to_code_offset equ 0
%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
;; Entry counts used to lay out the histogram area: LIT_LEN counters come
;; first, then DIST_LEN counters (see _dist_offset and _hash_offset).
%define LIT_LEN 286
%define DIST_LEN 30
;; Size in bytes of one histogram counter; counters are bumped with qword
;; increments scaled by this value.
%define HIST_ELEM_SIZE 8
;; MARK <label>
;; Debugging aid. In DEBUG builds it defines <label> at the point of use and
;; exports it as a global symbol; in all other builds it expands to nothing.
%ifndef DEBUG
%macro MARK 1
%endmacro
%else
%macro MARK 1
global %1
%1:
%endmacro
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Register roles. Several names share one physical register; the aliases
;; are only valid in code where the other role is no longer live.
;;
;; Working copies of the three arguments. They equal the elf64 argument
;; registers; on win64 the entry code moves the arguments into them.
%define file_start rdi
%define file_length rsi
%define histogram rdx
;; Base address of rfc1951_lookup_table.
%define rfc_lookup r9
;; Current index into the input.
%define f_i r10
%define curr_data rax
;; tmp2 is rcx, which the dist_to_dist_code macros also clobber.
%define tmp2 rcx
;; rbx: candidate distance for the first position / distance code for the
;; second position.
%define dist rbx
%define dist_code2 rbx
;; r12: candidate distance for the second position / distance code for the
;; first position.
%define dist2 r12
%define dist_code r12
;; rbp: match length, its length code, and a third hash value.
%define len rbp
%define len_code rbp
%define hash3 rbp
;; r8: second data word, second match length, and a scratch value.
%define curr_data2 r8
%define len2 r8
%define tmp4 r8
%define tmp1 r11
%define tmp3 r13
%define hash r14
%define hash2 r15
;; Vector registers. Only xmm0-xmm2 / ymm0-ymm1 are named here.
%define xtmp0 xmm0
%define xtmp1 xmm1
%define xdata xmm2
%define ytmp0 ymm0
%define ytmp1 ymm1
;; vtmp0/vtmp1 and V_LENGTH (vector width in bytes) follow ARCH:
;; 16-byte xmm registers when ARCH == 01, 32-byte ymm registers otherwise.
%if(ARCH == 01)
%define vtmp0 xtmp0
%define vtmp1 xtmp1
%define V_LENGTH 16
%else
%define vtmp0 ytmp0
%define vtmp1 ytmp1
%define V_LENGTH 32
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Stack frame layout, as offsets from rsp after FUNC_SAVE.
;; _eob_count_offset holds the end-of-block count written back on exit.
_eob_count_offset equ 0 ; local variable (8 bytes)
;; NOTE(review): f_end_i_mem_offset is not referenced in the code visible
;; in this file.
f_end_i_mem_offset equ 8
gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
;; NOTE(review): the FUNC_SAVE/FUNC_RESTORE macros in this file never store
;; to the xmm save area; the space is reserved only.
xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
;; Two 8-byte locals + gpr save area + xmm save area + 8 bytes of padding.
stack_size equ 2*8 + 8*8 + 4*16 + 8
;;; 8 because stack address is odd multiple of 8 after a function call and
;;; we want it aligned to 16 bytes
;; elf64 build: argument registers plus the prologue/epilogue macros.
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
;; FUNC_SAVE: reserve stack_size bytes and store rbx, rbp and r12-r15 in
;; the gpr save area.
%macro FUNC_SAVE 0
%ifdef ALIGN_STACK
;; Keep the incoming rsp in rbp so that rsp can be forced to a 16-byte
;; boundary. The rbp value stored below is therefore this frame pointer;
;; the caller's rbp is the one pushed here.
push rbp
mov rbp, rsp
sub rsp, stack_size
and rsp, ~15
%else
sub rsp, stack_size
%endif
mov [rsp + gpr_save_mem_offset + 0*8], rbx
mov [rsp + gpr_save_mem_offset + 1*8], rbp
mov [rsp + gpr_save_mem_offset + 2*8], r12
mov [rsp + gpr_save_mem_offset + 3*8], r13
mov [rsp + gpr_save_mem_offset + 4*8], r14
mov [rsp + gpr_save_mem_offset + 5*8], r15
%endm
;; FUNC_RESTORE: reload the registers stored by FUNC_SAVE and release the
;; frame. It must run with rsp unchanged since FUNC_SAVE.
%macro FUNC_RESTORE 0
mov rbx, [rsp + gpr_save_mem_offset + 0*8]
mov rbp, [rsp + gpr_save_mem_offset + 1*8]
mov r12, [rsp + gpr_save_mem_offset + 2*8]
mov r13, [rsp + gpr_save_mem_offset + 3*8]
mov r14, [rsp + gpr_save_mem_offset + 4*8]
mov r15, [rsp + gpr_save_mem_offset + 5*8]
%ifndef ALIGN_STACK
add rsp, stack_size
%else
;; rbp was just reloaded with the frame pointer saved by FUNC_SAVE, so this
;; undoes the alignment and then pops the caller's rbp.
mov rsp, rbp
pop rbp
%endif
%endm
%endif
;; win64 build: argument registers plus the prologue/epilogue macros.
;; The arguments arrive in rcx, rdx and r8, which differ from the
;; file_start/file_length/histogram working registers.
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
;; FUNC_SAVE: reserve stack_size bytes and store rbx, rsi, rdi, rbp and
;; r12-r15 in the gpr save area. rsi and rdi are included because this
;; routine overwrites them with file_length and file_start.
%macro FUNC_SAVE 0
%ifdef ALIGN_STACK
;; Keep the incoming rsp in rbp so that rsp can be forced to a 16-byte
;; boundary. The rbp value stored below is therefore this frame pointer;
;; the caller's rbp is the one pushed here.
push rbp
mov rbp, rsp
sub rsp, stack_size
and rsp, ~15
%else
sub rsp, stack_size
%endif
mov [rsp + gpr_save_mem_offset + 0*8], rbx
mov [rsp + gpr_save_mem_offset + 1*8], rsi
mov [rsp + gpr_save_mem_offset + 2*8], rdi
mov [rsp + gpr_save_mem_offset + 3*8], rbp
mov [rsp + gpr_save_mem_offset + 4*8], r12
mov [rsp + gpr_save_mem_offset + 5*8], r13
mov [rsp + gpr_save_mem_offset + 6*8], r14
mov [rsp + gpr_save_mem_offset + 7*8], r15
%endm
;; FUNC_RESTORE: reload the registers stored by FUNC_SAVE and release the
;; frame. It must run with rsp unchanged since FUNC_SAVE.
%macro FUNC_RESTORE 0
mov rbx, [rsp + gpr_save_mem_offset + 0*8]
mov rsi, [rsp + gpr_save_mem_offset + 1*8]
mov rdi, [rsp + gpr_save_mem_offset + 2*8]
mov rbp, [rsp + gpr_save_mem_offset + 3*8]
mov r12, [rsp + gpr_save_mem_offset + 4*8]
mov r13, [rsp + gpr_save_mem_offset + 5*8]
mov r14, [rsp + gpr_save_mem_offset + 6*8]
mov r15, [rsp + gpr_save_mem_offset + 7*8]
%ifndef ALIGN_STACK
add rsp, stack_size
%else
;; rbp was just reloaded with the frame pointer saved by FUNC_SAVE, so this
;; undoes the alignment and then pops the caller's rbp.
mov rsp, rbp
pop rbp
%endif
%endm
%endif
;; Byte offsets from the histogram base pointer:
;;   literal/length counters : LIT_LEN entries of HIST_ELEM_SIZE bytes
;;   distance counters       : DIST_LEN entries of HIST_ELEM_SIZE bytes
;;   hash table              : starts right after the distance counters
_lit_len_offset equ 0
_dist_offset equ (HIST_ELEM_SIZE * LIT_LEN)
_hash_offset equ (_dist_offset + HIST_ELEM_SIZE * DIST_LEN)
;;; len_to_len_code <len_code>, <len>, <rfc_lookup>
;;; Reads the byte at rfc_lookup + _len_to_code_offset + len, zero-extends
;;; it and sets bit 8 (0x100), giving an index into the literal/length
;;; counters. The output register may be the same register as len.
%macro len_to_len_code 3
%define %%len_code %1 ; Output
%define %%len %2 ; Input
%define %%rfc_lookup %3
movzx %%len_code, byte [%%rfc_lookup + _len_to_code_offset + %%len]
or %%len_code, 0x100
%endm
;;; dist_to_dist_code <dist_code>, <dist>
;;; Converts a match distance into an index into the distance counters.
;;;   %1 (out): receives the code
;;;   %2 (in):  match distance; only the 32-bit form of the register is used
;;; With d = dist - 1 and n = index of the highest set bit of d, the result
;;; is (d >> (n - 1)) + 2 * (n - 1), or d itself when d <= 1.
;;; The "d"-suffixed names (%1d, %2d) rely on register-size aliases defined
;;; outside this file (presumably reg_sizes.asm).
;;; Clobbers rcx and dist
%macro dist_to_dist_code 2
%define %%dist_code %1 ; Output code associated with dist
%define %%dist_coded %1d
%define %%dist %2d ; Input dist
dec %%dist
mov %%dist_coded, %%dist
;; ecx = n. For d == 0 bsr leaves ecx undefined; the cmovle below discards
;; the value computed from it in that case.
bsr ecx, %%dist_coded
dec ecx
;; SHRX is a macro defined outside this file; presumably dist_code >>= rcx.
SHRX %%dist_code, %%dist_code, rcx
lea %%dist_coded, [%%dist_coded + 2*ecx]
;; Signed compare: d <= 1 maps directly to code d.
cmp %%dist, 1
cmovle %%dist_coded, %%dist
%endm
;;; dist_to_dist_code2 <dist_code>, <dist>
;;; Same conversion as dist_to_dist_code, but the input register already
;;; holds -(dist - 1), so it is negated instead of decremented.
;;;   %1 (out): receives the code
;;;   %2 (in):  -(dist - 1); only the 32-bit form of the register is used
;;; With d = dist - 1 and n = index of the highest set bit of d, the result
;;; is (d >> (n - 1)) + 2 * (n - 1), or d itself when d <= 1.
;;; Clobbers rcx and dist
%macro dist_to_dist_code2 2
%define %%dist_code %1 ; Output code associated with dist
%define %%dist_coded %1d
%define %%dist %2d ; Input -(dist - 1)
neg %%dist
mov %%dist_coded, %%dist
;; ecx = n. For d == 0 bsr leaves ecx undefined; the cmovle below discards
;; the value computed from it in that case.
bsr ecx, %%dist_coded
dec ecx
;; SHRX is a macro defined outside this file; presumably dist_code >>= rcx.
SHRX %%dist_code, %%dist_code, rcx
lea %%dist_coded, [%%dist_coded + 2*ecx]
;; Signed compare: d <= 1 maps directly to code d.
cmp %%dist, 1
cmovle %%dist_coded, %%dist
%endm
; void isal_update_histogram
;
; Arguments (arg0..arg2 are the elf64 / win64 argument registers):
;   arg0 -> file_start  : address of the input bytes
;   arg1 -> file_length : number of input bytes
;   arg2 -> histogram   : base of the counters; the hash table used while
;                         scanning lives at histogram + _hash_offset
; Returns nothing. Counters are updated in place. Callee-saved registers
; are preserved through FUNC_SAVE / FUNC_RESTORE.
global isal_update_histogram_ %+ ARCH
isal_update_histogram_ %+ ARCH %+ :
;; do nothing if (avail_in == 0)
;; The length is tested in its argument register (arg1). file_length is rsi,
;; which on win64 is not the register the length arrives in (rdx) and still
;; holds a caller value until the moves below have run.
test arg1, arg1
jnz skip1
ret
skip1:
FUNC_SAVE
;; Move the arguments into their working registers when the ABI places them
;; elsewhere. Order matters on win64: file_length is copied out of rdx
;; before histogram overwrites rdx.
%ifnidn file_start, arg0
mov file_start, arg0
%endif
%ifnidn file_length, arg1
mov file_length, arg1
%endif
%ifnidn histogram, arg2
mov histogram, arg2
%endif
;; f_i = 0 (writing the 32-bit register clears the full 64-bit register)
xor f_i %+ d, f_i %+ d
;; Save the end-of-block count (entry 256) plus one for this call. It is
;; written back at `end`, replacing whatever is in that slot by then.
mov tmp1, qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256]
inc tmp1
mov [rsp + _eob_count_offset], tmp1
;; Explicit rel keeps this reference RIP-relative whatever the DEFAULT
;; setting in effect is.
lea rfc_lookup, [rel rfc1951_lookup_table]
;; Init hash_table
;; Zero the table from the top down. Each iteration issues two vector
;; stores that together cover V_LENGTH 2-byte entries.
;; PXOR and MOVDQU are macros defined outside this file.
PXOR vtmp0, vtmp0, vtmp0
mov rcx, (IGZIP_HASH_SIZE - V_LENGTH)
init_hash_table:
MOVDQU [histogram + _hash_offset + 2 * rcx], vtmp0
MOVDQU [histogram + _hash_offset + 2 * (rcx + V_LENGTH / 2)], vtmp0
sub rcx, V_LENGTH
jge init_hash_table
;; From here file_length is the index at which loop2 must stop, leaving
;; LA_STATELESS bytes of input after it. Inputs no longer than
;; LA_STATELESS skip loop2 altogether.
sub file_length, LA_STATELESS
cmp file_length, 0
jle end_loop_2
;; Load first literal into histogram
;; Byte 0 is always counted as a literal. Its hash-table slot is set to the
;; low 16 bits of its index (0).
mov curr_data, [file_start + f_i]
compute_hash hash, curr_data
and hash %+ d, HASH_MASK
mov [histogram + _hash_offset + 2 * hash], f_i %+ w
and curr_data, 0xff
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
inc f_i
;; Setup to begin loop 2
;; loop2 expects on entry: xdata = 16 bytes starting at f_i,
;; hash = masked hash of the data at f_i, hash2 = masked hash of the data
;; at f_i + 1.
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
mov curr_data2, curr_data
compute_hash hash, curr_data
shr curr_data2, 8
compute_hash hash2, curr_data2
and hash2 %+ d, HASH_MASK
and hash, HASH_MASK
;; Main loop. Each pass looks at two consecutive positions, f_i and f_i + 1,
;; and records one of: a match at f_i, a literal followed by a match at
;; f_i + 1, or two literals.
;; On entry: xdata = 16 bytes at f_i, hash / hash2 = masked hashes for f_i
;; and f_i + 1.
loop2:
xor dist, dist
xor dist2, dist2
xor tmp3, tmp3
;; tmp1 = address of the first position; curr_data = its 8 bytes.
;; xdata is then shifted so that it starts at the second position.
lea tmp1, [file_start + f_i]
MOVQ curr_data, xdata
PSRLDQ xdata, 1
;; Load possible look back distances and update hash data
;; The arithmetic is done on the low 16 bits: dist = f_i - 1 - table[hash].
;; f_i is advanced to the second position before dist2 is formed.
mov dist %+ w, f_i %+ w
sub dist, 1
sub dist %+ w, word [histogram + _hash_offset + 2 * hash]
mov [histogram + _hash_offset + 2 * hash], f_i %+ w
add f_i, 1
mov dist2 %+ w, f_i %+ w
sub dist2, 1
sub dist2 %+ w, word [histogram + _hash_offset + 2 * hash2]
mov [histogram + _hash_offset + 2 * hash2], f_i %+ w
;; Start computing hashes to be used in either the next loop or
;; for updating the hash if a match is found
MOVQ curr_data2, xdata
MOVQ tmp2, xdata
shr curr_data2, 8
compute_hash hash, curr_data2
;; Reduce both candidates to (distance - 1) within [0, D-1] and negate
;; them, so they can be added directly to the current address for the
;; speculative loads below.
and dist %+ d, (D-1)
neg dist
and dist2 %+ d, (D-1)
neg dist2
shr tmp2, 16
compute_hash hash2, tmp2
;; Check for long len/dist matches (>7)
;; len = current 8 bytes XOR the 8 bytes at the candidate; zero means all
;; eight bytes match.
mov len, curr_data
xor len, [tmp1 + dist - 1]
jz compare_loop
and hash %+ d, HASH_MASK
and hash2 %+ d, HASH_MASK
;; Same test for the second position (xdata already starts there).
MOVQ len2, xdata
xor len2, [tmp1 + dist2]
jz compare_loop2
;; Speculatively load the code for the first literal
;; tmp1 = first byte; curr_data is shifted so its low byte is the second
;; byte; tmp3 = index used for the hash updates on the match paths.
movzx tmp1, curr_data %+ b
shr curr_data, 8
lea tmp3, [f_i + 1]
;; Check for len/dist match for first literal
;; Low 32 bits of the XOR equal to zero means at least 4 bytes match.
test len %+ d, 0xFFFFFFFF
jz len_dist_huffman_pre
;; Store first literal
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * tmp1]
;; Check for len/dist match for second literal
;; A match here falls through into len_dist_lit_huffman_pre.
test len2 %+ d, 0xFFFFFFFF
jnz lit_lit_huffman
;; Literal followed by a match at the second position.
;; Here the first literal has already been counted and len2 holds the XOR
;; of the 8 bytes at the second position with its candidate.
len_dist_lit_huffman_pre:
;; Calculate repeat length
;; Trailing zero bits / 8 = number of leading bytes that match.
tzcnt len2, len2
shr len2, 3
;; Entered directly from compare_loop2 with len2 already set.
len_dist_lit_huffman:
MOVQ curr_data, xdata
shr curr_data, 24
compute_hash hash3, curr_data
;; Store updated hashes
;; Three consecutive indices (tmp3, tmp3 + 1, tmp3 + 2) are stored under
;; hash, hash2 and hash3.
mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w
add tmp3,1
mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w
add tmp3, 1
;; Skip past the match, then reload data and hashes for the next pass.
add f_i, len2
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
mov tmp1, curr_data
compute_hash hash, curr_data
;; hash3 shares rbp with len_code, so it is stored before
;; len_to_len_code overwrites the register.
and hash3, HASH_MASK
mov [histogram + _hash_offset + 2 * hash3], tmp3 %+ w
;; dist2 holds -(dist - 1), the input form dist_to_dist_code2 expects.
dist_to_dist_code2 dist_code2, dist2
len_to_len_code len_code, len2, rfc_lookup
shr tmp1, 8
compute_hash hash2, tmp1
;; Count the length code and the distance code of the match.
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code2]
and hash2 %+ d, HASH_MASK
and hash, HASH_MASK
cmp f_i, file_length
jl loop2
jmp end_loop_2
;; encode as dist/len
;; Match at the first position. len holds the XOR of the 8 bytes at the
;; first position with its candidate.
len_dist_huffman_pre:
;; Trailing zero bits / 8 = number of leading bytes that match.
tzcnt len, len
shr len, 3
;; Entered directly from compare_loop with len already set.
len_dist_huffman:
;; Store tmp3 and tmp3 + 1 under hash and hash2.
mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w
add tmp3,1
mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w
;; f_i was already advanced by one in loop2; undo that so that f_i ends up
;; at (match start + len).
dec f_i
add f_i, len
;; Reload data and hashes for the next pass.
MOVDQU xdata, [file_start + f_i]
mov curr_data, [file_start + f_i]
mov tmp1, curr_data
compute_hash hash, curr_data
;; dist holds -(dist - 1), the input form dist_to_dist_code2 expects.
dist_to_dist_code2 dist_code, dist
len_to_len_code len_code, len, rfc_lookup
shr tmp1, 8
compute_hash hash2, tmp1
;; Count the length code and the distance code of the match.
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
and hash2 %+ d, HASH_MASK
and hash, HASH_MASK
cmp f_i, file_length
jl loop2
jmp end_loop_2
;; Neither position matched. The first literal was counted in loop2; count
;; the second one (low byte of curr_data after the shift in loop2) and move
;; on. hash and hash2 for the next pass were already computed in loop2.
lit_lit_huffman:
MOVDQU xdata, [file_start + f_i + 1]
and curr_data, 0xff
add f_i, 1
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
cmp f_i, file_length
jl loop2
;; Falls through to end_loop_2 once f_i reaches file_length.
;; Tail of the input, processed one position at a time.
;; file_length becomes (input length - LAST_BYTES_COUNT): the subtraction
;; of LA_STATELESS done before loop2 is undone and the last
;; LAST_BYTES_COUNT bytes are left for final_bytes.
end_loop_2:
add file_length, LA_STATELESS - LAST_BYTES_COUNT
cmp f_i, file_length
jge final_bytes
loop2_finish:
;; Hash the 4 bytes at f_i.
mov curr_data %+ d, dword [file_start + f_i]
compute_hash hash, curr_data
and hash %+ d, HASH_MASK
;; Calculate possible distance for length/dist pair.
;; 16-bit arithmetic: dist = f_i - table[hash]; the slot is then updated
;; with the low 16 bits of f_i.
xor dist, dist
mov dist %+ w, f_i %+ w
sub dist %+ w, word [histogram + _hash_offset + 2 * hash]
mov [histogram + _hash_offset + 2 * hash], f_i %+ w
;; Check if look back distance is valid (the dec is to handle when dist = 0)
;; Unsigned compare: dist == 0 wraps to a large value and is rejected
;; together with any dist - 1 >= D - 1.
dec dist
cmp dist %+ d, (D-1)
jae encode_literal_finish
inc dist
;; Check if look back distance is a match
;; tmp4 = bytes left up to the real end of the input, tmp1 = current
;; address, tmp2 = candidate address. compare is a macro defined outside
;; this file; presumably it returns the common prefix length in len.
lea tmp4, [file_length + LAST_BYTES_COUNT]
sub tmp4, f_i
lea tmp1, [file_start + f_i]
mov tmp2, tmp1
sub tmp2, dist
compare tmp4, tmp1, tmp2, len, tmp3
;; Limit len to maximum value of 258
mov tmp2, 258
cmp len, 258
cmova len, tmp2
;; Matches shorter than SHORTEST_MATCH are counted as a literal instead.
cmp len, SHORTEST_MATCH
jb encode_literal_finish
add f_i, len
;; len and len_code share rbp; dist_to_dist_code clobbers rcx and dist.
len_to_len_code len_code, len, rfc_lookup
dist_to_dist_code dist_code, dist
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
cmp f_i, file_length
jl loop2_finish
jmp final_bytes
encode_literal_finish:
;; Encode literal
;; The low byte of curr_data is the byte at f_i.
and curr_data %+ d, 0xFF
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
;; Setup for next loop
add f_i, 1
cmp f_i, file_length
jl loop2_finish
;; Falls through to final_bytes once f_i reaches file_length.
;; Count every remaining byte as a literal. file_length is restored to the
;; full input length first.
final_bytes:
add file_length, LAST_BYTES_COUNT
final_bytes_loop:
cmp f_i, file_length
jge end
movzx curr_data, byte [file_start + f_i]
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
inc f_i
jmp final_bytes_loop
end:
;; Handle eob at end of stream
;; Entry 256 is overwritten with the value saved on the stack at function
;; entry (the previous count plus one).
mov tmp1, [rsp + _eob_count_offset]
mov qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256], tmp1
FUNC_RESTORE
ret
;; Long match at the first position: the first 8 bytes already compared
;; equal in loop2. tmp1 is the current address and tmp2 the candidate
;; address (same offset as the speculative load in loop2).
compare_loop:
;; loop2 branched here before masking the hashes, so mask them now.
and hash %+ d, HASH_MASK
and hash2 %+ d, HASH_MASK
lea tmp2, [tmp1 + dist - 1]
;; The compare250* macros are defined outside this file; presumably they
;; return the match length in len. The variant is selected by COMPARE_TYPE.
%if (COMPARE_TYPE == 1)
compare250 tmp1, tmp2, len, tmp3
%elif (COMPARE_TYPE == 2)
compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1
%elif (COMPARE_TYPE == 3)
compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1
%else
%error Unknown Compare type COMPARE_TYPE
% error
%endif
;; tmp3 is passed to the compare macro above, so the index used for the
;; hash updates is set only afterwards.
lea tmp3, [f_i + 1]
jmp len_dist_huffman
;; Long match at the second position: the 8 bytes there already compared
;; equal in loop2. The first position is counted as a literal and the match
;; is then handled by len_dist_lit_huffman.
compare_loop2:
;; Advance tmp1 to the second position. tmp2 is the candidate address; the
;; -1 compensates for the increment of tmp1.
add tmp1, 1
lea tmp2, [tmp1 + dist2 - 1]
;; The compare250* macros are defined outside this file; presumably they
;; return the match length in len2. The variant is selected by COMPARE_TYPE.
%if (COMPARE_TYPE == 1)
compare250 tmp1, tmp2, len2, tmp3
%elif (COMPARE_TYPE == 2)
compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1
%elif (COMPARE_TYPE == 3)
compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
%else
%error Unknown Compare type COMPARE_TYPE
% error
%endif
;; curr_data still holds the unshifted bytes of the first position, so its
;; low byte is the first literal. The scale is HIST_ELEM_SIZE, as for every
;; other histogram increment.
and curr_data, 0xff
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
;; tmp3 is passed to the compare macro above, so the index used for the
;; hash updates is set only afterwards.
lea tmp3, [f_i + 1]
jmp len_dist_lit_huffman
section .data
align 32
;; Sixteen 16-bit words (32 bytes, 32-byte aligned), each holding the low
;; 16 bits of -(D + 1).
;; NOTE(review): D_vector is not referenced by the code visible in this
;; file.
D_vector:
dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF