mirror of
https://github.com/intel/isa-l.git
synced 2024-12-12 17:33:50 +01:00
17dac9f641
Signed-off-by: Roy Oursler <roy.j.oursler@intel.com> Reviewed-by: Greg Tucker <greg.b.tucker@intel.com>
498 lines
11 KiB
NASM
498 lines
11 KiB
NASM
|
|
%include "options.asm"
|
|
|
|
%include "lz0a_const.asm"
|
|
%include "data_struct2.asm"
|
|
%include "bitbuf2.asm"
|
|
%include "huffman.asm"
|
|
%include "igzip_compare_types.asm"
|
|
%include "reg_sizes.asm"
|
|
|
|
%include "stdmac.asm"
|
|
|
|
extern rfc1951_lookup_table
|
|
;; Byte offset of the length-to-code bytes inside rfc1951_lookup_table, as
;; used by the len_to_len_code macro in this file.
_len_to_code_offset equ 0
|
|
|
|
;; The tail loop (loop2_finish) reads a dword per position, so it stops this
;; many bytes before the end; final_bytes counts the rest one byte at a time.
%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
|
|
;; Subtracted from file_length before the main loop (loop2) so that the wide
;; speculative loads stay inside the input.
%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
|
|
;; Number of literal/length counters that precede the distance counters
;; (see _dist_offset). Presumably the DEFLATE literal/length alphabet size.
%define LIT_LEN 286
|
|
;; Number of distance counters that precede the hash table (see _hash_offset).
;; Presumably the DEFLATE distance alphabet size.
%define DIST_LEN 30
|
|
;; Size in bytes of one histogram counter; every counter update in this file
;; is an `inc qword` scaled by this value.
%define HIST_ELEM_SIZE 8
|
|
|
|
;; MARK <name>
;; Debug aid: in DEBUG builds it plants <name> as an exported label at the
;; point of use; in all other builds it expands to nothing.
%ifndef DEBUG
%macro MARK 1
%endmacro
%else
%macro MARK 1
global %1
%1:
%endmacro
%endif
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; Register allocation, grouped by physical register.
;; Several names deliberately share one register; such aliases are only
;; valid where their live ranges do not overlap.

;; rax
%define curr_data	rax

;; rbx
%define dist		rbx
%define dist_code2	rbx

;; rcx (also clobbered by the dist_to_dist_code* macros)
%define tmp2		rcx

;; rdx / rsi / rdi : values supplied by the caller
%define histogram	rdx
%define file_length	rsi
%define file_start	rdi

;; rbp
%define len		rbp
%define len_code	rbp
%define hash3		rbp

;; r8
%define curr_data2	r8
%define len2		r8
%define tmp4		r8

;; r9 - r15
%define rfc_lookup	r9
%define f_i		r10
%define tmp1		r11
%define dist2		r12
%define dist_code	r12
%define tmp3		r13
%define hash		r14
%define hash2		r15

;; Vector registers
%define xtmp0		xmm0
%define xtmp1		xmm1
%define xdata		xmm2

%define ytmp0		ymm0
%define ytmp1		ymm1

;; Width of the vector used to initialise the hash table:
;; 16 bytes (xmm) when ARCH == 01, 32 bytes (ymm) for every other ARCH.
%if (ARCH != 01)
%define vtmp0		ytmp0
%define vtmp1		ytmp1
%define V_LENGTH	32
%else
%define vtmp0		xtmp0
%define vtmp1		xtmp1
%define V_LENGTH	16
%endif
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
;; Stack frame layout, as offsets from rsp after the prologue.
;; Slot 0 holds the value written back to histogram counter 256 on exit.
_eob_count_offset equ 0 ; local variable (8 bytes)
|
|
;; NOTE(review): not referenced anywhere in the visible part of this file.
f_end_i_mem_offset equ 8
|
|
;; The prologue stores rbx, rsi, rdi, rbp, r12-r15 here, in that order.
gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
|
|
;; NOTE(review): reserved in stack_size but never written in the visible code.
xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
|
|
;; 2 local qwords + 8 saved GPRs + 4 xmm slots + 8 bytes of padding = 152.
stack_size equ 2*8 + 8*8 + 4*16 + 8
|
|
;;; 8 because stack address is odd multiple of 8 after a function call and
|
|
;;; we want it aligned to 16 bytes
|
|
;; Layout of the area addressed through `histogram`:
;; literal/length counters first, 8 bytes each ...
_lit_len_offset equ 0
|
|
;; ... then the distance counters, 8 bytes each ...
_dist_offset equ (8 * LIT_LEN)
|
|
;; ... then the hash table (16-bit entries, indexed with `2 * hash`).
_hash_offset equ (_dist_offset + 8 * DIST_LEN)
|
|
|
|
;;; len_to_len_code <code_out>, <match_len>, <table>
;;; Reads the byte at table[_len_to_code_offset + match_len], zero-extends it
;;; into code_out and sets bit 8 (0x100). code_out and match_len may be the
;;; same register, because the input is only used in the load address.
%macro len_to_len_code 3
%define %%code_out	%1	; Output
%define %%match_len	%2	; Input
%define %%table		%3	; Base of the lookup table
	movzx	%%code_out, byte [%%table + %%match_len + _len_to_code_offset]
	or	%%code_out, 0x100
%endmacro
|
|
|
|
;;; Shared tail of dist_to_dist_code and dist_to_dist_code2.
;;;   %1 : output register (64-bit name); receives the distance code
;;;   %2 : register whose 32-bit view already holds m = dist - 1
;;; With s = bsr(m) - 1 the result is (m >> s) + 2*s; when m <= 1 the result
;;; is m itself (the cmovle also covers m == 0, where bsr is undefined).
;;; Clobbers rcx.
;;; NOTE(review): SHRX is an upper-case macro defined outside this file;
;;; assumed to act like the shrx instruction - confirm.
%macro dist_m1_to_dist_code 2
%define %%code		%1
%define %%code_d	%1d
%define %%dist_m1	%2d
	mov	%%code_d, %%dist_m1
	bsr	ecx, %%code_d
	dec	ecx
	SHRX	%%code, %%code, rcx
	lea	%%code_d, [%%code_d + 2*ecx]

	cmp	%%dist_m1, 1
	cmovle	%%code_d, %%dist_m1
%endmacro

;;; dist_to_dist_code <dist_code>, <dist>
;;; Input is the plain distance. Clobbers rcx and dist (dist is left holding
;;; dist - 1).
%macro dist_to_dist_code 2
%define %%dist_d	%2d	; Input dist
	dec	%%dist_d
	dist_m1_to_dist_code %1, %2
%endmacro

;;; dist_to_dist_code2 <dist_code>, <dist>
;;; Input is -(dist - 1), the form the main loop keeps its distances in.
;;; Clobbers rcx and dist (dist is left holding dist - 1).
%macro dist_to_dist_code2 2
%define %%dist_d	%2d	; Input -(dist - 1)
	neg	%%dist_d
	dist_m1_to_dist_code %1, %2
%endmacro
|
|
|
|
; void isal_update_histogram
;
; Registers on entry (aliases from the %define block earlier in this file):
;   file_start  (rdi)  base address of the input bytes
;   file_length (rsi)  number of input bytes; 0 returns immediately
;   histogram   (rdx)  base of the area holding the literal/length counters
;                      (_lit_len_offset), the distance counters (_dist_offset)
;                      and the 16-bit hash table (_hash_offset)
; All general-purpose registers saved below are restored before returning.
; NOTE(review): rdi/rsi/rdx are the first three System V AMD64 integer
; argument registers; no Win64 argument remapping is visible here - confirm.
global isal_update_histogram_ %+ ARCH
isal_update_histogram_ %+ ARCH %+ :

	;; do nothing if (avail_in == 0)
	test	file_length, file_length
	jnz	skip1
	ret
skip1:

	;; Reserve the frame described by stack_size. With ALIGN_STACK the old
	;; rsp is kept in rbp and rsp is forced down to a 16-byte boundary.
%ifdef ALIGN_STACK
	push	rbp
	mov	rbp, rsp
	sub	rsp, stack_size
	and	rsp, ~15
%else
	sub	rsp, stack_size
%endif

	;; Save every general-purpose register this routine overwrites
	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
	mov	[rsp + gpr_save_mem_offset + 1*8], rsi
	mov	[rsp + gpr_save_mem_offset + 2*8], rdi
	mov	[rsp + gpr_save_mem_offset + 3*8], rbp
	mov	[rsp + gpr_save_mem_offset + 4*8], r12
	mov	[rsp + gpr_save_mem_offset + 5*8], r13
	mov	[rsp + gpr_save_mem_offset + 6*8], r14
	mov	[rsp + gpr_save_mem_offset + 7*8], r15

	;; f_i = 0 : index of the current input byte
	xor	f_i %+ d, f_i %+ d

	;; Remember (counter 256 + 1) on the stack; the epilogue writes this
	;; value back to counter 256.
	mov	tmp1, qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256]
	inc	tmp1
	mov	[rsp + _eob_count_offset], tmp1

	lea	rfc_lookup, [rfc1951_lookup_table]

	;; Init hash_table
	;; Every 16-bit entry is set to the D_vector pattern. Each pass stores
	;; two vectors (V_LENGTH entries in total), walking down from the top
	;; of the table until rcx goes negative.
	MOVDQU	vtmp0, [D_vector]
	mov	rcx, (HASH_SIZE - V_LENGTH)
init_hash_table:
	MOVDQU	[histogram + _hash_offset + 2 * rcx], vtmp0
	MOVDQU	[histogram + _hash_offset + 2 * (rcx + V_LENGTH / 2)], vtmp0
	sub	rcx, V_LENGTH
	jge	init_hash_table

	;; From here on file_length is the main-loop limit (length minus the
	;; look-ahead). Inputs no longer than LA_STATELESS skip the main loop.
	sub	file_length, LA_STATELESS
	test	file_length, file_length
	jle	end_loop_2

	;; Load first literal into histogram
	;; NOTE(review): compute_hash is a macro from an include; it is assumed
	;; to hash the low bytes of its second operand into the first - confirm.
	mov	curr_data, [file_start + f_i]
	compute_hash	hash, curr_data
	and	hash %+ d, HASH_MASK
	mov	[histogram + _hash_offset + 2 * hash], f_i %+ w
	and	curr_data, 0xff
	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
	inc	f_i

	;; Setup to begin loop 2
	;; xdata = bytes at f_i; hash / hash2 = table indices for positions
	;; f_i and f_i + 1 respectively.
	MOVDQU	xdata, [file_start + f_i]
	mov	curr_data, [file_start + f_i]
	mov	curr_data2, curr_data
	compute_hash	hash, curr_data
	shr	curr_data2, 8
	compute_hash	hash2, curr_data2

	and	hash2 %+ d, HASH_MASK
	and	hash, HASH_MASK
|
|
;; Main loop. Each pass looks at two adjacent positions (f_i and f_i + 1)
;; using the table indices already held in hash and hash2, and finishes on
;; one of these paths:
;;   compare_loop  / len_dist_huffman*     : match starting at the first position
;;   compare_loop2 / len_dist_lit_huffman* : literal, then a match at the second
;;   lit_lit_huffman                       : two literals
;; On entry xdata holds the bytes loaded from file_start + f_i.
;; NOTE(review): MOVQ, MOVDQU, PSRLDQ and compute_hash are macros defined
;; outside this file; the comments below assume they behave like the
;; instructions / operation their names suggest - confirm.
loop2:
|
|
;; Clear the full registers: only the low 16 bits of dist/dist2 are written
;; below, and tmp3 = 0 is the value selected by the cmovae instructions.
xor dist, dist
|
|
xor dist2, dist2
|
|
xor tmp3, tmp3
|
|
|
|
;; tmp1 = address of the first position
lea tmp1, [file_start + f_i]
|
|
|
|
;; curr_data = 8 bytes at the first position; xdata then drops one byte so
;; that it starts at the second position.
MOVQ curr_data, xdata
|
|
PSRLDQ xdata, 1
|
|
|
|
;; Load possible look back distances and update hash data
|
|
;; dist = low 16 bits of (f_i - table[hash]); table[hash] = f_i
mov dist %+ w, f_i %+ w
|
|
sub dist %+ w, word [histogram + _hash_offset + 2 * hash]
|
|
mov [histogram + _hash_offset + 2 * hash], f_i %+ w
|
|
|
|
;; f_i now indexes the second position
add f_i, 1
|
|
|
|
;; Same lookup/update for the second position using hash2
mov dist2 %+ w, f_i %+ w
|
|
sub dist2 %+ w, word [histogram + _hash_offset + 2 * hash2]
|
|
mov [histogram + _hash_offset + 2 * hash2], f_i %+ w
|
|
|
|
;; Start computing hashes to be used in either the next loop or
|
|
;; for updating the hash if a match is found
|
|
;; xdata starts at the second position, so the >> 8 below hashes the data
;; one byte further on (the position equal to the updated f_i + 1).
MOVQ curr_data2, xdata
|
|
MOVQ tmp2, xdata
|
|
shr curr_data2, 8
|
|
compute_hash hash, curr_data2
|
|
|
|
;; Check if look back distances are valid. Load a junk distance of 1
|
|
;; if the look back distance is too long for speculative lookups.
|
|
;; After this group dist holds -(dist - 1), or 0 when (dist - 1) is not
;; below D - 1 as an unsigned value.
sub dist, 1
|
|
cmp dist %+ d, (D-1)
|
|
cmovae dist, tmp3
|
|
neg dist
|
|
|
|
;; Same transformation for dist2
sub dist2, 1
|
|
cmp dist2 %+ d, (D-1)
|
|
cmovae dist2, tmp3
|
|
neg dist2
|
|
|
|
;; hash2 = hash of the data two bytes into xdata
shr tmp2, 16
|
|
compute_hash hash2, tmp2
|
|
|
|
;; Check for long len/dist matches (>7)
|
|
;; [tmp1 + dist - 1] is the address `distance` bytes before the first
;; position; a zero XOR result means all 8 bytes are equal.
mov len, curr_data
|
|
xor len, [tmp1 + dist - 1]
|
|
jz compare_loop
|
|
|
|
and hash %+ d, HASH_MASK
|
|
and hash2 %+ d, HASH_MASK
|
|
|
|
;; Same 8-byte test for the second position: dist2 was measured from
;; f_i + 1, so [tmp1 + dist2] is `distance2` bytes before tmp1 + 1.
MOVQ len2, xdata
|
|
xor len2, [tmp1 + dist2]
|
|
jz compare_loop2
|
|
|
|
;; Speculatively load the code for the first literal
|
|
;; tmp1 = first byte; curr_data is shifted so its low byte is the second byte
movzx tmp1, curr_data %+ b
|
|
shr curr_data, 8
|
|
|
|
;; tmp3 = f_i + 1, the position that the index in `hash` belongs to; used
;; for the hash table stores on the match paths.
lea tmp3, [f_i + 1]
|
|
|
|
;; Check for len/dist match for first literal
|
|
;; Low 32 bits of len all zero means the first four bytes are equal
test len %+ d, 0xFFFFFFFF
|
|
jz len_dist_huffman_pre
|
|
|
|
;; Store first literal
|
|
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * tmp1]
|
|
|
|
;; Check for len/dist match for second literal
|
|
test len2 %+ d, 0xFFFFFFFF
|
|
jnz lit_lit_huffman
|
len_dist_lit_huffman_pre:
|
|
;; Calculate repeat length
|
|
;; Trailing zero bits of the XOR difference, divided by 8, gives the number
;; of leading bytes that are equal.
tzcnt len2, len2
|
|
shr len2, 3
|
|
|
|
;; Literal followed by a match of length len2 at the second position.
;; Also entered from compare_loop2 with len2 set there.
len_dist_lit_huffman:
|
|
;; hash3 = hash of the data three bytes into xdata. hash3 shares rbp with
;; len/len_code; it is consumed below before len_code is written.
MOVQ curr_data, xdata
|
|
shr curr_data, 24
|
|
compute_hash hash3, curr_data
|
|
|
|
;; Store updated hashes
|
|
;; table[hash] = tmp3, table[hash2] = tmp3 + 1 (table[hash3] follows below)
mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w
|
|
add tmp3,1
|
|
mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w
|
|
add tmp3, 1
|
|
|
|
;; f_i indexes the second position here, so this moves past the match
add f_i, len2
|
|
|
|
;; Reload the look-ahead data and start the hash for the new position
MOVDQU xdata, [file_start + f_i]
|
|
mov curr_data, [file_start + f_i]
|
|
mov tmp1, curr_data
|
|
compute_hash hash, curr_data
|
|
|
|
and hash3, HASH_MASK
|
|
mov [histogram + _hash_offset + 2 * hash3], tmp3 %+ w
|
|
|
|
;; dist2 still holds -(dist - 1), the input form dist_to_dist_code2 expects
dist_to_dist_code2 dist_code2, dist2
|
|
|
|
len_to_len_code len_code, len2, rfc_lookup
|
|
|
|
;; hash2 = hash for the position after the new f_i
shr tmp1, 8
|
|
compute_hash hash2, tmp1
|
|
|
|
;; Count the length code and the distance code
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
|
|
inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code2]
|
|
|
|
and hash2 %+ d, HASH_MASK
|
|
and hash, HASH_MASK
|
|
|
|
;; Signed compare: file_length is the reduced main-loop limit
cmp f_i, file_length
|
|
jl loop2
|
|
jmp end_loop_2
|
|
;; encode as dist/len
|
|
|
|
len_dist_huffman_pre:
|
|
;; Number of leading equal bytes at the first position (see the tzcnt above)
tzcnt len, len
|
|
shr len, 3
|
|
|
|
;; Match of length len at the first position.
;; Also entered from compare_loop with len set there.
len_dist_huffman:
|
|
;; table[hash] = tmp3, table[hash2] = tmp3 + 1
mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w
|
|
add tmp3,1
|
|
mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w
|
|
|
|
;; f_i was already advanced to the second position; step back to the first
;; position and then move past the match.
dec f_i
|
|
add f_i, len
|
|
|
|
;; Reload the look-ahead data and start the hash for the new position
MOVDQU xdata, [file_start + f_i]
|
|
mov curr_data, [file_start + f_i]
|
|
mov tmp1, curr_data
|
|
compute_hash hash, curr_data
|
|
|
|
;; dist still holds -(dist - 1), the input form dist_to_dist_code2 expects
dist_to_dist_code2 dist_code, dist
|
|
|
|
len_to_len_code len_code, len, rfc_lookup
|
|
|
|
;; hash2 = hash for the position after the new f_i
shr tmp1, 8
|
|
compute_hash hash2, tmp1
|
|
|
|
;; Count the length code and the distance code
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
|
|
inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
|
|
|
|
and hash2 %+ d, HASH_MASK
|
|
and hash, HASH_MASK
|
|
|
|
cmp f_i, file_length
|
|
jl loop2
|
|
jmp end_loop_2
|
|
|
|
;; No match at either position: the first literal was already counted above,
;; count the second one here. hash and hash2 were already computed and masked
;; earlier in this pass for the positions the next pass will examine.
lit_lit_huffman:
|
|
;; f_i is incremented just below, so this is the data at the new f_i
MOVDQU xdata, [file_start + f_i + 1]
|
|
;; curr_data was shifted right by 8 earlier: its low byte is the second literal
and curr_data, 0xff
|
|
add f_i, 1
|
|
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
|
|
|
|
cmp f_i, file_length
|
|
jl loop2
|
|
|
|
;; Tail processing, one position per pass.
;; file_length was reduced by LA_STATELESS before the main loop; the add
;; below turns it into (input length - LAST_BYTES_COUNT), the limit for
;; loop2_finish.
end_loop_2:
|
|
add file_length, LA_STATELESS - LAST_BYTES_COUNT
|
|
cmp f_i, file_length
|
|
jge final_bytes
|
|
|
|
loop2_finish:
|
|
;; Only a dword is read here, hence the LAST_BYTES_COUNT margin.
;; NOTE(review): compute_hash is a macro defined outside this file.
mov curr_data %+ d, dword [file_start + f_i]
|
|
compute_hash hash, curr_data
|
|
and hash %+ d, HASH_MASK
|
|
|
|
;; Calculate possible distance for length/dist pair.
|
|
;; dist = low 16 bits of (f_i - table[hash]); table[hash] = f_i
xor dist, dist
|
|
mov dist %+ w, f_i %+ w
|
|
sub dist %+ w, word [histogram + _hash_offset + 2 * hash]
|
|
mov [histogram + _hash_offset + 2 * hash], f_i %+ w
|
|
|
|
;; Check if look back distance is valid (the dec is to handle when dist = 0)
|
|
;; dist = 0 wraps to all ones, which the unsigned compare rejects
dec dist
|
|
cmp dist %+ d, (D-1)
|
|
jae encode_literal_finish
|
|
inc dist
|
|
|
|
;; Check if look back distance is a match
|
|
;; tmp4 = number of bytes from f_i to the end of the input
lea tmp4, [file_length + LAST_BYTES_COUNT]
|
|
sub tmp4, f_i
|
|
;; tmp1 = current position, tmp2 = position `dist` bytes earlier
lea tmp1, [file_start + f_i]
|
|
mov tmp2, tmp1
|
|
sub tmp2, dist
|
|
;; NOTE(review): `compare` is a macro defined outside this file; from its use
;; here it appears to leave the match length in len, bounded by tmp4, with
;; tmp3 as scratch - confirm.
compare tmp4, tmp1, tmp2, len, tmp3
|
|
|
|
;; Limit len to maximum value of 258
|
|
mov tmp2, 258
|
|
cmp len, 258
|
|
cmova len, tmp2
|
|
;; Matches shorter than SHORTEST_MATCH are counted as a literal instead
cmp len, SHORTEST_MATCH
|
|
jb encode_literal_finish
|
|
|
|
add f_i, len
|
|
|
|
;; len_code overwrites len (same register); dist is the plain distance here,
;; so the non-negated dist_to_dist_code variant is used.
len_to_len_code len_code, len, rfc_lookup
|
|
dist_to_dist_code dist_code, dist
|
|
|
|
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
|
|
inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
|
|
|
|
cmp f_i, file_length
|
|
jl loop2_finish
|
|
jmp final_bytes
|
|
|
|
encode_literal_finish:
|
|
;; Encode literal
|
|
;; Low byte of the dword loaded at the top of loop2_finish
and curr_data %+ d, 0xFF
|
|
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
|
|
|
|
;; Setup for next loop
|
|
add f_i, 1
|
|
cmp f_i, file_length
|
|
jl loop2_finish
|
|
|
|
;; The remaining bytes (at most LAST_BYTES_COUNT beyond the previous limit)
;; are counted as literals one byte at a time.
final_bytes:
|
|
;; file_length is the full input length again
add file_length, LAST_BYTES_COUNT
|
|
final_bytes_loop:
|
|
cmp f_i, file_length
|
|
jge end
|
|
movzx curr_data, byte [file_start + f_i]
|
|
inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
|
|
inc f_i
|
|
jmp final_bytes_loop
|
|
|
|
;; Common exit: write back counter 256, restore the saved registers, release
;; the stack frame and return.
end:
|
|
;; Handle eob at end of stream
|
|
;; The stack slot holds (counter 256 at entry + 1), stored by the prologue
mov tmp1, [rsp + _eob_count_offset]
|
|
mov qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256], tmp1
|
|
|
|
;; Restore the registers in the same slot order the prologue saved them
mov rbx, [rsp + gpr_save_mem_offset + 0*8]
|
|
mov rsi, [rsp + gpr_save_mem_offset + 1*8]
|
|
mov rdi, [rsp + gpr_save_mem_offset + 2*8]
|
|
;; With ALIGN_STACK this slot holds the frame pointer set up in the prologue
;; (rbp is reused as len/len_code/hash3 in the body), so it must be reloaded
;; before `mov rsp, rbp` below.
mov rbp, [rsp + gpr_save_mem_offset + 3*8]
|
|
mov r12, [rsp + gpr_save_mem_offset + 4*8]
|
|
mov r13, [rsp + gpr_save_mem_offset + 5*8]
|
|
mov r14, [rsp + gpr_save_mem_offset + 6*8]
|
|
mov r15, [rsp + gpr_save_mem_offset + 7*8]
|
|
|
|
;; Undo whichever stack adjustment the prologue made
%ifndef ALIGN_STACK
|
|
add rsp, stack_size
|
|
%else
|
|
mov rsp, rbp
|
|
pop rbp
|
|
%endif
|
|
ret
|
|
|
|
;; Out-of-line continuations of loop2 for the case where all 8 bytes compared
;; equal, so the real match length still has to be measured.
;; NOTE(review): compare250 / compare250_x / compare250_y are macros defined
;; outside this file; from their use here they appear to leave the match
;; length in the third operand and use the remaining operands as scratch -
;; confirm. tmp3 is reloaded afterwards because it is passed as scratch.

;; First position matched: tmp1 = current position, dist = -(distance - 1),
;; so tmp1 + dist - 1 is the address `distance` bytes earlier.
compare_loop:
	and	hash %+ d, HASH_MASK
	and	hash2 %+ d, HASH_MASK
	lea	tmp2, [tmp1 + dist - 1]
%if (COMPARE_TYPE == 1)
	compare250	tmp1, tmp2, len, tmp3
%elif (COMPARE_TYPE == 2)
	compare250_x	tmp1, tmp2, len, tmp3, xtmp0, xtmp1
%elif (COMPARE_TYPE == 3)
	compare250_y	tmp1, tmp2, len, tmp3, ytmp0, ytmp1
%else
	%error Unknown Compare type COMPARE_TYPE
	;; NOTE(review): "% error" with a space is not a directive. It looks
	;; like a deliberate invalid line to make assembly fail even where
	;; %error is only a warning - confirm before removing.
	 % error
%endif
	lea	tmp3, [f_i + 1]
	jmp	len_dist_huffman

;; Second position matched: advance tmp1 to the second position and compare
;; against the data `distance2` bytes earlier (dist2 = -(distance2 - 1)).
compare_loop2:
	add	tmp1, 1
	lea	tmp2, [tmp1 + dist2 - 1]

%if (COMPARE_TYPE == 1)
	compare250	tmp1, tmp2, len2, tmp3
%elif (COMPARE_TYPE == 2)
	compare250_x	tmp1, tmp2, len2, tmp3, xtmp0, xtmp1
%elif (COMPARE_TYPE == 3)
	compare250_y	tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
%else
	%error Unknown Compare type COMPARE_TYPE
	;; NOTE(review): see the note on the same construct in compare_loop.
	 % error
%endif
	;; curr_data has not been shifted on this path, so its low byte is the
	;; first position's byte: count it as a literal before the match.
	and	curr_data, 0xff
	inc	qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
	lea	tmp3, [f_i + 1]
	jmp	len_dist_lit_huffman
|
|
|
|
section .data
	align 32
;; Fill pattern for the hash table: 16 identical words (32 bytes), which
;; covers one vector load at either supported V_LENGTH (16 or 32 bytes).
D_vector:
	times 16 dw -(D + 1)
|