isa-l/igzip/aarch64/isal_update_histogram.S

326 lines
9.5 KiB
ArmAsm
Raw Permalink Normal View History

/**********************************************************************
Copyright(c) 2019 Arm Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Arm Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
// Project-local helper headers.
// NOTE(review): the cdecl and ASM_DEF_RODATA helpers used later in this file
// are not defined here, so presumably one of these headers provides them -- confirm.
#include "../include/aarch64_label.h"
// The CRC extension is enabled because the hash computation in this file
// uses the crc32cw instruction.
.arch armv8-a+crc
.text
.align 2
// NOTE(review): compare_max_258_bytes, used by the function below, is not
// defined in this file; presumably it comes from one of these headers -- confirm.
#include "lz0a_const_aarch64.h"
#include "data_struct_aarch64.h"
#include "huffman_aarch64.h"
#include "bitbuf2_aarch64.h"
#include "stdmac_aarch64.h"
/*
Macro definitions
*/
/*
 * declare_generic_reg name, reg, default
 *
 * Defines three register aliases for register number "reg":
 *   x_<name>  -> x<reg>            (64-bit view)
 *   w_<name>  -> w<reg>            (32-bit view)
 *   <name>    -> <default><reg>    (default is the view prefix; this file
 *                                   passes x or w)
 * The three aliases are independent of each other.
 */
.macro declare_generic_reg name:req,reg:req,default:req
        x_\name .req x\reg
        w_\name .req w\reg
        \name .req \default\reg
.endm
/*
 * convert_dist_to_dist_sym dist, tmp0, tmp1
 *
 * Converts a match distance into its distance-histogram index, in place.
 *
 * In:     w_<dist> = distance
 * Out:    w_<dist> = symbol:
 *           distance 1..4      -> distance - 1
 *           distance 5..32768  -> ((distance - 1) >> msb) + 2 * msb,
 *                                 with msb = 30 - clz(distance - 1)
 *           distance 0 or > 32768 -> 0xffffffff (callers must not use it
 *                                    as an index)
 * Clobbers: w_<tmp0>, w_<tmp1>, condition flags.
 *
 * The exit label carries the macro expansion counter so that the macro can
 * be expanded more than once in the same file without a duplicate-symbol
 * error.
 */
.macro convert_dist_to_dist_sym dist:req,tmp0:req,tmp1:req
        mov     w_\tmp0, w_\dist
        // Default result for an out-of-range distance.
        mov     w_\dist, -1
        cmp     w_\tmp0, 32768
        bhi     .dist2code_done\@
        // Small distances map directly to distance - 1.
        sub     w_\dist, w_\tmp0, #1
        cmp     w_\tmp0, 4
        bls     .dist2code_done\@
        // msb = 30 - clz(distance - 1)
        clz     w_\tmp1, w_\dist
        mov     w_\tmp0, 30
        sub     w_\tmp0, w_\tmp0, w_\tmp1
        // symbol = ((distance - 1) >> msb) + 2 * msb
        lsr     w_\dist, w_\dist, w_\tmp0
        add     w_\dist, w_\dist, w_\tmp0, lsl 1
.dist2code_done\@:
.endm
/*
 * convert_length_to_len_sym length, length_out, tmp0
 *
 * Looks up a match length in the 32-bit word table addressed through
 * .len_to_code_tab_lanchor and adds 256 to the table value.
 *
 * In:     w_<length>     = match length, used as a zero-extended word index
 * Out:    w_<length_out> = table[length] + 256
 * Clobbers: x_<tmp0> (left holding the table address).
 *
 * The table address is formed page + page-offset; only the relocation
 * spelling differs between Apple and other targets.
 */
.macro convert_length_to_len_sym length:req,length_out:req,tmp0:req
#ifdef __APPLE__
        adrp    x_\tmp0, .len_to_code_tab_lanchor@PAGE
        add     x_\tmp0, x_\tmp0, .len_to_code_tab_lanchor@PAGEOFF
#else
        adrp    x_\tmp0, .len_to_code_tab_lanchor
        add     x_\tmp0, x_\tmp0, :lo12:.len_to_code_tab_lanchor
#endif
        ldr     w_\length_out, [x_\tmp0, w_\length, uxtw 2]
        add     w_\length_out, w_\length_out, 256
.endm
/*
 * len_to_code_tab: 264 32-bit words (1056 bytes), indexed by match length.
 *
 * convert_length_to_len_sym reads entry [length] through the
 * .len_to_code_tab_lanchor symbol and adds 256 to obtain the index into the
 * literal/length histogram. The values follow the DEFLATE length-code
 * grouping: run lengths of 1, 2, 4, 8, 16 and 32 equal entries, with length
 * 258 on its own. Entries 0-2 and 259-263 are zero.
 *
 * The table is written as runs (.fill count, 4, value); the index range
 * covered by each run is noted on the right.
 */
ASM_DEF_RODATA
        .align 4
.len_to_code_tab_lanchor = . + 0
#ifndef __APPLE__
        .type len_to_code_tab, %object
        .size len_to_code_tab, 1056
#endif
len_to_code_tab:
        .fill   3, 4, 0x00              // 0-2
        .word   0x01, 0x02, 0x03, 0x04  // 3-6
        .word   0x05, 0x06, 0x07, 0x08  // 7-10
        .fill   2, 4, 0x09              // 11-12
        .fill   2, 4, 0x0a              // 13-14
        .fill   2, 4, 0x0b              // 15-16
        .fill   2, 4, 0x0c              // 17-18
        .fill   4, 4, 0x0d              // 19-22
        .fill   4, 4, 0x0e              // 23-26
        .fill   4, 4, 0x0f              // 27-30
        .fill   4, 4, 0x10              // 31-34
        .fill   8, 4, 0x11              // 35-42
        .fill   8, 4, 0x12              // 43-50
        .fill   8, 4, 0x13              // 51-58
        .fill   8, 4, 0x14              // 59-66
        .fill   16, 4, 0x15             // 67-82
        .fill   16, 4, 0x16             // 83-98
        .fill   16, 4, 0x17             // 99-114
        .fill   16, 4, 0x18             // 115-130
        .fill   32, 4, 0x19             // 131-162
        .fill   32, 4, 0x1a             // 163-194
        .fill   32, 4, 0x1b             // 195-226
        .fill   31, 4, 0x1c             // 227-257
        .word   0x1d                    // 258
        .fill   5, 4, 0x00              // 259-263
.text
.arch armv8-a+crc
.global cdecl(isal_update_histogram_aarch64)
#ifndef __APPLE__
.type isal_update_histogram_aarch64, %function
#endif

/*
 * C prototype of the routine defined below:
 *
 *   void isal_update_histogram_aarch64(uint8_t * start_stream, int length,
 *                                      struct isal_huff_histogram *histogram);
 *
 * Register aliases follow. Each declare_generic_reg line creates <name>,
 * w_<name> and x_<name> for the given register number.
 */

/* incoming arguments (x0-x2) */
declare_generic_reg start_stream,       0,x
declare_generic_reg length,             1,x
declare_generic_reg histogram,          2,x

/* the same three registers, named as operands for compare_max_258_bytes */
declare_generic_reg param0,             0,x
declare_generic_reg param1,             1,x
declare_generic_reg param2,             2,x

/* locals held in callee-saved registers (x19-x23) */
declare_generic_reg current,            19,x
declare_generic_reg last_seen,          20,x
declare_generic_reg end_stream,         21,x
declare_generic_reg loop_end_iter,      22,x
/* histogram_saved and lit_len_histogram are two names for x23 */
declare_generic_reg histogram_saved,    23,x
declare_generic_reg lit_len_histogram,  23,x

/* locals held in caller-saved registers (x3-x12) */
declare_generic_reg match_length,       3,w
declare_generic_reg end,                4,x
declare_generic_reg tmp0,               5,w
declare_generic_reg tmp1,               6,w
declare_generic_reg dist,               7,x
declare_generic_reg literal,            8,x
declare_generic_reg next_hash,          9,x
declare_generic_reg start_stream_saved, 10,x
declare_generic_reg D,                  11,w
declare_generic_reg dist_histogram,     12,x

/*
 * Constants: histogram entry counts and byte offsets from the histogram
 * pointer. Counters are accessed as 8-byte values by the code below.
 */
.equ LIT_LEN, 286
.equ DIST_LEN, 30
.equ lit_len_offset, 0
.equ dist_offset, (lit_len_offset + 8*LIT_LEN) // 2288
.equ hash_offset, (dist_offset + 8*DIST_LEN) // 2528
.equ hash_table_size, (2*8*1024) // 16384
/*
 * isal_update_histogram_aarch64
 *
 * C prototype:
 *   void isal_update_histogram_aarch64(uint8_t *start_stream, int length,
 *                                      struct isal_huff_histogram *histogram);
 *
 * In:   x0 = start_stream, w1 = length (signed 32-bit), x2 = histogram
 * Out:  none; the histogram is updated in memory.
 *
 * Behaviour as implemented below:
 *   - length <= 0: returns immediately without touching memory.
 *   - Otherwise hash_table_size bytes at histogram + hash_offset are zeroed
 *     with memset, then the input is scanned. At each position the 4 bytes
 *     at current are hashed (crc32cw, low 13 bits) into a table of 16-bit
 *     positions. If the previous position with the same hash gives a
 *     distance of 1..32767 and compare_max_258_bytes reports a match longer
 *     than 3, one distance counter and one length counter are incremented
 *     and the match is skipped; otherwise the byte at current is counted
 *     as a literal.
 *   - The last (up to 3) bytes are counted as literals, and finally the
 *     counter at index 256 of the literal/length histogram is incremented.
 *
 * Stack: 64-byte frame holding x29, x30 and x19-x23.
 * Scratch: x0-x12 are used freely; memset is called, so caller-saved
 * registers must be assumed clobbered.
 */
cdecl(isal_update_histogram_aarch64):
// Signed test: nothing to do for length <= 0. No frame exists on this path.
cmp w_length, 0
ble .done
// Prologue: save frame pointer, link register and the callee-saved
// registers x19-x23 used as locals.
stp x29, x30, [sp, -64]!
add x29, sp, 0
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
str x23, [sp, 48]
// last_seen = histogram + hash_offset (table of 16-bit entries, see the
// ldrh/strh accesses below).
add last_seen, histogram, hash_offset
// end_stream = start_stream + sign-extended length
add end_stream, start_stream, w_length, sxtw
mov current, start_stream
// The main loop loads 4 bytes at current, so it only runs while
// current < end_stream - 3.
sub loop_end_iter, end_stream, #3
// Keep the histogram base in x23 across the call; lit_len_histogram is an
// alias of the same register.
mov histogram_saved, histogram
// memset(last_seen, 0, hash_table_size)
mov x0, last_seen
mov w1, 0
mov x2, hash_table_size
bl cdecl(memset)
// Fewer than 4 input bytes (current >= loop_end_iter, unsigned): go
// straight to the tail.
cmp current, loop_end_iter
bcs .loop_end
// These live in caller-saved registers, so they are set up only after the
// memset call.
mov start_stream_saved, current
add dist_histogram, histogram_saved, dist_offset
// Largest accepted value of (dist - 1); see the check in .loop.
mov D, 32766
b .loop
.align 2
// Literal path: count the low byte of the word loaded at current and step
// to current + 1 (held in next_hash).
.loop_2nd_stream:
and literal, literal, 0xff
mov current, next_hash
// Flags set here are consumed by the bls below; the ldr/add/str in between
// do not modify them.
cmp loop_end_iter, current
ldr x0, [lit_len_histogram, literal, lsl 3]
add x0, x0, 1
str x0, [lit_len_histogram, literal, lsl 3]
bls .loop_end
// Main loop: one iteration per candidate position.
.loop:
// literal = 4 bytes at current; next_hash = current + 1
ldr w_literal, [current]
add next_hash, current, 1
// hash = crc32cw(0, literal)
mov w0, w_literal
crc32cw w0, wzr, w0
// x0 = (hash & 0x1fff) << 1: byte offset of the 16-bit table entry
ubfiz x0, x0, 1, 13
// x2 = position of current relative to the start of the stream
sub x2, current, start_stream_saved
// Read the previously stored position for this hash, then store the low
// 16 bits of the current position.
ldrh w_dist, [last_seen, x0]
strh w2, [last_seen, x0]
// dist = (position - previous position) & 0xffff
sub w2, w2, w_dist
and w_dist, w2, 65535
// Accept only 1 <= dist <= 32767: dist == 0 wraps to 0xffffffff here and is
// rejected by the unsigned compare.
sub w0, w_dist, #1
cmp w0, D
bhi .loop_2nd_stream
// Set up the compare: x0 = current - dist, x1 = current,
// w2 = bytes remaining (end_stream - current).
sub w2, w_end_stream, w_current
mov x1, current
sub x0, current, w_dist, uxth
// NOTE(review): compare_max_258_bytes is defined outside this file.
// Presumably it compares the bytes at param0 and param1 for at most
// min(param2, 258) bytes, leaves the match length in match_length and uses
// tmp0/tmp1 as scratch -- confirm against its definition.
compare_max_258_bytes param0,param1,param2,match_length,tmp0,tmp1
// Matches of length 3 or less are treated as a literal.
cmp match_length, 3
bls .loop_2nd_stream
// end = min(current + 3, loop_end_iter): upper bound of the positions whose
// hashes are inserted for this match.
add end, current, 3
cmp end, loop_end_iter
csel end, end, loop_end_iter, ls
cmp end, next_hash
bls .skip_inner_loop
.align 3
// Insert hash entries for positions next_hash .. end - 1.
.inner_loop:
ldr w0, [next_hash]
crc32cw w0, wzr, w0
ubfiz x0, x0, 1, 13
sub x1, next_hash, start_stream_saved
add next_hash, next_hash, 1
// The strh below does not touch the flags; bne uses this compare.
cmp next_hash, end
strh w1, [last_seen, x0]
bne .inner_loop
.skip_inner_loop:
// dist_histogram[dist symbol] += 1
convert_dist_to_dist_sym dist, tmp0, tmp1
uxtw x2, w_dist
ldr x1, [dist_histogram, x2, lsl 3]
add x1, x1, 1
str x1, [dist_histogram, x2, lsl 3]
// lit_len_histogram[length symbol] += 1 (symbol is returned in tmp1)
convert_length_to_len_sym match_length,tmp1,tmp0
uxtw x0, w_tmp1
ldr x1, [lit_len_histogram, x0, lsl 3]
add x1, x1, 1
str x1, [lit_len_histogram, x0, lsl 3]
// current += match_length.
// NOTE(review): the sub on w3 followed by the add on x3 cancel out; the net
// effect is x3 = zero-extended match_length (match_length is at least 4
// here, so the 32-bit subtract cannot wrap).
sub match_length, match_length, #1
add x3, x3, 1
add current, current, x3
cmp loop_end_iter, current
bhi .loop
.align 3
// Tail: count each remaining byte in [current, end_stream) as a literal.
// The loop is unrolled into four steps, each guarded by an end_stream check.
.loop_end:
cmp end_stream, current
bls .loop_fold_end
// Byte at current; x0 is post-incremented to current + 1 for the next check.
mov x0, current
ldrb w1, [x0], 1
cmp end_stream, x0
ldr x0, [lit_len_histogram, x1, lsl 3]
add x0, x0, 1
str x0, [lit_len_histogram, x1, lsl 3]
bls .loop_fold_end
// Byte at current + 1
ldrb w1, [current, 1]
add x0, current, 2
cmp end_stream, x0
ldr x0, [lit_len_histogram, x1, lsl 3]
add x0, x0, 1
str x0, [lit_len_histogram, x1, lsl 3]
bls .loop_fold_end
// Byte at current + 2
ldrb w1, [current, 2]
add x0, current, 3
cmp end_stream, x0
ldr x0, [lit_len_histogram, x1, lsl 3]
add x0, x0, 1
str x0, [lit_len_histogram, x1, lsl 3]
bls .loop_fold_end
// Byte at current + 3.
// NOTE(review): every branch into .loop_end is taken with
// current >= end_stream - 3, so this fourth step looks unreachable -- confirm.
ldrb w1, [current, 3]
ldr x0, [lit_len_histogram, x1, lsl 3]
add x0, x0, 1
str x0, [lit_len_histogram, x1, lsl 3]
.loop_fold_end:
// lit_len_histogram[256] += 1 (presumably the end-of-block symbol -- confirm)
ldr x0, [lit_len_histogram, (256*8)]
add x0, x0, 1
str x0, [lit_len_histogram, (256*8)]
// Epilogue: restore callee-saved registers and release the 64-byte frame.
ldr x23, [sp, 48]
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x29, x30, [sp], 64
ret
.align 2
// Early exit for length <= 0: no frame was created, so return directly.
.done:
ret
#ifndef __APPLE__
.size isal_update_histogram_aarch64, .-isal_update_histogram_aarch64
#endif