/*
 * Mirror of https://github.com/intel/isa-l.git
 * Synced 2025-01-05 22:59:52 +01:00, commit 1187583a97.
 *
 * Commit message:
 *   - It should be fine to enable pmull always on Apple Silicon
 *   - macOS 12+ is required for PMULL instruction.
 *   - Changed the conditional macro to __APPLE__
 *   - Rewritten dispatcher using sysctlbyname
 *   - Use __USER_LABEL_PREFIX__
 *   - Use __TEXT,__const as readonly section
 *   - use ASM_DEF_RODATA macro
 *   - fix func decl
 *   Change-Id: I800593f21085d8187b480c8bb3ab2bd70c4a6974
 *   Signed-off-by: Taiju Yamada <tyamada@bi.a.u-tokyo.ac.jp>
 *
 * File listing: 187 lines, 4.6 KiB, ArmAsm.
 */
/**********************************************************************
  Copyright(c) 2019 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Arm Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

#include "../include/aarch64_label.h"

/*
 * Assembler setup: accept ARMv8-A instructions including the crypto
 * extension, emit into the code section, and align what follows to
 * 2^3 = 8 bytes.
 */
	.arch	armv8-a+crypto
	.text
	.p2align 3

/*
 * Macros
 */

/*
 * declare_var_vector_reg name, reg
 *
 * Creates four register aliases for SIMD register number reg, one for each
 * view of that register:
 *
 *   name_v -> v<reg>    vector view, used with an arrangement such as .16b
 *   name_q -> q<reg>    128-bit scalar view
 *   name_d -> d<reg>    64-bit scalar view
 *   name_s -> s<reg>    32-bit scalar view
 *
 * The macro only defines aliases; it emits no instructions.
 */
	.macro	declare_var_vector_reg name:req,reg:req
	\name\()_v	.req	v\reg
	\name\()_q	.req	q\reg
	\name\()_d	.req	d\reg
	\name\()_s	.req	s\reg
	.endm

/*
 * mod_adler dest, tmp
 *
 * Reduces the unsigned 32-bit value in dest modulo const_div2, in place,
 * using a multiply and shift instead of a divide:
 *
 *   q    = (dest * const_div1) >> 47
 *   dest = dest - q * const_div2
 *
 * dest  32-bit register alias; holds the input and receives the remainder.
 * tmp   32-bit scratch register alias. A 64-bit alias of the same register
 *       named <tmp>_x must exist, because the product needs 64 bits.
 *
 * The macro relies on two register aliases that the surrounding code must
 * define and load before use. adler32_neon loads const_div1 with 0x80078071,
 * which is ceil(2^47 / 65521), and const_div2 with 65521.
 *
 * Clobbers tmp. Does not touch the condition flags.
 */
	.macro	mod_adler dest:req,tmp:req
	umull	\tmp\()_x, \dest, const_div1
	lsr	\tmp\()_x, \tmp\()_x, 47
	msub	\dest, \tmp, const_div2, \dest
	.endm

/*
 * adler32_neon - update an Adler-32 checksum over a buffer using NEON.
 *
 * C view:
 *   uint32_t adler32_neon(uint32_t adler32, uint8_t *start, uint64_t length);
 *
 * NOTE(review): this header used to declare length as uint32_t, but the code
 * reads all 64 bits of x2. Under AAPCS64 the upper half of a register that
 * carries a 32-bit argument is unspecified, so the C declaration has to use a
 * 64-bit length. Confirm against the caller.
 *
 * In:   w0 = running checksum; bits 0-15 are sum A, bits 16-31 are sum B
 *       x1 = address of the first input byte
 *       x2 = number of input bytes
 * Out:  w0 = (B << 16) | A, with A and B reduced modulo 65521
 * Uses: x0-x8, x10, x11, v2-v7, v16, v17, v20 and the condition flags.
 *       No stack is used and no callee-saved register is touched.
 *
 * Method: full 32-byte blocks are summed with NEON in batches of at most
 * ADLER_NEON_MAX_BLOCKS blocks, with one modulo reduction per batch. The
 * remaining 0-31 bytes are summed one at a time and reduced once at the end.
 */

/*
 * Largest number of 32-byte blocks summed between two reductions
 * (173 * 32 = 5536 bytes). Presumably chosen so that the 32-bit sums cannot
 * overflow before they are reduced.
 */
#define ADLER_NEON_MAX_BLOCKS	173

/*
 * Argument list
 */
	adler32		.req	w0
	start		.req	x1
	length		.req	x2

	.global	cdecl(adler32_neon)
#ifndef __APPLE__
	.type	adler32_neon, %function
#endif
cdecl(adler32_neon):
	/*
	 * Local variables
	 *
	 * factor0, factor1  per-byte weights loaded from the factors table
	 * d0, d1            the 32 input bytes of the current block
	 * adacc             per-lane partial sums of A
	 * s2acc             per-lane partial sums of B
	 * zero              all-zero vector used to clear the accumulators
	 * adler             16-bit pairwise byte sums of the current block
	 * sum2, tmp2        both name v20; tmp2 is consumed before sum2 is
	 *                   written in every loop iteration
	 */
	declare_var_vector_reg	factor0, 6
	declare_var_vector_reg	factor1, 7
	declare_var_vector_reg	d0, 4
	declare_var_vector_reg	d1, 5
	declare_var_vector_reg	adacc, 2
	declare_var_vector_reg	s2acc, 3
	declare_var_vector_reg	zero, 16
	declare_var_vector_reg	adler, 17
	declare_var_vector_reg	sum2, 20
	declare_var_vector_reg	tmp2, 20

	/*
	 * adler0, adler1          scalar sums A and B (with 64-bit views)
	 * end                     one past the last input byte; reuses x0
	 * tmp, tmp_x              scratch; also the end address of a batch
	 * loop_cnt                32-byte blocks still to process
	 * loop_const              blocks in the current batch
	 * const_div1, const_div2  constants required by mod_adler
	 */
	adler0		.req	w4
	adler1		.req	w5
	adler0_x	.req	x4
	adler1_x	.req	x5
	end		.req	x0
	tmp		.req	w8
	tmp_x		.req	x8
	loop_cnt	.req	x10
	loop_const	.req	x11
	const_div1	.req	w6
	const_div2	.req	w7

	mov	const_div1, 32881		/* low half of 0x80078071 */
	movk	const_div1, 0x8007, lsl 16
	mov	const_div2, 65521
	and	adler0, adler32, 0xffff		/* A = low 16 bits */
	lsr	adler1, adler32, 16		/* B = high 16 bits */

	lsr	loop_cnt, length, 5		/* number of full 32-byte blocks */
#ifndef __APPLE__
	adrp	x3, factors
	add	x3, x3, :lo12:factors
#else
	adrp	x3, factors@PAGE
	add	x3, x3, factors@PAGEOFF
#endif
	ld1	{factor0_v.16b-factor1_v.16b}, [x3]

	/* x0 is free from here on: the incoming checksum is now in adler0/adler1 */
	add	end, start, length
	cbz	loop_cnt, final_accum32		/* fewer than 32 bytes: scalar path only */
	mov	loop_const, ADLER_NEON_MAX_BLOCKS

	movi	zero_v.4s, 0

great_than_32:
	/* loop_const = min(loop_cnt, ADLER_NEON_MAX_BLOCKS), unsigned */
	cmp	loop_cnt, ADLER_NEON_MAX_BLOCKS
	csel	loop_const, loop_cnt, loop_const, ls

	/* Seed lane 0 of each accumulator with the running sums, clear the rest */
	mov	adacc_v.16b, zero_v.16b
	mov	s2acc_v.16b, zero_v.16b
	ins	adacc_v.s[0], adler0
	ins	s2acc_v.s[0], adler1
	add	tmp_x, start, loop_const, lsl 5	/* address where this batch ends */

accum32_neon:
	ld1	{d0_v.16b-d1_v.16b}, [start]
	add	start, start, 32

	/* B += 32 * A, using A as it was before this block, lane by lane */
	shl	tmp2_v.4s, adacc_v.4s, 5
	add	s2acc_v.4s, s2acc_v.4s, tmp2_v.4s

	/* A += sum of the 32 bytes, widened 8 -> 16 -> 32 bits */
	uaddlp	adler_v.8h, d0_v.16b
	uadalp	adler_v.8h, d1_v.16b
	uadalp	adacc_v.4s, adler_v.8h

	/* B += sum of byte[i] * factor[i] over the 32 bytes */
	umull	sum2_v.8h, factor0_v.8b, d0_v.8b
	umlal2	sum2_v.8h, factor0_v.16b, d0_v.16b
	umlal	sum2_v.8h, factor1_v.8b, d1_v.8b
	umlal2	sum2_v.8h, factor1_v.16b, d1_v.16b
	uadalp	s2acc_v.4s, sum2_v.8h

	cmp	start, tmp_x
	bne	accum32_neon

	/* Fold the four lanes of each accumulator into one scalar */
	uaddlv	adacc_d, adacc_v.4s
	uaddlv	s2acc_d, s2acc_v.4s
	fmov	adler0_x, adacc_d
	fmov	adler1_x, s2acc_d

	/* mod_adler works on the 32-bit views adler0 and adler1 */
	mod_adler	adler0, tmp
	mod_adler	adler1, tmp
	sub	loop_cnt, loop_cnt, loop_const
	cbnz	loop_cnt, great_than_32

final_accum32:
	and	length, length, 31		/* bytes left after the full blocks */
	cbz	length, end_func

accum32_body:
	/* Scalar tail: at most 31 bytes, so no reduction is needed inside the loop */
	cmp	start, end
	beq	end_func
	ldrb	tmp, [start], 1
	add	adler0, adler0, tmp
	add	adler1, adler1, adler0
	b	accum32_body

end_func:
	mod_adler	adler0, tmp
	mod_adler	adler1, tmp
	orr	w0, adler0, adler1, lsl 16	/* result = (B << 16) | A */
	ret

#ifndef __APPLE__
	.size	adler32_neon, .-adler32_neon
	/*
	 * Plain read-only data. The table is 32 bytes long and is loaded as a
	 * single unit through one symbol reference, so it must not live in a
	 * mergeable section with 16-byte entities (.rodata.cst16): a linker may
	 * de-duplicate, reorder or discard each 16-byte entity on its own, and
	 * only the first one is referenced by a relocation.
	 */
	.section	.rodata
#else
	.section	__TEXT,__const
#endif
	.p2align 4			/* 16-byte alignment */
/*
 * Weights for the B sum of one 32-byte block: the byte at offset i is
 * multiplied by 32 - i. Written as bytes so the memory layout does not depend
 * on the target byte order.
 */
factors:
	.byte	32, 31, 30, 29, 28, 27, 26, 25
	.byte	24, 23, 22, 21, 20, 19, 18, 17
	.byte	16, 15, 14, 13, 12, 11, 10,  9
	.byte	 8,  7,  6,  5,  4,  3,  2,  1