mirror of
https://github.com/intel/isa-l.git
synced 2025-01-19 12:27:43 +01:00
1187583a97
- It should be fine to enable pmull always on Apple Silicon - macOS 12+ is required for PMULL instruction. - Changed the conditional macro to __APPLE__ - Rewritten dispatcher using sysctlbyname - Use __USER_LABEL_PREFIX__ - Use __TEXT,__const as readonly section - use ASM_DEF_RODATA macro - fix func decl Change-Id: I800593f21085d8187b480c8bb3ab2bd70c4a6974 Signed-off-by: Taiju Yamada <tyamada@bi.a.u-tokyo.ac.jp>
437 lines
12 KiB
ArmAsm
437 lines
12 KiB
ArmAsm
########################################################################
|
|
# Copyright(c) 2019 Arm Corporation All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in
|
|
# the documentation and/or other materials provided with the
|
|
# distribution.
|
|
# * Neither the name of Arm Corporation nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#########################################################################
|
|
|
|
#include "../include/aarch64_label.h"
|
|
|
|
/*
 * uint16_t crc16_t10dif_copy_pmull(uint16_t seed, uint8_t *dst,
 *                                  uint8_t *src, uint64_t len)
 *
 * Copies len bytes from src to dst and returns the 16-bit CRC of those
 * bytes, starting from seed.
 *
 * In:    w0 = seed (only the low 16 bits are used)
 *        x1 = dst
 *        x2 = src
 *        x3 = len (full 64-bit byte count)
 * Out:   w0 = updated CRC, zero-extended from 16 bits
 * Clobb: x4-x7, x11, x12, v0-v7, v16-v18, v24-v31, flags
 *
 * Layout:
 *   len <  64 : byte-at-a-time lookup in crc16tab.
 *   len >= 64 : the leading (len & -64) bytes are processed 64 bytes per
 *               step with PMULL folding, reduced to 16 bits, and the
 *               remaining (len & 63) bytes go through the table loop.
 *
 * Leaf function: no stack frame, no callee-saved registers touched.
 */
	.arch armv8-a+crc+crypto
	.text
	.align	3
	.global	cdecl(crc16_t10dif_copy_pmull)
#ifndef __APPLE__
	.type	crc16_t10dif_copy_pmull, %function
#endif

/* arguments */
w_seed		.req	w0
x_dst		.req	x1
x_src		.req	x2
x_len		.req	x3

/* returns */
w_ret		.req	w0

/* these as global temporary registers */
w_tmp		.req	w6
x_tmp		.req	x6
x_tmp1		.req	x7
x_tmp2		.req	x11

d_tmp1		.req	d0
d_tmp2		.req	d1
q_tmp1		.req	q0
q_tmp2		.req	q1
v_tmp1		.req	v0
v_tmp2		.req	v1

/* local variables */
w_counter	.req	w4
w_crc		.req	w0
x_crc		.req	x0
x_counter	.req	x4
x_crc16tab	.req	x5
x_src_saved	.req	x0	/* shares x0 with the CRC; only live while the CRC sits in a vector register */
x_dst_saved	.req	x12

cdecl(crc16_t10dif_copy_pmull):
	cmp	x_len, 63
	uxth	w_seed, w_seed		/* crc = seed & 0xffff */
	bhi	.crc_fold		/* len >= 64: fold 64-byte blocks first */

	mov	x_counter, 0		/* short input: table loop starts at offset 0 */

	/*
	 * Entry invariant: x_counter = number of bytes already copied and
	 * folded into w_crc (a multiple of 64, or 0); x_src / x_dst still
	 * hold the caller's base pointers.
	 */
.crc_table_loop_pre:
	cmp	x_len, x_counter
	bls	.end			/* nothing left to process */

#ifndef __APPLE__
	adrp	x_crc16tab, .LANCHOR0
	add	x_crc16tab, x_crc16tab, :lo12:.LANCHOR0
#else
	adrp	x_crc16tab, .LANCHOR0@PAGE
	add	x_crc16tab, x_crc16tab, .LANCHOR0@PAGEOFF
#endif

	.align 2
.crc_table_loop:
	ldrb	w_tmp, [x_src, x_counter]
	strb	w_tmp, [x_dst, x_counter]	/* copy the byte */
	add	x_counter, x_counter, 1
	cmp	x_len, x_counter		/* flags consumed by bhi below */
	eor	w_tmp, w_tmp, w_crc, lsr 8	/* index = byte ^ (crc >> 8) */
	ldrh	w_tmp, [x_crc16tab, w_tmp, sxtw 1]
	eor	w_crc, w_tmp, w_crc, lsl 8	/* crc = tab[index] ^ (crc << 8) */
	uxth	w_crc, w_crc
	bhi	.crc_table_loop

.end:
	ret

/* carry less multiplication, part1 - before loop */
q_x0		.req	q2
q_x1		.req	q3
q_x2		.req	q4
q_x3		.req	q5

v_x0		.req	v2
v_x1		.req	v3
v_x2		.req	v4
v_x3		.req	v5

d_x0		.req	d2
d_x1		.req	d3
d_x2		.req	d4
d_x3		.req	d5

q_permutation	.req	q7
v_permutation	.req	v7

// the following registers only used this part1
d_tmp3		.req	d16
v_tmp3		.req	v16

	.align 3
.crc_fold:
	/* v_tmp3 = { low 64 bits: 0, high 64 bits: crc << 48 } */
	fmov	d_tmp1, x_crc
	fmov	d_tmp2, xzr
	dup	d_tmp3, v_tmp2.d[0]
	shl	d_tmp1, d_tmp1, 48
	ins	v_tmp3.d[1], v_tmp1.d[0]

	/* x_counter = bytes in whole 64-byte blocks after the first one */
	and	x_counter, x_len, -64
	sub	x_counter, x_counter, #64
	cmp	x_counter, 63			/* flags consumed by bls at the end of part1 */
	add	x_src_saved, x_src, 64
	add	x_dst_saved, x_dst, 64

	/* first 64-byte block: load, copy to dst */
	ldp	q_x0, q_x1, [x_src]
	ldp	q_x2, q_x3, [x_src, 32]

	stp	q_x0, q_x1, [x_dst]
	stp	q_x2, q_x3, [x_dst, 32]

#ifndef __APPLE__
	adrp	x_tmp, .shuffle_mask_lanchor
	ldr	q_permutation, [x_tmp, :lo12:.shuffle_mask_lanchor]
#else
	adrp	x_tmp, .shuffle_mask_lanchor@PAGE
	ldr	q_permutation, [x_tmp, .shuffle_mask_lanchor@PAGEOFF]
#endif

	/* byte-reverse each 16-byte lane; xor the positioned seed into x0 */
	tbl	v_tmp1.16b, {v_x0.16b}, v_permutation.16b
	eor	v_x0.16b, v_tmp3.16b, v_tmp1.16b

	tbl	v_x1.16b, {v_x1.16b}, v_permutation.16b
	tbl	v_x2.16b, {v_x2.16b}, v_permutation.16b
	tbl	v_x3.16b, {v_x3.16b}, v_permutation.16b
	bls	.crc_fold_loop_end		/* only one whole block: skip the loop */

/* carry less multiplication, part2 - loop */
q_y0		.req	q28
q_y1		.req	q29
q_y2		.req	q30
q_y3		.req	q31

v_y0		.req	v28
v_y1		.req	v29
v_y2		.req	v30
v_y3		.req	v31

d_x0_h		.req	d24
d_x0_l		.req	d2
d_x1_h		.req	d25
d_x1_l		.req	d3
d_x2_h		.req	d26
d_x2_l		.req	d4
d_x3_h		.req	d27
d_x3_l		.req	d5

v_x0_h		.req	v24
v_x0_l		.req	v2
v_x1_h		.req	v25
v_x1_l		.req	v3
v_x2_h		.req	v26
v_x2_l		.req	v4
v_x3_h		.req	v27
v_x3_l		.req	v5

v_tmp1_x0	.req	v24
v_tmp1_x1	.req	v25
v_tmp1_x2	.req	v26
v_tmp1_x3	.req	v27

q_fold_const	.req	q17
v_fold_const	.req	v17

	ldr	q_fold_const, fold_constant

	.align 2
.crc_fold_loop:
	add	x_src_saved, x_src_saved, 64
	add	x_dst_saved, x_dst_saved, 64

	sub	x_counter, x_counter, #64
	cmp	x_counter, 63			/* flags consumed by bhi at the loop bottom */

	/* next 64-byte block: load, copy to dst */
	ldp	q_y0, q_y1, [x_src_saved, -64]
	ldp	q_y2, q_y3, [x_src_saved, -32]

	stp	q_y0, q_y1, [x_dst_saved, -64]
	stp	q_y2, q_y3, [x_dst_saved, -32]

	prfm	pldl2strm, [x_src_saved, #1024]
	prfm	pldl2strm, [x_src_saved, #1088]

	/* x[i] = clmul(x[i].hi, k.hi) ^ clmul(x[i].lo, k.lo) ^ bswap(y[i]) */
	pmull2	v_tmp1_x0.1q, v_x0.2d, v_fold_const.2d
	pmull	v_x0.1q, v_x0.1d, v_fold_const.1d

	pmull2	v_tmp1_x1.1q, v_x1.2d, v_fold_const.2d
	pmull	v_x1.1q, v_x1.1d, v_fold_const.1d

	pmull2	v_tmp1_x2.1q, v_x2.2d, v_fold_const.2d
	pmull	v_x2.1q, v_x2.1d, v_fold_const.1d

	pmull2	v_tmp1_x3.1q, v_x3.2d, v_fold_const.2d
	pmull	v_x3.1q, v_x3.1d, v_fold_const.1d

	tbl	v_y0.16b, {v_y0.16b}, v_permutation.16b
	eor	v_x0.16b, v_tmp1_x0.16b, v_x0.16b
	eor	v_x0.16b, v_x0.16b, v_y0.16b

	tbl	v_y1.16b, {v_y1.16b}, v_permutation.16b
	eor	v_x1.16b, v_tmp1_x1.16b, v_x1.16b
	eor	v_x1.16b, v_x1.16b, v_y1.16b

	tbl	v_y2.16b, {v_y2.16b}, v_permutation.16b
	eor	v_x2.16b, v_tmp1_x2.16b, v_x2.16b
	eor	v_x2.16b, v_x2.16b, v_y2.16b

	tbl	v_y3.16b, {v_y3.16b}, v_permutation.16b
	eor	v_x3.16b, v_tmp1_x3.16b, v_x3.16b
	eor	v_x3.16b, v_x3.16b, v_y3.16b

	bhi	.crc_fold_loop

/* carry less multiplication, part3 - after loop */
/* folding 512bit ---> 128bit */

// input parameters:
// v_x0 => v2
// v_x1 => v3
// v_x2 => v4
// v_x3 => v5

// v0, v1, v6, v16, v30 are tmp registers

.crc_fold_loop_end:
	mov	x_tmp, 0x4c1a0000	/* p1 [1] */
	fmov	d0, x_tmp
	mov	x_tmp, 0xfb0b0000	/* p1 [0] */
	fmov	d1, x_tmp

	/*
	 * Bytes consumed by the folding path, i.e. the offset at which the
	 * table loop resumes.  Computed in 64 bits: len is a 64-bit count,
	 * and masking/sign-extending only its low 32 bits would yield a
	 * wrong (possibly negative) offset for len >= 2 GiB.
	 */
	and	x_counter, x_len, -64

	/* x1 ^= fold(x0) */
	dup	d6, v_x0.d[1]
	dup	d30, v_x0.d[0]
	pmull	v6.1q, v6.1d, v0.1d
	pmull	v30.1q, v30.1d, v1.1d
	eor	v6.16b, v6.16b, v30.16b
	eor	v_x1.16b, v6.16b, v_x1.16b

	/* x2 ^= fold(x1) */
	dup	d6, v_x1.d[1]
	dup	d30, v_x1.d[0]
	pmull	v6.1q, v6.1d, v0.1d
	pmull	v16.1q, v30.1d, v1.1d
	eor	v6.16b, v6.16b, v16.16b
	eor	v_x2.16b, v6.16b, v_x2.16b

	/* x0 = fold(x2) ^ x3 */
	dup	d_x0, v_x2.d[1]
	dup	d30, v_x2.d[0]
	pmull	v0.1q, v_x0.1d, v0.1d
	pmull	v_x0.1q, v30.1d, v1.1d
	eor	v1.16b, v0.16b, v_x0.16b
	eor	v_x0.16b, v1.16b, v_x3.16b

/* carry less multiplication, part3 - after loop */
/* crc16 fold function */
d_16fold_p0_h		.req	d18
v_16fold_p0_h		.req	v18

d_16fold_p0_l		.req	d4
v_16fold_p0_l		.req	v4

v_16fold_from		.req	v_x0
d_16fold_from_h		.req	d3
v_16fold_from_h		.req	v3

v_16fold_zero		.req	v7	/* reuses v7: the shuffle mask is no longer needed */

v_16fold_from1		.req	v16

v_16fold_from2		.req	v0
d_16fold_from2_h	.req	d6
v_16fold_from2_h	.req	v6

v_16fold_tmp		.req	v0

	movi	v_16fold_zero.4s, 0
	mov	x_tmp1, 0x2d560000	/* p0 [1] */
	mov	x_tmp2, 0x13680000	/* p0 [0] */

	ext	v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
	ext	v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4

	dup	d_16fold_from_h, v_16fold_from.d[1]
	fmov	d_16fold_p0_h, x_tmp1
	pmull	v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
	eor	v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b

	dup	d_16fold_from2_h, v_16fold_from2.d[1]
	fmov	d_16fold_p0_l, x_tmp2
	pmull	v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
	eor	v_x0.16b, v0.16b, v6.16b

/* carry less multiplication, part3 - after loop */
/* crc16 barrett reduction function */

// input parameters:
// v_x0:			v2
// barrett reduction constant:	br[0], br[1]

d_br0		.req	d3
v_br0		.req	v3
d_br1		.req	d5
v_br1		.req	v5

	mov	x_tmp1, 0x57f9			/* br[0] low */
	movk	x_tmp1, 0xf65a, lsl 16		/* br[0] high */
	movk	x_tmp1, 0x1, lsl 32
	fmov	d_br0, x_tmp1

	dup	d1, v_x0.d[0]
	dup	d1, v1.d[0]
	ext	v1.16b, v1.16b, v7.16b, #4
	pmull	v4.1q, v1.1d, v_br0.1d

	ext	v1.16b, v4.16b, v7.16b, #4
	mov	x_tmp1, 0x8bb70000		/* br[1] low */
	movk	x_tmp1, 0x1, lsl 32		/* br[1] high */

	fmov	d_br1, x_tmp1
	pmull	v_br1.1q, v1.1d, v_br1.1d
	eor	v_x0.16b, v_x0.16b, v_br1.16b

	/* crc = bits [31:16] of the reduced value; x0 is the CRC again from here */
	umov	x0, v_x0.d[0]
	ubfx	x0, x0, 16, 16
	b	.crc_table_loop_pre		/* finish the len & 63 tail bytes */
#ifndef __APPLE__
	.size	crc16_t10dif_copy_pmull, .-crc16_t10dif_copy_pmull
#endif
|
|
|
|
/*
 * Read-only data used by crc16_t10dif_copy_pmull.
 * ASM_DEF_RODATA comes from the included label header and selects the
 * read-only section.
 */
ASM_DEF_RODATA
	.align	4

/*
 * 128-bit folding constant loaded into q17 by the 64-byte loop, written
 * as four 32-bit words.  The first pair of words is the operand of the
 * pmull instructions, the second pair the operand of the pmull2
 * instructions.
 */
fold_constant:
	.word	0x87e70000, 0x00000000
	.word	0x371d0000, 0x00000000

/* tbl index vector 15..0: reverses the byte order of a 16-byte lane */
.shuffle_mask_lanchor = . + 0
#ifndef __APPLE__
	.type	shuffle_mask, %object
	.size	shuffle_mask, 16
#endif
shuffle_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * 256-entry, 16-bit lookup table for the byte-at-a-time loop, indexed by
 * (input byte) ^ (crc >> 8).  Reached through .LANCHOR0.
 */
	.align	4
.LANCHOR0 = . + 0
#ifndef __APPLE__
	.type	crc16tab, %object
	.size	crc16tab, 512
#endif
crc16tab:
	.hword	0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
	.hword	0xefbd, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
	.hword	0x54cd, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
	.hword	0xbb70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
	.hword	0xa99a, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
	.hword	0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
	.hword	0xfd57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
	.hword	0x12ea, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
	.hword	0xd883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
	.hword	0x373e, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
	.hword	0x8c4e, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
	.hword	0x63f3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
	.hword	0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
	.hword	0x9ea4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
	.hword	0x25d4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
	.hword	0xca69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
	.hword	0x3ab1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
	.hword	0xd50c, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
	.hword	0x6e7c, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
	.hword	0x81c1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
	.hword	0x932b, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
	.hword	0x7c96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
	.hword	0xc7e6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
	.hword	0x285b, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
	.hword	0xe232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
	.hword	0x0d8f, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
	.hword	0xb6ff, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
	.hword	0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
	.hword	0x4ba8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
	.hword	0xa415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
	.hword	0x1f65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
	.hword	0xf0d8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3