isa-l/crc/aarch64/crc_common_pmull.h
Taiju Yamada 1187583a97 Fixes for aarch64 mac
- It should be fine to enable pmull always on Apple Silicon
- macOS 12+ is required for PMULL instruction.
- Changed the conditional macro to __APPLE__
- Rewritten dispatcher using sysctlbyname
- Use __USER_LABEL_PREFIX__
- Use __TEXT,__const as readonly section
- use ASM_DEF_RODATA macro
- fix func decl

Change-Id: I800593f21085d8187b480c8bb3ab2bd70c4a6974
Signed-off-by: Taiju Yamada <tyamada@bi.a.u-tokyo.ac.jp>
2022-10-28 08:27:26 -07:00

310 lines
8.4 KiB
C

########################################################################
# Copyright (c) 2019 Microsoft Corporation.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Microsoft Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################
#include "../include/aarch64_label.h"
// Register aliases used by the PMULL-based CRC macros in this header.
// Several aliases intentionally name the same physical register; each
// overlap is called out below because the values cannot be live at the
// same time.
// parameters
// (AAPCS64 passes the first three integer arguments in x0-x2)
#define w_seed w0
#define x_seed x0
#define x_buf x1
#define w_len w2
#define x_len x2
// return
// (shares x0/w0 with the seed argument)
#define w_crc_ret w0
#define x_crc_ret x0
// constant
// NOTE(review): presumably the number of bytes folded per loop iteration;
// the macros in this header use the literal 64 rather than this name.
#define FOLD_SIZE 64
// global variables
// general-purpose scratch registers shared by all macros
#define x_buf_end x3
#define w_counter w4
#define x_counter x4
#define x_buf_iter x5
// x_crc_tab_addr and x_tmp2 are both x6
#define x_crc_tab_addr x6
#define x_tmp2 x6
#define w_tmp w7
#define x_tmp x7
// folding accumulators: four 128-bit lanes x0..x3 in v0..v3
#define v_x0 v0
#define d_x0 d0
#define s_x0 s0
#define q_x1 q1
#define v_x1 v1
#define q_x2 q2
#define v_x2 v2
#define q_x3 q3
#define v_x3 v3
#define d_x3 d3
#define s_x3 s3
// freshly loaded input lanes y0..y3 in v4..v7
// v_tmp_high/d_tmp_high reuse v4 (y0) and v_tmp_low reuses v5 (y1)
#define q_y0 q4
#define v_y0 v4
#define v_tmp_high v4
#define d_tmp_high d4
#define q_y1 q5
#define v_y1 v5
#define v_tmp_low v5
#define q_y2 q6
#define v_y2 v6
#define q_y3 q7
#define v_y3 v7
// v30 is time-shared: first-block staging (x0_tmp), then the folding
// constants p4, p1, p0 and br_low. Writing one of them destroys the others.
// The *_low names are the D view (bits 0-63), the *_high / *2 names are
// element d[1] (bits 64-127).
#define q_x0_tmp q30
#define v_x0_tmp v30
#define d_p4_high v30.d[1]
#define d_p4_low d30
#define v_p4 v30
#define d_p1_high v30.d[1]
#define d_p1_low d30
#define v_p1 v30
#define d_p0_high v30.d[1]
#define d_p0_low d30
#define v_p0 v30
#define d_br_low d30
#define d_br_low2 v30.d[1]
#define v_br_low v30
// v31 is time-shared as well: the tbl index vector (shuffle), br_high and
// a second p0 pair.
// NOTE(review): the br_* and p0* aliases are not referenced by the macros
// in this header; presumably they are used by the including files.
#define q_shuffle q31
#define v_shuffle v31
#define d_br_high d31
#define d_br_high2 v31.d[1]
#define v_br_high v31
#define d_p0_low2 d31
#define d_p0_high2 v31.d[1]
#define v_p02 v31
// destinations of the pmull2 (upper-lane) products inside the folding loops
#define v_x0_high v16
#define v_x1_high v17
#define v_x2_high v18
#define v_x3_high v19
// crc_refl_load_first_block
// Reads the first 64 bytes at x_buf and prepares the loop bookkeeping.
// In:   x_buf, x_len
//       v_x0 holds the value that is XORed into the first 16 input bytes
//       (NOTE(review): presumably the seed, set up by the includer).
// Out:  x_counter  = x_len rounded down to a multiple of 64
//       x_tmp      = x_counter - 64
//       x_buf_iter = x_buf + 64
//       v_x0       = v_x0 ^ bytes[0..15], v_x1..v_x3 = bytes[16..63]
//       NZCV flags = comparison of x_tmp against 63, left set on exit so
//       the code following the macro can branch on it
// Clobbers: v_x0_tmp (v30)
.macro crc_refl_load_first_block
	// integer bookkeeping; independent of the vector loads below
	and	x_counter, x_len, -64
	sub	x_tmp, x_counter, #64
	add	x_buf_iter, x_buf, 64
	cmp	x_tmp, 63

	// none of the following instructions modify the flags
	ldr	q_x0_tmp, [x_buf]
	ldr	q_x1, [x_buf, 16]
	ldr	q_x2, [x_buf, 32]
	ldr	q_x3, [x_buf, 48]
	eor	v_x0.16b, v_x0.16b, v_x0_tmp.16b
.endm
// crc_norm_load_first_block
// Same job as the reflected variant, but every 16-byte lane is passed
// through tbl with the index vector stored at .shuffle_data.
// NOTE(review): .shuffle_data is defined outside this header; presumably
// it is a byte-reversal table - confirm against the includer.
// In:   x_buf, x_len
//       v_x0 holds the value that is XORed into the first (shuffled) lane
// Out:  v_shuffle  = 16 bytes loaded from .shuffle_data (stays live)
//       x_counter  = x_len rounded down to a multiple of 64
//       x_tmp      = x_counter - 64
//       x_buf_iter = x_buf + 64
//       v_x0       = v_x0 ^ shuffled bytes[0..15]
//       v_x1..v_x3 = shuffled bytes[16..63]
//       NZCV flags = comparison of x_tmp against 63, left set on exit
// Clobbers: v_x0_tmp (v30)
.macro crc_norm_load_first_block
	// x_tmp is borrowed as the page address here and reused below
#ifdef __APPLE__
	adrp	x_tmp, .shuffle_data@PAGE
	ldr	q_shuffle, [x_tmp, #.shuffle_data@PAGEOFF]
#else
	adrp	x_tmp, .shuffle_data
	ldr	q_shuffle, [x_tmp, #:lo12:.shuffle_data]
#endif

	// integer bookkeeping; must come after the last use of x_tmp as an address
	and	x_counter, x_len, -64
	sub	x_tmp, x_counter, #64
	add	x_buf_iter, x_buf, 64
	cmp	x_tmp, 63

	// load and permute each lane; none of this touches the flags
	ldr	q_x0_tmp, [x_buf]
	tbl	v_x0_tmp.16b, {v_x0_tmp.16b}, v_shuffle.16b
	ldr	q_x1, [x_buf, 16]
	tbl	v_x1.16b, {v_x1.16b}, v_shuffle.16b
	ldr	q_x2, [x_buf, 32]
	tbl	v_x2.16b, {v_x2.16b}, v_shuffle.16b
	ldr	q_x3, [x_buf, 48]
	tbl	v_x3.16b, {v_x3.16b}, v_shuffle.16b

	eor	v_x0.16b, v_x0.16b, v_x0_tmp.16b
.endm
// crc32_load_p4
// Computes the loop end pointer and materialises the 32-bit p4 constant
// pair in v_p4. The p4_* symbols are supplied by the including file.
// In:   x_buf_iter, x_tmp (byte offset from x_buf_iter to the loop end)
// Out:  x_buf_end = x_buf_iter + x_tmp
//       v_p4      = { low: p4_low_b1:p4_low_b0, high: p4_high_b1:p4_high_b0 }
// Clobbers: x_tmp, x_tmp2 (x6, shared with x_crc_tab_addr)
.macro crc32_load_p4
	// consume the incoming x_tmp before it is recycled as a scratch register
	add	x_buf_end, x_buf_iter, x_tmp

	// build both halves in general-purpose registers, 16 bits at a time
	mov	x_tmp2, p4_high_b0
	movk	x_tmp2, p4_high_b1, lsl 16
	mov	x_tmp, p4_low_b0
	movk	x_tmp, p4_low_b1, lsl 16

	// low half first: the D-register write clears bits 64-127, so the
	// d[1] insert has to follow it
	fmov	d_p4_low, x_tmp
	fmov	d_p4_high, x_tmp2
.endm
// crc64_load_p4
// Computes the loop end pointer and materialises the 64-bit p4 constant
// pair in v_p4. The p4_* symbols are supplied by the including file.
// In:   x_buf_iter, x_tmp (byte offset from x_buf_iter to the loop end)
// Out:  x_buf_end = x_buf_iter + x_tmp
//       v_p4      = { low: p4_low_b3..b0, high: p4_high_b3..b0 }
// Clobbers: x_tmp, x_tmp2 (x6, shared with x_crc_tab_addr)
.macro crc64_load_p4
	// consume the incoming x_tmp before it is recycled as a scratch register
	add	x_buf_end, x_buf_iter, x_tmp

	// upper lane value, assembled from four 16-bit pieces
	mov	x_tmp2, p4_high_b0
	movk	x_tmp2, p4_high_b1, lsl 16
	movk	x_tmp2, p4_high_b2, lsl 32
	movk	x_tmp2, p4_high_b3, lsl 48

	// lower lane value, assembled the same way
	mov	x_tmp, p4_low_b0
	movk	x_tmp, p4_low_b1, lsl 16
	movk	x_tmp, p4_low_b2, lsl 32
	movk	x_tmp, p4_low_b3, lsl 48

	// low half first: the D-register write clears bits 64-127, so the
	// d[1] insert has to follow it
	fmov	d_p4_low, x_tmp
	fmov	d_p4_high, x_tmp2
.endm
// crc_refl_loop
// Main folding loop: each pass folds the four accumulators over the next
// 64 input bytes,
//   x_i = pmull(x_i.lo, p4.lo) ^ pmull2(x_i.hi, p4.hi) ^ y_i   (i = 0..3)
// In:   v_x0..v_x3 accumulators, v_p4 constants, x_buf_iter, x_buf_end
// Out:  v_x0..v_x3 updated, x_buf_iter == x_buf_end
// Clobbers: v_y0..v_y3, v_x0_high..v_x3_high, NZCV flags
// The body always runs at least once and exits only on pointer equality,
// so x_buf_end must be x_buf_iter plus a non-zero multiple of 64.
// The label below has a fixed name, so this macro (or crc_norm_loop, which
// defines the same label) can be expanded only once per assembly unit.
.macro crc_refl_loop
.align 3
.clmul_loop:
// interleave ldr and pmull(2) for arch which can only issue quadword load every
// other cycle (e.g. A55)
ldr q_y0, [x_buf_iter]
pmull2 v_x0_high.1q, v_x0.2d, v_p4.2d
ldr q_y1, [x_buf_iter, 16]
pmull2 v_x1_high.1q, v_x1.2d, v_p4.2d
ldr q_y2, [x_buf_iter, 32]
pmull2 v_x2_high.1q, v_x2.2d, v_p4.2d
ldr q_y3, [x_buf_iter, 48]
pmull2 v_x3_high.1q, v_x3.2d, v_p4.2d
// lower-lane products overwrite the accumulators in place; the pointer
// bump and the loop compare are slotted in between them
pmull v_x0.1q, v_x0.1d, v_p4.1d
add x_buf_iter, x_buf_iter, 64
pmull v_x1.1q, v_x1.1d, v_p4.1d
cmp x_buf_iter, x_buf_end
pmull v_x2.1q, v_x2.1d, v_p4.1d
pmull v_x3.1q, v_x3.1d, v_p4.1d
// combine lower and upper products
eor v_x0.16b, v_x0.16b, v_x0_high.16b
eor v_x1.16b, v_x1.16b, v_x1_high.16b
eor v_x2.16b, v_x2.16b, v_x2_high.16b
eor v_x3.16b, v_x3.16b, v_x3_high.16b
// mix in the newly loaded data
eor v_x0.16b, v_x0.16b, v_y0.16b
eor v_x1.16b, v_x1.16b, v_y1.16b
eor v_x2.16b, v_x2.16b, v_y2.16b
eor v_x3.16b, v_x3.16b, v_y3.16b
// flags still come from the cmp above; the vector ops do not change them
bne .clmul_loop
.endm
// crc_norm_loop
// Main folding loop for the non-reflected variant. Identical to
// crc_refl_loop except that each loaded lane is permuted with tbl before
// it is mixed in:
//   x_i = pmull(x_i.lo, p4.lo) ^ pmull2(x_i.hi, p4.hi) ^ tbl(y_i, shuffle)
// In:   v_x0..v_x3 accumulators, v_p4 constants, v_shuffle index vector
//       (loaded by crc_norm_load_first_block), x_buf_iter, x_buf_end
// Out:  v_x0..v_x3 updated, x_buf_iter == x_buf_end
// Clobbers: v_y0..v_y3, v_x0_high..v_x3_high, NZCV flags
// The body always runs at least once and exits only on pointer equality,
// so x_buf_end must be x_buf_iter plus a non-zero multiple of 64.
// The label below has a fixed name, so this macro (or crc_refl_loop, which
// defines the same label) can be expanded only once per assembly unit.
.macro crc_norm_loop
.align 3
.clmul_loop:
// interleave ldr and pmull(2) for arch which can only issue quadword load every
// other cycle (e.g. A55)
ldr q_y0, [x_buf_iter]
pmull2 v_x0_high.1q, v_x0.2d, v_p4.2d
ldr q_y1, [x_buf_iter, 16]
pmull2 v_x1_high.1q, v_x1.2d, v_p4.2d
ldr q_y2, [x_buf_iter, 32]
pmull2 v_x2_high.1q, v_x2.2d, v_p4.2d
ldr q_y3, [x_buf_iter, 48]
pmull2 v_x3_high.1q, v_x3.2d, v_p4.2d
// lower-lane products overwrite the accumulators in place; the pointer
// bump and the loop compare are slotted in between them
pmull v_x0.1q, v_x0.1d, v_p4.1d
add x_buf_iter, x_buf_iter, 64
pmull v_x1.1q, v_x1.1d, v_p4.1d
cmp x_buf_iter, x_buf_end
pmull v_x2.1q, v_x2.1d, v_p4.1d
pmull v_x3.1q, v_x3.1d, v_p4.1d
// permute the bytes of the new data according to v_shuffle
tbl v_y0.16b, {v_y0.16b}, v_shuffle.16b
tbl v_y1.16b, {v_y1.16b}, v_shuffle.16b
tbl v_y2.16b, {v_y2.16b}, v_shuffle.16b
tbl v_y3.16b, {v_y3.16b}, v_shuffle.16b
// combine lower and upper products
eor v_x0.16b, v_x0.16b, v_x0_high.16b
eor v_x1.16b, v_x1.16b, v_x1_high.16b
eor v_x2.16b, v_x2.16b, v_x2_high.16b
eor v_x3.16b, v_x3.16b, v_x3_high.16b
// mix in the permuted data
eor v_x0.16b, v_x0.16b, v_y0.16b
eor v_x1.16b, v_x1.16b, v_y1.16b
eor v_x2.16b, v_x2.16b, v_y2.16b
eor v_x3.16b, v_x3.16b, v_y3.16b
// flags still come from the cmp above; the vector ops do not change them
bne .clmul_loop
.endm
// crc32_fold_step_p1 acc, next
// Private helper for crc32_fold_512b_to_128b. Folds one accumulator into
// the following one:
//   next ^= pmull2(acc.hi, p1.hi) ^ pmull(acc.lo, p1.lo)
// Clobbers: v_tmp_high (v4), v_tmp_low (v5)
.macro crc32_fold_step_p1 acc:req, next:req
	pmull2	v_tmp_high.1q, \acc\().2d, v_p1.2d
	pmull	v_tmp_low.1q, \acc\().1d, v_p1.1d
	eor	\next\().16b, \next\().16b, v_tmp_high.16b
	eor	\next\().16b, \next\().16b, v_tmp_low.16b
.endm

// crc32_fold_512b_to_128b
// Collapses the four 128-bit accumulators into v_x3 by chaining three fold
// steps (x0 into x1, x1 into x2, x2 into x3) with the 32-bit p1 constant
// pair. The p1_* symbols are supplied by the including file.
// In:   v_x0..v_x3
// Out:  v_x3 = folded 128-bit value; v_x1 and v_x2 hold intermediates
// Clobbers: x_tmp, x_tmp2 (x6), v_p1 (v30, shared with v_p4),
//           v_tmp_high (v4), v_tmp_low (v5)
.macro crc32_fold_512b_to_128b
	// low half first: the D-register write clears bits 64-127, so the
	// d[1] insert has to follow it
	mov	x_tmp, p1_low_b0
	movk	x_tmp, p1_low_b1, lsl 16
	fmov	d_p1_low, x_tmp
	mov	x_tmp2, p1_high_b0
	movk	x_tmp2, p1_high_b1, lsl 16
	fmov	d_p1_high, x_tmp2

	// each step depends on the result of the previous one
	crc32_fold_step_p1 v_x0, v_x1
	crc32_fold_step_p1 v_x1, v_x2
	crc32_fold_step_p1 v_x2, v_x3
.endm
// crc64_fold_step_p1 acc, next
// Private helper for crc64_fold_512b_to_128b. Folds one accumulator into
// the following one:
//   next ^= pmull2(acc.hi, p1.hi) ^ pmull(acc.lo, p1.lo)
// Clobbers: v_tmp_high (v4), v_tmp_low (v5)
.macro crc64_fold_step_p1 acc:req, next:req
	pmull2	v_tmp_high.1q, \acc\().2d, v_p1.2d
	pmull	v_tmp_low.1q, \acc\().1d, v_p1.1d
	eor	\next\().16b, \next\().16b, v_tmp_high.16b
	eor	\next\().16b, \next\().16b, v_tmp_low.16b
.endm

// crc64_fold_512b_to_128b
// Collapses the four 128-bit accumulators into v_x3 by chaining three fold
// steps (x0 into x1, x1 into x2, x2 into x3) with the 64-bit p1 constant
// pair. The p1_* symbols are supplied by the including file.
// In:   v_x0..v_x3
// Out:  v_x3 = folded 128-bit value; v_x1 and v_x2 hold intermediates
// Clobbers: x_tmp, x_tmp2 (x6), v_p1 (v30, shared with v_p4),
//           v_tmp_high (v4), v_tmp_low (v5)
.macro crc64_fold_512b_to_128b
	// lower lane of p1, assembled from four 16-bit pieces; written first
	// because the D-register write clears bits 64-127
	mov	x_tmp, p1_low_b0
	movk	x_tmp, p1_low_b1, lsl 16
	movk	x_tmp, p1_low_b2, lsl 32
	movk	x_tmp, p1_low_b3, lsl 48
	fmov	d_p1_low, x_tmp

	// upper lane of p1, inserted into d[1] afterwards
	mov	x_tmp2, p1_high_b0
	movk	x_tmp2, p1_high_b1, lsl 16
	movk	x_tmp2, p1_high_b2, lsl 32
	movk	x_tmp2, p1_high_b3, lsl 48
	fmov	d_p1_high, x_tmp2

	// each step depends on the result of the previous one
	crc64_fold_step_p1 v_x0, v_x1
	crc64_fold_step_p1 v_x1, v_x2
	crc64_fold_step_p1 v_x2, v_x3
.endm