Implement aarch64 NEON for erasure code.

1. Route the following erasure code interfaces to Arm NEON
   implementations through the mbin_interface mechanism:
   ec_encode_data, gf_vect_mul, gf_vect_dot_prod, gf_vect_mad,
   ec_encode_data_update.
2. Use Arm NEON instructions to accelerate GF(2^8) computation
   with 128-bit registers.

Change-Id: Ib0ecbfbd1837d2b1f823d26815c896724d2d22e4
Signed-off-by: Zhou Xiong <zhouxiong13@huawei.com>

parent c680d3aba7
commit d7848c1d05
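
The core trick used throughout the new .S files is the classic nibble-table
GF(2^8) multiply: each data byte is split into its low and high 4-bit halves,
each half indexes a 16-entry lookup table via tbl, and the two partial
products are combined with eor (addition in GF(2^8) is XOR). A minimal
C-intrinsics sketch of one 16-byte step, for orientation only (the helper
name and table layout here are illustrative, not part of the patch):

#include <arm_neon.h>

/* Multiply 16 bytes by one fixed GF(2^8) coefficient c, given
 * tbl_lo[i] = gf_mul(c, i) and tbl_hi[i] = gf_mul(c, i << 4). */
static inline uint8x16_t gf_mul16(uint8x16_t data,
                                  uint8x16_t tbl_lo, uint8x16_t tbl_hi)
{
        uint8x16_t mask0f = vdupq_n_u8(0x0f);
        uint8x16_t lo = vandq_u8(data, mask0f);  /* low nibbles */
        uint8x16_t hi = vshrq_n_u8(data, 4);     /* high nibbles */

        /* Two table lookups, then XOR: b = (hi << 4) ^ lo, so
         * c*b = c*(hi << 4) ^ c*lo by distributivity over XOR. */
        return veorq_u8(vqtbl1q_u8(tbl_lo, lo), vqtbl1q_u8(tbl_hi, hi));
}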
@@ -27,11 +27,11 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################

include erasure_code/aarch64/Makefile.am

lsrc += erasure_code/ec_base.c

lsrc_base_aliases += erasure_code/ec_base_aliases.c
lsrc_aarch64 += erasure_code/ec_base_aliases.c

lsrc_x86_64 += \
	erasure_code/ec_highlevel_func.c \
	erasure_code/gf_vect_mul_sse.asm \
erasure_code/aarch64/Makefile.am (new file, 45 lines)
@@ -0,0 +1,45 @@
##################################################################
# Copyright (c) 2019 Huawei Technologies Co., Ltd.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#  * Neither the name of Huawei Corporation nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################

lsrc_aarch64 += \
	erasure_code/aarch64/ec_aarch64_highlevel_func.c \
	erasure_code/aarch64/ec_aarch64_dispatcher.c \
	erasure_code/aarch64/gf_vect_dot_prod_neon.S \
	erasure_code/aarch64/gf_2vect_dot_prod_neon.S \
	erasure_code/aarch64/gf_3vect_dot_prod_neon.S \
	erasure_code/aarch64/gf_4vect_dot_prod_neon.S \
	erasure_code/aarch64/gf_5vect_dot_prod_neon.S \
	erasure_code/aarch64/gf_vect_mad_neon.S \
	erasure_code/aarch64/gf_2vect_mad_neon.S \
	erasure_code/aarch64/gf_3vect_mad_neon.S \
	erasure_code/aarch64/gf_4vect_mad_neon.S \
	erasure_code/aarch64/gf_5vect_mad_neon.S \
	erasure_code/aarch64/gf_6vect_mad_neon.S \
	erasure_code/aarch64/gf_vect_mul_neon.S \
	erasure_code/aarch64/ec_multibinary_arm.S
erasure_code/aarch64/ec_aarch64_dispatcher.c (new file, 69 lines)
@@ -0,0 +1,69 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.
  (BSD license text identical to the header above)
**********************************************************************/
#include <aarch64_multibinary.h>

DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod)
{
	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
		return PROVIDER_INFO(gf_vect_dot_prod_neon);
	return PROVIDER_BASIC(gf_vect_dot_prod);
}

DEFINE_INTERFACE_DISPATCHER(gf_vect_mad)
{
	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
		return PROVIDER_INFO(gf_vect_mad_neon);
	return PROVIDER_BASIC(gf_vect_mad);
}

DEFINE_INTERFACE_DISPATCHER(ec_encode_data)
{
	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
		return PROVIDER_INFO(ec_encode_data_neon);
	return PROVIDER_BASIC(ec_encode_data);
}

DEFINE_INTERFACE_DISPATCHER(ec_encode_data_update)
{
	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
		return PROVIDER_INFO(ec_encode_data_update_neon);
	return PROVIDER_BASIC(ec_encode_data_update);
}

DEFINE_INTERFACE_DISPATCHER(gf_vect_mul)
{
	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
		return PROVIDER_INFO(gf_vect_mul_neon);
	return PROVIDER_BASIC(gf_vect_mul);
}
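
Each dispatcher above is evaluated once at runtime and selects a provider
from the aarch64 HWCAP bits. A standalone sketch of the same feature test
(assuming a Linux/aarch64 target; getauxval comes from glibc and
HWCAP_ASIMD from the kernel UAPI headers):

#include <stdio.h>
#include <sys/auxv.h>
#include <asm/hwcap.h>

int main(void)
{
	/* AT_HWCAP is a kernel-supplied bit mask of CPU features;
	 * HWCAP_ASIMD is set when Advanced SIMD (NEON) is available. */
	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
		puts("NEON present: the *_neon providers are selected");
	else
		puts("NEON absent: the *_base providers are selected");
	return 0;
}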
erasure_code/aarch64/ec_aarch64_highlevel_func.c (new file, 127 lines)
@@ -0,0 +1,127 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.
  (BSD license text identical to the header above)
**********************************************************************/
#include "erasure_code.h"

/* external functions */
extern void gf_vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
				  unsigned char **src, unsigned char *dest);
extern void gf_2vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
				   unsigned char **src, unsigned char **dest);
extern void gf_3vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
				   unsigned char **src, unsigned char **dest);
extern void gf_4vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
				   unsigned char **src, unsigned char **dest);
extern void gf_5vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
				   unsigned char **src, unsigned char **dest);
extern void gf_vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
			     unsigned char *src, unsigned char *dest);
extern void gf_2vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
			      unsigned char *src, unsigned char **dest);
extern void gf_3vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
			      unsigned char *src, unsigned char **dest);
extern void gf_4vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
			      unsigned char *src, unsigned char **dest);
extern void gf_5vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
			      unsigned char *src, unsigned char **dest);
extern void gf_6vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
			      unsigned char *src, unsigned char **dest);

void ec_encode_data_neon(int len, int k, int rows, unsigned char *g_tbls,
			 unsigned char **data, unsigned char **coding)
{
	if (len < 16) {
		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
		return;
	}

	while (rows > 5) {
		gf_5vect_dot_prod_neon(len, k, g_tbls, data, coding);
		g_tbls += 5 * k * 32;
		coding += 5;
		rows -= 5;
	}
	switch (rows) {
	case 5:
		gf_5vect_dot_prod_neon(len, k, g_tbls, data, coding);
		break;
	case 4:
		gf_4vect_dot_prod_neon(len, k, g_tbls, data, coding);
		break;
	case 3:
		gf_3vect_dot_prod_neon(len, k, g_tbls, data, coding);
		break;
	case 2:
		gf_2vect_dot_prod_neon(len, k, g_tbls, data, coding);
		break;
	case 1:
		gf_vect_dot_prod_neon(len, k, g_tbls, data, *coding);
		break;
	case 0:
		break;
	default:
		break;
	}
}

void ec_encode_data_update_neon(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
				unsigned char *data, unsigned char **coding)
{
	if (len < 16) {
		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
		return;
	}
	while (rows > 6) {
		gf_6vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
		g_tbls += 6 * k * 32;
		coding += 6;
		rows -= 6;
	}
	switch (rows) {
	case 6:
		gf_6vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
		break;
	case 5:
		gf_5vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
		break;
	case 4:
		gf_4vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
		break;
	case 3:
		gf_3vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
		break;
	case 2:
		gf_2vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
		break;
	case 1:
		gf_vect_mad_neon(len, k, vec_i, g_tbls, data, *coding);
		break;
	case 0:
		break;
	}
}
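
The g_tbls strides above (5 * k * 32 and 6 * k * 32) follow from ISA-L's
expanded-table layout: ec_init_tables() expands every GF(2^8) coefficient
into 32 bytes of nibble lookup tables (16 for the low nibble, 16 for the
high), so the tables for one output row over k sources occupy k * 32 bytes.
A tiny illustrative helper (hypothetical, not part of the patch):

#include <stddef.h>

/* Locate the expanded tables of output row `row` inside g_tbls,
 * assuming k sources and 32 table bytes per (row, source) pair. */
static unsigned char *row_tbls(unsigned char *g_tbls, int k, int row)
{
	return g_tbls + (size_t)row * k * 32;
}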
erasure_code/aarch64/ec_multibinary_arm.S (new file, 36 lines)
@@ -0,0 +1,36 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.
  (BSD license text identical to the header above)
**********************************************************************/
#include "aarch64_multibinary.h"

mbin_interface ec_encode_data
mbin_interface gf_vect_mul
mbin_interface gf_vect_dot_prod
mbin_interface gf_vect_mad
mbin_interface ec_encode_data_update
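
Each mbin_interface line emits a public symbol whose first call runs the
corresponding dispatcher and then resolves to the chosen provider, so
application code is identical on every architecture. A minimal usage sketch
with the standard ISA-L API (error handling and buffer setup elided; sizes
follow the usual ISA-L examples):

#include <stdlib.h>
#include "erasure_code.h"

/* Encode `rows` parity buffers from k data buffers of len bytes each.
 * On aarch64, ec_encode_data transparently resolves to
 * ec_encode_data_neon when HWCAP_ASIMD is present. */
void encode(int len, int k, int rows, unsigned char **data,
	    unsigned char **coding)
{
	unsigned char *a = malloc((k + rows) * k);     /* encode matrix */
	unsigned char *g_tbls = malloc(k * rows * 32); /* expanded tables */

	gf_gen_rs_matrix(a, k + rows, k);    /* Reed-Solomon generator */
	ec_init_tables(k, rows, &a[k * k], g_tbls);
	ec_encode_data(len, k, rows, g_tbls, data, coding);

	free(a);
	free(g_tbls);
}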
erasure_code/aarch64/gf_2vect_dot_prod_neon.S (new file, 399 lines)
@@ -0,0 +1,399 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.
  (BSD license text identical to the header above)
**********************************************************************/
.text

.global gf_2vect_dot_prod_neon
.type gf_2vect_dot_prod_neon, %function

/* arguments */
x_len .req x0
x_vec .req x1
x_tbl .req x2
x_src .req x3
x_dest .req x4

/* returns */
w_ret .req w0

/* local variables */
x_vec_i .req x5
x_ptr .req x6
x_pos .req x7
x_tmp .req x8
x_tbl1 .req x9
x_tbl2 .req x10
x_dest1 .req x11
x_dest2 .req x12

/* vectors */
v_gft1_lo .req v0
v_gft1_hi .req v1
v_gft2_lo .req v2
v_gft2_hi .req v3
q_gft1_lo .req q0
q_gft1_hi .req q1
q_gft2_lo .req q2
q_gft2_hi .req q3

v_mask0f .req v4
q_mask0f .req q4

v_tmp1_lo .req v5
v_tmp1_hi .req v6
v_tmp1 .req v7

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
v_data_4 .req v12
v_data_5 .req v13
v_data_6 .req v14
v_data_7 .req v15
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11
q_data_4 .req q12
q_data_5 .req q13
q_data_6 .req q14
q_data_7 .req q15

v_p1_0 .req v16
v_p1_1 .req v17
v_p1_2 .req v18
v_p1_3 .req v19
v_p1_4 .req v20
v_p1_5 .req v21
v_p1_6 .req v22
v_p1_7 .req v23
v_p2_0 .req v24
v_p2_1 .req v25
v_p2_2 .req v26
v_p2_3 .req v27
v_p2_4 .req v28
v_p2_5 .req v29
v_p2_6 .req v30
v_p2_7 .req v31

q_p1_0 .req q16
q_p1_1 .req q17
q_p1_2 .req q18
q_p1_3 .req q19
q_p1_4 .req q20
q_p1_5 .req q21
q_p1_6 .req q22
q_p1_7 .req q23
q_p2_0 .req q24
q_p2_1 .req q25
q_p2_2 .req q26
q_p2_3 .req q27
q_p2_4 .req q28
q_p2_5 .req q29
q_p2_6 .req q30
q_p2_7 .req q31

v_p1 .req v_p1_0
q_p1 .req q_p1_0
v_p2 .req v_p2_0
q_p2 .req q_p2_0
v_data .req v_p1_1
q_data .req q_p1_1
v_data_lo .req v_p1_2
v_data_hi .req v_p1_3

gf_2vect_dot_prod_neon:
	/* less than 16 bytes, return_fail */
	cmp x_len, #16
	blt .return_fail

	movi v_mask0f.16b, #0x0f
	mov x_pos, #0
	lsl x_vec, x_vec, #3
	ldr x_dest1, [x_dest, #8*0]
	ldr x_dest2, [x_dest, #8*1]

.Lloop128_init:
	/* less than 128 bytes, goto Lloop16_init */
	cmp x_len, #128
	blt .Lloop16_init

	/* save d8 ~ d15 to stack */
	sub sp, sp, #64
	stp d8, d9, [sp]
	stp d10, d11, [sp, #16]
	stp d12, d13, [sp, #32]
	stp d14, d15, [sp, #48]

	sub x_len, x_len, #128

.Lloop128:
	movi v_p1_0.16b, #0
	movi v_p1_1.16b, #0
	movi v_p1_2.16b, #0
	movi v_p1_3.16b, #0
	movi v_p1_4.16b, #0
	movi v_p1_5.16b, #0
	movi v_p1_6.16b, #0
	movi v_p1_7.16b, #0

	movi v_p2_0.16b, #0
	movi v_p2_1.16b, #0
	movi v_p2_2.16b, #0
	movi v_p2_3.16b, #0
	movi v_p2_4.16b, #0
	movi v_p2_5.16b, #0
	movi v_p2_6.16b, #0
	movi v_p2_7.16b, #0

	mov x_tbl1, x_tbl
	add x_tbl2, x_tbl, x_vec, lsl #2
	mov x_vec_i, #0

.Lloop128_vects:
	ldr x_ptr, [x_src, x_vec_i]
	add x_vec_i, x_vec_i, #8
	add x_ptr, x_ptr, x_pos

	ldp q_data_0, q_data_1, [x_ptr], #32
	ldp q_data_2, q_data_3, [x_ptr], #32

	ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp q_data_4, q_data_5, [x_ptr], #32
	ldp q_data_6, q_data_7, [x_ptr], #32
	prfm pldl1strm, [x_ptr]
	prfm pldl1keep, [x_tbl1]
	prfm pldl1keep, [x_tbl2]

	/* data_0 */
	and v_tmp1.16b, v_data_0.16b, v_mask0f.16b
	ushr v_data_0.16b, v_data_0.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

	/* data_1 */
	and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
	ushr v_data_1.16b, v_data_1.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

	/* data_2 */
	and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
	ushr v_data_2.16b, v_data_2.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

	/* data_3 */
	and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
	ushr v_data_3.16b, v_data_3.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

	/* data_4 */
	and v_tmp1.16b, v_data_4.16b, v_mask0f.16b
	ushr v_data_4.16b, v_data_4.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_4.16b, v_p1_4.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_4.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_4.16b, v_p2_4.16b, v_tmp1_hi.16b

	/* data_5 */
	and v_tmp1.16b, v_data_5.16b, v_mask0f.16b
	ushr v_data_5.16b, v_data_5.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_5.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_5.16b, v_p1_5.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_5.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_5.16b, v_p2_5.16b, v_tmp1_hi.16b

	/* data_6 */
	and v_tmp1.16b, v_data_6.16b, v_mask0f.16b
	ushr v_data_6.16b, v_data_6.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_6.16b, v_p1_6.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_6.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_6.16b, v_p2_6.16b, v_tmp1_hi.16b

	/* data_7 */
	and v_tmp1.16b, v_data_7.16b, v_mask0f.16b
	ushr v_data_7.16b, v_data_7.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_7.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_7.16b, v_p1_7.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_7.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_7.16b, v_p2_7.16b, v_tmp1_hi.16b

	cmp x_vec_i, x_vec
	blt .Lloop128_vects

.Lloop128_vects_end:
	add x_ptr, x_dest1, x_pos
	stp q_p1_0, q_p1_1, [x_ptr], #32
	stp q_p1_2, q_p1_3, [x_ptr], #32
	stp q_p1_4, q_p1_5, [x_ptr], #32
	stp q_p1_6, q_p1_7, [x_ptr]

	add x_ptr, x_dest2, x_pos
	stp q_p2_0, q_p2_1, [x_ptr], #32
	stp q_p2_2, q_p2_3, [x_ptr], #32
	stp q_p2_4, q_p2_5, [x_ptr], #32
	stp q_p2_6, q_p2_7, [x_ptr]

	add x_pos, x_pos, #128
	cmp x_pos, x_len
	ble .Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp d8, d9, [sp]
	ldp d10, d11, [sp, #16]
	ldp d12, d13, [sp, #32]
	ldp d14, d15, [sp, #48]
	add sp, sp, #64

	add x_len, x_len, #128
	cmp x_pos, x_len
	beq .return_pass

.Lloop16_init:
	sub x_len, x_len, #16
	cmp x_pos, x_len
	bgt .lessthan16_init

.Lloop16:
	movi v_p1.16b, #0
	movi v_p2.16b, #0
	mov x_tbl1, x_tbl
	add x_tbl2, x_tbl, x_vec, lsl #2
	mov x_vec_i, #0

.Lloop16_vects:
	ldr x_ptr, [x_src, x_vec_i]
	ldr q_data, [x_ptr, x_pos]
	add x_vec_i, x_vec_i, #8

	ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32

	and v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr v_data_hi.16b, v_data.16b, #4

	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	eor v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp1_lo.16b
	eor v_p1.16b, v_p1.16b, v_tmp1_hi.16b

	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	eor v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp1_lo.16b
	eor v_p2.16b, v_p2.16b, v_tmp1_hi.16b

	cmp x_vec_i, x_vec
	bne .Lloop16_vects

.Lloop16_vects_end:
	str q_p1, [x_dest1, x_pos]
	str q_p2, [x_dest2, x_pos]
	add x_pos, x_pos, #16
	cmp x_pos, x_len
	ble .Lloop16

.Lloop16_end:
	sub x_tmp, x_pos, x_len
	cmp x_tmp, #16
	beq .return_pass

.lessthan16_init:
	mov x_pos, x_len
	b .Lloop16

.return_pass:
	mov w_ret, #0
	ret

.return_fail:
	mov w_ret, #1
	ret
erasure_code/aarch64/gf_2vect_mad_neon.S (new file, 401 lines)
@@ -0,0 +1,401 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.
  (BSD license text identical to the header above)
**********************************************************************/
.text

.global gf_2vect_mad_neon
.type gf_2vect_mad_neon, %function

/* arguments */
x_len .req x0
x_vec .req x1
x_vec_i .req x2
x_tbl .req x3
x_src .req x4
x_dest .req x5

/* returns */
w_ret .req w0

/* local variables */
x_src_end .req x6
x_dest1 .req x7
x_dest2 .req x8
x_tmp .req x9
x_tbl1 .req x10
x_tbl2 .req x11
x_const .req x12

/* vectors */
v_mask0f .req v0
v_tmp_lo .req v1
v_tmp_hi .req v2
v_tmp .req v3
q_tmp .req q3

v_gft1_lo .req v4
v_gft1_hi .req v5
v_gft2_lo .req v6
v_gft2_hi .req v7
q_gft1_lo .req q4
q_gft1_hi .req q5
q_gft2_lo .req q6
q_gft2_hi .req q7

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
v_data_4 .req v12
v_data_5 .req v13
v_data_6 .req v14
v_data_7 .req v15
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11
q_data_4 .req q12
q_data_5 .req q13
q_data_6 .req q14
q_data_7 .req q15

v_data_0_lo .req v16
v_data_1_lo .req v17
v_data_2_lo .req v18
v_data_3_lo .req v19
v_data_4_lo .req v20
v_data_5_lo .req v21
v_data_6_lo .req v22
v_data_7_lo .req v23
v_data_0_hi .req v_data_0
v_data_1_hi .req v_data_1
v_data_2_hi .req v_data_2
v_data_3_hi .req v_data_3
v_data_4_hi .req v_data_4
v_data_5_hi .req v_data_5
v_data_6_hi .req v_data_6
v_data_7_hi .req v_data_7

v_d0 .req v24
v_d1 .req v25
v_d2 .req v26
v_d3 .req v27
v_d4 .req v28
v_d5 .req v29
v_d6 .req v30
v_d7 .req v31
q_d0 .req q24
q_d1 .req q25
q_d2 .req q26
q_d3 .req q27
q_d4 .req q28
q_d5 .req q29
q_d6 .req q30
q_d7 .req q31

v_data .req v16
q_data .req q16
v_data_lo .req v17
v_data_hi .req v18

gf_2vect_mad_neon:
	/* less than 16 bytes, return_fail */
	cmp x_len, #16
	blt .return_fail

	movi v_mask0f.16b, #0x0f
	lsl x_vec_i, x_vec_i, #5
	lsl x_vec, x_vec, #5
	add x_tbl1, x_tbl, x_vec_i
	add x_tbl2, x_tbl1, x_vec
	add x_src_end, x_src, x_len

	ldr x_dest1, [x_dest]
	ldr x_dest2, [x_dest, #8]
	ldr q_gft1_lo, [x_tbl1]
	ldr q_gft1_hi, [x_tbl1, #16]
	ldr q_gft2_lo, [x_tbl2]
	ldr q_gft2_hi, [x_tbl2, #16]

.Lloop128_init:
	/* less than 128 bytes, goto Lloop16_init */
	cmp x_len, #128
	blt .Lloop16_init

	/* save d8 ~ d15 to stack */
	sub sp, sp, #64
	stp d8, d9, [sp]
	stp d10, d11, [sp, #16]
	stp d12, d13, [sp, #32]
	stp d14, d15, [sp, #48]

	sub x_src_end, x_src_end, #128

.Lloop128:
	ldr q_data_0, [x_src, #16*0]
	ldr q_data_1, [x_src, #16*1]
	ldr q_data_2, [x_src, #16*2]
	ldr q_data_3, [x_src, #16*3]
	ldr q_data_4, [x_src, #16*4]
	ldr q_data_5, [x_src, #16*5]
	ldr q_data_6, [x_src, #16*6]
	ldr q_data_7, [x_src, #16*7]

	ldr q_d0, [x_dest1, #16*0]
	ldr q_d1, [x_dest1, #16*1]
	ldr q_d2, [x_dest1, #16*2]
	ldr q_d3, [x_dest1, #16*3]
	ldr q_d4, [x_dest1, #16*4]
	ldr q_d5, [x_dest1, #16*5]
	ldr q_d6, [x_dest1, #16*6]
	ldr q_d7, [x_dest1, #16*7]

	and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
	and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
	and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
	and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
	and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

	ushr v_data_0_hi.16b, v_data_0.16b, #4
	ushr v_data_1_hi.16b, v_data_1.16b, #4
	ushr v_data_2_hi.16b, v_data_2.16b, #4
	ushr v_data_3_hi.16b, v_data_3.16b, #4
	ushr v_data_4_hi.16b, v_data_4.16b, #4
	ushr v_data_5_hi.16b, v_data_5.16b, #4
	ushr v_data_6_hi.16b, v_data_6.16b, #4
	ushr v_data_7_hi.16b, v_data_7.16b, #4

	tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d1.16b, v_d1.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d2.16b, v_d2.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d3.16b, v_d3.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
	tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d4.16b, v_d4.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
	tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d5.16b, v_d5.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
	tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d6.16b, v_d6.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
	tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d7.16b, v_d7.16b, v_tmp_hi.16b

	str q_d0, [x_dest1, #16*0]
	str q_d1, [x_dest1, #16*1]
	str q_d2, [x_dest1, #16*2]
	str q_d3, [x_dest1, #16*3]
	str q_d4, [x_dest1, #16*4]
	str q_d5, [x_dest1, #16*5]
	str q_d6, [x_dest1, #16*6]
	str q_d7, [x_dest1, #16*7]

	ldr q_d0, [x_dest2, #16*0]
	ldr q_d1, [x_dest2, #16*1]
	ldr q_d2, [x_dest2, #16*2]
	ldr q_d3, [x_dest2, #16*3]
	ldr q_d4, [x_dest2, #16*4]
	ldr q_d5, [x_dest2, #16*5]
	ldr q_d6, [x_dest2, #16*6]
	ldr q_d7, [x_dest2, #16*7]

	tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
	tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
	tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d1.16b, v_d1.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
	tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d2.16b, v_d2.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
	tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d3.16b, v_d3.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_4_lo.16b
	tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_4_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d4.16b, v_d4.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_5_lo.16b
	tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_5_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d5.16b, v_d5.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_6_lo.16b
	tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_6_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d6.16b, v_d6.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_7_lo.16b
	tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_7_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d7.16b, v_d7.16b, v_tmp_hi.16b

	str q_d0, [x_dest2, #16*0]
	str q_d1, [x_dest2, #16*1]
	str q_d2, [x_dest2, #16*2]
	str q_d3, [x_dest2, #16*3]
	str q_d4, [x_dest2, #16*4]
	str q_d5, [x_dest2, #16*5]
	str q_d6, [x_dest2, #16*6]
	str q_d7, [x_dest2, #16*7]

	add x_src, x_src, #128
	add x_dest1, x_dest1, #128
	add x_dest2, x_dest2, #128
	cmp x_src, x_src_end
	bls .Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp d8, d9, [sp]
	ldp d10, d11, [sp, #16]
	ldp d12, d13, [sp, #32]
	ldp d14, d15, [sp, #48]
	add sp, sp, #64
	add x_src_end, x_src_end, #128

.Lloop16_init:
	sub x_src_end, x_src_end, #16
	cmp x_src, x_src_end
	bhi .lessthan16_init

.Lloop16:
	ldr q_data, [x_src]

	ldr q_d0, [x_dest1]
	ldr q_d1, [x_dest2]

	and v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr v_data_hi.16b, v_data.16b, #4

	tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	eor v_d1.16b, v_d1.16b, v_tmp_hi.16b

	str q_d0, [x_dest1]
	str q_d1, [x_dest2]

	add x_dest1, x_dest1, #16
	add x_dest2, x_dest2, #16
	add x_src, x_src, #16
	cmp x_src, x_src_end
	bls .Lloop16

.lessthan16_init:
	sub x_tmp, x_src, x_src_end
	cmp x_tmp, #16
	beq .return_pass

.lessthan16:
	mov x_src, x_src_end
	sub x_dest1, x_dest1, x_tmp
	sub x_dest2, x_dest2, x_tmp

	ldr x_const, =const_tbl
	sub x_const, x_const, x_tmp
	ldr q_tmp, [x_const, #16]

	ldr q_data, [x_src]
	ldr q_d0, [x_dest1]
	ldr q_d1, [x_dest2]

	and v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr v_data_hi.16b, v_data.16b, #4

	tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor v_d1.16b, v_d1.16b, v_tmp_hi.16b

	str q_d0, [x_dest1]
	str q_d1, [x_dest2]

.return_pass:
	mov w_ret, #0
	ret

.return_fail:
	mov w_ret, #1
	ret

.section .data
.balign 8
const_tbl:
	.dword 0x0000000000000000, 0x0000000000000000
	.dword 0xffffffffffffffff, 0xffffffffffffffff
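
const_tbl drives the .lessthan16 tail path above: when len is not a multiple
of 16, the code rewinds so one final full 16-byte block ends exactly at the
buffer boundary, then ANDs the recomputed product with a mask that zeroes
the leading bytes the previous iteration already updated, so the XOR into
dest changes only the new tail bytes. The same idea in NEON intrinsics
(illustrative sketch; the helper name is mine):

#include <arm_neon.h>
#include <stdint.h>

/* overlap = number of leading bytes of this block that were already
 * processed (1..15); only the remaining 16 - overlap bytes change. */
static void masked_tail_xor(uint8_t *dest, uint8x16_t product, int overlap)
{
	static const uint8_t const_tbl[32] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	};
	/* First `overlap` mask bytes are 0x00, the rest are 0xff. */
	uint8x16_t mask = vld1q_u8(&const_tbl[16 - overlap]);
	uint8x16_t d = vld1q_u8(dest);

	vst1q_u8(dest, veorq_u8(d, vandq_u8(product, mask)));
}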
erasure_code/aarch64/gf_3vect_dot_prod_neon.S (new file, 358 lines)
@@ -0,0 +1,358 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.
  (BSD license text identical to the header above)
**********************************************************************/
.text

.global gf_3vect_dot_prod_neon
.type gf_3vect_dot_prod_neon, %function

/* arguments */
x_len .req x0
x_vec .req x1
x_tbl .req x2
x_src .req x3
x_dest .req x4

/* returns */
w_ret .req w0

/* local variables */
x_vec_i .req x5
x_ptr .req x6
x_pos .req x7
x_tmp .req x8
x_dest1 .req x9
x_tbl1 .req x10
x_dest2 .req x11
x_tbl2 .req x12
x_dest3 .req x13
x_tbl3 .req x14

/* vectors */
v_gft1_lo .req v0
v_gft1_hi .req v1
v_gft2_lo .req v2
v_gft2_hi .req v3
v_gft3_lo .req v4
v_gft3_hi .req v5
q_gft1_lo .req q0
q_gft1_hi .req q1
q_gft2_lo .req q2
q_gft2_hi .req q3
q_gft3_lo .req q4
q_gft3_hi .req q5

v_mask0f .req v6
q_mask0f .req q6
v_tmp1 .req v7

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11

v_tmp1_lo .req v12
v_tmp1_hi .req v13

v_p1_0 .req v20
v_p1_1 .req v21
v_p1_2 .req v22
v_p1_3 .req v23
v_p2_0 .req v24
v_p2_1 .req v25
v_p2_2 .req v26
v_p2_3 .req v27
v_p3_0 .req v28
v_p3_1 .req v29
v_p3_2 .req v30
v_p3_3 .req v31

q_p1_0 .req q20
q_p1_1 .req q21
q_p1_2 .req q22
q_p1_3 .req q23
q_p2_0 .req q24
q_p2_1 .req q25
q_p2_2 .req q26
q_p2_3 .req q27
q_p3_0 .req q28
q_p3_1 .req q29
q_p3_2 .req q30
q_p3_3 .req q31

v_data .req v_p1_1
q_data .req q_p1_1
v_data_lo .req v_p1_2
v_data_hi .req v_p1_3

gf_3vect_dot_prod_neon:
	/* less than 16 bytes, return_fail */
	cmp x_len, #16
	blt .return_fail

	movi v_mask0f.16b, #0x0f
	mov x_pos, #0
	lsl x_vec, x_vec, #3
	ldr x_dest1, [x_dest, #8*0]
	ldr x_dest2, [x_dest, #8*1]
	ldr x_dest3, [x_dest, #8*2]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp x_len, #64
	blt .Lloop16_init

	/* save d8 ~ d15 to stack */
	sub sp, sp, #64
	stp d8, d9, [sp]
	stp d10, d11, [sp, #16]
	stp d12, d13, [sp, #32]
	stp d14, d15, [sp, #48]

	sub x_len, x_len, #64

.Lloop64:
	movi v_p1_0.16b, #0
	movi v_p1_1.16b, #0
	movi v_p1_2.16b, #0
	movi v_p1_3.16b, #0
	movi v_p2_0.16b, #0
	movi v_p2_1.16b, #0
	movi v_p2_2.16b, #0
	movi v_p2_3.16b, #0
	movi v_p3_0.16b, #0
	movi v_p3_1.16b, #0
	movi v_p3_2.16b, #0
	movi v_p3_3.16b, #0

	mov x_tbl1, x_tbl
	add x_tbl2, x_tbl1, x_vec, lsl #2
	add x_tbl3, x_tbl2, x_vec, lsl #2
	mov x_vec_i, #0

.Lloop64_vects:
	ldr x_ptr, [x_src, x_vec_i]
	add x_vec_i, x_vec_i, #8
	add x_ptr, x_ptr, x_pos

	ldr q_data_0, [x_ptr], #16
	ldr q_data_1, [x_ptr], #16

	ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32

	ldr q_data_2, [x_ptr], #16
	ldr q_data_3, [x_ptr], #16
	prfm pldl1strm, [x_ptr]
	prfm pldl1keep, [x_tbl1]
	prfm pldl1keep, [x_tbl2]
	prfm pldl1keep, [x_tbl3]

	/* data_0 */
	and v_tmp1.16b, v_data_0.16b, v_mask0f.16b
	ushr v_data_0.16b, v_data_0.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b

	/* data_1 */
	and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
	ushr v_data_1.16b, v_data_1.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b

	/* data_2 */
	and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
	ushr v_data_2.16b, v_data_2.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b

	/* data_3 */
	and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
	ushr v_data_3.16b, v_data_3.16b, #4

	tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

	tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
	eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b

	cmp x_vec_i, x_vec
	blt .Lloop64_vects

.Lloop64_vects_end:
	add x_ptr, x_dest1, x_pos
	stp q_p1_0, q_p1_1, [x_ptr], #32
	stp q_p1_2, q_p1_3, [x_ptr]

	add x_ptr, x_dest2, x_pos
	stp q_p2_0, q_p2_1, [x_ptr], #32
	stp q_p2_2, q_p2_3, [x_ptr]

	add x_ptr, x_dest3, x_pos
	stp q_p3_0, q_p3_1, [x_ptr], #32
	stp q_p3_2, q_p3_3, [x_ptr]

	add x_pos, x_pos, #64
	cmp x_pos, x_len
	ble .Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp d8, d9, [sp]
	ldp d10, d11, [sp, #16]
	ldp d12, d13, [sp, #32]
	ldp d14, d15, [sp, #48]
	add sp, sp, #64

	add x_len, x_len, #64
	cmp x_pos, x_len
	beq .return_pass

.Lloop16_init:
	sub x_len, x_len, #16
	cmp x_pos, x_len
	bgt .lessthan16_init

.Lloop16:
	movi v_p1_0.16b, #0
	movi v_p2_0.16b, #0
	movi v_p3_0.16b, #0
	mov x_tbl1, x_tbl
	add x_tbl2, x_tbl1, x_vec, lsl #2
	add x_tbl3, x_tbl2, x_vec, lsl #2
	mov x_vec_i, #0

.Lloop16_vects:
	ldr x_ptr, [x_src, x_vec_i]
	add x_vec_i, x_vec_i, #8
	ldr q_data, [x_ptr, x_pos]

	ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32

	and v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr v_data_hi.16b, v_data.16b, #4

	tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b

	eor v_gft1_lo.16b, v_gft1_hi.16b, v_gft1_lo.16b
	eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
	eor v_gft2_lo.16b, v_gft2_hi.16b, v_gft2_lo.16b
	eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
	eor v_gft3_lo.16b, v_gft3_hi.16b, v_gft3_lo.16b
	eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b

	cmp x_vec_i, x_vec
	bne .Lloop16_vects

.Lloop16_vects_end:
	str q_p1_0, [x_dest1, x_pos]
	str q_p2_0, [x_dest2, x_pos]
	str q_p3_0, [x_dest3, x_pos]
	add x_pos, x_pos, #16
	cmp x_pos, x_len
	ble .Lloop16

.Lloop16_end:
	sub x_tmp, x_pos, x_len
	cmp x_tmp, #16
	beq .return_pass

.lessthan16_init:
	mov x_pos, x_len
	b .Lloop16

.return_pass:
	mov w_ret, #0
	ret

.return_fail:
	mov w_ret, #1
	ret
erasure_code/aarch64/gf_3vect_mad_neon.S (new file, 381 lines)
@@ -0,0 +1,381 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.
  (BSD license text identical to the header above)
**********************************************************************/
|
||||
.text
|
||||
|
||||
.global gf_3vect_mad_neon
|
||||
.type gf_3vect_mad_neon, %function
|
||||
|
||||
|
||||
/* arguments */
|
||||
x_len .req x0
|
||||
x_vec .req x1
|
||||
x_vec_i .req x2
|
||||
x_tbl .req x3
|
||||
x_src .req x4
|
||||
x_dest .req x5
|
||||
|
||||
/* returns */
|
||||
w_ret .req w0
|
||||
|
||||
/* local variables */
|
||||
x_src_end .req x6
|
||||
x_dest1 .req x7
|
||||
x_dest2 .req x8
|
||||
x_dest3 .req x_dest
|
||||
x_tmp .req x10
|
||||
x_tbl1 .req x11
|
||||
x_tbl2 .req x12
|
||||
x_tbl3 .req x13
|
||||
x_const .req x14
|
||||
|
||||
/* vectors */
|
||||
v_mask0f .req v0
|
||||
v_tmp_lo .req v1
|
||||
v_tmp_hi .req v2
|
||||
v_tmp .req v3
|
||||
q_tmp .req q3
|
||||
|
||||
v_gft1_lo .req v4
|
||||
v_gft1_hi .req v5
|
||||
v_gft2_lo .req v6
|
||||
v_gft2_hi .req v7
|
||||
v_gft3_lo .req v16
|
||||
v_gft3_hi .req v17
|
||||
q_gft1_lo .req q4
|
||||
q_gft1_hi .req q5
|
||||
q_gft2_lo .req q6
|
||||
q_gft2_hi .req q7
|
||||
q_gft3_lo .req q16
|
||||
q_gft3_hi .req q17
|
||||
|
||||
v_data_0 .req v8
|
||||
v_data_1 .req v9
|
||||
v_data_2 .req v10
|
||||
v_data_3 .req v11
|
||||
q_data_0 .req q8
|
||||
q_data_1 .req q9
|
||||
q_data_2 .req q10
|
||||
q_data_3 .req q11
|
||||
|
||||
v_data_0_lo .req v12
|
||||
v_data_1_lo .req v13
|
||||
v_data_2_lo .req v14
|
||||
v_data_3_lo .req v15
|
||||
v_data_0_hi .req v_data_0
|
||||
v_data_1_hi .req v_data_1
|
||||
v_data_2_hi .req v_data_2
|
||||
v_data_3_hi .req v_data_3
|
||||
|
||||
v_d1_0 .req v20
|
||||
v_d1_1 .req v21
|
||||
v_d1_2 .req v22
|
||||
v_d1_3 .req v23
|
||||
v_d2_0 .req v24
|
||||
v_d2_1 .req v25
|
||||
v_d2_2 .req v26
|
||||
v_d2_3 .req v27
|
||||
v_d3_0 .req v28
|
||||
v_d3_1 .req v29
|
||||
v_d3_2 .req v30
|
||||
v_d3_3 .req v31
|
||||
q_d1_0 .req q20
|
||||
q_d1_1 .req q21
|
||||
q_d1_2 .req q22
|
||||
q_d1_3 .req q23
|
||||
q_d2_0 .req q24
|
||||
q_d2_1 .req q25
|
||||
q_d2_2 .req q26
|
||||
q_d2_3 .req q27
|
||||
q_d3_0 .req q28
|
||||
q_d3_1 .req q29
|
||||
q_d3_2 .req q30
|
||||
q_d3_3 .req q31
|
||||
|
||||
v_data .req v21
|
||||
q_data .req q21
|
||||
v_data_lo .req v22
|
||||
v_data_hi .req v23
|
||||
|
||||
gf_3vect_mad_neon:
/* less than 16 bytes, return_fail */
cmp x_len, #16
blt .return_fail

movi v_mask0f.16b, #0x0f
lsl x_vec_i, x_vec_i, #5
lsl x_vec, x_vec, #5
add x_tbl1, x_tbl, x_vec_i
add x_tbl2, x_tbl1, x_vec
add x_tbl3, x_tbl2, x_vec
add x_src_end, x_src, x_len
ldr x_dest1, [x_dest]
ldr x_dest2, [x_dest, #8]
ldr x_dest3, [x_dest, #16]
ldr q_gft1_lo, [x_tbl1]
ldr q_gft1_hi, [x_tbl1, #16]
ldr q_gft2_lo, [x_tbl2]
ldr q_gft2_hi, [x_tbl2, #16]
ldr q_gft3_lo, [x_tbl3]
ldr q_gft3_hi, [x_tbl3, #16]

.Lloop64_init:
/* less than 64 bytes, goto Lloop16_init */
cmp x_len, #64
blt .Lloop16_init

/* save d8 ~ d15 to stack */
sub sp, sp, #64
stp d8, d9, [sp]
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
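/* v8-v15 have callee-saved low halves under AAPCS64, and the 64-byte
 * loop below uses v8-v15 for data, hence the d8-d15 spill */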

sub x_src_end, x_src_end, #64

.Lloop64:
ldr q_data_0, [x_src, #16*0]
ldr q_data_1, [x_src, #16*1]
ldr q_data_2, [x_src, #16*2]
ldr q_data_3, [x_src, #16*3]
add x_src, x_src, #64

ldr q_d1_0, [x_dest1, #16*0]
ldr q_d1_1, [x_dest1, #16*1]
ldr q_d1_2, [x_dest1, #16*2]
ldr q_d1_3, [x_dest1, #16*3]

ldr q_d2_0, [x_dest2, #16*0]
ldr q_d2_1, [x_dest2, #16*1]
ldr q_d2_2, [x_dest2, #16*2]
ldr q_d2_3, [x_dest2, #16*3]

ldr q_d3_0, [x_dest3, #16*0]
ldr q_d3_1, [x_dest3, #16*1]
ldr q_d3_2, [x_dest3, #16*2]
ldr q_d3_3, [x_dest3, #16*3]

and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

ushr v_data_0_hi.16b, v_data_0.16b, #4
ushr v_data_1_hi.16b, v_data_1.16b, #4
ushr v_data_2_hi.16b, v_data_2.16b, #4
ushr v_data_3_hi.16b, v_data_3.16b, #4
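/* GF(2^8) multiply by a constant, one nibble at a time: the low
 * nibble indexes a 16-entry product table (tbl), the high nibble
 * indexes a second table, the partial products combine with eor,
 * and a final eor accumulates into the destination (the "mad" step) */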

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b

str q_d1_0, [x_dest1, #16*0]
str q_d1_1, [x_dest1, #16*1]
str q_d1_2, [x_dest1, #16*2]
str q_d1_3, [x_dest1, #16*3]
add x_dest1, x_dest1, #64

str q_d2_0, [x_dest2, #16*0]
str q_d2_1, [x_dest2, #16*1]
str q_d2_2, [x_dest2, #16*2]
str q_d2_3, [x_dest2, #16*3]
add x_dest2, x_dest2, #64

str q_d3_0, [x_dest3, #16*0]
str q_d3_1, [x_dest3, #16*1]
str q_d3_2, [x_dest3, #16*2]
str q_d3_3, [x_dest3, #16*3]
add x_dest3, x_dest3, #64

cmp x_src, x_src_end
bls .Lloop64

.Lloop64_end:
/* restore d8 ~ d15 */
ldp d8, d9, [sp]
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
add sp, sp, #64
add x_src_end, x_src_end, #64

.Lloop16_init:
sub x_src_end, x_src_end, #16
cmp x_src, x_src_end
bhi .lessthan16_init

.Lloop16:
ldr q_data, [x_src]

ldr q_d1_0, [x_dest1]
ldr q_d2_0, [x_dest2]
ldr q_d3_0, [x_dest3]

and v_data_lo.16b, v_data.16b, v_mask0f.16b
ushr v_data_hi.16b, v_data.16b, #4

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

str q_d1_0, [x_dest1]
str q_d2_0, [x_dest2]
str q_d3_0, [x_dest3]

add x_src, x_src, #16
add x_dest1, x_dest1, #16
add x_dest2, x_dest2, #16
add x_dest3, x_dest3, #16
cmp x_src, x_src_end
bls .Lloop16

.lessthan16_init:
sub x_tmp, x_src, x_src_end
cmp x_tmp, #16
beq .return_pass

.lessthan16:
mov x_src, x_src_end
sub x_dest1, x_dest1, x_tmp
sub x_dest2, x_dest2, x_tmp
sub x_dest3, x_dest3, x_tmp

ldr x_const, =const_tbl
sub x_const, x_const, x_tmp
ldr q_tmp, [x_const, #16]
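/* const_tbl is 16 zero bytes followed by 16 0xff bytes; loading at
 * const_tbl + 16 - x_tmp yields a mask whose first x_tmp bytes are
 * zero. The final 16-byte block overlaps bytes already processed,
 * and masking their contribution to zero keeps the xor-accumulate
 * below from applying them twice. */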

ldr q_data, [x_src]
ldr q_d1_0, [x_dest1]
ldr q_d2_0, [x_dest2]
ldr q_d3_0, [x_dest3]

and v_data_lo.16b, v_data.16b, v_mask0f.16b
ushr v_data_hi.16b, v_data.16b, #4

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

str q_d1_0, [x_dest1]
str q_d2_0, [x_dest2]
str q_d3_0, [x_dest3]

.return_pass:
mov w_ret, #0
ret

.return_fail:
mov w_ret, #1
ret

.section .data
.balign 8
const_tbl:
.dword 0x0000000000000000, 0x0000000000000000
.dword 0xffffffffffffffff, 0xffffffffffffffff
421
erasure_code/aarch64/gf_4vect_dot_prod_neon.S
Normal file
@@ -0,0 +1,421 @@
/**************************************************************
Copyright (c) 2019 Huawei Technologies Co., Ltd.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Huawei Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
.text

.global gf_4vect_dot_prod_neon
.type gf_4vect_dot_prod_neon, %function


/* arguments */
x_len .req x0
x_vec .req x1
x_tbl .req x2
x_src .req x3
x_dest .req x4

/* returns */
w_ret .req w0

/* local variables */
x_vec_i .req x5
x_ptr .req x6
x_pos .req x7
x_tmp .req x8
x_dest1 .req x9
x_tbl1 .req x10
x_dest2 .req x11
x_tbl2 .req x12
x_dest3 .req x13
x_tbl3 .req x14
x_dest4 .req x_dest
x_tbl4 .req x15

/* vectors */
v_mask0f .req v0
q_mask0f .req q0
v_tmp1_lo .req v1
v_tmp1_hi .req v2
v_tmp1 .req v3
q_tmp1 .req q3

v_p1_0 .req v4
v_p2_0 .req v5
v_p3_0 .req v6
v_p4_0 .req v7

q_p1_0 .req q4
q_p2_0 .req q5
q_p3_0 .req q6
q_p4_0 .req q7

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11

v_p1_3 .req v12
v_p2_3 .req v13
v_p3_3 .req v14
v_p4_3 .req v15
q_p1_3 .req q12
q_p2_3 .req q13
q_p3_3 .req q14
q_p4_3 .req q15

v_gft1_lo .req v16
v_gft1_hi .req v17
v_gft2_lo .req v18
v_gft2_hi .req v19
v_gft3_lo .req v20
v_gft3_hi .req v21
v_gft4_lo .req v22
v_gft4_hi .req v23
q_gft1_lo .req q16
q_gft1_hi .req q17
q_gft2_lo .req q18
q_gft2_hi .req q19
q_gft3_lo .req q20
q_gft3_hi .req q21
q_gft4_lo .req q22
q_gft4_hi .req q23

v_p1_1 .req v24
v_p1_2 .req v25
v_p2_1 .req v26
v_p2_2 .req v27
v_p3_1 .req v28
v_p3_2 .req v29
v_p4_1 .req v30
v_p4_2 .req v31

q_p1_1 .req q24
q_p1_2 .req q25
q_p2_1 .req q26
q_p2_2 .req q27
q_p3_1 .req q28
q_p3_2 .req q29
q_p4_1 .req q30
q_p4_2 .req q31

v_data .req v_tmp1
q_data .req q_tmp1
v_data_lo .req v_tmp1_lo
v_data_hi .req v_tmp1_hi

gf_4vect_dot_prod_neon:
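/* Dot product across x_vec sources into four outputs: each source
 * byte stream is multiplied (GF(2^8), nibble tables) by a per-source,
 * per-output constant and xor-accumulated. x_vec is pre-scaled by 8
 * below, so "x_vec, lsl #2" strides one output's block of 32-byte
 * tables. */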
/* less than 16 bytes, return_fail */
cmp x_len, #16
blt .return_fail

movi v_mask0f.16b, #0x0f
mov x_pos, #0
lsl x_vec, x_vec, #3
ldr x_dest1, [x_dest, #8*0]
ldr x_dest2, [x_dest, #8*1]
ldr x_dest3, [x_dest, #8*2]
ldr x_dest4, [x_dest, #8*3]

.Lloop64_init:
/* less than 64 bytes, goto Lloop16_init */
cmp x_len, #64
blt .Lloop16_init

/* save d8 ~ d15 to stack */
sub sp, sp, #64
stp d8, d9, [sp]
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]

sub x_len, x_len, #64

.Lloop64:
movi v_p1_0.16b, #0
movi v_p1_1.16b, #0
movi v_p1_2.16b, #0
movi v_p1_3.16b, #0
movi v_p2_0.16b, #0
movi v_p2_1.16b, #0
movi v_p2_2.16b, #0
movi v_p2_3.16b, #0
movi v_p3_0.16b, #0
movi v_p3_1.16b, #0
movi v_p3_2.16b, #0
movi v_p3_3.16b, #0
movi v_p4_0.16b, #0
movi v_p4_1.16b, #0
movi v_p4_2.16b, #0
movi v_p4_3.16b, #0

mov x_tbl1, x_tbl
add x_tbl2, x_tbl1, x_vec, lsl #2
add x_tbl3, x_tbl2, x_vec, lsl #2
add x_tbl4, x_tbl3, x_vec, lsl #2
mov x_vec_i, #0
prfm pldl1keep, [x_tbl1]
prfm pldl1keep, [x_tbl2]
prfm pldl1keep, [x_tbl3]
prfm pldl1keep, [x_tbl4]
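/* warm the first table lines in L1 before entering the inner loop,
 * so the ldp table loads below are more likely to hit cache */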

.Lloop64_vects:
ldr x_ptr, [x_src, x_vec_i]
add x_vec_i, x_vec_i, #8
add x_ptr, x_ptr, x_pos

ldr q_data_0, [x_ptr], #16
ldr q_data_1, [x_ptr], #16
ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
ldr q_data_2, [x_ptr], #16
ldr q_data_3, [x_ptr], #16

prfm pldl1strm, [x_ptr]
prfm pldl1keep, [x_tbl1]
prfm pldl1keep, [x_tbl2]
prfm pldl1keep, [x_tbl3]
prfm pldl1keep, [x_tbl4]

/* data_0 */
and v_tmp1.16b, v_data_0.16b, v_mask0f.16b
ushr v_data_0.16b, v_data_0.16b, #4

tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b

/* data_1 */
and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
ushr v_data_1.16b, v_data_1.16b, #4

tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b

/* data_2 */
and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
ushr v_data_2.16b, v_data_2.16b, #4

tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b

/* data_3 */
and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
ushr v_data_3.16b, v_data_3.16b, #4

tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b

tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
eor v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b

cmp x_vec_i, x_vec
blt .Lloop64_vects

.Lloop64_vects_end:
add x_ptr, x_dest1, x_pos
stp q_p1_0, q_p1_1, [x_ptr], #32
stp q_p1_2, q_p1_3, [x_ptr]

add x_ptr, x_dest2, x_pos
stp q_p2_0, q_p2_1, [x_ptr], #32
stp q_p2_2, q_p2_3, [x_ptr]

add x_ptr, x_dest3, x_pos
stp q_p3_0, q_p3_1, [x_ptr], #32
stp q_p3_2, q_p3_3, [x_ptr]

add x_ptr, x_dest4, x_pos
stp q_p4_0, q_p4_1, [x_ptr], #32
stp q_p4_2, q_p4_3, [x_ptr]

add x_pos, x_pos, #64
cmp x_pos, x_len
ble .Lloop64

.Lloop64_end:
/* restore d8 ~ d15 */
ldp d8, d9, [sp]
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
add sp, sp, #64

add x_len, x_len, #64
cmp x_pos, x_len
beq .return_pass

.Lloop16_init:
sub x_len, x_len, #16
cmp x_pos, x_len
bgt .lessthan16_init

.Lloop16:
movi v_p1_0.16b, #0
movi v_p2_0.16b, #0
movi v_p3_0.16b, #0
movi v_p4_0.16b, #0
mov x_tbl1, x_tbl
add x_tbl2, x_tbl1, x_vec, lsl #2
add x_tbl3, x_tbl2, x_vec, lsl #2
add x_tbl4, x_tbl3, x_vec, lsl #2
mov x_vec_i, #0

.Lloop16_vects:
ldr x_ptr, [x_src, x_vec_i]
add x_vec_i, x_vec_i, #8
ldr q_data, [x_ptr, x_pos]

ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32

prfm pldl1keep, [x_tbl1]
prfm pldl1keep, [x_tbl2]
prfm pldl1keep, [x_tbl3]
prfm pldl1keep, [x_tbl4]

and v_data_lo.16b, v_data.16b, v_mask0f.16b
ushr v_data_hi.16b, v_data.16b, #4
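/* the tbl results may overwrite the v_gftN registers in place,
 * since the tables are reloaded from memory every iteration */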

tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b

eor v_gft1_lo.16b, v_gft1_hi.16b, v_gft1_lo.16b
eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
eor v_gft2_lo.16b, v_gft2_hi.16b, v_gft2_lo.16b
eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
eor v_gft3_lo.16b, v_gft3_hi.16b, v_gft3_lo.16b
eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
eor v_gft4_lo.16b, v_gft4_hi.16b, v_gft4_lo.16b
eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b

cmp x_vec_i, x_vec
bne .Lloop16_vects

.Lloop16_vects_end:
str q_p1_0, [x_dest1, x_pos]
str q_p2_0, [x_dest2, x_pos]
str q_p3_0, [x_dest3, x_pos]
str q_p4_0, [x_dest4, x_pos]
add x_pos, x_pos, #16
cmp x_pos, x_len
ble .Lloop16

.Lloop16_end:
sub x_tmp, x_pos, x_len
cmp x_tmp, #16
beq .return_pass

.lessthan16_init:
mov x_pos, x_len
b .Lloop16

.return_pass:
mov w_ret, #0
ret

.return_fail:
mov w_ret, #1
ret
455
erasure_code/aarch64/gf_4vect_mad_neon.S
Normal file
@@ -0,0 +1,455 @@
/**************************************************************
Copyright (c) 2019 Huawei Technologies Co., Ltd.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Huawei Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

.text

.global gf_4vect_mad_neon
.type gf_4vect_mad_neon, %function


/* arguments */
x_len .req x0
x_vec .req x1
x_vec_i .req x2
x_tbl .req x3
x_src .req x4
x_dest .req x5

/* returns */
w_ret .req w0

/* local variables */
x_src_end .req x6
x_dest1 .req x7
x_dest2 .req x8
x_dest3 .req x9
x_dest4 .req x_dest
x_tmp .req x10
x_tbl1 .req x11
x_tbl2 .req x12
x_tbl3 .req x13
x_tbl4 .req x14
x_const .req x15

/* vectors */
v_mask0f .req v0
v_tmp_lo .req v1
v_tmp_hi .req v2
v_tmp .req v3
q_tmp .req q3

v_gft1_lo .req v4
v_gft1_hi .req v5
v_gft2_lo .req v6
v_gft2_hi .req v7
v_gft3_lo .req v16
v_gft3_hi .req v17
v_gft4_lo .req v18
v_gft4_hi .req v19
q_gft1_lo .req q4
q_gft1_hi .req q5
q_gft2_lo .req q6
q_gft2_hi .req q7
q_gft3_lo .req q16
q_gft3_hi .req q17
q_gft4_lo .req q18
q_gft4_hi .req q19

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11

v_data_0_lo .req v12
v_data_1_lo .req v13
v_data_2_lo .req v14
v_data_3_lo .req v15
v_data_0_hi .req v_data_0
v_data_1_hi .req v_data_1
v_data_2_hi .req v_data_2
v_data_3_hi .req v_data_3

v_d1_0 .req v20
v_d1_1 .req v21
v_d1_2 .req v22
v_d1_3 .req v23
v_d2_0 .req v24
v_d2_1 .req v25
v_d2_2 .req v26
v_d2_3 .req v27
v_d3_0 .req v28
v_d3_1 .req v29
v_d3_2 .req v30
v_d3_3 .req v31
q_d1_0 .req q20
q_d1_1 .req q21
q_d1_2 .req q22
q_d1_3 .req q23
q_d2_0 .req q24
q_d2_1 .req q25
q_d2_2 .req q26
q_d2_3 .req q27
q_d3_0 .req q28
q_d3_1 .req q29
q_d3_2 .req q30
q_d3_3 .req q31

v_d4_0 .req v_d1_0
v_d4_1 .req v_d1_1
v_d4_2 .req v_d1_2
v_d4_3 .req v_d1_3
q_d4_0 .req q_d1_0
q_d4_1 .req q_d1_1
q_d4_2 .req q_d1_2
q_d4_3 .req q_d1_3

v_data .req v21
q_data .req q21
v_data_lo .req v22
v_data_hi .req v23

gf_4vect_mad_neon:
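/* Same nibble-table multiply-and-add as gf_3vect_mad_neon, but with
 * four outputs; v_d4_* alias v_d1_*, so dest1/dest2 are computed and
 * stored before dest3/dest4 are loaded into the same registers. */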
/* less than 16 bytes, return_fail */
cmp x_len, #16
blt .return_fail

movi v_mask0f.16b, #0x0f
lsl x_vec_i, x_vec_i, #5
lsl x_vec, x_vec, #5
add x_tbl1, x_tbl, x_vec_i
add x_tbl2, x_tbl1, x_vec
add x_tbl3, x_tbl2, x_vec
add x_tbl4, x_tbl3, x_vec
add x_src_end, x_src, x_len
ldr x_dest1, [x_dest, #8*0]
ldr x_dest2, [x_dest, #8*1]
ldr x_dest3, [x_dest, #8*2]
ldr x_dest4, [x_dest, #8*3]
ldr q_gft1_lo, [x_tbl1]
ldr q_gft1_hi, [x_tbl1, #16]
ldr q_gft2_lo, [x_tbl2]
ldr q_gft2_hi, [x_tbl2, #16]
ldr q_gft3_lo, [x_tbl3]
ldr q_gft3_hi, [x_tbl3, #16]
ldr q_gft4_lo, [x_tbl4]
ldr q_gft4_hi, [x_tbl4, #16]

.Lloop64_init:
/* less than 64 bytes, goto Lloop16_init */
cmp x_len, #64
blt .Lloop16_init

/* save d8 ~ d15 to stack */
sub sp, sp, #64
stp d8, d9, [sp]
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]

sub x_src_end, x_src_end, #64

.Lloop64:
ldr q_data_0, [x_src, #16*0]
ldr q_data_1, [x_src, #16*1]
ldr q_data_2, [x_src, #16*2]
ldr q_data_3, [x_src, #16*3]
add x_src, x_src, #64

ldr q_d1_0, [x_dest1, #16*0]
ldr q_d1_1, [x_dest1, #16*1]
ldr q_d1_2, [x_dest1, #16*2]
ldr q_d1_3, [x_dest1, #16*3]

ldr q_d2_0, [x_dest2, #16*0]
ldr q_d2_1, [x_dest2, #16*1]
ldr q_d2_2, [x_dest2, #16*2]
ldr q_d2_3, [x_dest2, #16*3]

and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

ushr v_data_0_hi.16b, v_data_0.16b, #4
ushr v_data_1_hi.16b, v_data_1.16b, #4
ushr v_data_2_hi.16b, v_data_2.16b, #4
ushr v_data_3_hi.16b, v_data_3.16b, #4

/* dest1 */
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b

/* dest2 */
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b

str q_d1_0, [x_dest1, #16*0]
str q_d1_1, [x_dest1, #16*1]
str q_d1_2, [x_dest1, #16*2]
str q_d1_3, [x_dest1, #16*3]
add x_dest1, x_dest1, #64

str q_d2_0, [x_dest2, #16*0]
str q_d2_1, [x_dest2, #16*1]
str q_d2_2, [x_dest2, #16*2]
str q_d2_3, [x_dest2, #16*3]
add x_dest2, x_dest2, #64

ldr q_d3_0, [x_dest3, #16*0]
ldr q_d3_1, [x_dest3, #16*1]
ldr q_d3_2, [x_dest3, #16*2]
ldr q_d3_3, [x_dest3, #16*3]

ldr q_d4_0, [x_dest4, #16*0]
ldr q_d4_1, [x_dest4, #16*1]
ldr q_d4_2, [x_dest4, #16*2]
ldr q_d4_3, [x_dest4, #16*3]

/* dest3 */
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b

/* dest4 */
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b

str q_d3_0, [x_dest3, #16*0]
str q_d3_1, [x_dest3, #16*1]
str q_d3_2, [x_dest3, #16*2]
str q_d3_3, [x_dest3, #16*3]
add x_dest3, x_dest3, #64

str q_d4_0, [x_dest4, #16*0]
str q_d4_1, [x_dest4, #16*1]
str q_d4_2, [x_dest4, #16*2]
str q_d4_3, [x_dest4, #16*3]
add x_dest4, x_dest4, #64

cmp x_src, x_src_end
bls .Lloop64

.Lloop64_end:
/* restore d8 ~ d15 */
ldp d8, d9, [sp]
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
add sp, sp, #64
add x_src_end, x_src_end, #64

.Lloop16_init:
sub x_src_end, x_src_end, #16
cmp x_src, x_src_end
bhi .lessthan16_init

.Lloop16:
ldr q_data, [x_src]

ldr q_d1_0, [x_dest1]
ldr q_d2_0, [x_dest2]

and v_data_lo.16b, v_data.16b, v_mask0f.16b
ushr v_data_hi.16b, v_data.16b, #4

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

str q_d1_0, [x_dest1]
str q_d2_0, [x_dest2]
ldr q_d3_0, [x_dest3]
ldr q_d4_0, [x_dest4]

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

str q_d3_0, [x_dest3]
str q_d4_0, [x_dest4]

add x_src, x_src, #16
add x_dest1, x_dest1, #16
add x_dest2, x_dest2, #16
add x_dest3, x_dest3, #16
add x_dest4, x_dest4, #16
cmp x_src, x_src_end
bls .Lloop16

.lessthan16_init:
sub x_tmp, x_src, x_src_end
cmp x_tmp, #16
beq .return_pass

.lessthan16:
mov x_src, x_src_end
sub x_dest1, x_dest1, x_tmp
sub x_dest2, x_dest2, x_tmp
sub x_dest3, x_dest3, x_tmp
sub x_dest4, x_dest4, x_tmp

ldr x_const, =const_tbl
sub x_const, x_const, x_tmp
ldr q_tmp, [x_const, #16]
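/* same tail trick as gf_3vect_mad_neon: q_tmp masks off the first
 * x_tmp bytes of the overlapping final block so their updates are
 * not xor-ed in a second time */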

ldr q_data, [x_src]
ldr q_d1_0, [x_dest1]
ldr q_d2_0, [x_dest2]

and v_data_lo.16b, v_data.16b, v_mask0f.16b
ushr v_data_hi.16b, v_data.16b, #4

tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

str q_d1_0, [x_dest1]
str q_d2_0, [x_dest2]
ldr q_d3_0, [x_dest3]
ldr q_d4_0, [x_dest4]

tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

str q_d3_0, [x_dest3]
str q_d4_0, [x_dest4]

.return_pass:
mov w_ret, #0
ret

.return_fail:
mov w_ret, #1
ret

.section .data
.balign 8
const_tbl:
.dword 0x0000000000000000, 0x0000000000000000
.dword 0xffffffffffffffff, 0xffffffffffffffff
481
erasure_code/aarch64/gf_5vect_dot_prod_neon.S
Normal file
@@ -0,0 +1,481 @@
/**************************************************************
Copyright (c) 2019 Huawei Technologies Co., Ltd.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Huawei Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

.text

.global gf_5vect_dot_prod_neon
.type gf_5vect_dot_prod_neon, %function


/* arguments */
x_len .req x0
x_vec .req x1
x_tbl .req x2
x_src .req x3
x_dest .req x4

/* returns */
w_ret .req w0

/* local variables */
x_vec_i .req x5
x_ptr .req x6
x_pos .req x7
x_tmp .req x8
x_dest1 .req x9
x_dest2 .req x10
x_dest3 .req x11
x_dest4 .req x12
x_dest5 .req x13

/* vectors */
v_tmp1 .req v0
q_tmp1 .req q0
v_tmp2 .req v1
q_tmp2 .req q1

v_mask0f .req v_tmp1
q_mask0f .req q_tmp1
v_tmp_lo .req v_tmp1
v_tmp_hi .req v_tmp2

v_gft_lo .req v2
v_gft_hi .req v3
q_gft_lo .req q2
q_gft_hi .req q3

v_p1_0 .req v4
v_p2_0 .req v5
v_p3_0 .req v6
v_p4_0 .req v7

q_p1_0 .req q4
q_p2_0 .req q5
q_p3_0 .req q6
q_p4_0 .req q7

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11

v_data_0_lo .req v12
v_data_1_lo .req v13
v_data_2_lo .req v14
v_data_3_lo .req v15
v_data_0_hi .req v_data_0
v_data_1_hi .req v_data_1
v_data_2_hi .req v_data_2
v_data_3_hi .req v_data_3

v_p5_0 .req v16
v_p1_1 .req v17
v_p2_1 .req v18
v_p3_1 .req v19
v_p4_1 .req v20
v_p5_1 .req v21
v_p1_2 .req v22
v_p2_2 .req v23
v_p3_2 .req v24
v_p4_2 .req v25
v_p5_2 .req v26
v_p1_3 .req v27
v_p2_3 .req v28
v_p3_3 .req v29
v_p4_3 .req v30
v_p5_3 .req v31

q_p5_0 .req q16
q_p1_1 .req q17
q_p2_1 .req q18
q_p3_1 .req q19
q_p4_1 .req q20
q_p5_1 .req q21
q_p1_2 .req q22
q_p2_2 .req q23
q_p3_2 .req q24
q_p4_2 .req q25
q_p5_2 .req q26
q_p1_3 .req q27
q_p2_3 .req q28
q_p3_3 .req q29
q_p4_3 .req q30
q_p5_3 .req q31

v_data .req v_p1_1
q_data .req q_p1_1
v_data_lo .req v_p2_1
v_data_hi .req v_p3_1

v_gft1_lo .req v_p4_1
v_gft1_hi .req v_p5_1
v_gft2_lo .req v_p1_2
v_gft2_hi .req v_p2_2
v_gft3_lo .req v_p3_2
v_gft3_hi .req v_p4_2
v_gft4_lo .req v_p5_2
v_gft4_hi .req v_p1_3
v_gft5_lo .req v_p2_3
v_gft5_hi .req v_p3_3
q_gft1_lo .req q_p4_1
q_gft1_hi .req q_p5_1
q_gft2_lo .req q_p1_2
q_gft2_hi .req q_p2_2
q_gft3_lo .req q_p3_2
q_gft3_hi .req q_p4_2
q_gft4_lo .req q_p5_2
q_gft4_hi .req q_p1_3
q_gft5_lo .req q_p2_3
q_gft5_hi .req q_p3_3


gf_5vect_dot_prod_neon:
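/* Five outputs need 20 accumulators plus four data registers, which
 * exhausts v0-v31, so a single v_gft_lo/v_gft_hi pair is reloaded per
 * output inside the loop instead of keeping all tables resident; for
 * the same reason v_mask0f shares v_tmp1 and is re-materialized each
 * pass. */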
/* less than 16 bytes, return_fail */
cmp x_len, #16
blt .return_fail

mov x_pos, #0
lsl x_vec, x_vec, #3
ldr x_dest1, [x_dest, #8*0]
ldr x_dest2, [x_dest, #8*1]
ldr x_dest3, [x_dest, #8*2]
ldr x_dest4, [x_dest, #8*3]
ldr x_dest5, [x_dest, #8*4]

.Lloop64_init:
/* less than 64 bytes, goto Lloop16_init */
cmp x_len, #64
blt .Lloop16_init

/* save d8 ~ d15 to stack */
sub sp, sp, #64
stp d8, d9, [sp]
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]

sub x_len, x_len, #64

.Lloop64:
movi v_p1_0.16b, #0
movi v_p1_1.16b, #0
movi v_p1_2.16b, #0
movi v_p1_3.16b, #0
movi v_p2_0.16b, #0
movi v_p2_1.16b, #0
movi v_p2_2.16b, #0
movi v_p2_3.16b, #0
movi v_p3_0.16b, #0
movi v_p3_1.16b, #0
movi v_p3_2.16b, #0
movi v_p3_3.16b, #0
movi v_p4_0.16b, #0
movi v_p4_1.16b, #0
movi v_p4_2.16b, #0
movi v_p4_3.16b, #0
movi v_p5_0.16b, #0
movi v_p5_1.16b, #0
movi v_p5_2.16b, #0
movi v_p5_3.16b, #0
mov x_vec_i, #0

.Lloop64_vects:
ldr x_ptr, [x_src, x_vec_i]
add x_ptr, x_ptr, x_pos

ldr q_data_0, [x_ptr], #16
ldr q_data_1, [x_ptr], #16
ldr q_data_2, [x_ptr], #16
ldr q_data_3, [x_ptr], #16
prfm pldl2keep, [x_ptr]

movi v_mask0f.16b, #0x0f
and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
ushr v_data_0_hi.16b, v_data_0.16b, #4
ushr v_data_1_hi.16b, v_data_1.16b, #4
ushr v_data_2_hi.16b, v_data_2.16b, #4
ushr v_data_3_hi.16b, v_data_3.16b, #4

/* v_p1_x */
add x_tmp, x_tbl, x_vec_i, lsl #2
add x_vec_i, x_vec_i, #8
ldp q_gft_lo, q_gft_hi, [x_tmp]
prfm pldl3keep, [x_tmp, #32]
add x_tmp, x_tmp, x_vec, lsl #2
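/* x_vec holds vec*8, so "x_vec, lsl #2" steps x_tmp by vec*32 bytes:
 * one output's worth of 32-byte nibble tables */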

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b

/* v_p2_x */
ldp q_gft_lo, q_gft_hi, [x_tmp]
prfm pldl3keep, [x_tmp, #32]
add x_tmp, x_tmp, x_vec, lsl #2

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b

/* v_p3_x */
ldp q_gft_lo, q_gft_hi, [x_tmp]
prfm pldl3keep, [x_tmp, #32]
add x_tmp, x_tmp, x_vec, lsl #2

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b

/* v_p4_x */
ldp q_gft_lo, q_gft_hi, [x_tmp]
prfm pldl3keep, [x_tmp, #32]
add x_tmp, x_tmp, x_vec, lsl #2

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b

/* v_p5_x */
ldp q_gft_lo, q_gft_hi, [x_tmp]
prfm pldl3keep, [x_tmp, #32]

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b

tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
eor v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b

cmp x_vec_i, x_vec
blt .Lloop64_vects

.Lloop64_vects_end:
add x_ptr, x_dest1, x_pos
stp q_p1_0, q_p1_1, [x_ptr], #32
stp q_p1_2, q_p1_3, [x_ptr]

add x_ptr, x_dest2, x_pos
stp q_p2_0, q_p2_1, [x_ptr], #32
stp q_p2_2, q_p2_3, [x_ptr]

add x_ptr, x_dest3, x_pos
stp q_p3_0, q_p3_1, [x_ptr], #32
stp q_p3_2, q_p3_3, [x_ptr]

add x_ptr, x_dest4, x_pos
stp q_p4_0, q_p4_1, [x_ptr], #32
stp q_p4_2, q_p4_3, [x_ptr]

add x_ptr, x_dest5, x_pos
stp q_p5_0, q_p5_1, [x_ptr], #32
stp q_p5_2, q_p5_3, [x_ptr]

add x_pos, x_pos, #64
cmp x_pos, x_len
ble .Lloop64

.Lloop64_end:
/* restore d8 ~ d15 */
ldp d8, d9, [sp]
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
add sp, sp, #64

add x_len, x_len, #64
cmp x_pos, x_len
beq .return_pass

.Lloop16_init:
sub x_len, x_len, #16
cmp x_pos, x_len
bgt .lessthan16_init

.Lloop16:
movi v_p1_0.16b, #0
movi v_p2_0.16b, #0
movi v_p3_0.16b, #0
movi v_p4_0.16b, #0
movi v_p5_0.16b, #0
mov x_vec_i, #0

.Lloop16_vects:
ldr x_ptr, [x_src, x_vec_i]
ldr q_data, [x_ptr, x_pos]

movi v_mask0f.16b, #0x0f
and v_data_lo.16b, v_data.16b, v_mask0f.16b
ushr v_data_hi.16b, v_data.16b, #4

add x_tmp, x_tbl, x_vec_i, lsl #2
add x_vec_i, x_vec_i, #8
ldp q_gft1_lo, q_gft1_hi, [x_tmp]
add x_tmp, x_tmp, x_vec, lsl #2
ldp q_gft2_lo, q_gft2_hi, [x_tmp]
add x_tmp, x_tmp, x_vec, lsl #2
ldp q_gft3_lo, q_gft3_hi, [x_tmp]
add x_tmp, x_tmp, x_vec, lsl #2
ldp q_gft4_lo, q_gft4_hi, [x_tmp]
add x_tmp, x_tmp, x_vec, lsl #2
ldp q_gft5_lo, q_gft5_hi, [x_tmp]

tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
tbl v_gft5_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
tbl v_gft5_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b

eor v_gft1_lo.16b, v_gft1_hi.16b, v_gft1_lo.16b
eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
eor v_gft2_lo.16b, v_gft2_hi.16b, v_gft2_lo.16b
eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
eor v_gft3_lo.16b, v_gft3_hi.16b, v_gft3_lo.16b
eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
eor v_gft4_lo.16b, v_gft4_hi.16b, v_gft4_lo.16b
eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
eor v_gft5_lo.16b, v_gft5_hi.16b, v_gft5_lo.16b
eor v_p5_0.16b, v_p5_0.16b, v_gft5_lo.16b

cmp x_vec_i, x_vec
bne .Lloop16_vects

.Lloop16_vects_end:
str q_p1_0, [x_dest1, x_pos]
str q_p2_0, [x_dest2, x_pos]
str q_p3_0, [x_dest3, x_pos]
str q_p4_0, [x_dest4, x_pos]
str q_p5_0, [x_dest5, x_pos]
add x_pos, x_pos, #16
cmp x_pos, x_len
ble .Lloop16

.Lloop16_end:
sub x_tmp, x_pos, x_len
cmp x_tmp, #16
beq .return_pass

.lessthan16_init:
mov x_pos, x_len
b .Lloop16

.return_pass:
mov w_ret, #0
ret

.return_fail:
mov w_ret, #1
ret
534
erasure_code/aarch64/gf_5vect_mad_neon.S
Normal file
@@ -0,0 +1,534 @@
|
||||
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
.text

.global gf_5vect_mad_neon
.type gf_5vect_mad_neon, %function


/* arguments */
x_len .req x0
x_vec .req x1
x_vec_i .req x2
x_tbl .req x3
x_src .req x4
x_dest .req x5

/* returns */
w_ret .req w0

/* local variables */
x_src_end .req x6
x_dest1 .req x7
x_dest2 .req x8
x_dest3 .req x9
x_dest4 .req x10
x_dest5 .req x_dest
x_tmp .req x11
x_tbl1 .req x12
x_tbl2 .req x13
x_tbl3 .req x14
x_tbl4 .req x15
x_tbl5 .req x16
x_const .req x17

/* vectors */
v_mask0f .req v0
v_tmp_lo .req v1
v_tmp_hi .req v2
v_tmp .req v3
q_tmp .req q3

v_gft1_lo .req v4
v_gft1_hi .req v5
v_gft2_lo .req v6
v_gft2_hi .req v7
v_gft3_lo .req v16
v_gft3_hi .req v17
q_gft1_lo .req q4
q_gft1_hi .req q5
q_gft2_lo .req q6
q_gft2_hi .req q7
q_gft3_lo .req q16
q_gft3_hi .req q17

v_gft4_lo .req v18
v_gft4_hi .req v19
q_gft4_lo .req q18
q_gft4_hi .req q19
v_gft5_lo .req v_gft2_lo
v_gft5_hi .req v_gft2_hi
q_gft5_lo .req q_gft2_lo
q_gft5_hi .req q_gft2_hi

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11

v_data_0_lo .req v12
v_data_1_lo .req v13
v_data_2_lo .req v14
v_data_3_lo .req v15
v_data_0_hi .req v_data_0
v_data_1_hi .req v_data_1
v_data_2_hi .req v_data_2
v_data_3_hi .req v_data_3

v_d1_0 .req v20
v_d1_1 .req v21
v_d1_2 .req v22
v_d1_3 .req v23
v_d2_0 .req v24
v_d2_1 .req v25
v_d2_2 .req v26
v_d2_3 .req v27
v_d3_0 .req v28
v_d3_1 .req v29
v_d3_2 .req v30
v_d3_3 .req v31
q_d1_0 .req q20
q_d1_1 .req q21
q_d1_2 .req q22
q_d1_3 .req q23
q_d2_0 .req q24
q_d2_1 .req q25
q_d2_2 .req q26
q_d2_3 .req q27
q_d3_0 .req q28
q_d3_1 .req q29
q_d3_2 .req q30
q_d3_3 .req q31
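
/* Only 32 NEON registers exist, so the dest4/dest5 accumulators alias
   the dest1/dest2 registers: dest1..dest3 results are stored back to
   memory before the dest4/dest5 blocks are loaded into the same
   registers. */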
v_d4_0 .req v_d1_0
v_d4_1 .req v_d1_1
v_d4_2 .req v_d1_2
v_d4_3 .req v_d1_3
q_d4_0 .req q_d1_0
q_d4_1 .req q_d1_1
q_d4_2 .req q_d1_2
q_d4_3 .req q_d1_3
v_d5_0 .req v_d2_0
v_d5_1 .req v_d2_1
v_d5_2 .req v_d2_2
v_d5_3 .req v_d2_3
q_d5_0 .req q_d2_0
q_d5_1 .req q_d2_1
q_d5_2 .req q_d2_2
q_d5_3 .req q_d2_3

v_data .req v21
q_data .req q21
v_data_lo .req v22
v_data_hi .req v23
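
/* gf_5vect_mad_neon(len, vec, vec_i, gftbl, src, dest):
   multiply one source block by five GF(2^8) coefficients and XOR the
   products into five parity buffers (dest is an array of pointers).
   Each (source, dest) pair owns a 32-byte slot in gftbl -- 16 bytes of
   low-nibble products followed by 16 bytes of high-nibble products --
   hence the lsl #5 scaling of vec_i and vec below. */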
gf_5vect_mad_neon:
        /* less than 16 bytes, return_fail */
        cmp x_len, #16
        blt .return_fail

        movi v_mask0f.16b, #0x0f
        lsl x_vec_i, x_vec_i, #5
        lsl x_vec, x_vec, #5
        add x_tbl1, x_tbl, x_vec_i
        add x_tbl2, x_tbl1, x_vec
        add x_tbl3, x_tbl2, x_vec
        add x_tbl4, x_tbl3, x_vec
        add x_tbl5, x_tbl4, x_vec
        add x_src_end, x_src, x_len
        ldr x_dest1, [x_dest, #8*0]
        ldr x_dest2, [x_dest, #8*1]
        ldr x_dest3, [x_dest, #8*2]
        ldr x_dest4, [x_dest, #8*3]
        ldr x_dest5, [x_dest, #8*4]
        ldr q_gft1_lo, [x_tbl1]
        ldr q_gft1_hi, [x_tbl1, #16]
        ldr q_gft3_lo, [x_tbl3]
        ldr q_gft3_hi, [x_tbl3, #16]
        ldr q_gft4_lo, [x_tbl4]
        ldr q_gft4_hi, [x_tbl4, #16]

.Lloop64_init:
        /* less than 64 bytes, goto Lloop16_init */
        cmp x_len, #64
        blt .Lloop16_init

        /* save d8 ~ d15 to stack */
        sub sp, sp, #64
        stp d8, d9, [sp]
        stp d10, d11, [sp, #16]
        stp d12, d13, [sp, #32]
        stp d14, d15, [sp, #48]

        sub x_src_end, x_src_end, #64

.Lloop64:
        ldr q_data_0, [x_src, #16*0]
        ldr q_data_1, [x_src, #16*1]
        ldr q_data_2, [x_src, #16*2]
        ldr q_data_3, [x_src, #16*3]
        add x_src, x_src, #64

        ldr q_d1_0, [x_dest1, #16*0]
        ldr q_d1_1, [x_dest1, #16*1]
        ldr q_d1_2, [x_dest1, #16*2]
        ldr q_d1_3, [x_dest1, #16*3]

        ldr q_d2_0, [x_dest2, #16*0]
        ldr q_d2_1, [x_dest2, #16*1]
        ldr q_d2_2, [x_dest2, #16*2]
        ldr q_d2_3, [x_dest2, #16*3]

        ldr q_d3_0, [x_dest3, #16*0]
        ldr q_d3_1, [x_dest3, #16*1]
        ldr q_d3_2, [x_dest3, #16*2]
        ldr q_d3_3, [x_dest3, #16*3]

        ldr q_gft2_lo, [x_tbl2]
        ldr q_gft2_hi, [x_tbl2, #16]

        and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
        and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
        and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
        and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

        ushr v_data_0_hi.16b, v_data_0.16b, #4
        ushr v_data_1_hi.16b, v_data_1.16b, #4
        ushr v_data_2_hi.16b, v_data_2.16b, #4
        ushr v_data_3_hi.16b, v_data_3.16b, #4
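
        /* GF(2^8) multiply via two 4-bit lookups: for each byte x,
           product = gft_lo[x & 0x0f] ^ gft_hi[x >> 4]. tbl does the
           16-byte lookup, the first eor combines the nibble halves,
           and the second eor accumulates into the parity bytes. */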
        /* dest1 */
        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b

        /* dest2 */
        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b

        /* dest3 */
        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b

        str q_d1_0, [x_dest1, #16*0]
        str q_d1_1, [x_dest1, #16*1]
        str q_d1_2, [x_dest1, #16*2]
        str q_d1_3, [x_dest1, #16*3]
        add x_dest1, x_dest1, #64

        str q_d2_0, [x_dest2, #16*0]
        str q_d2_1, [x_dest2, #16*1]
        str q_d2_2, [x_dest2, #16*2]
        str q_d2_3, [x_dest2, #16*3]
        add x_dest2, x_dest2, #64

        str q_d3_0, [x_dest3, #16*0]
        str q_d3_1, [x_dest3, #16*1]
        str q_d3_2, [x_dest3, #16*2]
        str q_d3_3, [x_dest3, #16*3]
        add x_dest3, x_dest3, #64

        ldr q_d4_0, [x_dest4, #16*0]
        ldr q_d4_1, [x_dest4, #16*1]
        ldr q_d4_2, [x_dest4, #16*2]
        ldr q_d4_3, [x_dest4, #16*3]

        ldr q_d5_0, [x_dest5, #16*0]
        ldr q_d5_1, [x_dest5, #16*1]
        ldr q_d5_2, [x_dest5, #16*2]
        ldr q_d5_3, [x_dest5, #16*3]

        ldr q_gft5_lo, [x_tbl5]
        ldr q_gft5_hi, [x_tbl5, #16]

        /* dest4 */
        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b

        /* dest5 */
        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b

        str q_d4_0, [x_dest4, #16*0]
        str q_d4_1, [x_dest4, #16*1]
        str q_d4_2, [x_dest4, #16*2]
        str q_d4_3, [x_dest4, #16*3]
        add x_dest4, x_dest4, #64

        str q_d5_0, [x_dest5, #16*0]
        str q_d5_1, [x_dest5, #16*1]
        str q_d5_2, [x_dest5, #16*2]
        str q_d5_3, [x_dest5, #16*3]
        add x_dest5, x_dest5, #64

        cmp x_src, x_src_end
        bls .Lloop64

.Lloop64_end:
        /* restore d8 ~ d15 */
        ldp d8, d9, [sp]
        ldp d10, d11, [sp, #16]
        ldp d12, d13, [sp, #32]
        ldp d14, d15, [sp, #48]
        add sp, sp, #64
        add x_src_end, x_src_end, #64

.Lloop16_init:
        sub x_src_end, x_src_end, #16
        cmp x_src, x_src_end
        bhi .lessthan16_init

.Lloop16:
        ldr q_data, [x_src]

        ldr q_d1_0, [x_dest1]
        ldr q_d2_0, [x_dest2]
        ldr q_d3_0, [x_dest3]
        ldr q_gft2_lo, [x_tbl2]
        ldr q_gft2_hi, [x_tbl2, #16]

        and v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr v_data_hi.16b, v_data.16b, #4

        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

        str q_d1_0, [x_dest1]
        str q_d2_0, [x_dest2]
        str q_d3_0, [x_dest3]

        ldr q_d4_0, [x_dest4]
        ldr q_d5_0, [x_dest5]
        ldr q_gft5_lo, [x_tbl5]
        ldr q_gft5_hi, [x_tbl5, #16]

        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

        str q_d4_0, [x_dest4]
        str q_d5_0, [x_dest5]

        add x_src, x_src, #16
        add x_dest1, x_dest1, #16
        add x_dest2, x_dest2, #16
        add x_dest3, x_dest3, #16
        add x_dest4, x_dest4, #16
        add x_dest5, x_dest5, #16
        cmp x_src, x_src_end
        bls .Lloop16

.lessthan16_init:
        sub x_tmp, x_src, x_src_end
        cmp x_tmp, #16
        beq .return_pass

.lessthan16:
        mov x_src, x_src_end
        sub x_dest1, x_dest1, x_tmp
        sub x_dest2, x_dest2, x_tmp
        sub x_dest3, x_dest3, x_tmp
        sub x_dest4, x_dest4, x_tmp
        sub x_dest5, x_dest5, x_tmp
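
        /* Masked tail: reprocess the final 16 bytes of the buffer,
           which overlap the region already handled by .Lloop16 by
           x_tmp bytes. The mask loaded from const_tbl is x_tmp zero
           bytes followed by (16 - x_tmp) 0xff bytes, so the
           already-updated prefix is cleared before the XOR into dest. */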
        ldr x_const, =const_tbl
        sub x_const, x_const, x_tmp
        ldr q_tmp, [x_const, #16]

        ldr q_data, [x_src]
        ldr q_d1_0, [x_dest1]
        ldr q_d2_0, [x_dest2]
        ldr q_d3_0, [x_dest3]
        ldr q_gft2_lo, [x_tbl2]
        ldr q_gft2_hi, [x_tbl2, #16]

        and v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr v_data_hi.16b, v_data.16b, #4

        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

        str q_d1_0, [x_dest1]
        str q_d2_0, [x_dest2]
        str q_d3_0, [x_dest3]

        ldr q_d4_0, [x_dest4]
        ldr q_d5_0, [x_dest5]
        ldr q_gft5_lo, [x_tbl5]
        ldr q_gft5_hi, [x_tbl5, #16]

        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

        str q_d4_0, [x_dest4]
        str q_d5_0, [x_dest5]

.return_pass:
        mov w_ret, #0
        ret

.return_fail:
        mov w_ret, #1
        ret

.section .data
.balign 8
const_tbl:
        .dword 0x0000000000000000, 0x0000000000000000
        .dword 0xffffffffffffffff, 0xffffffffffffffff
609 erasure_code/aarch64/gf_6vect_mad_neon.S Normal file
@@ -0,0 +1,609 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

.text
.global gf_6vect_mad_neon
.type gf_6vect_mad_neon, %function


/* arguments */
x_len .req x0
x_vec .req x1
x_vec_i .req x2
x_tbl .req x3
x_src .req x4
x_dest .req x5

/* returns */
w_ret .req w0

/* local variables */
x_src_end .req x6
x_dest1 .req x7
x_dest2 .req x8
x_dest3 .req x9
x_dest4 .req x10
x_dest5 .req x11
x_dest6 .req x_dest
x_tmp .req x12
x_tbl1 .req x13
x_tbl2 .req x14
x_tbl3 .req x15
x_tbl4 .req x16
x_tbl5 .req x17
x_tbl6 .req x_tbl
x_const .req x18

/* vectors */
v_mask0f .req v0
v_tmp_lo .req v1
v_tmp_hi .req v2
v_tmp .req v3
q_tmp .req q3

v_gft1_lo .req v4
v_gft1_hi .req v5
v_gft2_lo .req v6
v_gft2_hi .req v7
v_gft3_lo .req v16
v_gft3_hi .req v17
q_gft1_lo .req q4
q_gft1_hi .req q5
q_gft2_lo .req q6
q_gft2_hi .req q7
q_gft3_lo .req q16
q_gft3_hi .req q17

v_gft4_lo .req v18
v_gft4_hi .req v19
q_gft4_lo .req q18
q_gft4_hi .req q19
v_gft5_lo .req v_gft2_lo
v_gft5_hi .req v_gft2_hi
q_gft5_lo .req q_gft2_lo
q_gft5_hi .req q_gft2_hi
v_gft6_lo .req v_gft3_lo
v_gft6_hi .req v_gft3_hi
q_gft6_lo .req q_gft3_lo
q_gft6_hi .req q_gft3_hi

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11

v_data_0_lo .req v12
v_data_1_lo .req v13
v_data_2_lo .req v14
v_data_3_lo .req v15
v_data_0_hi .req v_data_0
v_data_1_hi .req v_data_1
v_data_2_hi .req v_data_2
v_data_3_hi .req v_data_3

v_d1_0 .req v20
v_d1_1 .req v21
v_d1_2 .req v22
v_d1_3 .req v23
v_d2_0 .req v24
v_d2_1 .req v25
v_d2_2 .req v26
v_d2_3 .req v27
v_d3_0 .req v28
v_d3_1 .req v29
v_d3_2 .req v30
v_d3_3 .req v31
q_d1_0 .req q20
q_d1_1 .req q21
q_d1_2 .req q22
q_d1_3 .req q23
q_d2_0 .req q24
q_d2_1 .req q25
q_d2_2 .req q26
q_d2_3 .req q27
q_d3_0 .req q28
q_d3_1 .req q29
q_d3_2 .req q30
q_d3_3 .req q31

v_d4_0 .req v_d1_0
v_d4_1 .req v_d1_1
v_d4_2 .req v_d1_2
v_d4_3 .req v_d1_3
q_d4_0 .req q_d1_0
q_d4_1 .req q_d1_1
q_d4_2 .req q_d1_2
q_d4_3 .req q_d1_3
v_d5_0 .req v_d2_0
v_d5_1 .req v_d2_1
v_d5_2 .req v_d2_2
v_d5_3 .req v_d2_3
q_d5_0 .req q_d2_0
q_d5_1 .req q_d2_1
q_d5_2 .req q_d2_2
q_d5_3 .req q_d2_3
v_d6_0 .req v_d3_0
v_d6_1 .req v_d3_1
v_d6_2 .req v_d3_2
v_d6_3 .req v_d3_3
q_d6_0 .req q_d3_0
q_d6_1 .req q_d3_1
q_d6_2 .req q_d3_2
q_d6_3 .req q_d3_3

v_data .req v21
q_data .req q21
v_data_lo .req v22
v_data_hi .req v23
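
/* gf_6vect_mad_neon: same scheme as gf_5vect_mad_neon with a sixth
   destination; the extra pointers spill into x17/x18 and reuse x_tbl
   itself. Note: x18 is reserved as the platform register on some
   AArch64 ABIs, so using it here assumes a Linux-style environment. */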
gf_6vect_mad_neon:
        /* less than 16 bytes, return_fail */
        cmp x_len, #16
        blt .return_fail

        movi v_mask0f.16b, #0x0f
        lsl x_vec_i, x_vec_i, #5
        lsl x_vec, x_vec, #5
        add x_tbl1, x_tbl, x_vec_i
        add x_tbl2, x_tbl1, x_vec
        add x_tbl3, x_tbl2, x_vec
        add x_tbl4, x_tbl3, x_vec
        add x_tbl5, x_tbl4, x_vec
        add x_tbl6, x_tbl5, x_vec
        add x_src_end, x_src, x_len
        ldr x_dest1, [x_dest, #8*0]
        ldr x_dest2, [x_dest, #8*1]
        ldr x_dest3, [x_dest, #8*2]
        ldr x_dest4, [x_dest, #8*3]
        ldr x_dest5, [x_dest, #8*4]
        ldr x_dest6, [x_dest, #8*5]
        ldr q_gft1_lo, [x_tbl1]
        ldr q_gft1_hi, [x_tbl1, #16]
        ldr q_gft4_lo, [x_tbl4]
        ldr q_gft4_hi, [x_tbl4, #16]

.Lloop64_init:
        /* less than 64 bytes, goto Lloop16_init */
        cmp x_len, #64
        blt .Lloop16_init

        /* save d8 ~ d15 to stack */
        sub sp, sp, #64
        stp d8, d9, [sp]
        stp d10, d11, [sp, #16]
        stp d12, d13, [sp, #32]
        stp d14, d15, [sp, #48]

        sub x_src_end, x_src_end, #64

.Lloop64:
        ldr q_data_0, [x_src, #16*0]
        ldr q_data_1, [x_src, #16*1]
        ldr q_data_2, [x_src, #16*2]
        ldr q_data_3, [x_src, #16*3]
        add x_src, x_src, #64

        ldr q_d1_0, [x_dest1, #16*0]
        ldr q_d1_1, [x_dest1, #16*1]
        ldr q_d1_2, [x_dest1, #16*2]
        ldr q_d1_3, [x_dest1, #16*3]

        ldr q_d2_0, [x_dest2, #16*0]
        ldr q_d2_1, [x_dest2, #16*1]
        ldr q_d2_2, [x_dest2, #16*2]
        ldr q_d2_3, [x_dest2, #16*3]

        ldr q_d3_0, [x_dest3, #16*0]
        ldr q_d3_1, [x_dest3, #16*1]
        ldr q_d3_2, [x_dest3, #16*2]
        ldr q_d3_3, [x_dest3, #16*3]

        ldr q_gft2_lo, [x_tbl2]
        ldr q_gft2_hi, [x_tbl2, #16]
        ldr q_gft3_lo, [x_tbl3]
        ldr q_gft3_hi, [x_tbl3, #16]

        and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
        and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
        and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
        and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

        ushr v_data_0_hi.16b, v_data_0.16b, #4
        ushr v_data_1_hi.16b, v_data_1.16b, #4
        ushr v_data_2_hi.16b, v_data_2.16b, #4
        ushr v_data_3_hi.16b, v_data_3.16b, #4

        /* dest1 */
        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b

        /* dest2 */
        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b

        /* dest3 */
        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b

        str q_d1_0, [x_dest1, #16*0]
        str q_d1_1, [x_dest1, #16*1]
        str q_d1_2, [x_dest1, #16*2]
        str q_d1_3, [x_dest1, #16*3]
        add x_dest1, x_dest1, #64

        str q_d2_0, [x_dest2, #16*0]
        str q_d2_1, [x_dest2, #16*1]
        str q_d2_2, [x_dest2, #16*2]
        str q_d2_3, [x_dest2, #16*3]
        add x_dest2, x_dest2, #64

        str q_d3_0, [x_dest3, #16*0]
        str q_d3_1, [x_dest3, #16*1]
        str q_d3_2, [x_dest3, #16*2]
        str q_d3_3, [x_dest3, #16*3]
        add x_dest3, x_dest3, #64
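
        /* dest1..dest3 results are stored, so v20..v31 are free to be
           reloaded as the dest4..dest6 accumulators */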
        ldr q_d4_0, [x_dest4, #16*0]
        ldr q_d4_1, [x_dest4, #16*1]
        ldr q_d4_2, [x_dest4, #16*2]
        ldr q_d4_3, [x_dest4, #16*3]

        ldr q_d5_0, [x_dest5, #16*0]
        ldr q_d5_1, [x_dest5, #16*1]
        ldr q_d5_2, [x_dest5, #16*2]
        ldr q_d5_3, [x_dest5, #16*3]

        ldr q_d6_0, [x_dest6, #16*0]
        ldr q_d6_1, [x_dest6, #16*1]
        ldr q_d6_2, [x_dest6, #16*2]
        ldr q_d6_3, [x_dest6, #16*3]

        ldr q_gft5_lo, [x_tbl5]
        ldr q_gft5_hi, [x_tbl5, #16]
        ldr q_gft6_lo, [x_tbl6]
        ldr q_gft6_hi, [x_tbl6, #16]

        /* dest4 */
        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b

        /* dest5 */
        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b

        /* dest6 */
        tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_0_lo.16b
        tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_0_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_1_lo.16b
        tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_1_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d6_1.16b, v_d6_1.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_2_lo.16b
        tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_2_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d6_2.16b, v_d6_2.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_3_lo.16b
        tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_3_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d6_3.16b, v_d6_3.16b, v_tmp_hi.16b

        str q_d4_0, [x_dest4, #16*0]
        str q_d4_1, [x_dest4, #16*1]
        str q_d4_2, [x_dest4, #16*2]
        str q_d4_3, [x_dest4, #16*3]
        add x_dest4, x_dest4, #64

        str q_d5_0, [x_dest5, #16*0]
        str q_d5_1, [x_dest5, #16*1]
        str q_d5_2, [x_dest5, #16*2]
        str q_d5_3, [x_dest5, #16*3]
        add x_dest5, x_dest5, #64

        str q_d6_0, [x_dest6, #16*0]
        str q_d6_1, [x_dest6, #16*1]
        str q_d6_2, [x_dest6, #16*2]
        str q_d6_3, [x_dest6, #16*3]
        add x_dest6, x_dest6, #64

        cmp x_src, x_src_end
        bls .Lloop64

.Lloop64_end:
        /* restore d8 ~ d15 */
        ldp d8, d9, [sp]
        ldp d10, d11, [sp, #16]
        ldp d12, d13, [sp, #32]
        ldp d14, d15, [sp, #48]
        add sp, sp, #64
        add x_src_end, x_src_end, #64

.Lloop16_init:
        sub x_src_end, x_src_end, #16
        cmp x_src, x_src_end
        bhi .lessthan16_init

.Lloop16:
        ldr q_data, [x_src]

        ldr q_d1_0, [x_dest1]
        ldr q_d2_0, [x_dest2]
        ldr q_d3_0, [x_dest3]
        ldr q_gft2_lo, [x_tbl2]
        ldr q_gft2_hi, [x_tbl2, #16]
        ldr q_gft3_lo, [x_tbl3]
        ldr q_gft3_hi, [x_tbl3, #16]

        and v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr v_data_hi.16b, v_data.16b, #4

        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

        str q_d1_0, [x_dest1]
        str q_d2_0, [x_dest2]
        str q_d3_0, [x_dest3]

        ldr q_d4_0, [x_dest4]
        ldr q_d5_0, [x_dest5]
        ldr q_d6_0, [x_dest6]
        ldr q_gft5_lo, [x_tbl5]
        ldr q_gft5_hi, [x_tbl5, #16]
        ldr q_gft6_lo, [x_tbl6]
        ldr q_gft6_hi, [x_tbl6, #16]

        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b

        str q_d4_0, [x_dest4]
        str q_d5_0, [x_dest5]
        str q_d6_0, [x_dest6]

        add x_src, x_src, #16
        add x_dest1, x_dest1, #16
        add x_dest2, x_dest2, #16
        add x_dest3, x_dest3, #16
        add x_dest4, x_dest4, #16
        add x_dest5, x_dest5, #16
        add x_dest6, x_dest6, #16
        cmp x_src, x_src_end
        bls .Lloop16

.lessthan16_init:
        sub x_tmp, x_src, x_src_end
        cmp x_tmp, #16
        beq .return_pass

.lessthan16:
        mov x_src, x_src_end
        sub x_dest1, x_dest1, x_tmp
        sub x_dest2, x_dest2, x_tmp
        sub x_dest3, x_dest3, x_tmp
        sub x_dest4, x_dest4, x_tmp
        sub x_dest5, x_dest5, x_tmp
        sub x_dest6, x_dest6, x_tmp
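
        /* masked tail handling, same scheme as gf_5vect_mad_neon */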
        ldr x_const, =const_tbl
        sub x_const, x_const, x_tmp
        ldr q_tmp, [x_const, #16]

        ldr q_data, [x_src]
        ldr q_d1_0, [x_dest1]
        ldr q_d2_0, [x_dest2]
        ldr q_d3_0, [x_dest3]
        ldr q_gft2_lo, [x_tbl2]
        ldr q_gft2_hi, [x_tbl2, #16]
        ldr q_gft3_lo, [x_tbl3]
        ldr q_gft3_hi, [x_tbl3, #16]

        and v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr v_data_hi.16b, v_data.16b, #4

        tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

        str q_d1_0, [x_dest1]
        str q_d2_0, [x_dest2]
        str q_d3_0, [x_dest3]

        ldr q_d4_0, [x_dest4]
        ldr q_d5_0, [x_dest5]
        ldr q_d6_0, [x_dest6]
        ldr q_gft5_lo, [x_tbl5]
        ldr q_gft5_hi, [x_tbl5, #16]
        ldr q_gft6_lo, [x_tbl6]
        ldr q_gft6_hi, [x_tbl6, #16]

        tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

        tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b
        tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
        eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
        and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
        eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b

        str q_d4_0, [x_dest4]
        str q_d5_0, [x_dest5]
        str q_d6_0, [x_dest6]

.return_pass:
        mov w_ret, #0
        ret

.return_fail:
        mov w_ret, #1
        ret

.section .data
.balign 8
const_tbl:
        .dword 0x0000000000000000, 0x0000000000000000
        .dword 0xffffffffffffffff, 0xffffffffffffffff
299 erasure_code/aarch64/gf_vect_dot_prod_neon.S Normal file
@@ -0,0 +1,299 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
.text

.global gf_vect_dot_prod_neon
.type gf_vect_dot_prod_neon, %function

/* arguments */
x_len .req x0
x_vec .req x1
x_tbl .req x2
x_src .req x3
x_dest1 .req x4

/* returns */
w_ret .req w0

/* local variables */
x_vec_i .req x5
x_ptr .req x6
x_pos .req x7
x_tmp .req x8
x_tbl1 .req x9

/* vectors */
v_gft1_lo .req v0
v_gft1_hi .req v1
q_gft1_lo .req q0
q_gft1_hi .req q1
v_mask0f .req v2
q_mask0f .req q2

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
v_data_4 .req v12
v_data_5 .req v13
v_data_6 .req v14
v_data_7 .req v15
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11
q_data_4 .req q12
q_data_5 .req q13
q_data_6 .req q14
q_data_7 .req q15

v_data_0_lo .req v16
v_data_1_lo .req v17
v_data_2_lo .req v18
v_data_3_lo .req v19
v_data_4_lo .req v20
v_data_5_lo .req v21
v_data_6_lo .req v22
v_data_7_lo .req v23
v_data_0_hi .req v_data_0
v_data_1_hi .req v_data_1
v_data_2_hi .req v_data_2
v_data_3_hi .req v_data_3
v_data_4_hi .req v_data_4
v_data_5_hi .req v_data_5
v_data_6_hi .req v_data_6
v_data_7_hi .req v_data_7

v_p0 .req v24
v_p1 .req v25
v_p2 .req v26
v_p3 .req v27
v_p4 .req v28
v_p5 .req v29
v_p6 .req v30
v_p7 .req v31
q_p0 .req q24
q_p1 .req q25
q_p2 .req q26
q_p3 .req q27
q_p4 .req q28
q_p5 .req q29
q_p6 .req q30
q_p7 .req q31

v_p .req v_p0
q_p .req q_p0
v_data .req v_p1
q_data .req q_p1
v_data_lo .req v_p2
v_data_hi .req v_p3
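
/* gf_vect_dot_prod_neon(len, vec, gftbl, src, dest):
   for each byte position i, dest[i] is the XOR over all vec sources j
   of gf_mul(coeff[j], src[j][i]). src is an array of vec pointers;
   the lsl #3 below turns the source count into a byte offset into
   that pointer array. */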
gf_vect_dot_prod_neon:
        /* less than 16 bytes, return_fail */
        cmp x_len, #16
        blt .return_fail

        movi v_mask0f.16b, #0x0f
        mov x_pos, #0

        lsl x_vec, x_vec, #3

.Lloop128_init:
        /* less than 128 bytes, goto Lloop16_init */
        cmp x_len, #128
        blt .Lloop16_init

        /* save d8 ~ d15 to stack */
        sub sp, sp, #64
        stp d8, d9, [sp]
        stp d10, d11, [sp, #16]
        stp d12, d13, [sp, #32]
        stp d14, d15, [sp, #48]

        sub x_len, x_len, #128

.Lloop128:
        movi v_p0.16b, #0
        movi v_p1.16b, #0
        movi v_p2.16b, #0
        movi v_p3.16b, #0
        movi v_p4.16b, #0
        movi v_p5.16b, #0
        movi v_p6.16b, #0
        movi v_p7.16b, #0

        mov x_tbl1, x_tbl
        mov x_vec_i, #0

.Lloop128_vects:
        ldr x_ptr, [x_src, x_vec_i]
        add x_vec_i, x_vec_i, #8
        add x_ptr, x_ptr, x_pos

        ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32

        ldp q_data_0, q_data_1, [x_ptr], #32
        ldp q_data_2, q_data_3, [x_ptr], #32
        ldp q_data_4, q_data_5, [x_ptr], #32
        ldp q_data_6, q_data_7, [x_ptr]

        prfm pldl1keep, [x_tbl1]
        prfm pldl1strm, [x_ptr]

        and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
        and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
        and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
        and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
        and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
        and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
        and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
        and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

        ushr v_data_0_hi.16b, v_data_0.16b, #4
        ushr v_data_1_hi.16b, v_data_1.16b, #4
        ushr v_data_2_hi.16b, v_data_2.16b, #4
        ushr v_data_3_hi.16b, v_data_3.16b, #4
        ushr v_data_4_hi.16b, v_data_4.16b, #4
        ushr v_data_5_hi.16b, v_data_5.16b, #4
        ushr v_data_6_hi.16b, v_data_6.16b, #4
        ushr v_data_7_hi.16b, v_data_7.16b, #4

        tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
        tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
        tbl v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
        tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
        tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
        tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
        tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
        tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b

        tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
        tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
        tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
        tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
        tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
        tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
        tbl v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
        tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b

        eor v_data_0_hi.16b, v_data_0_lo.16b, v_data_0_hi.16b
        eor v_data_1_hi.16b, v_data_1_lo.16b, v_data_1_hi.16b
        eor v_data_2_hi.16b, v_data_2_lo.16b, v_data_2_hi.16b
        eor v_data_3_hi.16b, v_data_3_lo.16b, v_data_3_hi.16b
        eor v_data_4_hi.16b, v_data_4_lo.16b, v_data_4_hi.16b
        eor v_data_5_hi.16b, v_data_5_lo.16b, v_data_5_hi.16b
        eor v_data_6_hi.16b, v_data_6_lo.16b, v_data_6_hi.16b
        eor v_data_7_hi.16b, v_data_7_lo.16b, v_data_7_hi.16b

        eor v_p0.16b, v_p0.16b, v_data_0_hi.16b
        eor v_p1.16b, v_p1.16b, v_data_1_hi.16b
        eor v_p2.16b, v_p2.16b, v_data_2_hi.16b
        eor v_p3.16b, v_p3.16b, v_data_3_hi.16b
        eor v_p4.16b, v_p4.16b, v_data_4_hi.16b
        eor v_p5.16b, v_p5.16b, v_data_5_hi.16b
        eor v_p6.16b, v_p6.16b, v_data_6_hi.16b
        eor v_p7.16b, v_p7.16b, v_data_7_hi.16b

        cmp x_vec_i, x_vec
        blt .Lloop128_vects

.Lloop128_vects_end:
        add x_ptr, x_dest1, x_pos
        stp q_p0, q_p1, [x_ptr], #32
        stp q_p2, q_p3, [x_ptr], #32
        stp q_p4, q_p5, [x_ptr], #32
        stp q_p6, q_p7, [x_ptr]

        add x_pos, x_pos, #128
        cmp x_pos, x_len
        ble .Lloop128

.Lloop128_end:
        /* restore d8 ~ d15 */
        ldp d8, d9, [sp]
        ldp d10, d11, [sp, #16]
        ldp d12, d13, [sp, #32]
        ldp d14, d15, [sp, #48]
        add sp, sp, #64

        add x_len, x_len, #128
        cmp x_pos, x_len
        beq .return_pass

.Lloop16_init:
        sub x_len, x_len, #16
        cmp x_pos, x_len
        bgt .lessthan16_init

.Lloop16:
        movi v_p.16b, #0
        mov x_tbl1, x_tbl
        mov x_vec_i, #0

.Lloop16_vects:
        ldr x_ptr, [x_src, x_vec_i]
        ldr q_data, [x_ptr, x_pos]
        add x_vec_i, x_vec_i, #8

        ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32

        and v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr v_data_hi.16b, v_data.16b, #4

        tbl v_data_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl v_data_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor v_data_hi.16b, v_data_lo.16b, v_data_hi.16b
        eor v_p.16b, v_p.16b, v_data_hi.16b

        cmp x_vec_i, x_vec
        blt .Lloop16_vects

.Lloop16_vects_end:
        str q_p, [x_dest1, x_pos]
        add x_pos, x_pos, #16
        cmp x_pos, x_len
        ble .Lloop16

.Lloop16_end:
        sub x_tmp, x_pos, x_len
        cmp x_tmp, #16
        beq .return_pass
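
        /* fewer than 16 bytes remain: rewind x_pos to len - 16 and run
           .Lloop16 once more. Recomputing the overlap is safe because
           dot_prod overwrites dest instead of XOR-accumulating, so no
           byte mask is needed (unlike the *_mad kernels). */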
.lessthan16_init:
        mov x_pos, x_len
        b .Lloop16

.return_pass:
        mov w_ret, #0
        ret

.return_fail:
        mov w_ret, #1
        ret
314 erasure_code/aarch64/gf_vect_mad_neon.S Normal file
@@ -0,0 +1,314 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
.text

.global gf_vect_mad_neon
.type gf_vect_mad_neon, %function


/* arguments */
x_len .req x0
x_vec .req x1
x_vec_i .req x2
x_tbl .req x3
x_src .req x4
x_dest .req x5

/* returns */
w_ret .req w0

/* local variables */
x_src_end .req x6
x_dest1 .req x_dest
x_tmp .req x7
x_const .req x8

/* vectors */
v_mask0f .req v0
v_tmp .req v1
q_tmp .req q1

v_tmp1_lo .req v2
v_tmp1_hi .req v3
v_tmp2_lo .req v4
v_tmp2_hi .req v5

v_gft1_lo .req v6
v_gft1_hi .req v7
q_gft1_lo .req q6
q_gft1_hi .req q7

v_data_0 .req v8
v_data_1 .req v9
v_data_2 .req v10
v_data_3 .req v11
v_data_4 .req v12
v_data_5 .req v13
v_data_6 .req v14
v_data_7 .req v15
q_data_0 .req q8
q_data_1 .req q9
q_data_2 .req q10
q_data_3 .req q11
q_data_4 .req q12
q_data_5 .req q13
q_data_6 .req q14
q_data_7 .req q15

v_data_0_lo .req v16
v_data_1_lo .req v17
v_data_2_lo .req v18
v_data_3_lo .req v19
v_data_4_lo .req v20
v_data_5_lo .req v21
v_data_6_lo .req v22
v_data_7_lo .req v23
v_data_0_hi .req v_data_0
v_data_1_hi .req v_data_1
v_data_2_hi .req v_data_2
v_data_3_hi .req v_data_3
v_data_4_hi .req v_data_4
v_data_5_hi .req v_data_5
v_data_6_hi .req v_data_6
v_data_7_hi .req v_data_7

v_d1_0 .req v24
v_d1_1 .req v25
v_d1_2 .req v26
v_d1_3 .req v27
v_d1_4 .req v28
v_d1_5 .req v29
v_d1_6 .req v30
v_d1_7 .req v31
q_d1_0 .req q24
q_d1_1 .req q25
q_d1_2 .req q26
q_d1_3 .req q27
q_d1_4 .req q28
q_d1_5 .req q29
q_d1_6 .req q30
q_d1_7 .req q31

v_data .req v_d1_1
q_data .req q_d1_1
v_data_lo .req v_d1_2
v_data_hi .req v_d1_3
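
/* gf_vect_mad_neon(len, vec, vec_i, gftbl, src, dest):
   single-destination multiply-and-add, dest[i] ^= gf_mul(c, src[i]),
   where the expanded table for c is the 32-byte slot vec_i of gftbl.
   The main loop handles 128 bytes per iteration and alternates between
   the tmp1/tmp2 register pairs to keep independent lookups in flight. */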
gf_vect_mad_neon:
        /* less than 16 bytes, return_fail */
        cmp x_len, #16
        blt .return_fail

        movi v_mask0f.16b, #0x0f
        lsl x_vec_i, x_vec_i, #5
        add x_tbl, x_tbl, x_vec_i
        add x_src_end, x_src, x_len

        ldr q_gft1_lo, [x_tbl]
        ldr q_gft1_hi, [x_tbl, #16]

.Lloop128_init:
        /* less than 128 bytes, goto Lloop16_init */
        cmp x_len, #128
        blt .Lloop16_init

        /* save d8 ~ d15 to stack */
        sub sp, sp, #64
        stp d8, d9, [sp]
        stp d10, d11, [sp, #16]
        stp d12, d13, [sp, #32]
        stp d14, d15, [sp, #48]

        sub x_src_end, x_src_end, #128

.Lloop128:
        ldr q_data_0, [x_src, #16*0]
        ldr q_data_1, [x_src, #16*1]
        ldr q_data_2, [x_src, #16*2]
        ldr q_data_3, [x_src, #16*3]
        ldr q_data_4, [x_src, #16*4]
        ldr q_data_5, [x_src, #16*5]
        ldr q_data_6, [x_src, #16*6]
        ldr q_data_7, [x_src, #16*7]

        ldr q_d1_0, [x_dest1, #16*0]
        ldr q_d1_1, [x_dest1, #16*1]
        ldr q_d1_2, [x_dest1, #16*2]
        ldr q_d1_3, [x_dest1, #16*3]
        ldr q_d1_4, [x_dest1, #16*4]
        ldr q_d1_5, [x_dest1, #16*5]
        ldr q_d1_6, [x_dest1, #16*6]
        ldr q_d1_7, [x_dest1, #16*7]

        and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
        and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
        and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
        and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
        and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
        and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
        and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
        and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

        ushr v_data_0_hi.16b, v_data_0.16b, #4
        ushr v_data_1_hi.16b, v_data_1.16b, #4
        ushr v_data_2_hi.16b, v_data_2.16b, #4
        ushr v_data_3_hi.16b, v_data_3.16b, #4
        ushr v_data_4_hi.16b, v_data_4.16b, #4
        ushr v_data_5_hi.16b, v_data_5.16b, #4
        ushr v_data_6_hi.16b, v_data_6.16b, #4
        ushr v_data_7_hi.16b, v_data_7.16b, #4

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
        tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
        tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b

        eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
        eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
        eor v_tmp2_hi.16b, v_tmp2_lo.16b, v_tmp2_hi.16b
        eor v_d1_1.16b, v_d1_1.16b, v_tmp2_hi.16b

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
        tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
        tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b

        eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
        eor v_d1_2.16b, v_d1_2.16b, v_tmp1_hi.16b
        eor v_tmp2_hi.16b, v_tmp2_lo.16b, v_tmp2_hi.16b
        eor v_d1_3.16b, v_d1_3.16b, v_tmp2_hi.16b

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
        tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
        tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b

        eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
        eor v_d1_4.16b, v_d1_4.16b, v_tmp1_hi.16b
        eor v_tmp2_hi.16b, v_tmp2_lo.16b, v_tmp2_hi.16b
        eor v_d1_5.16b, v_d1_5.16b, v_tmp2_hi.16b

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
        tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
        tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b

        eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
        eor v_d1_6.16b, v_d1_6.16b, v_tmp1_hi.16b
        eor v_tmp2_hi.16b, v_tmp2_lo.16b, v_tmp2_hi.16b
        eor v_d1_7.16b, v_d1_7.16b, v_tmp2_hi.16b

        str q_d1_0, [x_dest1, #16*0]
        str q_d1_1, [x_dest1, #16*1]
        str q_d1_2, [x_dest1, #16*2]
        str q_d1_3, [x_dest1, #16*3]
        str q_d1_4, [x_dest1, #16*4]
        str q_d1_5, [x_dest1, #16*5]
        str q_d1_6, [x_dest1, #16*6]
        str q_d1_7, [x_dest1, #16*7]

        add x_src, x_src, #128
        add x_dest1, x_dest1, #128
        cmp x_src, x_src_end
        bls .Lloop128

.Lloop128_end:
        /* restore d8 ~ d15 */
        ldp d8, d9, [sp]
        ldp d10, d11, [sp, #16]
        ldp d12, d13, [sp, #32]
        ldp d14, d15, [sp, #48]
        add sp, sp, #64
        add x_src_end, x_src_end, #128

.Lloop16_init:
        sub x_src_end, x_src_end, #16
        cmp x_src, x_src_end
        bhi .lessthan16_init

.Lloop16:
        ldr q_data, [x_src]
        ldr q_d1_0, [x_dest1]

        and v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr v_data_hi.16b, v_data.16b, #4

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
        eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b

        str q_d1_0, [x_dest1]

        add x_dest1, x_dest1, #16
        add x_src, x_src, #16
        cmp x_src, x_src_end
        bls .Lloop16

.lessthan16_init:
        sub x_tmp, x_src, x_src_end
        cmp x_tmp, #16
        beq .return_pass

.lessthan16:
        mov x_src, x_src_end
        sub x_dest1, x_dest1, x_tmp
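
        /* masked tail: const_tbl yields x_tmp zero bytes then 0xff
           bytes, so the overlapping prefix of the last 16-byte block
           is not XORed into dest twice */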
        ldr x_const, =const_tbl
        sub x_const, x_const, x_tmp
        ldr q_tmp, [x_const, #16]

        ldr q_data, [x_src]
        ldr q_d1_0, [x_dest1]

        and v_data_lo.16b, v_data.16b, v_mask0f.16b
        ushr v_data_hi.16b, v_data.16b, #4

        tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
        tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
        eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
        and v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp.16b
        eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b

        str q_d1_0, [x_dest1]

.return_pass:
        mov w_ret, #0
        ret

.return_fail:
        mov w_ret, #1
        ret

.section .data
.balign 8
const_tbl:
        .dword 0x0000000000000000, 0x0000000000000000
        .dword 0xffffffffffffffff, 0xffffffffffffffff
235 erasure_code/aarch64/gf_vect_mul_neon.S Normal file
@@ -0,0 +1,235 @@
/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
.text

.global gf_vect_mul_neon
.type gf_vect_mul_neon, %function

/* arguments */
x_len  .req x0
x_tbl  .req x1
x_src  .req x2
x_dest .req x3

/* returns */
w_ret .req w0

/* local variables */
x_dest1   .req x_dest
x_src_end .req x4
x_tmp     .req x5

/* vectors */
v_mask0f .req v0

v_gft1_lo .req v2
v_gft1_hi .req v3
q_gft1_lo .req q2
q_gft1_hi .req q3

v_data_0 .req v16
v_data_1 .req v17
v_data_2 .req v18
v_data_3 .req v19
v_data_4 .req v20
v_data_5 .req v21
v_data_6 .req v22
v_data_7 .req v23
q_data_0 .req q16
q_data_1 .req q17
q_data_2 .req q18
q_data_3 .req q19
q_data_4 .req q20
q_data_5 .req q21
q_data_6 .req q22
q_data_7 .req q23

v_data_0_lo .req v24
v_data_1_lo .req v25
v_data_2_lo .req v26
v_data_3_lo .req v27
v_data_4_lo .req v28
v_data_5_lo .req v29
v_data_6_lo .req v30
v_data_7_lo .req v31
v_data_0_hi .req v_data_0
v_data_1_hi .req v_data_1
v_data_2_hi .req v_data_2
v_data_3_hi .req v_data_3
v_data_4_hi .req v_data_4
v_data_5_hi .req v_data_5
v_data_6_hi .req v_data_6
v_data_7_hi .req v_data_7
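/* Note: each v_data_N_hi aliases v_data_N itself. Once ushr has shifted
 * the high nibbles down, the original data register can be overwritten,
 * which keeps all eight 128-bit lanes within v16-v31. */
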
gf_vect_mul_neon:
    /* less than 32 bytes, return_fail */
    cmp x_len, #32
    blt .return_fail

    movi v_mask0f.16b, #0x0f
    add x_src_end, x_src, x_len
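    /* x_tbl points at the 32-byte expanded coefficient table built by
     * gf_vect_mul_init: 16 low-nibble products followed by 16
     * high-nibble products. */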
    ldr q_gft1_lo, [x_tbl]
    ldr q_gft1_hi, [x_tbl, #16]

.Lloop128_init:
    /* less than 128 bytes, goto .Lloop32_init */
    cmp x_len, #128
    blt .Lloop32_init

    /* save d8 ~ d15 to stack */
    sub sp, sp, #64
    stp d8, d9, [sp]
    stp d10, d11, [sp, #16]
    stp d12, d13, [sp, #32]
    stp d14, d15, [sp, #48]

    sub x_src_end, x_src_end, #128
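    /* Main loop: consume 128 bytes (eight 16-byte vectors) per pass. */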
.Lloop128:
    ldr q_data_0, [x_src, #16*0]
    ldr q_data_1, [x_src, #16*1]
    ldr q_data_2, [x_src, #16*2]
    ldr q_data_3, [x_src, #16*3]
    ldr q_data_4, [x_src, #16*4]
    ldr q_data_5, [x_src, #16*5]
    ldr q_data_6, [x_src, #16*6]
    ldr q_data_7, [x_src, #16*7]

    and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
    and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
    and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
    and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
    and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
    and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
    and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
    and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

    ushr v_data_0_hi.16b, v_data_0.16b, #4
    ushr v_data_1_hi.16b, v_data_1.16b, #4
    ushr v_data_2_hi.16b, v_data_2.16b, #4
    ushr v_data_3_hi.16b, v_data_3.16b, #4
    ushr v_data_4_hi.16b, v_data_4.16b, #4
    ushr v_data_5_hi.16b, v_data_5.16b, #4
    ushr v_data_6_hi.16b, v_data_6.16b, #4
    ushr v_data_7_hi.16b, v_data_7.16b, #4

    tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
    tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
    tbl v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
    tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
    tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
    tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
    tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
    tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b

    tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
    tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
    tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
    tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
    tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
    tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
    tbl v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
    tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b

    eor v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
    eor v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
    eor v_data_2.16b, v_data_2_hi.16b, v_data_2_lo.16b
    eor v_data_3.16b, v_data_3_hi.16b, v_data_3_lo.16b
    eor v_data_4.16b, v_data_4_hi.16b, v_data_4_lo.16b
    eor v_data_5.16b, v_data_5_hi.16b, v_data_5_lo.16b
    eor v_data_6.16b, v_data_6_hi.16b, v_data_6_lo.16b
    eor v_data_7.16b, v_data_7_hi.16b, v_data_7_lo.16b

    str q_data_0, [x_dest1, #16*0]
    str q_data_1, [x_dest1, #16*1]
    str q_data_2, [x_dest1, #16*2]
    str q_data_3, [x_dest1, #16*3]
    str q_data_4, [x_dest1, #16*4]
    str q_data_5, [x_dest1, #16*5]
    str q_data_6, [x_dest1, #16*6]
    str q_data_7, [x_dest1, #16*7]

    add x_src, x_src, #128
    add x_dest1, x_dest1, #128
    cmp x_src, x_src_end
    bls .Lloop128

.Lloop128_end:
    /* restore d8 ~ d15 */
    ldp d8, d9, [sp]
    ldp d10, d11, [sp, #16]
    ldp d12, d13, [sp, #32]
    ldp d14, d15, [sp, #48]
    add sp, sp, #64
    add x_src_end, x_src_end, #128
.Lloop32_init:
    sub x_src_end, x_src_end, #32
    cmp x_src, x_src_end
    bhi .return_fail
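    /* Residual loop: 32 bytes per pass. gf_vect_mul requires len to be
     * a multiple of 32, so anything else ends in .return_fail. */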
.Lloop32:
    ldr q_data_0, [x_src, #16*0]
    ldr q_data_1, [x_src, #16*1]

    and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
    and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
    ushr v_data_0_hi.16b, v_data_0.16b, #4
    ushr v_data_1_hi.16b, v_data_1.16b, #4
    tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
    tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
    tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
    tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
    eor v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
    eor v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
    str q_data_0, [x_dest1, #16*0]
    str q_data_1, [x_dest1, #16*1]

    add x_dest1, x_dest1, #32
    add x_src, x_src, #32
    cmp x_src, x_src_end
    bls .Lloop32

.Lloop32_end:
    sub x_tmp, x_src, x_src_end
    cmp x_tmp, #32
    beq .return_pass
    b .return_fail

.return_pass:
    mov w_ret, #0
    ret

.return_fail:
    mov w_ret, #1
    ret
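
As a cross-check on the control flow above, here is a minimal scalar C model of gf_vect_mul_neon (the helper names are assumptions for illustration, not ISA-L API): it reproduces the nibble-split table lookup and the multiple-of-32 length requirement enforced by the .return_fail paths.

/* Minimal scalar model of gf_vect_mul_neon (illustrative only).
 * gftbl is the 32-byte table that gf_vect_mul_init expands from one
 * coefficient: the same data q_gft1_lo/q_gft1_hi load above. */
static int gf_vect_mul_model(int len, const unsigned char *gftbl,
                             const unsigned char *src, unsigned char *dest)
{
    const unsigned char *lo = gftbl;      /* products of c and 0..15 */
    const unsigned char *hi = gftbl + 16; /* products of c and 0x00,0x10..0xf0 */

    if (len < 32 || len % 32 != 0)
        return 1; /* .return_fail */

    for (int i = 0; i < len; i++)
        dest[i] = lo[src[i] & 0x0f] ^ hi[src[i] >> 4];

    return 0; /* .return_pass */
}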