diff --git a/erasure_code/aarch64/Makefile.am b/erasure_code/aarch64/Makefile.am
index 94bb5a1..47bbf12 100644
--- a/erasure_code/aarch64/Makefile.am
+++ b/erasure_code/aarch64/Makefile.am
@@ -42,4 +42,19 @@ lsrc_aarch64 += \
 	erasure_code/aarch64/gf_5vect_mad_neon.S \
 	erasure_code/aarch64/gf_6vect_mad_neon.S \
 	erasure_code/aarch64/gf_vect_mul_neon.S \
+	erasure_code/aarch64/gf_vect_mad_sve.S \
+	erasure_code/aarch64/gf_2vect_mad_sve.S \
+	erasure_code/aarch64/gf_3vect_mad_sve.S \
+	erasure_code/aarch64/gf_4vect_mad_sve.S \
+	erasure_code/aarch64/gf_5vect_mad_sve.S \
+	erasure_code/aarch64/gf_6vect_mad_sve.S \
+	erasure_code/aarch64/gf_vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_2vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_3vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_4vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_5vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_6vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_7vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_8vect_dot_prod_sve.S \
+	erasure_code/aarch64/gf_vect_mul_sve.S \
 	erasure_code/aarch64/ec_multibinary_arm.S
diff --git a/erasure_code/aarch64/ec_aarch64_dispatcher.c b/erasure_code/aarch64/ec_aarch64_dispatcher.c
index ba66347..42bd780 100644
--- a/erasure_code/aarch64/ec_aarch64_dispatcher.c
+++ b/erasure_code/aarch64/ec_aarch64_dispatcher.c
@@ -30,7 +30,11 @@
 DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod)
 {
-	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+	unsigned long auxval = getauxval(AT_HWCAP);
+
+	if (auxval & HWCAP_SVE)
+		return PROVIDER_INFO(gf_vect_dot_prod_sve);
+	if (auxval & HWCAP_ASIMD)
 		return PROVIDER_INFO(gf_vect_dot_prod_neon);
 	return PROVIDER_BASIC(gf_vect_dot_prod);
@@ -38,7 +42,11 @@ DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod)
 DEFINE_INTERFACE_DISPATCHER(gf_vect_mad)
 {
-	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+	unsigned long auxval = getauxval(AT_HWCAP);
+
+	if (auxval & HWCAP_SVE)
+		return PROVIDER_INFO(gf_vect_mad_sve);
+	if (auxval & HWCAP_ASIMD)
 		return PROVIDER_INFO(gf_vect_mad_neon);
 	return PROVIDER_BASIC(gf_vect_mad);
@@ -46,7 +54,11 @@ DEFINE_INTERFACE_DISPATCHER(gf_vect_mad)
 DEFINE_INTERFACE_DISPATCHER(ec_encode_data)
 {
-	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+	unsigned long auxval = getauxval(AT_HWCAP);
+
+	if (auxval & HWCAP_SVE)
+		return PROVIDER_INFO(ec_encode_data_sve);
+	if (auxval & HWCAP_ASIMD)
 		return PROVIDER_INFO(ec_encode_data_neon);
 	return PROVIDER_BASIC(ec_encode_data);
@@ -54,7 +66,11 @@ DEFINE_INTERFACE_DISPATCHER(ec_encode_data)
 DEFINE_INTERFACE_DISPATCHER(ec_encode_data_update)
 {
-	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+	unsigned long auxval = getauxval(AT_HWCAP);
+
+	if (auxval & HWCAP_SVE)
+		return PROVIDER_INFO(ec_encode_data_update_sve);
+	if (auxval & HWCAP_ASIMD)
 		return PROVIDER_INFO(ec_encode_data_update_neon);
 	return PROVIDER_BASIC(ec_encode_data_update);
@@ -62,7 +78,11 @@ DEFINE_INTERFACE_DISPATCHER(ec_encode_data_update)
 DEFINE_INTERFACE_DISPATCHER(gf_vect_mul)
 {
-	if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+	unsigned long auxval = getauxval(AT_HWCAP);
+
+	if (auxval & HWCAP_SVE)
+		return PROVIDER_INFO(gf_vect_mul_sve);
+	if (auxval & HWCAP_ASIMD)
 		return PROVIDER_INFO(gf_vect_mul_neon);
 	return PROVIDER_BASIC(gf_vect_mul);
diff --git a/erasure_code/aarch64/ec_aarch64_highlevel_func.c b/erasure_code/aarch64/ec_aarch64_highlevel_func.c
index dd23702..e001fd7 100644
--- a/erasure_code/aarch64/ec_aarch64_highlevel_func.c
+++ b/erasure_code/aarch64/ec_aarch64_highlevel_func.c
@@ -125,3 +125,140 @@ void ec_encode_data_update_neon(int len, int k, int rows, int vec_i, unsigned ch
 			break;
 		}
 	}
+
+/* SVE */
+extern void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				 unsigned char **src, unsigned char *dest);
+extern void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+				  unsigned char **src, unsigned char **dest);
+extern void gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			    unsigned char *src, unsigned char *dest);
+extern void gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			     unsigned char *src, unsigned char **dest);
+extern void gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			     unsigned char *src, unsigned char **dest);
+extern void gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			     unsigned char *src, unsigned char **dest);
+extern void gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			     unsigned char *src, unsigned char **dest);
+extern void gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+			     unsigned char *src, unsigned char **dest);
+
+void ec_encode_data_sve(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
+			unsigned char **coding)
+{
+	if (len < 16) {
+		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
+		return;
+	}
+
+	while (rows > 11) {
+		gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+		rows -= 6;
+	}
+
+	switch (rows) {
+	case 11:
+		/* 7 + 4 */
+		gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 7 * k * 32;
+		coding += 7;
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 10:
+		/* 6 + 4 */
+		gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 6 * k * 32;
+		coding += 6;
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 9:
+		/* 5 + 4 */
+		gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 5 * k * 32;
+		coding += 5;
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 8:
+		/* 4 + 4 */
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		g_tbls += 4 * k * 32;
+		coding += 4;
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 7:
+		gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 6:
+		gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 5:
+		gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 4:
+		gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 3:
+		gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 2:
+		gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding);
+		break;
+	case 1:
+		gf_vect_dot_prod_sve(len, k, g_tbls, data, *coding);
+		break;
+	default:
+		break;
+	}
+}
+
+void ec_encode_data_update_sve(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding)
+{ + if (len < 16) { + ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding); + return; + } + while (rows > 6) { + gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + switch (rows) { + case 6: + gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 5: + gf_5vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 4: + gf_4vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 3: + gf_3vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 2: + gf_2vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 1: + gf_vect_mad_sve(len, k, vec_i, g_tbls, data, *coding); + break; + default: + break; + } +} diff --git a/erasure_code/aarch64/gf_2vect_dot_prod_sve.S b/erasure_code/aarch64/gf_2vect_dot_prod_sve.S new file mode 100644 index 0000000..abe5083 --- /dev/null +++ b/erasure_code/aarch64/gf_2vect_dot_prod_sve.S @@ -0,0 +1,164 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_2vect_dot_prod_sve +.type gf_2vect_dot_prod_sve, %function +/* void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_dest1 .req x10 +x_dest2 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_dest2 .req z27 + +gf_2vect_dot_prod_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_2vect_mad_sve.S b/erasure_code/aarch64/gf_2vect_mad_sve.S new file mode 100644 index 0000000..5e83210 --- /dev/null +++ b/erasure_code/aarch64/gf_2vect_mad_sve.S @@ -0,0 +1,148 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_2vect_mad_sve +.type gf_2vect_mad_sve, %function + +/* gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_dest2 .req z27 + +gf_2vect_mad_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load table 1 with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* prefetch dest data */ + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_3vect_dot_prod_sve.S b/erasure_code/aarch64/gf_3vect_dot_prod_sve.S new file mode 100644 index 0000000..b326c72 --- /dev/null +++ b/erasure_code/aarch64/gf_3vect_dot_prod_sve.S @@ -0,0 +1,185 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_3vect_dot_prod_sve +.type gf_3vect_dot_prod_sve, %function +/* void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_dest1 .req x11 +x_dest2 .req x12 +x_dest3 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_dest2 .req z27 +z_dest3 .req z28 + +gf_3vect_dot_prod_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldr x_dest3, [x_dest, #8*2] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + prfb pldl2keep, p0, [x_tbl3] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_3vect_mad_sve.S b/erasure_code/aarch64/gf_3vect_mad_sve.S new file mode 100644 index 0000000..52c2ffc --- /dev/null +++ b/erasure_code/aarch64/gf_3vect_mad_sve.S @@ -0,0 +1,171 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_3vect_mad_sve +.type gf_3vect_mad_sve, %function + +/* gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_dest2 .req z27 +z_dest3 .req z28 + +gf_3vect_mad_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load table 1 with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* dest data prefetch */ + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + prfb pldl2strm, p0, [x_dest3, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* store dest data, governed by p0 */ + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_4vect_dot_prod_sve.S b/erasure_code/aarch64/gf_4vect_dot_prod_sve.S new file mode 100644 index 0000000..ae7cdcb --- /dev/null +++ b/erasure_code/aarch64/gf_4vect_dot_prod_sve.S @@ -0,0 +1,204 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_4vect_dot_prod_sve +.type gf_4vect_dot_prod_sve, %function +/* void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_dest1 .req x12 +x_dest2 .req x13 +x_dest3 .req x14 +x_dest4 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 + +gf_4vect_dot_prod_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_dest4.d, z_gft4_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_4vect_mad_sve.S b/erasure_code/aarch64/gf_4vect_mad_sve.S new file mode 100644 index 0000000..8bf682c --- /dev/null +++ b/erasure_code/aarch64/gf_4vect_mad_sve.S @@ -0,0 +1,190 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_4vect_mad_sve +.type gf_4vect_mad_sve, %function + +/* gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest4 .req x9 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 + +gf_4vect_mad_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load table 1 with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + /* load table 4 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft4_lo, q_gft4_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + prfb pldl2strm, p0, [x_dest3, x_pos] + prfb pldl2strm, p0, [x_dest4, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + ld1b z_dest4.b, p0/z, [x_dest4, x_pos] + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + + /* dest4 */ + tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_tmp_lo.d, z_dest4.d + eor z_dest4.d, z_tmp_hi.d, z_dest4.d + + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_5vect_dot_prod_sve.S b/erasure_code/aarch64/gf_5vect_dot_prod_sve.S new file mode 100644 index 0000000..ae999ff --- /dev/null +++ b/erasure_code/aarch64/gf_5vect_dot_prod_sve.S @@ -0,0 +1,233 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_5vect_dot_prod_sve +.type gf_5vect_dot_prod_sve, %function +/* void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_dest1 .req x13 +x_dest2 .req x14 +x_dest4 .req x15 +x_dest5 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest3 .req x19 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 + +gf_5vect_dot_prod_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #16 /* alignment */ + str x19, [sp] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldr x_dest5, [x_dest, #8*4] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + prfb pldl2keep, p0, [x_tbl5] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_dest4.d, z_gft4_hi.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_dest5.d, z_gft5_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x19, [sp] + add sp, sp, #16 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_5vect_mad_sve.S b/erasure_code/aarch64/gf_5vect_mad_sve.S new file mode 100644 index 0000000..82e88d9 --- /dev/null +++ b/erasure_code/aarch64/gf_5vect_mad_sve.S @@ -0,0 +1,214 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_5vect_mad_sve +.type gf_5vect_mad_sve, %function + +/* gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest4 .req x9 +x_dest5 .req x10 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 + +gf_5vect_mad_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + /* load table 4 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft4_lo, q_gft4_hi, [x_tbl] + /* load table 5 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft5_lo, q_gft5_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */ + ldr x_dest5, [x_dest, #8*4] /* pointer to dest5 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + prfb pldl2strm, p0, [x_dest3, x_pos] + prfb pldl2strm, p0, [x_dest4, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + ld1b z_dest4.b, p0/z, [x_dest4, x_pos] + prfb pldl2strm, p0, [x_dest5, x_pos] + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + ld1b z_dest5.b, p0/z, [x_dest5, x_pos] + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* dest4 */ + tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_tmp_lo.d, z_dest4.d + eor z_dest4.d, z_tmp_hi.d, z_dest4.d + + /* store dest data, governed by p0 */ + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + + /* dest5 */ + tbl z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_tmp_lo.d, z_dest5.d + eor z_dest5.d, z_tmp_hi.d, z_dest5.d + + /* store dest data, governed by p0 */ + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_6vect_dot_prod_sve.S b/erasure_code/aarch64/gf_6vect_dot_prod_sve.S new file mode 100644 index 0000000..1196bc1 --- /dev/null +++ b/erasure_code/aarch64/gf_6vect_dot_prod_sve.S @@ -0,0 +1,254 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_6vect_dot_prod_sve +.type gf_6vect_dot_prod_sve, %function +/* void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_tbl6 .req x13 +x_dest1 .req x14 +x_dest2 .req x15 +x_dest6 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest3 .req x19 +x_dest4 .req x20 +x_dest5 .req x21 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +gf_6vect_dot_prod_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #32 /* alignment */ + stp x19, x20, [sp] + str x21, [sp, #16] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldp x_dest5, x_dest6, [x_dest, #8*4] /* x_dest6 reuses x_dest */ + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + mov z_dest6.b, #0 /* clear z_dest6 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ + +/* Loop 2: x_vec, number of source vectors (ie. 
data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next and prefetch */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 + prfb pldl2keep, p0, [x_tbl5] + prfb pldl2keep, p0, [x_tbl6] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_dest4.d, z_gft4_hi.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_dest5.d, z_gft5_hi.d + + /* dest 6 */ + tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_gft6_lo.d, z_dest6.d + eor z_dest6.d, z_dest6.d, z_gft6_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x21, [sp, #16] + ldp x19, x20, [sp] + add sp, sp, #32 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_6vect_mad_sve.S b/erasure_code/aarch64/gf_6vect_mad_sve.S new file mode 100644 index 0000000..670e664 --- /dev/null +++ b/erasure_code/aarch64/gf_6vect_mad_sve.S @@ -0,0 +1,233 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_6vect_mad_sve +.type gf_6vect_mad_sve, %function + +/* gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest4 .req x9 +x_dest5 .req x10 +x_dest6 .req x11 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +gf_6vect_mad_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + /* load table 4 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft4_lo, q_gft4_hi, [x_tbl] + /* load table 5 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft5_lo, q_gft5_hi, [x_tbl] + /* load table 6 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft6_lo, q_gft6_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* 
pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */ + ldr x_dest5, [x_dest, #8*4] /* pointer to dest5 */ + ldr x_dest6, [x_dest, #8*5] /* pointer to dest6 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + prfb pldl2strm, p0, [x_dest3, x_pos] + prfb pldl2strm, p0, [x_dest4, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + ld1b z_dest4.b, p0/z, [x_dest4, x_pos] + + prfb pldl2strm, p0, [x_dest5, x_pos] + prfb pldl2strm, p0, [x_dest6, x_pos] + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + ld1b z_dest5.b, p0/z, [x_dest5, x_pos] + ld1b z_dest6.b, p0/z, [x_dest6, x_pos] + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + + /* dest4 */ + tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_tmp_lo.d, z_dest4.d + eor z_dest4.d, z_tmp_hi.d, z_dest4.d + + /* dest5 */ + tbl z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_tmp_lo.d, z_dest5.d + eor z_dest5.d, z_tmp_hi.d, z_dest5.d + + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + + /* dest6 */ + tbl z_tmp_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_tmp_lo.d, z_dest6.d + eor z_dest6.d, z_tmp_hi.d, z_dest6.d + + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_7vect_dot_prod_sve.S b/erasure_code/aarch64/gf_7vect_dot_prod_sve.S new file mode 100644 index 0000000..cccaec5 --- /dev/null +++ b/erasure_code/aarch64/gf_7vect_dot_prod_sve.S @@ -0,0 +1,277 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_7vect_dot_prod_sve +.type gf_7vect_dot_prod_sve, %function +/* void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_tbl6 .req x13 +x_tbl7 .req x14 + +x_dest1 .req x15 + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest2 .req x19 +x_dest3 .req x20 +x_dest4 .req x21 +x_dest5 .req x22 +x_dest6 .req x23 +x_dest7 .req x_dest /* reused */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +z_gft7_lo .req z6 +z_gft7_hi .req z7 +q_gft7_lo .req q6 +q_gft7_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_dest7 .req z16 + +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +gf_7vect_dot_prod_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #48 /* alignment */ + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + str x23, [sp, #32] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldp x_dest5, x_dest6, [x_dest, #8*4] + ldr x_dest7, [x_dest, #8*6] /* x_dest7 reuses x_dest */ + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. 
*/ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + mov z_dest6.b, #0 /* clear z_dest6 */ + mov z_dest7.b, #0 /* clear z_dest7 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ + add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next and prefetch */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_gft1_hi.d, z_dest1.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_gft2_hi.d, z_dest2.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_gft3_hi.d, z_dest3.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 + prfb pldl2keep, p0, [x_tbl5] + prfb pldl2keep, p0, [x_tbl6] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_gft4_hi.d, z_dest4.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_gft5_hi.d, z_dest5.d + + ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32 + prfb pldl2keep, p0, [x_tbl7] + + /* dest 6 */ + tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_gft6_lo.d, z_dest6.d + eor z_dest6.d, z_gft6_hi.d, z_dest6.d + + /* dest 7 */ + tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b + tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b + eor z_dest7.d, z_gft7_lo.d, z_dest7.d + eor z_dest7.d, z_gft7_hi.d, z_dest7.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + 
st1b z_dest6.b, p0, [x_dest6, x_pos] + st1b z_dest7.b, p0, [x_dest7, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x23, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp] + add sp, sp, #48 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_8vect_dot_prod_sve.S b/erasure_code/aarch64/gf_8vect_dot_prod_sve.S new file mode 100644 index 0000000..ee839a4 --- /dev/null +++ b/erasure_code/aarch64/gf_8vect_dot_prod_sve.S @@ -0,0 +1,303 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_8vect_dot_prod_sve +.type gf_8vect_dot_prod_sve, %function +/* void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_tbl6 .req x13 +x_tbl7 .req x14 + +x_dest1 .req x15 + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest2 .req x19 +x_dest3 .req x20 +x_dest4 .req x21 +x_dest5 .req x22 +x_dest6 .req x23 +x_dest7 .req x24 +x_dest8 .req x_dest /* reused */ +x_tbl8 .req x25 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +z_gft7_lo .req z6 +z_gft7_hi .req z7 +q_gft7_lo .req q6 +q_gft7_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_dest7 .req z8 + +z_gft8_lo .req z9 +z_gft8_hi .req z10 +q_gft8_lo .req q9 +q_gft8_hi .req q10 + +z_dest8 .req z16 + +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +gf_8vect_dot_prod_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #80 /* alignment */ + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + str d10, [sp, #64] + str x25, [sp, #72] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldp x_dest5, x_dest6, [x_dest, #8*4] + ldp x_dest7, x_dest8, [x_dest, #8*6] /* x_dest8 reuses x_dest */ + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + mov z_dest6.b, #0 /* clear z_dest6 */ + mov z_dest7.b, #0 /* clear z_dest7 */ + mov z_dest8.b, #0 /* clear z_dest8 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ + add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */ + add x_tbl8, x_tbl7, x_vec, LSL #2 /* reset x_tbl8 */ + +/* Loop 2: x_vec, number of source vectors (ie.
data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next and prefetch */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_gft1_hi.d, z_dest1.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_gft2_hi.d, z_dest2.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_gft3_hi.d, z_dest3.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 + prfb pldl2keep, p0, [x_tbl5] + prfb pldl2keep, p0, [x_tbl6] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_gft4_hi.d, z_dest4.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_gft5_hi.d, z_dest5.d + + ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32 + ldp q_gft8_lo, q_gft8_hi, [x_tbl8], #32 + prfb pldl2keep, p0, [x_tbl7] + prfb pldl2keep, p0, [x_tbl8] + + /* dest 6 */ + tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_gft6_lo.d, z_dest6.d + eor z_dest6.d, z_gft6_hi.d, z_dest6.d + + /* dest 7 */ + tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b + tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b + eor z_dest7.d, z_gft7_lo.d, z_dest7.d + eor z_dest7.d, z_gft7_hi.d, z_dest7.d + + /* dest 8 */ + tbl z_gft8_lo.b, {z_gft8_lo.b}, z_src_lo.b + tbl z_gft8_hi.b, {z_gft8_hi.b}, z_src_hi.b + eor z_dest8.d, z_gft8_lo.d, z_dest8.d + eor z_dest8.d, z_gft8_hi.d, z_dest8.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + st1b z_dest7.b, p0, [x_dest7, x_pos] + st1b z_dest8.b, p0, [x_dest8, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x25, [sp, #72] + ldr d10, [sp, #64] + ldp d8, d9, [sp, #48] + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp] + add sp, sp, #80 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git 
a/erasure_code/aarch64/gf_vect_dot_prod_sve.S b/erasure_code/aarch64/gf_vect_dot_prod_sve.S new file mode 100644 index 0000000..7cf3d0d --- /dev/null +++ b/erasure_code/aarch64/gf_vect_dot_prod_sve.S @@ -0,0 +1,128 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_vect_dot_prod_sve +.type gf_vect_dot_prod_sve, %function +/* void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest1 .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tbl1 .req x8 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +gf_vect_dot_prod_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov z_dest.b, #0 /* clear z_dest */ + mov x_vec_i, #0 /* clear x_vec_i */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + + /* load gf_table */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is added by #32 + for each src vect */ + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* table indexing, ie. 
gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + + /* exclusive or, ie. gf(2^8) add */ + eor z_dest.d, z_gft1_lo.d, z_dest.d + eor z_dest.d, z_gft1_hi.d, z_dest.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects + + /* end of Loop 2 */ + /* store dest data, governed by p0 */ + st1b z_dest.b, p0, [x_dest1, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_vect_mad_sve.S b/erasure_code/aarch64/gf_vect_mad_sve.S new file mode 100644 index 0000000..970cf23 --- /dev/null +++ b/erasure_code/aarch64/gf_vect_mad_sve.S @@ -0,0 +1,123 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + + +.global gf_vect_mad_sve +.type gf_vect_mad_sve, %function + +/* gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char *dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest .req z3 + +z_tmp1_lo .req z4 +z_tmp1_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +gf_vect_mad_sve: + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* prefetch dest data */ + prfb pldl2strm, p0, [x_dest, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest.b, p0/z, [x_dest, x_pos] + + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b + + /* exclusive or, ie. gf(2^8) add */ + eor z_dest.d, z_tmp1_lo.d, z_dest.d + eor z_dest.d, z_tmp1_hi.d, z_dest.d + + /* store dest data, governed by p0 */ + st1b z_dest.b, p0, [x_dest, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/erasure_code/aarch64/gf_vect_mul_sve.S b/erasure_code/aarch64/gf_vect_mul_sve.S new file mode 100644 index 0000000..195b597 --- /dev/null +++ b/erasure_code/aarch64/gf_vect_mul_sve.S @@ -0,0 +1,117 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +.global gf_vect_mul_sve +.type gf_vect_mul_sve, %function + +/* Refer to include/gf_vect_mul.h + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B. + * @returns 0 pass, other fail + * + * int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest); + */ + +/* arguments */ +x_len .req x0 +x_tbl .req x1 +x_src .req x2 +x_dest .req x3 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x4 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src /* reuse */ + +z_dest .req z3 +z_tmp1_lo .req z4 +z_tmp1_hi .req z_dest /* reuse */ + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +gf_vect_mul_sve: + /* less than 32 bytes, return_fail */ + cmp x_len, #32 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest.d, z_tmp1_hi.d, z_tmp1_lo.d + + /* store dest data, governed by p0 */ + st1b z_dest.b, p0, [x_dest, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret
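
Note: every kernel in this patch performs GF(2^8) multiplication the same way: each source byte is split into its low and high nibble, each nibble indexes a 16-byte lookup table (tbl), and the two partial products are combined with eor. A minimal scalar C model of that technique is sketched below, assuming the usual isa-l layout of each 32-byte gftbls entry (bytes 0..15 hold the constant times the low-nibble values, bytes 16..31 hold the constant times the high-nibble values); gf_mul_tbl and gf_vect_mad_model are illustrative names, not library functions.

#include <stdint.h>

/* C * x over GF(2^8) via one 32-byte table entry, mirroring the tbl/eor
 * sequence: low nibble indexes the first 16 bytes, high nibble the second
 * 16 bytes, and the two partial products are XORed together. */
static inline uint8_t gf_mul_tbl(const uint8_t gftbl[32], uint8_t x)
{
	return gftbl[x & 0x0f] ^ gftbl[16 + (x >> 4)];
}

/* Per-byte model of what each SVE lane computes in the *_mad_sve kernels:
 * dest[i] ^= C * src[i]. */
static void gf_vect_mad_model(int len, const uint8_t gftbl[32],
			      const uint8_t *src, uint8_t *dest)
{
	int i;

	for (i = 0; i < len; i++)
		dest[i] ^= gf_mul_tbl(gftbl, src[i]);
}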
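
The N-destination dot-product kernels differ only in how many accumulators they keep live. For destination d, every output byte is the XOR over all source vectors s of the product of src[s] with the 32-byte table at gftbls + d * vlen * 32 + s * 32; the assembly forms the same addresses with "x_tbl + dest_idx * x_vec * 32" plus a 32-byte post-increment per source. A scalar sketch of that structure, under the same table-layout assumption and reusing gf_mul_tbl from the previous note (gf_nvect_dot_prod_model is a hypothetical name):

#include <stddef.h>
#include <stdint.h>

static void gf_nvect_dot_prod_model(int len, int vlen, const uint8_t *gftbls,
				    uint8_t **src, uint8_t **dest, int ndest)
{
	int d, s, i;

	for (d = 0; d < ndest; d++) {
		/* table row for this destination */
		const uint8_t *tbl_row = gftbls + (size_t)d * vlen * 32;

		for (i = 0; i < len; i++) {
			uint8_t acc = 0;

			/* accumulate over all source vectors */
			for (s = 0; s < vlen; s++)
				acc ^= gf_mul_tbl(tbl_row + (size_t)s * 32, src[s][i]);
			dest[d][i] = acc;
		}
	}
}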
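
The whilelo / ld1b / st1b / incb structure makes these kernels vector-length agnostic: the predicate p0 covers a full vector of bytes on every iteration except the last partial one, so no scalar tail loop is needed. Assuming a toolchain with ACLE SVE intrinsics (arm_sve.h, built with something like -march=armv8-a+sve), roughly the same loop as gf_vect_mad_sve can be written in C as sketched below; gf_vect_mad_sve_ref is an illustrative name, and gftbl is assumed to point at the 32-byte table already offset by vec_i * 32, as the assembly computes.

#include <stdint.h>
#include <arm_sve.h>

static void gf_vect_mad_sve_ref(int len, const uint8_t *gftbl,
				const uint8_t *src, uint8_t *dest)
{
	/* first 16 lanes hold the tables; indices are always 0..15 */
	svbool_t p16 = svwhilelt_b8_s64(0, 16);
	svuint8_t tbl_lo = svld1_u8(p16, gftbl);	/* low-nibble table  */
	svuint8_t tbl_hi = svld1_u8(p16, gftbl + 16);	/* high-nibble table */
	int64_t pos;

	for (pos = 0; pos < len; pos += (int64_t)svcntb()) {
		svbool_t p = svwhilelt_b8_s64(pos, (int64_t)len);	/* like whilelo */
		svuint8_t s = svld1_u8(p, src + pos);
		svuint8_t d = svld1_u8(p, dest + pos);
		svuint8_t lo = svand_n_u8_x(p, s, 0x0f);		/* low nibble  */
		svuint8_t hi = svlsr_n_u8_x(p, s, 4);			/* high nibble */
		svuint8_t prod = sveor_u8_x(p, svtbl_u8(tbl_lo, lo),
					    svtbl_u8(tbl_hi, hi));
		svst1_u8(p, dest + pos, sveor_u8_x(p, d, prod));	/* dest ^= C*src */
	}
}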
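
For the single-buffer multiply implemented by gf_vect_mul_sve above, callers normally use the public gf_vect_mul()/gf_vect_mul_init() API rather than the _sve symbol directly. A possible usage sketch is shown below; scale_region is an illustrative name, the header path follows the include/gf_vect_mul.h reference in the file comment, and len must be a multiple of 32 as that comment requires.

#include <stdint.h>

#include "gf_vect_mul.h"	/* isa-l: gf_vect_mul(), gf_vect_mul_init() */

/* dest = c * src over GF(2^8); returns 0 on success, non-zero on failure. */
static int scale_region(uint8_t c, void *src, void *dest, int len)
{
	unsigned char gftbl[32];

	gf_vect_mul_init(c, gftbl);	/* build the 32-byte lookup table for c */
	return gf_vect_mul(len, gftbl, src, dest);
}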