mirror of
https://github.com/intel/isa-l.git
synced 2024-12-12 09:23:50 +01:00
erasure_code: modify eor way in aarch64 neon codes
Change-Id: I9fb9219c5f280ed88194ec63234af046a5a036ae
Signed-off-by: Hang Li <lihang48@hisilicon.com>
This commit is contained in:
parent
ce9e56054a
commit
02a86dfb3f
@ -200,12 +200,12 @@ gf_2vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
|
||||
eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
|
||||
eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_1 */
|
||||
@ -214,12 +214,12 @@ gf_2vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
|
||||
eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
|
||||
eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_2 */
|
||||
@ -228,12 +228,12 @@ gf_2vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
|
||||
eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
|
||||
eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_3 */
|
||||
@ -242,12 +242,12 @@ gf_2vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
|
||||
eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
|
||||
eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_4 */
|
||||
@ -256,12 +256,12 @@ gf_2vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_4.16b, v_tmp1_lo.16b, v_p1_4.16b
|
||||
eor v_p1_4.16b, v_p1_4.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_4.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_4.16b, v_tmp1_lo.16b, v_p2_4.16b
|
||||
eor v_p2_4.16b, v_p2_4.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_5 */
|
||||
@ -270,12 +270,12 @@ gf_2vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_5.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_5.16b, v_tmp1_lo.16b, v_p1_5.16b
|
||||
eor v_p1_5.16b, v_p1_5.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_5.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_5.16b, v_tmp1_lo.16b, v_p2_5.16b
|
||||
eor v_p2_5.16b, v_p2_5.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_6 */
|
||||
@ -284,12 +284,12 @@ gf_2vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_6.16b, v_tmp1_lo.16b, v_p1_6.16b
|
||||
eor v_p1_6.16b, v_p1_6.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_6.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_6.16b, v_tmp1_lo.16b, v_p2_6.16b
|
||||
eor v_p2_6.16b, v_p2_6.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_7 */
|
||||
@ -298,12 +298,12 @@ gf_2vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_7.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_7.16b, v_tmp1_lo.16b, v_p1_7.16b
|
||||
eor v_p1_7.16b, v_p1_7.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_7.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_7.16b, v_tmp1_lo.16b, v_p2_7.16b
|
||||
eor v_p2_7.16b, v_p2_7.16b, v_tmp1_hi.16b
|
||||
|
||||
cmp x_vec_i, x_vec
|
||||
@ -363,12 +363,12 @@ gf_2vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp1_lo.16b
|
||||
eor v_p1.16b, v_tmp1_lo.16b, v_p1.16b
|
||||
eor v_p1.16b, v_p1.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp1_lo.16b
|
||||
eor v_p2.16b, v_tmp1_lo.16b, v_p2.16b
|
||||
eor v_p2.16b, v_p2.16b, v_tmp1_hi.16b
|
||||
|
||||
cmp x_vec_i, x_vec
|
||||
|
@ -197,42 +197,42 @@ gf_2vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
|
||||
eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
|
||||
eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2.16b, v_tmp_lo.16b, v_d2.16b
|
||||
eor v_d2.16b, v_d2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3.16b, v_tmp_lo.16b, v_d3.16b
|
||||
eor v_d3.16b, v_d3.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4.16b, v_tmp_lo.16b, v_d4.16b
|
||||
eor v_d4.16b, v_d4.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5.16b, v_tmp_lo.16b, v_d5.16b
|
||||
eor v_d5.16b, v_d5.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d6.16b, v_tmp_lo.16b, v_d6.16b
|
||||
eor v_d6.16b, v_d6.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d7.16b, v_tmp_lo.16b, v_d7.16b
|
||||
eor v_d7.16b, v_d7.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d0, [x_dest1, #16*0]
|
||||
@ -255,42 +255,42 @@ gf_2vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
|
||||
eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
|
||||
eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2.16b, v_tmp_lo.16b, v_d2.16b
|
||||
eor v_d2.16b, v_d2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3.16b, v_tmp_lo.16b, v_d3.16b
|
||||
eor v_d3.16b, v_d3.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_4_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_4_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4.16b, v_tmp_lo.16b, v_d4.16b
|
||||
eor v_d4.16b, v_d4.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_5_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_5_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5.16b, v_tmp_lo.16b, v_d5.16b
|
||||
eor v_d5.16b, v_d5.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_6_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_6_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d6.16b, v_tmp_lo.16b, v_d6.16b
|
||||
eor v_d6.16b, v_d6.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_7_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_7_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d7.16b, v_tmp_lo.16b, v_d7.16b
|
||||
eor v_d7.16b, v_d7.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d0, [x_dest2, #16*0]
|
||||
@ -333,12 +333,12 @@ gf_2vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
|
||||
eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
|
||||
eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d0, [x_dest1]
|
||||
|
@ -187,17 +187,17 @@ gf_3vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
|
||||
eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
|
||||
eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
|
||||
eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_1 */
|
||||
@ -206,17 +206,17 @@ gf_3vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
|
||||
eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
|
||||
eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
|
||||
eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_2 */
|
||||
@ -225,17 +225,17 @@ gf_3vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
|
||||
eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
|
||||
eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
|
||||
eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_3 */
|
||||
@ -244,17 +244,17 @@ gf_3vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
|
||||
eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
|
||||
eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
|
||||
eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b
|
||||
|
||||
cmp x_vec_i, x_vec
|
||||
@ -322,11 +322,11 @@ gf_3vect_dot_prod_neon:
|
||||
tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
|
||||
tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
|
||||
|
||||
eor v_gft1_lo.16b, v_gft1_hi.16b, v_gft1_lo.16b
|
||||
eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
|
||||
eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
|
||||
eor v_gft2_lo.16b, v_gft2_hi.16b, v_gft2_lo.16b
|
||||
eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
|
||||
eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
|
||||
eor v_gft3_lo.16b, v_gft3_hi.16b, v_gft3_lo.16b
|
||||
eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
|
||||
eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
|
||||
|
||||
cmp x_vec_i, x_vec
|
||||
|
@ -192,62 +192,62 @@ gf_3vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
|
||||
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
|
||||
eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
|
||||
eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
|
||||
eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
|
||||
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
|
||||
eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
|
||||
eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
|
||||
eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
|
||||
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
|
||||
eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
|
||||
eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
|
||||
eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d1_0, [x_dest1, #16*0]
|
||||
@ -297,17 +297,17 @@ gf_3vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
|
||||
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
|
||||
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
|
||||
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d1_0, [x_dest1]
|
||||
|
@ -213,22 +213,22 @@ gf_4vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
|
||||
eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
|
||||
eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
|
||||
eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b
|
||||
eor v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_1 */
|
||||
@ -237,22 +237,22 @@ gf_4vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
|
||||
eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
|
||||
eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
|
||||
eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b
|
||||
eor v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_2 */
|
||||
@ -261,22 +261,22 @@ gf_4vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
|
||||
eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
|
||||
eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
|
||||
eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b
|
||||
eor v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b
|
||||
|
||||
/* data_3 */
|
||||
@ -285,22 +285,22 @@ gf_4vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
|
||||
eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
|
||||
eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
|
||||
eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b
|
||||
eor v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b
|
||||
|
||||
cmp x_vec_i, x_vec
|
||||
@ -382,13 +382,13 @@ gf_4vect_dot_prod_neon:
|
||||
tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
|
||||
tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
|
||||
|
||||
eor v_gft1_lo.16b, v_gft1_hi.16b, v_gft1_lo.16b
|
||||
eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
|
||||
eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
|
||||
eor v_gft2_lo.16b, v_gft2_hi.16b, v_gft2_lo.16b
|
||||
eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
|
||||
eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
|
||||
eor v_gft3_lo.16b, v_gft3_hi.16b, v_gft3_lo.16b
|
||||
eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
|
||||
eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
|
||||
eor v_gft4_lo.16b, v_gft4_hi.16b, v_gft4_lo.16b
|
||||
eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
|
||||
eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
|
||||
|
||||
cmp x_vec_i, x_vec
|
||||
|
@ -208,43 +208,43 @@ gf_4vect_mad_neon:
|
||||
/* dest1 */
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
|
||||
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
|
||||
eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
|
||||
eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
|
||||
eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* dest2 */
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
|
||||
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
|
||||
eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
|
||||
eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
|
||||
eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d1_0, [x_dest1, #16*0]
|
||||
@ -272,43 +272,43 @@ gf_4vect_mad_neon:
|
||||
/* dest3 */
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
|
||||
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
|
||||
eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
|
||||
eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
|
||||
eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* dest4 */
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
|
||||
eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
|
||||
eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
|
||||
eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
|
||||
eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d3_0, [x_dest3, #16*0]
|
||||
@ -351,12 +351,12 @@ gf_4vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
|
||||
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
|
||||
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d1_0, [x_dest1]
|
||||
@ -366,12 +366,12 @@ gf_4vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
|
||||
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
|
||||
eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d3_0, [x_dest3]
|
||||
|
@ -238,22 +238,22 @@ gf_5vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b
|
||||
eor v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b
|
||||
eor v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b
|
||||
eor v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b
|
||||
eor v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* v_p2_x */
|
||||
@@ -263,22 +263,22 @@ gf_5vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b
|
||||
eor v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b
|
||||
eor v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b
|
||||
eor v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b
|
||||
eor v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* v_p3_x */
|
||||
@@ -288,22 +288,22 @@ gf_5vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b
|
||||
eor v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b
|
||||
eor v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b
|
||||
eor v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b
|
||||
eor v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* v_p4_x */
|
||||
@@ -313,22 +313,22 @@ gf_5vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b
|
||||
eor v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b
|
||||
eor v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b
|
||||
eor v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b
|
||||
eor v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* v_p5_x */
|
||||
@@ -337,22 +337,22 @@ gf_5vect_dot_prod_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b
|
||||
eor v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b
|
||||
eor v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b
|
||||
eor v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b
|
||||
eor v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b
|
||||
|
||||
cmp x_vec_i, x_vec
|
||||
@@ -439,15 +439,15 @@ gf_5vect_dot_prod_neon:
|
||||
tbl v_gft5_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
|
||||
tbl v_gft5_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
|
||||
|
||||
eor v_gft1_lo.16b, v_gft1_hi.16b, v_gft1_lo.16b
|
||||
eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
|
||||
eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
|
||||
eor v_gft2_lo.16b, v_gft2_hi.16b, v_gft2_lo.16b
|
||||
eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
|
||||
eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
|
||||
eor v_gft3_lo.16b, v_gft3_hi.16b, v_gft3_lo.16b
|
||||
eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
|
||||
eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
|
||||
eor v_gft4_lo.16b, v_gft4_hi.16b, v_gft4_lo.16b
|
||||
eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
|
||||
eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
|
||||
eor v_gft5_lo.16b, v_gft5_hi.16b, v_gft5_lo.16b
|
||||
eor v_p5_0.16b, v_gft5_hi.16b, v_p5_0.16b
|
||||
eor v_p5_0.16b, v_p5_0.16b, v_gft5_lo.16b
|
||||
|
||||
cmp x_vec_i, x_vec
|
||||
|
@@ -230,64 +230,64 @@ gf_5vect_mad_neon:
|
||||
/* dest1 */
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
|
||||
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
|
||||
eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
|
||||
eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
|
||||
eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* dest2 */
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
|
||||
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
|
||||
eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
|
||||
eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
|
||||
eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* dest3 */
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
|
||||
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
|
||||
eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
|
||||
eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
|
||||
eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d1_0, [x_dest1, #16*0]
|
||||
@@ -324,43 +324,43 @@ gf_5vect_mad_neon:
|
||||
/* dest4 */
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
|
||||
eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
|
||||
eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
|
||||
eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
|
||||
eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* dest5 */
|
||||
tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
|
||||
eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b
|
||||
eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b
|
||||
eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b
|
||||
eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d4_0, [x_dest4, #16*0]
|
||||
@@ -406,17 +406,17 @@ gf_5vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
|
||||
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
|
||||
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
|
||||
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d1_0, [x_dest1]
|
||||
@@ -430,12 +430,12 @@ gf_5vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
|
||||
eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
|
||||
eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d4_0, [x_dest4]
|
||||
|
@@ -246,64 +246,64 @@ gf_6vect_mad_neon:
|
||||
/* dest1 */
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
|
||||
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
|
||||
eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
|
||||
eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
|
||||
eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* dest2 */
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
|
||||
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
|
||||
eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
|
||||
eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
|
||||
eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* dest3 */
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
|
||||
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
|
||||
eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
|
||||
eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
|
||||
eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d1_0, [x_dest1, #16*0]
|
||||
@@ -347,64 +347,64 @@ gf_6vect_mad_neon:
|
||||
/* dest4 */
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
|
||||
eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
|
||||
eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
|
||||
eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
|
||||
eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* dest5 */
|
||||
tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
|
||||
eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b
|
||||
eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b
|
||||
eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b
|
||||
eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b
|
||||
|
||||
/* dest6 */
|
||||
tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_0_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_0_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b
|
||||
eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_1_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d6_1.16b, v_tmp_lo.16b, v_d6_1.16b
|
||||
eor v_d6_1.16b, v_d6_1.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_2_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_2_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d6_2.16b, v_tmp_lo.16b, v_d6_2.16b
|
||||
eor v_d6_2.16b, v_d6_2.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_3_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d6_3.16b, v_tmp_lo.16b, v_d6_3.16b
|
||||
eor v_d6_3.16b, v_d6_3.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d4_0, [x_dest4, #16*0]
|
||||
@@ -458,17 +458,17 @@ gf_6vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
|
||||
eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
|
||||
eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
|
||||
eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d1_0, [x_dest1]
|
||||
@@ -485,17 +485,17 @@ gf_6vect_mad_neon:
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
|
||||
eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
|
||||
eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
|
||||
|
||||
tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
|
||||
eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b
|
||||
eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
|
||||
|
||||
str q_d4_0, [x_dest4]
|
||||
|
@@ -203,22 +203,21 @@ gf_vect_dot_prod_neon:
|
||||
tbl v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
|
||||
tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
|
||||
|
||||
eor v_data_0_hi.16b, v_data_0_lo.16b, v_data_0_hi.16b
|
||||
eor v_data_1_hi.16b, v_data_1_lo.16b, v_data_1_hi.16b
|
||||
eor v_data_2_hi.16b, v_data_2_lo.16b, v_data_2_hi.16b
|
||||
eor v_data_3_hi.16b, v_data_3_lo.16b, v_data_3_hi.16b
|
||||
eor v_data_4_hi.16b, v_data_4_lo.16b, v_data_4_hi.16b
|
||||
eor v_data_5_hi.16b, v_data_5_lo.16b, v_data_5_hi.16b
|
||||
eor v_data_6_hi.16b, v_data_6_lo.16b, v_data_6_hi.16b
|
||||
eor v_data_7_hi.16b, v_data_7_lo.16b, v_data_7_hi.16b
|
||||
|
||||
eor v_p0.16b, v_data_0_lo.16b, v_p0.16b
|
||||
eor v_p0.16b, v_p0.16b, v_data_0_hi.16b
|
||||
eor v_p1.16b, v_data_1_lo.16b, v_p1.16b
|
||||
eor v_p1.16b, v_p1.16b, v_data_1_hi.16b
|
||||
eor v_p2.16b, v_data_2_lo.16b, v_p2.16b
|
||||
eor v_p2.16b, v_p2.16b, v_data_2_hi.16b
|
||||
eor v_p3.16b, v_data_3_lo.16b, v_p3.16b
|
||||
eor v_p3.16b, v_p3.16b, v_data_3_hi.16b
|
||||
eor v_p4.16b, v_data_4_lo.16b, v_p4.16b
|
||||
eor v_p4.16b, v_p4.16b, v_data_4_hi.16b
|
||||
eor v_p5.16b, v_data_5_lo.16b, v_p5.16b
|
||||
eor v_p5.16b, v_p5.16b, v_data_5_hi.16b
|
||||
eor v_p6.16b, v_data_6_lo.16b, v_p6.16b
|
||||
eor v_p6.16b, v_p6.16b, v_data_6_hi.16b
|
||||
eor v_p7.16b, v_data_7_lo.16b, v_p7.16b
|
||||
eor v_p7.16b, v_p7.16b, v_data_7_hi.16b
|
||||
|
||||
cmp x_vec_i, x_vec
|
||||
@@ -269,7 +268,7 @@ gf_vect_dot_prod_neon:
|
||||
|
||||
tbl v_data_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
|
||||
tbl v_data_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
|
||||
eor v_data_hi.16b, v_data_lo.16b, v_data_hi.16b
|
||||
eor v_p.16b, v_data_lo.16b, v_p.16b
|
||||
eor v_p.16b, v_p.16b, v_data_hi.16b
|
||||
|
||||
cmp x_vec_i, x_vec
|
||||
|
@@ -190,9 +190,9 @@ gf_vect_mad_neon:
|
||||
tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
|
||||
tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
|
||||
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
|
||||
eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
|
||||
eor v_tmp2_hi.16b, v_tmp2_lo.16b, v_tmp2_hi.16b
|
||||
eor v_d1_1.16b, v_tmp2_lo.16b, v_d1_1.16b
|
||||
eor v_d1_1.16b, v_d1_1.16b, v_tmp2_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
|
||||
@@ -200,9 +200,9 @@ gf_vect_mad_neon:
|
||||
tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
|
||||
tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
|
||||
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_d1_2.16b, v_tmp1_lo.16b, v_d1_2.16b
|
||||
eor v_d1_2.16b, v_d1_2.16b, v_tmp1_hi.16b
|
||||
eor v_tmp2_hi.16b, v_tmp2_lo.16b, v_tmp2_hi.16b
|
||||
eor v_d1_3.16b, v_tmp2_lo.16b, v_d1_3.16b
|
||||
eor v_d1_3.16b, v_d1_3.16b, v_tmp2_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
|
||||
@@ -210,9 +210,9 @@ gf_vect_mad_neon:
|
||||
tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
|
||||
tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
|
||||
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_d1_4.16b, v_tmp1_lo.16b, v_d1_4.16b
|
||||
eor v_d1_4.16b, v_d1_4.16b, v_tmp1_hi.16b
|
||||
eor v_tmp2_hi.16b, v_tmp2_lo.16b, v_tmp2_hi.16b
|
||||
eor v_d1_5.16b, v_tmp2_lo.16b, v_d1_5.16b
|
||||
eor v_d1_5.16b, v_d1_5.16b, v_tmp2_hi.16b
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
|
||||
@@ -220,9 +220,9 @@ gf_vect_mad_neon:
|
||||
tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
|
||||
tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
|
||||
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_d1_6.16b, v_tmp1_lo.16b, v_d1_6.16b
|
||||
eor v_d1_6.16b, v_d1_6.16b, v_tmp1_hi.16b
|
||||
eor v_tmp2_hi.16b, v_tmp2_lo.16b, v_tmp2_hi.16b
|
||||
eor v_d1_7.16b, v_tmp2_lo.16b, v_d1_7.16b
|
||||
eor v_d1_7.16b, v_d1_7.16b, v_tmp2_hi.16b
|
||||
|
||||
str q_d1_0, [x_dest1, #16*0]
|
||||
@@ -262,7 +262,7 @@ gf_vect_mad_neon:
|
||||
|
||||
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
|
||||
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
|
||||
eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
|
||||
eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
|
||||
eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
|
||||
|
||||
str q_d1_0, [x_dest1]
|
||||
|
Loading…
Reference in New Issue
Block a user