/* Copyright (c) 2018, Arm Limited. */

/*
 * mbin_dispatch name, func_neon [, func_sve]
 *
 * Emits the public entry point `name` together with a writable 8-byte slot
 * `name_dispatched` that holds the address of the implementation to run.
 *
 *   - The slot initially points at `name_dispatch_init`.
 *   - `name` loads the slot and branches to it, leaving every argument
 *     register untouched.
 *   - `name_dispatch_init` stores the address of `func_neon` into the slot
 *     and branches to `func_neon`, so later calls go there directly.
 *
 * Register contract:
 *   x9 and x10 are overwritten by the stubs (both are caller-saved in
 *   AAPCS64).  `name_dispatch_init` relies on x9 still holding the page
 *   address produced by the `adrp` in `name`; it must only ever be reached
 *   through the slot.
 *
 * `func_sve` is accepted but ignored.
 * TODO: support SVE
 *
 * NOTE(review): `func_neon` is addressed with adrp/:lo12:.  Confirm the
 * linker accepts this against a default-visibility symbol when the
 * library is built as a shared object.
 */
.macro mbin_dispatch name:req, func_neon:req, func_sve
	.section .data
	.balign	8
\name\()_dispatched:
	.quad	\name\()_dispatch_init

	.text
	.global	\name
#ifdef __ELF__
	/* mark the entry point as a function symbol (was STT_NOTYPE) */
	.type	\name, %function
#endif
\name\():
	adrp	x9, \name\()_dispatched
	ldr	x10, [x9, :lo12:\name\()_dispatched]
	br	x10

\name\()_dispatch_init:
	/* x9 = page of the slot, left there by the entry stub above */
	add	x9, x9, :lo12:\name\()_dispatched
	adrp	x10, \func_neon
	add	x10, x10, :lo12:\func_neon
	str	x10, [x9]
	br	x10
#ifdef __ELF__
	.size	\name, . - \name
#endif
.endm

/*
 * Example: `mbin_dispatch xor_gen, xor_gen_neon` expands (on ELF) to
 *
 *	.section .data
 *	.balign	8
 * xor_gen_dispatched:
 *	.quad	xor_gen_dispatch_init
 *
 *	.text
 *	.global	xor_gen
 *	.type	xor_gen, %function
 * xor_gen:
 *	adrp	x9, xor_gen_dispatched
 *	ldr	x10, [x9, :lo12:xor_gen_dispatched]
 *	br	x10
 *
 * xor_gen_dispatch_init:
 *	add	x9, x9, :lo12:xor_gen_dispatched
 *	adrp	x10, xor_gen_neon
 *	add	x10, x10, :lo12:xor_gen_neon
 *	str	x10, [x9]
 *	br	x10
 *	.size	xor_gen, . - xor_gen
 */
/* Copyright (c) 2018, Arm Limited. */

/*
 * int pq_check_neon(int vects, int len, void **src)
 *
 * Recomputes the P and Q parity of the data buffers src[0] .. src[vects-3]
 * and compares them with the stored parity in src[vects-2] (P) and
 * src[vects-1] (Q).
 *
 *   P = src[0] ^ src[1] ^ ... ^ src[vects-3]
 *   Q = src[0] ^ 2*src[1] ^ 4*src[2] ^ ...       (byte-wise)
 *       where doubling a byte shifts it left by one and, when bit 7 was
 *       set, XORs the result with 0x1D.
 *
 * In:    w0 = vects (>= 3), w1 = len (multiple of 16), x2 = src
 * Out:   w0 = 0 when both parities match,
 *             1 on mismatch or when the arguments violate the limits above
 * Clobb: x3-x7, x9-x12, v0-v7, v16-v29, flags.
 *        d8-d15 are saved and restored around the 128-byte loop.
 *
 * Only the low 32 bits of x0/x1 are trusted: AAPCS64 leaves the upper half
 * of a register carrying an `int` argument unspecified.
 */

.text

.global pq_check_neon

/* arguments */
w_vects		.req	w0	/* MUST >= 3 */
x_vects		.req	x0	/* valid only after a write to w_vects */
w_len		.req	w1	/* MUST be 16x bytes */
x_src		.req	x2

/* returns */
w_ret		.req	w0

/* local variables */
x_dst_p		.req	x3
x_dst_q		.req	x4
x_dst_q_end	.req	x5
w_col		.req	w6
x_col		.req	x6
x_src_ptr	.req	x7
x_src_ptr_end	.req	x9
x_src_last	.req	x10
x_srcn		.req	x11
w_min		.req	w12

/* vectors */
/* v0  ~ v7 : temporary p */
/* v8  ~ v15: temporary q */
/* v16 ~ v23: next 128 bytes */
v_mask0		.req	v24
v_mask1		.req	v25
v_mask2		.req	v26
v_mask3		.req	v27
v_gf8poly	.req	v28
v_0x80		.req	v29

/*
 * For four accumulators at once: acc = 2*acc ^ in, byte-wise.
 * The four chains are interleaved so independent instructions sit next to
 * each other.  Uses v_mask0..v_mask3; does not touch the flags.
 */
.macro gf8_mul2_xor4 acc0, acc1, acc2, acc3, in0, in1, in2, in3
	cmhs	v_mask0.16b, \acc0\().16b, v_0x80.16b
	cmhs	v_mask1.16b, \acc1\().16b, v_0x80.16b
	cmhs	v_mask2.16b, \acc2\().16b, v_0x80.16b
	cmhs	v_mask3.16b, \acc3\().16b, v_0x80.16b
	and	v_mask0.16b, v_mask0.16b, v_gf8poly.16b
	and	v_mask1.16b, v_mask1.16b, v_gf8poly.16b
	and	v_mask2.16b, v_mask2.16b, v_gf8poly.16b
	and	v_mask3.16b, v_mask3.16b, v_gf8poly.16b
	shl	\acc0\().16b, \acc0\().16b, #1
	shl	\acc1\().16b, \acc1\().16b, #1
	shl	\acc2\().16b, \acc2\().16b, #1
	shl	\acc3\().16b, \acc3\().16b, #1
	eor	\acc0\().16b, \acc0\().16b, v_mask0.16b
	eor	\acc1\().16b, \acc1\().16b, v_mask1.16b
	eor	\acc2\().16b, \acc2\().16b, v_mask2.16b
	eor	\acc3\().16b, \acc3\().16b, v_mask3.16b
	eor	\acc0\().16b, \acc0\().16b, \in0\().16b
	eor	\acc1\().16b, \acc1\().16b, \in1\().16b
	eor	\acc2\().16b, \acc2\().16b, \in2\().16b
	eor	\acc3\().16b, \acc3\().16b, \in3\().16b
.endm

/*
 *        src_ptr_end -->
 *               -------+----------+
 *                .     |  src[0]  |
 *                .     +----------+            +------------------+
 *          src_ptr --> |  src[1]  | - srcn ->  |      buffer      |
 *                .     +----------+            +------------------+
 *                .     |  ......  |
 *                .     +----------+
 *                .     | src[v-4] |
 *               -------+----------+  src_last  +------------------+
 *              src --> | src[v-3] | ---------> |      buffer      |
 *                      +----------+            +------------------+
 *                      | src[v-2] | - dst_p -> |      buffer      |
 *                      +----------+            +------------------+
 *                      | src[v-1] | - dst_q -> |      buffer      | dst_q_end
 *                      +----------+            +------------------+
 */

pq_check_neon:
	/* Reject arguments the loops below cannot handle safely. */
	cmp	w_vects, #3
	blt	.Lerror			/* too few vectors (signed compare) */
	tbnz	w_len, #31, .Lerror	/* negative length */
	tst	w_len, #0xF
	bne	.Lerror			/* not a whole number of 16-byte blocks */

	sub	x_src_ptr_end, x_src, #8

	/* the 32-bit write zero-extends, so x_vects is valid from here on */
	sub	w_vects, w_vects, #3
	add	x_src, x_src, x_vects, lsl #3

	ldr	x_src_last, [x_src]
	ldp	x_dst_p, x_dst_q, [x_src, #8]

	/* zero-extend len explicitly instead of reading all of x1 */
	add	x_dst_q_end, x_dst_q, w_len, uxtw

	mov	w_min, #-1
	mov	w_col, #0
	movi	v_gf8poly.16b, #0x1D
	movi	v_0x80.16b, #0x80

.Lloop128_init:
	/* less than 128 bytes? */
	cmp	w_len, #128
	blo	.Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* loop while a full 128-byte block still fits */
	sub	x_dst_q_end, x_dst_q_end, #128

	/* batch process (vects-2)*128 bytes */
	/* v0~v7: p; v8~v15: q; v16~v23: in */
.Lloop128:
	ldr	q0, [x_src_last, #16*0]
	ldr	q1, [x_src_last, #16*1]
	ldr	q2, [x_src_last, #16*2]
	ldr	q3, [x_src_last, #16*3]
	ldr	q4, [x_src_last, #16*4]
	ldr	q5, [x_src_last, #16*5]
	ldr	q6, [x_src_last, #16*6]
	ldr	q7, [x_src_last, #16*7]
	add	x_src_last, x_src_last, #128

	/* p and q both start as the last data buffer */
	mov	v8.16b, v0.16b
	mov	v9.16b, v1.16b
	mov	v10.16b, v2.16b
	mov	v11.16b, v3.16b
	mov	v12.16b, v4.16b
	mov	v13.16b, v5.16b
	mov	v14.16b, v6.16b
	mov	v15.16b, v7.16b

	cbz	w_vects, .Lloop128_vects_end

	sub	x_src_ptr, x_src, #8
.Lloop128_vects:
	ldr	x_srcn, [x_src_ptr], #-8
	add	x_srcn, x_srcn, x_col
	/* flags stay live until the bne below; NEON ops do not change them */
	cmp	x_src_ptr, x_src_ptr_end

	ldr	q16, [x_srcn, #16*0]
	ldr	q17, [x_srcn, #16*1]
	ldr	q18, [x_srcn, #16*2]
	ldr	q19, [x_srcn, #16*3]
	ldr	q20, [x_srcn, #16*4]
	ldr	q21, [x_srcn, #16*5]
	ldr	q22, [x_srcn, #16*6]
	ldr	q23, [x_srcn, #16*7]

	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v6.16b, v6.16b, v22.16b
	eor	v7.16b, v7.16b, v23.16b

	gf8_mul2_xor4	v8, v9, v10, v11, v16, v17, v18, v19
	gf8_mul2_xor4	v12, v13, v14, v15, v20, v21, v22, v23

	bne	.Lloop128_vects

.Lloop128_vects_end:
	/* v16~v23: true p, q */
	ldr	q16, [x_dst_p, #16*0]
	ldr	q17, [x_dst_p, #16*1]
	ldr	q18, [x_dst_p, #16*2]
	ldr	q19, [x_dst_p, #16*3]
	ldr	q20, [x_dst_p, #16*4]
	ldr	q21, [x_dst_p, #16*5]
	ldr	q22, [x_dst_p, #16*6]
	ldr	q23, [x_dst_p, #16*7]

	cmeq	v0.16b, v0.16b, v16.16b
	cmeq	v1.16b, v1.16b, v17.16b
	cmeq	v2.16b, v2.16b, v18.16b
	cmeq	v3.16b, v3.16b, v19.16b
	cmeq	v4.16b, v4.16b, v20.16b
	cmeq	v5.16b, v5.16b, v21.16b
	cmeq	v6.16b, v6.16b, v22.16b
	cmeq	v7.16b, v7.16b, v23.16b

	ldr	q16, [x_dst_q, #16*0]
	ldr	q17, [x_dst_q, #16*1]
	ldr	q18, [x_dst_q, #16*2]
	ldr	q19, [x_dst_q, #16*3]
	ldr	q20, [x_dst_q, #16*4]
	ldr	q21, [x_dst_q, #16*5]
	ldr	q22, [x_dst_q, #16*6]
	ldr	q23, [x_dst_q, #16*7]

	/* fold the eight p comparison masks into v0 */
	and	v0.16b, v0.16b, v1.16b
	and	v2.16b, v2.16b, v3.16b
	and	v4.16b, v4.16b, v5.16b
	and	v6.16b, v6.16b, v7.16b
	and	v0.16b, v0.16b, v2.16b
	and	v4.16b, v4.16b, v6.16b
	and	v0.16b, v0.16b, v4.16b

	cmeq	v8.16b, v8.16b, v16.16b
	cmeq	v9.16b, v9.16b, v17.16b
	cmeq	v10.16b, v10.16b, v18.16b
	cmeq	v11.16b, v11.16b, v19.16b
	cmeq	v12.16b, v12.16b, v20.16b
	cmeq	v13.16b, v13.16b, v21.16b
	cmeq	v14.16b, v14.16b, v22.16b
	cmeq	v15.16b, v15.16b, v23.16b

	/* fold the eight q comparison masks into v8 */
	and	v8.16b, v8.16b, v9.16b
	and	v10.16b, v10.16b, v11.16b
	and	v12.16b, v12.16b, v13.16b
	and	v14.16b, v14.16b, v15.16b
	and	v8.16b, v8.16b, v10.16b
	and	v12.16b, v12.16b, v14.16b
	and	v8.16b, v8.16b, v12.16b

	and	v0.16b, v0.16b, v8.16b

	/* any byte that compared unequal makes the minimum 0 */
	uminv	b0, v0.16b
	umov	w_min, v0.b[0]
	cbz	w_min, .Lloop128_end

	add	x_dst_p, x_dst_p, #128
	add	x_dst_q, x_dst_q, #128
	cmp	x_dst_q, x_dst_q_end
	add	w_col, w_col, #128
	bls	.Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* w_min == 0 here means the loop above stopped on a mismatch */
	cbz	w_min, .Lerror

	add	x_dst_q_end, x_dst_q_end, #128

.Lloop16_init:
	/* nothing left once len is a multiple of 128 (this includes len == 0) */
	tst	w_len, #0x7F
	beq	.Lloop16_end
	sub	x_dst_q_end, x_dst_q_end, #16

	/* batch process (vects-2)*16 bytes */
	/* v0: p; v1: q; v2: in; v3: mask */
.Lloop16:
	ldr	q0, [x_src_last], #16
	mov	v1.16b, v0.16b

	cbz	w_vects, .Lloop16_vects_end

	sub	x_src_ptr, x_src, #8
.Lloop16_vects:
	ldr	x_srcn, [x_src_ptr], #-8
	ldr	q2, [x_srcn, x_col]
	cmp	x_src_ptr, x_src_ptr_end

	eor	v0.16b, v0.16b, v2.16b

	cmhs	v3.16b, v1.16b, v_0x80.16b
	and	v3.16b, v3.16b, v_gf8poly.16b

	shl	v1.16b, v1.16b, #1
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v3.16b

	bne	.Lloop16_vects

.Lloop16_vects_end:
	/* v4: true p; v5: true q */
	ldr	q4, [x_dst_p], #16
	ldr	q5, [x_dst_q], #16
	/* flags are consumed by the bls at the bottom of the loop */
	cmp	x_dst_q, x_dst_q_end

	cmeq	v0.16b, v0.16b, v4.16b
	cmeq	v1.16b, v1.16b, v5.16b
	and	v0.16b, v0.16b, v1.16b

	uminv	b0, v0.16b
	umov	w_min, v0.b[0]
	cbz	w_min, .Lerror

	add	w_col, w_col, #16
	bls	.Lloop16

.Lloop16_end:
	mov	w_ret, #0
	ret

.Lerror:
	mov	w_ret, #1
	ret
/* Copyright (c) 2018, Arm Limited. */

/*
 * int pq_gen_neon(int vects, int len, void **src)
 *
 * Generates the P and Q parity of the data buffers src[0] .. src[vects-3]
 * and writes them to src[vects-2] (P) and src[vects-1] (Q).
 *
 *   P = src[0] ^ src[1] ^ ... ^ src[vects-3]
 *   Q = src[0] ^ 2*src[1] ^ 4*src[2] ^ ...       (byte-wise)
 *       where doubling a byte shifts it left by one and, when bit 7 was
 *       set, XORs the result with 0x1D.
 *
 * In:    w0 = vects (>= 3), w1 = len (multiple of 16), x2 = src
 * Out:   w0 = 0 on success,
 *             1 when the arguments violate the limits above (nothing is
 *               written in that case)
 * Clobb: x3-x7, x9-x11, v0-v7, v16-v29, flags.
 *        d8-d15 are saved and restored around the 128-byte loop.
 *
 * Only the low 32 bits of x0/x1 are trusted: AAPCS64 leaves the upper half
 * of a register carrying an `int` argument unspecified.
 */

.text

.global pq_gen_neon

/* arguments */
w_vects		.req	w0	/* MUST >= 3 */
x_vects		.req	x0	/* valid only after a write to w_vects */
w_len		.req	w1	/* MUST be 16x bytes */
x_src		.req	x2

/* returns */
w_ret		.req	w0

/* local variables */
x_dst_p		.req	x3
x_dst_q		.req	x4
x_dst_q_end	.req	x5
w_col		.req	w6
x_col		.req	x6
x_src_ptr	.req	x7
x_src_ptr_end	.req	x9
x_src_last	.req	x10
x_srcn		.req	x11

/* vectors */
/* v0  ~ v7 : temporary p */
/* v8  ~ v15: temporary q */
/* v16 ~ v23: next 128 bytes */
v_mask0		.req	v24
v_mask1		.req	v25
v_mask2		.req	v26
v_mask3		.req	v27
v_gf8poly	.req	v28
v_0x80		.req	v29

/*
 * For four accumulators at once: acc = 2*acc ^ in, byte-wise.
 * The four chains are interleaved so independent instructions sit next to
 * each other.  Uses v_mask0..v_mask3; does not touch the flags.
 */
.macro gf8_mul2_xor4 acc0, acc1, acc2, acc3, in0, in1, in2, in3
	cmhs	v_mask0.16b, \acc0\().16b, v_0x80.16b
	cmhs	v_mask1.16b, \acc1\().16b, v_0x80.16b
	cmhs	v_mask2.16b, \acc2\().16b, v_0x80.16b
	cmhs	v_mask3.16b, \acc3\().16b, v_0x80.16b
	and	v_mask0.16b, v_mask0.16b, v_gf8poly.16b
	and	v_mask1.16b, v_mask1.16b, v_gf8poly.16b
	and	v_mask2.16b, v_mask2.16b, v_gf8poly.16b
	and	v_mask3.16b, v_mask3.16b, v_gf8poly.16b
	shl	\acc0\().16b, \acc0\().16b, #1
	shl	\acc1\().16b, \acc1\().16b, #1
	shl	\acc2\().16b, \acc2\().16b, #1
	shl	\acc3\().16b, \acc3\().16b, #1
	eor	\acc0\().16b, \acc0\().16b, v_mask0.16b
	eor	\acc1\().16b, \acc1\().16b, v_mask1.16b
	eor	\acc2\().16b, \acc2\().16b, v_mask2.16b
	eor	\acc3\().16b, \acc3\().16b, v_mask3.16b
	eor	\acc0\().16b, \acc0\().16b, \in0\().16b
	eor	\acc1\().16b, \acc1\().16b, \in1\().16b
	eor	\acc2\().16b, \acc2\().16b, \in2\().16b
	eor	\acc3\().16b, \acc3\().16b, \in3\().16b
.endm

/*
 *        src_ptr_end -->
 *               -------+----------+
 *                .     |  src[0]  |
 *                .     +----------+            +------------------+
 *          src_ptr --> |  src[1]  | - srcn ->  |      buffer      |
 *                .     +----------+            +------------------+
 *                .     |  ......  |
 *                .     +----------+
 *                .     | src[v-4] |
 *               -------+----------+  src_last  +------------------+
 *              src --> | src[v-3] | ---------> |      buffer      |
 *                      +----------+            +------------------+
 *                      | src[v-2] | - dst_p -> |      buffer      |
 *                      +----------+            +------------------+
 *                      | src[v-1] | - dst_q -> |      buffer      | dst_q_end
 *                      +----------+            +------------------+
 */

pq_gen_neon:
	/* Reject arguments the loops below cannot handle safely. */
	cmp	w_vects, #3
	blt	.Lerror			/* too few vectors (signed compare) */
	tbnz	w_len, #31, .Lerror	/* negative length */
	tst	w_len, #0xF
	bne	.Lerror			/* not a whole number of 16-byte blocks */

	sub	x_src_ptr_end, x_src, #8

	/* the 32-bit write zero-extends, so x_vects is valid from here on */
	sub	w_vects, w_vects, #3
	add	x_src, x_src, x_vects, lsl #3

	ldr	x_src_last, [x_src]
	ldp	x_dst_p, x_dst_q, [x_src, #8]

	/* zero-extend len explicitly instead of reading all of x1 */
	add	x_dst_q_end, x_dst_q, w_len, uxtw

	mov	w_col, #0
	movi	v_gf8poly.16b, #0x1D
	movi	v_0x80.16b, #0x80

.Lloop128_init:
	/* less than 128 bytes? */
	cmp	w_len, #128
	blo	.Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* loop while a full 128-byte block still fits */
	sub	x_dst_q_end, x_dst_q_end, #128

	/* batch process (vects-2)*128 bytes */
	/* v0~v7: p; v8~v15: q; v16~v23: in */
.Lloop128:
	ldr	q0, [x_src_last, #16*0]
	ldr	q1, [x_src_last, #16*1]
	ldr	q2, [x_src_last, #16*2]
	ldr	q3, [x_src_last, #16*3]
	ldr	q4, [x_src_last, #16*4]
	ldr	q5, [x_src_last, #16*5]
	ldr	q6, [x_src_last, #16*6]
	ldr	q7, [x_src_last, #16*7]
	add	x_src_last, x_src_last, #128

	/* p and q both start as the last data buffer */
	mov	v8.16b, v0.16b
	mov	v9.16b, v1.16b
	mov	v10.16b, v2.16b
	mov	v11.16b, v3.16b
	mov	v12.16b, v4.16b
	mov	v13.16b, v5.16b
	mov	v14.16b, v6.16b
	mov	v15.16b, v7.16b

	cbz	w_vects, .Lloop128_vects_end

	sub	x_src_ptr, x_src, #8
.Lloop128_vects:
	ldr	x_srcn, [x_src_ptr], #-8
	add	x_srcn, x_srcn, x_col
	/* flags stay live until the bne below; NEON ops do not change them */
	cmp	x_src_ptr, x_src_ptr_end

	ldr	q16, [x_srcn, #16*0]
	ldr	q17, [x_srcn, #16*1]
	ldr	q18, [x_srcn, #16*2]
	ldr	q19, [x_srcn, #16*3]
	ldr	q20, [x_srcn, #16*4]
	ldr	q21, [x_srcn, #16*5]
	ldr	q22, [x_srcn, #16*6]
	ldr	q23, [x_srcn, #16*7]

	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v6.16b, v6.16b, v22.16b
	eor	v7.16b, v7.16b, v23.16b

	gf8_mul2_xor4	v8, v9, v10, v11, v16, v17, v18, v19
	gf8_mul2_xor4	v12, v13, v14, v15, v20, v21, v22, v23

	bne	.Lloop128_vects

.Lloop128_vects_end:
	str	q0, [x_dst_p, #16*0]
	str	q1, [x_dst_p, #16*1]
	str	q2, [x_dst_p, #16*2]
	str	q3, [x_dst_p, #16*3]
	str	q4, [x_dst_p, #16*4]
	str	q5, [x_dst_p, #16*5]
	str	q6, [x_dst_p, #16*6]
	str	q7, [x_dst_p, #16*7]

	str	q8, [x_dst_q, #16*0]
	str	q9, [x_dst_q, #16*1]
	str	q10, [x_dst_q, #16*2]
	str	q11, [x_dst_q, #16*3]
	str	q12, [x_dst_q, #16*4]
	str	q13, [x_dst_q, #16*5]
	str	q14, [x_dst_q, #16*6]
	str	q15, [x_dst_q, #16*7]

	add	x_dst_p, x_dst_p, #128
	add	x_dst_q, x_dst_q, #128
	cmp	x_dst_q, x_dst_q_end
	add	w_col, w_col, #128
	bls	.Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	add	x_dst_q_end, x_dst_q_end, #128

.Lloop16_init:
	/* nothing left once len is a multiple of 128 (this includes len == 0) */
	tst	w_len, #0x7F
	beq	.Lloop16_end
	sub	x_dst_q_end, x_dst_q_end, #16

	/* batch process (vects-2)*16 bytes */
	/* v0: p; v1: q; v2: in; v3: mask */
.Lloop16:
	ldr	q0, [x_src_last], #16
	mov	v1.16b, v0.16b

	cbz	w_vects, .Lloop16_vects_end

	sub	x_src_ptr, x_src, #8
.Lloop16_vects:
	ldr	x_srcn, [x_src_ptr], #-8
	ldr	q2, [x_srcn, x_col]
	cmp	x_src_ptr, x_src_ptr_end

	eor	v0.16b, v0.16b, v2.16b

	cmhs	v3.16b, v1.16b, v_0x80.16b
	and	v3.16b, v3.16b, v_gf8poly.16b

	shl	v1.16b, v1.16b, #1
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v3.16b

	bne	.Lloop16_vects

.Lloop16_vects_end:
	str	q0, [x_dst_p], #16
	str	q1, [x_dst_q], #16
	cmp	x_dst_q, x_dst_q_end
	add	w_col, w_col, #16
	bls	.Lloop16

.Lloop16_end:
	mov	w_ret, #0
	ret

.Lerror:
	mov	w_ret, #1
	ret
/* Copyright (c) 2018, Arm Limited. */

/*
 * int xor_check_neon(int vects, int len, void **src)
 *
 * Checks that the byte-wise XOR of the buffers src[0] .. src[vects-1] is
 * zero over their first `len` bytes.  The length is handled in 256-byte
 * blocks, then 16-byte blocks, then single bytes.
 *
 * In:    w0 = vects (contract: >= 2), w1 = len, x2 = src
 * Out:   w0 = 0 when every XOR-ed byte is zero,
 *             1 when a non-zero byte is found, when vects < 1 or when
 *               len is negative
 * Clobb: x1, x3-x7, x9-x11, v0-v7, v16-v31, flags.
 *        d8-d15 are saved and restored around the 256-byte loop.
 *
 * Only the low 32 bits of x0/x1 are trusted: AAPCS64 leaves the upper half
 * of a register carrying an `int` argument unspecified.
 */

.text

.global xor_check_neon

/* arguments */
w_vects		.req	w0	/* MUST >= 2 */
w_len		.req	w1
x_src		.req	x2

/* returns */
w_ret		.req	w0

/* local variables */
w_in		.req	w1	/* share w_len */
x_src0		.req	x3
x_src0_end	.req	x4
w_len256	.req	w5	/* share w_len16 */
w_len16		.req	w5
w_col		.req	w6
x_col		.req	x6
x_src_ptr	.req	x7
x_srcn		.req	x9
x_src_ptr_end	.req	x10
w_xor		.req	w11
/* v0  ~ v15: temporary results */
/* v16 ~ v31: next 256 bytes */

/*
 *                      +----------+            +------------------+
 *              src --> |  src[0]  | - src0 ->  |      buffer      | src0_end
 *              --------+----------+            +------------------+
 *                .     |  ......  |
 *                .     +----------+            +------------------+
 *          src_ptr ~~> |  src[n]  | - srcn ~>  |      buffer      |
 *                .     +----------+            +------------------+
 *                .     |  ......  |
 *                .     +----------+
 *                .     | src[v-1] |
 *              --------+----------+
 *      src_ptr_end -->
 */

xor_check_neon:
	/* Reject arguments that would send the loops past the tables. */
	cmp	w_vects, #1
	blt	.Lerror			/* no vectors at all (signed compare) */
	tbnz	w_len, #31, .Lerror	/* negative length */

	/* zero-extend both int arguments explicitly */
	add	x_src_ptr_end, x_src, w_vects, uxtw #3
	ldr	x_src0, [x_src]
	add	x_src0_end, x_src0, w_len, uxtw

	/* w_vects = number of buffers besides src[0] */
	sub	w_vects, w_vects, #1
	mov	w_col, #0
	mov	w_xor, #0

.Lloop256_init:
	/* len256 = len - len%256; len %= 256 */
	mov	w_len256, w_len
	and	w_len, w_len, #0xFF
	sub	w_len256, w_len256, w_len

	/* less than 256 bytes? */
	cbz	w_len256, .Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* loop while a full 256-byte block still fits */
	sub	x_src0_end, x_src0_end, #256

	/* batch process vects*256 bytes */
.Lloop256:
	ldr	q0, [x_src0, #16*0]
	ldr	q1, [x_src0, #16*1]
	ldr	q2, [x_src0, #16*2]
	ldr	q3, [x_src0, #16*3]
	ldr	q4, [x_src0, #16*4]
	ldr	q5, [x_src0, #16*5]
	ldr	q6, [x_src0, #16*6]
	ldr	q7, [x_src0, #16*7]
	ldr	q8, [x_src0, #16*8]
	ldr	q9, [x_src0, #16*9]
	ldr	q10, [x_src0, #16*10]
	ldr	q11, [x_src0, #16*11]
	ldr	q12, [x_src0, #16*12]
	ldr	q13, [x_src0, #16*13]
	ldr	q14, [x_src0, #16*14]
	ldr	q15, [x_src0, #16*15]
	add	x_src0, x_src0, #256

	cbz	w_vects, .Lloop256_vects_end

	add	x_src_ptr, x_src, #8
.Lloop256_vects:
	ldr	x_srcn, [x_src_ptr], #8
	add	x_srcn, x_srcn, x_col
	/* flags stay live until the bne below; NEON ops do not change them */
	cmp	x_src_ptr, x_src_ptr_end

	ldr	q16, [x_srcn, #16*0]
	ldr	q17, [x_srcn, #16*1]
	ldr	q18, [x_srcn, #16*2]
	ldr	q19, [x_srcn, #16*3]
	ldr	q20, [x_srcn, #16*4]
	ldr	q21, [x_srcn, #16*5]
	ldr	q22, [x_srcn, #16*6]
	ldr	q23, [x_srcn, #16*7]
	ldr	q24, [x_srcn, #16*8]
	ldr	q25, [x_srcn, #16*9]
	ldr	q26, [x_srcn, #16*10]
	ldr	q27, [x_srcn, #16*11]
	ldr	q28, [x_srcn, #16*12]
	ldr	q29, [x_srcn, #16*13]
	ldr	q30, [x_srcn, #16*14]
	ldr	q31, [x_srcn, #16*15]

	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v6.16b, v6.16b, v22.16b
	eor	v7.16b, v7.16b, v23.16b
	eor	v8.16b, v8.16b, v24.16b
	eor	v9.16b, v9.16b, v25.16b
	eor	v10.16b, v10.16b, v26.16b
	eor	v11.16b, v11.16b, v27.16b
	eor	v12.16b, v12.16b, v28.16b
	eor	v13.16b, v13.16b, v29.16b
	eor	v14.16b, v14.16b, v30.16b
	eor	v15.16b, v15.16b, v31.16b

	bne	.Lloop256_vects

.Lloop256_vects_end:
	/* OR all sixteen result vectors together into v0 */
	orr	v0.16b, v0.16b, v1.16b
	orr	v2.16b, v2.16b, v3.16b
	orr	v4.16b, v4.16b, v5.16b
	orr	v6.16b, v6.16b, v7.16b
	orr	v8.16b, v8.16b, v9.16b
	orr	v10.16b, v10.16b, v11.16b
	orr	v12.16b, v12.16b, v13.16b
	orr	v14.16b, v14.16b, v15.16b
	orr	v0.16b, v0.16b, v2.16b
	orr	v4.16b, v4.16b, v6.16b
	orr	v8.16b, v8.16b, v10.16b
	orr	v12.16b, v12.16b, v14.16b
	orr	v0.16b, v0.16b, v4.16b
	orr	v8.16b, v8.16b, v12.16b
	orr	v0.16b, v0.16b, v8.16b
	/* any non-zero byte makes the maximum non-zero */
	umaxv	b0, v0.16b
	umov	w_xor, v0.b[0]
	cbnz	w_xor, .Lloop256_end

	cmp	x_src0, x_src0_end
	add	w_col, w_col, #256
	bls	.Lloop256

.Lloop256_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* w_xor != 0 here means the loop above stopped on a mismatch */
	cbnz	w_xor, .Lerror

	add	x_src0_end, x_src0_end, #256

.Lloop16_init:
	/* len16 = len - len%16; len %= 16 */
	mov	w_len16, w_len
	and	w_len, w_len, #0xF
	sub	w_len16, w_len16, w_len

	/* less than 16 bytes? */
	cbz	w_len16, .Lloop1_init

	sub	x_src0_end, x_src0_end, #16

	/* batch process vects*16 bytes */
.Lloop16:
	ldr	q0, [x_src0], #16
	cbz	w_vects, .Lloop16_vects_end

	add	x_src_ptr, x_src, #8
.Lloop16_vects:
	ldr	x_srcn, [x_src_ptr], #8
	cmp	x_src_ptr, x_src_ptr_end
	ldr	q1, [x_srcn, x_col]
	eor	v0.16b, v0.16b, v1.16b
	bne	.Lloop16_vects

.Lloop16_vects_end:
	umaxv	b0, v0.16b
	umov	w_xor, v0.b[0]
	cbnz	w_xor, .Lerror
	cmp	x_src0, x_src0_end
	add	w_col, w_col, #16
	bls	.Lloop16

.Lloop16_end:
	add	x_src0_end, x_src0_end, #16

.Lloop1_init:
	cbnz	w_len, .Lloop1
	mov	w_ret, #0
	ret

	/* batch process vects*1 bytes */
	/* w_len is dead from here on; its register is reused as w_in */
.Lloop1:
	ldrb	w_xor, [x_src0], #1
	cbz	w_vects, .Lloop1_vects_end

	add	x_src_ptr, x_src, #8
.Lloop1_vects:
	ldr	x_srcn, [x_src_ptr], #8
	cmp	x_src_ptr, x_src_ptr_end
	ldrb	w_in, [x_srcn, x_col]
	eor	w_xor, w_xor, w_in
	bne	.Lloop1_vects

.Lloop1_vects_end:
	cbnz	w_xor, .Lerror
	cmp	x_src0, x_src0_end
	add	w_col, w_col, #1
	bne	.Lloop1

.Lloop1_end:
	mov	w_ret, #0
	ret

.Lerror:
	mov	w_ret, #1
	ret
/* Copyright (c) 2018, Arm Limited. */

/*
 * int xor_gen_neon(int vects, int len, void **src)
 *
 * Writes the byte-wise XOR of the buffers src[0] .. src[vects-2] into the
 * buffer src[vects-1], over `len` bytes.  The length is handled in
 * 256-byte blocks, then 16-byte blocks, then single bytes.
 *
 * In:    w0 = vects (>= 2), w1 = len, x2 = src
 * Out:   w0 = 0 on success,
 *             1 when vects < 2 or len is negative (nothing is written)
 * Clobb: x1, x3-x7, x9-x11, v0-v7, v16-v31, flags.
 *        d8-d15 are saved and restored around the 256-byte loop.
 *
 * Only the low 32 bits of x0/x1 are trusted: AAPCS64 leaves the upper half
 * of a register carrying an `int` argument unspecified.
 */

.text

.global xor_gen_neon

/* arguments */
w_vects		.req	w0	/* MUST >= 2 */
w_len		.req	w1
x_src		.req	x2

/* returns */
w_ret		.req	w0

/* local variables */
w_in		.req	w1	/* share w_len */
x_src0		.req	x3
x_src0_end	.req	x4
w_len256	.req	w5	/* share w_len16, w_xor */
w_len16		.req	w5
w_xor		.req	w5
w_col		.req	w6
x_col		.req	x6
x_src_ptr	.req	x7
x_srcn		.req	x9
x_dst		.req	x10
x_dst_ptr	.req	x11
/* v0  ~ v15: temporary results */
/* v16 ~ v31: next 256 bytes */

/*
 *                      +----------+            +------------------+
 *              src --> |  src[0]  | - src0 ->  |      buffer      | src0_end
 *              --------+----------+            +------------------+
 *                .     |  ......  |
 *                .     +----------+            +------------------+
 *          src_ptr ~~> |  src[n]  | - srcn ~>  |      buffer      |
 *                .     +----------+            +------------------+
 *                .     |  ......  |
 *                .     +----------+
 *                .     | src[v-2] |
 *              --------+----------+            +------------------+
 *          dst_ptr --> | src[v-1] | -- dst --> |      buffer      |
 *                      +----------+            +------------------+
 */

xor_gen_neon:
	/* Reject arguments that would send the loops past the tables. */
	cmp	w_vects, #2
	blt	.Lerror			/* need a source and a destination */
	tbnz	w_len, #31, .Lerror	/* negative length */

	/* zero-extend both int arguments explicitly */
	add	x_dst_ptr, x_src, w_vects, uxtw #3
	ldr	x_dst, [x_dst_ptr, #-8]!	/* x_dst_ptr = &src[v-1] */
	ldr	x_src0, [x_src]
	add	x_src0_end, x_src0, w_len, uxtw

	/* w_vects = number of source buffers besides src[0] */
	sub	w_vects, w_vects, #2
	mov	w_col, #0

.Lloop256_init:
	/* len256 = len - len%256; len %= 256 */
	mov	w_len256, w_len
	and	w_len, w_len, #0xFF
	sub	w_len256, w_len256, w_len

	/* less than 256 bytes? */
	cbz	w_len256, .Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* loop while a full 256-byte block still fits */
	sub	x_src0_end, x_src0_end, #256

	/* batch process (vects-1)*256 bytes */
.Lloop256:
	ldr	q0, [x_src0, #16*0]
	ldr	q1, [x_src0, #16*1]
	ldr	q2, [x_src0, #16*2]
	ldr	q3, [x_src0, #16*3]
	ldr	q4, [x_src0, #16*4]
	ldr	q5, [x_src0, #16*5]
	ldr	q6, [x_src0, #16*6]
	ldr	q7, [x_src0, #16*7]
	ldr	q8, [x_src0, #16*8]
	ldr	q9, [x_src0, #16*9]
	ldr	q10, [x_src0, #16*10]
	ldr	q11, [x_src0, #16*11]
	ldr	q12, [x_src0, #16*12]
	ldr	q13, [x_src0, #16*13]
	ldr	q14, [x_src0, #16*14]
	ldr	q15, [x_src0, #16*15]
	add	x_src0, x_src0, #256

	cbz	w_vects, .Lloop256_vects_end

	add	x_src_ptr, x_src, #8
.Lloop256_vects:
	ldr	x_srcn, [x_src_ptr], #8
	add	x_srcn, x_srcn, x_col
	/* flags stay live until the bne below; NEON ops do not change them */
	cmp	x_src_ptr, x_dst_ptr

	ldr	q16, [x_srcn, #16*0]
	ldr	q17, [x_srcn, #16*1]
	ldr	q18, [x_srcn, #16*2]
	ldr	q19, [x_srcn, #16*3]
	ldr	q20, [x_srcn, #16*4]
	ldr	q21, [x_srcn, #16*5]
	ldr	q22, [x_srcn, #16*6]
	ldr	q23, [x_srcn, #16*7]
	ldr	q24, [x_srcn, #16*8]
	ldr	q25, [x_srcn, #16*9]
	ldr	q26, [x_srcn, #16*10]
	ldr	q27, [x_srcn, #16*11]
	ldr	q28, [x_srcn, #16*12]
	ldr	q29, [x_srcn, #16*13]
	ldr	q30, [x_srcn, #16*14]
	ldr	q31, [x_srcn, #16*15]

	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v6.16b, v6.16b, v22.16b
	eor	v7.16b, v7.16b, v23.16b
	eor	v8.16b, v8.16b, v24.16b
	eor	v9.16b, v9.16b, v25.16b
	eor	v10.16b, v10.16b, v26.16b
	eor	v11.16b, v11.16b, v27.16b
	eor	v12.16b, v12.16b, v28.16b
	eor	v13.16b, v13.16b, v29.16b
	eor	v14.16b, v14.16b, v30.16b
	eor	v15.16b, v15.16b, v31.16b

	bne	.Lloop256_vects

.Lloop256_vects_end:
	str	q0, [x_dst, #16*0]
	str	q1, [x_dst, #16*1]
	str	q2, [x_dst, #16*2]
	str	q3, [x_dst, #16*3]
	str	q4, [x_dst, #16*4]
	str	q5, [x_dst, #16*5]
	str	q6, [x_dst, #16*6]
	str	q7, [x_dst, #16*7]
	str	q8, [x_dst, #16*8]
	str	q9, [x_dst, #16*9]
	str	q10, [x_dst, #16*10]
	str	q11, [x_dst, #16*11]
	str	q12, [x_dst, #16*12]
	str	q13, [x_dst, #16*13]
	str	q14, [x_dst, #16*14]
	str	q15, [x_dst, #16*15]

	cmp	x_src0, x_src0_end
	add	x_dst, x_dst, #256
	add	w_col, w_col, #256
	bls	.Lloop256

.Lloop256_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	add	x_src0_end, x_src0_end, #256

.Lloop16_init:
	/* len16 = len - len%16; len %= 16 */
	mov	w_len16, w_len
	and	w_len, w_len, #0xF
	sub	w_len16, w_len16, w_len

	/* less than 16 bytes? */
	cbz	w_len16, .Lloop1_init

	sub	x_src0_end, x_src0_end, #16

	/* batch process (vects-1)*16 bytes */
.Lloop16:
	ldr	q0, [x_src0], #16
	cbz	w_vects, .Lloop16_vects_end

	add	x_src_ptr, x_src, #8
.Lloop16_vects:
	ldr	x_srcn, [x_src_ptr], #8
	cmp	x_src_ptr, x_dst_ptr
	ldr	q1, [x_srcn, x_col]
	eor	v0.16b, v0.16b, v1.16b
	bne	.Lloop16_vects

.Lloop16_vects_end:
	cmp	x_src0, x_src0_end
	str	q0, [x_dst], #16
	add	w_col, w_col, #16
	bls	.Lloop16

.Lloop16_end:
	add	x_src0_end, x_src0_end, #16

.Lloop1_init:
	cbnz	w_len, .Lloop1
	mov	w_ret, #0
	ret

	/* batch process (vects-1)*1 bytes */
	/* w_len and w_len16 are dead from here on; their registers are
	 * reused as w_in and w_xor */
.Lloop1:
	ldrb	w_xor, [x_src0], #1
	cbz	w_vects, .Lloop1_vects_end

	add	x_src_ptr, x_src, #8
.Lloop1_vects:
	ldr	x_srcn, [x_src_ptr], #8
	cmp	x_src_ptr, x_dst_ptr
	ldrb	w_in, [x_srcn, x_col]
	eor	w_xor, w_xor, w_in
	bne	.Lloop1_vects

.Lloop1_vects_end:
	cmp	x_src0, x_src0_end
	strb	w_xor, [x_dst], #1
	add	w_col, w_col, #1
	bne	.Lloop1

.Lloop1_end:
	mov	w_ret, #0
	ret

.Lerror:
	mov	w_ret, #1
	ret