Remove Apple-specific versions of arm64 macros with arguments

The Apple assembler for arm64 can handle the GNU binutils-style
macros just fine, so there is no need to duplicate all of these
macros in two syntaxes when the binutils style works in all cases.

We already require an assembler new enough to support the GNU
binutils-style features, since we use the .rept directive in a few places.
Author: Martin Storsjö
Date:   2015-03-27 10:54:14 +02:00
Parent: cdce1b73ca
Commit: d8202cf38f
8 changed files with 0 additions and 984 deletions
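For context, here is a minimal sketch of the two macro syntaxes, using a hypothetical LOAD_PAIR macro rather than one taken from the diff. Apple's assembler refers to macro arguments positionally as $0, $1, and so on, while GNU binutils-style macros declare named parameters and reference them as \name, with \() separating the parameter name from a following suffix such as .8b. This is the pattern each of the deleted #ifdef __APPLE__ blocks below duplicated:

#ifdef __APPLE__
.macro LOAD_PAIR
// input: $0~$1 (dst registers), $2 (src pointer), $3 (stride)
ld1 {$0.8b}, [$2], $3
ld1 {$1.8b}, [$2], $3
.endm
#else
.macro LOAD_PAIR arg0, arg1, arg2, arg3
// \() ends the parameter name so that .8b is not parsed as part of it
ld1 {\arg0\().8b}, [\arg2], \arg3
ld1 {\arg1\().8b}, [\arg2], \arg3
.endm
#endif

// Invocation is identical in both cases:
// LOAD_PAIR v0, v1, x0, x1

Since the invocation syntax does not change and the Apple assembler accepts the GNU form, only the #else branch needs to survive, which is what this commit does in each file.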


@@ -33,80 +33,6 @@
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
.macro LOAD_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
ld1 {$0.d}[0], [$4], $5
ld1 {$1.d}[0], [$4], $5
ld1 {$2.d}[0], [$4], $5
ld1 {$3.d}[0], [$4], $5
// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
st1 {$0.d}[0], [$4], $5
st1 {$1.d}[0], [$4], $5
st1 {$2.d}[0], [$4], $5
st1 {$3.d}[0], [$4], $5
// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
ld1 {$0.8b}, [$4], $5
ld1 {$1.8b}, [$4], $5
ld1 {$2.8b}, [$4], $5
ld1 {$3.8b}, [$4], $5
// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
st1 {$0.8b}, [$4], $5
st1 {$1.8b}, [$4], $5
st1 {$2.8b}, [$4], $5
st1 {$3.8b}, [$4], $5
// }
.endm
.macro LOAD16_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
ld1 {$0.2d}, [$4], $5
ld1 {$1.2d}, [$4], $5
ld1 {$2.2d}, [$4], $5
ld1 {$3.2d}, [$4], $5
// }
.endm
.macro STORE16_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
st1 {$0.2d}, [$4], $5
st1 {$1.2d}, [$4], $5
st1 {$2.2d}, [$4], $5
st1 {$3.2d}, [$4], $5
// }
.endm
.macro LOAD16_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
ld1 {$0.16b}, [$4], $5
ld1 {$1.16b}, [$4], $5
ld1 {$2.16b}, [$4], $5
ld1 {$3.16b}, [$4], $5
// }
.endm
.macro STORE16_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
st1 {$0.16b}, [$4], $5
st1 {$1.16b}, [$4], $5
st1 {$2.16b}, [$4], $5
st1 {$3.16b}, [$4], $5
// }
.endm
#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: $0~$3, src*, src_stride
ld1 {\arg0\().d}[0], [\arg4], \arg5
@@ -179,8 +105,6 @@
// }
.endm
#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x8_AArch64_neon


@@ -33,265 +33,6 @@
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
.macro MASK_MATRIX
uabd $6.16b, $1.16b, $2.16b
cmhi $6.16b, $4.16b, $6.16b
uabd $4.16b, $0.16b, $1.16b
cmhi $4.16b, $5.16b, $4.16b
and $6.16b, $6.16b, $4.16b
uabd $4.16b, $3.16b, $2.16b
cmhi $4.16b, $5.16b, $4.16b
and $6.16b, $6.16b, $4.16b
.endm
.macro DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
//v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
urhadd $8.16b, $2.16b, $3.16b
uhadd $8.16b, $0.16b, $8.16b
usubl $9.8h, $8.8b, $1.8b
sqxtn $9.8b, $9.8h
usubl2 $8.8h, $8.16b, $1.16b
sqxtn2 $9.16b, $8.8h
smax $8.16b, $9.16b, $5.16b
//
smin $8.16b, $8.16b, $6.16b
uabd $9.16b, $0.16b, $2.16b
cmhi $9.16b, $4.16b, $9.16b
and $8.16b, $8.16b, $9.16b
and $8.16b, $8.16b, $7.16b
add $8.16b, $1.16b, $8.16b
abs $9.16b, $9.16b
.endm
.macro DIFF_LUMA_LT4_P0_Q0_1
usubl $5.8h, $0.8b, $3.8b
usubl $6.8h, $2.8b, $1.8b
shl $6.8h, $6.8h, #2
add $5.8h, $5.8h, $6.8h
sqrshrn $4.8b, $5.8h, #3
.endm
.macro DIFF_LUMA_LT4_P0_Q0_2
usubl2 $5.8h, $0.16b, $3.16b
usubl2 $6.8h, $2.16b, $1.16b
shl $6.8h, $6.8h, #2
add $5.8h, $5.8h, $6.8h
sqrshrn2 $4.16b, $5.8h, #3
.endm
.macro EXTRACT_DELTA_INTO_TWO_PART
cmge $1.16b, $0.16b, #0
and $1.16b, $0.16b, $1.16b
sub $0.16b, $1.16b, $0.16b
.endm
.macro DIFF_LUMA_EQ4_P2P1P0_1
uaddl $8.8h, $1.8b, $2.8b
uaddl $9.8h, $3.8b, $4.8b
add $9.8h, $9.8h, $8.8h
uaddl $8.8h, $0.8b, $1.8b
shl $8.8h, $8.8h, #1
add $8.8h, $9.8h, $8.8h
rshrn $0.8b, $9.8h, #2
rshrn $7.8b, $8.8h, #3
shl $9.8h, $9.8h, #1
usubl $8.8h, $5.8b, $1.8b
add $9.8h, $8.8h, $9.8h
uaddl $8.8h, $2.8b, $5.8b
uaddw $8.8h, $8.8h, $2.8b
uaddw $8.8h, $8.8h, $3.8b
rshrn $9.8b, $9.8h, #3
rshrn $8.8b, $8.8h, #2
bsl $6.8b, $9.8b, $8.8b
.endm
.macro DIFF_LUMA_EQ4_P2P1P0_2
uaddl2 $8.8h, $1.16b, $2.16b
uaddl2 $9.8h, $3.16b, $4.16b
add $9.8h, $9.8h, $8.8h
uaddl2 $8.8h, $0.16b, $1.16b
shl $8.8h, $8.8h, #1
add $8.8h, $9.8h, $8.8h
rshrn2 $0.16b, $9.8h, #2
rshrn2 $7.16b, $8.8h, #3
shl $9.8h, $9.8h, #1
usubl2 $8.8h, $5.16b, $1.16b
add $9.8h, $8.8h, $9.8h
uaddl2 $8.8h, $2.16b, $5.16b
uaddw2 $8.8h, $8.8h, $2.16b
uaddw2 $8.8h, $8.8h, $3.16b
rshrn2 $9.16b, $9.8h, #3
rshrn2 $8.16b, $8.8h, #2
bsl $6.16b, $9.16b, $8.16b
.endm
.macro DIFF_CHROMA_EQ4_P0Q0_1
uaddl $4.8h, $0.8b, $3.8b
shl $4.8h, $4.8h, #1
usubl $5.8h, $1.8b, $3.8b
add $5.8h, $5.8h, $4.8h
rshrn $6.8b, $5.8h, #2
usubl $5.8h, $2.8b, $0.8b
add $5.8h, $5.8h, $4.8h
rshrn $7.8b, $5.8h, #2
.endm
.macro DIFF_CHROMA_EQ4_P0Q0_2
uaddl2 $4.8h, $0.16b, $3.16b
shl $4.8h, $4.8h, #1
usubl2 $5.8h, $1.16b, $3.16b
add $5.8h, $5.8h, $4.8h
rshrn2 $6.16b, $5.8h, #2
usubl2 $5.8h, $2.16b, $0.16b
add $5.8h, $5.8h, $4.8h
rshrn2 $7.16b, $5.8h, #2
.endm
.macro DIFF_LUMA_EQ4_MASK
mov $3.16b, $2.16b
bsl $3.16b, $0.16b, $1.16b
.endm
.macro LOAD_LUMA_DATA_3
ld3 {$0.b, $1.b, $2.b} [$6], [x2], x1
ld3 {$3.b, $4.b, $5.b} [$6], [x0], x1
.endm
.macro LOAD_LUMA_DATA_4
ld4 {$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
ld4 {$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
.endm
.macro STORE_LUMA_DATA_4
st4 {$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
st4 {$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
.endm
.macro STORE_LUMA_DATA_3
st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
st3 {$3.b, $4.b, $5.b} [$6], [x0], x1
.endm
.macro LOAD_CHROMA_DATA_4
ld4 {$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
.endm
.macro STORE_CHROMA_DATA_2
st2 {$0.b, $1.b} [$3], [$2], x2
.endm
.macro ZERO_JUMP_END
mov $1, $0.d[0]
mov $2, $0.d[1]
orr $1, $1, $2
cbz $1, $3
.endm
.macro BS_NZC_CHECK
ld1 {v0.16b}, [$0]
//Arrange the input data --- TOP
ands x6, $1, #2
cbz x6, bs_nzc_check_jump0
sub x6, $0, $2, lsl #4
sub x6, x6, $2, lsl #3
add x6, x6, #12
ld1 {v1.s} [3], [x6]
bs_nzc_check_jump0:
ext v1.16b, v1.16b, v0.16b, #12
add $3.16b, v0.16b, v1.16b
// Arrange the input data --- LEFT
ands x6, $1, #1
cbz x6, bs_nzc_check_jump1
sub x6, $0, #21
add x7, x6, #4
ld1 {v1.b} [12], [x6]
add x6, x7, #4
ld1 {v1.b} [13], [x7]
add x7, x6, #4
ld1 {v1.b} [14], [x6]
ld1 {v1.b} [15], [x7]
bs_nzc_check_jump1:
ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b
ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b
ext v1.16b, v1.16b, v0.16b, #12
add $4.16b, v0.16b, v1.16b
.endm
.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5
mov w6, #4
sabd v20.8h, $0.8h, $1.8h
sabd v21.8h, $1.8h, $2.8h
dup $0.8h, w6
sabd v22.8h, $2.8h, $3.8h
sabd v23.8h, $3.8h, $4.8h
cmge v20.8h, v20.8h, $0.8h
cmge v21.8h, v21.8h, $0.8h
cmge v22.8h, v22.8h, $0.8h
cmge v23.8h, v23.8h, $0.8h
addp v20.8h, v20.8h, v21.8h
addp v21.8h, v22.8h, v23.8h
addhn $5.8b, v20.8h, v20.8h
addhn2 $5.16b, v21.8h, v21.8h
.endm
.macro BS_MV_CHECK
ldp q0, q1, [$0], #32
ldp q2, q3, [$0]
sub $0, $0, #32
// Arrange the input data --- TOP
ands x6, $1, #2
cbz x6, bs_mv_check_jump0
sub x6, $0, $2, lsl #6
add x6, x6, #48
ld1 {v4.16b}, [x6]
bs_mv_check_jump0:
BS_COMPARE_MV v4, v0, v1, v2, v3, $3
// Arrange the input data --- LEFT
ands x6, $1, #1
cbz x6, bs_mv_check_jump1
sub x6, $0, #52
add x7, x6, #16
ld1 {v4.s} [0], [x6]
add x6, x7, #16
ld1 {v4.s} [1], [x7]
add x7, x6, #16
ld1 {v4.s} [2], [x6]
ld1 {v4.s} [3], [x7]
bs_mv_check_jump1:
zip1 $5.4s, v0.4s, v2.4s
zip2 $6.4s, v0.4s, v2.4s
zip1 v0.4s, v1.4s, v3.4s
zip2 v2.4s, v1.4s, v3.4s
zip2 v1.4s, $5.4s, v0.4s
zip1 v0.4s, $5.4s, v0.4s
zip2 v3.4s, $6.4s, v2.4s
zip1 v2.4s, $6.4s, v2.4s
BS_COMPARE_MV v4, v0, v1, v2, v3, $4
.endm
#else
.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
uabd \arg6\().16b, \arg1\().16b, \arg2\().16b
@@ -549,7 +290,6 @@ bs_mv_check_jump1:
zip1 v2.4s, \arg6\().4s, v2.4s
BS_COMPARE_MV v4, v0, v1, v2, v3, \arg4
.endm
#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
mov w1, #1


@@ -35,181 +35,6 @@
.align 4
filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
#ifdef __APPLE__
.macro FILTER_6TAG_8BITS1
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun $6.8b, v18.8h, #5
// }
.endm
.macro FILTER_6TAG_8BITS2
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 $6.16b, v18.8h, #5
// }
.endm
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun $6.8b, v18.8h, #5
uaddl v19.8h, $2.8b, $6.8b
rshrn $6.8b, v19.8h, #1
// }
.endm
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 $6.16b, v18.8h, #5
uaddl2 v19.8h, $2.16b, $6.16b
rshrn2 $6.16b, v19.8h, #1
// }
.endm
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun $6.8b, v18.8h, #5
uaddl v19.8h, $3.8b, $6.8b
rshrn $6.8b, v19.8h, #1
// }
.endm
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 $6.16b, v18.8h, #5
uaddl2 v19.8h, $3.16b, $6.16b
rshrn2 $6.16b, v19.8h, #1
// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS1
// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3]
uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1]
mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2]
mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS2
// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3]
uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1]
mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2]
mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS1
// { // input:a, b, c, dst_d;
sub $0.8h, $0.8h, $1.8h //a-b
sshr $0.8h, $0.8h, #2 //(a-b)/4
sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
sqrshrun $3.8b, $0.8h, #6 //(+32)>>6
// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS2
// { // input:a, b, c, dst_d;
sub $0.8h, $0.8h, $1.8h //a-b
sshr $0.8h, $0.8h, #2 //(a-b)/4
sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6
// }
.endm
.macro UNPACK_2_16BITS_TO_ABC
// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
ext $4.16b, $0.16b, $1.16b, #4 //src[0]
ext $3.16b, $0.16b, $1.16b, #6 //src[1]
add $4.8h, $4.8h, $3.8h //c=src[0]+src[1]
ext $3.16b, $0.16b, $1.16b, #2 //src[-1]
ext $2.16b, $0.16b, $1.16b, #8 //src[2]
add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2]
ext $2.16b, $0.16b, $1.16b, #10 //src[3]
add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3]
// }
.endm
.macro AVERAGE_TWO_8BITS1
// { // input:dst_d, src_d A and B; working: v5
uaddl v30.8h, $2.8b, $1.8b
rshrn $0.8b, v30.8h, #1
// }
.endm
.macro AVERAGE_TWO_8BITS2
// { // input:dst_d, src_d A and B; working: v5
uaddl2 v30.8h, $2.16b, $1.16b
rshrn2 $0.16b, v30.8h, #1
// }
.endm
.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
// { // input: src_d{Y[0][1][2][3][4][5]X},
rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O
uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]*
mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32]
addv $3, $2.4h
sqrshrun $0.8b, $0.8h, #5
// }
.endm
.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
ext $3.16b, $1.16b, $1.16b, #14 // X[0][1][2][3][4][5]O
ext $4.16b, $3.16b, $3.16b, #8 // [3][4][5]OX[0][1][2]
rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]*
smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
saddlv $5, $3.4s
//sshr $0.2d, $0.2d, #4
sqrshrun $0.2s, $0.2d, #10
uqxtn $0.4h, $0.4s
uqxtn $0.8b, $0.8h
// }
.endm
#else
.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
@@ -382,7 +207,6 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
uqxtn \arg0\().8b, \arg0\().8h
// }
.endm
#endif
//(const uint8_t* pSrc {x0}, int32_t iSrcStride{x1}, uint8_t* pDst{x2}, int32_t iDstStride{x3}, int32_t iHeight{x4})
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon


@@ -32,40 +32,7 @@
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
.macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
sshr $8.4h, $1.4h, #1
sshr $9.4h, $3.4h, #1
ssubl $6.4s, $8.4h, $3.4h //int32 e[i][2] = (src[1]>>1)-src[3];
saddl $7.4s, $1.4h, $9.4h //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
add $0.4s, $4.4s, $7.4s //int16 f[i][0] = e[i][0] + e[i][3];
add $1.4s, $5.4s, $6.4s //int16 f[i][1] = e[i][1] + e[i][2];
sub $2.4s, $5.4s, $6.4s //int16 f[i][2] = e[i][1] - e[i][2];
sub $3.4s, $4.4s, $7.4s //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
sshr $6.4s, $1.4s, #1
sshr $7.4s, $3.4s, #1
sub $6.4s, $6.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
add $7.4s, $1.4s, $7.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#else
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8\() \arg9\()
@@ -98,7 +65,6 @@
add \arg7\().4s, \arg1\().4s, \arg7\().4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#endif
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon


@@ -93,93 +93,6 @@
trn2 v17.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
.endm
#ifdef __APPLE__
.macro SELECT_BEST_COST
cmp w1, $0
csel $0, $0, w1, $2
cset w7, $1
cmp w2, $0
mov w6, #2
csel $0, $0, w2, $2
csel w7, w7, w6, $2
.endm
.macro SELECT_BEST_COST_PREFER_HIGHER arg0
SELECT_BEST_COST \arg0, ls, hi
.endm
.macro SELECT_BEST_COST_PREFER_LOWER arg0
SELECT_BEST_COST \arg0, lo, hs
.endm
.macro LOAD_CHROMA_DATA
sub x9, $0, x1
ld1 {$1}, [x9] //top_cb
sub x9, $0, #1
ld1 {$2}[8], [x9], x1
ld1 {$2}[9], [x9], x1
ld1 {$2}[10], [x9], x1
ld1 {$2}[11], [x9], x1
ld1 {$2}[12], [x9], x1
ld1 {$2}[13], [x9], x1
ld1 {$2}[14], [x9], x1
ld1 {$2}[15], [x9], x1 //left_cb
.endm
.macro LOAD_8X4_DATA
//Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
ld1 {v0.8b}, [$0], x3
ld1 {v1.8b}, [$0], x3
ld1 {v0.d}[1], [$0], x3
ld1 {v1.d}[1], [$0], x3
trn1 v2.4s, v0.4s, v1.4s
trn2 v1.4s, v0.4s, v1.4s
trn1 v20.2d, v2.2d, v1.2d
trn2 v21.2d, v2.2d, v1.2d
.endm
.macro HDM_TRANSFORM_4X4_L0
//Do the vertical transform
uadd$9 v0.8h, $0, $1
usub$9 v1.8h, $0, $1
trn1 v3.2d, v0.2d, v1.2d
trn2 v1.2d, v0.2d, v1.2d
add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
//Do the horizontal transform
trn1 v0.4s, v4.4s, v5.4s
trn2 v1.4s, v4.4s, v5.4s
add v4.8h, v0.8h, v1.8h
sub v5.8h, v0.8h, v1.8h
trn1 v0.8h, v4.8h, v5.8h
trn2 v1.8h, v4.8h, v5.8h
add v4.8h, v0.8h, v1.8h
sub v5.8h, v0.8h, v1.8h
//16x16_v
trn1 v0.2s, v4.2s, v5.2s
trn2 v1.2s, v4.2s, v5.2s
sabal $5, v0.4h, $2
sabal $5, v1.4h, $8.4h
sabal2 $5, v4.8h, $8.8h
sabal2 $5, v5.8h, $8.8h
//16x16_h
ins v3.d[0], v4.d[1]
trn1 v0.4h, v4.4h, v3.4h
trn2 v1.4h, v4.4h, v3.4h
sabal $6, v0.4h, $3
sabdl v4.4s, v1.4h, $8.4h
sabal v4.4s, v5.4h, $8.4h
sabal2 v4.4s, v5.8h, $8.8h
add $6, $6, v4.4s
//16x16_dc_both
sabal $7, v0.4h, $4
add $7, $7, v4.4s
.endm
#else
.macro SELECT_BEST_COST arg0, arg1, arg2
cmp w1, \arg0
csel \arg0, \arg0, w1, \arg2
@@ -265,7 +178,6 @@
sabal \arg7, v0.4h, \arg4
add \arg7, \arg7, v4.4s
.endm
#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
ldr x11, [sp, #0]


@@ -68,89 +68,6 @@
ld1 {v7.16b}, [x0], x1
.endm
#ifdef __APPLE__
.macro LOAD_8X8_2
ld1 {v16.8b}, [$0], x3
ld1 {v17.8b}, [$0], x3
ld1 {v18.8b}, [$0], x3
ld1 {v19.8b}, [$0], x3
ld1 {v20.8b}, [$0], x3
ld1 {v21.8b}, [$0], x3
ld1 {v22.8b}, [$0], x3
ld1 {v23.8b}, [$0], x3
.endm
.macro CALC_ABS_8X8_1
uab$1l $0, v0.8b, v16.8b
uabal $0, v1.8b, v17.8b
uabal $0, v2.8b, v18.8b
uabal $0, v3.8b, v19.8b
uabal $0, v4.8b, v20.8b
uabal $0, v5.8b, v21.8b
uabal $0, v6.8b, v22.8b
uabal $0, v7.8b, v23.8b
.endm
.macro CALC_ABS_8X8_2
uab$0l v29.8h, v0.8b, v18.8b
uabal v29.8h, v1.8b, v19.8b
uabal v29.8h, v2.8b, v20.8b
uabal v29.8h, v3.8b, v21.8b
uabal v29.8h, v4.8b, v22.8b
uabal v29.8h, v5.8b, v23.8b
uabal v29.8h, v6.8b, v24.8b
uabal v29.8h, v7.8b, v25.8b
.endm
.macro LOAD_16X8_2
ld1 {v16.16b}, [$0], x3
ld1 {v17.16b}, [$0], x3
ld1 {v18.16b}, [$0], x3
ld1 {v19.16b}, [$0], x3
ld1 {v20.16b}, [$0], x3
ld1 {v21.16b}, [$0], x3
ld1 {v22.16b}, [$0], x3
ld1 {v23.16b}, [$0], x3
.endm
.macro CALC_ABS_16X8_1
uab$1l $0, v0.8b, v16.8b
uabal2 $0, v0.16b,v16.16b
uabal $0, v1.8b, v17.8b
uabal2 $0, v1.16b,v17.16b
uabal $0, v2.8b, v18.8b
uabal2 $0, v2.16b,v18.16b
uabal $0, v3.8b, v19.8b
uabal2 $0, v3.16b,v19.16b
uabal $0, v4.8b, v20.8b
uabal2 $0, v4.16b,v20.16b
uabal $0, v5.8b, v21.8b
uabal2 $0, v5.16b,v21.16b
uabal $0, v6.8b, v22.8b
uabal2 $0, v6.16b,v22.16b
uabal $0, v7.8b, v23.8b
uabal2 $0, v7.16b,v23.16b
.endm
.macro CALC_ABS_16X8_2
uab$0l v29.8h, v0.8b, v18.8b
uabal2 v29.8h, v0.16b,v18.16b
uabal v29.8h, v1.8b, v19.8b
uabal2 v29.8h, v1.16b,v19.16b
uabal v29.8h, v2.8b, v20.8b
uabal2 v29.8h, v2.16b,v20.16b
uabal v29.8h, v3.8b, v21.8b
uabal2 v29.8h, v3.16b,v21.16b
uabal v29.8h, v4.8b, v22.8b
uabal2 v29.8h, v4.16b,v22.16b
uabal v29.8h, v5.8b, v23.8b
uabal2 v29.8h, v5.16b,v23.16b
uabal v29.8h, v6.8b, v24.8b
uabal2 v29.8h, v6.16b,v24.16b
uabal v29.8h, v7.8b, v25.8b
uabal2 v29.8h, v7.16b,v25.16b
.endm
#else
.macro LOAD_8X8_2 arg0
ld1 {v16.8b}, [\arg0], x3
ld1 {v17.8b}, [\arg0], x3
@@ -232,7 +149,6 @@
uabal v29.8h, v7.8b, v25.8b
uabal2 v29.8h, v7.16b,v25.16b
.endm
#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
sxtw x1, w1


@@ -33,247 +33,6 @@
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
.macro ZERO_COUNT_IN_2_QUARWORD
// { // input: coef_0 (identical to $3 $4), coef_1 (identical to $5 $6), mask_q
cmeq $0.8h, $0.8h, #0
cmeq $1.8h, $1.8h, #0
uzp1 $0.16b, $0.16b, $1.16b
ushr $0.16b, $0.16b, 7
addv $2, $0.16b
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf
eor $3.16b, $3.16b, $3.16b // init 0 , and keep 0;
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
smull $4.4s, $1.4h, $2.4h
smull2 $5.4s, $1.8h, $2.8h
shrn $1.4h, $4.4s, #16
shrn2 $1.8h, $5.4s, #16
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
shl $3.8h, $3.8h, #1
sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf
eor $3.16b, $3.16b, $3.16b // init 0 , and keep 0;
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
smull $4.4s, $1.4h, $2.4h
smull2 $5.4s, $1.8h, $2.8h
shrn $1.4h, $4.4s, #16
shrn2 $1.8h, $5.4s, #16
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
shl $3.8h, $3.8h, #1
mov $6.16b, $1.16b
sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
smull $4.4s, $1.4h, $2.4h
shrn $1.4h, $4.4s, #16
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
shl $3.8h, $3.8h, #1
sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro SELECT_MAX_IN_ABS_COEF
// { // input: coef_0, coef_1, coef_2, coef_3, max_q (identy to follow two)
umax $0.8h, $0.8h, $1.8h
umaxv $4, $0.8h
umax $2.8h, $2.8h, $3.8h
umaxv $5, $2.8h
// }
.endm
.macro HDM_QUANT_2x2_TOTAL_16BITS
// { // input: src_d[0][16][32][48], dst_d[0][16][32][48], working
sshr $1.2d, $0.2d, #32
add $2.4h, $0.4h, $1.4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
sub $1.4h, $0.4h, $1.4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
zip1 $1.4h, $2.4h, $1.4h
// }
.endm
.macro DC_ZERO_COUNT_IN_DUALWORD
// { // input: coef, dst_d, working_d (all 0x01)
cmeq $0.4h, $0.4h, #0
and $0.8b, $0.8b, $2.8b
addv $1, $0.4h
// }
.endm
.macro IHDM_4x4_TOTAL_16BITS
// { // input: each src_d[0]~[3](dst), working_q0, working_q1
uzp2 $1.4s, $0.4s, $0.4s
uzp1 $0.4s, $0.4s, $0.4s
add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
zip1 $2.8h, $2.8h, $1.8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
uzp2 $1.4s, $2.4s, $2.4s
uzp1 $0.4s, $2.4s, $2.4s
add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
rev32 $1.4h, $1.4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
zip1 $0.4s, $2.4s, $1.4s
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2
// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
uzp1 $2.4s, $0.4s, $1.4s //[0 1 4 5]+[8 9 12 13]
uzp2 $3.4s, $0.4s, $1.4s //[2 3 6 7]+[10 11 14 15]
uzp1 $0.8h, $2.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
uzp2 $2.8h, $2.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4
// { // input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
trn1 $4.8h, v0.8h, v1.8h
trn2 $5.8h, v0.8h, v1.8h
trn1 $6.8h, v2.8h, v3.8h
trn2 $7.8h, v2.8h, v3.8h
trn1 $0.4s, v4.4s, v6.4s
trn2 $2.4s, v4.4s, v6.4s
trn1 $1.4s, v5.4s, v7.4s
trn2 $3.4s, v5.4s, v7.4s
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2
// { // input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
mov $0.d[1], $1.d[0] //[0 1 2 3]+[4 5 6 7]
mov $2.d[1], $3.d[0] //[8 9 10 11]+[12 13 14 15]
uzp1 $1.4s, $0.4s, $2.4s //[0 1 4 5]+[8 9 12 13]
uzp2 $3.4s, $0.4s, $2.4s //[2 3 6 7]+[10 11 14 15]
uzp1 $0.8h, $1.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
uzp2 $2.8h, $1.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
// }
.endm
.macro LOAD_4x4_DATA_FOR_DCT
ld1 {$0.s}[0], [$2], $3
ld1 {$0.s}[1], [$2], $3
ld1 {$0.s}[2], [$2], $3
ld1 {$0.s}[3], [$2]
ld1 {$1.s}[0], [$4], $5
ld1 {$1.s}[1], [$4], $5
ld1 {$1.s}[2], [$4], $5
ld1 {$1.s}[3], [$4]
.endm
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
// { // input: src_d[0]~[3], working: [4]~[7]
add $4.8h, $0.8h, $3.8h //int16 s[0] = data[i] + data[i3];
sub $7.8h, $0.8h, $3.8h //int16 s[3] = data[i] - data[i3];
add $5.8h, $1.8h, $2.8h //int16 s[1] = data[i1] + data[i2];
sub $6.8h, $1.8h, $2.8h //int16 s[2] = data[i1] - data[i2];
add $0.8h, $4.8h, $5.8h //int16 dct[i ] = s[0] + s[1];
sub $2.8h, $4.8h, $5.8h //int16 dct[i2] = s[0] - s[1];
shl $1.8h, $7.8h, #1
shl $3.8h, $6.8h, #1
add $1.8h, $1.8h, $6.8h //int16 dct[i1] = (s[3] << 1) + s[2];
sub $3.8h, $7.8h, $3.8h //int16 dct[i3] = s[3] - (s[2] << 1);
// }
.endm
.macro LOAD_8x4_DATA_FOR_DCT
// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
ld1 {$0.d}[0], [$8], x2
ld1 {$1.d}[0], [$8], x2
ld1 {$2.d}[0], [$8], x2
ld1 {$3.d}[0], [$8], x2
ld1 {$4.d}[0], [$9], x4
ld1 {$5.d}[0], [$9], x4
ld1 {$6.d}[0], [$9], x4
ld1 {$7.d}[0], [$9], x4
// }
.endm
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
// { // input: src_d[0]~[3], output: e_d[0]~[3];
add $4.8h, $0.8h, $2.8h //int16 e[i][0] = src[0] + src[2];
sub $5.8h, $0.8h, $2.8h //int16 e[i][1] = src[0] - src[2];
sshr $6.8h, $1.8h, #1
sshr $7.8h, $3.8h, #1
sub $6.8h, $6.8h, $3.8h //int16 e[i][2] = (src[1]>>1)-src[3];
add $7.8h, $1.8h, $7.8h //int16 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
add $0.8h, $4.8h, $7.8h //int16 f[i][0] = e[i][0] + e[i][3];
add $1.8h, $5.8h, $6.8h //int16 f[i][1] = e[i][1] + e[i][2];
sub $2.8h, $5.8h, $6.8h //int16 f[i][2] = e[i][1] - e[i][2];
sub $3.8h, $4.8h, $7.8h //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro ROW_TRANSFORM_0_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3];
saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
ssubl $6.4s, $1.4h, $3.4h //int32 e[i][2] = src[1] - src[3];
saddl $7.4s, $1.4h, $3.4h //int32 e[i][3] = src[1] + src[3];
// }
.endm
.macro COL_TRANSFORM_0_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
sub $6.4s, $1.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
add $7.4s, $1.4s, $3.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
add $0.4s, $4.4s, $7.4s //int16 f[i][0] = e[i][0] + e[i][3];
add $1.4s, $5.4s, $6.4s //int16 f[i][1] = e[i][1] + e[i][2];
sub $2.4s, $5.4s, $6.4s //int16 f[i][2] = e[i][1] - e[i][2];
sub $3.4s, $4.4s, $7.4s //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
// { // input: pred_d[0](output), dct_q0/1, working_q0/1;
uxtl $3.8h, $0.8b
uxtl2 $4.8h, $0.16b
add $3.8h, $3.8h, $1.8h
add $4.8h, $4.8h, $2.8h
sqxtun $0.8b, $3.8h
sqxtun2 $0.16b,$4.8h
// }
.endm
#else
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2
// { // input: coef_0 (identical to \arg3\() \arg4\()), coef_1 (identical to \arg5\() \arg6\()), mask_q
cmeq \arg0\().8h, \arg0\().8h, #0
@@ -518,7 +277,6 @@
sqxtun2 \arg0\().16b,\arg4\().8h
// }
.endm
#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon
ld1 {v0.8h, v1.8h}, [x0]


@@ -33,29 +33,6 @@
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
.macro ABS_SUB_SUM_16BYTES
ld1 {v0.16b}, [x0], x4
ld1 {v1.16b}, [x1], x4
uabal $0, v0.8b, v1.8b
uabal2 $1, v0.16b,v1.16b
.endm
.macro ABS_SUB_SUM_8x16BYTES
ld1 {v0.16b}, [x0], x4
ld1 {v1.16b}, [x1], x4
uabdl $0, v0.8b, v1.8b
uabdl2 $1, v0.16b,v1.16b
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
.endm
#else
.macro ABS_SUB_SUM_16BYTES arg0, arg1
ld1 {v0.16b}, [x0], x4
ld1 {v1.16b}, [x1], x4
@@ -77,7 +54,6 @@
ABS_SUB_SUM_16BYTES \arg0, \arg1
ABS_SUB_SUM_16BYTES \arg0, \arg1
.endm
#endif
/*
* void vaa_calc_sad_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height, int32_t pic_stride,