Merge pull request #1883 from mstorsjo/arm-asm-cleanup

Remove duplication in arm assembly
zhilwang 2015-03-30 01:02:02 -07:00
commit 7751d756b0
36 changed files with 8 additions and 2164 deletions

View File

@ -36,6 +36,8 @@
#ifdef __APPLE__
.text
.macro WELS_ASM_FUNC_BEGIN
.align 2
.arm

View File

@ -31,46 +31,8 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
.macro LOAD_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
vld1.64 {$0}, [$4,:128], $5
vld1.64 {$1}, [$4,:128], $5
vld1.64 {$2}, [$4,:128], $5
vld1.64 {$3}, [$4,:128], $5
// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
vst1.64 {$0}, [$4,:128], $5
vst1.64 {$1}, [$4,:128], $5
vst1.64 {$2}, [$4,:128], $5
vst1.64 {$3}, [$4,:128], $5
// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
vld1.64 {$0}, [$4], $5
vld1.64 {$1}, [$4], $5
vld1.64 {$2}, [$4], $5
vld1.64 {$3}, [$4], $5
// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
vst1.64 {$0}, [$4], $5
vst1.64 {$1}, [$4], $5
vst1.64 {$2}, [$4], $5
vst1.64 {$3}, [$4], $5
// }
.endm
#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, src*, src_stride
vld1.64 {\arg0}, [\arg4,:128], \arg5
@ -107,8 +69,6 @@
// }
.endm
#endif
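
For reference, what these strided load/store macro pairs amount to per block is a plain stride-aware copy; a scalar C sketch (function and parameter names are illustrative, not from the repository):

#include <stdint.h>
#include <string.h>

/* Scalar equivalent of the strided load/store pairs: copy an 8x8 block,
 * advancing source and destination by their respective strides per row. */
static void CopyBlock8x8_c (uint8_t* pDst, int32_t iDstStride,
                            const uint8_t* pSrc, int32_t iSrcStride) {
  for (int32_t i = 0; i < 8; i++) {
    memcpy (pDst, pSrc, 8);   // one row, like a single vld1.64/vst1.64 pair
    pDst += iDstStride;       // post-increment by stride, as in "[$4], $5"
    pSrc += iSrcStride;
  }
}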
WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon

View File

@ -31,129 +31,9 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
.macro JMP_IF_128BITS_IS_ZERO
vorr.s16 $2, $0, $1
vmov r3, r2, $2
orr r3, r3, r2
cmp r3, #0
.endm
.macro MASK_MATRIX
vabd.u8 $6, $1, $2
vcgt.u8 $6, $4, $6
vabd.u8 $4, $0, $1
vclt.u8 $4, $4, $5
vand.u8 $6, $6, $4
vabd.u8 $4, $3, $2
vclt.u8 $4, $4, $5
vand.u8 $6, $6, $4
.endm
.macro DIFF_LUMA_LT4_P1_Q1
vmov.i8 $9, #128
vrhadd.u8 $8, $2, $3
vhadd.u8 $8, $0, $8
vsub.s8 $8, $8, $9
vsub.s8 $9, $1, $9
vqsub.s8 $8, $8, $9
vmax.s8 $8, $8, $5
vmin.s8 $8, $8, $6
vabd.u8 $9, $0, $2
vclt.u8 $9, $9, $4
vand.s8 $8, $8, $9
vand.s8 $8, $8, $7
vadd.u8 $8, $1, $8
vabs.s8 $9, $9
.endm
.macro DIFF_LUMA_LT4_P0_Q0
vsubl.u8 $5, $0, $3
vsubl.u8 $6, $2, $1
vshl.s16 $6, $6, #2
vadd.s16 $5, $5, $6
vqrshrn.s16 $4, $5, #3
.endm
.macro DIFF_LUMA_EQ4_P2P1P0
vaddl.u8 q4, $1, $2
vaddl.u8 q5, $3, $4
vadd.u16 q5, q4, q5
vaddl.u8 q4, $0, $1
vshl.u16 q4, q4, #1
vadd.u16 q4, q5, q4
vrshrn.u16 $0, q5, #2
vrshrn.u16 $7, q4, #3
vshl.u16 q5, q5, #1
vsubl.u8 q4, $5, $1
vadd.u16 q5, q4,q5
vaddl.u8 q4, $2, $5
vaddw.u8 q4, q4, $2
vaddw.u8 q4, q4, $3
vrshrn.u16 d10,q5, #3
vrshrn.u16 d8, q4, #2
vbsl.u8 $6, d10, d8
.endm
.macro DIFF_LUMA_EQ4_MASK
vmov $3, $2
vbsl.u8 $3, $0, $1
.endm
.macro DIFF_CHROMA_EQ4_P0Q0
vaddl.u8 $4, $0, $3
vaddw.u8 $5, $4, $1
vaddw.u8 $6, $4, $2
vaddw.u8 $5, $5, $0
vaddw.u8 $6, $6, $3
vrshrn.u16 $7, $5, #2
vrshrn.u16 $8, $6, #2
.endm
.macro LOAD_CHROMA_DATA_4
vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
.endm
.macro STORE_CHROMA_DATA_4
vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
.endm
.macro LOAD_LUMA_DATA_3
vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
.endm
.macro STORE_LUMA_DATA_4
vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
.endm
.macro STORE_LUMA_DATA_3
vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
.endm
.macro EXTRACT_DELTA_INTO_TWO_PART
vcge.s8 $1, $0, #0
vand $1, $0, $1
vsub.s8 $0, $1, $0
.endm
#else
.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
vorr.s16 \arg2, \arg0, \arg1
vmov r3, r2, \arg2
@ -270,7 +150,6 @@
vand \arg1, \arg0, \arg1
vsub.s8 \arg0, \arg1, \arg0
.endm
#endif
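
The deblocking macros above encode the usual H.264 edge conditions; as a scalar sketch of what MASK_MATRIX evaluates per pixel position (names illustrative, not from the repository):

#include <stdint.h>
#include <stdlib.h>

/* Scalar form of MASK_MATRIX: a position is filtered only when the edge
 * activity is below the alpha/beta thresholds. */
static int DeblockMask_c (uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                          int iAlpha, int iBeta) {
  return abs (p0 - q0) < iAlpha &&   // vabd.u8 + vcgt.u8 against alpha
         abs (p1 - p0) < iBeta  &&   // vabd.u8 + vclt.u8 against beta
         abs (q1 - q0) < iBeta;
}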
WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
vpush {q4-q7}
@ -842,100 +721,6 @@ WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
vst1.64 {d0,d1,d2}, [r0]
WELS_ASM_FUNC_END
#ifdef __APPLE__
.macro BS_NZC_CHECK
vld1.8 {d0,d1}, [$0]
/* Arrange the input data --- TOP */
ands r6, $1, #2
beq bs_nzc_check_jump0
sub r6, $0, $2, lsl #4
sub r6, r6, $2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
bs_nzc_check_jump0:
vext.8 q1, q1, q0, #12
vadd.u8 $3, q0, q1
/* Arrange the input data --- LEFT */
ands r6, $1, #1
beq bs_nzc_check_jump1
sub r6, $0, #21
add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_nzc_check_jump1:
vzip.8 d0, d1
vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vadd.u8 $4, q0, q1
.endm
.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
mov r6, #4
vabd.s16 q8, $0, $1
vabd.s16 q9, $1, $2
vdup.s16 $0, r6
vabd.s16 q10, $2, $3
vabd.s16 q11, $3, $4
vcge.s16 q8, $0
vcge.s16 q9, $0
vcge.s16 q10, $0
vcge.s16 q11, $0
vpadd.i16 d16, d16, d17
vpadd.i16 d17, d18, d19
vpadd.i16 d18, d20, d21
vpadd.i16 d19, d22, d23
vaddhn.i16 $5, q8, q8
vaddhn.i16 $6, q9, q9
.endm
.macro BS_MV_CHECK
vldm $0, {q0,q1,q2,q3}
/* Arrange the input data --- TOP */
ands r6, $1, #2
beq bs_mv_check_jump0
sub r6, $0, $2, lsl #6
add r6, #48
vld1.8 {d8, d9}, [r6]
bs_mv_check_jump0:
BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
/* Arrange the input data --- LEFT */
ands r6, $1, #1
beq bs_mv_check_jump1
sub r6, $0, #52
add r7, r6, #16
vld1.32 d8[0], [r6]
add r6, r7, #16
vld1.32 d8[1], [r7]
add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
bs_mv_check_jump1:
vzip.32 q0, q2
vzip.32 q1, q3
vzip.32 q0, q1
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
.endm
#else
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
vld1.8 {d0,d1}, [\arg0]
/* Arrange the input data --- TOP */
@ -1028,7 +813,6 @@ bs_mv_check_jump1:
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, \arg5, \arg6
.endm
#endif
WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"

View File

@ -32,7 +32,6 @@
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon

View File

@ -31,120 +31,8 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
.macro AVERAGE_TWO_8BITS
// { // input:dst_d, src_d A and B; working: q13
vaddl.u8 q13, $2, $1
vrshrn.u16 $0, q13, #1
// }
.endm
.macro FILTER_6TAG_8BITS
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
vaddl.u8 q13, $2, $3 //src[0]+src[1]
vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
vaddl.u8 q13, $1, $4 //src[-1]+src[2]
vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
vqrshrun.s16 $6, q12, #5
// }
.endm
.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
vpadd.s16 $0, $0, $0
vpadd.s16 $0, $0, $0
vqrshrun.s16 $0, $4, #5
// }
.endm
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
vaddl.u8 q13, $2, $3 //src[0]+src[1]
vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
vaddl.u8 q13, $1, $4 //src[-1]+src[2]
vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
vqrshrun.s16 $6, q12, #5
vaddl.u8 q13, $2, $6
vrshrn.u16 $6, q13, #1
// }
.endm
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
vaddl.u8 q13, $2, $3 //src[0]+src[1]
vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
vaddl.u8 q13, $1, $4 //src[-1]+src[2]
vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
vqrshrun.s16 $6, q12, #5
vaddl.u8 q13, $3, $6
vrshrn.u16 $6, q13, #1
// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS
// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
vaddl.u8 q13, $2, $3 //src[0]+src[1]
vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
vaddl.u8 q13, $1, $4 //src[-1]+src[2]
vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS
// { // input:a, b, c, dst_d;
vsub.s16 $0, $0, $1 //a-b
vshr.s16 $0, $0, #2 //(a-b)/4
vsub.s16 $0, $0, $1 //(a-b)/4-b
vadd.s16 $0, $0, $2 //(a-b)/4-b+c
vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
vqrshrun.s16 $3, $0, #6 //(+32)>>6
// }
.endm
.macro UNPACK_2_16BITS_TO_ABC
// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
vext.16 $4, $0, $1, #2 //src[0]
vext.16 $3, $0, $1, #3 //src[1]
vadd.s16 $4, $3 //c=src[0]+src[1]
vext.16 $3, $0, $1, #1 //src[-1]
vext.16 $2, $0, $1, #4 //src[2]
vadd.s16 $3, $2 //b=src[-1]+src[2]
vext.16 $2, $0, $1, #5 //src[3]
vadd.s16 $2, $0 //a=src[-2]+src[3]
// }
.endm
.macro UNPACK_1_IN_8x16BITS_TO_8BITS
// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
vrev64.16 $1, $1
vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
vshr.s64 $1, $2, #16
vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
vsub.s16 $0, $0, $1 //a-b
vshr.s16 $0, $0, #2 //(a-b)/4
vsub.s16 $0, $0, $1 //(a-b)/4-b
vadd.s16 $0, $0, $2 //(a-b)/4-b+c
vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
vqrshrun.s16 $0, $3, #6 //(+32)>>6
// }
.endm
#else
.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
// { // input:dst_d, src_d A and B; working: q13
vaddl.u8 q13, \arg2, \arg1
@ -163,7 +51,7 @@
// }
.endm
.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4 // when width=17/9, used
// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
@ -254,7 +142,6 @@
vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
// }
.endm
#endif
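
The motion-compensation macros above all build on the six-tap luma interpolation filter spelled out in their comments; a scalar C sketch of one output sample (function and helper names are illustrative):

#include <stdint.h>

static uint8_t Clip255_c (int32_t v) {
  return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar form of FILTER_6TAG_8BITS: the (1, -5, 20, 20, -5, 1) filter with
 * rounding, exactly as the macro comments describe. pSrc points at src[0]. */
static uint8_t Filter6Tap_c (const uint8_t* pSrc) {
  int32_t iSum = (pSrc[-2] + pSrc[3])
               + 20 * (pSrc[0] + pSrc[1])
               -  5 * (pSrc[-1] + pSrc[2]);
  return Clip255_c ((iSum + 16) >> 5);   // vqrshrun.s16 ..., #5
}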
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
push {r4}

View File

@ -32,6 +32,8 @@
#ifdef __APPLE__
.text
.macro WELS_ASM_AARCH64_FUNC_BEGIN
.align 2
.globl _$0

View File

@ -31,83 +31,8 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
.macro LOAD_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
ld1 {$0.d}[0], [$4], $5
ld1 {$1.d}[0], [$4], $5
ld1 {$2.d}[0], [$4], $5
ld1 {$3.d}[0], [$4], $5
// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
st1 {$0.d}[0], [$4], $5
st1 {$1.d}[0], [$4], $5
st1 {$2.d}[0], [$4], $5
st1 {$3.d}[0], [$4], $5
// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
ld1 {$0.8b}, [$4], $5
ld1 {$1.8b}, [$4], $5
ld1 {$2.8b}, [$4], $5
ld1 {$3.8b}, [$4], $5
// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
st1 {$0.8b}, [$4], $5
st1 {$1.8b}, [$4], $5
st1 {$2.8b}, [$4], $5
st1 {$3.8b}, [$4], $5
// }
.endm
.macro LOAD16_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
ld1 {$0.2d}, [$4], $5
ld1 {$1.2d}, [$4], $5
ld1 {$2.2d}, [$4], $5
ld1 {$3.2d}, [$4], $5
// }
.endm
.macro STORE16_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
st1 {$0.2d}, [$4], $5
st1 {$1.2d}, [$4], $5
st1 {$2.2d}, [$4], $5
st1 {$3.2d}, [$4], $5
// }
.endm
.macro LOAD16_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
ld1 {$0.16b}, [$4], $5
ld1 {$1.16b}, [$4], $5
ld1 {$2.16b}, [$4], $5
ld1 {$3.16b}, [$4], $5
// }
.endm
.macro STORE16_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
st1 {$0.16b}, [$4], $5
st1 {$1.16b}, [$4], $5
st1 {$2.16b}, [$4], $5
st1 {$3.16b}, [$4], $5
// }
.endm
#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: $0~$3, src*, src_stride
ld1 {\arg0\().d}[0], [\arg4], \arg5
@ -180,8 +105,6 @@
// }
.endm
#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x8_AArch64_neon

View File

@ -31,268 +31,8 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
.macro MASK_MATRIX
uabd $6.16b, $1.16b, $2.16b
cmhi $6.16b, $4.16b, $6.16b
uabd $4.16b, $0.16b, $1.16b
cmhi $4.16b, $5.16b, $4.16b
and $6.16b, $6.16b, $4.16b
uabd $4.16b, $3.16b, $2.16b
cmhi $4.16b, $5.16b, $4.16b
and $6.16b, $6.16b, $4.16b
.endm
.macro DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
//v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
urhadd $8.16b, $2.16b, $3.16b
uhadd $8.16b, $0.16b, $8.16b
usubl $9.8h, $8.8b, $1.8b
sqxtn $9.8b, $9.8h
usubl2 $8.8h, $8.16b, $1.16b
sqxtn2 $9.16b, $8.8h
smax $8.16b, $9.16b, $5.16b
//
smin $8.16b, $8.16b, $6.16b
uabd $9.16b, $0.16b, $2.16b
cmhi $9.16b, $4.16b, $9.16b
and $8.16b, $8.16b, $9.16b
and $8.16b, $8.16b, $7.16b
add $8.16b, $1.16b, $8.16b
abs $9.16b, $9.16b
.endm
.macro DIFF_LUMA_LT4_P0_Q0_1
usubl $5.8h, $0.8b, $3.8b
usubl $6.8h, $2.8b, $1.8b
shl $6.8h, $6.8h, #2
add $5.8h, $5.8h, $6.8h
sqrshrn $4.8b, $5.8h, #3
.endm
.macro DIFF_LUMA_LT4_P0_Q0_2
usubl2 $5.8h, $0.16b, $3.16b
usubl2 $6.8h, $2.16b, $1.16b
shl $6.8h, $6.8h, #2
add $5.8h, $5.8h, $6.8h
sqrshrn2 $4.16b, $5.8h, #3
.endm
.macro EXTRACT_DELTA_INTO_TWO_PART
cmge $1.16b, $0.16b, #0
and $1.16b, $0.16b, $1.16b
sub $0.16b, $1.16b, $0.16b
.endm
.macro DIFF_LUMA_EQ4_P2P1P0_1
uaddl $8.8h, $1.8b, $2.8b
uaddl $9.8h, $3.8b, $4.8b
add $9.8h, $9.8h, $8.8h
uaddl $8.8h, $0.8b, $1.8b
shl $8.8h, $8.8h, #1
add $8.8h, $9.8h, $8.8h
rshrn $0.8b, $9.8h, #2
rshrn $7.8b, $8.8h, #3
shl $9.8h, $9.8h, #1
usubl $8.8h, $5.8b, $1.8b
add $9.8h, $8.8h, $9.8h
uaddl $8.8h, $2.8b, $5.8b
uaddw $8.8h, $8.8h, $2.8b
uaddw $8.8h, $8.8h, $3.8b
rshrn $9.8b, $9.8h, #3
rshrn $8.8b, $8.8h, #2
bsl $6.8b, $9.8b, $8.8b
.endm
.macro DIFF_LUMA_EQ4_P2P1P0_2
uaddl2 $8.8h, $1.16b, $2.16b
uaddl2 $9.8h, $3.16b, $4.16b
add $9.8h, $9.8h, $8.8h
uaddl2 $8.8h, $0.16b, $1.16b
shl $8.8h, $8.8h, #1
add $8.8h, $9.8h, $8.8h
rshrn2 $0.16b, $9.8h, #2
rshrn2 $7.16b, $8.8h, #3
shl $9.8h, $9.8h, #1
usubl2 $8.8h, $5.16b, $1.16b
add $9.8h, $8.8h, $9.8h
uaddl2 $8.8h, $2.16b, $5.16b
uaddw2 $8.8h, $8.8h, $2.16b
uaddw2 $8.8h, $8.8h, $3.16b
rshrn2 $9.16b, $9.8h, #3
rshrn2 $8.16b, $8.8h, #2
bsl $6.16b, $9.16b, $8.16b
.endm
.macro DIFF_CHROMA_EQ4_P0Q0_1
uaddl $4.8h, $0.8b, $3.8b
shl $4.8h, $4.8h, #1
usubl $5.8h, $1.8b, $3.8b
add $5.8h, $5.8h, $4.8h
rshrn $6.8b, $5.8h, #2
usubl $5.8h, $2.8b, $0.8b
add $5.8h, $5.8h, $4.8h
rshrn $7.8b, $5.8h, #2
.endm
.macro DIFF_CHROMA_EQ4_P0Q0_2
uaddl2 $4.8h, $0.16b, $3.16b
shl $4.8h, $4.8h, #1
usubl2 $5.8h, $1.16b, $3.16b
add $5.8h, $5.8h, $4.8h
rshrn2 $6.16b, $5.8h, #2
usubl2 $5.8h, $2.16b, $0.16b
add $5.8h, $5.8h, $4.8h
rshrn2 $7.16b, $5.8h, #2
.endm
.macro DIFF_LUMA_EQ4_MASK
mov $3.16b, $2.16b
bsl $3.16b, $0.16b, $1.16b
.endm
.macro LOAD_LUMA_DATA_3
ld3 {$0.b, $1.b, $2.b} [$6], [x2], x1
ld3 {$3.b, $4.b, $5.b} [$6], [x0], x1
.endm
.macro LOAD_LUMA_DATA_4
ld4 {$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
ld4 {$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
.endm
.macro STORE_LUMA_DATA_4
st4 {$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
st4 {$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
.endm
.macro STORE_LUMA_DATA_3
st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
st3 {$3.b, $4.b, $5.b} [$6], [x0], x1
.endm
.macro LOAD_CHROMA_DATA_4
ld4 {$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
.endm
.macro STORE_CHROMA_DATA_2
st2 {$0.b, $1.b} [$3], [$2], x2
.endm
.macro ZERO_JUMP_END
mov $1, $0.d[0]
mov $2, $0.d[1]
orr $1, $1, $2
cbz $1, $3
.endm
.macro BS_NZC_CHECK
ld1 {v0.16b}, [$0]
//Arrange the input data --- TOP
ands x6, $1, #2
cbz x6, bs_nzc_check_jump0
sub x6, $0, $2, lsl #4
sub x6, x6, $2, lsl #3
add x6, x6, #12
ld1 {v1.s} [3], [x6]
bs_nzc_check_jump0:
ext v1.16b, v1.16b, v0.16b, #12
add $3.16b, v0.16b, v1.16b
// Arrange the input data --- LEFT
ands x6, $1, #1
cbz x6, bs_nzc_check_jump1
sub x6, $0, #21
add x7, x6, #4
ld1 {v1.b} [12], [x6]
add x6, x7, #4
ld1 {v1.b} [13], [x7]
add x7, x6, #4
ld1 {v1.b} [14], [x6]
ld1 {v1.b} [15], [x7]
bs_nzc_check_jump1:
ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b
ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b
ext v1.16b, v1.16b, v0.16b, #12
add $4.16b, v0.16b, v1.16b
.endm
.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5
mov w6, #4
sabd v20.8h, $0.8h, $1.8h
sabd v21.8h, $1.8h, $2.8h
dup $0.8h, w6
sabd v22.8h, $2.8h, $3.8h
sabd v23.8h, $3.8h, $4.8h
cmge v20.8h, v20.8h, $0.8h
cmge v21.8h, v21.8h, $0.8h
cmge v22.8h, v22.8h, $0.8h
cmge v23.8h, v23.8h, $0.8h
addp v20.8h, v20.8h, v21.8h
addp v21.8h, v22.8h, v23.8h
addhn $5.8b, v20.8h, v20.8h
addhn2 $5.16b, v21.8h, v21.8h
.endm
.macro BS_MV_CHECK
ldp q0, q1, [$0], #32
ldp q2, q3, [$0]
sub $0, $0, #32
// Arrange the input data --- TOP
ands x6, $1, #2
cbz x6, bs_mv_check_jump0
sub x6, $0, $2, lsl #6
add x6, x6, #48
ld1 {v4.16b}, [x6]
bs_mv_check_jump0:
BS_COMPARE_MV v4, v0, v1, v2, v3, $3
// Arrange the input data --- LEFT
ands x6, $1, #1
cbz x6, bs_mv_check_jump1
sub x6, $0, #52
add x7, x6, #16
ld1 {v4.s} [0], [x6]
add x6, x7, #16
ld1 {v4.s} [1], [x7]
add x7, x6, #16
ld1 {v4.s} [2], [x6]
ld1 {v4.s} [3], [x7]
bs_mv_check_jump1:
zip1 $5.4s, v0.4s, v2.4s
zip2 $6.4s, v0.4s, v2.4s
zip1 v0.4s, v1.4s, v3.4s
zip2 v2.4s, v1.4s, v3.4s
zip2 v1.4s, $5.4s, v0.4s
zip1 v0.4s, $5.4s, v0.4s
zip2 v3.4s, $6.4s, v2.4s
zip1 v2.4s, $6.4s, v2.4s
BS_COMPARE_MV v4, v0, v1, v2, v3, $4
.endm
#else
.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
uabd \arg6\().16b, \arg1\().16b, \arg2\().16b
@ -550,7 +290,6 @@ bs_mv_check_jump1:
zip1 v2.4s, \arg6\().4s, v2.4s
BS_COMPARE_MV v4, v0, v1, v2, v3, \arg4
.endm
#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
mov w1, #1

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
WELS_ASM_AARCH64_FUNC_BEGIN ExpandPictureLuma_AArch64_neon

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
//for Luma 16x16

View File

@ -31,186 +31,10 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
.align 4
filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
#ifdef __APPLE__
.macro FILTER_6TAG_8BITS1
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun $6.8b, v18.8h, #5
// }
.endm
.macro FILTER_6TAG_8BITS2
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 $6.16b, v18.8h, #5
// }
.endm
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun $6.8b, v18.8h, #5
uaddl v19.8h, $2.8b, $6.8b
rshrn $6.8b, v19.8h, #1
// }
.endm
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 $6.16b, v18.8h, #5
uaddl2 v19.8h, $2.16b, $6.16b
rshrn2 $6.16b, v19.8h, #1
// }
.endm
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun $6.8b, v18.8h, #5
uaddl v19.8h, $3.8b, $6.8b
rshrn $6.8b, v19.8h, #1
// }
.endm
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 $6.16b, v18.8h, #5
uaddl2 v19.8h, $3.16b, $6.16b
rshrn2 $6.16b, v19.8h, #1
// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS1
// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3]
uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1]
mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2]
mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS2
// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3]
uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1]
mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2]
mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS1
// { // input:a, b, c, dst_d;
sub $0.8h, $0.8h, $1.8h //a-b
sshr $0.8h, $0.8h, #2 //(a-b)/4
sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
sqrshrun $3.8b, $0.8h, #6 //(+32)>>6
// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS2
// { // input:a, b, c, dst_d;
sub $0.8h, $0.8h, $1.8h //a-b
sshr $0.8h, $0.8h, #2 //(a-b)/4
sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6
// }
.endm
.macro UNPACK_2_16BITS_TO_ABC
// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
ext $4.16b, $0.16b, $1.16b, #4 //src[0]
ext $3.16b, $0.16b, $1.16b, #6 //src[1]
add $4.8h, $4.8h, $3.8h //c=src[0]+src[1]
ext $3.16b, $0.16b, $1.16b, #2 //src[-1]
ext $2.16b, $0.16b, $1.16b, #8 //src[2]
add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2]
ext $2.16b, $0.16b, $1.16b, #10 //src[3]
add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3]
// }
.endm
.macro AVERAGE_TWO_8BITS1
// { // input:dst_d, src_d A and B; working: v5
uaddl v30.8h, $2.8b, $1.8b
rshrn $0.8b, v30.8h, #1
// }
.endm
.macro AVERAGE_TWO_8BITS2
// { // input:dst_d, src_d A and B; working: v5
uaddl2 v30.8h, $2.16b, $1.16b
rshrn2 $0.16b, v30.8h, #1
// }
.endm
.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
// { // input: src_d{Y[0][1][2][3][4][5]X},
rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O
uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]*
mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32]
addv $3, $2.4h
sqrshrun $0.8b, $0.8h, #5
// }
.endm
.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
ext $3.16b, $1.16b, $1.16b, #14 // X[0][1][2][3][4][5]O
ext $4.16b, $3.16b, $3.16b, #8 // [3][4][5]OX[0][1][2]
rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]*
smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
saddlv $5, $3.4s
//sshr $0.2d, $0.2d, #4
sqrshrun $0.2s, $0.2d, #10
uqxtn $0.4h, $0.4s
uqxtn $0.8b, $0.8h
// }
.endm
#else
.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
@ -383,7 +207,6 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
uqxtn \arg0\().8b, \arg0\().8h
// }
.endm
#endif
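
FILTER_3_IN_16BITS_TO_8BITS1/2 apply the same six-tap weights to 16-bit partial sums, using the shift-and-add decomposition noted in the comments so the arithmetic stays inside 16-bit lanes; a scalar C sketch (names illustrative):

#include <stdint.h>

static uint8_t Clip255_c (int32_t v) {
  return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar form of FILTER_3_IN_16BITS_TO_8BITS: given the partial sums
 * a = src[-2]+src[3], b = src[-1]+src[2], c = src[0]+src[1] from the first
 * filter pass, reduce them the way the macro does, then round and clip. */
static uint8_t FilterSecondPass_c (int32_t a, int32_t b, int32_t c) {
  int32_t v = a - b;                // a-b
  v >>= 2;                          // (a-b)/4            (arithmetic shift, like sshr)
  v -= b;                           // (a-b)/4 - b
  v += c;                           // (a-b)/4 - b + c
  v >>= 2;                          // ((a-b)/4 - b + c)/4
  v += c;                           // == (a - 5b + 20c)/16
  return Clip255_c ((v + 32) >> 6); // sqrshrun ..., #6
}

The decomposition ((a-b)/4 - b + c)/4 + c equals (a - 5b + 20c)/16, which is why the macro can finish with a single rounding shift by 6.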
//(const uint8_t* pSrc {x0}, int32_t iSrcStride{x1}, uint8_t* pDst{x2}, int32_t iDstStride{x3}, int32_t iHeight{x4})
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon

View File

@ -31,42 +31,7 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
.macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 $8, $1, #1
vshr.s16 $9, $3, #1
vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 $6, $1, #1
vshr.s32 $7, $3, #1
vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#else
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
@ -98,7 +63,6 @@
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#endif
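
The transform macros above follow the H.264 4x4 inverse-transform butterfly given in their comments; one row in scalar C (names illustrative):

#include <stdint.h>

/* Scalar form of ROW_TRANSFORM_1_STEP + TRANSFORM_4BYTES for one row of the
 * 4x4 inverse transform, following the e[]/f[] formulas in the comments. */
static void IdctRow_c (const int16_t src[4], int32_t dst[4]) {
  int32_t e0 = src[0] + src[2];          // e[i][0] = src[0] + src[2]
  int32_t e1 = src[0] - src[2];          // e[i][1] = src[0] - src[2]
  int32_t e2 = (src[1] >> 1) - src[3];   // e[i][2] = (src[1]>>1) - src[3]
  int32_t e3 = src[1] + (src[3] >> 1);   // e[i][3] = src[1] + (src[3]>>1)
  dst[0] = e0 + e3;                      // f[i][0]
  dst[1] = e1 + e2;                      // f[i][1]
  dst[2] = e1 - e2;                      // f[i][2]
  dst[3] = e0 - e3;                      // f[i][3]
}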
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon

View File

@ -32,22 +32,8 @@
#ifdef HAVE_NEON
//Global macro
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
//Global macro
.macro GET_8BYTE_DATA
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
.endmacro
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
@ -59,7 +45,6 @@
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon

View File

@ -31,42 +31,8 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
.macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
sshr $8.4h, $1.4h, #1
sshr $9.4h, $3.4h, #1
ssubl $6.4s, $8.4h, $3.4h //int32 e[i][2] = (src[1]>>1)-src[3];
saddl $7.4s, $1.4h, $9.4h //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
add $0.4s, $4.4s, $7.4s //int16 f[i][0] = e[i][0] + e[i][3];
add $1.4s, $5.4s, $6.4s //int16 f[i][1] = e[i][1] + e[i][2];
sub $2.4s, $5.4s, $6.4s //int16 f[i][2] = e[i][1] - e[i][2];
sub $3.4s, $4.4s, $7.4s //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
sshr $6.4s, $1.4s, #1
sshr $7.4s, $3.4s, #1
sub $6.4s, $6.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
add $7.4s, $1.4s, $7.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#else
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8\() \arg9\()
@ -99,7 +65,6 @@
add \arg7\().4s, \arg1\().4s, \arg7\().4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#endif
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
// for Luma 4x4

View File

@ -31,23 +31,9 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
//Global macro
.macro GET_8BYTE_DATA
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
.endm
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
@ -59,7 +45,6 @@
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
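
GET_8BYTE_DATA gathers one neighbour byte per stride step (a column of reconstructed pixels), which is what the DC prediction below needs; a scalar sketch of 16x16 DC prediction under the standard H.264 rule (the function name and the contiguous 16x16 output layout are assumptions for illustration):

#include <stdint.h>

/* Scalar sketch of 16x16 DC intra prediction: average the 16 top and 16
 * left neighbours (the left column is what GET_8BYTE_DATA collects, one
 * byte per stride step) and fill the block with the result. */
static void I16x16PredDc_c (uint8_t* pPred, const uint8_t* pRef, int32_t iStride) {
  int32_t iSum = 0;
  for (int32_t i = 0; i < 16; i++) {
    iSum += pRef[-iStride + i];          // top row
    iSum += pRef[-1 + i * iStride];      // left column, strided gather
  }
  uint8_t uiDc = (uint8_t) ((iSum + 16) >> 5);
  for (int32_t i = 0; i < 16; i++)
    for (int32_t j = 0; j < 16; j++)
      pPred[i * 16 + j] = uiDc;
}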
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon

View File

@ -31,70 +31,9 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
//The data sequence will be used
.macro GET_8BYTE_DATA_L0
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
.endm
.macro HDM_TRANSFORM_4X4_L0
//Do the vertical transform
vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
vswp d1, d2
vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
//Do the horizontal transform
vtrn.32 q2, q1
vadd.s16 q0, q2, q1
vsub.s16 q1, q2, q1
vtrn.16 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
vmov.s16 d0, d4
vmov.s16 d1, d2
vabs.s16 d3, d3
//16x16_v
vtrn.32 d0, d1 //{0,1,3,2}
vaba.s16 $5, d0, $2 //16x16_v
vaba.s16 $5, d1, $8
vaba.s16 $5, d5, $8
vadd.u16 $5, d3
//16x16_h
vtrn.16 d4, d5 //{0,4,12,8}
vaba.s16 $6, d4, $3 //16x16_h
vabs.s16 d2, d2
vabs.s16 d5, d5
vadd.u16 d2, d3
vadd.u16 d2, d5
vadd.u16 $6, d2
//16x16_dc_both
vaba.s16 $7, d4, $4 //16x16_dc_both
vadd.u16 $7, d2
.endm
#else
//The data sequence will be used
.macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
@ -150,7 +89,6 @@
vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
vadd.u16 \arg7, d2
.endm
#endif
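
HDM_TRANSFORM_4X4_L0 runs a 4x4 Hadamard transform on the residual and accumulates absolute transformed differences for the DC/H/V mode costs. As an illustration of the transform part only, a plain 4x4 SATD in scalar C (names and the >>1 normalisation are conventional, not taken from this file):

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of a 4x4 SATD: 4x4 Hadamard transform of the residual,
 * then the sum of absolute transformed coefficients. */
static int32_t Satd4x4_c (const uint8_t* pCur, int32_t iCurStride,
                          const uint8_t* pRef, int32_t iRefStride) {
  int32_t d[16], m[16];
  for (int32_t i = 0; i < 4; i++)
    for (int32_t j = 0; j < 4; j++)
      d[i * 4 + j] = pCur[i * iCurStride + j] - pRef[i * iRefStride + j];
  for (int32_t i = 0; i < 4; i++) {            // vertical butterflies
    int32_t s0 = d[i] + d[i + 8],  s1 = d[i + 4] + d[i + 12];
    int32_t s2 = d[i] - d[i + 8],  s3 = d[i + 4] - d[i + 12];
    m[i] = s0 + s1;  m[i + 4] = s0 - s1;  m[i + 8] = s2 + s3;  m[i + 12] = s2 - s3;
  }
  int32_t iSatd = 0;
  for (int32_t i = 0; i < 4; i++) {            // horizontal butterflies
    int32_t s0 = m[i * 4] + m[i * 4 + 2], s1 = m[i * 4 + 1] + m[i * 4 + 3];
    int32_t s2 = m[i * 4] - m[i * 4 + 2], s3 = m[i * 4 + 1] - m[i * 4 + 3];
    iSatd += abs (s0 + s1) + abs (s0 - s1) + abs (s2 + s3) + abs (s2 - s3);
  }
  return iSatd >> 1;
}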
WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Satd_neon
stmdb sp!, {r4-r7, lr}

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
.macro SATD_16x4

View File

@ -31,254 +31,8 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
.macro LOAD_4x4_DATA_FOR_DCT
// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride
vld2.16 {$0[0],$1[0]}, [$4], $5
vld2.16 {$2[0],$3[0]}, [$6], $7
vld2.16 {$0[1],$1[1]}, [$4], $5
vld2.16 {$2[1],$3[1]}, [$6], $7
vld2.16 {$0[2],$1[2]}, [$4], $5
vld2.16 {$2[2],$3[2]}, [$6], $7
vld2.16 {$0[3],$1[3]}, [$4], $5
vld2.16 {$2[3],$3[3]}, [$6], $7
// }
.endm
.macro LOAD_8x8_DATA_FOR_DCT
// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
vld1.64 {$0}, [$8], r2
vld1.64 {$4}, [$9], r4
vld1.64 {$1}, [$8], r2
vld1.64 {$5}, [$9], r4
vld1.64 {$2}, [$8], r2
vld1.64 {$6}, [$9], r4
vld1.64 {$3}, [$8], r2
vld1.64 {$7}, [$9], r4
// }
.endm
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
// { // input: src_d[0]~[3], working: [4]~[7]
vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3];
vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3];
vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2];
vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2];
vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1];
vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1];
vshl.s16 $1, $7, #1
vshl.s16 $3, $6, #1
vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2];
vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1);
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS
// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
veor.s16 $6, $6 // init 0 , and keep 0;
vaba.s16 $1, $0, $6 // f + abs(coef - 0)
vmull.s16 $7, $2, $4
vmull.s16 $8, $3, $5
vshr.s32 $7, #16
vshr.s32 $8, #16
vmovn.s32 $2, $7
vmovn.s32 $3, $8
vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
vshl.s16 $6, #1
vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
veor.s16 $6, $6 // init 0 , and keep 0;
vaba.s16 $1, $0, $6 // f + abs(coef - 0)
vmull.s16 $7, $2, $4
vmull.s16 $8, $3, $5
vshr.s32 $7, #16
vshr.s32 $8, #16
vmovn.s32 $2, $7
vmovn.s32 $3, $8
vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
vshl.s16 $6, #1
vmax.s16 $9, $2, $3
vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf , working_d (all 0), working_q
vaba.s16 $1, $0, $3 // f + abs(coef - 0)
vmull.s16 $4, $1, $2 // *= mf
vshr.s32 $4, #16
vmovn.s32 $1, $4 // >> 16
vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111
vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched
vshl.s16 $3, #1
vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro DC_ZERO_COUNT_IN_DUALWORD
// { // input: coef, dst_d, working_d (all 0x01)
vceq.s16 $1, $0, #0
vand.s16 $1, $2
vpadd.s16 $1, $1, $1
vpadd.s16 $1, $1, $1
// }
.endm
.macro SELECT_MAX_IN_ABS_COEF
// { // input: coef_0, coef_1, max_q (identical to the following two)
vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4
vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3]
vpmax.s16 $3, $3, $4 // max 1st in $3[0][1]
// }
.endm
.macro ZERO_COUNT_IN_2_QUARWORD
// { // input: coef_0 (identical to $3 $4), coef_1 (identical to $5 $6), mask_q
vceq.s16 $0, #0
vceq.s16 $1, #0
vand.s16 $0, $2
vand.s16 $1, $2
vpadd.s16 $3, $3, $5
vpadd.s16 $4, $4, $6
vpadd.s16 $3, $3, $4 // 8-->4
vpadd.s16 $3, $3, $3
vpadd.s16 $3, $3, $3
// }
.endm
.macro HDM_QUANT_2x2_TOTAL_16BITS
// { // input: src_d[0]~[3], working_d, dst_d
vshr.s64 $1, $0, #32
vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
vtrn.s16 $2, $1
vtrn.s32 $2, $1
// }
.endm
.macro IHDM_4x4_TOTAL_16BITS
// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
vshr.s64 $1, $0, #32
vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
vtrn.s16 $2, $1
vrev32.16 $1, $1
vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
vrev64.16 $1, $2
vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
vsub.s16 $1, $2, $1
vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
// }
.endm
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
vmovl.u8 $4,$0
vmovl.u8 $5,$1
vadd.s16 $4,$2
vadd.s16 $5,$3
vqmovun.s16 $0,$4
vqmovun.s16 $1,$5
// }
.endm
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
// { // input: src_d[0]~[3], output: e_d[0]~[3];
vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2];
vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2];
vshr.s16 $6, $1, #1
vshr.s16 $7, $3, #1
vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3];
vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro ROW_TRANSFORM_0_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3];
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3];
vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3];
// }
.endm
.macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 $8, $1, #1
vshr.s16 $9, $3, #1
vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_0_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
.macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 $6, $1, #1
vshr.s32 $7, $3, #1
vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#else
.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
@ -522,7 +276,6 @@
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#endif
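
NEWQUANT_COEF_EACH_16BITS quantises the magnitude and then restores the sign with the "subtract 2x when not positive" trick described in its comments; per coefficient, a scalar C sketch (names illustrative):

#include <stdint.h>
#include <stdlib.h>

/* Scalar form of NEWQUANT_COEF_EACH_16BITS for one coefficient:
 * q = ((|coef| + ff) * mf) >> 16, with the sign of coef put back the same
 * way the macro does it. */
static int16_t QuantCoef_c (int16_t iCoef, int16_t iFf, int16_t iMf) {
  int16_t iLevel = (int16_t) (iFf + abs (iCoef));            // vaba.s16: f + |coef|
  int16_t q = (int16_t) (((int32_t) iLevel * iMf) >> 16);    // vmull.s16 + vshr #16 + vmovn
  return (int16_t) ((iCoef > 0) ? q : q - 2 * q);            // keep q, or q -= 2q (= -q)
}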
WELS_ASM_FUNC_BEGIN WelsDctT4_neon

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
// for Luma 4x4

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
.macro LOAD_LUMA_DATA
@ -94,93 +93,6 @@
trn2 v17.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
.endm
#ifdef __APPLE__
.macro SELECT_BEST_COST
cmp w1, $0
csel $0, $0, w1, $2
cset w7, $1
cmp w2, $0
mov w6, #2
csel $0, $0, w2, $2
csel w7, w7, w6, $2
.endm
.macro SELECT_BEST_COST_PREFER_HIGHER arg0
SELECT_BEST_COST \arg0, ls, hi
.endm
.macro SELECT_BEST_COST_PREFER_LOWER arg0
SELECT_BEST_COST \arg0, lo, hs
.endm
.macro LOAD_CHROMA_DATA
sub x9, $0, x1
ld1 {$1}, [x9] //top_cb
sub x9, $0, #1
ld1 {$2}[8], [x9], x1
ld1 {$2}[9], [x9], x1
ld1 {$2}[10], [x9], x1
ld1 {$2}[11], [x9], x1
ld1 {$2}[12], [x9], x1
ld1 {$2}[13], [x9], x1
ld1 {$2}[14], [x9], x1
ld1 {$2}[15], [x9], x1 //left_cb
.endm
.macro LOAD_8X4_DATA
//Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
ld1 {v0.8b}, [$0], x3
ld1 {v1.8b}, [$0], x3
ld1 {v0.d}[1], [$0], x3
ld1 {v1.d}[1], [$0], x3
trn1 v2.4s, v0.4s, v1.4s
trn2 v1.4s, v0.4s, v1.4s
trn1 v20.2d, v2.2d, v1.2d
trn2 v21.2d, v2.2d, v1.2d
.endm
.macro HDM_TRANSFORM_4X4_L0
//Do the vertical transform
uadd$9 v0.8h, $0, $1
usub$9 v1.8h, $0, $1
trn1 v3.2d, v0.2d, v1.2d
trn2 v1.2d, v0.2d, v1.2d
add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
//Do the horizontal transform
trn1 v0.4s, v4.4s, v5.4s
trn2 v1.4s, v4.4s, v5.4s
add v4.8h, v0.8h, v1.8h
sub v5.8h, v0.8h, v1.8h
trn1 v0.8h, v4.8h, v5.8h
trn2 v1.8h, v4.8h, v5.8h
add v4.8h, v0.8h, v1.8h
sub v5.8h, v0.8h, v1.8h
//16x16_v
trn1 v0.2s, v4.2s, v5.2s
trn2 v1.2s, v4.2s, v5.2s
sabal $5, v0.4h, $2
sabal $5, v1.4h, $8.4h
sabal2 $5, v4.8h, $8.8h
sabal2 $5, v5.8h, $8.8h
//16x16_h
ins v3.d[0], v4.d[1]
trn1 v0.4h, v4.4h, v3.4h
trn2 v1.4h, v4.4h, v3.4h
sabal $6, v0.4h, $3
sabdl v4.4s, v1.4h, $8.4h
sabal v4.4s, v5.4h, $8.4h
sabal2 v4.4s, v5.8h, $8.8h
add $6, $6, v4.4s
//16x16_dc_both
sabal $7, v0.4h, $4
add $7, $7, v4.4s
.endm
#else
.macro SELECT_BEST_COST arg0, arg1, arg2
cmp w1, \arg0
csel \arg0, \arg0, w1, \arg2
@ -266,7 +178,6 @@
sabal \arg7, v0.4h, \arg4
add \arg7, \arg7, v4.4s
.endm
#endif
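
SELECT_BEST_COST keeps the running minimum of three mode costs and the index of the winning mode, with the PREFER_HIGHER/PREFER_LOWER variants differing only in how ties are broken; a scalar C sketch (names illustrative):

#include <stdint.h>

/* Scalar sketch of SELECT_BEST_COST: pick the cheapest of three mode costs
 * and remember which mode won.  bPreferLater mirrors the two condition-code
 * variants, which only change whether a tie goes to the later candidate. */
static int32_t SelectBestMode_c (uint32_t uiCost0, uint32_t uiCost1,
                                 uint32_t uiCost2, int bPreferLater,
                                 uint32_t* pBestCost) {
  uint32_t uiBest = uiCost0;
  int32_t  iMode  = 0;
  if (bPreferLater ? (uiCost1 <= uiBest) : (uiCost1 < uiBest)) {
    uiBest = uiCost1;  iMode = 1;
  }
  if (bPreferLater ? (uiCost2 <= uiBest) : (uiCost2 < uiBest)) {
    uiBest = uiCost2;  iMode = 2;
  }
  *pBestCost = uiBest;
  return iMode;
}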
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
ldr x11, [sp, #0]

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
.macro CALC_AND_STORE_SAD
@ -69,89 +68,6 @@
ld1 {v7.16b}, [x0], x1
.endm
#ifdef __APPLE__
.macro LOAD_8X8_2
ld1 {v16.8b}, [$0], x3
ld1 {v17.8b}, [$0], x3
ld1 {v18.8b}, [$0], x3
ld1 {v19.8b}, [$0], x3
ld1 {v20.8b}, [$0], x3
ld1 {v21.8b}, [$0], x3
ld1 {v22.8b}, [$0], x3
ld1 {v23.8b}, [$0], x3
.endm
.macro CALC_ABS_8X8_1
uab$1l $0, v0.8b, v16.8b
uabal $0, v1.8b, v17.8b
uabal $0, v2.8b, v18.8b
uabal $0, v3.8b, v19.8b
uabal $0, v4.8b, v20.8b
uabal $0, v5.8b, v21.8b
uabal $0, v6.8b, v22.8b
uabal $0, v7.8b, v23.8b
.endm
.macro CALC_ABS_8X8_2
uab$0l v29.8h, v0.8b, v18.8b
uabal v29.8h, v1.8b, v19.8b
uabal v29.8h, v2.8b, v20.8b
uabal v29.8h, v3.8b, v21.8b
uabal v29.8h, v4.8b, v22.8b
uabal v29.8h, v5.8b, v23.8b
uabal v29.8h, v6.8b, v24.8b
uabal v29.8h, v7.8b, v25.8b
.endm
.macro LOAD_16X8_2
ld1 {v16.16b}, [$0], x3
ld1 {v17.16b}, [$0], x3
ld1 {v18.16b}, [$0], x3
ld1 {v19.16b}, [$0], x3
ld1 {v20.16b}, [$0], x3
ld1 {v21.16b}, [$0], x3
ld1 {v22.16b}, [$0], x3
ld1 {v23.16b}, [$0], x3
.endm
.macro CALC_ABS_16X8_1
uab$1l $0, v0.8b, v16.8b
uabal2 $0, v0.16b,v16.16b
uabal $0, v1.8b, v17.8b
uabal2 $0, v1.16b,v17.16b
uabal $0, v2.8b, v18.8b
uabal2 $0, v2.16b,v18.16b
uabal $0, v3.8b, v19.8b
uabal2 $0, v3.16b,v19.16b
uabal $0, v4.8b, v20.8b
uabal2 $0, v4.16b,v20.16b
uabal $0, v5.8b, v21.8b
uabal2 $0, v5.16b,v21.16b
uabal $0, v6.8b, v22.8b
uabal2 $0, v6.16b,v22.16b
uabal $0, v7.8b, v23.8b
uabal2 $0, v7.16b,v23.16b
.endm
.macro CALC_ABS_16X8_2
uab$0l v29.8h, v0.8b, v18.8b
uabal2 v29.8h, v0.16b,v18.16b
uabal v29.8h, v1.8b, v19.8b
uabal2 v29.8h, v1.16b,v19.16b
uabal v29.8h, v2.8b, v20.8b
uabal2 v29.8h, v2.16b,v20.16b
uabal v29.8h, v3.8b, v21.8b
uabal2 v29.8h, v3.16b,v21.16b
uabal v29.8h, v4.8b, v22.8b
uabal2 v29.8h, v4.16b,v22.16b
uabal v29.8h, v5.8b, v23.8b
uabal2 v29.8h, v5.16b,v23.16b
uabal v29.8h, v6.8b, v24.8b
uabal2 v29.8h, v6.16b,v24.16b
uabal v29.8h, v7.8b, v25.8b
uabal2 v29.8h, v7.16b,v25.16b
.endm
#else
.macro LOAD_8X8_2 arg0
ld1 {v16.8b}, [\arg0], x3
ld1 {v17.8b}, [\arg0], x3
@ -233,7 +149,6 @@
uabal v29.8h, v7.8b, v25.8b
uabal2 v29.8h, v7.16b,v25.16b
.endm
#endif
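
The LOAD_*/CALC_ABS_* macros above accumulate absolute differences row by row; the quantity they compute per block is an ordinary SAD, sketched here in scalar C (names illustrative; the NEON versions additionally interleave several candidate blocks, which is omitted):

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of an 8x8 SAD, the quantity the uabdl/uabal chains accumulate. */
static int32_t SampleSad8x8_c (const uint8_t* pCur, int32_t iCurStride,
                               const uint8_t* pRef, int32_t iRefStride) {
  int32_t iSad = 0;
  for (int32_t i = 0; i < 8; i++) {
    for (int32_t j = 0; j < 8; j++)
      iSad += abs (pCur[j] - pRef[j]);
    pCur += iCurStride;
    pRef += iRefStride;
  }
  return iSad;
}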
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
sxtw x1, w1

View File

@ -31,250 +31,8 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
.macro ZERO_COUNT_IN_2_QUARWORD
// { // input: coef_0 (identical to $3 $4), coef_1 (identical to $5 $6), mask_q
cmeq $0.8h, $0.8h, #0
cmeq $1.8h, $1.8h, #0
uzp1 $0.16b, $0.16b, $1.16b
ushr $0.16b, $0.16b, 7
addv $2, $0.16b
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf
eor $3.16b, $3.16b, $3.16b // init 0 , and keep 0;
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
smull $4.4s, $1.4h, $2.4h
smull2 $5.4s, $1.8h, $2.8h
shrn $1.4h, $4.4s, #16
shrn2 $1.8h, $5.4s, #16
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
shl $3.8h, $3.8h, #1
sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf
eor $3.16b, $3.16b, $3.16b // init 0 , and keep 0;
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
smull $4.4s, $1.4h, $2.4h
smull2 $5.4s, $1.8h, $2.8h
shrn $1.4h, $4.4s, #16
shrn2 $1.8h, $5.4s, #16
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
shl $3.8h, $3.8h, #1
mov $6.16b, $1.16b
sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
smull $4.4s, $1.4h, $2.4h
shrn $1.4h, $4.4s, #16
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
shl $3.8h, $3.8h, #1
sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro SELECT_MAX_IN_ABS_COEF
// { // input: coef_0, coef_1, coef_2, coef_3, max_q (identical to the following two)
umax $0.8h, $0.8h, $1.8h
umaxv $4, $0.8h
umax $2.8h, $2.8h, $3.8h
umaxv $5, $2.8h
// }
.endm
.macro HDM_QUANT_2x2_TOTAL_16BITS
// { // input: src_d[0][16][32][48], dst_d[0][16][32][48], working
sshr $1.2d, $0.2d, #32
add $2.4h, $0.4h, $1.4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
sub $1.4h, $0.4h, $1.4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
zip1 $1.4h, $2.4h, $1.4h
// }
.endm
.macro DC_ZERO_COUNT_IN_DUALWORD
// { // input: coef, dst_d, working_d (all 0x01)
cmeq $0.4h, $0.4h, #0
and $0.8b, $0.8b, $2.8b
addv $1, $0.4h
// }
.endm
.macro IHDM_4x4_TOTAL_16BITS
// { // input: each src_d[0]~[3](dst), working_q0, working_q1
uzp2 $1.4s, $0.4s, $0.4s
uzp1 $0.4s, $0.4s, $0.4s
add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
zip1 $2.8h, $2.8h, $1.8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
uzp2 $1.4s, $2.4s, $2.4s
uzp1 $0.4s, $2.4s, $2.4s
add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
rev32 $1.4h, $1.4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
zip1 $0.4s, $2.4s, $1.4s
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2
// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
uzp1 $2.4s, $0.4s, $1.4s //[0 1 4 5]+[8 9 12 13]
uzp2 $3.4s, $0.4s, $1.4s //[2 3 6 7]+[10 11 14 15]
uzp1 $0.8h, $2.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
uzp2 $2.8h, $2.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4
// { // input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
trn1 $4.8h, v0.8h, v1.8h
trn2 $5.8h, v0.8h, v1.8h
trn1 $6.8h, v2.8h, v3.8h
trn2 $7.8h, v2.8h, v3.8h
trn1 $0.4s, v4.4s, v6.4s
trn2 $2.4s, v4.4s, v6.4s
trn1 $1.4s, v5.4s, v7.4s
trn2 $3.4s, v5.4s, v7.4s
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2
// { // input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
mov $0.d[1], $1.d[0] //[0 1 2 3]+[4 5 6 7]
mov $2.d[1], $3.d[0] //[8 9 10 11]+[12 13 14 15]
uzp1 $1.4s, $0.4s, $2.4s //[0 1 4 5]+[8 9 12 13]
uzp2 $3.4s, $0.4s, $2.4s //[2 3 6 7]+[10 11 14 15]
uzp1 $0.8h, $1.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
uzp2 $2.8h, $1.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
// }
.endm
.macro LOAD_4x4_DATA_FOR_DCT
ld1 {$0.s}[0], [$2], $3
ld1 {$0.s}[1], [$2], $3
ld1 {$0.s}[2], [$2], $3
ld1 {$0.s}[3], [$2]
ld1 {$1.s}[0], [$4], $5
ld1 {$1.s}[1], [$4], $5
ld1 {$1.s}[2], [$4], $5
ld1 {$1.s}[3], [$4]
.endm
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
// { // input: src_d[0]~[3], working: [4]~[7]
add $4.8h, $0.8h, $3.8h //int16 s[0] = data[i] + data[i3];
sub $7.8h, $0.8h, $3.8h //int16 s[3] = data[i] - data[i3];
add $5.8h, $1.8h, $2.8h //int16 s[1] = data[i1] + data[i2];
sub $6.8h, $1.8h, $2.8h //int16 s[2] = data[i1] - data[i2];
add $0.8h, $4.8h, $5.8h //int16 dct[i ] = s[0] + s[1];
sub $2.8h, $4.8h, $5.8h //int16 dct[i2] = s[0] - s[1];
shl $1.8h, $7.8h, #1
shl $3.8h, $6.8h, #1
add $1.8h, $1.8h, $6.8h //int16 dct[i1] = (s[3] << 1) + s[2];
sub $3.8h, $7.8h, $3.8h //int16 dct[i3] = s[3] - (s[2] << 1);
// }
.endm
.macro LOAD_8x4_DATA_FOR_DCT
// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
ld1 {$0.d}[0], [$8], x2
ld1 {$1.d}[0], [$8], x2
ld1 {$2.d}[0], [$8], x2
ld1 {$3.d}[0], [$8], x2
ld1 {$4.d}[0], [$9], x4
ld1 {$5.d}[0], [$9], x4
ld1 {$6.d}[0], [$9], x4
ld1 {$7.d}[0], [$9], x4
// }
.endm
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
// { // input: src_d[0]~[3], output: e_d[0]~[3];
add $4.8h, $0.8h, $2.8h //int16 e[i][0] = src[0] + src[2];
sub $5.8h, $0.8h, $2.8h //int16 e[i][1] = src[0] - src[2];
sshr $6.8h, $1.8h, #1
sshr $7.8h, $3.8h, #1
sub $6.8h, $6.8h, $3.8h //int16 e[i][2] = (src[1]>>1)-src[3];
add $7.8h, $1.8h, $7.8h //int16 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
add $0.8h, $4.8h, $7.8h //int16 f[i][0] = e[i][0] + e[i][3];
add $1.8h, $5.8h, $6.8h //int16 f[i][1] = e[i][1] + e[i][2];
sub $2.8h, $5.8h, $6.8h //int16 f[i][2] = e[i][1] - e[i][2];
sub $3.8h, $4.8h, $7.8h //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro ROW_TRANSFORM_0_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3];
saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
ssubl $6.4s, $1.4h, $3.4h //int32 e[i][2] = src[1] - src[3];
saddl $7.4s, $1.4h, $3.4h //int32 e[i][3] = src[1] + src[3];
// }
.endm
.macro COL_TRANSFORM_0_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
sub $6.4s, $1.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
add $7.4s, $1.4s, $3.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
add $0.4s, $4.4s, $7.4s //int16 f[i][0] = e[i][0] + e[i][3];
add $1.4s, $5.4s, $6.4s //int16 f[i][1] = e[i][1] + e[i][2];
sub $2.4s, $5.4s, $6.4s //int16 f[i][2] = e[i][1] - e[i][2];
sub $3.4s, $4.4s, $7.4s //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
// { // input: pred_d[0](output), dct_q0/1, working_q0/1;
uxtl $3.8h, $0.8b
uxtl2 $4.8h, $0.16b
add $3.8h, $3.8h, $1.8h
add $4.8h, $4.8h, $2.8h
sqxtun $0.8b, $3.8h
sqxtun2 $0.16b,$4.8h
// }
.endm
#else
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2
// { // input: coef_0 (identical to \arg3\() \arg4\()), coef_1 (identical to \arg5\() \arg6\()), mask_q
cmeq \arg0\().8h, \arg0\().8h, #0
@ -519,7 +277,6 @@
sqxtun2 \arg0\().16b,\arg4\().8h
// }
.endm
#endif
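
ZERO_COUNT_IN_2_QUARWORD counts zero coefficients with cmeq/addv so the function below can report how many are nonzero; an equivalent scalar sketch (name illustrative):

#include <stdint.h>

/* Scalar sketch of the nonzero-count logic: count nonzero values among 16
 * transform coefficients (the NEON code counts zeros and works from that,
 * which is equivalent). */
static int32_t GetNoneZeroCount_c (const int16_t* pCoef) {
  int32_t iCount = 0;
  for (int32_t i = 0; i < 16; i++)
    iCount += (pCoef[i] != 0);
  return iCount;
}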
WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon
ld1 {v0.8h, v1.8h}, [x0]

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8SingleBlock_AArch64_neon
@ -334,4 +333,4 @@ _hash_width_loop:
subs x2, x2, #1
cbnz x2, _hash_height_loop
WELS_ASM_AARCH64_FUNC_END
#endif
#endif

View File

@ -31,24 +31,14 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
.macro SQR_ADD_16BYTES
vmull.u8 q3, $0, $0
vmull.u8 q8, $1, $1
vpadal.u16 $2, q3
vpadal.u16 $2, q8
.endm
#else
.macro SQR_ADD_16BYTES arg0, arg1, arg2
vmull.u8 q3, \arg0, \arg0
vmull.u8 q8, \arg1, \arg1
vpadal.u16 \arg2, q3
vpadal.u16 \arg2, q8
.endm
#endif
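
SQR_ADD_16BYTES accumulates sums of squares, which together with the plain pixel sum gives the block variance; a scalar sketch (name illustrative; the real kernel's rounding and return convention may differ):

#include <stdint.h>

/* Scalar sketch of a 16x16 sample variance: accumulate sum and sum of
 * squares, then var = E[x^2] - E[x]^2 in integer form. */
static uint32_t SampleVariance16x16_c (const uint8_t* pSrc, int32_t iStride) {
  uint32_t uiSum = 0, uiSqrSum = 0;
  for (int32_t i = 0; i < 16; i++) {
    for (int32_t j = 0; j < 16; j++) {
      uiSum    += pSrc[j];
      uiSqrSum += (uint32_t) pSrc[j] * pSrc[j];   // vmull.u8 + vpadal.u16
    }
    pSrc += iStride;
  }
  return uiSqrSum / 256 - (uiSum / 256) * (uiSum / 256);
}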
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"

View File

@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"

View File

@ -31,41 +31,8 @@
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
.macro ABS_SUB_SUM_16BYTES
vld1.32 {q15}, [$0], $2
vld1.32 {q14}, [$1], $2
vabal.u8 $3, d30, d28
vabal.u8 $4, d31, d29
.endm
.macro ABS_SUB_SUM_8x16BYTES
vld1.32 {q15}, [$0], $2
vld1.32 {q14}, [$1], $2
vabdl.u8 $3, d30, d28
vabdl.u8 $4, d31, d29
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
.endm
.macro SAD_8X16BITS
vadd.u16 d31, $0, $1
vpaddl.u16 d31, d31
vpaddl.u32 $2, d31
.endm
#else
.macro ABS_SUB_SUM_16BYTES arg0, arg1, arg2, arg3, arg4
vld1.32 {q15}, [\arg0], \arg2
vld1.32 {q14}, [\arg1], \arg2
@ -93,7 +60,6 @@
vpaddl.u16 d31, d31
vpaddl.u32 \arg2, d31
.endm
#endif
WELS_ASM_FUNC_BEGIN VAACalcSad_neon
@ -160,52 +126,6 @@ vaa_calc_sad_loop1:
WELS_ASM_FUNC_END
#ifdef __APPLE__
.macro SAD_SD_MAD_16BYTES
vld1.32 {q0}, [$0], $2
vld1.32 {q1}, [$1], $2
vpadal.u8 $3, q0
vpadal.u8 $4, q1
vabd.u8 q0, q0, q1
vmax.u8 $5, q0
vpadal.u8 $6, q0
.endm
.macro SAD_SD_MAD_8x16BYTES
vld1.32 {q0}, [$0], $2
vld1.32 {q1}, [$1], $2
vpaddl.u8 q2, q0
vpaddl.u8 q3, q1
vabd.u8 $3, q0, q1
vpaddl.u8 $4, $3 //abs_diff
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
vsub.u16 $5, q2, q3
.endm
.macro SAD_SD_MAD_CALC
vpmax.u8 d0, $0, $1 //8bytes
vpmax.u8 d0, d0, d0 //4bytes
vpmax.u8 $2, d0, d0 //2bytes
vpaddl.u16 $3, $3
vpaddl.u32 $3, $3
vpaddl.s16 $4, $4
vpaddl.s32 $4, $4
.endm
#else
.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
vld1.32 {q0}, [\arg0], \arg2
vld1.32 {q1}, [\arg1], \arg2
@ -250,7 +170,6 @@ WELS_ASM_FUNC_END
vpaddl.s16 \arg4, \arg4
vpaddl.s32 \arg4, \arg4
.endm
#endif
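
The SAD_SD_MAD_* macros collect three per-8x8 statistics for background detection: the SAD, the signed difference of pixel sums, and the maximum absolute pixel difference; a scalar sketch (names illustrative):

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of the per-8x8 statistics VAACalcSadBgd_neon accumulates:
 * SAD, signed sum difference (SD) and maximum absolute difference (MAD). */
static void SadSdMad8x8_c (const uint8_t* pCur, const uint8_t* pRef,
                           int32_t iStride, int32_t* pSad, int32_t* pSd,
                           uint8_t* pMad) {
  int32_t iSad = 0, iSumCur = 0, iSumRef = 0;
  uint8_t uiMad = 0;
  for (int32_t i = 0; i < 8; i++) {
    for (int32_t j = 0; j < 8; j++) {
      int32_t iDiff = abs (pCur[j] - pRef[j]);
      iSad    += iDiff;                            // vpadal.u8 of the vabd.u8 result
      iSumCur += pCur[j];                          // vpadal.u8 q3, q0
      iSumRef += pRef[j];                          // vpadal.u8 q4, q1
      if (iDiff > uiMad) uiMad = (uint8_t) iDiff;  // vmax.u8
    }
    pCur += iStride;
    pRef += iStride;
  }
  *pSad = iSad;
  *pSd  = iSumCur - iSumRef;                       // the vsub of the two sums
  *pMad = uiMad;
}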
WELS_ASM_FUNC_BEGIN VAACalcSadBgd_neon
@ -314,165 +233,6 @@ vaa_calc_sad_bgd_loop1:
WELS_ASM_FUNC_END
#ifdef __APPLE__
.macro SSD_MUL_SUM_16BYTES_RESET
vmull.u8 $3, $0, $0
vpaddl.u16 $2, $3
vmull.u8 $3, $1, $1
vpadal.u16 $2, $3
.endm
.macro SSD_MUL_SUM_16BYTES
vmull.u8 $3, $0, $0
vpadal.u16 $2, $3
vmull.u8 $3, $1, $1
vpadal.u16 $2, $3
.endm
.macro SAD_SSD_BGD_16
vld1.8 {q0}, [$0], $2 //load cur_row
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
vabd.u8 q2, q0, q1 //abs_diff
vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
vld1.8 {q1}, [$1], $2 //load ref_row
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//the last row of a 16x16 block
.macro SAD_SSD_BGD_16_end
vld1.8 {q0}, [$0], $1 //load cur_row
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
vabd.u8 q2, q0, q1 //abs_diff
vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//for the begin of a 8x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_8x8
vld1.8 {q0}, [$0], $2 //load cur_row
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
vabd.u8 q2, q0, q1 //abs_diff
vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
vld1.8 {q1}, [$1], $2 //load ref_row
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
//for the begin of a 16x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_16x16
vld1.8 {q0}, [$0], $2 //load cur_row
vld1.8 {q1}, [$1], $2 //load ref_row
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
vabd.u8 q2, q0, q1 //abs_diff
vmov q5,q2 //vmov instead of vmax initializes the max without a reset to zero; l_mad for 16 bytes, reset for every 8x16
vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqdiff, reset for every 16x16
vld1.8 {q1}, [$1], $2 //load ref_row
vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for l_sqsum, reset for every 16x16
.endm
//for each 8x16 block
.macro SAD_SSD_BGD_CALC_8x16
vpmax.u8 d10, d10, d11 //4 numbers
vpmax.u8 d10, d10, d10 //2 numbers
vpmax.u8 d10, d10, d10 //1 number
vmov $0, d10 //d26/d27 keep the l_mad
//p_sd8x8 fix me
vpaddl.u16 q3, q3
vpaddl.u16 q4, q4
vsub.i32 $1, q3, q4
vpaddl.u32 $1, $1
//psad8x8
vpaddl.u16 $2, $2
vpaddl.u32 $2, $2
//psadframe
vadd.i32 q12, $2
.endm
.macro SAD_SSD_BGD_16x16
//for one 8x16
SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
SAD_SSD_BGD_16 $0, $1, $2, q6
SAD_SSD_BGD_16 $0, $1, $2, q6
SAD_SSD_BGD_16 $0, $1, $2, q6
SAD_SSD_BGD_16 $0, $1, $2, q6
SAD_SSD_BGD_16 $0, $1, $2, q6
SAD_SSD_BGD_16 $0, $1, $2, q6
SAD_SSD_BGD_16 $0, $1, $2, q6
SAD_SSD_BGD_CALC_8x16 d26, q14, q6
//for another 8x16
SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
SAD_SSD_BGD_16 $0, $1, $2, q7
SAD_SSD_BGD_16 $0, $1, $2, q7
SAD_SSD_BGD_16 $0, $1, $2, q7
SAD_SSD_BGD_16 $0, $1, $2, q7
SAD_SSD_BGD_16 $0, $1, $2, q7
SAD_SSD_BGD_16 $0, $1, $2, q7
SAD_SSD_BGD_16_end $0, $2, q7
SAD_SSD_BGD_CALC_8x16 d27, q15, q7
.endm
.macro SSD_SAD_SD_MAD_PADDL
vpaddl.s16 $0, $0
vpaddl.s32 $0, $0
vadd.i32 $1, $1, $2
.endm
#else
.macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
vmull.u8 \arg3, \arg0, \arg0
vpaddl.u16 \arg2, \arg3
@@ -630,7 +390,6 @@ WELS_ASM_FUNC_END
vpaddl.s32 \arg0, \arg0
vadd.i32 \arg1, \arg1, \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
@@ -712,105 +471,6 @@ bne vaa_calc_sad_ssd_bgd_height_loop
WELS_ASM_FUNC_END
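On top of sad/sd/mad, the SAD_SSD_BGD macros keep three per-16x16 accumulators: the pixel sum of the current block (q9), the sum of squared current pixels (q10), and the sum of squared differences against the reference (q8). A scalar sketch of that accumulation, with a hypothetical helper name:

#include <stdint.h>

/* Illustrative per-16x16 statistics mirroring the SAD_SSD_BGD macros:
 * l_sum    - sum of current pixels          (q9 above)
 * l_sqsum  - sum of squared current pixels  (q10 above)
 * l_sqdiff - sum of squared differences     (q8 above)  */
static void sad_ssd_bgd_16x16_ref(const uint8_t *cur, const uint8_t *ref,
                                  int32_t stride, int32_t *l_sum,
                                  int32_t *l_sqsum, int32_t *l_sqdiff) {
  int32_t sum = 0, sqsum = 0, sqdiff = 0;
  for (int y = 0; y < 16; y++) {
    for (int x = 0; x < 16; x++) {
      int c = cur[x];
      int d = c - (int)ref[x];
      sum += c;
      sqsum += c * c;
      sqdiff += d * d;
    }
    cur += stride;
    ref += stride;
  }
  *l_sum = sum;
  *l_sqsum = sqsum;
  *l_sqdiff = sqdiff;
}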
#ifdef __APPLE__
.macro SAD_VAR_16
vld1.8 {q0}, [$0], $2 //load cur_row
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
vabd.u8 q2, q0, q1 //abs_diff
vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
vld1.8 {q1}, [$1], $2
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for l_sqsum, reset for every 16x16
.endm
.macro SAD_VAR_16_END
vld1.8 {q0}, [$0], $1 //load cur_row
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
vabd.u8 q2, q0, q1 //abs_diff
vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for l_sqsum, reset for every 16x16
.endm
.macro SAD_VAR_16_RESET_16x16
vld1.8 {q0}, [$0], $2 //load cur_row
vld1.8 {q1}, [$1], $2
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
vabd.u8 q2, q0, q1 //abs_diff
vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
vld1.8 {q1}, [$1], $2
vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
.endm
.macro SAD_VAR_16_RESET_8x8
vld1.8 {q0}, [$0], $2 //load cur_row
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
vabd.u8 q2, q0, q1 //abs_diff
vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
vld1.8 {q1}, [$1], $2
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for l_sqsum, reset for every 16x16
.endm
.macro SAD_VAR_16x16
//for one 8x16
SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
SAD_VAR_16 $0, $1, $2, q6
SAD_VAR_16 $0, $1, $2, q6
SAD_VAR_16 $0, $1, $2, q6
SAD_VAR_16 $0, $1, $2, q6
SAD_VAR_16 $0, $1, $2, q6
SAD_VAR_16 $0, $1, $2, q6
SAD_VAR_16 $0, $1, $2, q6
vpaddl.u16 q6, q6
vpaddl.u32 q6, q6
vadd.i32 q12, q6
//for another 8x16
SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
SAD_VAR_16 $0, $1, $2, q7
SAD_VAR_16 $0, $1, $2, q7
SAD_VAR_16 $0, $1, $2, q7
SAD_VAR_16 $0, $1, $2, q7
SAD_VAR_16 $0, $1, $2, q7
SAD_VAR_16 $0, $1, $2, q7
SAD_VAR_16_END $0, $2, q7
vpaddl.u16 q7, q7
vpaddl.u32 q7, q7
vadd.i32 q12, q7
.endm
#else
.macro SAD_VAR_16 arg0, arg1, arg2, arg3
vld1.8 {q0}, [\arg0], \arg2 //load cur_row
@@ -909,7 +569,6 @@ WELS_ASM_FUNC_END
vadd.i32 q12, q7
.endm
#endif
WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
@@ -971,62 +630,6 @@ bne vaa_calc_sad_var_height_loop
WELS_ASM_FUNC_END
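The SAD_VAR macros gather the same per-16x16 pixel sum and sum of squares so that a block variance can be derived from them. A minimal sketch of that derivation, assuming plain integer scaling (the actual caller may keep the sums unscaled):

#include <stdint.h>

/* Illustrative use of the per-16x16 sums gathered by the SAD_VAR macros. */
static inline uint32_t mb_variance_from_sums(uint32_t l_sum, uint32_t l_sqsum) {
  uint32_t mean = l_sum >> 8;              /* 256 pixels per 16x16 block */
  return (l_sqsum >> 8) - mean * mean;     /* E[x^2] - (E[x])^2 */
}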
#ifdef __APPLE__
.macro SAD_SSD_16
SAD_VAR_16 $0, $1, $2, $3
SSD_MUL_SUM_16BYTES d4,d5,q8, q11
.endm
.macro SAD_SSD_16_END
SAD_VAR_16_END $0, $1, $2
SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqdiff, reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_16x16
SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqdiff, reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_8x8
SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqdiff, reset for every 16x16
.endm
.macro SAD_SSD_16x16
//for one 8x16
SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
SAD_SSD_16 $0, $1, $2, q6
SAD_SSD_16 $0, $1, $2, q6
SAD_SSD_16 $0, $1, $2, q6
SAD_SSD_16 $0, $1, $2, q6
SAD_SSD_16 $0, $1, $2, q6
SAD_SSD_16 $0, $1, $2, q6
SAD_SSD_16 $0, $1, $2, q6
vpaddl.u16 q6, q6
vpaddl.u32 q6, q6
vadd.i32 q12, q6
//for another 8x16
SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
SAD_SSD_16 $0, $1, $2, q7
SAD_SSD_16 $0, $1, $2, q7
SAD_SSD_16 $0, $1, $2, q7
SAD_SSD_16 $0, $1, $2, q7
SAD_SSD_16 $0, $1, $2, q7
SAD_SSD_16 $0, $1, $2, q7
SAD_SSD_16_END $0, $2, q7
vpaddl.u16 q7, q7
vpaddl.u32 q7, q7
vadd.i32 q12, q7
.endm
#else
.macro SAD_SSD_16 arg0, arg1, arg2, arg3
SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
@@ -1081,7 +684,6 @@ WELS_ASM_FUNC_END
vadd.i32 q12, q7
.endm
#endif
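After each 8x16 half, the lane accumulators in q6/q7 are folded with pairwise widening adds and added into the running frame total in q12. A scalar view of that reduction, with hypothetical names:

#include <stdint.h>

/* Illustrative scalar view of the per-half reduction (vpaddl.u16 /
 * vpaddl.u32 / vadd.i32 q12 above): the eight 16-bit lane sums are
 * pairwise-widened until one block total remains, then added to the
 * running frame total. */
static uint32_t reduce_lanes_u16(const uint16_t lanes[8], uint32_t *frame_total) {
  uint32_t pair32[4];
  for (int i = 0; i < 4; i++)                    /* vpaddl.u16 */
    pair32[i] = (uint32_t)lanes[2 * i] + lanes[2 * i + 1];
  uint64_t pair64[2];
  for (int i = 0; i < 2; i++)                    /* vpaddl.u32 */
    pair64[i] = (uint64_t)pair32[2 * i] + pair32[2 * i + 1];
  uint32_t block_total = (uint32_t)(pair64[0] + pair64[1]);
  *frame_total += block_total;                   /* vadd.i32 q12, ... */
  return block_total;
}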
WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon

View File

@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
WELS_ASM_AARCH64_FUNC_BEGIN SampleVariance16x16_AArch64_neon
ld1 {v1.16b}, [x0], x1 //save the ref data (16bytes)

View File

@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsampler_AArch64_neon
@@ -223,4 +222,4 @@ _LAST_ROW_WIDTH:
WELS_ASM_AARCH64_FUNC_END
#endif
#endif

View File

@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
WELS_ASM_AARCH64_FUNC_BEGIN WelsProcessingSampleSad8x8_AArch64_neon
@@ -47,4 +46,4 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsProcessingSampleSad8x8_AArch64_neon
fmov w0, s2
WELS_ASM_AARCH64_FUNC_END
#endif
#endif

View File

@@ -31,32 +31,8 @@
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
.macro ABS_SUB_SUM_16BYTES
ld1 {v0.16b}, [x0], x4
ld1 {v1.16b}, [x1], x4
uabal $0, v0.8b, v1.8b
uabal2 $1, v0.16b,v1.16b
.endm
.macro ABS_SUB_SUM_8x16BYTES
ld1 {v0.16b}, [x0], x4
ld1 {v1.16b}, [x1], x4
uabdl $0, v0.8b, v1.8b
uabdl2 $1, v0.16b,v1.16b
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
ABS_SUB_SUM_16BYTES $0, $1
.endm
#else
.macro ABS_SUB_SUM_16BYTES arg0, arg1
ld1 {v0.16b}, [x0], x4
ld1 {v1.16b}, [x1], x4
@@ -78,7 +54,6 @@
ABS_SUB_SUM_16BYTES \arg0, \arg1
ABS_SUB_SUM_16BYTES \arg0, \arg1
.endm
#endif
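The AArch64 ABS_SUB_SUM macros perform the same absolute-difference accumulation with widening uabdl/uabal instructions. A scalar view of one 16-byte step, with a hypothetical helper name:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar equivalent of one ABS_SUB_SUM_16BYTES step: widen the
 * absolute byte differences of a 16-byte row and add them to 16-bit lane
 * accumulators (uabdl/uabal above). */
static void abs_sub_sum_16bytes_ref(const uint8_t *cur, const uint8_t *ref,
                                    uint16_t acc[16]) {
  for (int i = 0; i < 16; i++)
    acc[i] += (uint16_t)abs((int)cur[i] - (int)ref[i]);
}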
/*
* void vaa_calc_sad_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height, int32_t pic_stride,