Avoid unnecessary tabs in macro declarations
parent 1884c06652
commit ac03b8b503
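All hunks below are whitespace-only: each `-`/`+` pair is the same `.macro` or `%macro` declaration before and after the spacing change, so the two lines render identically once the whitespace is collapsed. A minimal sketch of the kind of change involved, assuming (per the commit title) that the removed whitespace was a tab, shown here as <TAB>:

.macro<TAB>LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5  // before: tab after ".macro"
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5      // after: single space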
@@ -35,7 +35,7 @@
 #include "arm_arch_common_macro.S"

 #ifdef __APPLE__
-.macro LOAD_ALIGNED_DATA_WITH_STRIDE
+.macro LOAD_ALIGNED_DATA_WITH_STRIDE
 // { // input: $0~$3, src*, src_stride
 vld1.64 {$0}, [$4,:128], $5
 vld1.64 {$1}, [$4,:128], $5
@@ -44,7 +44,7 @@
 // }
 .endm

-.macro STORE_ALIGNED_DATA_WITH_STRIDE
+.macro STORE_ALIGNED_DATA_WITH_STRIDE
 // { // input: $0~$3, dst*, dst_stride
 vst1.64 {$0}, [$4,:128], $5
 vst1.64 {$1}, [$4,:128], $5
@@ -53,7 +53,7 @@
 // }
 .endm

-.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
 // { // input: $0~$3, src*, src_stride
 vld1.64 {$0}, [$4], $5
 vld1.64 {$1}, [$4], $5
@@ -62,7 +62,7 @@
 // }
 .endm

-.macro STORE_UNALIGNED_DATA_WITH_STRIDE
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE
 // { // input: $0~$3, dst*, dst_stride
 vst1.64 {$0}, [$4], $5
 vst1.64 {$1}, [$4], $5
@@ -71,7 +71,7 @@
 // }
 .endm
 #else
-.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
 // { // input: \arg0~\arg3, src*, src_stride
 vld1.64 {\arg0}, [\arg4,:128], \arg5
 vld1.64 {\arg1}, [\arg4,:128], \arg5
@@ -80,7 +80,7 @@
 // }
 .endm

-.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
 // { // input: \arg0~\arg3, dst*, dst_stride
 vst1.64 {\arg0}, [\arg4,:128], \arg5
 vst1.64 {\arg1}, [\arg4,:128], \arg5
@@ -89,7 +89,7 @@
 // }
 .endm

-.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
 // { // input: \arg0~\arg3, src*, src_stride
 vld1.64 {\arg0}, [\arg4], \arg5
 vld1.64 {\arg1}, [\arg4], \arg5
@@ -98,7 +98,7 @@
 // }
 .endm

-.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
 // { // input: \arg0~\arg3, dst*, dst_stride
 vst1.64 {\arg0}, [\arg4], \arg5
 vst1.64 {\arg1}, [\arg4], \arg5
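Each file in this commit carries the same `#ifdef __APPLE__` split seen above: Apple's assembler supports only positional macro arguments, written `$0`–`$9`, while GNU as takes named parameters declared on the `.macro` line and substituted with a backslash. A minimal illustration of the two dialects (the `COPY_D` macro is hypothetical, not part of this commit):

#ifdef __APPLE__
.macro COPY_D               // Apple as: parameters are implicit, referenced as $0, $1
    vmov $0, $1
.endm
#else
.macro COPY_D arg0, arg1    // GNU as: parameters are named, referenced as \arg0, \arg1
    vmov \arg0, \arg1
.endm
#endif

Either way, `COPY_D d0, d1` expands to `vmov d0, d1`.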
@@ -36,14 +36,14 @@
 #include "arm_arch_common_macro.S"

 #ifdef __APPLE__
-.macro JMP_IF_128BITS_IS_ZERO
+.macro JMP_IF_128BITS_IS_ZERO
 vorr.s16 $2, $0, $1
 vmov r3, r2, $2
 orr r3, r3, r2
 cmp r3, #0
 .endm

-.macro MASK_MATRIX
+.macro MASK_MATRIX
 vabd.u8 $6, $1, $2
 vcgt.u8 $6, $4, $6

@@ -57,7 +57,7 @@
 .endm


-.macro DIFF_LUMA_LT4_P1_Q1
+.macro DIFF_LUMA_LT4_P1_Q1
 vmov.i8 $9, #128
 vrhadd.u8 $8, $2, $3
 vhadd.u8 $8, $0, $8
@@ -74,7 +74,7 @@
 vabs.s8 $9, $9
 .endm

-.macro DIFF_LUMA_LT4_P0_Q0
+.macro DIFF_LUMA_LT4_P0_Q0
 vsubl.u8 $5, $0, $3
 vsubl.u8 $6, $2, $1
 vshl.s16 $6, $6, #2
@@ -82,7 +82,7 @@
 vqrshrn.s16 $4, $5, #3
 .endm

-.macro DIFF_LUMA_EQ4_P2P1P0
+.macro DIFF_LUMA_EQ4_P2P1P0
 vaddl.u8 q4, $1, $2
 vaddl.u8 q5, $3, $4
 vadd.u16 q5, q4, q5
@@ -107,12 +107,12 @@
 vbsl.u8 $6, d10, d8
 .endm

-.macro DIFF_LUMA_EQ4_MASK
+.macro DIFF_LUMA_EQ4_MASK
 vmov $3, $2
 vbsl.u8 $3, $0, $1
 .endm

-.macro DIFF_CHROMA_EQ4_P0Q0
+.macro DIFF_CHROMA_EQ4_P0Q0
 vaddl.u8 $4, $0, $3
 vaddw.u8 $5, $4, $1
 vaddw.u8 $6, $4, $2
@@ -123,45 +123,45 @@
 vrshrn.u16 $8, $6, #2
 .endm

-.macro LOAD_CHROMA_DATA_4
+.macro LOAD_CHROMA_DATA_4
 vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
 vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
 .endm

-.macro STORE_CHROMA_DATA_4
+.macro STORE_CHROMA_DATA_4
 vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
 vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
 .endm

-.macro LOAD_LUMA_DATA_3
+.macro LOAD_LUMA_DATA_3
 vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
 vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
 .endm

-.macro STORE_LUMA_DATA_4
+.macro STORE_LUMA_DATA_4
 vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
 vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
 .endm

-.macro STORE_LUMA_DATA_3
+.macro STORE_LUMA_DATA_3
 vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
 vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
 .endm

-.macro EXTRACT_DELTA_INTO_TWO_PART
+.macro EXTRACT_DELTA_INTO_TWO_PART
 vcge.s8 $1, $0, #0
 vand $1, $0, $1
 vsub.s8 $0, $1, $0
 .endm
 #else
-.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
+.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
 vorr.s16 \arg2, \arg0, \arg1
 vmov r3, r2, \arg2
 orr r3, r3, r2
 cmp r3, #0
 .endm

-.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
+.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
 vabd.u8 \arg6, \arg1, \arg2
 vcgt.u8 \arg6, \arg4, \arg6

@@ -174,7 +174,7 @@
 vand.u8 \arg6, \arg6, \arg4
 .endm

-.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
 vmov.i8 \arg9, #128
 vrhadd.u8 \arg8, \arg2, \arg3
 vhadd.u8 \arg8, \arg0, \arg8
@@ -191,7 +191,7 @@
 vabs.s8 \arg9, \arg9
 .endm

-.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
 vsubl.u8 \arg5, \arg0, \arg3
 vsubl.u8 \arg6, \arg2, \arg1
 vshl.s16 \arg6, \arg6, #2
@@ -200,7 +200,7 @@
 .endm


-.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 vaddl.u8 q4, \arg1, \arg2
 vaddl.u8 q5, \arg3, \arg4
 vadd.u16 q5, q4, q5
@@ -225,12 +225,12 @@
 vbsl.u8 \arg6, d10, d8
 .endm

-.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
+.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
 vmov \arg3, \arg2
 vbsl.u8 \arg3, \arg0, \arg1
 .endm

-.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 vaddl.u8 \arg4, \arg0, \arg3
 vaddw.u8 \arg5, \arg4, \arg1
 vaddw.u8 \arg6, \arg4, \arg2
@@ -240,32 +240,32 @@
 vrshrn.u16 \arg8, \arg6, #2
 .endm

-.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
 vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
 .endm

-.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
 vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
 .endm

-.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
 vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
 vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
 .endm

-.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
 vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
 vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
 .endm

-.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
 vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
 vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
 .endm

-.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
+.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
 vcge.s8 \arg1, \arg0, #0
 vand \arg1, \arg0, \arg1
 vsub.s8 \arg0, \arg1, \arg0
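The hunks above touch the deblocking-filter helpers. The `vabd.u8`/`vcgt.u8` pairs in `MASK_MATRIX` compute absolute-difference-then-compare, consistent with the standard H.264 boundary test: a pixel pair is filtered only when |p0 - q0| < alpha, |p1 - p0| < beta and |q1 - q0| < beta.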
@@ -35,14 +35,14 @@
 #include "arm_arch_common_macro.S"

 #ifdef __APPLE__
-.macro AVERAGE_TWO_8BITS
+.macro AVERAGE_TWO_8BITS
 // { // input:dst_d, src_d A and B; working: q13
 vaddl.u8 q13, $2, $1
 vrshrn.u16 $0, q13, #1
 // }
 .endm

-.macro FILTER_6TAG_8BITS
+.macro FILTER_6TAG_8BITS
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
 vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
 vaddl.u8 q13, $2, $3 //src[0]+src[1]
@@ -53,7 +53,7 @@
 // }
 .endm

-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
+.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
 // { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
 vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
 vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
@@ -64,7 +64,7 @@
 // }
 .endm

-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
 vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
 vaddl.u8 q13, $2, $3 //src[0]+src[1]
@@ -77,7 +77,7 @@
 // }
 .endm

-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
 vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
 vaddl.u8 q13, $2, $3 //src[0]+src[1]
@@ -90,7 +90,7 @@
 // }
 .endm

-.macro FILTER_6TAG_8BITS_TO_16BITS
+.macro FILTER_6TAG_8BITS_TO_16BITS
 // { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
 vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
 vaddl.u8 q13, $2, $3 //src[0]+src[1]
@@ -100,7 +100,7 @@
 // }
 .endm

-.macro FILTER_3_IN_16BITS_TO_8BITS
+.macro FILTER_3_IN_16BITS_TO_8BITS
 // { // input:a, b, c, dst_d;
 vsub.s16 $0, $0, $1 //a-b
 vshr.s16 $0, $0, #2 //(a-b)/4
@@ -112,7 +112,7 @@
 // }
 .endm

-.macro UNPACK_2_16BITS_TO_ABC
+.macro UNPACK_2_16BITS_TO_ABC
 // { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
 vext.16 $4, $0, $1, #2 //src[0]
 vext.16 $3, $0, $1, #3 //src[1]
@@ -127,7 +127,7 @@
 // }
 .endm

-.macro UNPACK_1_IN_8x16BITS_TO_8BITS
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS
 // { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
 vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
 vrev64.16 $1, $1
@@ -145,14 +145,14 @@
 // }
 .endm
 #else
-.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
+.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
 // { // input:dst_d, src_d A and B; working: q13
 vaddl.u8 q13, \arg2, \arg1
 vrshrn.u16 \arg0, q13, #1
 // }
 .endm

-.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
 vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
 vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
@@ -163,7 +163,7 @@
 // }
 .endm

-.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
 // { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
 vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
 vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
@@ -174,7 +174,7 @@
 // }
 .endm

-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
 vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
 vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
@@ -187,7 +187,7 @@
 // }
 .endm

-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
 vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
 vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
@@ -200,7 +200,7 @@
 // }
 .endm

-.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
 vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
 vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
@@ -210,7 +210,7 @@
 // }
 .endm

-.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
+.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
 // { // input:a, b, c, dst_d;
 vsub.s16 \arg0, \arg0, \arg1 //a-b
 vshr.s16 \arg0, \arg0, #2 //(a-b)/4
@@ -222,7 +222,7 @@
 // }
 .endm

-.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
 // { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
 vext.16 \arg4, \arg0, \arg1, #2 //src[0]
 vext.16 \arg3, \arg0, \arg1, #3 //src[1]
@@ -237,7 +237,7 @@
 // }
 .endm

-.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
 // { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
 vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
 vrev64.16 \arg1, \arg1
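The FILTER_6TAG macros above are the H.264 half-pel 6-tap interpolation filter with coefficients (1, -5, 20, 20, -5, 1). The inline comments show the three symmetric pair sums being formed; the full computation, with the "multiplier a/b" arguments presumably holding the constants 20 and 5, is

    out = ((src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]) + 16) >> 5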
@@ -38,7 +38,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0

 #ifdef __APPLE__

-.macro FILTER_6TAG_8BITS1
+.macro FILTER_6TAG_8BITS1
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
 uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
@@ -49,7 +49,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS2
+.macro FILTER_6TAG_8BITS2
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
 uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
@@ -60,7 +60,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
 uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
@@ -73,7 +73,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
 uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
@@ -86,7 +86,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
 uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
@@ -99,7 +99,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
 uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
@@ -112,7 +112,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS_TO_16BITS1
+.macro FILTER_6TAG_8BITS_TO_16BITS1
 // { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
 uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3]
 uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1]
@@ -122,7 +122,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS_TO_16BITS2
+.macro FILTER_6TAG_8BITS_TO_16BITS2
 // { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
 uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3]
 uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1]
@@ -132,7 +132,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_3_IN_16BITS_TO_8BITS1
+.macro FILTER_3_IN_16BITS_TO_8BITS1
 // { // input:a, b, c, dst_d;
 sub $0.8h, $0.8h, $1.8h //a-b
 sshr $0.8h, $0.8h, #2 //(a-b)/4
@@ -144,7 +144,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_3_IN_16BITS_TO_8BITS2
+.macro FILTER_3_IN_16BITS_TO_8BITS2
 // { // input:a, b, c, dst_d;
 sub $0.8h, $0.8h, $1.8h //a-b
 sshr $0.8h, $0.8h, #2 //(a-b)/4
@@ -156,7 +156,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro UNPACK_2_16BITS_TO_ABC
+.macro UNPACK_2_16BITS_TO_ABC
 // { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
 ext $4.16b, $0.16b, $1.16b, #4 //src[0]
 ext $3.16b, $0.16b, $1.16b, #6 //src[1]
@@ -171,21 +171,21 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro AVERAGE_TWO_8BITS1
+.macro AVERAGE_TWO_8BITS1
 // { // input:dst_d, src_d A and B; working: v5
 uaddl v30.8h, $2.8b, $1.8b
 rshrn $0.8b, v30.8h, #1
 // }
 .endm

-.macro AVERAGE_TWO_8BITS2
+.macro AVERAGE_TWO_8BITS2
 // { // input:dst_d, src_d A and B; working: v5
 uaddl2 v30.8h, $2.16b, $1.16b
 rshrn2 $0.16b, v30.8h, #1
 // }
 .endm

-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
+.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
 // { // input: src_d{Y[0][1][2][3][4][5]X},
 rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O
 uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]*
@@ -195,7 +195,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
+.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
 // { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
 ext.16b $3, $1, $1, #14 // X[0][1][2][3][4][5]O
 ext.16b $4, $3, $3, #8 // [3][4][5]OX[0][1][2]
@@ -211,7 +211,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 .endm

 #else
-.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
 uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
@@ -222,7 +222,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
 uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
@@ -233,7 +233,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
 uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
@@ -246,7 +246,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
 uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
@@ -259,7 +259,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
 uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
@@ -272,7 +272,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
 uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
 uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
@@ -285,7 +285,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
 uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3]
 uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
@@ -295,7 +295,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
 uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3]
 uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
@@ -305,7 +305,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
+.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
 // { // input:a, b, c, dst_d;
 sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
 sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
@@ -317,7 +317,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
+.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
 // { // input:a, b, c, dst_d;
 sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
 sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
@@ -329,7 +329,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
 // { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
 ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0]
 ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1]
@@ -344,21 +344,21 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
+.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
 // { // input:dst_d, src_d A and B; working: v5
 uaddl v30.8h, \arg2\().8b, \arg1\().8b
 rshrn \arg0\().8b, v30.8h, #1
 // }
 .endm

-.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
+.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
 // { // input:dst_d, src_d A and B; working: v5
 uaddl2 v30.8h, \arg2\().16b, \arg1\().16b
 rshrn2 \arg0\().16b, v30.8h, #1
 // }
 .endm

-.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
 // when width=17/9, used
 // { // input: src_d{Y[0][1][2][3][4][5]X},
 rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O
@@ -369,7 +369,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
 // }
 .endm

-.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
+.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
 // { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
 ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14 // X[0][1][2][3][4][5]O
 ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8 // [3][4][5]OX[0][1][2]
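Note the `\arg0\().8b` spelling in the GNU-as (AArch64) branches above: the empty group `\()` marks the end of the macro-argument name so that the `.8b`/`.16b` arrangement specifier can be appended. A minimal sketch (the `VADD8` macro is hypothetical):

.macro VADD8 d, a, b
    add \d\().8b, \a\().8b, \b\().8b    // \() ends each argument name before ".8b"
.endm

VADD8 v0, v1, v2    // expands to: add v0.8b, v1.8b, v2.8b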
@@ -577,7 +577,7 @@ BITS 32
 %endmacro

 ;all 0 for xmm and mm
-%macro WELS_Zero 1
+%macro WELS_Zero 1
 pxor %1, %1
 %endmacro

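The x86 files use NASM macro syntax instead: `%macro name nparams` declares a macro whose parameters are referenced positionally as `%1`…`%n`. `WELS_Zero` above is the simplest case; a usage sketch:

%macro WELS_Zero 1          ; one parameter: the register to clear
    pxor %1, %1             ; xor-with-self zeroes an mmx/xmm register
%endmacro

WELS_Zero xmm0              ; expands to: pxor xmm0, xmm0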
@@ -35,7 +35,7 @@
 #include "arm_arch_common_macro.S"
 #ifdef __APPLE__

-.macro ROW_TRANSFORM_1_STEP
+.macro ROW_TRANSFORM_1_STEP
 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
 vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
 vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
@@ -46,7 +46,7 @@
 // }
 .endm

-.macro TRANSFORM_4BYTES // both row & col transform used
+.macro TRANSFORM_4BYTES // both row & col transform used
 // { // output: f_q[0]~[3], input: e_q[0]~[3];
 vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
 vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
@@ -55,7 +55,7 @@
 // }
 .endm

-.macro COL_TRANSFORM_1_STEP
+.macro COL_TRANSFORM_1_STEP
 // { // input: src_q[0]~[3], output: e_q[0]~[3];
 vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
 vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
@@ -68,7 +68,7 @@

 #else

-.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
 vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
 vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
@@ -79,7 +79,7 @@
 // }
 .endm

-.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
 // { // output: f_q[0]~[3], input: e_q[0]~[3];
 vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
 vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
@@ -88,7 +88,7 @@
 // }
 .endm

-.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 // { // input: src_q[0]~[3], output: e_q[0]~[3];
 vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
 vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
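ROW_TRANSFORM_1_STEP and TRANSFORM_4BYTES together form the 4x4 inverse-transform butterfly spelled out in the inline comments: e0 = s0 + s2 and e1 = s0 - s2 as shown, with the remaining two terms (which use the >>1 halving of the standard H.264 transform) in the elided hunk lines; then f0 = e0 + e3, f1 = e1 + e2, f2 = e1 - e2, f3 = e0 - e3.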
@@ -97,7 +97,7 @@ sse2_wd_0x02: times 8 dw 0x02
 %endmacro


-%macro LOAD_COLUMN 6
+%macro LOAD_COLUMN 6
 movd %1, [%5]
 movd %2, [%5+%6]
 punpcklbw %1, %2
@@ -143,7 +143,7 @@ sse2_wd_0x02: times 8 dw 0x02
 pshufd %2, %2, 0
 %endmacro

-%macro LOAD_COLUMN_C 6
+%macro LOAD_COLUMN_C 6
 movd %1, [%5]
 movd %2, [%5+%6]
 punpcklbw %1,%2
@@ -35,7 +35,7 @@
 #include "arm_arch_common_macro.S"

 #ifdef __APPLE__
-.macro LOAD_4x4_DATA_FOR_DCT
+.macro LOAD_4x4_DATA_FOR_DCT
 // { // input: $0~$3, src1*, src1_stride, src2*, src2_stride
 vld2.16 {$0[0],$1[0]}, [$4], $5
 vld2.16 {$2[0],$3[0]}, [$6], $7
@@ -49,7 +49,7 @@
 // }
 .endm

-.macro LOAD_8x8_DATA_FOR_DCT
+.macro LOAD_8x8_DATA_FOR_DCT
 // { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
 vld1.64 {$0}, [$8], r2
 vld1.64 {$4}, [$9], r4
@@ -63,7 +63,7 @@
 // }
 .endm

-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
 // { // input: src_d[0]~[3], working: [4]~[7]
 vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3];
 vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3];
@@ -79,7 +79,7 @@
 // }
 .endm

-.macro MATRIX_TRANSFORM_EACH_16BITS
+.macro MATRIX_TRANSFORM_EACH_16BITS
 // { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
 vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
 vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
@@ -88,7 +88,7 @@
 // }
 .endm

-.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
+.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
 // { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
 veor.s16 $6, $6 // init 0 , and keep 0;
 vaba.s16 $1, $0, $6 // f + abs(coef - 0)
@@ -106,7 +106,7 @@
 // }
 .endm

-.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
+.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
 // { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
 veor.s16 $6, $6 // init 0 , and keep 0;
 vaba.s16 $1, $0, $6 // f + abs(coef - 0)
@@ -125,7 +125,7 @@
 // }
 .endm

-.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
+.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
 // { // input: coef, ff (dst), mf , working_d (all 0), working_q
 vaba.s16 $1, $0, $3 // f + abs(coef - 0)
 vmull.s16 $4, $1, $2 // *= mf
@@ -139,7 +139,7 @@
 // }
 .endm

-.macro DC_ZERO_COUNT_IN_DUALWORD
+.macro DC_ZERO_COUNT_IN_DUALWORD
 // { // input: coef, dst_d, working_d (all 0x01)
 vceq.s16 $1, $0, #0
 vand.s16 $1, $2
@@ -148,7 +148,7 @@
 // }
 .endm

-.macro SELECT_MAX_IN_ABS_COEF
+.macro SELECT_MAX_IN_ABS_COEF
 // { // input: coef_0, coef_1, max_q (identy to follow two)
 vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4
 vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3]
@@ -156,7 +156,7 @@
 // }
 .endm

-.macro ZERO_COUNT_IN_2_QUARWORD
+.macro ZERO_COUNT_IN_2_QUARWORD
 // { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
 vceq.s16 $0, #0
 vceq.s16 $1, #0
@@ -171,7 +171,7 @@
 // }
 .endm

-.macro HDM_QUANT_2x2_TOTAL_16BITS
+.macro HDM_QUANT_2x2_TOTAL_16BITS
 // { // input: src_d[0]~[3], working_d, dst_d
 vshr.s64 $1, $0, #32
 vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
@@ -181,7 +181,7 @@
 // }
 .endm

-.macro IHDM_4x4_TOTAL_16BITS
+.macro IHDM_4x4_TOTAL_16BITS
 // { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
 vshr.s64 $1, $0, #32
 vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
@@ -198,7 +198,7 @@
 // }
 .endm

-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
 // { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
 vmovl.u8 $4,$0
 vmovl.u8 $5,$1
@@ -209,7 +209,7 @@
 // }
 .endm

-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
 // { // input: src_d[0]~[3], output: e_d[0]~[3];
 vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2];
 vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2];
@@ -220,7 +220,7 @@
 // }
 .endm

-.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
+.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
 // { // output: f_q[0]~[3], input: e_q[0]~[3];
 vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
 vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
@@ -230,7 +230,7 @@
 .endm


-.macro ROW_TRANSFORM_0_STEP
+.macro ROW_TRANSFORM_0_STEP
 // { // input: src_d[0]~[3], output: e_q[0]~[3];
 vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
 vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
@@ -239,7 +239,7 @@
 // }
 .endm

-.macro ROW_TRANSFORM_1_STEP
+.macro ROW_TRANSFORM_1_STEP
 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
 vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
 vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
@@ -250,7 +250,7 @@
 // }
 .endm

-.macro TRANSFORM_4BYTES // both row & col transform used
+.macro TRANSFORM_4BYTES // both row & col transform used
 // { // output: f_q[0]~[3], input: e_q[0]~[3];
 vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
 vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
@@ -259,7 +259,7 @@
 // }
 .endm

-.macro COL_TRANSFORM_0_STEP
+.macro COL_TRANSFORM_0_STEP
 // { // input: src_q[0]~[3], output: e_q[0]~[3];
 vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
 vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
@@ -268,7 +268,7 @@
 // }
 .endm

-.macro COL_TRANSFORM_1_STEP
+.macro COL_TRANSFORM_1_STEP
 // { // input: src_q[0]~[3], output: e_q[0]~[3];
 vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
 vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
@@ -279,7 +279,7 @@
 // }
 .endm
 #else
-.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 // { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
 vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
 vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
@@ -293,7 +293,7 @@
 // }
 .endm

-.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
 // { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
 vld1.64 {\arg0}, [\arg8], r2
 vld1.64 {\arg4}, [\arg9], r4
@@ -307,7 +307,7 @@
 // }
 .endm

-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 // { // input: src_d[0]~[3], working: [4]~[7]
 vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3];
 vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3];
@@ -323,7 +323,7 @@
 // }
 .endm

-.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
+.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
 // { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
 vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
 vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
@@ -332,7 +332,7 @@
 // }
 .endm

-.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
 veor.s16 \arg6, \arg6 // init 0 , and keep 0;
 vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
@@ -350,7 +350,7 @@
 // }
 .endm

-.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
 // { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
 veor.s16 \arg6, \arg6 // init 0 , and keep 0;
 vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
@@ -369,7 +369,7 @@
 // }
 .endm

-.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
+.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
 // { // input: coef, ff (dst), mf , working_d (all 0), working_q
 vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0)
 vmull.s16 \arg4, \arg1, \arg2 // *= mf
@@ -383,7 +383,7 @@
 // }
 .endm

-.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
+.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
 // { // input: coef, dst_d, working_d (all 0x01)
 vceq.s16 \arg1, \arg0, #0
 vand.s16 \arg1, \arg2
@@ -392,7 +392,7 @@
 // }
 .endm

-.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
+.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
 // { // input: coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1
 vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4
 vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
@@ -400,7 +400,7 @@
 // }
 .endm

-.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
+.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
 // { // input: coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q
 vceq.s16 \arg0, #0
 vceq.s16 \arg1, #0
@@ -415,7 +415,7 @@
 // }
 .endm

-.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
+.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
 // { // input: src_d[0]~[3], working_d, dst_d
 vshr.s64 \arg1, \arg0, #32
 vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
@@ -425,7 +425,7 @@
 // }
 .endm

-.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
+.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
 // { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
 vshr.s64 \arg1, \arg0, #32
 vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
@@ -442,7 +442,7 @@
 // }
 .endm

-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
 // { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
 vmovl.u8 \arg4,\arg0
 vmovl.u8 \arg5,\arg1
@@ -453,7 +453,7 @@
 // }
 .endm

-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 // { // input: src_d[0]~[3], output: e_d[0]~[3];
 vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2];
 vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2];
@@ -464,7 +464,7 @@
 // }
 .endm

-.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
 // { // output: f_q[0]~[3], input: e_q[0]~[3];
 vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
 vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
@@ -474,7 +474,7 @@
 .endm


-.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 // { // input: src_d[0]~[3], output: e_q[0]~[3];
 vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
 vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
@@ -483,7 +483,7 @@
 // }
 .endm

-.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
 vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
 vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
@@ -494,7 +494,7 @@
 // }
 .endm

-.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
 // { // output: f_q[0]~[3], input: e_q[0]~[3];
 vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
 vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
@@ -503,7 +503,7 @@
 // }
 .endm

-.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 // { // input: src_q[0]~[3], output: e_q[0]~[3];
 vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
 vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
@@ -512,7 +512,7 @@
 // }
 .endm

-.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 // { // input: src_q[0]~[3], output: e_q[0]~[3];
 vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
 vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
@@ -454,7 +454,7 @@ WELS_EXTERN WelsIDctRecI16x16Dc_sse2
 movdqa %4, %1
 psubd %4, %2
 %endmacro
-%macro SSE2_Load4Col 5
+%macro SSE2_Load4Col 5
 movsx r2, WORD[%5]
 movd %1, r2d
 movsx r2, WORD[%5 + 0x20]
@@ -108,7 +108,7 @@ mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
 paddusw %1, %2
 %endmacro

-%macro LOAD_COLUMN 6
+%macro LOAD_COLUMN 6
 movd %1, [%5]
 movd %2, [%5+%6]
 punpcklbw %1, %2
@@ -155,7 +155,7 @@ mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
 pshufd %2, %2, 0
 %endmacro

-%macro LOAD_COLUMN_C 6
+%macro LOAD_COLUMN_C 6
 movd %1, [%5]
 movd %2, [%5+%6]
 punpcklbw %1,%2
@@ -56,7 +56,7 @@ sse2_20 times 8 dw 20
 ;***********************************************************************
 SECTION .text

-%macro WEIGHT_LINE 9
+%macro WEIGHT_LINE 9
 movq %2, %9
 punpcklbw %2, %7
 movdqa %8, %2
@@ -76,7 +76,7 @@ SECTION .text
 paddusw %5, %2
 %endmacro

-%macro WEIGHT_LINE1_UV 4
+%macro WEIGHT_LINE1_UV 4
 movdqa %2, %1
 punpcklbw %2, %4
 paddw %3, %2
@@ -103,7 +103,7 @@ SECTION .text
 paddw %3, %2
 %endmacro

-%macro WEIGHT_LINE2_UV 4
+%macro WEIGHT_LINE2_UV 4
 movdqa %2, %1
 punpcklbw %2, %4
 paddw %3, %2
@@ -132,7 +132,7 @@ SECTION .text
 paddw %3, %2
 %endmacro

-%macro WEIGHT_LINE3_UV 4
+%macro WEIGHT_LINE3_UV 4
 movdqa %2, %1
 punpcklbw %2, %4
 psllw %2, 1