diff --git a/codec/common/arm/copy_mb_neon.S b/codec/common/arm/copy_mb_neon.S index 39ca12d6..c565a9ee 100644 --- a/codec/common/arm/copy_mb_neon.S +++ b/codec/common/arm/copy_mb_neon.S @@ -36,75 +36,75 @@ #ifdef __APPLE__ .macro LOAD_ALIGNED_DATA_WITH_STRIDE -// { // input: $0~$3, src*, src_stride - vld1.64 {$0}, [$4,:128], $5 - vld1.64 {$1}, [$4,:128], $5 - vld1.64 {$2}, [$4,:128], $5 - vld1.64 {$3}, [$4,:128], $5 -// } +// { // input: $0~$3, src*, src_stride + vld1.64 {$0}, [$4,:128], $5 + vld1.64 {$1}, [$4,:128], $5 + vld1.64 {$2}, [$4,:128], $5 + vld1.64 {$3}, [$4,:128], $5 +// } .endm .macro STORE_ALIGNED_DATA_WITH_STRIDE -// { // input: $0~$3, dst*, dst_stride - vst1.64 {$0}, [$4,:128], $5 - vst1.64 {$1}, [$4,:128], $5 - vst1.64 {$2}, [$4,:128], $5 - vst1.64 {$3}, [$4,:128], $5 -// } +// { // input: $0~$3, dst*, dst_stride + vst1.64 {$0}, [$4,:128], $5 + vst1.64 {$1}, [$4,:128], $5 + vst1.64 {$2}, [$4,:128], $5 + vst1.64 {$3}, [$4,:128], $5 +// } .endm .macro LOAD_UNALIGNED_DATA_WITH_STRIDE -// { // input: $0~$3, src*, src_stride - vld1.64 {$0}, [$4], $5 - vld1.64 {$1}, [$4], $5 - vld1.64 {$2}, [$4], $5 - vld1.64 {$3}, [$4], $5 -// } +// { // input: $0~$3, src*, src_stride + vld1.64 {$0}, [$4], $5 + vld1.64 {$1}, [$4], $5 + vld1.64 {$2}, [$4], $5 + vld1.64 {$3}, [$4], $5 +// } .endm .macro STORE_UNALIGNED_DATA_WITH_STRIDE -// { // input: $0~$3, dst*, dst_stride - vst1.64 {$0}, [$4], $5 - vst1.64 {$1}, [$4], $5 - vst1.64 {$2}, [$4], $5 - vst1.64 {$3}, [$4], $5 -// } +// { // input: $0~$3, dst*, dst_stride + vst1.64 {$0}, [$4], $5 + vst1.64 {$1}, [$4], $5 + vst1.64 {$2}, [$4], $5 + vst1.64 {$3}, [$4], $5 +// } .endm #else .macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 -// { // input: \arg0~\arg3, src*, src_stride - vld1.64 {\arg0}, [\arg4,:128], \arg5 - vld1.64 {\arg1}, [\arg4,:128], \arg5 - vld1.64 {\arg2}, [\arg4,:128], \arg5 - vld1.64 {\arg3}, [\arg4,:128], \arg5 -// } +// { // input: \arg0~\arg3, src*, src_stride + vld1.64 {\arg0}, [\arg4,:128], \arg5 + vld1.64 {\arg1}, [\arg4,:128], \arg5 + vld1.64 {\arg2}, [\arg4,:128], \arg5 + vld1.64 {\arg3}, [\arg4,:128], \arg5 +// } .endm .macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 -// { // input: \arg0~\arg3, dst*, dst_stride - vst1.64 {\arg0}, [\arg4,:128], \arg5 - vst1.64 {\arg1}, [\arg4,:128], \arg5 - vst1.64 {\arg2}, [\arg4,:128], \arg5 - vst1.64 {\arg3}, [\arg4,:128], \arg5 -// } +// { // input: \arg0~\arg3, dst*, dst_stride + vst1.64 {\arg0}, [\arg4,:128], \arg5 + vst1.64 {\arg1}, [\arg4,:128], \arg5 + vst1.64 {\arg2}, [\arg4,:128], \arg5 + vst1.64 {\arg3}, [\arg4,:128], \arg5 +// } .endm .macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 -// { // input: \arg0~\arg3, src*, src_stride - vld1.64 {\arg0}, [\arg4], \arg5 - vld1.64 {\arg1}, [\arg4], \arg5 - vld1.64 {\arg2}, [\arg4], \arg5 - vld1.64 {\arg3}, [\arg4], \arg5 -// } +// { // input: \arg0~\arg3, src*, src_stride + vld1.64 {\arg0}, [\arg4], \arg5 + vld1.64 {\arg1}, [\arg4], \arg5 + vld1.64 {\arg2}, [\arg4], \arg5 + vld1.64 {\arg3}, [\arg4], \arg5 +// } .endm .macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5 -// { // input: \arg0~\arg3, dst*, dst_stride - vst1.64 {\arg0}, [\arg4], \arg5 - vst1.64 {\arg1}, [\arg4], \arg5 - vst1.64 {\arg2}, [\arg4], \arg5 - vst1.64 {\arg3}, [\arg4], \arg5 -// } +// { // input: \arg0~\arg3, dst*, dst_stride + vst1.64 {\arg0}, [\arg4], \arg5 + vst1.64 {\arg1}, [\arg4], \arg5 + vst1.64 {\arg2}, [\arg4], \arg5 + vst1.64 {\arg3}, [\arg4], \arg5 +// } 
.endm #endif @@ -112,89 +112,89 @@ WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon - LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 - LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon - LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 - STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 - LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3 + LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3 - STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1 + STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1 - LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 - STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 - LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3 + LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3 - STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1 + STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1 WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon - LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 - LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1 - LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 - LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1 WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon - LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1 - LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1 WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon - LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 - LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 - 
LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1 - LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 + LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3 - STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 + STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1 WELS_ASM_FUNC_END diff --git a/codec/common/arm/deblocking_neon.S b/codec/common/arm/deblocking_neon.S index 57abb83f..24eb0f7b 100644 --- a/codec/common/arm/deblocking_neon.S +++ b/codec/common/arm/deblocking_neon.S @@ -37,858 +37,858 @@ #ifdef __APPLE__ .macro JMP_IF_128BITS_IS_ZERO - vorr.s16 $2, $0, $1 - vmov r3, r2, $2 - orr r3, r3, r2 - cmp r3, #0 + vorr.s16 $2, $0, $1 + vmov r3, r2, $2 + orr r3, r3, r2 + cmp r3, #0 .endm .macro MASK_MATRIX - vabd.u8 $6, $1, $2 - vcgt.u8 $6, $4, $6 + vabd.u8 $6, $1, $2 + vcgt.u8 $6, $4, $6 - vabd.u8 $4, $0, $1 - vclt.u8 $4, $4, $5 - vand.u8 $6, $6, $4 + vabd.u8 $4, $0, $1 + vclt.u8 $4, $4, $5 + vand.u8 $6, $6, $4 - vabd.u8 $4, $3, $2 - vclt.u8 $4, $4, $5 - vand.u8 $6, $6, $4 + vabd.u8 $4, $3, $2 + vclt.u8 $4, $4, $5 + vand.u8 $6, $6, $4 .endm .macro DIFF_LUMA_LT4_P1_Q1 vmov.i8 $9, #128 - vrhadd.u8 $8, $2, $3 - vhadd.u8 $8, $0, $8 - vsub.s8 $8, $8, $9 - vsub.s8 $9, $1, $9 - vqsub.s8 $8, $8, $9 - vmax.s8 $8, $8, $5 - vmin.s8 $8, $8, $6 - vabd.u8 $9, $0, $2 - vclt.u8 $9, $9, $4 - vand.s8 $8, $8, $9 - vand.s8 $8, $8, $7 - vadd.u8 $8, $1, $8 - vabs.s8 $9, $9 + vrhadd.u8 $8, $2, $3 + vhadd.u8 $8, $0, $8 + vsub.s8 $8, $8, $9 + vsub.s8 $9, $1, $9 + vqsub.s8 $8, $8, $9 + vmax.s8 $8, $8, $5 + vmin.s8 $8, $8, $6 + vabd.u8 $9, $0, $2 + vclt.u8 $9, $9, $4 + vand.s8 $8, $8, $9 + vand.s8 $8, $8, $7 + vadd.u8 $8, $1, $8 + vabs.s8 $9, $9 .endm .macro DIFF_LUMA_LT4_P0_Q0 - vsubl.u8 $5, $0, $3 - vsubl.u8 $6, $2, $1 - vshl.s16 $6, $6, #2 - vadd.s16 $5, $5, $6 - vqrshrn.s16 $4, $5, #3 + vsubl.u8 $5, $0, $3 + vsubl.u8 $6, $2, $1 + vshl.s16 $6, $6, #2 + vadd.s16 $5, $5, $6 + vqrshrn.s16 $4, $5, #3 .endm .macro DIFF_LUMA_EQ4_P2P1P0 - vaddl.u8 q4, $1, $2 - vaddl.u8 q5, $3, $4 - vadd.u16 q5, q4, q5 + vaddl.u8 q4, $1, $2 + vaddl.u8 q5, $3, $4 + vadd.u16 q5, q4, q5 - vaddl.u8 q4, $0, $1 - vshl.u16 q4, q4, #1 - vadd.u16 q4, q5, q4 + vaddl.u8 q4, $0, $1 + vshl.u16 q4, q4, #1 + vadd.u16 q4, q5, q4 - vrshrn.u16 $0, q5, #2 - vrshrn.u16 $7, q4, #3 + vrshrn.u16 $0, q5, #2 + vrshrn.u16 $7, q4, #3 - vshl.u16 q5, q5, #1 - vsubl.u8 q4, $5, $1 - vadd.u16 q5, q4,q5 + vshl.u16 q5, q5, #1 + vsubl.u8 q4, $5, $1 + vadd.u16 q5, q4,q5 - vaddl.u8 q4, $2, $5 - vaddw.u8 q4, q4, $2 - vaddw.u8 q4, q4, $3 + vaddl.u8 q4, $2, $5 + vaddw.u8 q4, q4, $2 + vaddw.u8 q4, q4, $3 - vrshrn.u16 d10,q5, #3 - vrshrn.u16 d8, q4, #2 - vbsl.u8 $6, d10, d8 + vrshrn.u16 d10,q5, #3 + vrshrn.u16 d8, q4, #2 + vbsl.u8 $6, d10, d8 .endm .macro DIFF_LUMA_EQ4_MASK - vmov $3, $2 - vbsl.u8 $3, $0, $1 + vmov $3, $2 + vbsl.u8 $3, $0, $1 .endm .macro DIFF_CHROMA_EQ4_P0Q0 - vaddl.u8 $4, $0, $3 - vaddw.u8 $5, $4, $1 - vaddw.u8 $6, $4, $2 - vaddw.u8 $5, $5, $0 + vaddl.u8 $4, $0, $3 + vaddw.u8 $5, $4, $1 + vaddw.u8 $6, $4, $2 + vaddw.u8 $5, $5, $0 - vaddw.u8 $6, $6, $3 - vrshrn.u16 $7, $5, #2 - vrshrn.u16 $8, $6, #2 + vaddw.u8 $6, $6, $3 + vrshrn.u16 $7, $5, #2 + vrshrn.u16 $8, $6, #2 .endm .macro LOAD_CHROMA_DATA_4 - vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2 - vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2 + vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2 + vld4.u8 
{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2 .endm .macro STORE_CHROMA_DATA_4 - vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2 - vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2 + vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2 + vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2 .endm .macro LOAD_LUMA_DATA_3 - vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1 - vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1 + vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1 + vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1 .endm .macro STORE_LUMA_DATA_4 - vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1 - vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1 + vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1 + vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1 .endm .macro STORE_LUMA_DATA_3 - vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1 - vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1 + vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1 + vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1 .endm .macro EXTRACT_DELTA_INTO_TWO_PART - vcge.s8 $1, $0, #0 - vand $1, $0, $1 - vsub.s8 $0, $1, $0 + vcge.s8 $1, $0, #0 + vand $1, $0, $1 + vsub.s8 $0, $1, $0 .endm #else .macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2 - vorr.s16 \arg2, \arg0, \arg1 - vmov r3, r2, \arg2 - orr r3, r3, r2 - cmp r3, #0 + vorr.s16 \arg2, \arg0, \arg1 + vmov r3, r2, \arg2 + orr r3, r3, r2 + cmp r3, #0 .endm .macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6 - vabd.u8 \arg6, \arg1, \arg2 - vcgt.u8 \arg6, \arg4, \arg6 + vabd.u8 \arg6, \arg1, \arg2 + vcgt.u8 \arg6, \arg4, \arg6 - vabd.u8 \arg4, \arg0, \arg1 - vclt.u8 \arg4, \arg4, \arg5 - vand.u8 \arg6, \arg6, \arg4 + vabd.u8 \arg4, \arg0, \arg1 + vclt.u8 \arg4, \arg4, \arg5 + vand.u8 \arg6, \arg6, \arg4 - vabd.u8 \arg4, \arg3, \arg2 - vclt.u8 \arg4, \arg4, \arg5 - vand.u8 \arg6, \arg6, \arg4 + vabd.u8 \arg4, \arg3, \arg2 + vclt.u8 \arg4, \arg4, \arg5 + vand.u8 \arg6, \arg6, \arg4 .endm .macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 vmov.i8 \arg9, #128 - vrhadd.u8 \arg8, \arg2, \arg3 - vhadd.u8 \arg8, \arg0, \arg8 - vsub.s8 \arg8, \arg8, \arg9 - vsub.s8 \arg9, \arg1, \arg9 + vrhadd.u8 \arg8, \arg2, \arg3 + vhadd.u8 \arg8, \arg0, \arg8 + vsub.s8 \arg8, \arg8, \arg9 + vsub.s8 \arg9, \arg1, \arg9 vqsub.s8 \arg8, \arg8, \arg9 - vmax.s8 \arg8, \arg8, \arg5 - vmin.s8 \arg8, \arg8, \arg6 - vabd.u8 \arg9, \arg0, \arg2 - vclt.u8 \arg9, \arg9, \arg4 - vand.s8 \arg8, \arg8, \arg9 - vand.s8 \arg8, \arg8, \arg7 - vadd.u8 \arg8, \arg1, \arg8 - vabs.s8 \arg9, \arg9 + vmax.s8 \arg8, \arg8, \arg5 + vmin.s8 \arg8, \arg8, \arg6 + vabd.u8 \arg9, \arg0, \arg2 + vclt.u8 \arg9, \arg9, \arg4 + vand.s8 \arg8, \arg8, \arg9 + vand.s8 \arg8, \arg8, \arg7 + vadd.u8 \arg8, \arg1, \arg8 + vabs.s8 \arg9, \arg9 .endm .macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6 - vsubl.u8 \arg5, \arg0, \arg3 - vsubl.u8 \arg6, \arg2, \arg1 - vshl.s16 \arg6, \arg6, #2 - vadd.s16 \arg5, \arg5, \arg6 - vqrshrn.s16 \arg4, \arg5, #3 + vsubl.u8 \arg5, \arg0, \arg3 + vsubl.u8 \arg6, \arg2, \arg1 + vshl.s16 \arg6, \arg6, #2 + vadd.s16 \arg5, \arg5, \arg6 + vqrshrn.s16 \arg4, \arg5, #3 .endm .macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 - vaddl.u8 q4, \arg1, \arg2 - vaddl.u8 q5, \arg3, \arg4 - vadd.u16 q5, q4, q5 + vaddl.u8 q4, \arg1, \arg2 + vaddl.u8 q5, \arg3, \arg4 + vadd.u16 q5, q4, q5 - vaddl.u8 q4, \arg0, \arg1 - vshl.u16 q4, q4, #1 - vadd.u16 q4, q5, q4 + vaddl.u8 q4, \arg0, \arg1 + vshl.u16 q4, q4, #1 + vadd.u16 q4, q5, q4 - vrshrn.u16 \arg0, q5, #2 - vrshrn.u16 \arg7, q4, #3 + vrshrn.u16 \arg0, q5, #2 + vrshrn.u16 \arg7, 
q4, #3 - vshl.u16 q5, q5, #1 - vsubl.u8 q4, \arg5, \arg1 - vadd.u16 q5, q4,q5 + vshl.u16 q5, q5, #1 + vsubl.u8 q4, \arg5, \arg1 + vadd.u16 q5, q4,q5 - vaddl.u8 q4, \arg2, \arg5 - vaddw.u8 q4, q4, \arg2 - vaddw.u8 q4, q4, \arg3 + vaddl.u8 q4, \arg2, \arg5 + vaddw.u8 q4, q4, \arg2 + vaddw.u8 q4, q4, \arg3 - vrshrn.u16 d10,q5, #3 - vrshrn.u16 d8, q4, #2 - vbsl.u8 \arg6, d10, d8 + vrshrn.u16 d10,q5, #3 + vrshrn.u16 d8, q4, #2 + vbsl.u8 \arg6, d10, d8 .endm .macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3 - vmov \arg3, \arg2 - vbsl.u8 \arg3, \arg0, \arg1 + vmov \arg3, \arg2 + vbsl.u8 \arg3, \arg0, \arg1 .endm .macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 - vaddl.u8 \arg4, \arg0, \arg3 - vaddw.u8 \arg5, \arg4, \arg1 - vaddw.u8 \arg6, \arg4, \arg2 - vaddw.u8 \arg5, \arg5, \arg0 - vaddw.u8 \arg6, \arg6, \arg3 - vrshrn.u16 \arg7, \arg5, #2 - vrshrn.u16 \arg8, \arg6, #2 + vaddl.u8 \arg4, \arg0, \arg3 + vaddw.u8 \arg5, \arg4, \arg1 + vaddw.u8 \arg6, \arg4, \arg2 + vaddw.u8 \arg5, \arg5, \arg0 + vaddw.u8 \arg6, \arg6, \arg3 + vrshrn.u16 \arg7, \arg5, #2 + vrshrn.u16 \arg8, \arg6, #2 .endm .macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 - vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2 - vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2 + vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2 + vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2 .endm .macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 - vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2 - vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2 + vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2 + vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2 .endm .macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6 - vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1 - vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1 + vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1 + vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1 .endm .macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5 - vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1 - vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1 + vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1 + vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1 .endm .macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6 - vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1 - vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1 + vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1 + vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1 .endm .macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1 - vcge.s8 \arg1, \arg0, #0 - vand \arg1, \arg0, \arg1 - vsub.s8 \arg0, \arg1, \arg0 + vcge.s8 \arg1, \arg0, #0 + vand \arg1, \arg0, \arg1 + vsub.s8 \arg0, \arg1, \arg0 .endm #endif WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon - vpush {q4-q7} - vdup.u8 q11, r2 - vdup.u8 q9, r3 + vpush {q4-q7} + vdup.u8 q11, r2 + vdup.u8 q9, r3 - add r2, r1, r1, lsl #1 - sub r2, r0, r2 - vld1.u8 {q0}, [r2], r1 - vld1.u8 {q3}, [r0], r1 - vld1.u8 {q1}, [r2], r1 - vld1.u8 {q4}, [r0], r1 - vld1.u8 {q2}, [r2] - vld1.u8 {q5}, [r0] - sub r2, r2, r1 + add r2, r1, r1, lsl #1 + sub r2, r0, r2 + vld1.u8 {q0}, [r2], r1 + vld1.u8 {q3}, [r0], r1 + 
vld1.u8 {q1}, [r2], r1 + vld1.u8 {q4}, [r0], r1 + vld1.u8 {q2}, [r2] + vld1.u8 {q5}, [r0] + sub r2, r2, r1 - ldr r3, [sp, #64] - vld1.s8 {d31}, [r3] - vdup.s8 d28, d31[0] - vdup.s8 d30, d31[1] - vdup.s8 d29, d31[2] - vdup.s8 d31, d31[3] - vtrn.32 d28, d30 - vtrn.32 d29, d31 - vcge.s8 q10, q14, #0 + ldr r3, [sp, #64] + vld1.s8 {d31}, [r3] + vdup.s8 d28, d31[0] + vdup.s8 d30, d31[1] + vdup.s8 d29, d31[2] + vdup.s8 d31, d31[3] + vtrn.32 d28, d30 + vtrn.32 d29, d31 + vcge.s8 q10, q14, #0 - MASK_MATRIX q1, q2, q3, q4, q11, q9, q15 - vand.u8 q10, q10, q15 + MASK_MATRIX q1, q2, q3, q4, q11, q9, q15 + vand.u8 q10, q10, q15 - veor q15, q15 - vsub.i8 q15,q15,q14 + veor q15, q15 + vsub.i8 q15,q15,q14 - DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12 - vst1.u8 {q6}, [r2], r1 + DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12 + vst1.u8 {q6}, [r2], r1 - DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13 + DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13 - vabs.s8 q12, q12 - vabs.s8 q13, q13 - vadd.u8 q14,q14,q12 - vadd.u8 q14,q14,q13 - veor q15, q15 - vsub.i8 q15,q15,q14 + vabs.s8 q12, q12 + vabs.s8 q13, q13 + vadd.u8 q14,q14,q12 + vadd.u8 q14,q14,q13 + veor q15, q15 + vsub.i8 q15,q15,q14 - DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13 - DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13 - vmax.s8 q8, q8, q15 - vmin.s8 q8, q8, q14 - vand.s8 q8, q8, q10 - EXTRACT_DELTA_INTO_TWO_PART q8, q9 - vqadd.u8 q2, q2, q9 - vqsub.u8 q2, q2, q8 - vst1.u8 {q2}, [r2], r1 - vqsub.u8 q3, q3, q9 - vqadd.u8 q3, q3, q8 - vst1.u8 {q3}, [r2] , r1 - vst1.u8 {q7}, [r2] + DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13 + DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13 + vmax.s8 q8, q8, q15 + vmin.s8 q8, q8, q14 + vand.s8 q8, q8, q10 + EXTRACT_DELTA_INTO_TWO_PART q8, q9 + vqadd.u8 q2, q2, q9 + vqsub.u8 q2, q2, q8 + vst1.u8 {q2}, [r2], r1 + vqsub.u8 q3, q3, q9 + vqadd.u8 q3, q3, q8 + vst1.u8 {q3}, [r2] , r1 + vst1.u8 {q7}, [r2] - vpop {q4-q7} + vpop {q4-q7} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon - vpush {q4-q7} + vpush {q4-q7} - vdup.u8 q5, r2 - vdup.u8 q4, r3 + vdup.u8 q5, r2 + vdup.u8 q4, r3 - sub r3, r0, r1, lsl #2 - vld1.u8 {q8}, [r3], r1 - vld1.u8 {q12}, [r0], r1 - vld1.u8 {q9}, [r3], r1 - vld1.u8 {q13}, [r0], r1 - vld1.u8 {q10}, [r3], r1 - vld1.u8 {q14}, [r0], r1 - vld1.u8 {q11}, [r3] - vld1.u8 {q15}, [r0] - sub r3, r3, r1 , lsl #1 + sub r3, r0, r1, lsl #2 + vld1.u8 {q8}, [r3], r1 + vld1.u8 {q12}, [r0], r1 + vld1.u8 {q9}, [r3], r1 + vld1.u8 {q13}, [r0], r1 + vld1.u8 {q10}, [r3], r1 + vld1.u8 {q14}, [r0], r1 + vld1.u8 {q11}, [r3] + vld1.u8 {q15}, [r0] + sub r3, r3, r1 , lsl #1 - MASK_MATRIX q10, q11, q12, q13, q5, q4, q6 + MASK_MATRIX q10, q11, q12, q13, q5, q4, q6 - mov r2, r2, lsr #2 - add r2, r2, #2 - vdup.u8 q5, r2 - vabd.u8 q0, q11, q12 - vclt.u8 q7, q0, q5 + mov r2, r2, lsr #2 + add r2, r2, #2 + vdup.u8 q5, r2 + vabd.u8 q0, q11, q12 + vclt.u8 q7, q0, q5 - vabd.u8 q1, q9, q11 - vclt.u8 q1, q1, q4 - vand.s8 q1, q1, q7 + vabd.u8 q1, q9, q11 + vclt.u8 q1, q1, q4 + vand.s8 q1, q1, q7 - vabd.u8 q2, q14,q12 - vclt.u8 q2, q2, q4 - vand.s8 q2, q2, q7 - vand.u8 q7, q7, q6 + vabd.u8 q2, q14,q12 + vclt.u8 q2, q2, q4 + vand.s8 q2, q2, q7 + vand.u8 q7, q7, q6 - vmov q3, q1 + vmov q3, q1 - DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0 - DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1 + DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0 + DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1 - vand.u8 q3, q7, q3 - 
DIFF_LUMA_EQ4_MASK q0, q9, q3, q4 - vst1.u8 {q4}, [r3], r1 - DIFF_LUMA_EQ4_MASK q8,q10, q3, q4 - vst1.u8 {q4}, [r3], r1 - DIFF_LUMA_EQ4_MASK q1,q11, q6, q4 - vst1.u8 {q4}, [r3], r1 + vand.u8 q3, q7, q3 + DIFF_LUMA_EQ4_MASK q0, q9, q3, q4 + vst1.u8 {q4}, [r3], r1 + DIFF_LUMA_EQ4_MASK q8,q10, q3, q4 + vst1.u8 {q4}, [r3], r1 + DIFF_LUMA_EQ4_MASK q1,q11, q6, q4 + vst1.u8 {q4}, [r3], r1 - vmov q0, q2 - DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6 - DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7 + vmov q0, q2 + DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6 + DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7 - vand.u8 q0, q7, q0 - DIFF_LUMA_EQ4_MASK q2, q12, q6, q4 - vst1.u8 {q4}, [r3], r1 - DIFF_LUMA_EQ4_MASK q15, q13, q0, q4 - vst1.u8 {q4}, [r3], r1 - DIFF_LUMA_EQ4_MASK q3, q14, q0, q4 - vst1.u8 {q4}, [r3], r1 + vand.u8 q0, q7, q0 + DIFF_LUMA_EQ4_MASK q2, q12, q6, q4 + vst1.u8 {q4}, [r3], r1 + DIFF_LUMA_EQ4_MASK q15, q13, q0, q4 + vst1.u8 {q4}, [r3], r1 + DIFF_LUMA_EQ4_MASK q3, q14, q0, q4 + vst1.u8 {q4}, [r3], r1 - vpop {q4-q7} + vpop {q4-q7} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon - vpush {q4-q7} + vpush {q4-q7} - vdup.u8 q11, r2 - vdup.u8 q9, r3 + vdup.u8 q11, r2 + vdup.u8 q9, r3 - sub r2, r0, #3 - LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0 - LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1 - LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2 - LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3 - LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4 - LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5 - LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6 - LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7 + sub r2, r0, #3 + LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0 + LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1 + LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2 + LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3 + LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4 + LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5 + LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6 + LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7 - LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0 - LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1 - LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2 - LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3 - LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4 - LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5 - LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6 - LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7 + LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0 + LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1 + LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2 + LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3 + LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4 + LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5 + LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6 + LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7 - vswp d1, d2 - vswp d3, d4 - vswp d1, d4 - vswp d7, d8 - vswp d9, d10 - vswp d7, d10 + vswp d1, d2 + vswp d3, d4 + vswp d1, d4 + vswp d7, d8 + vswp d9, d10 + vswp d7, d10 - sub r0, r0, r1, lsl #4 + sub r0, r0, r1, lsl #4 - ldr r3, [sp, #64] - vld1.s8 {d31}, [r3] - vdup.s8 d28, d31[0] - vdup.s8 d30, d31[1] - vdup.s8 d29, d31[2] - vdup.s8 d31, d31[3] - vtrn.32 d28, d30 - vtrn.32 d29, d31 - vcge.s8 q10, q14, #0 + ldr r3, [sp, #64] + vld1.s8 {d31}, [r3] + vdup.s8 d28, d31[0] + vdup.s8 d30, d31[1] + vdup.s8 d29, d31[2] + vdup.s8 d31, d31[3] + vtrn.32 d28, d30 + vtrn.32 d29, d31 + vcge.s8 q10, q14, #0 - MASK_MATRIX q1, q2, q3, q4, q11, q9, q15 - vand.u8 q10, q10, q15 + MASK_MATRIX q1, q2, q3, q4, q11, q9, q15 + vand.u8 q10, q10, q15 - veor q15, q15 - vsub.i8 q15,q15,q14 + veor q15, 
q15 + vsub.i8 q15,q15,q14 - DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12 - DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13 + DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12 + DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13 - vabs.s8 q12, q12 - vabs.s8 q13, q13 - vadd.u8 q14,q14,q12 - vadd.u8 q14,q14,q13 - veor q15, q15 - vsub.i8 q15,q15,q14 + vabs.s8 q12, q12 + vabs.s8 q13, q13 + vadd.u8 q14,q14,q12 + vadd.u8 q14,q14,q13 + veor q15, q15 + vsub.i8 q15,q15,q14 - DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13 - DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13 - vmax.s8 q8, q8, q15 - vmin.s8 q8, q8, q14 - vand.s8 q8, q8, q10 - EXTRACT_DELTA_INTO_TWO_PART q8, q9 - vqadd.u8 q2, q2, q9 - vqsub.u8 q2, q2, q8 + DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13 + DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13 + vmax.s8 q8, q8, q15 + vmin.s8 q8, q8, q14 + vand.s8 q8, q8, q10 + EXTRACT_DELTA_INTO_TWO_PART q8, q9 + vqadd.u8 q2, q2, q9 + vqsub.u8 q2, q2, q8 - vqsub.u8 q3, q3, q9 - vqadd.u8 q3, q3, q8 + vqsub.u8 q3, q3, q9 + vqadd.u8 q3, q3, q8 - sub r0, #2 - add r2, r0, r1 - lsl r1, #1 + sub r0, #2 + add r2, r0, r1 + lsl r1, #1 - vmov q1, q6 - vmov q4, q7 + vmov q1, q6 + vmov q4, q7 - vswp q2, q3 - vswp d3, d6 - vswp d5, d8 + vswp q2, q3 + vswp d3, d6 + vswp d5, d8 - STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1 - STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3 - STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5 - STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7 + STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1 + STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3 + STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5 + STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7 - STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1 - STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3 - STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5 - STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7 + STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1 + STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3 + STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5 + STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7 - vpop {q4-q7} + vpop {q4-q7} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon - vpush {q4-q7} - vdup.u8 q5, r2 - vdup.u8 q4, r3 + vpush {q4-q7} + vdup.u8 q5, r2 + vdup.u8 q4, r3 - sub r3, r0, #4 // pix -= 4 + sub r3, r0, #4 // pix -= 4 - vld1.u8 {d16}, [r3], r1 - vld1.u8 {d17}, [r3], r1 - vld1.u8 {d18}, [r3], r1 - vld1.u8 {d19}, [r3], r1 - vld1.u8 {d20}, [r3], r1 - vld1.u8 {d21}, [r3], r1 - vld1.u8 {d22}, [r3], r1 - vld1.u8 {d23}, [r3], r1 - vld1.u8 {d24}, [r3], r1 - vld1.u8 {d25}, [r3], r1 - vld1.u8 {d26}, [r3], r1 - vld1.u8 {d27}, [r3], r1 - vld1.u8 {d28}, [r3], r1 - vld1.u8 {d29}, [r3], r1 - vld1.u8 {d30}, [r3], r1 - vld1.u8 {d31}, [r3], r1 + vld1.u8 {d16}, [r3], r1 + vld1.u8 {d17}, [r3], r1 + vld1.u8 {d18}, [r3], r1 + vld1.u8 {d19}, [r3], r1 + vld1.u8 {d20}, [r3], r1 + vld1.u8 {d21}, [r3], r1 + vld1.u8 {d22}, [r3], r1 + vld1.u8 {d23}, [r3], r1 + vld1.u8 {d24}, [r3], r1 + vld1.u8 {d25}, [r3], r1 + vld1.u8 {d26}, [r3], r1 + vld1.u8 {d27}, [r3], r1 + vld1.u8 {d28}, [r3], r1 + vld1.u8 {d29}, [r3], r1 + vld1.u8 {d30}, [r3], r1 + vld1.u8 {d31}, [r3], r1 - vtrn.u32 d16, d20 - vtrn.u32 d17, d21 - vtrn.u32 d18, d22 - vtrn.u32 d19, d23 - vtrn.u32 d24, d28 - vtrn.u32 d25, d29 - vtrn.u32 d26, d30 - vtrn.u32 d27, d31 + vtrn.u32 d16, d20 + vtrn.u32 d17, d21 + vtrn.u32 d18, d22 + vtrn.u32 d19, d23 + vtrn.u32 d24, d28 + vtrn.u32 d25, d29 + vtrn.u32 d26, d30 + vtrn.u32 d27, d31 - vtrn.u16 d16, d18 - vtrn.u16 d17, d19 - vtrn.u16 d20, d22 - vtrn.u16 d21, d23 - vtrn.u16 d24, d26 - vtrn.u16 d25, d27 - vtrn.u16 d28, d30 - vtrn.u16 d29, 
d31 + vtrn.u16 d16, d18 + vtrn.u16 d17, d19 + vtrn.u16 d20, d22 + vtrn.u16 d21, d23 + vtrn.u16 d24, d26 + vtrn.u16 d25, d27 + vtrn.u16 d28, d30 + vtrn.u16 d29, d31 - vtrn.u8 d16, d17 - vtrn.u8 d18, d19 - vtrn.u8 d20, d21 - vtrn.u8 d22, d23 - vtrn.u8 d24, d25 - vtrn.u8 d26, d27 - vtrn.u8 d28, d29 - vtrn.u8 d30, d31 + vtrn.u8 d16, d17 + vtrn.u8 d18, d19 + vtrn.u8 d20, d21 + vtrn.u8 d22, d23 + vtrn.u8 d24, d25 + vtrn.u8 d26, d27 + vtrn.u8 d28, d29 + vtrn.u8 d30, d31 - vswp d17, d24 - vswp d19, d26 - vswp d21, d28 - vswp d23, d30 + vswp d17, d24 + vswp d19, d26 + vswp d21, d28 + vswp d23, d30 - vswp q12, q9 - vswp q14, q11 + vswp q12, q9 + vswp q14, q11 - vswp q12, q10 - vswp q13, q11 + vswp q12, q10 + vswp q13, q11 - MASK_MATRIX q10, q11, q12, q13, q5, q4, q6 + MASK_MATRIX q10, q11, q12, q13, q5, q4, q6 - mov r2, r2, lsr #2 - add r2, r2, #2 - vdup.u8 q5, r2 - vabd.u8 q0, q11, q12 - vclt.u8 q7, q0, q5 + mov r2, r2, lsr #2 + add r2, r2, #2 + vdup.u8 q5, r2 + vabd.u8 q0, q11, q12 + vclt.u8 q7, q0, q5 - vabd.u8 q1, q9, q11 - vclt.u8 q1, q1, q4 - vand.s8 q1, q1, q7 + vabd.u8 q1, q9, q11 + vclt.u8 q1, q1, q4 + vand.s8 q1, q1, q7 - vabd.u8 q2, q14,q12 - vclt.u8 q2, q2, q4 - vand.s8 q2, q2, q7 - vand.u8 q7, q7, q6 + vabd.u8 q2, q14,q12 + vclt.u8 q2, q2, q4 + vand.s8 q2, q2, q7 + vand.u8 q7, q7, q6 - vmov q3, q1 + vmov q3, q1 - DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0 - DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1 + DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0 + DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1 - vand.u8 q3, q7, q3 - DIFF_LUMA_EQ4_MASK q0, q9, q3, q4 - vmov q9, q4 - vbsl.u8 q3, q8, q10 - DIFF_LUMA_EQ4_MASK q1,q11, q6, q8 + vand.u8 q3, q7, q3 + DIFF_LUMA_EQ4_MASK q0, q9, q3, q4 + vmov q9, q4 + vbsl.u8 q3, q8, q10 + DIFF_LUMA_EQ4_MASK q1,q11, q6, q8 - vand.u8 q7, q7, q2 + vand.u8 q7, q7, q2 - DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0 - DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1 + DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0 + DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1 - vbsl.u8 q6, q2, q12 - DIFF_LUMA_EQ4_MASK q15, q13, q7, q4 + vbsl.u8 q6, q2, q12 + DIFF_LUMA_EQ4_MASK q15, q13, q7, q4 - vbsl.u8 q7, q0, q14 + vbsl.u8 q7, q0, q14 - vmov q5, q6 - vmov q2, q9 - vmov q6, q4 - vmov q4, q8 + vmov q5, q6 + vmov q2, q9 + vmov q6, q4 + vmov q4, q8 - vswp d8, d6 - vswp d5, d7 - vswp d5, d8 - vswp d14, d12 - vswp d11, d13 - vswp d11, d14 + vswp d8, d6 + vswp d5, d7 + vswp d5, d8 + vswp d14, d12 + vswp d11, d13 + vswp d11, d14 - sub r3, r0, #3 - STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0 - STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1 - STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2 - STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3 - STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4 - STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5 - STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6 - STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7 + sub r3, r0, #3 + STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0 + STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1 + STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2 + STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3 + STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4 + STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5 + STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6 + STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7 - STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0 - STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1 - STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2 - STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3 - STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4 - STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5 - 
STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6 - STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7 + STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0 + STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1 + STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2 + STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3 + STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4 + STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5 + STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6 + STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7 - vpop {q4-q7} + vpop {q4-q7} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon - vdup.u8 q11, r3 - ldr r3, [sp, #0] + vdup.u8 q11, r3 + ldr r3, [sp, #0] - sub r0, r0, r2 , lsl #1 - sub r1, r1, r2, lsl #1 - vdup.u8 q9, r3 - ldr r3, [sp, #4] + sub r0, r0, r2 , lsl #1 + sub r1, r1, r2, lsl #1 + vdup.u8 q9, r3 + ldr r3, [sp, #4] - vld1.u8 {d0}, [r0], r2 - vld1.u8 {d1}, [r1], r2 - vld1.u8 {d2}, [r0], r2 - vld1.u8 {d3}, [r1], r2 - vld1.u8 {d4}, [r0], r2 - vld1.u8 {d5}, [r1], r2 - vld1.u8 {d6}, [r0] - vld1.u8 {d7}, [r1] + vld1.u8 {d0}, [r0], r2 + vld1.u8 {d1}, [r1], r2 + vld1.u8 {d2}, [r0], r2 + vld1.u8 {d3}, [r1], r2 + vld1.u8 {d4}, [r0], r2 + vld1.u8 {d5}, [r1], r2 + vld1.u8 {d6}, [r0] + vld1.u8 {d7}, [r1] - sub r0, r0, r2, lsl #1 - sub r1, r1, r2, lsl #1 + sub r0, r0, r2, lsl #1 + sub r1, r1, r2, lsl #1 - vld1.s8 {d31}, [r3] - vmovl.u8 q14,d31 - vshl.u64 d29,d28,#8 - vorr d28,d29 - vmov d29, d28 - veor q15, q15 - vsub.i8 q15,q15,q14 + vld1.s8 {d31}, [r3] + vmovl.u8 q14,d31 + vshl.u64 d29,d28,#8 + vorr d28,d29 + vmov d29, d28 + veor q15, q15 + vsub.i8 q15,q15,q14 - MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 + MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 - DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13 - DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13 - vmax.s8 q8, q8, q15 - vmin.s8 q8, q8, q14 + DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13 + DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13 + vmax.s8 q8, q8, q15 + vmin.s8 q8, q8, q14 - vand.s8 q8, q8, q10 - vcge.s8 q14, q14, #0 - vand.s8 q8, q8, q14 - EXTRACT_DELTA_INTO_TWO_PART q8, q10 - vqadd.u8 q1, q1, q10 - vqsub.u8 q1, q1, q8 - vst1.u8 {d2}, [r0], r2 - vst1.u8 {d3}, [r1], r2 - vqsub.u8 q2, q2, q10 - vqadd.u8 q2, q2, q8 - vst1.u8 {d4}, [r0] - vst1.u8 {d5}, [r1] + vand.s8 q8, q8, q10 + vcge.s8 q14, q14, #0 + vand.s8 q8, q8, q14 + EXTRACT_DELTA_INTO_TWO_PART q8, q10 + vqadd.u8 q1, q1, q10 + vqsub.u8 q1, q1, q8 + vst1.u8 {d2}, [r0], r2 + vst1.u8 {d3}, [r1], r2 + vqsub.u8 q2, q2, q10 + vqadd.u8 q2, q2, q8 + vst1.u8 {d4}, [r0] + vst1.u8 {d5}, [r1] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon - vpush {q4-q5} + vpush {q4-q5} - vdup.u8 q11, r3 - ldr r3, [sp, #32] + vdup.u8 q11, r3 + ldr r3, [sp, #32] - sub r0, r0, r2 , lsl #1 - sub r1, r1, r2, lsl #1 - vdup.u8 q9, r3 - vld1.u8 {d0}, [r0], r2 // q0::p1 - vld1.u8 {d1}, [r1], r2 - vld1.u8 {d2}, [r0], r2 // q1::p0 - vld1.u8 {d3}, [r1], r2 - vld1.u8 {d4}, [r0], r2 // q2::q0 - vld1.u8 {d5}, [r1], r2 - vld1.u8 {d6}, [r0] // q3::q1 - vld1.u8 {d7}, [r1] + sub r0, r0, r2 , lsl #1 + sub r1, r1, r2, lsl #1 + vdup.u8 q9, r3 + vld1.u8 {d0}, [r0], r2 // q0::p1 + vld1.u8 {d1}, [r1], r2 + vld1.u8 {d2}, [r0], r2 // q1::p0 + vld1.u8 {d3}, [r1], r2 + vld1.u8 {d4}, [r0], r2 // q2::q0 + vld1.u8 {d5}, [r1], r2 + vld1.u8 {d6}, [r0] // q3::q1 + vld1.u8 {d7}, [r1] - sub r0, r0, r2, lsl #1 // pix = [-1*src_stride] - sub r1, r1, r2, lsl #1 + sub r0, r0, r2, lsl #1 // pix = [-1*src_stride] + sub r1, r1, r2, lsl #1 - MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 + MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 - vmov q11, q10 + vmov q11, q10 - DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, 
q8, d30, d0 // Cb::p0' q0' - DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d31, d1 // Cr::p0' q0' + DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q8, d30, d0 // Cb::p0' q0' + DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d31, d1 // Cr::p0' q0' - vbsl.u8 q10, q15, q1 - vst1.u8 {d20}, [r0], r2 - vst1.u8 {d21}, [r1], r2 + vbsl.u8 q10, q15, q1 + vst1.u8 {d20}, [r0], r2 + vst1.u8 {d21}, [r1], r2 - vbsl.u8 q11, q0, q2 - vst1.u8 {d22}, [r0] - vst1.u8 {d23}, [r1] + vbsl.u8 q11, q0, q2 + vst1.u8 {d22}, [r0] + vst1.u8 {d23}, [r1] - vpop {q4-q5} + vpop {q4-q5} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon - vdup.u8 q11, r3 - ldr r3, [sp, #0] + vdup.u8 q11, r3 + ldr r3, [sp, #0] - sub r0, r0, #2 - vdup.u8 q9, r3 - ldr r3, [sp, #4] - sub r1, r1, #2 - vld1.s8 {d31}, [r3] + sub r0, r0, #2 + vdup.u8 q9, r3 + ldr r3, [sp, #4] + sub r1, r1, #2 + vld1.s8 {d31}, [r3] - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7 - vswp q1, q2 - vswp d1, d2 - vswp d6, d5 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7 + vswp q1, q2 + vswp d1, d2 + vswp d6, d5 - vmovl.u8 q14, d31 - vshl.u64 d29,d28,#8 - vorr d28,d29 - vmov d29, d28 - veor q15, q15 - vsub.i8 q15,q15,q14 + vmovl.u8 q14, d31 + vshl.u64 d29,d28,#8 + vorr d28,d29 + vmov d29, d28 + veor q15, q15 + vsub.i8 q15,q15,q14 - MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 + MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 - DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13 - DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13 - vmax.s8 q8, q8, q15 - vmin.s8 q8, q8, q14 + DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13 + DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13 + vmax.s8 q8, q8, q15 + vmin.s8 q8, q8, q14 - vand.s8 q8, q8, q10 - vcge.s8 q14, q14, #0 - vand.s8 q8, q8, q14 - EXTRACT_DELTA_INTO_TWO_PART q8, q10 - vqadd.u8 q1, q1, q10 - vqsub.u8 q1, q1, q8 - vqsub.u8 q2, q2, q10 - vqadd.u8 q2, q2, q8 + vand.s8 q8, q8, q10 + vcge.s8 q14, q14, #0 + vand.s8 q8, q8, q14 + EXTRACT_DELTA_INTO_TWO_PART q8, q10 + vqadd.u8 q1, q1, q10 + vqsub.u8 q1, q1, q8 + vqsub.u8 q2, q2, q10 + vqadd.u8 q2, q2, q8 - sub r0, r0, r2, lsl #3 - sub r1, r1, r2, lsl #3 - vswp d1, d2 - vswp d6, d5 - vswp q1, q2 + sub r0, r0, r2, lsl #3 + sub r1, r1, r2, lsl #3 + vswp d1, d2 + vswp d6, d5 + vswp q1, q2 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0 + STORE_CHROMA_DATA_4 
d0, d1, d2, d3, d4, d5, d6, d7, 1 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7 WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon - vpush {q4-q5} - vdup.u8 q11, r3 - ldr r3, [sp, #32] + vpush {q4-q5} + vdup.u8 q11, r3 + ldr r3, [sp, #32] - sub r0, r0, #2 - sub r1, r1, #2 + sub r0, r0, #2 + sub r1, r1, #2 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6 - LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7 - vswp q1, q2 - vswp d1, d2 - vswp d6, d5 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6 + LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7 + vswp q1, q2 + vswp d1, d2 + vswp d6, d5 - vdup.u8 q9, r3 - MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 - vmov q11, q10 + vdup.u8 q9, r3 + MASK_MATRIX q0, q1, q2, q3, q11, q9, q10 + vmov q11, q10 - DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10 - DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11 + DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10 + DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11 - vbsl.u8 q10, q4, q1 - vbsl.u8 q11, q5, q2 - sub r0, r0, r2, lsl #3 // pix: 0th row [-2] - sub r1, r1, r2, lsl #3 + vbsl.u8 q10, q4, q1 + vbsl.u8 q11, q5, q2 + sub r0, r0, r2, lsl #3 // pix: 0th row [-2] + sub r1, r1, r2, lsl #3 - vmov q1, q10 - vmov q2, q11 - vswp d1, d2 - vswp d6, d5 - vswp q1, q2 - // Cb:d0d1d2d3, Cr:d4d5d6d7 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6 - STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7 + vmov q1, q10 + vmov q2, q11 + vswp d1, d2 + vswp d6, d5 + vswp q1, q2 + // Cb:d0d1d2d3, Cr:d4d5d6d7 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6 + STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7 - vpop {q4-q5} + vpop {q4-q5} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon - vld1.64 {d0-d2}, [r0] + vld1.64 {d0-d2}, [r0] - vceq.s8 q0, q0, #0 - vceq.s8 d2, d2, #0 - vmvn q0, q0 - vmvn d2, d2 - vabs.s8 q0, q0 - vabs.s8 d2, d2 + vceq.s8 q0, 
q0, #0 + vceq.s8 d2, d2, #0 + vmvn q0, q0 + vmvn d2, d2 + vabs.s8 q0, q0 + vabs.s8 d2, d2 - vst1.64 {d0-d2}, [r0] + vst1.64 {d0-d2}, [r0] WELS_ASM_FUNC_END #ifdef __APPLE__ .macro BS_NZC_CHECK vld1.8 {d0,d1}, [$0] /* Arrenge the input data --- TOP */ - ands r6, $1, #2 - beq bs_nzc_check_jump0 + ands r6, $1, #2 + beq bs_nzc_check_jump0 sub r6, $0, $2, lsl #4 - sub r6, $2, lsl #3 + sub r6, $2, lsl #3 add r6, #12 vld1.32 d3[1], [r6] bs_nzc_check_jump0: vext.8 q1, q1, q0, #12 - vadd.u8 $3, q0, q1 + vadd.u8 $3, q0, q1 /* Arrenge the input data --- LEFT */ - ands r6, $1, #1 - beq bs_nzc_check_jump1 + ands r6, $1, #1 + beq bs_nzc_check_jump1 sub r6, $0, #21 - add r7, r6, #4 + add r7, r6, #4 vld1.8 d3[4], [r6] - add r6, r7, #4 + add r6, r7, #4 vld1.8 d3[5], [r7] - add r7, r6, #4 + add r7, r6, #4 vld1.8 d3[6], [r6] vld1.8 d3[7], [r7] bs_nzc_check_jump1: - vzip.8 d0, d1 - vzip.8 d0, d1 + vzip.8 d0, d1 + vzip.8 d0, d1 vext.8 q1, q1, q0, #12 - vadd.u8 $4, q0, q1 + vadd.u8 $4, q0, q1 .endm .macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6 mov r6, #4 vabd.s16 q8, $0, $1 vabd.s16 q9, $1, $2 - vdup.s16 $0, r6 + vdup.s16 $0, r6 vabd.s16 q10, $2, $3 vabd.s16 q11, $3, $4 @@ -897,7 +897,7 @@ bs_nzc_check_jump1: vcge.s16 q10, $0 vcge.s16 q11, $0 - vpadd.i16 d16, d16, d17 + vpadd.i16 d16, d16, d17 vpadd.i16 d17, d18, d19 vpadd.i16 d18, d20, d21 vpadd.i16 d19, d22, d23 @@ -910,8 +910,8 @@ bs_nzc_check_jump1: vldm $0, {q0,q1,q2,q3} /* Arrenge the input data --- TOP */ - ands r6, $1, #2 - beq bs_mv_check_jump0 + ands r6, $1, #2 + beq bs_mv_check_jump0 sub r6, $0, $2, lsl #6 add r6, #48 @@ -921,22 +921,22 @@ bs_mv_check_jump0: BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4 /* Arrenge the input data --- LEFT */ - ands r6, $1, #1 - beq bs_mv_check_jump1 + ands r6, $1, #1 + beq bs_mv_check_jump1 sub r6, $0, #52 add r7, r6, #16 - vld1.32 d8[0], [r6] - add r6, r7, #16 + vld1.32 d8[0], [r6] + add r6, r7, #16 vld1.32 d8[1], [r7] - add r7, r6, #16 + add r7, r6, #16 vld1.32 d9[0], [r6] vld1.32 d9[1], [r7] bs_mv_check_jump1: - vzip.32 q0, q2 - vzip.32 q1, q3 - vzip.32 q0, q1 + vzip.32 q0, q2 + vzip.32 q1, q3 + vzip.32 q0, q1 vzip.32 q2, q3 BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6 .endm @@ -1038,41 +1038,41 @@ bs_mv_check_jump1: WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon - stmdb sp!, {r5-r7} - vpush {q4} + stmdb sp!, {r5-r7} + vpush {q4} - ldr r5, [sp, #28] //Save BS to r5 + ldr r5, [sp, #28] //Save BS to r5 - /* Checking the nzc status */ - BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status + /* Checking the nzc status */ + BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status - /* For checking bS[I] = 2 */ - mov r6, #2 - vcgt.s8 q14, q14, #0 - vdup.u8 q0, r6 - vcgt.s8 q15, q15, #0 + /* For checking bS[I] = 2 */ + mov r6, #2 + vcgt.s8 q14, q14, #0 + vdup.u8 q0, r6 + vcgt.s8 q15, q15, #0 - vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top - vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left + vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top + vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left - /* Checking the mv status*/ - BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status + /* Checking the mv status*/ + BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status - /* For checking bS[I] = 1 */ + /* For checking bS[I] = 1 */ mov r6, #1 - vdup.u8 q0, r6 + vdup.u8 q0, r6 - vand.u8 q12, q12, q0 //q12 save 
the nzc check result all the time --- for dir is top - vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left + vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top + vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left - /* Check bS[I] is '1' or '2' */ - vmax.u8 q1, q12, q14 - vmax.u8 q0, q13, q15 + /* Check bS[I] is '1' or '2' */ + vmax.u8 q1, q12, q14 + vmax.u8 q0, q13, q15 - //vstm r5, {q0, q1} + //vstm r5, {q0, q1} vst1.32 {q0, q1}, [r5] - vpop {q4} - ldmia sp!, {r5-r7} + vpop {q4} + ldmia sp!, {r5-r7} WELS_ASM_FUNC_END #endif diff --git a/codec/common/arm/expand_picture_neon.S b/codec/common/arm/expand_picture_neon.S index 3fd61d0d..ebb9f579 100644 --- a/codec/common/arm/expand_picture_neon.S +++ b/codec/common/arm/expand_picture_neon.S @@ -37,119 +37,119 @@ WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon stmdb sp!, {r4-r8} - //Save the dst - mov r7, r0 - mov r8, r3 + //Save the dst + mov r7, r0 + mov r8, r3 - add r4, r7, r2 - sub r4, #1 + add r4, r7, r2 + sub r4, #1 //For the left and right expand _expand_picture_luma_loop2: - sub r5, r7, #32 - add r6, r4, #1 + sub r5, r7, #32 + add r6, r4, #1 - vld1.8 {d0[], d1[]}, [r7], r1 - vld1.8 {d2[], d3[]}, [r4], r1 + vld1.8 {d0[], d1[]}, [r7], r1 + vld1.8 {d2[], d3[]}, [r4], r1 - vst1.8 {q0}, [r5]! - vst1.8 {q0}, [r5] - vst1.8 {q1}, [r6]! - vst1.8 {q1}, [r6] - subs r8, #1 - bne _expand_picture_luma_loop2 + vst1.8 {q0}, [r5]! + vst1.8 {q0}, [r5] + vst1.8 {q1}, [r6]! + vst1.8 {q1}, [r6] + subs r8, #1 + bne _expand_picture_luma_loop2 - //for the top and bottom expand - add r2, #64 - sub r0, #32 - mla r4, r1, r3, r0 - sub r4, r1 + //for the top and bottom expand + add r2, #64 + sub r0, #32 + mla r4, r1, r3, r0 + sub r4, r1 _expand_picture_luma_loop0: - mov r5, #32 + mov r5, #32 mls r5, r5, r1, r0 - add r6, r4, r1 - vld1.8 {q0}, [r0]! - vld1.8 {q1}, [r4]! + add r6, r4, r1 + vld1.8 {q0}, [r0]! + vld1.8 {q1}, [r4]! - mov r8, #32 + mov r8, #32 _expand_picture_luma_loop1: - vst1.8 {q0}, [r5], r1 - vst1.8 {q1}, [r6], r1 - subs r8, #1 + vst1.8 {q0}, [r5], r1 + vst1.8 {q1}, [r6], r1 + subs r8, #1 bne _expand_picture_luma_loop1 - subs r2, #16 - bne _expand_picture_luma_loop0 + subs r2, #16 + bne _expand_picture_luma_loop0 //vldreq.32 d0, [r0] - ldmia sp!, {r4-r8} + ldmia sp!, {r4-r8} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon stmdb sp!, {r4-r9} - //Save the dst - mov r7, r0 - mov r8, r3 + //Save the dst + mov r7, r0 + mov r8, r3 - add r4, r7, r2 - sub r4, #1 + add r4, r7, r2 + sub r4, #1 //For the left and right expand _expand_picture_chroma_loop2: - sub r5, r7, #16 - add r6, r4, #1 + sub r5, r7, #16 + add r6, r4, #1 - vld1.8 {d0[], d1[]}, [r7], r1 - vld1.8 {d2[], d3[]}, [r4], r1 + vld1.8 {d0[], d1[]}, [r7], r1 + vld1.8 {d2[], d3[]}, [r4], r1 - vst1.8 {q0}, [r5] - vst1.8 {q1}, [r6] - subs r8, #1 - bne _expand_picture_chroma_loop2 + vst1.8 {q0}, [r5] + vst1.8 {q1}, [r6] + subs r8, #1 + bne _expand_picture_chroma_loop2 - //for the top and bottom expand - add r2, #32 - mov r9, r2 - bic r2, #15 - sub r0, #16 - mla r4, r1, r3, r0 - sub r4, r1 + //for the top and bottom expand + add r2, #32 + mov r9, r2 + bic r2, #15 + sub r0, #16 + mla r4, r1, r3, r0 + sub r4, r1 _expand_picture_chroma_loop0: - mov r5, #16 - mls r5, r5, r1, r0 - add r6, r4, r1 - vld1.8 {q0}, [r0]! - vld1.8 {q1}, [r4]! + mov r5, #16 + mls r5, r5, r1, r0 + add r6, r4, r1 + vld1.8 {q0}, [r0]! + vld1.8 {q1}, [r4]! 
- mov r8, #16 + mov r8, #16 _expand_picture_chroma_loop1: - vst1.8 {q0}, [r5], r1 - vst1.8 {q1}, [r6], r1 - subs r8, #1 - bne _expand_picture_chroma_loop1 + vst1.8 {q0}, [r5], r1 + vst1.8 {q1}, [r6], r1 + subs r8, #1 + bne _expand_picture_chroma_loop1 - subs r2, #16 - bne _expand_picture_chroma_loop0 + subs r2, #16 + bne _expand_picture_chroma_loop0 //vldreq.32 d0, [r0] - and r9, #15 - cmp r9, #8 - bne _expand_picture_chroma_end - mov r5, #16 - mls r5, r5, r1, r0 - add r6, r4, r1 - vld1.8 {d0}, [r0]! - vld1.8 {d2}, [r4]! - mov r8, #16 + and r9, #15 + cmp r9, #8 + bne _expand_picture_chroma_end + mov r5, #16 + mls r5, r5, r1, r0 + add r6, r4, r1 + vld1.8 {d0}, [r0]! + vld1.8 {d2}, [r4]! + mov r8, #16 _expand_picture_chroma_loop3: - vst1.8 {d0}, [r5], r1 - vst1.8 {d2}, [r6], r1 - subs r8, #1 - bne _expand_picture_chroma_loop3 + vst1.8 {d0}, [r5], r1 + vst1.8 {d2}, [r6], r1 + subs r8, #1 + bne _expand_picture_chroma_loop3 _expand_picture_chroma_end: - ldmia sp!, {r4-r9} + ldmia sp!, {r4-r9} WELS_ASM_FUNC_END #endif diff --git a/codec/common/arm/mc_neon.S b/codec/common/arm/mc_neon.S index becb7760..702069b9 100644 --- a/codec/common/arm/mc_neon.S +++ b/codec/common/arm/mc_neon.S @@ -36,2175 +36,2175 @@ #ifdef __APPLE__ .macro AVERAGE_TWO_8BITS -// { // input:dst_d, src_d A and B; working: q13 - vaddl.u8 q13, $2, $1 - vrshrn.u16 $0, q13, #1 -// } +// { // input:dst_d, src_d A and B; working: q13 + vaddl.u8 q13, $2, $1 + vrshrn.u16 $0, q13, #1 +// } .endm .macro FILTER_6TAG_8BITS -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 - vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] - vaddl.u8 q13, $2, $3 //src[0]+src[1] - vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles - vaddl.u8 q13, $1, $4 //src[-1]+src[2] - vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles - vqrshrun.s16 $6, q12, #5 -// } +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 + vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 $6, q12, #5 +// } .endm -.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used -// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}, - vrev64.8 $2, $0 // X[5][4][3][2][1][0]O - vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]* - vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32] - vpadd.s16 $0, $0, $0 - vpadd.s16 $0, $0, $0 - vqrshrun.s16 $0, $4, #5 -// } +.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used +// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}, + vrev64.8 $2, $0 // X[5][4][3][2][1][0]O + vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]* + vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32] + vpadd.s16 $0, $0, $0 + vpadd.s16 $0, $0, $0 + vqrshrun.s16 $0, $4, #5 +// } .endm .macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 - vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] - vaddl.u8 q13, $2, $3 //src[0]+src[1] - vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles - vaddl.u8 q13, $1, $4 //src[-1]+src[2] - vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles - vqrshrun.s16 $6, q12, #5 - vaddl.u8 q13, $2, $6 - vrshrn.u16 $6, q13, #1 -// } +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, 
multiplier a/b; working: q12, q13 + vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 $6, q12, #5 + vaddl.u8 q13, $2, $6 + vrshrn.u16 $6, q13, #1 +// } .endm .macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 - vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] - vaddl.u8 q13, $2, $3 //src[0]+src[1] - vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles - vaddl.u8 q13, $1, $4 //src[-1]+src[2] - vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles - vqrshrun.s16 $6, q12, #5 - vaddl.u8 q13, $3, $6 - vrshrn.u16 $6, q13, #1 -// } +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 + vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 $6, q12, #5 + vaddl.u8 q13, $3, $6 + vrshrn.u16 $6, q13, #1 +// } .endm .macro FILTER_6TAG_8BITS_TO_16BITS -// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13 - vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3] - vaddl.u8 q13, $2, $3 //src[0]+src[1] - vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles - vaddl.u8 q13, $1, $4 //src[-1]+src[2] - vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles -// } +// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13 + vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3] + vaddl.u8 q13, $2, $3 //src[0]+src[1] + vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, $1, $4 //src[-1]+src[2] + vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles +// } .endm .macro FILTER_3_IN_16BITS_TO_8BITS -// { // input:a, b, c, dst_d; - vsub.s16 $0, $0, $1 //a-b - vshr.s16 $0, $0, #2 //(a-b)/4 - vsub.s16 $0, $0, $1 //(a-b)/4-b - vadd.s16 $0, $0, $2 //(a-b)/4-b+c - vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4 - vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - vqrshrun.s16 $3, $0, #6 //(+32)>>6 -// } +// { // input:a, b, c, dst_d; + vsub.s16 $0, $0, $1 //a-b + vshr.s16 $0, $0, #2 //(a-b)/4 + vsub.s16 $0, $0, $1 //(a-b)/4-b + vadd.s16 $0, $0, $2 //(a-b)/4-b+c + vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4 + vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 $3, $0, #6 //(+32)>>6 +// } .endm .macro UNPACK_2_16BITS_TO_ABC -// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c; - vext.16 $4, $0, $1, #2 //src[0] - vext.16 $3, $0, $1, #3 //src[1] - vadd.s16 $4, $3 //c=src[0]+src[1] +// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c; + vext.16 $4, $0, $1, #2 //src[0] + vext.16 $3, $0, $1, #3 //src[1] + vadd.s16 $4, $3 //c=src[0]+src[1] - vext.16 $3, $0, $1, #1 //src[-1] - vext.16 $2, $0, $1, #4 //src[2] - vadd.s16 $3, $2 //b=src[-1]+src[2] + vext.16 $3, $0, $1, #1 //src[-1] + vext.16 $2, $0, $1, #4 //src[2] + vadd.s16 $3, $2 //b=src[-1]+src[2] - vext.16 $2, $0, $1, #5 //src[3] - vadd.s16 $2, $0 //a=src[-2]+src[3] -// } + vext.16 $2, $0, $1, #5 //src[3] + vadd.s16 $2, $0 //a=src[-2]+src[3] +// } .endm .macro UNPACK_1_IN_8x16BITS_TO_8BITS -// { 
// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd) - vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5], - vrev64.16 $1, $1 - vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5], - vshr.s64 $1, $2, #16 - vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0 +// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd) + vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5], + vrev64.16 $1, $1 + vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5], + vshr.s64 $1, $2, #16 + vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0 - vsub.s16 $0, $0, $1 //a-b - vshr.s16 $0, $0, #2 //(a-b)/4 - vsub.s16 $0, $0, $1 //(a-b)/4-b - vadd.s16 $0, $0, $2 //(a-b)/4-b+c - vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4 - vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - vqrshrun.s16 $0, $3, #6 //(+32)>>6 -// } + vsub.s16 $0, $0, $1 //a-b + vshr.s16 $0, $0, #2 //(a-b)/4 + vsub.s16 $0, $0, $1 //(a-b)/4-b + vadd.s16 $0, $0, $2 //(a-b)/4-b+c + vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4 + vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 $0, $3, #6 //(+32)>>6 +// } .endm #else .macro AVERAGE_TWO_8BITS arg0, arg1, arg2 -// { // input:dst_d, src_d A and B; working: q13 - vaddl.u8 q13, \arg2, \arg1 - vrshrn.u16 \arg0, q13, #1 -// } +// { // input:dst_d, src_d A and B; working: q13 + vaddl.u8 q13, \arg2, \arg1 + vrshrn.u16 \arg0, q13, #1 +// } .endm .macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 - vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] - vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] - vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles - vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] - vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles - vqrshrun.s16 \arg6, q12, #5 -// } +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 + vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 \arg6, q12, #5 +// } .endm -.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used -// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2} - vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O - vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]* - vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32] - vpadd.s16 \arg0, \arg0, \arg0 - vpadd.s16 \arg0, \arg0, \arg0 - vqrshrun.s16 \arg0, \arg4, #5 -// } +.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used +// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2} + vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O + vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]* + vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32] + vpadd.s16 \arg0, \arg0, \arg0 + vpadd.s16 \arg0, \arg0, \arg0 + vqrshrun.s16 \arg0, \arg4, #5 +// } .endm .macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 - vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] - vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] - vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles - vaddl.u8 q13, \arg1, \arg4 
//src[-1]+src[2] - vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles - vqrshrun.s16 \arg6, q12, #5 - vaddl.u8 q13, \arg2, \arg6 - vrshrn.u16 \arg6, q13, #1 -// } +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 + vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 \arg6, q12, #5 + vaddl.u8 q13, \arg2, \arg6 + vrshrn.u16 \arg6, q13, #1 +// } .endm .macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 - vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] - vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] - vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles - vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] - vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles - vqrshrun.s16 \arg6, q12, #5 - vaddl.u8 q13, \arg3, \arg6 - vrshrn.u16 \arg6, q13, #1 -// } +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 + vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles + vqrshrun.s16 \arg6, q12, #5 + vaddl.u8 q13, \arg3, \arg6 + vrshrn.u16 \arg6, q13, #1 +// } .endm .macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13 - vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3] - vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] - vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles - vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] - vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles -// } +// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13 + vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3] + vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1] + vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles + vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2] + vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles +// } .endm .macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3 -// { // input:a, b, c, dst_d; - vsub.s16 \arg0, \arg0, \arg1 //a-b - vshr.s16 \arg0, \arg0, #2 //(a-b)/4 - vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b - vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c - vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4 - vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6 -// } +// { // input:a, b, c, dst_d; + vsub.s16 \arg0, \arg0, \arg1 //a-b + vshr.s16 \arg0, \arg0, #2 //(a-b)/4 + vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b + vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c + vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4 + vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6 +// } .endm .macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4 -// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c; - vext.16 \arg4, \arg0, \arg1, #2 
//src[0] - vext.16 \arg3, \arg0, \arg1, #3 //src[1] - vadd.s16 \arg4, \arg3 //c=src[0]+src[1] +// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c; + vext.16 \arg4, \arg0, \arg1, #2 //src[0] + vext.16 \arg3, \arg0, \arg1, #3 //src[1] + vadd.s16 \arg4, \arg3 //c=src[0]+src[1] - vext.16 \arg3, \arg0, \arg1, #1 //src[-1] - vext.16 \arg2, \arg0, \arg1, #4 //src[2] - vadd.s16 \arg3,\arg2 //b=src[-1]+src[2] + vext.16 \arg3, \arg0, \arg1, #1 //src[-1] + vext.16 \arg2, \arg0, \arg1, #4 //src[2] + vadd.s16 \arg3,\arg2 //b=src[-1]+src[2] - vext.16 \arg2, \arg0, \arg1, #5 //src[3] - vadd.s16 \arg2, \arg0 //a=src[-2]+src[3] -// } + vext.16 \arg2, \arg0, \arg1, #5 //src[3] + vadd.s16 \arg2, \arg0 //a=src[-2]+src[3] +// } .endm .macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3 -// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd) - vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5] - vrev64.16 \arg1, \arg1 - vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5] - vshr.s64 \arg1, \arg2, #16 - vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0 +// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd) + vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5] + vrev64.16 \arg1, \arg1 + vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5] + vshr.s64 \arg1, \arg2, #16 + vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0 - vsub.s16 \arg0, \arg0, \arg1 //a-b - vshr.s16 \arg0, \arg0, #2 //(a-b)/4 - vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b - vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c - vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4 - vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6 -// } + vsub.s16 \arg0, \arg0, \arg1 //a-b + vshr.s16 \arg0, \arg0, #2 //(a-b)/4 + vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b + vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c + vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4 + vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6 +// } .endm #endif WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w16_h_mc_luma_loop: - vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] - pld [r0] - pld [r0, #16] + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] + pld [r0] + pld [r0, #16] - vext.8 q2, q0, q1, #1 //q2=src[-1] - vext.8 q3, q0, q1, #2 //q3=src[0] - vext.8 q8, q0, q1, #3 //q8=src[1] - vext.8 q9, q0, q1, #4 //q9=src[2] - vext.8 q10, q0, q1, #5 //q10=src[3] + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q8, q0, q1, #3 //q8=src[1] + vext.8 q9, q0, q1, #4 //q9=src[2] + vext.8 q10, q0, q1, #5 //q10=src[3] - FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d2, q14, q15 + FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d2, q14, q15 - FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d3, q14, q15 + FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d3, q14, q15 - sub r4, #1 - vst1.u8 {d2, d3}, [r2], r3 //write 16Byte + sub r4, #1 + vst1.u8 {d2, d3}, [r2], r3 //write 16Byte - cmp r4, #0 - bne w16_h_mc_luma_loop - pop {r4} + cmp r4, #0 + bne w16_h_mc_luma_loop + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 
q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w8_h_mc_luma_loop: - vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2] - pld [r0] + vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2] + pld [r0] - vext.8 d2, d0, d1, #1 //d2=src[-1] - vext.8 d3, d0, d1, #2 //d3=src[0] - vext.8 d4, d0, d1, #3 //d4=src[1] - vext.8 d5, d0, d1, #4 //d5=src[2] - vext.8 d6, d0, d1, #5 //d6=src[3] + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] - FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d1, q14, q15 + FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d1, q14, q15 - sub r4, #1 - vst1.u8 {d1}, [r2], r3 + sub r4, #1 + vst1.u8 {d1}, [r2], r3 - cmp r4, #0 - bne w8_h_mc_luma_loop - pop {r4} + cmp r4, #0 + bne w8_h_mc_luma_loop + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon - push {r4, r5, r6} - ldr r6, [sp, #12] + push {r4, r5, r6} + ldr r6, [sp, #12] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w4_h_mc_luma_loop: - vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] - pld [r0] - vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] - pld [r0] + vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] + pld [r0] + vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] + pld [r0] - vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] - vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] - vext.8 q3, q2, q2, #1 //src[0:6 *] - vext.8 q8, q2, q2, #2 //src[1:6 * *] + vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] + vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] + vext.8 q3, q2, q2, #1 //src[0:6 *] + vext.8 q8, q2, q2, #2 //src[1:6 * *] - vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] - vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] - vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] - vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] + vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] + vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] + vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] + vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] - FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15 + FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15 - vmov r4, r5, d1 - str r4, [r2], r3 - str r5, [r2], r3 + vmov r4, r5, d1 + str r4, [r2], r3 + str r5, [r2], r3 - sub r6, #2 - cmp r6, #0 - bne w4_h_mc_luma_loop + sub r6, #2 + cmp r6, #0 + bne w4_h_mc_luma_loop - pop {r4, r5, r6} + pop {r4, r5, r6} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w16_xy_10_mc_luma_loop: - vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] - pld [r0] - pld [r0, #16] + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] + pld [r0] + pld [r0, #16] - vext.8 q2, q0, q1, #1 //q2=src[-1] - vext.8 q3, q0, q1, #2 //q3=src[0] - vext.8 q8, q0, q1, #3 //q8=src[1] - vext.8 q9, q0, q1, #4 //q9=src[2] - vext.8 q10, q0, q1, #5 //q10=src[3] + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q8, q0, q1, #3 //q8=src[1] + vext.8 q9, q0, q1, #4 //q9=src[2] + vext.8 q10, q0, q1, #5 //q10=src[3] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d16, d18, d20, d2, q14, q15 + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d16, d18, d20, d2, q14, q15 - 
FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d17, d19, d21, d3, q14, q15 + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d17, d19, d21, d3, q14, q15 - sub r4, #1 - vst1.u8 {d2, d3}, [r2], r3 //write 16Byte + sub r4, #1 + vst1.u8 {d2, d3}, [r2], r3 //write 16Byte - cmp r4, #0 - bne w16_xy_10_mc_luma_loop - pop {r4} + cmp r4, #0 + bne w16_xy_10_mc_luma_loop + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w8_xy_10_mc_luma_loop: - vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2] - pld [r0] + vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2] + pld [r0] - vext.8 d2, d0, d1, #1 //d2=src[-1] - vext.8 d3, d0, d1, #2 //d3=src[0] - vext.8 d4, d0, d1, #3 //d4=src[1] - vext.8 d5, d0, d1, #4 //d5=src[2] - vext.8 d6, d0, d1, #5 //d6=src[3] + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15 + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15 - sub r4, #1 - vst1.u8 {d1}, [r2], r3 + sub r4, #1 + vst1.u8 {d1}, [r2], r3 - cmp r4, #0 - bne w8_xy_10_mc_luma_loop - pop {r4} + cmp r4, #0 + bne w8_xy_10_mc_luma_loop + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon - push {r4, r5, r6} - ldr r6, [sp, #12] + push {r4, r5, r6} + ldr r6, [sp, #12] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w4_xy_10_mc_luma_loop: - vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] - pld [r0] - vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] - pld [r0] + vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] + pld [r0] + vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] + pld [r0] - vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] - vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] - vext.8 q3, q2, q2, #1 //src[0:6 *] - vext.8 q8, q2, q2, #2 //src[1:6 * *] + vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] + vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] + vext.8 q3, q2, q2, #1 //src[0:6 *] + vext.8 q8, q2, q2, #2 //src[1:6 * *] - vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] - vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] - vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] - vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] + vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] + vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] + vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] + vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15 + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15 - vmov r4, r5, d1 - str r4, [r2], r3 - str r5, [r2], r3 + vmov r4, r5, d1 + str r4, [r2], r3 + str r5, [r2], r3 - sub r6, #2 - cmp r6, #0 - bne w4_xy_10_mc_luma_loop + sub r6, #2 + cmp r6, #0 + bne w4_xy_10_mc_luma_loop - pop {r4, r5, r6} + pop {r4, r5, r6} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w16_xy_30_mc_luma_loop: - vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] - 
pld [r0] - pld [r0, #16] + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2] + pld [r0] + pld [r0, #16] - vext.8 q2, q0, q1, #1 //q2=src[-1] - vext.8 q3, q0, q1, #2 //q3=src[0] - vext.8 q8, q0, q1, #3 //q8=src[1] - vext.8 q9, q0, q1, #4 //q9=src[2] - vext.8 q10, q0, q1, #5 //q10=src[3] + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q8, q0, q1, #3 //q8=src[1] + vext.8 q9, q0, q1, #4 //q9=src[2] + vext.8 q10, q0, q1, #5 //q10=src[3] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d16, d18, d20, d2, q14, q15 + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d16, d18, d20, d2, q14, q15 - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d17, d19, d21, d3, q14, q15 + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d17, d19, d21, d3, q14, q15 - sub r4, #1 - vst1.u8 {d2, d3}, [r2], r3 //write 16Byte + sub r4, #1 + vst1.u8 {d2, d3}, [r2], r3 //write 16Byte - cmp r4, #0 - bne w16_xy_30_mc_luma_loop - pop {r4} + cmp r4, #0 + bne w16_xy_30_mc_luma_loop + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w8_xy_30_mc_luma_loop: - vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2] - pld [r0] + vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2] + pld [r0] - vext.8 d2, d0, d1, #1 //d2=src[-1] - vext.8 d3, d0, d1, #2 //d3=src[0] - vext.8 d4, d0, d1, #3 //d4=src[1] - vext.8 d5, d0, d1, #4 //d5=src[2] - vext.8 d6, d0, d1, #5 //d6=src[3] + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15 + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15 - sub r4, #1 - vst1.u8 {d1}, [r2], r3 + sub r4, #1 + vst1.u8 {d1}, [r2], r3 - cmp r4, #0 - bne w8_xy_30_mc_luma_loop - pop {r4} + cmp r4, #0 + bne w8_xy_30_mc_luma_loop + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon - push {r4, r5, r6} - ldr r6, [sp, #12] + push {r4, r5, r6} + ldr r6, [sp, #12] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w4_xy_30_mc_luma_loop: - vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] - pld [r0] - vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] - pld [r0] + vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5] + pld [r0] + vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5] + pld [r0] - vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] - vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] - vext.8 q3, q2, q2, #1 //src[0:6 *] - vext.8 q8, q2, q2, #2 //src[1:6 * *] + vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6] + vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6] + vext.8 q3, q2, q2, #1 //src[0:6 *] + vext.8 q8, q2, q2, #2 //src[1:6 * *] - vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] - vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] - vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] - vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] + vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4] + vtrn.32 d6, d7 //d6:[0:3]; d7[1:4] + vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5] + vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15 + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, 
d5, d1, q14, q15 - vmov r4, r5, d1 - str r4, [r2], r3 - str r5, [r2], r3 + vmov r4, r5, d1 + str r4, [r2], r3 + str r5, [r2], r3 - sub r6, #2 - cmp r6, #0 - bne w4_xy_30_mc_luma_loop + sub r6, #2 + cmp r6, #0 + bne w4_xy_30_mc_luma_loop - pop {r4, r5, r6} + pop {r4, r5, r6} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //q0=src[-2] - vld1.u8 {q1}, [r0], r1 //q1=src[-1] + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {q2}, [r0], r1 //q2=src[0] - vld1.u8 {q3}, [r0], r1 //q3=src[1] - vld1.u8 {q8}, [r0], r1 //q8=src[2] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q8}, [r0], r1 //q8=src[2] w16_xy_01_luma_loop: - vld1.u8 {q9}, [r0], r1 //q9=src[3] + vld1.u8 {q9}, [r0], r1 //q9=src[3] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 2nd row - vst1.u8 {q10}, [r2], r3 //write 1st 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q10}, [r2], r3 //write 1st 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15 - vld1.u8 {q1}, [r0], r1 //read 3rd row - vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d16, d18, d0, d2, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d17, d19, d1, d3, d21, q14, q15 - vld1.u8 {q2}, [r0], r1 //read 4th row - vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d16, d18, d0, d2, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d17, d19, d1, d3, d21, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d16, d18, d0, d2, d4, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d17, d19, d1, d3, d5, d21, q14, q15 - vld1.u8 {q3}, [r0], r1 //read 5th row - vst1.u8 {q10}, [r2], r3 //write 4th 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d16, d18, d0, d2, d4, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d17, d19, d1, d3, d5, d21, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q10}, [r2], r3 //write 4th 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d16, d18, d0, d2, d4, d6, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d17, d19, d1, d3, d5, d7, d21, q14, q15 - vld1.u8 {q8}, [r0], r1 //read 6th row - vst1.u8 {q10}, [r2], r3 //write 5th 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d16, d18, d0, d2, d4, d6, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d17, d19, d1, d3, d5, d7, d21, q14, 
q15 + vld1.u8 {q8}, [r0], r1 //read 6th row + vst1.u8 {q10}, [r2], r3 //write 5th 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d18, d0, d2, d4, d6, d16, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d19, d1, d3, d5, d7, d17, d21, q14, q15 - vld1.u8 {q9}, [r0], r1 //read 7th row - vst1.u8 {q10}, [r2], r3 //write 6th 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d18, d0, d2, d4, d6, d16, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d19, d1, d3, d5, d7, d17, d21, q14, q15 + vld1.u8 {q9}, [r0], r1 //read 7th row + vst1.u8 {q10}, [r2], r3 //write 6th 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 8th row - vst1.u8 {q10}, [r2], r3 //write 7th 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q10}, [r2], r3 //write 7th 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15 - vst1.u8 {q10}, [r2], r3 //write 8th 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15 + vst1.u8 {q10}, [r2], r3 //write 8th 16Byte - //q2, q3, q4, q5, q0 --> q0~q4 - vswp q0, q8 - vswp q0, q2 - vmov q1, q3 - vmov q3, q9 //q0~q4 + //q2, q3, q4, q5, q0 --> q0~q4 + vswp q0, q8 + vswp q0, q2 + vmov q1, q3 + vmov q3, q9 //q0~q4 - sub r4, #8 - cmp r4, #0 - bne w16_xy_01_luma_loop - pop {r4} + sub r4, #8 + cmp r4, #0 + bne w16_xy_01_luma_loop + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {d0}, [r0], r1 //d0=src[-2] - vld1.u8 {d1}, [r0], r1 //d1=src[-1] + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {d2}, [r0], r1 //d2=src[0] - vld1.u8 {d3}, [r0], r1 //d3=src[1] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] - vld1.u8 {d4}, [r0], r1 //d4=src[2] - vld1.u8 {d5}, [r0], r1 //d5=src[3] + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] w8_xy_01_mc_luma_loop: - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15 - vld1.u8 {d0}, [r0], r1 //read 2nd row - vst1.u8 {d16}, [r2], r3 //write 1st 8Byte + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d16}, [r2], r3 //write 1st 8Byte - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d16, q14, q15 - vld1.u8 {d1}, [r0], r1 //read 3rd row - vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d16, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15 - vld1.u8 {d2}, [r0], r1 //read 4th row - vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte 
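// Editor's note (reading aid, not part of the original patch; the surrounding hunk is a
// whitespace-only re-indentation and the instructions themselves are unchanged):
// the FILTER_6TAG_8BITS* macros used throughout these McHorVer* routines implement the
// standard H.264 6-tap luma half-pel filter followed by the quarter-pel rounding average,
// roughly:
//     half = clip8((src[-2] - 5*src[-1] + 20*src[0]
//                   + 20*src[1] - 5*src[2] + src[3] + 16) >> 5)   // vqrshrun.s16 #5
//     dst  = (half + src[0] + 1) >> 1    // *_AVERAGE_WITH_0, quarter-pel offset 1/4
//     dst  = (half + src[1] + 1) >> 1    // *_AVERAGE_WITH_1, quarter-pel offset 3/4
// which is why McHorVer01/McHorVer03 pair the vertical filter with AVERAGE_WITH_0 and
// AVERAGE_WITH_1 respectively; q14 holds the 20 tap and q15 the 5 tap.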
+ pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d16, q14, q15 - vld1.u8 {d3}, [r0], r1 //read 5th row - vst1.u8 {d16}, [r2], r3 //write 4th 8Byte + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d16, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d16}, [r2], r3 //write 4th 8Byte - //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 - vswp q0, q2 - vswp q1, q2 + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp q0, q2 + vswp q1, q2 - sub r4, #4 - cmp r4, #0 - bne w8_xy_01_mc_luma_loop + sub r4, #4 + cmp r4, #0 + bne w8_xy_01_mc_luma_loop - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon - push {r4, r5, r6, r7} - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - ldr r4, [r0], r1 //r4=src[-2] - ldr r5, [r0], r1 //r5=src[-1] + push {r4, r5, r6, r7} + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + ldr r4, [r0], r1 //r4=src[-2] + ldr r5, [r0], r1 //r5=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - ldr r6, [r0], r1 //r6=src[0] - ldr r7, [r0], r1 //r7=src[1] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + ldr r6, [r0], r1 //r6=src[0] + ldr r7, [r0], r1 //r7=src[1] - vmov d0, r4, r5 - vmov d1, r5, r6 - vmov d2, r6, r7 + vmov d0, r4, r5 + vmov d1, r5, r6 + vmov d2, r6, r7 - ldr r4, [r0], r1 //r4=src[2] - vmov d3, r7, r4 - ldr r7, [sp, #16] + ldr r4, [r0], r1 //r4=src[2] + vmov d3, r7, r4 + ldr r7, [sp, #16] w4_xy_01_mc_luma_loop: -// pld [r0] - //using reserving r4 - ldr r5, [r0], r1 //r5=src[3] - ldr r6, [r0], r1 //r6=src[0] - vmov d4, r4, r5 - vmov d5, r5, r6 //reserved r6 +// pld [r0] + //using reserving r4 + ldr r5, [r0], r1 //r5=src[3] + ldr r6, [r0], r1 //r6=src[0] + vmov d4, r4, r5 + vmov d5, r5, r6 //reserved r6 - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15 - vmov r4, r5, d16 - str r4, [r2], r3 //write 1st 4Byte - str r5, [r2], r3 //write 2nd 4Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15 + vmov r4, r5, d16 + str r4, [r2], r3 //write 1st 4Byte + str r5, [r2], r3 //write 2nd 4Byte - ldr r5, [r0], r1 //r5=src[1] - ldr r4, [r0], r1 //r4=src[2] - vmov d0, r6, r5 - vmov d1, r5, r4 //reserved r4 + ldr r5, [r0], r1 //r5=src[1] + ldr r4, [r0], r1 //r4=src[2] + vmov d0, r6, r5 + vmov d1, r5, r4 //reserved r4 - FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15 - vmov r5, r6, d16 - str r5, [r2], r3 //write 3rd 4Byte - str r6, [r2], r3 //write 4th 4Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15 + vmov r5, r6, d16 + str r5, [r2], r3 //write 3rd 4Byte + str r6, [r2], r3 //write 4th 4Byte - //d4, d5, d0, d1 --> d0, d1, d2, d3 - vmov q1, q0 - vmov q0, q2 + //d4, d5, d0, d1 --> d0, d1, d2, d3 + vmov q1, q0 + vmov q0, q2 - sub r7, #4 - cmp r7, #0 - bne w4_xy_01_mc_luma_loop + sub r7, #4 + cmp r7, #0 + bne w4_xy_01_mc_luma_loop - pop {r4, r5, r6, r7} + pop {r4, r5, r6, r7} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //q0=src[-2] - vld1.u8 {q1}, [r0], r1 //q1=src[-1] + sub r0, r0, r1, lsl #1 
//src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {q2}, [r0], r1 //q2=src[0] - vld1.u8 {q3}, [r0], r1 //q3=src[1] - vld1.u8 {q8}, [r0], r1 //q8=src[2] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q8}, [r0], r1 //q8=src[2] w16_xy_03_luma_loop: - vld1.u8 {q9}, [r0], r1 //q9=src[3] + vld1.u8 {q9}, [r0], r1 //q9=src[3] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 2nd row - vst1.u8 {q10}, [r2], r3 //write 1st 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q10}, [r2], r3 //write 1st 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15 - vld1.u8 {q1}, [r0], r1 //read 3rd row - vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d16, d18, d0, d2, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d17, d19, d1, d3, d21, q14, q15 - vld1.u8 {q2}, [r0], r1 //read 4th row - vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d16, d18, d0, d2, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d17, d19, d1, d3, d21, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d16, d18, d0, d2, d4, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d17, d19, d1, d3, d5, d21, q14, q15 - vld1.u8 {q3}, [r0], r1 //read 5th row - vst1.u8 {q10}, [r2], r3 //write 4th 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d16, d18, d0, d2, d4, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d17, d19, d1, d3, d5, d21, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q10}, [r2], r3 //write 4th 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d16, d18, d0, d2, d4, d6, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d17, d19, d1, d3, d5, d7, d21, q14, q15 - vld1.u8 {q8}, [r0], r1 //read 6th row - vst1.u8 {q10}, [r2], r3 //write 5th 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d16, d18, d0, d2, d4, d6, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d17, d19, d1, d3, d5, d7, d21, q14, q15 + vld1.u8 {q8}, [r0], r1 //read 6th row + vst1.u8 {q10}, [r2], r3 //write 5th 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d18, d0, d2, d4, d6, d16, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d19, d1, d3, d5, d7, d17, d21, q14, q15 - vld1.u8 {q9}, [r0], r1 //read 7th row - vst1.u8 {q10}, [r2], r3 //write 6th 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d18, d0, d2, d4, d6, d16, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d19, d1, d3, d5, d7, d17, d21, q14, q15 + vld1.u8 {q9}, [r0], r1 //read 7th row + vst1.u8 {q10}, [r2], r3 //write 6th 16Byte - 
FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 8th row - vst1.u8 {q10}, [r2], r3 //write 7th 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q10}, [r2], r3 //write 7th 16Byte - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15 - vst1.u8 {q10}, [r2], r3 //write 8th 16Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15 + vst1.u8 {q10}, [r2], r3 //write 8th 16Byte - //q2, q3, q8, q9, q0 --> q0~q8 - vswp q0, q8 - vswp q0, q2 - vmov q1, q3 - vmov q3, q9 //q0~q8 + //q2, q3, q8, q9, q0 --> q0~q8 + vswp q0, q8 + vswp q0, q2 + vmov q1, q3 + vmov q3, q9 //q0~q8 - sub r4, #8 - cmp r4, #0 - bne w16_xy_03_luma_loop - pop {r4} + sub r4, #8 + cmp r4, #0 + bne w16_xy_03_luma_loop + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {d0}, [r0], r1 //d0=src[-2] - vld1.u8 {d1}, [r0], r1 //d1=src[-1] + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {d2}, [r0], r1 //d2=src[0] - vld1.u8 {d3}, [r0], r1 //d3=src[1] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] - vld1.u8 {d4}, [r0], r1 //d4=src[2] - vld1.u8 {d5}, [r0], r1 //d5=src[3] + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] w8_xy_03_mc_luma_loop: - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15 - vld1.u8 {d0}, [r0], r1 //read 2nd row - vst1.u8 {d16}, [r2], r3 //write 1st 8Byte + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d16}, [r2], r3 //write 1st 8Byte - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d16, q14, q15 - vld1.u8 {d1}, [r0], r1 //read 3rd row - vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d16, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15 - vld1.u8 {d2}, [r0], r1 //read 4th row - vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte - pld [r0] - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d16, q14, q15 - vld1.u8 {d3}, [r0], r1 //read 5th row - vst1.u8 {d16}, [r2], r3 //write 4th 8Byte + pld [r0] + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d16, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d16}, [r2], r3 //write 4th 8Byte - //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 - vswp q0, q2 - vswp q1, 
q2 + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp q0, q2 + vswp q1, q2 - sub r4, #4 - cmp r4, #0 - bne w8_xy_03_mc_luma_loop + sub r4, #4 + cmp r4, #0 + bne w8_xy_03_mc_luma_loop - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon - push {r4, r5, r6, r7} - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - ldr r4, [r0], r1 //r4=src[-2] - ldr r5, [r0], r1 //r5=src[-1] + push {r4, r5, r6, r7} + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + ldr r4, [r0], r1 //r4=src[-2] + ldr r5, [r0], r1 //r5=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - ldr r6, [r0], r1 //r6=src[0] - ldr r7, [r0], r1 //r7=src[1] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + ldr r6, [r0], r1 //r6=src[0] + ldr r7, [r0], r1 //r7=src[1] - vmov d0, r4, r5 - vmov d1, r5, r6 - vmov d2, r6, r7 + vmov d0, r4, r5 + vmov d1, r5, r6 + vmov d2, r6, r7 - ldr r4, [r0], r1 //r4=src[2] - vmov d3, r7, r4 - ldr r7, [sp, #16] + ldr r4, [r0], r1 //r4=src[2] + vmov d3, r7, r4 + ldr r7, [sp, #16] w4_xy_03_mc_luma_loop: -// pld [r0] - //using reserving r4 - ldr r5, [r0], r1 //r5=src[3] - ldr r6, [r0], r1 //r6=src[0] - vmov d4, r4, r5 - vmov d5, r5, r6 //reserved r6 +// pld [r0] + //using reserving r4 + ldr r5, [r0], r1 //r5=src[3] + ldr r6, [r0], r1 //r6=src[0] + vmov d4, r4, r5 + vmov d5, r5, r6 //reserved r6 - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15 - vmov r4, r5, d16 - str r4, [r2], r3 //write 1st 4Byte - str r5, [r2], r3 //write 2nd 4Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15 + vmov r4, r5, d16 + str r4, [r2], r3 //write 1st 4Byte + str r5, [r2], r3 //write 2nd 4Byte - ldr r5, [r0], r1 //r5=src[1] - ldr r4, [r0], r1 //r4=src[2] - vmov d0, r6, r5 - vmov d1, r5, r4 //reserved r4 + ldr r5, [r0], r1 //r5=src[1] + ldr r4, [r0], r1 //r4=src[2] + vmov d0, r6, r5 + vmov d1, r5, r4 //reserved r4 - FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15 - vmov r5, r6, d16 - str r5, [r2], r3 //write 3rd 4Byte - str r6, [r2], r3 //write 4th 4Byte + FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15 + vmov r5, r6, d16 + str r5, [r2], r3 //write 3rd 4Byte + str r6, [r2], r3 //write 4th 4Byte - //d4, d5, d0, d1 --> d0, d1, d2, d3 - vmov q1, q0 - vmov q0, q2 + //d4, d5, d0, d1 --> d0, d1, d2, d3 + vmov q1, q0 + vmov q0, q2 - sub r7, #4 - cmp r7, #0 - bne w4_xy_03_mc_luma_loop + sub r7, #4 + cmp r7, #0 + bne w4_xy_03_mc_luma_loop - pop {r4, r5, r6, r7} + pop {r4, r5, r6, r7} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //q0=src[-2] - vld1.u8 {q1}, [r0], r1 //q1=src[-1] + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {q2}, [r0], r1 //q2=src[0] - vld1.u8 {q3}, [r0], r1 //q3=src[1] - vld1.u8 {q8}, [r0], r1 //q8=src[2] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q8}, [r0], r1 //q8=src[2] w16_v_mc_luma_loop: - vld1.u8 {q9}, [r0], r1 //q9=src[3] + vld1.u8 {q9}, [r0], r1 //q9=src[3] - 
FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 2nd row - vst1.u8 {q10}, [r2], r3 //write 1st 16Byte + FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q10}, [r2], r3 //write 1st 16Byte - FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15 - vld1.u8 {q1}, [r0], r1 //read 3rd row - vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte + FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte - FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15 - vld1.u8 {q2}, [r0], r1 //read 4th row - vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte + FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte - FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15 - vld1.u8 {q3}, [r0], r1 //read 5th row - vst1.u8 {q10}, [r2], r3 //write 4th 16Byte + FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q10}, [r2], r3 //write 4th 16Byte - FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15 - vld1.u8 {q8}, [r0], r1 //read 6th row - vst1.u8 {q10}, [r2], r3 //write 5th 16Byte + FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15 + vld1.u8 {q8}, [r0], r1 //read 6th row + vst1.u8 {q10}, [r2], r3 //write 5th 16Byte - FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15 - vld1.u8 {q9}, [r0], r1 //read 7th row - vst1.u8 {q10}, [r2], r3 //write 6th 16Byte + FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15 + vld1.u8 {q9}, [r0], r1 //read 7th row + vst1.u8 {q10}, [r2], r3 //write 6th 16Byte - FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 8th row - vst1.u8 {q10}, [r2], r3 //write 7th 16Byte + FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q10}, [r2], r3 //write 7th 16Byte - FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15 - vst1.u8 {q10}, [r2], r3 //write 8th 16Byte + FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15 + vst1.u8 {q10}, [r2], r3 //write 8th 16Byte - //q2, q3, q8, q9, q0 --> q0~q8 - vswp q0, q8 - vswp q0, q2 - vmov q1, q3 - vmov q3, q9 //q0~q8 + //q2, q3, q8, q9, q0 --> q0~q8 + vswp q0, q8 + vswp q0, q2 + vmov q1, q3 + vmov 
q3, q9 //q0~q8 - sub r4, #8 - cmp r4, #0 - bne w16_v_mc_luma_loop - pop {r4} + sub r4, #8 + cmp r4, #0 + bne w16_v_mc_luma_loop + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {d0}, [r0], r1 //d0=src[-2] - vld1.u8 {d1}, [r0], r1 //d1=src[-1] + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {d2}, [r0], r1 //d2=src[0] - vld1.u8 {d3}, [r0], r1 //d3=src[1] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] - vld1.u8 {d4}, [r0], r1 //d4=src[2] - vld1.u8 {d5}, [r0], r1 //d5=src[3] + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] w8_v_mc_luma_loop: - pld [r0] - FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15 - vld1.u8 {d0}, [r0], r1 //read 2nd row - vst1.u8 {d16}, [r2], r3 //write 1st 8Byte + pld [r0] + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d16}, [r2], r3 //write 1st 8Byte - pld [r0] - FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15 - vld1.u8 {d1}, [r0], r1 //read 3rd row - vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte + pld [r0] + FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte - pld [r0] - FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15 - vld1.u8 {d2}, [r0], r1 //read 4th row - vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte + pld [r0] + FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte - pld [r0] - FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15 - vld1.u8 {d3}, [r0], r1 //read 5th row - vst1.u8 {d16}, [r2], r3 //write 4th 8Byte + pld [r0] + FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d16}, [r2], r3 //write 4th 8Byte - //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 - vswp q0, q2 - vswp q1, q2 + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp q0, q2 + vswp q1, q2 - sub r4, #4 - cmp r4, #0 - bne w8_v_mc_luma_loop + sub r4, #4 + cmp r4, #0 + bne w8_v_mc_luma_loop - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon - push {r4, r5, r6, r7} - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - ldr r4, [r0], r1 //r4=src[-2] - ldr r5, [r0], r1 //r5=src[-1] + push {r4, r5, r6, r7} + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + ldr r4, [r0], r1 //r4=src[-2] + ldr r5, [r0], r1 //r5=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - ldr r6, [r0], r1 //r6=src[0] - ldr r7, [r0], r1 //r7=src[1] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + ldr r6, [r0], r1 //r6=src[0] + ldr r7, [r0], r1 //r7=src[1] - vmov d0, r4, r5 - vmov d1, r5, r6 - vmov d2, r6, r7 + vmov d0, r4, r5 + vmov d1, r5, r6 + vmov d2, r6, r7 - ldr r4, [r0], r1 //r4=src[2] - vmov d3, r7, r4 - ldr r7, [sp, #16] + ldr r4, [r0], r1 //r4=src[2] + vmov d3, r7, r4 + ldr r7, [sp, #16] w4_v_mc_luma_loop: -// pld [r0] - //using reserving r4 - ldr r5, [r0], r1 
//r5=src[3] - ldr r6, [r0], r1 //r6=src[0] - vmov d4, r4, r5 - vmov d5, r5, r6 //reserved r6 +// pld [r0] + //using reserving r4 + ldr r5, [r0], r1 //r5=src[3] + ldr r6, [r0], r1 //r6=src[0] + vmov d4, r4, r5 + vmov d5, r5, r6 //reserved r6 - FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15 - vmov r4, r5, d16 - str r4, [r2], r3 //write 1st 4Byte - str r5, [r2], r3 //write 2nd 4Byte + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15 + vmov r4, r5, d16 + str r4, [r2], r3 //write 1st 4Byte + str r5, [r2], r3 //write 2nd 4Byte - ldr r5, [r0], r1 //r5=src[1] - ldr r4, [r0], r1 //r4=src[2] - vmov d0, r6, r5 - vmov d1, r5, r4 //reserved r4 + ldr r5, [r0], r1 //r5=src[1] + ldr r4, [r0], r1 //r4=src[2] + vmov d0, r6, r5 + vmov d1, r5, r4 //reserved r4 - FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15 - vmov r5, r6, d16 - str r5, [r2], r3 //write 3rd 4Byte - str r6, [r2], r3 //write 4th 4Byte + FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15 + vmov r5, r6, d16 + str r5, [r2], r3 //write 3rd 4Byte + str r6, [r2], r3 //write 4th 4Byte - //d4, d5, d0, d1 --> d0, d1, d2, d3 - vmov q1, q0 - vmov q0, q2 + //d4, d5, d0, d1 --> d0, d1, d2, d3 + vmov q1, q0 + vmov q0, q2 - sub r7, #4 - cmp r7, #0 - bne w4_v_mc_luma_loop + sub r7, #4 + cmp r7, #0 + bne w4_v_mc_luma_loop - pop {r4, r5, r6, r7} + pop {r4, r5, r6, r7} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon - push {r4} - vpush {q4-q7} - ldr r4, [sp, #68] + push {r4} + vpush {q4-q7} + ldr r4, [sp, #68] - sub r0, #2 //src[-2] - sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] - pld [r0] - pld [r0, r1] + sub r0, #2 //src[-2] + sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2] - vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2] + vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 - vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0] - vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1] - pld [r0] - pld [r0, r1] - vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2] + vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0] + vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2] w16_hv_mc_luma_loop: - vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3] - //the 1st row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] - 
vst1.u8 {q0}, [r2], r3 //write 16Byte + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {q0}, [r2], r3 //write 16Byte - vld1.u8 {d0-d2}, [r0], r1 //read 2nd row - //the 2nd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 + vld1.u8 {d0-d2}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 - vst1.u8 {d3, d4}, [r2], r3 //write 16Byte + vst1.u8 {d3, d4}, [r2], r3 //write 16Byte - vld1.u8 {d3-d5}, [r0], r1 //read 3rd row - //the 3rd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 + vld1.u8 {d3-d5}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 - vst1.u8 {d6, d7}, [r2], r3 //write 16Byte + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 + vst1.u8 {d6, d7}, [r2], r3 //write 16Byte - vld1.u8 {d6-d8}, [r0], r1 //read 4th row - //the 4th row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 - // vertical filtered into q10/q11 - 
FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 - vst1.u8 {d9, d10}, [r2], r3 //write 16Byte + vld1.u8 {d6-d8}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 + vst1.u8 {d9, d10}, [r2], r3 //write 16Byte - //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 - vswp q0, q6 - vswp q6, q3 - vmov q5, q2 - vmov q2, q8 + //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 + vswp q0, q6 + vswp q6, q3 + vmov q5, q2 + vmov q2, q8 - vmov d20,d8 - vmov q4, q1 - vmov q1, q7 - vmov d14,d20 + vmov d20,d8 + vmov q4, q1 + vmov q1, q7 + vmov d14,d20 - sub r4, #4 - cmp r4, #0 - bne w16_hv_mc_luma_loop - vpop {q4-q7} - pop {r4} + sub r4, #4 + cmp r4, #0 + bne w16_hv_mc_luma_loop + vpop {q4-q7} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon - push {r4} - vpush {q4} - ldr r4, [sp, #20] + push {r4} + vpush {q4} + ldr r4, [sp, #20] - sub r0, #2 //src[-2] - sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] - pld [r0] - pld [r0, r1] + sub r0, #2 //src[-2] + sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //use 13(8+5), =src[-2] - vld1.u8 {q1}, [r0], r1 //use 13(8+5), =src[-1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //use 13(8+5), =src[-2] + vld1.u8 {q1}, [r0], r1 //use 13(8+5), =src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 - vld1.u8 {q2}, [r0], r1 //use 13(8+5), =src[0] - vld1.u8 {q3}, [r0], r1 //use 13(8+5), =src[1] - pld [r0] - pld [r0, r1] - vld1.u8 {q4}, [r0], r1 //use 13(8+5), =src[2] + vld1.u8 {q2}, [r0], r1 //use 13(8+5), =src[0] + vld1.u8 {q3}, [r0], r1 //use 13(8+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {q4}, [r0], r1 //use 13(8+5), =src[2] w8_hv_mc_luma_loop: - vld1.u8 {q8}, [r0], r1 //use 13(8+5), =src[3] - //the 1st row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] - vst1.u8 d18, [r2], r3 //write 8Byte + vld1.u8 {q8}, [r0], r1 //use 13(8+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] + vst1.u8 d18, [r2], r3 //write 8Byte - vld1.u8 {q0}, [r0], r1 //read 2nd row - //the 2nd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, 
d0, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] - vst1.u8 d18, [r2], r3 //write 8Byte + vld1.u8 {q0}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] + vst1.u8 d18, [r2], r3 //write 8Byte - vld1.u8 {q1}, [r0], r1 //read 3rd row - //the 3rd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] - vst1.u8 d18, [r2], r3 //write 8Byte + vld1.u8 {q1}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] + vst1.u8 d18, [r2], r3 //write 8Byte - vld1.u8 {q2}, [r0], r1 //read 4th row - //the 4th row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 5 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] - vst1.u8 d18, [r2], r3 //write 8Byte + vld1.u8 {q2}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 5 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] + vst1.u8 d18, [r2], r3 //write 8Byte - //q4~q5, q0~q2, --> q0~q4 - vswp q0, q4 - vswp q2, q4 - vmov q3, q1 - vmov q1, q8 + //q4~q5, q0~q2, --> q0~q4 + vswp q0, q4 + vswp q2, q4 + vmov q3, q1 + vmov q1, q8 - sub r4, #4 - cmp r4, #0 - bne w8_hv_mc_luma_loop - vpop {q4} - pop {r4} + sub r4, #4 + cmp r4, #0 + bne w8_hv_mc_luma_loop + vpop {q4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon - push {r4 ,r5, r6} - vpush {q4-q7} - ldr r6, [sp, #76] + push {r4 ,r5, r6} + vpush {q4-q7} + ldr r6, [sp, #76] - sub r0, #2 //src[-2] - sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] - pld [r0] - pld [r0, r1] + sub r0, #2 //src[-2] + sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2] - vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2] + vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 - vld1.u8 {q2}, [r0], r1 //use 9(4+5), 
=src[0] - vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1] - pld [r0] - pld [r0, r1] - vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2] + vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0] + vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2] w4_hv_mc_luma_loop: - vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3] - vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4] + vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3] + vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4] - //the 1st&2nd row - pld [r0] - pld [r0, r1] - // vertical filtered - FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail + //the 1st&2nd row + pld [r0] + pld [r0, r1] + // vertical filtered + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail - FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail - UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail + FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail + UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail - vmov d23, d0 - vmov d25, d14 - vmov d27, d16 + vmov d23, d0 + vmov d25, d14 + vmov d27, d16 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0] - vmov r4 ,r5, d22 - str r4, [r2], r3 //write 4Byte - str r5, [r2], r3 //write 4Byte + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0] + vmov r4 ,r5, d22 + str r4, [r2], r3 //write 4Byte + str r5, [r2], r3 //write 4Byte - //the 3rd&4th row - vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3] - vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4] - pld [r0] - pld [r0, r1] - // vertical filtered - FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail + //the 3rd&4th row + vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3] + vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4] + pld [r0] + pld [r0, r1] + // vertical filtered + FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail - FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail - UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail + FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail + UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail - vmov d23, d4 - vmov d25, d14 - vmov d27, d16 + vmov d23, d4 + vmov d25, d14 + vmov d27, d16 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0] - vmov r4 ,r5, d22 - str r4, [r2], r3 //write 4Byte - str r5, [r2], r3 //write 4Byte + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0] + vmov r4 ,r5, d22 + str r4, [r2], r3 //write 4Byte + str r5, [r2], r3 
//write 4Byte - //q4~q6, q0~q1, --> q0~q4 - vswp q4, q0 - vmov q3, q4 - vmov q4, q1 - vmov q1, q5 - vmov q2, q6 + //q4~q6, q0~q1, --> q0~q4 + vswp q4, q0 + vmov q3, q4 + vmov q4, q1 + vmov q1, q5 + vmov q2, q6 - sub r6, #4 - cmp r6, #0 - bne w4_hv_mc_luma_loop + sub r6, #4 + cmp r6, #0 + bne w4_hv_mc_luma_loop - vpop {q4-q7} - pop {r4, r5, r6} + vpop {q4-q7} + pop {r4, r5, r6} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] w16_copy_loop: - vld1.u8 {q0}, [r0], r1 - sub r4, #2 - vld1.u8 {q1}, [r0], r1 - vst1.u8 {q0}, [r2], r3 - cmp r4, #0 - vst1.u8 {q1}, [r2], r3 - bne w16_copy_loop + vld1.u8 {q0}, [r0], r1 + sub r4, #2 + vld1.u8 {q1}, [r0], r1 + vst1.u8 {q0}, [r2], r3 + cmp r4, #0 + vst1.u8 {q1}, [r2], r3 + bne w16_copy_loop - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] w8_copy_loop: - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vst1.u8 {d1}, [r2], r3 - sub r4, #2 - cmp r4, #0 - bne w8_copy_loop + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vst1.u8 {d1}, [r2], r3 + sub r4, #2 + cmp r4, #0 + bne w8_copy_loop - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon - push {r4, r5, r6} - ldr r4, [sp, #12] + push {r4, r5, r6} + ldr r4, [sp, #12] w4_copy_loop: - ldr r5, [r0], r1 - ldr r6, [r0], r1 - str r5, [r2], r3 - str r6, [r2], r3 + ldr r5, [r0], r1 + ldr r6, [r0], r1 + str r5, [r2], r3 + str r6, [r2], r3 - sub r4, #2 - cmp r4, #0 - bne w4_copy_loop + sub r4, #2 + cmp r4, #0 + bne w4_copy_loop - pop {r4, r5, r6} + pop {r4, r5, r6} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] w16_pix_avg_loop: - vld1.u8 {q0}, [r2]! - vld1.u8 {q1}, [r3]! - vld1.u8 {q2}, [r2]! - vld1.u8 {q3}, [r3]! + vld1.u8 {q0}, [r2]! + vld1.u8 {q1}, [r3]! + vld1.u8 {q2}, [r2]! + vld1.u8 {q3}, [r3]! - vld1.u8 {q8}, [r2]! - vld1.u8 {q9}, [r3]! - vld1.u8 {q10}, [r2]! - vld1.u8 {q11}, [r3]! + vld1.u8 {q8}, [r2]! + vld1.u8 {q9}, [r3]! + vld1.u8 {q10}, [r2]! + vld1.u8 {q11}, [r3]! 
- AVERAGE_TWO_8BITS d0, d0, d2 - AVERAGE_TWO_8BITS d1, d1, d3 - vst1.u8 {q0}, [r0], r1 + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {q0}, [r0], r1 - AVERAGE_TWO_8BITS d4, d4, d6 - AVERAGE_TWO_8BITS d5, d5, d7 - vst1.u8 {q2}, [r0], r1 + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {q2}, [r0], r1 - AVERAGE_TWO_8BITS d16, d16, d18 - AVERAGE_TWO_8BITS d17, d17, d19 - vst1.u8 {q8}, [r0], r1 + AVERAGE_TWO_8BITS d16, d16, d18 + AVERAGE_TWO_8BITS d17, d17, d19 + vst1.u8 {q8}, [r0], r1 - AVERAGE_TWO_8BITS d20, d20, d22 - AVERAGE_TWO_8BITS d21, d21, d23 - vst1.u8 {q10}, [r0], r1 + AVERAGE_TWO_8BITS d20, d20, d22 + AVERAGE_TWO_8BITS d21, d21, d23 + vst1.u8 {q10}, [r0], r1 - sub r4, #4 - cmp r4, #0 - bne w16_pix_avg_loop + sub r4, #4 + cmp r4, #0 + bne w16_pix_avg_loop - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon - push {r4, r5} - ldr r4, [sp, #8] - mov r5, #16 + push {r4, r5} + ldr r4, [sp, #8] + mov r5, #16 w8_pix_avg_loop: - vld1.u8 {d0}, [r2], r5 - vld1.u8 {d2}, [r3], r5 - vld1.u8 {d1}, [r2], r5 - vld1.u8 {d3}, [r3], r5 + vld1.u8 {d0}, [r2], r5 + vld1.u8 {d2}, [r3], r5 + vld1.u8 {d1}, [r2], r5 + vld1.u8 {d3}, [r3], r5 - AVERAGE_TWO_8BITS d0, d0, d2 - AVERAGE_TWO_8BITS d1, d1, d3 - vst1.u8 {d0}, [r0], r1 - vst1.u8 {d1}, [r0], r1 + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {d0}, [r0], r1 + vst1.u8 {d1}, [r0], r1 - vld1.u8 {d4}, [r2], r5 - vld1.u8 {d6}, [r3], r5 - vld1.u8 {d5}, [r2], r5 - vld1.u8 {d7}, [r3], r5 + vld1.u8 {d4}, [r2], r5 + vld1.u8 {d6}, [r3], r5 + vld1.u8 {d5}, [r2], r5 + vld1.u8 {d7}, [r3], r5 - AVERAGE_TWO_8BITS d4, d4, d6 - AVERAGE_TWO_8BITS d5, d5, d7 - vst1.u8 {d4}, [r0], r1 - vst1.u8 {d5}, [r0], r1 + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {d4}, [r0], r1 + vst1.u8 {d5}, [r0], r1 - sub r4, #4 - cmp r4, #0 - bne w8_pix_avg_loop + sub r4, #4 + cmp r4, #0 + bne w8_pix_avg_loop - pop {r4, r5} + pop {r4, r5} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon - push {r4-r8} - ldr r4, [sp, #20] + push {r4-r8} + ldr r4, [sp, #20] w4_pix_avg_loop: - ldr r5, [r2] - ldr r6, [r2, #16] - ldr r7, [r3] - ldr r8, [r3, #16] - add r2, #32 - add r3, #32 + ldr r5, [r2] + ldr r6, [r2, #16] + ldr r7, [r3] + ldr r8, [r3, #16] + add r2, #32 + add r3, #32 - vmov d0, r5, r6 - vmov d1, r7, r8 - AVERAGE_TWO_8BITS d0, d0, d1 - vmov r5, r6, d0 + vmov d0, r5, r6 + vmov d1, r7, r8 + AVERAGE_TWO_8BITS d0, d0, d1 + vmov r5, r6, d0 - str r5, [r0], r1 - str r6, [r0], r1 + str r5, [r0], r1 + str r6, [r0], r1 - sub r4, #2 - cmp r4, #0 - bne w4_pix_avg_loop + sub r4, #2 + cmp r4, #0 + bne w4_pix_avg_loop - pop {r4-r8} + pop {r4-r8} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon - push {r4, r5} - ldr r4, [sp, #8] - ldr r5, [sp, #12] -// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]} -// we can opti it by adding vert only/ hori only cases, to be continue - vld1.u8 {d31}, [r4] //load A/B/C/D - vld1.u8 {q0}, [r0], r1 //src[x] + push {r4, r5} + ldr r4, [sp, #8] + ldr r5, [sp, #12] +// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]} +// we can opti it by adding vert only/ hori only cases, to be continue + vld1.u8 {d31}, [r4] //load A/B/C/D + vld1.u8 {q0}, [r0], r1 //src[x] - vdup.u8 d28, d31[0] //A - vdup.u8 d29, d31[1] //B - vdup.u8 d30, d31[2] //C - vdup.u8 d31, d31[3] //D + vdup.u8 d28, d31[0] //A + vdup.u8 d29, d31[1] //B + vdup.u8 d30, d31[2] //C + vdup.u8 d31, d31[3] //D - vext.u8 d1, 
d0, d1, #1 //src[x+1] + vext.u8 d1, d0, d1, #1 //src[x+1] -w8_mc_chroma_loop: // each two pxl row - vld1.u8 {q1}, [r0], r1 //src[x+stride] - vld1.u8 {q2}, [r0], r1 //src[x+2*stride] - vext.u8 d3, d2, d3, #1 //src[x+stride+1] - vext.u8 d5, d4, d5, #1 //src[x+2*stride+1] +w8_mc_chroma_loop: // each two pxl row + vld1.u8 {q1}, [r0], r1 //src[x+stride] + vld1.u8 {q2}, [r0], r1 //src[x+2*stride] + vext.u8 d3, d2, d3, #1 //src[x+stride+1] + vext.u8 d5, d4, d5, #1 //src[x+2*stride+1] - vmull.u8 q3, d0, d28 //(src[x] * A) - vmlal.u8 q3, d1, d29 //+=(src[x+1] * B) - vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C) - vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D) - vrshrn.u16 d6, q3, #6 - vst1.u8 d6, [r2], r3 + vmull.u8 q3, d0, d28 //(src[x] * A) + vmlal.u8 q3, d1, d29 //+=(src[x+1] * B) + vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C) + vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D) + vrshrn.u16 d6, q3, #6 + vst1.u8 d6, [r2], r3 - vmull.u8 q3, d2, d28 //(src[x] * A) - vmlal.u8 q3, d3, d29 //+=(src[x+1] * B) - vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C) - vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D) - vrshrn.u16 d6, q3, #6 - vst1.u8 d6, [r2], r3 + vmull.u8 q3, d2, d28 //(src[x] * A) + vmlal.u8 q3, d3, d29 //+=(src[x+1] * B) + vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C) + vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D) + vrshrn.u16 d6, q3, #6 + vst1.u8 d6, [r2], r3 - vmov q0, q2 - sub r5, #2 - cmp r5, #0 - bne w8_mc_chroma_loop + vmov q0, q2 + sub r5, #2 + cmp r5, #0 + bne w8_mc_chroma_loop - pop {r4, r5} + pop {r4, r5} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon - push {r4, r5, r6} - ldr r4, [sp, #12] - ldr r6, [sp, #16] -// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]} -// we can opti it by adding vert only/ hori only cases, to be continue - vld1.u8 {d31}, [r4] //load A/B/C/D + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r6, [sp, #16] +// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]} +// we can opti it by adding vert only/ hori only cases, to be continue + vld1.u8 {d31}, [r4] //load A/B/C/D - vdup.u8 d28, d31[0] //A - vdup.u8 d29, d31[1] //B - vdup.u8 d30, d31[2] //C - vdup.u8 d31, d31[3] //D + vdup.u8 d28, d31[0] //A + vdup.u8 d29, d31[1] //B + vdup.u8 d30, d31[2] //C + vdup.u8 d31, d31[3] //D -w4_mc_chroma_loop: // each two pxl row - vld1.u8 {d0}, [r0], r1 //a::src[x] - vld1.u8 {d2}, [r0], r1 //b::src[x+stride] - vld1.u8 {d4}, [r0] //c::src[x+2*stride] +w4_mc_chroma_loop: // each two pxl row + vld1.u8 {d0}, [r0], r1 //a::src[x] + vld1.u8 {d2}, [r0], r1 //b::src[x+stride] + vld1.u8 {d4}, [r0] //c::src[x+2*stride] - vshr.u64 d1, d0, #8 - vshr.u64 d3, d2, #8 - vshr.u64 d5, d4, #8 + vshr.u64 d1, d0, #8 + vshr.u64 d3, d2, #8 + vshr.u64 d5, d4, #8 - vmov q3, q1 //b::[0:7]+b::[1~8] - vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]} - vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]} + vmov q3, q1 //b::[0:7]+b::[1~8] + vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]} + vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]} - vmull.u8 q1, d0, d28 //(src[x] * A) - vmlal.u8 q1, d1, d29 //+=(src[x+1] * B) - vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C) - vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D) + vmull.u8 q1, d0, d28 //(src[x] * A) + vmlal.u8 q1, d1, d29 //+=(src[x+1] * B) + vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C) + vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D) - vrshrn.u16 d2, q1, #6 - vmov r4, r5, d2 - str r4, [r2], r3 - str r5, [r2], r3 + vrshrn.u16 d2, q1, #6 + vmov 
r4, r5, d2 + str r4, [r2], r3 + str r5, [r2], r3 - sub r6, #2 - cmp r6, #0 - bne w4_mc_chroma_loop + sub r6, #2 + cmp r6, #0 + bne w4_mc_chroma_loop - pop {r4, r5, r6} + pop {r4, r5, r6} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon - push {r4-r5} - mov r4, #20 - mov r5, #1 - sub r4, r4, r4, lsl #(16-2) - lsl r5, #16 - ror r4, #16 - vmov d3, r5, r4 // 0x0014FFFB00010000 + push {r4-r5} + mov r4, #20 + mov r5, #1 + sub r4, r4, r4, lsl #(16-2) + lsl r5, #16 + ror r4, #16 + vmov d3, r5, r4 // 0x0014FFFB00010000 - sub r3, #16 - ldr r4, [sp, #8] + sub r3, #16 + ldr r4, [sp, #8] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w17_h_mc_luma_loop: - vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2] + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2] - vext.8 q2, q0, q1, #1 //q2=src[-1] - vext.8 q3, q0, q1, #2 //q3=src[0] - vext.8 q8, q0, q1, #3 //q8=src[1] - vext.8 q9, q0, q1, #4 //q9=src[2] - vext.8 q10, q0, q1, #5 //q10=src[3] + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q8, q0, q1, #3 //q8=src[1] + vext.8 q9, q0, q1, #4 //q9=src[2] + vext.8 q10, q0, q1, #5 //q10=src[3] - FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d22, q14, q15 + FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d22, q14, q15 - FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d23, q14, q15 + FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d23, q14, q15 - vst1.u8 {d22, d23}, [r2]! //write [0:15] Byte + vst1.u8 {d22, d23}, [r2]! //write [0:15] Byte - vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X - FILTER_SINGLE_TAG_8BITS d2, d3, d22, q11, q1 + vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X + FILTER_SINGLE_TAG_8BITS d2, d3, d22, q11, q1 - vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte - sub r4, #1 - cmp r4, #0 - bne w17_h_mc_luma_loop - pop {r4-r5} + sub r4, #1 + cmp r4, #0 + bne w17_h_mc_luma_loop + pop {r4-r5} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon - push {r4-r5} - mov r4, #20 - mov r5, #1 - sub r4, r4, r4, lsl #(16-2) - lsl r5, #16 - ror r4, #16 - vmov d7, r5, r4 // 0x0014FFFB00010000 + push {r4-r5} + mov r4, #20 + mov r5, #1 + sub r4, r4, r4, lsl #(16-2) + lsl r5, #16 + ror r4, #16 + vmov d7, r5, r4 // 0x0014FFFB00010000 - sub r3, #8 - ldr r4, [sp, #8] + sub r3, #8 + ldr r4, [sp, #8] - sub r0, #2 - vmov.u16 q14, #0x0014 // 20 - vshr.u16 q15, q14, #2 // 5 + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 w9_h_mc_luma_loop: - vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2] - pld [r0] + vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2] + pld [r0] - vext.8 d2, d0, d1, #1 //d2=src[-1] - vext.8 d3, d0, d1, #2 //d3=src[0] - vext.8 d4, d0, d1, #3 //d4=src[1] - vext.8 d5, d0, d1, #4 //d5=src[2] - vext.8 d6, d0, d1, #5 //d6=src[3] + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] - FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d16, q14, q15 + FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d16, q14, q15 - sub r4, #1 - vst1.u8 {d16}, [r2]! //write [0:7] Byte + sub r4, #1 + vst1.u8 {d16}, [r2]! 
//write [0:7] Byte - vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X - FILTER_SINGLE_TAG_8BITS d2, d7, d18, q9, q1 - vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte + vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X + FILTER_SINGLE_TAG_8BITS d2, d7, d18, q9, q1 + vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte - cmp r4, #0 - bne w9_h_mc_luma_loop - pop {r4-r5} + cmp r4, #0 + bne w9_h_mc_luma_loop + pop {r4-r5} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //q0=src[-2] - vld1.u8 {q1}, [r0], r1 //q1=src[-1] + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {q2}, [r0], r1 //q2=src[0] - vld1.u8 {q3}, [r0], r1 //q3=src[1] - vld1.u8 {q8}, [r0], r1 //q8=src[2] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q8}, [r0], r1 //q8=src[2] w17_v_mc_luma_loop: - vld1.u8 {q9}, [r0], r1 //q9=src[3] + vld1.u8 {q9}, [r0], r1 //q9=src[3] - FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 2nd row - vst1.u8 {q10}, [r2], r3 //write 1st 16Byte + FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q10}, [r2], r3 //write 1st 16Byte - FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15 - vld1.u8 {q1}, [r0], r1 //read 3rd row - vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte + FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte - FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15 - vld1.u8 {q2}, [r0], r1 //read 4th row - vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte + FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte - FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15 - vld1.u8 {q3}, [r0], r1 //read 5th row - vst1.u8 {q10}, [r2], r3 //write 4th 16Byte + FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q10}, [r2], r3 //write 4th 16Byte - FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15 - vld1.u8 {q8}, [r0], r1 //read 6th row - vst1.u8 {q10}, [r2], r3 //write 5th 16Byte + FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15 + vld1.u8 {q8}, [r0], r1 //read 6th row + vst1.u8 {q10}, [r2], r3 //write 5th 16Byte - FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, 
d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15 - vld1.u8 {q9}, [r0], r1 //read 7th row - vst1.u8 {q10}, [r2], r3 //write 6th 16Byte + FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15 + vld1.u8 {q9}, [r0], r1 //read 7th row + vst1.u8 {q10}, [r2], r3 //write 6th 16Byte - FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15 - vld1.u8 {q0}, [r0], r1 //read 8th row - vst1.u8 {q10}, [r2], r3 //write 7th 16Byte + FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q10}, [r2], r3 //write 7th 16Byte - FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15 - pld [r0] - FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15 - vst1.u8 {q10}, [r2], r3 //write 8th 16Byte + FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15 + vst1.u8 {q10}, [r2], r3 //write 8th 16Byte - //q2, q3, q8, q9, q0 --> q0~q8 - vswp q0, q8 - vswp q0, q2 - vmov q1, q3 - vmov q3, q9 //q0~q8 + //q2, q3, q8, q9, q0 --> q0~q8 + vswp q0, q8 + vswp q0, q2 + vmov q1, q3 + vmov q3, q9 //q0~q8 - sub r4, #8 - cmp r4, #1 - bne w17_v_mc_luma_loop - // the last 16Bytes - vld1.u8 {q9}, [r0], r1 //q9=src[3] - FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15 - FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15 - vst1.u8 {q10}, [r2], r3 //write 1st 16Byte + sub r4, #8 + cmp r4, #1 + bne w17_v_mc_luma_loop + // the last 16Bytes + vld1.u8 {q9}, [r0], r1 //q9=src[3] + FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15 + FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15 + vst1.u8 {q10}, [r2], r3 //write 1st 16Byte - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - sub r0, r0, r1, lsl #1 //src[-2*src_stride] - pld [r0] - pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {d0}, [r0], r1 //d0=src[-2] - vld1.u8 {d1}, [r0], r1 //d1=src[-1] + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 - vld1.u8 {d2}, [r0], r1 //d2=src[0] - vld1.u8 {d3}, [r0], r1 //d3=src[1] + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] - vld1.u8 {d4}, [r0], r1 //d4=src[2] - vld1.u8 {d5}, [r0], r1 //d5=src[3] + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] w9_v_mc_luma_loop: - pld [r0] - FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15 - vld1.u8 {d0}, [r0], r1 //read 2nd row - vst1.u8 {d16}, [r2], r3 //write 1st 8Byte + pld [r0] + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d16}, [r2], r3 //write 1st 8Byte - pld [r0] - FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15 - vld1.u8 {d1}, [r0], r1 //read 3rd row - vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte + pld [r0] + FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte - pld [r0] - FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15 - vld1.u8 {d2}, [r0], r1 //read 
4th row - vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte + pld [r0] + FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte - pld [r0] - FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15 - vld1.u8 {d3}, [r0], r1 //read 5th row - vst1.u8 {d16}, [r2], r3 //write 4th 8Byte + pld [r0] + FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d16}, [r2], r3 //write 4th 8Byte - //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 - vswp q0, q2 - vswp q1, q2 + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp q0, q2 + vswp q1, q2 - sub r4, #4 - cmp r4, #1 - bne w9_v_mc_luma_loop + sub r4, #4 + cmp r4, #1 + bne w9_v_mc_luma_loop - FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15 - vst1.u8 {d16}, [r2], r3 //write last 8Byte + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15 + vst1.u8 {d16}, [r2], r3 //write last 8Byte - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon - push {r4} - vpush {q4-q7} - ldr r4, [sp, #68] + push {r4} + vpush {q4-q7} + ldr r4, [sp, #68] - sub r0, #2 //src[-2] - sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] - pld [r0] - pld [r0, r1] + sub r0, #2 //src[-2] + sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2] - vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2] + vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 - vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0] - vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1] - pld [r0] - pld [r0, r1] - vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2] - sub r3, #16 + vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0] + vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2] + sub r3, #16 w17_hv_mc_luma_loop: - vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3] - //the 1st row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] - vst1.u8 {d0, d1}, [r2]! 
//write 16Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] - vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {d0, d1}, [r2]! //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte - vld1.u8 {d0-d2}, [r0], r1 //read 2nd row - //the 2nd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 - vst1.u8 {d3, d4}, [r2]! //write 16Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0] - vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte + vld1.u8 {d0-d2}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 + vst1.u8 {d3, d4}, [r2]! //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0] + vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte - vld1.u8 {d3-d5}, [r0], r1 //read 3rd row - //the 3rd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 - vst1.u8 {d6, d7}, [r2]! 
//write 16Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0] - vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte + vld1.u8 {d3-d5}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 + vst1.u8 {d6, d7}, [r2]! //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0] + vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte - vld1.u8 {d6-d8}, [r0], r1 //read 4th row - //the 4th row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 - vst1.u8 {d9, d10}, [r2]! //write 16Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0] - vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte + vld1.u8 {d6-d8}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 + vst1.u8 {d9, d10}, [r2]! 
//write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0] + vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte - //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 - vswp q0, q6 - vswp q6, q3 - vmov q5, q2 - vmov q2, q8 + //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 + vswp q0, q6 + vswp q6, q3 + vmov q5, q2 + vmov q2, q8 - vmov d20,d8 - vmov q4, q1 - vmov q1, q7 - vmov d14,d20 + vmov d20,d8 + vmov q4, q1 + vmov q1, q7 + vmov d14,d20 - sub r4, #4 - cmp r4, #1 - bne w17_hv_mc_luma_loop - //the last row - vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] - // vertical filtered into q10/q11 - FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] - vst1.u8 {q0}, [r2]! //write 16Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] - vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + sub r4, #4 + cmp r4, #1 + bne w17_hv_mc_luma_loop + //the last row + vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {q0}, [r2]! 
//write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte - vpop {q4-q7} - pop {r4} + vpop {q4-q7} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon - push {r4} - vpush {q4} - ldr r4, [sp, #20] + push {r4} + vpush {q4} + ldr r4, [sp, #20] - sub r0, #2 //src[-2] - sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] - pld [r0] - pld [r0, r1] + sub r0, #2 //src[-2] + sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] - vmov.u16 q14, #0x0014 // 20 - vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2] - vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2] + vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1] - pld [r0] - pld [r0, r1] - vshr.u16 q15, q14, #2 // 5 + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 - vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0] - vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1] - pld [r0] - pld [r0, r1] - vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2] - sub r3, #8 + vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0] + vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2] + sub r3, #8 w9_hv_mc_luma_loop: - vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3] - //the 1st row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] - vst1.u8 d18, [r2]! //write 8Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0] - vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte + vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] + vst1.u8 d18, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0] + vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte - vld1.u8 {q0}, [r0], r1 //read 2nd row - //the 2nd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] - vst1.u8 d18, [r2]! //write 8Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0] - vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte + vld1.u8 {q0}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] + vst1.u8 d18, [r2]! 
//write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0] + vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte - vld1.u8 {q1}, [r0], r1 //read 3rd row - //the 3rd row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] - vst1.u8 d18, [r2]! //write 8Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0] - vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte + vld1.u8 {q1}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] + vst1.u8 d18, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0] + vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte - vld1.u8 {q2}, [r0], r1 //read 4th row - //the 4th row - pld [r0] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] - vst1.u8 d18, [r2]! //write 8Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0] - vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte + vld1.u8 {q2}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] + vst1.u8 d18, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0] + vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte - //q4~q8, q0~q2, --> q0~q4 - vswp q0, q4 - vswp q2, q4 - vmov q3, q1 - vmov q1, q8 + //q4~q8, q0~q2, --> q0~q4 + vswp q0, q4 + vswp q2, q4 + vmov q3, q1 + vmov q1, q8 - sub r4, #4 - cmp r4, #1 - bne w9_hv_mc_luma_loop - //the last row - vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3] - // vertical filtered into q9/q10 - FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail - FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail - // horizon filtered - UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 - FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] - vst1.u8 d18, [r2]! 
//write 8Byte - UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0] - vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte - vpop {q4} - pop {r4} + sub r4, #4 + cmp r4, #1 + bne w9_hv_mc_luma_loop + //the last row + vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0] + vst1.u8 d18, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0] + vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte + vpop {q4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon - push {r4, r5, r6} - ldr r4, [sp, #12] - ldr r5, [sp, #16] - ldr r6, [sp, #20] + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + ldr r6, [sp, #20] enc_w16_pix_avg_loop: - vld1.u8 {q0}, [r2], r3 - vld1.u8 {q1}, [r4], r5 - vld1.u8 {q2}, [r2], r3 - vld1.u8 {q3}, [r4], r5 + vld1.u8 {q0}, [r2], r3 + vld1.u8 {q1}, [r4], r5 + vld1.u8 {q2}, [r2], r3 + vld1.u8 {q3}, [r4], r5 - vld1.u8 {q8}, [r2], r3 - vld1.u8 {q9}, [r4], r5 - vld1.u8 {q10}, [r2], r3 - vld1.u8 {q11}, [r4], r5 + vld1.u8 {q8}, [r2], r3 + vld1.u8 {q9}, [r4], r5 + vld1.u8 {q10}, [r2], r3 + vld1.u8 {q11}, [r4], r5 - AVERAGE_TWO_8BITS d0, d0, d2 - AVERAGE_TWO_8BITS d1, d1, d3 - vst1.u8 {q0}, [r0], r1 + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {q0}, [r0], r1 - AVERAGE_TWO_8BITS d4, d4, d6 - AVERAGE_TWO_8BITS d5, d5, d7 - vst1.u8 {q2}, [r0], r1 + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {q2}, [r0], r1 - AVERAGE_TWO_8BITS d16, d16, d18 - AVERAGE_TWO_8BITS d17, d17, d19 - vst1.u8 {q8}, [r0], r1 + AVERAGE_TWO_8BITS d16, d16, d18 + AVERAGE_TWO_8BITS d17, d17, d19 + vst1.u8 {q8}, [r0], r1 - AVERAGE_TWO_8BITS d20, d20, d22 - AVERAGE_TWO_8BITS d21, d21, d23 - vst1.u8 {q10}, [r0], r1 + AVERAGE_TWO_8BITS d20, d20, d22 + AVERAGE_TWO_8BITS d21, d21, d23 + vst1.u8 {q10}, [r0], r1 - sub r6, #4 - cmp r6, #0 - bne enc_w16_pix_avg_loop + sub r6, #4 + cmp r6, #0 + bne enc_w16_pix_avg_loop - pop {r4, r5, r6} + pop {r4, r5, r6} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon - push {r4, r5, r6} - ldr r4, [sp, #12] - ldr r5, [sp, #16] - ldr r6, [sp, #20] + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + ldr r6, [sp, #20] enc_w8_pix_avg_loop: - vld1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r4], r5 - vld1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r4], r5 + vld1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r4], r5 + vld1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r4], r5 - AVERAGE_TWO_8BITS d0, d0, d2 - AVERAGE_TWO_8BITS d1, d1, d3 - vst1.u8 {d0}, [r0], r1 - vst1.u8 {d1}, [r0], r1 + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {d0}, [r0], r1 + vst1.u8 {d1}, [r0], r1 - vld1.u8 {d4}, [r2], r3 - vld1.u8 {d6}, [r4], r5 - vld1.u8 {d5}, [r2], r3 - vld1.u8 {d7}, [r4], r5 + vld1.u8 {d4}, [r2], r3 + vld1.u8 {d6}, [r4], r5 + vld1.u8 {d5}, [r2], r3 + vld1.u8 {d7}, [r4], r5 - AVERAGE_TWO_8BITS d4, d4, d6 - AVERAGE_TWO_8BITS d5, d5, d7 - vst1.u8 {d4}, [r0], r1 - vst1.u8 {d5}, [r0], r1 + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {d4}, [r0], r1 + vst1.u8 {d5}, [r0], r1 - sub r6, #4 - cmp r6, #0 - bne enc_w8_pix_avg_loop + sub r6, #4 + cmp r6, #0 + bne enc_w8_pix_avg_loop - pop {r4, r5, r6} + pop {r4, r5, r6} WELS_ASM_FUNC_END #endif 
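For reference, the routines in the hunk above all build on the H.264 6-tap luma filter (1, -5, 20, 20, -5, 1): FILTER_6TAG_8BITS applies it in one direction and narrows with a rounded shift by 5 (sqrshrun #5), while the McHorVer22* centre cases first run the filter vertically into 16-bit intermediates (FILTER_6TAG_8BITS_TO_16BITS) and then filter those intermediates horizontally, narrowing through the shift stages of FILTER_3_IN_16BITS_TO_8BITS and a final rounded shift by 6. A minimal scalar sketch of that arithmetic follows; it is illustrative only and not part of this patch, the *_ref names and the Clip255 helper are invented for the example, and the centre case is written in the direct (sum + 512) >> 10 form that the staged NEON shifts factor apart.

#include <stdint.h>

static inline uint8_t Clip255 (int v) {
  return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One-pass half-pel filter, as in FILTER_6TAG_8BITS (horizontal shown; the
 * vertical case indexes by src_stride instead of 1).  Like the NEON code
 * after its pointer rewind, this assumes src may be read 2 pixels outside
 * the block on each side. */
static void McHorVer20_ref (const uint8_t* src, int src_stride,
                            uint8_t* dst, int dst_stride,
                            int width, int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int v = src[x - 2] - 5 * src[x - 1] + 20 * src[x]
            + 20 * src[x + 1] - 5 * src[x + 2] + src[x + 3];
      dst[x] = Clip255 ((v + 16) >> 5);          /* sqrshrun #5 */
    }
    src += src_stride;
    dst += dst_stride;
  }
}

/* Two-pass centre case (half-pel in both directions), as in McHorVer22*:
 * vertical 6-tap into intermediates, then horizontal 6-tap on them. */
static void McHorVer22_ref (const uint8_t* src, int src_stride,
                            uint8_t* dst, int dst_stride,
                            int width, int height) {
  int32_t tmp[17 + 5];                           /* one row, width <= 17 */
  for (int y = 0; y < height; ++y) {
    for (int x = -2; x < width + 3; ++x) {       /* vertical pass, no rounding */
      const uint8_t* s = src + x;
      tmp[x + 2] = s[-2 * src_stride] - 5 * s[-src_stride] + 20 * s[0]
                 + 20 * s[src_stride] - 5 * s[2 * src_stride] + s[3 * src_stride];
    }
    for (int x = 0; x < width; ++x) {            /* horizontal pass on tmp */
      const int32_t* t = &tmp[x + 2];
      int v = t[-2] - 5 * t[-1] + 20 * t[0] + 20 * t[1] - 5 * t[2] + t[3];
      dst[x] = Clip255 ((v + 512) >> 10);
    }
    src += src_stride;
    dst += dst_stride;
  }
}

The odd-width variants (McHorVer20Width17/Width9) handle their extra column with FILTER_SINGLE_TAG_8BITS, whose tap vector is built inline: starting from r4 = 20, the "sub r4, r4, r4, lsl #(16-2)" yields 0xFFFB0014 (halfwords -5 and 20), and after the ror #16 and the 1 << 16 in r5, the vmov assembles the doubleword 0x0014FFFB00010000, i.e. the 16-bit taps {0, 1, -5, 20} that also appear as filter_para in the AArch64 file below.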
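The chroma and averaging routines are simpler. McChromaWidthEq8/Eq4 apply the four weights A/B/C/D loaded through r4 exactly as the in-source comment states, {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*src[x+stride+1]}, with a rounded shift by 6 (vrshrn.u16 #6), and the PixelAvg and PixStrideAvg routines reduce to the (a + b + 1) >> 1 rounded average computed by AVERAGE_TWO_8BITS (the AArch64 macro further down spells it out as uaddl plus rshrn #1). A scalar sketch of both follows; it is illustrative only, the *_ref names are invented for the example, and the weights are assumed to sum to 64 as the chroma quarter-pel A/B/C/D weights do.

#include <stdint.h>

/* Bilinear chroma interpolation, as in McChromaWidthEq8/Eq4_neon. */
static void McChroma_ref (const uint8_t* src, int src_stride,
                          uint8_t* dst, int dst_stride,
                          const uint8_t abcd[4],      /* A, B, C, D; sum = 64 */
                          int width, int height) {
  const int a = abcd[0], b = abcd[1], c = abcd[2], d = abcd[3];
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int v = a * src[x]              + b * src[x + 1]
            + c * src[x + src_stride] + d * src[x + src_stride + 1];
      dst[x] = (uint8_t) ((v + 32) >> 6);              /* vrshrn.u16 #6 */
    }
    src += src_stride;
    dst += dst_stride;
  }
}

/* Rounded average of two predictions, as in the PixelAvg and PixStrideAvg
 * routines (the Eq16/Eq8/Eq4 variants differ only in block width). */
static void PixelAvg_ref (uint8_t* dst, int dst_stride,
                          const uint8_t* src_a, int stride_a,
                          const uint8_t* src_b, int stride_b,
                          int width, int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x)
      dst[x] = (uint8_t) ((src_a[x] + src_b[x] + 1) >> 1);   /* rshrn #1 */
    dst += dst_stride;
    src_a += stride_a;
    src_b += stride_b;
  }
}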
diff --git a/codec/common/arm64/expand_picture_aarch64_neon.S b/codec/common/arm64/expand_picture_aarch64_neon.S index ebcbf0e8..572bd0f9 100644 --- a/codec/common/arm64/expand_picture_aarch64_neon.S +++ b/codec/common/arm64/expand_picture_aarch64_neon.S @@ -53,88 +53,88 @@ _expand_picture_luma_loop2: sub x8, x8, #1 cbnz x8, _expand_picture_luma_loop2 //for the top and bottom expand - add x2, x2, #64 - sub x0, x0, #32 + add x2, x2, #64 + sub x0, x0, #32 madd x4, x1, x3, x0 sub x4, x4, x1 _expand_picture_luma_loop0: - mov x5, #32 + mov x5, #32 msub x5, x5, x1, x0 - add x6, x4, x1 + add x6, x4, x1 ld1 {v0.16b}, [x0], x10 ld1 {v1.16b}, [x4], x10 - mov x8, #32 + mov x8, #32 _expand_picture_luma_loop1: - st1 {v0.16b}, [x5], x1 - st1 {v1.16b}, [x6], x1 - sub x8, x8, #1 + st1 {v0.16b}, [x5], x1 + st1 {v1.16b}, [x6], x1 + sub x8, x8, #1 cbnz x8, _expand_picture_luma_loop1 - sub x2, x2, #16 - cbnz x2, _expand_picture_luma_loop0 + sub x2, x2, #16 + cbnz x2, _expand_picture_luma_loop0 WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon - //Save the dst - mov x7, x0 - mov x8, x3 + //Save the dst + mov x7, x0 + mov x8, x3 mov x10, #16 - add x4, x7, x2 - sub x4, x4, #1 + add x4, x7, x2 + sub x4, x4, #1 //For the left and right expand _expand_picture_chroma_loop2: - sub x5, x7, #16 - add x6, x4, #1 + sub x5, x7, #16 + add x6, x4, #1 - ld1r {v0.16b}, [x7], x1 - ld1r {v1.16b}, [x4], x1 + ld1r {v0.16b}, [x7], x1 + ld1r {v1.16b}, [x4], x1 - st1 {v0.16b}, [x5] - st1 {v1.16b}, [x6] - sub x8, x8, #1 - cbnz x8, _expand_picture_chroma_loop2 + st1 {v0.16b}, [x5] + st1 {v1.16b}, [x6] + sub x8, x8, #1 + cbnz x8, _expand_picture_chroma_loop2 - //for the top and bottom expand - add x2, x2, #32 + //for the top and bottom expand + add x2, x2, #32 // mov x9, x2 mov x11, #15 bic x2, x2, x11 // - sub x0, x0, #16 - madd x4, x1, x3, x0 - sub x4, x4, x1 + sub x0, x0, #16 + madd x4, x1, x3, x0 + sub x4, x4, x1 _expand_picture_chroma_loop0: - mov x5, #16 + mov x5, #16 msub x5, x5, x1, x0 - add x6, x4, x1 - ld1 {v0.16b}, [x0], x10 - ld1 {v1.16b}, [x4], x10 + add x6, x4, x1 + ld1 {v0.16b}, [x0], x10 + ld1 {v1.16b}, [x4], x10 - mov x8, #16 + mov x8, #16 _expand_picture_chroma_loop1: - st1 {v0.16b}, [x5], x1 - st1 {v1.16b}, [x6], x1 - sub x8, x8, #1 + st1 {v0.16b}, [x5], x1 + st1 {v1.16b}, [x6], x1 + sub x8, x8, #1 cbnz x8, _expand_picture_chroma_loop1 - sub x2, x2, #16 - cbnz x2, _expand_picture_chroma_loop0 + sub x2, x2, #16 + cbnz x2, _expand_picture_chroma_loop0 and x9, x9, #15 sub x9, x9, #8 cbnz x9, _expand_picture_chroma_end - mov x5, #16 + mov x5, #16 msub x5, x5, x1, x0 - add x6, x4, x1 - ld1 {v0.8b}, [x0] - ld1 {v1.8b}, [x4] + add x6, x4, x1 + ld1 {v0.8b}, [x0] + ld1 {v1.8b}, [x4] - mov x8, #16 + mov x8, #16 _expand_picture_chroma_loop3: - st1 {v0.8b}, [x5], x1 - st1 {v1.8b}, [x6], x1 - sub x8, x8, #1 + st1 {v0.8b}, [x5], x1 + st1 {v1.8b}, [x6], x1 + sub x8, x8, #1 cbnz x8, _expand_picture_chroma_loop3 _expand_picture_chroma_end: diff --git a/codec/common/arm64/mc_aarch64_neon.S b/codec/common/arm64/mc_aarch64_neon.S index b46dc7aa..c6566f3f 100644 --- a/codec/common/arm64/mc_aarch64_neon.S +++ b/codec/common/arm64/mc_aarch64_neon.S @@ -39,349 +39,349 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0 #ifdef __APPLE__ .macro FILTER_6TAG_8BITS1 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl v18.8h, $0.8b, $5.8b 
//v18=src[-2]+src[3] - uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1] + uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1] mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2] mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun $6.8b, v18.8h, #5 -// } +// } .endm .macro FILTER_6TAG_8BITS2 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3] - uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1] + uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1] mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2] mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun2 $6.16b, v18.8h, #5 -// } +// } .endm .macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3] - uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1] + uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1] mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2] mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun $6.8b, v18.8h, #5 uaddl v19.8h, $2.8b, $6.8b rshrn $6.8b, v19.8h, #1 -// } +// } .endm .macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3] - uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1] + uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1] mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2] mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun2 $6.16b, v18.8h, #5 uaddl2 v19.8h, $2.16b, $6.16b rshrn2 $6.16b, v19.8h, #1 -// } +// } .endm .macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3] - uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1] + uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1] mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2] mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun $6.8b, v18.8h, #5 uaddl v19.8h, $3.8b, $6.8b rshrn $6.8b, v19.8h, #1 -// } +// } .endm .macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3] - uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1] + uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1] mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2] mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun2 $6.16b, v18.8h, #5 uaddl2 v19.8h, 
$3.16b, $6.16b rshrn2 $6.16b, v19.8h, #1 -// } +// } .endm .macro FILTER_6TAG_8BITS_TO_16BITS1 -// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31 - uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3] - uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1] - mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles - uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2] - mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles -// } +// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31 + uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3] + uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1] + mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles + uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2] + mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles +// } .endm .macro FILTER_6TAG_8BITS_TO_16BITS2 -// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31 - uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3] - uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1] - mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles - uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2] - mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles -// } +// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31 + uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3] + uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1] + mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles + uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2] + mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles +// } .endm .macro FILTER_3_IN_16BITS_TO_8BITS1 -// { // input:a, b, c, dst_d; - sub $0.8h, $0.8h, $1.8h //a-b - sshr $0.8h, $0.8h, #2 //(a-b)/4 - sub $0.8h, $0.8h, $1.8h //(a-b)/4-b - add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c - sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4 - add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - sqrshrun $3.8b, $0.8h, #6 //(+32)>>6 -// } +// { // input:a, b, c, dst_d; + sub $0.8h, $0.8h, $1.8h //a-b + sshr $0.8h, $0.8h, #2 //(a-b)/4 + sub $0.8h, $0.8h, $1.8h //(a-b)/4-b + add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c + sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4 + add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + sqrshrun $3.8b, $0.8h, #6 //(+32)>>6 +// } .endm .macro FILTER_3_IN_16BITS_TO_8BITS2 -// { // input:a, b, c, dst_d; - sub $0.8h, $0.8h, $1.8h //a-b - sshr $0.8h, $0.8h, #2 //(a-b)/4 - sub $0.8h, $0.8h, $1.8h //(a-b)/4-b - add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c - sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4 - add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6 -// } +// { // input:a, b, c, dst_d; + sub $0.8h, $0.8h, $1.8h //a-b + sshr $0.8h, $0.8h, #2 //(a-b)/4 + sub $0.8h, $0.8h, $1.8h //(a-b)/4-b + add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c + sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4 + add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6 +// } .endm .macro UNPACK_2_16BITS_TO_ABC -// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c; - ext $4.16b, $0.16b, $1.16b, #4 //src[0] - ext $3.16b, $0.16b, $1.16b, #6 //src[1] - add $4.8h, $4.8h, $3.8h //c=src[0]+src[1] +// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c; + ext $4.16b, $0.16b, $1.16b, #4 //src[0] + ext $3.16b, $0.16b, $1.16b, #6 
//src[1] + add $4.8h, $4.8h, $3.8h //c=src[0]+src[1] - ext $3.16b, $0.16b, $1.16b, #2 //src[-1] - ext $2.16b, $0.16b, $1.16b, #8 //src[2] - add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2] + ext $3.16b, $0.16b, $1.16b, #2 //src[-1] + ext $2.16b, $0.16b, $1.16b, #8 //src[2] + add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2] - ext $2.16b, $0.16b, $1.16b, #10 //src[3] - add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3] -// } + ext $2.16b, $0.16b, $1.16b, #10 //src[3] + add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3] +// } .endm .macro AVERAGE_TWO_8BITS1 -// { // input:dst_d, src_d A and B; working: v5 - uaddl v30.8h, $2.8b, $1.8b - rshrn $0.8b, v30.8h, #1 -// } +// { // input:dst_d, src_d A and B; working: v5 + uaddl v30.8h, $2.8b, $1.8b + rshrn $0.8b, v30.8h, #1 +// } .endm .macro AVERAGE_TWO_8BITS2 -// { // input:dst_d, src_d A and B; working: v5 - uaddl2 v30.8h, $2.16b, $1.16b - rshrn2 $0.16b, v30.8h, #1 -// } +// { // input:dst_d, src_d A and B; working: v5 + uaddl2 v30.8h, $2.16b, $1.16b + rshrn2 $0.16b, v30.8h, #1 +// } .endm -.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used -// { // input: src_d{Y[0][1][2][3][4][5]X}, - rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O - uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]* - mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32] +.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used +// { // input: src_d{Y[0][1][2][3][4][5]X}, + rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O + uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]* + mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32] addv $3, $2.4h sqrshrun $0.8b, $0.8h, #5 -// } +// } .endm .macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23 -// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst) +// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst) ext.16b $3, $1, $1, #14 // X[0][1][2][3][4][5]O ext.16b $4, $3, $3, #8 // [3][4][5]OX[0][1][2] - rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O + rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]* - smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32] + smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32] saddlv $5, $3.4s //sshr $0.2d, $0.2d, #4 sqrshrun $0.2s, $0.2d, #10 uqxtn $0.4h, $0.4s uqxtn $0.8b, $0.8h - // } + // } .endm #else .macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3] - uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] + uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2] mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun \arg6\().8b, v18.8h, #5 -// } +// } .endm .macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3] - uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] + uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] mla v18.8h, 
v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2] mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun2 \arg6\().16b, v18.8h, #5 -// } +// } .endm .macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3] - uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] + uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2] mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun \arg6\().8b, v18.8h, #5 uaddl v19.8h, \arg2\().8b, \arg6\().8b rshrn \arg6\().8b, v19.8h, #1 -// } +// } .endm .macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3] - uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] + uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2] mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun2 \arg6\().16b, v18.8h, #5 uaddl2 v19.8h, \arg2\().16b, \arg6\().16b rshrn2 \arg6\().16b, v19.8h, #1 -// } +// } .endm .macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3] - uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] + uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2] mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun \arg6\().8b, v18.8h, #5 uaddl v19.8h, \arg3\().8b, \arg6\().8b rshrn \arg6\().8b, v19.8h, #1 -// } +// } .endm .macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 +// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3] - uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] + uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2] mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles sqrshrun2 \arg6\().16b, v18.8h, #5 uaddl2 v19.8h, \arg3\().16b, \arg6\().16b rshrn2 \arg6\().16b, v19.8h, #1 -// } +// } .endm .macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// 
{ // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31 - uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3] - uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] - mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles - uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2] - mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles -// } +// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31 + uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3] + uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] + mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles + uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2] + mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles +// } .endm .macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31 - uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3] - uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] - mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles - uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2] - mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles -// } +// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31 + uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3] + uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] + mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles + uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2] + mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles +// } .endm .macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3 -// { // input:a, b, c, dst_d; - sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b - sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4 - sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b - add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c - sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4 - add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - sqrshrun \arg3\().8b, \arg0\().8h, #6 //(+32)>>6 -// } +// { // input:a, b, c, dst_d; + sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b + sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4 + sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b + add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c + sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4 + add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + sqrshrun \arg3\().8b, \arg0\().8h, #6 //(+32)>>6 +// } .endm .macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3 -// { // input:a, b, c, dst_d; - sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b - sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4 - sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b - add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c - sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4 - add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6 -// } +// { // input:a, b, c, dst_d; + sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b + sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4 + sub \arg0\().8h, \arg0\().8h, \arg1\().8h 
//(a-b)/4-b + add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c + sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4 + add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6 +// } .endm .macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4 -// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c; - ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0] - ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1] - add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1] +// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c; + ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0] + ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1] + add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1] - ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1] - ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2] - add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2] + ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1] + ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2] + add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2] - ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3] - add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3] -// } + ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3] + add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3] +// } .endm .macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2 -// { // input:dst_d, src_d A and B; working: v5 - uaddl v30.8h, \arg2\().8b, \arg1\().8b - rshrn \arg0\().8b, v30.8h, #1 -// } +// { // input:dst_d, src_d A and B; working: v5 + uaddl v30.8h, \arg2\().8b, \arg1\().8b + rshrn \arg0\().8b, v30.8h, #1 +// } .endm .macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2 -// { // input:dst_d, src_d A and B; working: v5 - uaddl2 v30.8h, \arg2\().16b, \arg1\().16b - rshrn2 \arg0\().16b, v30.8h, #1 -// } +// { // input:dst_d, src_d A and B; working: v5 + uaddl2 v30.8h, \arg2\().16b, \arg1\().16b + rshrn2 \arg0\().16b, v30.8h, #1 +// } .endm .macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3 // when width=17/9, used -// { // input: src_d{Y[0][1][2][3][4][5]X}, - rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O - uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]* - mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32] +// { // input: src_d{Y[0][1][2][3][4][5]X}, + rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O + uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]* + mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32] addv \arg3, \arg2\().4h sqrshrun \arg0\().8b, \arg0\().8h, #5 -// } +// } .endm .macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5 -// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst) +// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst) ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14 // X[0][1][2][3][4][5]O ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8 // [3][4][5]OX[0][1][2] - rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O + rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O add \arg3\().8h, \arg3\().8h, \arg4\().8h // each 16bits, *[50][41][32][23][14][05]* - smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 0+1*[50]-5*[41]+20[32] + smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 
0+1*[50]-5*[41]+20[32] saddlv \arg5, \arg3\().4s //sshr \arg0\().2d, \arg0\().2d, #4 sqrshrun \arg0\().2s, \arg0\().2d, #10 uqxtn \arg0\().4h, \arg0\().4s uqxtn \arg0\().8b, \arg0\().8h - // } + // } .endm #endif @@ -405,7 +405,7 @@ w16_h_mc_luma_loop: sub x4, x4, #1 st1 {v20.16b}, [x2], x3 //write 16Byte - cbnz x4, w16_h_mc_luma_loop + cbnz x4, w16_h_mc_luma_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon @@ -426,7 +426,7 @@ w8_h_mc_luma_loop: sub x4, x4, #1 st1 {v20.8b}, [x2], x3 //write 8Byte - cbnz x4, w8_h_mc_luma_loop + cbnz x4, w8_h_mc_luma_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon @@ -461,7 +461,7 @@ w4_h_mc_luma_loop: st1 {v20.s}[0], [x2], x3 //write 4Byte st1 {v20.s}[1], [x2], x3 //write 4Byte sub x4, x4, #1 - cbnz x4, w4_h_mc_luma_loop + cbnz x4, w4_h_mc_luma_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon @@ -483,7 +483,7 @@ w16_xy_10_mc_luma_loop: sub x4, x4, #1 st1 {v20.16b}, [x2], x3 //write 16Byte - cbnz x4, w16_xy_10_mc_luma_loop + cbnz x4, w16_xy_10_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -505,7 +505,7 @@ w8_xy_10_mc_luma_loop: sub x4, x4, #1 st1 {v20.8b}, [x2], x3 //write 8Byte - cbnz x4, w8_xy_10_mc_luma_loop + cbnz x4, w8_xy_10_mc_luma_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon @@ -540,7 +540,7 @@ w4_xy_10_mc_luma_loop: st1 {v20.s}[0], [x2], x3 //write 4Byte st1 {v20.s}[1], [x2], x3 //write 4Byte sub x4, x4, #1 - cbnz x4, w4_xy_10_mc_luma_loop + cbnz x4, w4_xy_10_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -563,7 +563,7 @@ w16_xy_30_mc_luma_loop: sub x4, x4, #1 st1 {v20.16b}, [x2], x3 //write 16Byte - cbnz x4, w16_xy_30_mc_luma_loop + cbnz x4, w16_xy_30_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -585,7 +585,7 @@ w8_xy_30_mc_luma_loop: sub x4, x4, #1 st1 {v20.8b}, [x2], x3 //write 8Byte - cbnz x4, w8_xy_30_mc_luma_loop + cbnz x4, w8_xy_30_mc_luma_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon @@ -620,7 +620,7 @@ w4_xy_30_mc_luma_loop: st1 {v20.s}[0], [x2], x3 //write 4Byte st1 {v20.s}[1], [x2], x3 //write 4Byte sub x4, x4, #1 - cbnz x4, w4_xy_30_mc_luma_loop + cbnz x4, w4_xy_30_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -703,7 +703,7 @@ w16_xy_01_mc_luma_loop: mov.16b v4, v6 mov.16b v6, v7 sub x4, x4, #8 - cbnz x4, w16_xy_01_mc_luma_loop + cbnz x4, w16_xy_01_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -753,7 +753,7 @@ w8_xy_01_mc_luma_loop: mov.16b v6, v4 mov.16b v4, v7 sub x4, x4, #4 - cbnz x4, w8_xy_01_mc_luma_loop + cbnz x4, w8_xy_01_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -809,7 +809,7 @@ w4_xy_01_mc_luma_loop: mov.8b v5, v21 sub x4, x4, #4 - cbnz x4, w4_xy_01_mc_luma_loop + cbnz x4, w4_xy_01_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -892,7 +892,7 @@ w16_xy_03_mc_luma_loop: mov.16b v4, v6 mov.16b v6, v7 sub x4, x4, #8 - cbnz x4, w16_xy_03_mc_luma_loop + cbnz x4, w16_xy_03_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -942,7 +942,7 @@ w8_xy_03_mc_luma_loop: mov.16b v6, v4 mov.16b v4, v7 sub x4, x4, #4 - cbnz x4, w8_xy_03_mc_luma_loop + cbnz x4, w8_xy_03_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -998,7 +998,7 @@ w4_xy_03_mc_luma_loop: mov.8b v5, v21 sub x4, x4, #4 - cbnz x4, w4_xy_03_mc_luma_loop + cbnz x4, w4_xy_03_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -1081,7 +1081,7 @@ w16_xy_02_mc_luma_loop: mov.16b v4, v6 mov.16b v6, v7 sub x4, x4, #8 - cbnz x4, w16_xy_02_mc_luma_loop + cbnz x4, w16_xy_02_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -1131,7 +1131,7 @@ 
w8_xy_02_mc_luma_loop: mov.16b v6, v4 mov.16b v4, v7 sub x4, x4, #4 - cbnz x4, w8_xy_02_mc_luma_loop + cbnz x4, w8_xy_02_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -1187,7 +1187,7 @@ w4_xy_02_mc_luma_loop: mov.8b v5, v21 sub x4, x4, #4 - cbnz x4, w4_xy_02_mc_luma_loop + cbnz x4, w4_xy_02_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -1220,12 +1220,12 @@ w16_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x3 //write 16Byte : 0 line //prfm pldl1strm, [x0, x1] @@ -1234,12 +1234,12 @@ w16_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x3 //write 16Byte : 1 line //prfm pldl1strm, [x0, x1] @@ -1248,12 +1248,12 @@ w16_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x3 //write 16Byte : 2 line //prfm pldl1strm, [x0, x1] @@ -1262,12 +1262,12 @@ w16_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + 
UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x3 //write 16Byte : 3 line //prfm pldl1strm, [x0, x1] @@ -1276,12 +1276,12 @@ w16_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x3 //write 16Byte : 4 line //prfm pldl1strm, [x0, x1] @@ -1290,12 +1290,12 @@ w16_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x3 //write 16Byte : 5 line //prfm pldl1strm, [x0, x1] @@ -1304,12 +1304,12 @@ w16_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x3 //write 16Byte : 6 line //prfm pldl1strm, [x0, x1] @@ -1318,12 +1318,12 @@ w16_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x3 //write 
16Byte : 7 line mov.16b v5, v11 @@ -1348,7 +1348,7 @@ w16_hv_mc_luma_loop: mov.16b v16, v30 sub x4, x4, #8 - cbnz x4, w16_hv_mc_luma_loop + cbnz x4, w16_hv_mc_luma_loop ldp d14, d15, [sp], #16 ldp d12, d13, [sp], #16 @@ -1381,8 +1381,8 @@ w8_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] st1 {v26.8b}, [x2], x3 //write 8Byte : 0 line //prfm pldl1strm, [x0, x1] @@ -1391,8 +1391,8 @@ w8_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] st1 {v26.8b}, [x2], x3 //write 8Byte : 1 line //prfm pldl1strm, [x0, x1] @@ -1401,8 +1401,8 @@ w8_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] st1 {v26.8b}, [x2], x3 //write 8Byte : 2 line //prfm pldl1strm, [x0, x1] @@ -1411,8 +1411,8 @@ w8_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line @@ -1424,7 +1424,7 @@ w8_hv_mc_luma_loop: mov.16b v4, v30 sub x4, x4, #4 - cbnz x4, w8_hv_mc_luma_loop + cbnz x4, w8_hv_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -1458,12 +1458,12 @@ w4_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v22, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v23, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26 - UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30 + UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26 + UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30 zip1 v24.2d, v24.2d, v28.2d zip1 v25.2d, v25.2d, v29.2d zip1 v26.2d, v26.2d, v30.2d - FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0] + FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0] st1 {v27.s}[0], [x2], x3 //write 4Byte : 0 line st1 {v27.s}[1], [x2], x3 //write 4Byte : 1 line @@ -1478,12 +1478,12 @@ w4_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v22, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v23, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26 - UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30 + UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26 + UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30 zip1 v24.2d, v24.2d, v28.2d zip1 v25.2d, v25.2d, v29.2d zip1 v26.2d, v26.2d, v30.2d - 
FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0] + FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0] st1 {v27.s}[0], [x2], x3 //write 4Byte : 2 line st1 {v27.s}[1], [x2], x3 //write 4Byte : 3 line @@ -1495,7 +1495,7 @@ w4_hv_mc_luma_loop: mov.16b v4, v30 sub x4, x4, #4 - cbnz x4, w4_hv_mc_luma_loop + cbnz x4, w4_hv_mc_luma_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon @@ -1509,7 +1509,7 @@ w16_copy_loop: st1 {v1.16b}, [x2], x3 //write 16Byte : 1 line sub x4, x4, #2 - cbnz x4, w16_copy_loop + cbnz x4, w16_copy_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon @@ -1523,7 +1523,7 @@ w8_copy_loop: st1 {v1.8b}, [x2], x3 //write 16Byte : 1 line sub x4, x4, #2 - cbnz x4, w8_copy_loop + cbnz x4, w8_copy_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon @@ -1537,7 +1537,7 @@ w4_copy_loop: st1 {v1.s}[0], [x2], x3 //write 16Byte : 1 line sub x4, x4, #2 - cbnz x4, w4_copy_loop + cbnz x4, w4_copy_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon @@ -1570,7 +1570,7 @@ enc_w16_pix_avg_loop: st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line sub x6, x6, #4 - cbnz x6, enc_w16_pix_avg_loop + cbnz x6, enc_w16_pix_avg_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon @@ -1607,7 +1607,7 @@ enc_w8_pix_avg_loop: st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line sub x6, x6, #4 - cbnz x6, enc_w8_pix_avg_loop + cbnz x6, enc_w8_pix_avg_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon @@ -1649,7 +1649,7 @@ w16_pix_avg_loop: st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line sub x6, x6, #4 - cbnz x6, w16_pix_avg_loop + cbnz x6, w16_pix_avg_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon @@ -1686,7 +1686,7 @@ w8_pix_avg_loop: st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line sub x6, x6, #4 - cbnz x6, w8_pix_avg_loop + cbnz x6, w8_pix_avg_loop WELS_ASM_ARCH64_FUNC_END @@ -1707,7 +1707,7 @@ w4_pix_avg_loop: st1 {v2.s}[1], [x0], x1 //write 4Byte : 1 line sub x6, x6, #2 - cbnz x6, w4_pix_avg_loop + cbnz x6, w4_pix_avg_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon @@ -1738,7 +1738,7 @@ w8_mc_chroma_loop: mov.16b v0, v18 mov.16b v1, v19 sub x5, x5, #2 - cbnz x5, w8_mc_chroma_loop + cbnz x5, w8_mc_chroma_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon @@ -1767,7 +1767,7 @@ w4_mc_chroma_loop: mov.8b v0, v18 mov.8b v1, v19 sub x5, x5, #2 - cbnz x5, w4_mc_chroma_loop + cbnz x5, w4_mc_chroma_loop WELS_ASM_ARCH64_FUNC_END @@ -1793,11 +1793,11 @@ w17_h_mc_luma_loop: st1 {v20.16b}, [x2], x5 //write 16Byte ext.8b v21, v4, v4, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X - FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21 - st1 {v21.b}[0], [x2], x3 //write 16th Byte + FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21 + st1 {v21.b}[0], [x2], x3 //write 16th Byte sub x4, x4, #1 - cbnz x4, w17_h_mc_luma_loop + cbnz x4, w17_h_mc_luma_loop WELS_ASM_ARCH64_FUNC_END WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon @@ -1821,11 +1821,11 @@ w9_h_mc_luma_loop: st1 {v20.8b}, [x2], x5 //write 8Byte ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X - FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21 - st1 {v21.b}[0], [x2], x3 //write 9th Byte + FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21 + st1 {v21.b}[0], [x2], x3 //write 9th Byte sub x4, x4, #1 - cbnz x4, 
w9_h_mc_luma_loop + cbnz x4, w9_h_mc_luma_loop WELS_ASM_ARCH64_FUNC_END @@ -1863,12 +1863,12 @@ w17_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line @@ -1879,12 +1879,12 @@ w17_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x5 //write 0:15Byte : 1 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 16th Byte : 1 line @@ -1895,12 +1895,12 @@ w17_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x5 //write 0:15Byte : 2 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 16th Byte : 2 line @@ -1911,12 +1911,12 @@ w17_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + 
UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x5 //write 0:15Byte : 3 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 16th Byte : 3 line @@ -1927,12 +1927,12 @@ w17_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x5 //write 0:15Byte : 4 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 16th Byte : 4 line @@ -1943,12 +1943,12 @@ w17_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x5 //write 0:15Byte : 5 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 16th Byte : 5 line @@ -1959,12 +1959,12 @@ w17_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x5 //write 0:15Byte : 6 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 16th Byte : 6 line @@ -1975,12 +1975,12 @@ w17_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered 
into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x5 //write 0:15Byte : 7 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 16th Byte : 7 line @@ -2007,7 +2007,7 @@ w17_hv_mc_luma_loop: mov.16b v16, v30 sub x4, x4, #8 - cbnz x4, w17_hv_mc_luma_loop + cbnz x4, w17_hv_mc_luma_loop //prfm pldl1strm, [x0, x1] ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride] @@ -2015,12 +2015,12 @@ w17_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] // vertical filtered into v21/v22 FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 - UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line @@ -2061,8 +2061,8 @@ w9_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line @@ -2073,8 +2073,8 @@ w9_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] st1 {v26.8b}, [x2], x5 //write 0:7Byte : 1 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 8th Byte : 1 line @@ -2085,8 +2085,8 @@ w9_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] st1 {v26.8b}, [x2], x5 //write 0:7Byte : 2 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 8th Byte : 2 line @@ -2097,8 +2097,8 @@ w9_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, 
v7, v2, v3, v4, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] st1 {v26.8b}, [x2], x5 //write 0:7Byte : 3 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 8th Byte : 3 line @@ -2112,7 +2112,7 @@ w9_hv_mc_luma_loop: mov.16b v4, v30 sub x4, x4, #4 - cbnz x4, w9_hv_mc_luma_loop + cbnz x4, w9_hv_mc_luma_loop //prfm pldl1strm, [x0, x1] ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] @@ -2120,8 +2120,8 @@ w9_hv_mc_luma_loop: FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1 // horizon filtered - UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 - FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line @@ -2207,7 +2207,7 @@ w17_v_mc_luma_loop: mov.16b v4, v6 mov.16b v6, v7 sub x4, x4, #8 - cbnz x4, w17_v_mc_luma_loop + cbnz x4, w17_v_mc_luma_loop //prfm pldl1strm, [x0, x1] ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] @@ -2262,7 +2262,7 @@ w9_v_mc_luma_loop: mov.16b v6, v4 mov.16b v4, v7 sub x4, x4, #4 - cbnz x4, w9_v_mc_luma_loop + cbnz x4, w9_v_mc_luma_loop //prfm pldl1strm, [x0, x1] ld1 {v7.8b}, [x0], x1 // v7=src[3*stride] diff --git a/codec/common/x86/asm_inc.asm b/codec/common/x86/asm_inc.asm index 6a967a8d..8982fa29 100644 --- a/codec/common/x86/asm_inc.asm +++ b/codec/common/x86/asm_inc.asm @@ -44,15 +44,15 @@ ;*********************************************************************** %if 1 - %define MOVDQ movdqa + %define MOVDQ movdqa %else - %define MOVDQ movdqu + %define MOVDQ movdqu %endif %if 1 - %define WELSEMMS emms + %define WELSEMMS emms %else - %define WELSEMMS + %define WELSEMMS %endif @@ -220,7 +220,7 @@ BITS 32 %macro LOAD_1_PARA 0 %ifdef X86_32 - mov r0, [esp + push_num*4 + 4] + mov r0, [esp + push_num*4 + 4] %endif %endmacro @@ -234,8 +234,8 @@ BITS 32 %macro LOAD_3_PARA 0 %ifdef X86_32 mov r0, [esp + push_num*4 + 4] - mov r1, [esp + push_num*4 + 8] - mov r2, [esp + push_num*4 + 12] + mov r1, [esp + push_num*4 + 8] + mov r2, [esp + push_num*4 + 12] %endif %endmacro @@ -267,7 +267,7 @@ BITS 32 %macro LOAD_6_PARA 0 %ifdef X86_32 - push r3 + push r3 push r4 push r5 %assign push_num push_num+3 @@ -310,22 +310,22 @@ BITS 32 %macro LOAD_4_PARA_POP 0 %ifdef X86_32 - pop r3 + pop r3 %endif %endmacro %macro LOAD_5_PARA_POP 0 %ifdef X86_32 pop r4 - pop r3 + pop r3 %endif %endmacro %macro LOAD_6_PARA_POP 0 %ifdef X86_32 pop r5 - pop r4 - pop r3 + pop r4 + pop r3 %endif %endmacro @@ -416,13 +416,13 @@ BITS 32 %macro SIGN_EXTENSION 2 %ifndef X86_32 - movsxd %1, %2 + movsxd %1, %2 %endif %endmacro %macro SIGN_EXTENSIONW 2 %ifndef X86_32 - movsx %1, %2 + movsx %1, %2 %endif %endmacro @@ -438,13 +438,13 @@ BITS 32 %endmacro %macro WELS_AbsW 2 - pxor %2, %2 + pxor %2, %2 psubw %2, %1 pmaxsw %1, %2 %endmacro %macro MMX_XSwap 4 - movq %4, %2 + movq %4, %2 punpckh%1 %4, %3 punpckl%1 %2, %3 %endmacro @@ -485,35 +485,35 @@ BITS 32 ;in: m1, m2, m3, m4, m5, m6, m7, m8 ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 %macro SSE2_TransTwo8x8B 9 - movdqa %9, %8 - SSE2_XSawp bw, %1, %2, %8 - 
SSE2_XSawp bw, %3, %4, %2 - SSE2_XSawp bw, %5, %6, %4 - movdqa %6, %9 - movdqa %9, %4 - SSE2_XSawp bw, %7, %6, %4 + movdqa %9, %8 + SSE2_XSawp bw, %1, %2, %8 + SSE2_XSawp bw, %3, %4, %2 + SSE2_XSawp bw, %5, %6, %4 + movdqa %6, %9 + movdqa %9, %4 + SSE2_XSawp bw, %7, %6, %4 - SSE2_XSawp wd, %1, %3, %6 - SSE2_XSawp wd, %8, %2, %3 - SSE2_XSawp wd, %5, %7, %2 - movdqa %7, %9 - movdqa %9, %3 - SSE2_XSawp wd, %7, %4, %3 + SSE2_XSawp wd, %1, %3, %6 + SSE2_XSawp wd, %8, %2, %3 + SSE2_XSawp wd, %5, %7, %2 + movdqa %7, %9 + movdqa %9, %3 + SSE2_XSawp wd, %7, %4, %3 - SSE2_XSawp dq, %1, %5, %4 - SSE2_XSawp dq, %6, %2, %5 - SSE2_XSawp dq, %8, %7, %2 - movdqa %7, %9 - movdqa %9, %5 - SSE2_XSawp dq, %7, %3, %5 + SSE2_XSawp dq, %1, %5, %4 + SSE2_XSawp dq, %6, %2, %5 + SSE2_XSawp dq, %8, %7, %2 + movdqa %7, %9 + movdqa %9, %5 + SSE2_XSawp dq, %7, %3, %5 - SSE2_XSawp qdq, %1, %8, %3 - SSE2_XSawp qdq, %4, %2, %8 - SSE2_XSawp qdq, %6, %7, %2 - movdqa %7, %9 - movdqa %9, %1 - SSE2_XSawp qdq, %7, %5, %1 - movdqa %5, %9 + SSE2_XSawp qdq, %1, %8, %3 + SSE2_XSawp qdq, %4, %2, %8 + SSE2_XSawp qdq, %6, %7, %2 + movdqa %7, %9 + movdqa %9, %1 + SSE2_XSawp qdq, %7, %5, %1 + movdqa %5, %9 %endmacro ;xmm0, xmm6, xmm7, [eax], [ecx] @@ -528,32 +528,32 @@ BITS 32 ; m2 = m1 + m2, m1 = m1 - m2 %macro SSE2_SumSub 3 - movdqa %3, %2 + movdqa %3, %2 paddw %2, %1 psubw %1, %3 %endmacro -%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d] - mov %3h, %3l - movd %1, e%3x ; i.e, 1% = eax (=b0) - pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0 - pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0 +%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d] + mov %3h, %3l + movd %1, e%3x ; i.e, 1% = eax (=b0) + pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0 + pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0 %endmacro ;copy a dw into a xmm for 8 times %macro SSE2_Copy8Times 2 - movd %1, %2 - punpcklwd %1, %1 - pshufd %1, %1, 0 + movd %1, %2 + punpcklwd %1, %1 + pshufd %1, %1, 0 %endmacro ;copy a db into a xmm for 16 times %macro SSE2_Copy16Times 2 - movd %1, %2 - pshuflw %1, %1, 0 - punpcklqdq %1, %1 - packuswb %1, %1 + movd %1, %2 + pshuflw %1, %1, 0 + punpcklqdq %1, %1 + packuswb %1, %1 %endmacro @@ -564,35 +564,35 @@ BITS 32 ;dw 32,32,32,32,32,32,32,32 for xmm ;dw 32,32,32,32 for mm %macro WELS_DW32 1 - pcmpeqw %1,%1 - psrlw %1,15 - psllw %1,5 + pcmpeqw %1,%1 + psrlw %1,15 + psllw %1,5 %endmacro ;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm ;dw 1, 1, 1, 1 for mm %macro WELS_DW1 1 - pcmpeqw %1,%1 - psrlw %1,15 + pcmpeqw %1,%1 + psrlw %1,15 %endmacro ;all 0 for xmm and mm %macro WELS_Zero 1 - pxor %1, %1 + pxor %1, %1 %endmacro ;dd 1, 1, 1, 1 for xmm ;dd 1, 1 for mm %macro WELS_DD1 1 - pcmpeqw %1,%1 - psrld %1,31 + pcmpeqw %1,%1 + psrld %1,31 %endmacro ;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 %macro WELS_DB1 1 - pcmpeqw %1,%1 - psrlw %1,15 - packuswb %1,%1 + pcmpeqw %1,%1 + psrlw %1,15 + packuswb %1,%1 %endmacro diff --git a/codec/common/x86/cpuid.asm b/codec/common/x86/cpuid.asm index 62739d1a..0ac8f7c3 100644 --- a/codec/common/x86/cpuid.asm +++ b/codec/common/x86/cpuid.asm @@ -29,13 +29,13 @@ ;* POSSIBILITY OF SUCH DAMAGE. 
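
For orientation, a minimal C sketch (SSE2 intrinsics; helper names are mine, not from the source) of what the WELS_DW32 constant-building sequence and the SSE2_Copy8Times broadcast in asm_inc.asm above compute — the asm builds small per-lane constants with pcmpeqw plus shifts instead of loading them from memory:

```c
#include <stdint.h>
#include <emmintrin.h>   // SSE2 intrinsics

// Mirrors WELS_DW32: pcmpeqw -> all ones, psrlw 15 -> 1 per word,
// psllw 5 -> 32 in every 16-bit lane, with no memory load.
static inline __m128i wels_dw32(void) {
    __m128i x = _mm_setzero_si128();
    x = _mm_cmpeq_epi16(x, x);    // 0xFFFF in each word
    x = _mm_srli_epi16(x, 15);    // 0x0001 in each word
    return _mm_slli_epi16(x, 5);  // 0x0020 (= 32) in each word
}

// Mirrors SSE2_Copy8Times (movd + punpcklwd + pshufd 0): broadcast one
// 16-bit value to all 8 lanes; equivalent to _mm_set1_epi16(v).
static inline __m128i wels_copy8times(int16_t v) {
    __m128i x = _mm_cvtsi32_si128((uint16_t)v);  // movd
    x = _mm_unpacklo_epi16(x, x);                // punpcklwd x,x
    return _mm_shuffle_epi32(x, 0);              // pshufd x,x,0
}
```
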
;* ;* -;* cpu_mmx.asm +;* cpu_mmx.asm ;* ;* Abstract -;* verify cpuid feature support and cpuid detection +;* verify cpuid feature support and cpuid detection ;* ;* History -;* 04/29/2009 Created +;* 04/29/2009 Created ;* ;*************************************************************************/ @@ -115,13 +115,13 @@ WELS_EXTERN WelsCPUId %elifdef X86_32 WELS_EXTERN WelsCPUId - push ebx - push edi + push ebx + push edi - mov eax, [esp+12] ; operating index + mov eax, [esp+12] ; operating index mov edi, [esp+24] mov ecx, [edi] - cpuid ; cpuid + cpuid ; cpuid ; processing various information return mov edi, [esp+16] @@ -133,7 +133,7 @@ WELS_EXTERN WelsCPUId mov edi, [esp+28] mov [edi], edx - pop edi + pop edi pop ebx ret @@ -145,31 +145,31 @@ WELS_EXTERN WelsCPUId ;**************************************************************************************************** WELS_EXTERN WelsCPUSupportAVX %ifdef WIN64 - mov eax, ecx - mov ecx, edx + mov eax, ecx + mov ecx, edx %elifdef UNIX64 - mov eax, edi - mov ecx, esi + mov eax, edi + mov ecx, esi %else - mov eax, [esp+4] - mov ecx, [esp+8] + mov eax, [esp+4] + mov ecx, [esp+8] %endif - ; refer to detection of AVX addressed in INTEL AVX manual document - and ecx, 018000000H - cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags - jne avx_not_supported - ; processor supports AVX instructions and XGETBV is enabled by OS - mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register - XGETBV ; result in EDX:EAX - and eax, 06H - cmp eax, 06H ; check OS has enabled both XMM and YMM state support - jne avx_not_supported - mov eax, 1 - ret + ; refer to detection of AVX addressed in INTEL AVX manual document + and ecx, 018000000H + cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags + jne avx_not_supported + ; processor supports AVX instructions and XGETBV is enabled by OS + mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register + XGETBV ; result in EDX:EAX + and eax, 06H + cmp eax, 06H ; check OS has enabled both XMM and YMM state support + jne avx_not_supported + mov eax, 1 + ret avx_not_supported: - mov eax, 0 - ret + mov eax, 0 + ret ; need call after cpuid=1 and eax, ecx flag got then @@ -178,35 +178,35 @@ avx_not_supported: ;**************************************************************************************************** WELS_EXTERN WelsCPUSupportFMA %ifdef WIN64 - mov eax, ecx - mov ecx, edx + mov eax, ecx + mov ecx, edx %elifdef UNIX64 - mov eax, edi - mov ecx, esi + mov eax, edi + mov ecx, esi %else - mov eax, [esp+4] - mov ecx, [esp+8] + mov eax, [esp+4] + mov ecx, [esp+8] %endif - ; refer to detection of FMA addressed in INTEL AVX manual document - and ecx, 018001000H - cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags - jne fma_not_supported - ; processor supports AVX,FMA instructions and XGETBV is enabled by OS - mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register - XGETBV ; result in EDX:EAX - and eax, 06H - cmp eax, 06H ; check OS has enabled both XMM and YMM state support - jne fma_not_supported - mov eax, 1 - ret + ; refer to detection of FMA addressed in INTEL AVX manual document + and ecx, 018001000H + cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags + jne fma_not_supported + ; processor supports AVX,FMA instructions and XGETBV is enabled by OS + mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register + XGETBV ; result in EDX:EAX + and eax, 06H + cmp eax, 06H ; check OS has enabled both XMM and YMM state support + jne fma_not_supported + mov eax, 1 + ret fma_not_supported: - mov eax, 0 
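
For reference, the WelsCPUSupportAVX / WelsCPUSupportFMA routines above follow the standard CPUID/XGETBV recipe: CPUID.1:ECX must report OSXSAVE+AVX (mask 018000000H, plus the FMA bit for the FMA variant, 018001000H), and XCR0 read via XGETBV must have the XMM and YMM state bits (06H) set. A hedged C sketch of the same checks (function names are mine; `_xgetbv` may require `-mxsave` with GCC/Clang):

```c
#include <stdint.h>
#include <immintrin.h>   // _xgetbv

// Same contract as the asm routines: the caller has already executed
// CPUID leaf 1 and passes the resulting ECX feature flags.
static int wels_cpu_supports_avx_c(uint32_t cpuid1_ecx) {
    // bit 27 = OSXSAVE, bit 28 = AVX -> 018000000H in the asm
    if ((cpuid1_ecx & 0x18000000u) != 0x18000000u)
        return 0;
    // XGETBV(0) reads XCR0; bits 1|2 = OS saves XMM and YMM state (06H)
    return ((uint32_t)_xgetbv(0) & 0x6u) == 0x6u;
}

static int wels_cpu_supports_fma_c(uint32_t cpuid1_ecx) {
    // additionally bit 12 = FMA -> 018001000H in the asm
    if ((cpuid1_ecx & 0x18001000u) != 0x18001000u)
        return 0;
    return ((uint32_t)_xgetbv(0) & 0x6u) == 0x6u;
}
```
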
- ret + mov eax, 0 + ret ;****************************************************************************************** ; void WelsEmms() ;****************************************************************************************** WELS_EXTERN WelsEmms - emms ; empty mmx technology states - ret + emms ; empty mmx technology states + ret diff --git a/codec/common/x86/deblock.asm b/codec/common/x86/deblock.asm index 2015c71e..c3942800 100644 --- a/codec/common/x86/deblock.asm +++ b/codec/common/x86/deblock.asm @@ -57,1580 +57,1580 @@ SECTION .text WELS_EXTERN DeblockLumaLt4V_ssse3 - push rbp - mov r11,[rsp + 16 + 20h] ; pTC - PUSH_XMM 16 - sub rsp,1B0h - lea rbp,[rsp+20h] - movd xmm4,r8d - movd xmm2,r9d - mov qword [rbp+180h],r12 - mov r10,rcx - movsxd r12,edx - add edx,edx - movsxd rdx,edx - sub r10,r12 - movsx r8d,byte [r11] - pxor xmm3,xmm3 - punpcklwd xmm2,xmm2 - movaps [rbp+50h],xmm14 - lea rax,[r12+r12*2] - movdqa xmm14,[rdx+rcx] - neg rax - pshufd xmm0,xmm2,0 - movd xmm2,r8d - movsx edx,byte [r11+1] - movsx r8d,byte [r11+2] - movsx r11d,byte [r11+3] - movaps [rbp+70h],xmm12 - movd xmm1,edx - movaps [rbp+80h],xmm11 - movd xmm12,r8d - movd xmm11,r11d - movdqa xmm5, [rax+rcx] - lea rax,[r12+r12] - punpcklwd xmm12,xmm12 - neg rax - punpcklwd xmm11,xmm11 - movaps [rbp],xmm8 - movdqa xmm8, [r10] - punpcklwd xmm2,xmm2 - punpcklwd xmm1,xmm1 - punpcklqdq xmm12,xmm12 - punpcklqdq xmm11,xmm11 - punpcklqdq xmm2,xmm2 - punpcklqdq xmm1,xmm1 - shufps xmm12,xmm11,88h - movdqa xmm11,xmm8 - movaps [rbp+30h],xmm9 - movdqa xmm9,[rcx] - shufps xmm2,xmm1,88h - movdqa xmm1,xmm5 - punpcklbw xmm11,xmm3 - movaps [rbp+20h],xmm6 - movaps [rbp+60h],xmm13 - movdqa xmm13,xmm11 - movaps [rbp+90h],xmm10 - movdqa xmm10,xmm9 - movdqa xmm6,[rax+rcx] - punpcklbw xmm1,xmm3 - movaps [rbp+0A0h],xmm12 - psubw xmm13,xmm1 - movaps [rbp+40h],xmm15 - movdqa xmm15,xmm14 - movaps [rbp+10h],xmm7 - movdqa xmm7,xmm6 - punpcklbw xmm10,xmm3 - movdqa xmm12,[r12+rcx] - punpcklbw xmm7,xmm3 - punpcklbw xmm12,xmm3 - punpcklbw xmm15,xmm3 - pabsw xmm3,xmm13 - movdqa xmm13,xmm10 - psubw xmm13,xmm15 - movdqa [rbp+0F0h],xmm15 - pabsw xmm15,xmm13 - movdqa xmm13,xmm11 - movdqa [rbp+0B0h],xmm1 - movdqa xmm1,xmm0 - pavgw xmm13,xmm10 - pcmpgtw xmm1,xmm3 - movdqa [rbp+120h],xmm13 - movaps xmm13,xmm2 - punpcklwd xmm4,xmm4 - movdqa xmm3,xmm0 - movdqa [rbp+100h],xmm1 - psubw xmm13,xmm1 - movdqa xmm1,xmm10 - pcmpgtw xmm3,xmm15 - pshufd xmm4,xmm4,0 - psubw xmm1,xmm11 - movdqa [rbp+0D0h],xmm10 - psubw xmm13,xmm3 - movdqa [rbp+110h],xmm3 - pabsw xmm15,xmm1 - movdqa xmm3,xmm4 - psubw xmm10,xmm12 - pcmpgtw xmm3,xmm15 - pabsw xmm15,xmm10 - movdqa xmm10,xmm0 - psllw xmm1,2 - movdqa [rbp+0C0h],xmm11 - psubw xmm11,xmm7 - pcmpgtw xmm10,xmm15 - pabsw xmm11,xmm11 - movdqa xmm15,xmm0 - pand xmm3,xmm10 - pcmpgtw xmm15,xmm11 - movaps xmm11,xmm2 - pxor xmm10,xmm10 - pand xmm3,xmm15 - pcmpgtw xmm11,xmm10 - pcmpeqw xmm10,xmm2 - por xmm11,xmm10 - pand xmm3,xmm11 - movdqa xmm11,xmm7 - psubw xmm11,xmm12 - pxor xmm15,xmm15 - paddw xmm11,xmm1 - psubw xmm15,xmm13 - movdqa [rbp+0E0h],xmm12 - paddw xmm11,[FOUR_16B_SSE2] - pxor xmm12,xmm12 - psraw xmm11,3 - punpckhbw xmm8,xmm12 - pmaxsw xmm15,xmm11 - punpckhbw xmm5,xmm12 - movdqa xmm11,xmm8 - pminsw xmm13,xmm15 - psubw xmm11,xmm5 - punpckhbw xmm9,xmm12 - pand xmm13,xmm3 - movdqa [rbp+130h],xmm13 - pabsw xmm13,xmm11 - punpckhbw xmm14,xmm12 - movdqa xmm11,xmm9 - psubw xmm11,xmm14 - movdqa xmm15,xmm0 - movdqa [rbp+140h],xmm14 - pabsw xmm14,xmm11 - movdqa xmm11,xmm8 - pcmpgtw xmm15,xmm14 - movdqa xmm1,[r12+rcx] - pavgw xmm11,xmm9 - movdqa 
[rbp+170h],xmm11 - movdqa xmm10,xmm9 - punpckhbw xmm6,xmm12 - psubw xmm10,xmm8 - punpckhbw xmm1,xmm12 - movdqa xmm12,xmm0 - movaps xmm11,[rbp+0A0h] - pcmpgtw xmm12,xmm13 - movaps xmm13,xmm11 - psubw xmm13,xmm12 - movdqa [rbp+160h],xmm15 - psubw xmm13,xmm15 - movdqa xmm15,xmm9 - psubw xmm15,xmm1 - movdqa [rbp+150h],xmm12 - pabsw xmm12,xmm10 - pabsw xmm14,xmm15 - movdqa xmm15,xmm8 - pcmpgtw xmm4,xmm12 - movdqa xmm12,xmm0 - psubw xmm15,xmm6 - pcmpgtw xmm12,xmm14 - pabsw xmm14,xmm15 - psllw xmm10,2 - pcmpgtw xmm0,xmm14 - movdqa xmm14,xmm6 - psubw xmm14,xmm1 - pand xmm4,xmm12 - paddw xmm14,xmm10 - pand xmm4,xmm0 - paddw xmm14,[FOUR_16B_SSE2] - pxor xmm15,xmm15 - movaps xmm12,xmm11 - psubw xmm15,xmm13 - pxor xmm0,xmm0 - psraw xmm14,3 - pcmpgtw xmm12,xmm0 - pcmpeqw xmm0,xmm11 - pmaxsw xmm15,xmm14 - por xmm12,xmm0 - movdqa xmm0,[rbp+120h] - pminsw xmm13,xmm15 - movdqa xmm15,[rbp+0B0h] - movdqa xmm10,xmm7 - pand xmm4,xmm12 - paddw xmm15,xmm0 - pxor xmm12,xmm12 - paddw xmm10,xmm7 - movdqa xmm14,xmm12 - psubw xmm15,xmm10 - psubw xmm14,xmm2 - psraw xmm15,1 - pmaxsw xmm15,xmm14 - movdqa xmm10,xmm6 - pminsw xmm15,xmm2 - paddw xmm10,xmm6 - pand xmm15,xmm3 - psubw xmm12,xmm11 - pand xmm15,[rbp+100h] - pand xmm13,xmm4 - paddw xmm7,xmm15 - paddw xmm8,xmm13 - movdqa xmm15,[rbp+170h] - psubw xmm9,xmm13 - paddw xmm5,xmm15 - psubw xmm5,xmm10 - psraw xmm5,1 - pmaxsw xmm5,xmm12 - pminsw xmm5,xmm11 - pand xmm5,xmm4 - pand xmm5,[rbp+150h] - paddw xmm6,xmm5 - movdqa xmm5,[rbp+0C0h] - packuswb xmm7,xmm6 - movdqa xmm6,[rbp+130h] - paddw xmm5,xmm6 - packuswb xmm5,xmm8 - movdqa xmm8,[rbp+0D0h] - psubw xmm8,xmm6 - movdqa xmm6,[rbp+0F0h] - paddw xmm6,xmm0 - movdqa xmm0,[rbp+0E0h] - packuswb xmm8,xmm9 - movdqa xmm9,xmm0 - paddw xmm9,xmm0 - psubw xmm6,xmm9 - psraw xmm6,1 - pmaxsw xmm14,xmm6 - pminsw xmm2,xmm14 - pand xmm2,xmm3 - pand xmm2,[rbp+110h] - paddw xmm0,xmm2 - movdqa xmm2,[rbp+140h] - paddw xmm2,xmm15 - movdqa xmm15,xmm1 - paddw xmm15,xmm1 - psubw xmm2,xmm15 - psraw xmm2,1 - pmaxsw xmm12,xmm2 - pminsw xmm11,xmm12 - pand xmm11,xmm4 - pand xmm11,[rbp+160h] - paddw xmm1,xmm11 - movdqa [rax+rcx],xmm7 - movdqa [r10],xmm5 - packuswb xmm0,xmm1 - movdqa [rcx],xmm8 - movdqa [r12+rcx],xmm0 - mov r12,qword [rbp+180h] - lea rsp,[rbp+190h] - POP_XMM - pop rbp - ret + push rbp + mov r11,[rsp + 16 + 20h] ; pTC + PUSH_XMM 16 + sub rsp,1B0h + lea rbp,[rsp+20h] + movd xmm4,r8d + movd xmm2,r9d + mov qword [rbp+180h],r12 + mov r10,rcx + movsxd r12,edx + add edx,edx + movsxd rdx,edx + sub r10,r12 + movsx r8d,byte [r11] + pxor xmm3,xmm3 + punpcklwd xmm2,xmm2 + movaps [rbp+50h],xmm14 + lea rax,[r12+r12*2] + movdqa xmm14,[rdx+rcx] + neg rax + pshufd xmm0,xmm2,0 + movd xmm2,r8d + movsx edx,byte [r11+1] + movsx r8d,byte [r11+2] + movsx r11d,byte [r11+3] + movaps [rbp+70h],xmm12 + movd xmm1,edx + movaps [rbp+80h],xmm11 + movd xmm12,r8d + movd xmm11,r11d + movdqa xmm5, [rax+rcx] + lea rax,[r12+r12] + punpcklwd xmm12,xmm12 + neg rax + punpcklwd xmm11,xmm11 + movaps [rbp],xmm8 + movdqa xmm8, [r10] + punpcklwd xmm2,xmm2 + punpcklwd xmm1,xmm1 + punpcklqdq xmm12,xmm12 + punpcklqdq xmm11,xmm11 + punpcklqdq xmm2,xmm2 + punpcklqdq xmm1,xmm1 + shufps xmm12,xmm11,88h + movdqa xmm11,xmm8 + movaps [rbp+30h],xmm9 + movdqa xmm9,[rcx] + shufps xmm2,xmm1,88h + movdqa xmm1,xmm5 + punpcklbw xmm11,xmm3 + movaps [rbp+20h],xmm6 + movaps [rbp+60h],xmm13 + movdqa xmm13,xmm11 + movaps [rbp+90h],xmm10 + movdqa xmm10,xmm9 + movdqa xmm6,[rax+rcx] + punpcklbw xmm1,xmm3 + movaps [rbp+0A0h],xmm12 + psubw xmm13,xmm1 + movaps [rbp+40h],xmm15 + movdqa xmm15,xmm14 + movaps 
[rbp+10h],xmm7 + movdqa xmm7,xmm6 + punpcklbw xmm10,xmm3 + movdqa xmm12,[r12+rcx] + punpcklbw xmm7,xmm3 + punpcklbw xmm12,xmm3 + punpcklbw xmm15,xmm3 + pabsw xmm3,xmm13 + movdqa xmm13,xmm10 + psubw xmm13,xmm15 + movdqa [rbp+0F0h],xmm15 + pabsw xmm15,xmm13 + movdqa xmm13,xmm11 + movdqa [rbp+0B0h],xmm1 + movdqa xmm1,xmm0 + pavgw xmm13,xmm10 + pcmpgtw xmm1,xmm3 + movdqa [rbp+120h],xmm13 + movaps xmm13,xmm2 + punpcklwd xmm4,xmm4 + movdqa xmm3,xmm0 + movdqa [rbp+100h],xmm1 + psubw xmm13,xmm1 + movdqa xmm1,xmm10 + pcmpgtw xmm3,xmm15 + pshufd xmm4,xmm4,0 + psubw xmm1,xmm11 + movdqa [rbp+0D0h],xmm10 + psubw xmm13,xmm3 + movdqa [rbp+110h],xmm3 + pabsw xmm15,xmm1 + movdqa xmm3,xmm4 + psubw xmm10,xmm12 + pcmpgtw xmm3,xmm15 + pabsw xmm15,xmm10 + movdqa xmm10,xmm0 + psllw xmm1,2 + movdqa [rbp+0C0h],xmm11 + psubw xmm11,xmm7 + pcmpgtw xmm10,xmm15 + pabsw xmm11,xmm11 + movdqa xmm15,xmm0 + pand xmm3,xmm10 + pcmpgtw xmm15,xmm11 + movaps xmm11,xmm2 + pxor xmm10,xmm10 + pand xmm3,xmm15 + pcmpgtw xmm11,xmm10 + pcmpeqw xmm10,xmm2 + por xmm11,xmm10 + pand xmm3,xmm11 + movdqa xmm11,xmm7 + psubw xmm11,xmm12 + pxor xmm15,xmm15 + paddw xmm11,xmm1 + psubw xmm15,xmm13 + movdqa [rbp+0E0h],xmm12 + paddw xmm11,[FOUR_16B_SSE2] + pxor xmm12,xmm12 + psraw xmm11,3 + punpckhbw xmm8,xmm12 + pmaxsw xmm15,xmm11 + punpckhbw xmm5,xmm12 + movdqa xmm11,xmm8 + pminsw xmm13,xmm15 + psubw xmm11,xmm5 + punpckhbw xmm9,xmm12 + pand xmm13,xmm3 + movdqa [rbp+130h],xmm13 + pabsw xmm13,xmm11 + punpckhbw xmm14,xmm12 + movdqa xmm11,xmm9 + psubw xmm11,xmm14 + movdqa xmm15,xmm0 + movdqa [rbp+140h],xmm14 + pabsw xmm14,xmm11 + movdqa xmm11,xmm8 + pcmpgtw xmm15,xmm14 + movdqa xmm1,[r12+rcx] + pavgw xmm11,xmm9 + movdqa [rbp+170h],xmm11 + movdqa xmm10,xmm9 + punpckhbw xmm6,xmm12 + psubw xmm10,xmm8 + punpckhbw xmm1,xmm12 + movdqa xmm12,xmm0 + movaps xmm11,[rbp+0A0h] + pcmpgtw xmm12,xmm13 + movaps xmm13,xmm11 + psubw xmm13,xmm12 + movdqa [rbp+160h],xmm15 + psubw xmm13,xmm15 + movdqa xmm15,xmm9 + psubw xmm15,xmm1 + movdqa [rbp+150h],xmm12 + pabsw xmm12,xmm10 + pabsw xmm14,xmm15 + movdqa xmm15,xmm8 + pcmpgtw xmm4,xmm12 + movdqa xmm12,xmm0 + psubw xmm15,xmm6 + pcmpgtw xmm12,xmm14 + pabsw xmm14,xmm15 + psllw xmm10,2 + pcmpgtw xmm0,xmm14 + movdqa xmm14,xmm6 + psubw xmm14,xmm1 + pand xmm4,xmm12 + paddw xmm14,xmm10 + pand xmm4,xmm0 + paddw xmm14,[FOUR_16B_SSE2] + pxor xmm15,xmm15 + movaps xmm12,xmm11 + psubw xmm15,xmm13 + pxor xmm0,xmm0 + psraw xmm14,3 + pcmpgtw xmm12,xmm0 + pcmpeqw xmm0,xmm11 + pmaxsw xmm15,xmm14 + por xmm12,xmm0 + movdqa xmm0,[rbp+120h] + pminsw xmm13,xmm15 + movdqa xmm15,[rbp+0B0h] + movdqa xmm10,xmm7 + pand xmm4,xmm12 + paddw xmm15,xmm0 + pxor xmm12,xmm12 + paddw xmm10,xmm7 + movdqa xmm14,xmm12 + psubw xmm15,xmm10 + psubw xmm14,xmm2 + psraw xmm15,1 + pmaxsw xmm15,xmm14 + movdqa xmm10,xmm6 + pminsw xmm15,xmm2 + paddw xmm10,xmm6 + pand xmm15,xmm3 + psubw xmm12,xmm11 + pand xmm15,[rbp+100h] + pand xmm13,xmm4 + paddw xmm7,xmm15 + paddw xmm8,xmm13 + movdqa xmm15,[rbp+170h] + psubw xmm9,xmm13 + paddw xmm5,xmm15 + psubw xmm5,xmm10 + psraw xmm5,1 + pmaxsw xmm5,xmm12 + pminsw xmm5,xmm11 + pand xmm5,xmm4 + pand xmm5,[rbp+150h] + paddw xmm6,xmm5 + movdqa xmm5,[rbp+0C0h] + packuswb xmm7,xmm6 + movdqa xmm6,[rbp+130h] + paddw xmm5,xmm6 + packuswb xmm5,xmm8 + movdqa xmm8,[rbp+0D0h] + psubw xmm8,xmm6 + movdqa xmm6,[rbp+0F0h] + paddw xmm6,xmm0 + movdqa xmm0,[rbp+0E0h] + packuswb xmm8,xmm9 + movdqa xmm9,xmm0 + paddw xmm9,xmm0 + psubw xmm6,xmm9 + psraw xmm6,1 + pmaxsw xmm14,xmm6 + pminsw xmm2,xmm14 + pand xmm2,xmm3 + pand xmm2,[rbp+110h] + paddw xmm0,xmm2 + 
movdqa xmm2,[rbp+140h] + paddw xmm2,xmm15 + movdqa xmm15,xmm1 + paddw xmm15,xmm1 + psubw xmm2,xmm15 + psraw xmm2,1 + pmaxsw xmm12,xmm2 + pminsw xmm11,xmm12 + pand xmm11,xmm4 + pand xmm11,[rbp+160h] + paddw xmm1,xmm11 + movdqa [rax+rcx],xmm7 + movdqa [r10],xmm5 + packuswb xmm0,xmm1 + movdqa [rcx],xmm8 + movdqa [r12+rcx],xmm0 + mov r12,qword [rbp+180h] + lea rsp,[rbp+190h] + POP_XMM + pop rbp + ret WELS_EXTERN DeblockLumaEq4V_ssse3 - mov rax,rsp - push rbx - push rbp - push rsi - push rdi - sub rsp,1D8h - movaps [rax-38h],xmm6 - movaps [rax-48h],xmm7 - movaps [rax-58h],xmm8 - pxor xmm1,xmm1 - movsxd r10,edx - mov rbp,rcx - mov r11d,r8d - mov rdx,rcx - mov rdi,rbp - mov rbx,rbp - movdqa xmm5,[rbp] - movaps [rax-68h],xmm9 - movaps [rax-78h],xmm10 - punpcklbw xmm5,xmm1 - movaps [rax-88h],xmm11 - movaps [rax-98h],xmm12 - movaps [rax-0A8h],xmm13 - movaps [rax-0B8h],xmm14 - movdqa xmm14,[r10+rbp] - movaps [rax-0C8h],xmm15 - lea eax,[r10*4] - movsxd r8,eax - lea eax,[r10+r10*2] - movsxd rcx,eax - lea eax,[r10+r10] - sub rdx,r8 - punpcklbw xmm14,xmm1 - movdqa [rsp+90h],xmm5 - movdqa [rsp+30h],xmm14 - movsxd rsi,eax - movsx eax,r11w - sub rdi,rcx - sub rbx,rsi - mov r8,rbp - sub r8,r10 - movd xmm0,eax - movsx eax,r9w - movdqa xmm12,[rdi] - movdqa xmm6, [rsi+rbp] - movdqa xmm13,[rbx] - punpcklwd xmm0,xmm0 - pshufd xmm11,xmm0,0 - punpcklbw xmm13,xmm1 - punpcklbw xmm6,xmm1 - movdqa xmm8,[r8] - movd xmm0,eax - movdqa xmm10,xmm11 - mov eax,2 - punpcklbw xmm8,xmm1 - punpcklbw xmm12,xmm1 - cwde - punpcklwd xmm0,xmm0 - psraw xmm10,2 - movdqa xmm1,xmm8 - movdqa [rsp+0F0h],xmm13 - movdqa [rsp+0B0h],xmm8 - pshufd xmm7,xmm0,0 - psubw xmm1,xmm13 - movdqa xmm0,xmm5 - movdqa xmm4,xmm7 - movdqa xmm2,xmm7 - psubw xmm0,xmm8 - pabsw xmm3,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm5 - movdqa [rsp+40h],xmm7 - movdqa [rsp+60h],xmm6 - pcmpgtw xmm4,xmm0 - psubw xmm1,xmm14 - pabsw xmm0,xmm1 - pcmpgtw xmm2,xmm0 - pand xmm4,xmm2 - movdqa xmm0,xmm11 - pcmpgtw xmm0,xmm3 - pand xmm4,xmm0 - movd xmm0,eax - movdqa [rsp+20h],xmm4 - punpcklwd xmm0,xmm0 - pshufd xmm2,xmm0,0 - paddw xmm10,xmm2 - movdqa [rsp+0A0h],xmm2 - movdqa xmm15,xmm7 - pxor xmm4,xmm4 - movdqa xmm0,xmm8 - psubw xmm0,xmm12 - mov eax,4 - pabsw xmm0,xmm0 - movdqa xmm1,xmm10 - cwde - pcmpgtw xmm15,xmm0 - pcmpgtw xmm1,xmm3 - movdqa xmm3,xmm7 - movdqa xmm7,[rdx] - movdqa xmm0,xmm5 - psubw xmm0,xmm6 - pand xmm15,xmm1 - punpcklbw xmm7,xmm4 - movdqa xmm9,xmm15 - pabsw xmm0,xmm0 - psllw xmm7,1 - pandn xmm9,xmm12 - pcmpgtw xmm3,xmm0 - paddw xmm7,xmm12 - movd xmm0,eax - pand xmm3,xmm1 - paddw xmm7,xmm12 - punpcklwd xmm0,xmm0 - paddw xmm7,xmm12 - pshufd xmm1,xmm0,0 - paddw xmm7,xmm13 - movdqa xmm0,xmm3 - pandn xmm0,xmm6 - paddw xmm7,xmm8 - movdqa [rsp+70h],xmm1 - paddw xmm7,xmm5 - movdqa [rsp+120h],xmm0 - movdqa xmm0,[rcx+rbp] - punpcklbw xmm0,xmm4 - paddw xmm7,xmm1 - movdqa xmm4,xmm15 - psllw xmm0,1 - psraw xmm7,3 - paddw xmm0,xmm6 - pand xmm7,xmm15 - paddw xmm0,xmm6 - paddw xmm0,xmm6 - paddw xmm0,xmm14 - movdqa xmm6,xmm15 - paddw xmm0,xmm5 - pandn xmm6,xmm13 - paddw xmm0,xmm8 - paddw xmm0,xmm1 - psraw xmm0,3 - movdqa xmm1,xmm12 - paddw xmm1,xmm13 - pand xmm0,xmm3 - movdqa [rsp+100h],xmm0 - movdqa xmm0,xmm8 - paddw xmm0,xmm5 - paddw xmm1,xmm0 - movdqa xmm0,xmm3 - paddw xmm1,xmm2 - psraw xmm1,2 - pandn xmm0,xmm14 - pand xmm4,xmm1 - movdqa [rsp+0E0h],xmm0 - movdqa xmm0,xmm5 - paddw xmm0,xmm8 - movdqa xmm1,[rsp+60h] - paddw xmm1,xmm14 - movdqa xmm14,xmm3 - paddw xmm1,xmm0 - movdqa xmm0,xmm8 - paddw xmm0,[rsp+30h] - paddw xmm1,xmm2 - psraw xmm1,2 - pand xmm14,xmm1 - movdqa xmm1,xmm13 
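
As a reading aid for DeblockLumaLt4V_ssse3, whose re-indented body ends just above: the routine vectorises the bS<4 ("weak") H.264 luma edge filter. The following scalar C sketch is written from the standard H.264 deblocking equations, not copied from the project's own C reference; the helper and function names (clip3, clip255, LumaWeakFilterCol) are illustrative only. pPix points at q0 and iStride is the step to the next row; the ((p0 + q0 + 1) >> 1) term corresponds to the pavgw visible in the assembly, and the +4 / >>3 pair to FOUR_16B_SSE2 and psraw 3.

#include <stdint.h>
#include <stdlib.h>

static int     clip3 (int iLow, int iHigh, int iV) { return iV < iLow ? iLow : (iV > iHigh ? iHigh : iV); }
static uint8_t clip255 (int iV)                    { return (uint8_t) (iV < 0 ? 0 : (iV > 255 ? 255 : iV)); }

/* One column of the bS<4 luma filter across a horizontal edge.
 * pPix points at q0; p0..p2 lie above it, q1..q2 below. */
static void LumaWeakFilterCol (uint8_t* pPix, int iStride, int iAlpha, int iBeta, int iTc0) {
  int p2 = pPix[-3 * iStride], p1 = pPix[-2 * iStride], p0 = pPix[-iStride];
  int q0 = pPix[0],            q1 = pPix[ iStride],     q2 = pPix[ 2 * iStride];
  if (iTc0 < 0)                                                       /* tc0 == -1 disables the column   */
    return;
  if (abs (p0 - q0) >= iAlpha || abs (p1 - p0) >= iBeta || abs (q1 - q0) >= iBeta)
    return;                                                           /* alpha/beta edge-activity gate   */
  int bP  = abs (p2 - p0) < iBeta;                                    /* extra tap allowed on the p side */
  int bQ  = abs (q2 - q0) < iBeta;                                    /* ... and on the q side           */
  int iTc = iTc0 + bP + bQ;                                           /* widened clipping range          */
  int iDelta = clip3 (-iTc, iTc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
  pPix[-iStride] = clip255 (p0 + iDelta);                             /* p0' */
  pPix[0]        = clip255 (q0 - iDelta);                             /* q0' */
  if (bP) pPix[-2 * iStride] = (uint8_t) (p1 + clip3 (-iTc0, iTc0, (p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1));
  if (bQ) pPix[ iStride]     = (uint8_t) (q1 + clip3 (-iTc0, iTc0, (q2 + ((p0 + q0 + 1) >> 1) - 2 * q1) >> 1));
}

The SSSE3 routine evaluates sixteen such columns at once, which is why every scalar comparison above appears as a pabsw/pcmpgtw pair followed by pand masking.
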
- paddw xmm1,xmm13 - paddw xmm1,xmm0 - paddw xmm1,xmm2 - psraw xmm1,2 - movdqa xmm0,[rsp+30h] - movdqa xmm2,xmm13 - movdqa xmm5,xmm15 - paddw xmm0,[rsp+70h] - pandn xmm5,xmm1 - paddw xmm2,xmm8 - movdqa xmm8,[rsp+90h] - movdqa xmm1,xmm12 - paddw xmm2,xmm8 - psllw xmm2,1 - paddw xmm2,xmm0 - paddw xmm1,xmm2 - movdqa xmm0,xmm8 - movdqa xmm8,xmm3 - movdqa xmm2,[rsp+30h] - paddw xmm0,xmm13 - psraw xmm1,3 - pand xmm15,xmm1 - movdqa xmm1,xmm2 - paddw xmm1,xmm2 - paddw xmm2,[rsp+90h] - paddw xmm2,[rsp+0B0h] - paddw xmm1,xmm0 - movdqa xmm0,xmm13 - movdqa xmm13,[r8] - paddw xmm0, [rsp+70h] - paddw xmm1, [rsp+0A0h] - psllw xmm2,1 - paddw xmm2,xmm0 - psraw xmm1,2 - movdqa xmm0, [rdi] - pandn xmm8,xmm1 - movdqa xmm1, [rsp+60h] - paddw xmm1,xmm2 - movdqa xmm2, [rbx] - psraw xmm1,3 - pand xmm3,xmm1 - movdqa xmm1, [rbp] - movdqa [rsp+0D0h],xmm3 - pxor xmm3,xmm3 - punpckhbw xmm0,xmm3 - punpckhbw xmm1,xmm3 - punpckhbw xmm13,xmm3 - movdqa [rsp+0C0h],xmm0 - movdqa xmm0,[r10+rbp] - movdqa [rsp],xmm1 - punpckhbw xmm0,xmm3 - punpckhbw xmm2,xmm3 - movdqa [rsp+80h],xmm0 - movdqa xmm0,[rsi+rbp] - movdqa [rsp+10h],xmm13 - punpckhbw xmm0,xmm3 - movdqa [rsp+50h],xmm0 - movdqa xmm0,xmm1 - movdqa xmm1,xmm13 - psubw xmm0,xmm13 - psubw xmm1,xmm2 - pabsw xmm3,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,[rsp] - movdqa xmm13,[rsp+40h] - movdqa [rsp+110h],xmm2 - psubw xmm1, [rsp+80h] - pcmpgtw xmm13,xmm0 - pcmpgtw xmm11,xmm3 - pabsw xmm0,xmm1 - pcmpgtw xmm10,xmm3 - movdqa xmm1, [rsp+40h] - movdqa xmm2,xmm1 - movdqa xmm3,xmm1 - pcmpgtw xmm2,xmm0 - movdqa xmm0, [rsp+10h] - pand xmm13,xmm2 - pand xmm13,xmm11 - movdqa xmm11,[rsp+0C0h] - psubw xmm0,xmm11 - pabsw xmm0,xmm0 - pcmpgtw xmm3,xmm0 - pand xmm3,xmm10 - movdqa xmm0,[rsp] - psubw xmm0,[rsp+50h] - movdqa xmm2,[rdx] - pabsw xmm0,xmm0 - por xmm7,xmm9 - movdqa xmm9,[rsp+20h] - pcmpgtw xmm1,xmm0 - pand xmm9,xmm7 - movdqa xmm7,[rsp+20h] - movdqa xmm0,xmm7 - pandn xmm0,xmm12 - movdqa xmm12,[rsp+110h] - pand xmm1,xmm10 - movdqa xmm10,[rsp+70h] - movdqa [rsp+40h],xmm1 - movdqa xmm1,xmm13 - por xmm9,xmm0 - pxor xmm0,xmm0 - por xmm4,xmm6 - movdqa xmm6,xmm7 - punpckhbw xmm2,xmm0 - por xmm15,xmm5 - movdqa xmm5,[rsp+20h] - movdqa xmm0,xmm3 - psllw xmm2,1 - pandn xmm0,xmm11 - pand xmm6,xmm4 - movdqa xmm4,[rsp] - paddw xmm2,xmm11 - pand xmm5,xmm15 - movdqa xmm15,[rsp+20h] - paddw xmm2,xmm11 - paddw xmm2,xmm11 - paddw xmm2,xmm12 - paddw xmm2,[rsp+10h] - paddw xmm2,[rsp] - paddw xmm2,xmm10 - psraw xmm2,3 - pand xmm2,xmm3 - por xmm2,xmm0 - pand xmm1,xmm2 - movdqa xmm0,xmm13 - movdqa xmm2,xmm11 - pandn xmm0,xmm11 - paddw xmm2,xmm12 - por xmm1,xmm0 - packuswb xmm9,xmm1 - movdqa xmm0,xmm7 - movdqa xmm7,[rsp+0A0h] - pandn xmm0,[rsp+0F0h] - movdqa xmm1,xmm3 - por xmm6,xmm0 - movdqa xmm0,[rsp+10h] - paddw xmm0,xmm4 - paddw xmm2,xmm0 - paddw xmm2,xmm7 - movdqa xmm0,xmm3 - pandn xmm0,xmm12 - psraw xmm2,2 - pand xmm1,xmm2 - por xmm1,xmm0 - movdqa xmm2,xmm13 - movdqa xmm0,xmm13 - pand xmm2,xmm1 - pandn xmm0,xmm12 - movdqa xmm1,xmm12 - paddw xmm1,[rsp+10h] - por xmm2,xmm0 - movdqa xmm0,xmm15 - pandn xmm0,[rsp+0B0h] - paddw xmm1,xmm4 - packuswb xmm6,xmm2 - movdqa xmm2,xmm3 - psllw xmm1,1 - por xmm5,xmm0 - movdqa xmm0,[rsp+80h] - paddw xmm0,xmm10 - paddw xmm1,xmm0 - paddw xmm11,xmm1 - psraw xmm11,3 - movdqa xmm1,xmm12 - pand xmm2,xmm11 - paddw xmm1,xmm12 - movdqa xmm11,[rsp+80h] - movdqa xmm0, [rsp+10h] - por xmm14,[rsp+0E0h] - paddw xmm0,xmm11 - movdqa xmm4,xmm15 - paddw xmm1,xmm0 - movdqa xmm0,xmm13 - paddw xmm1,xmm7 - psraw xmm1,2 - pandn xmm3,xmm1 - por xmm2,xmm3 - movdqa xmm1,xmm13 - movdqa xmm3,[rsp+10h] - 
pandn xmm0,xmm3 - pand xmm1,xmm2 - movdqa xmm2,xmm11 - paddw xmm2,[rsp] - por xmm1,xmm0 - movdqa xmm0,[rsp+0D0h] - por xmm0,xmm8 - paddw xmm2,xmm3 - packuswb xmm5,xmm1 - movdqa xmm8,[rsp+40h] - movdqa xmm1,[rsp+50h] - movdqa xmm3,xmm8 - pand xmm4,xmm0 - psllw xmm2,1 - movdqa xmm0,xmm15 - pandn xmm0,[rsp+90h] - por xmm4,xmm0 - movdqa xmm0,xmm12 - paddw xmm0,xmm10 - paddw xmm2,xmm0 - paddw xmm1,xmm2 - movdqa xmm0,[rsp] - movdqa xmm2,xmm11 - paddw xmm0,xmm12 - movdqa xmm12,[rsp] - paddw xmm2,xmm11 - paddw xmm2,xmm0 - psraw xmm1,3 - movdqa xmm0,xmm8 - pand xmm3,xmm1 - paddw xmm2,xmm7 - movdqa xmm1,xmm13 - psraw xmm2,2 - pandn xmm0,xmm2 - por xmm3,xmm0 - movdqa xmm2,[rsp+50h] - movdqa xmm0,xmm13 - pandn xmm0,xmm12 - pand xmm1,xmm3 - paddw xmm2,xmm11 - movdqa xmm3,xmm15 - por xmm1,xmm0 - pand xmm3,xmm14 - movdqa xmm14,[rsp+10h] - movdqa xmm0,xmm15 - pandn xmm0,[rsp+30h] - packuswb xmm4,xmm1 - movdqa xmm1,xmm8 - por xmm3,xmm0 - movdqa xmm0,xmm12 - paddw xmm0,xmm14 - paddw xmm2,xmm0 - paddw xmm2,xmm7 - movdqa xmm0,xmm8 - pandn xmm0,xmm11 - psraw xmm2,2 - pand xmm1,xmm2 - por xmm1,xmm0 - movdqa xmm2,xmm13 - movdqa xmm0,xmm13 - pandn xmm0,xmm11 - pand xmm2,xmm1 - movdqa xmm1,xmm15 - por xmm2,xmm0 - packuswb xmm3,xmm2 - movdqa xmm0,[rsp+100h] - por xmm0,[rsp+120h] - pand xmm1,xmm0 - movdqa xmm2,[rcx+rbp] - movdqa xmm7,[rsp+50h] - pandn xmm15,[rsp+60h] - lea r11,[rsp+1D8h] - pxor xmm0,xmm0 - por xmm1,xmm15 - movaps xmm15,[r11-0A8h] - movdqa [rdi],xmm9 - movaps xmm9,[r11-48h] - punpckhbw xmm2,xmm0 - psllw xmm2,1 - paddw xmm2,xmm7 - paddw xmm2,xmm7 - movdqa [rbx],xmm6 - movaps xmm6,[r11-18h] - paddw xmm2,xmm7 - paddw xmm2,xmm11 - movaps xmm11,[r11-68h] - paddw xmm2,xmm12 - movaps xmm12,[r11-78h] - paddw xmm2,xmm14 - paddw xmm2,xmm10 - psraw xmm2,3 - movaps xmm10,[r11-58h] - movaps xmm14,[r11-98h] - movdqa xmm0,xmm13 - pand xmm2,xmm8 - pandn xmm8,xmm7 - pandn xmm13,xmm7 - por xmm2,xmm8 - movaps xmm7,[r11-28h] - movaps xmm8,[r11-38h] - movdqa [r8],xmm5 - pand xmm0,xmm2 - por xmm0,xmm13 - packuswb xmm1,xmm0 - movaps xmm13,[r11-88h] - movdqa [rbp],xmm4 - movdqa [r10+rbp],xmm3 - movdqa [rsi+rbp],xmm1 - mov rsp,r11 - pop rdi - pop rsi - pop rbp - pop rbx - ret + mov rax,rsp + push rbx + push rbp + push rsi + push rdi + sub rsp,1D8h + movaps [rax-38h],xmm6 + movaps [rax-48h],xmm7 + movaps [rax-58h],xmm8 + pxor xmm1,xmm1 + movsxd r10,edx + mov rbp,rcx + mov r11d,r8d + mov rdx,rcx + mov rdi,rbp + mov rbx,rbp + movdqa xmm5,[rbp] + movaps [rax-68h],xmm9 + movaps [rax-78h],xmm10 + punpcklbw xmm5,xmm1 + movaps [rax-88h],xmm11 + movaps [rax-98h],xmm12 + movaps [rax-0A8h],xmm13 + movaps [rax-0B8h],xmm14 + movdqa xmm14,[r10+rbp] + movaps [rax-0C8h],xmm15 + lea eax,[r10*4] + movsxd r8,eax + lea eax,[r10+r10*2] + movsxd rcx,eax + lea eax,[r10+r10] + sub rdx,r8 + punpcklbw xmm14,xmm1 + movdqa [rsp+90h],xmm5 + movdqa [rsp+30h],xmm14 + movsxd rsi,eax + movsx eax,r11w + sub rdi,rcx + sub rbx,rsi + mov r8,rbp + sub r8,r10 + movd xmm0,eax + movsx eax,r9w + movdqa xmm12,[rdi] + movdqa xmm6, [rsi+rbp] + movdqa xmm13,[rbx] + punpcklwd xmm0,xmm0 + pshufd xmm11,xmm0,0 + punpcklbw xmm13,xmm1 + punpcklbw xmm6,xmm1 + movdqa xmm8,[r8] + movd xmm0,eax + movdqa xmm10,xmm11 + mov eax,2 + punpcklbw xmm8,xmm1 + punpcklbw xmm12,xmm1 + cwde + punpcklwd xmm0,xmm0 + psraw xmm10,2 + movdqa xmm1,xmm8 + movdqa [rsp+0F0h],xmm13 + movdqa [rsp+0B0h],xmm8 + pshufd xmm7,xmm0,0 + psubw xmm1,xmm13 + movdqa xmm0,xmm5 + movdqa xmm4,xmm7 + movdqa xmm2,xmm7 + psubw xmm0,xmm8 + pabsw xmm3,xmm0 + pabsw xmm0,xmm1 + movdqa xmm1,xmm5 + movdqa [rsp+40h],xmm7 + 
movdqa [rsp+60h],xmm6 + pcmpgtw xmm4,xmm0 + psubw xmm1,xmm14 + pabsw xmm0,xmm1 + pcmpgtw xmm2,xmm0 + pand xmm4,xmm2 + movdqa xmm0,xmm11 + pcmpgtw xmm0,xmm3 + pand xmm4,xmm0 + movd xmm0,eax + movdqa [rsp+20h],xmm4 + punpcklwd xmm0,xmm0 + pshufd xmm2,xmm0,0 + paddw xmm10,xmm2 + movdqa [rsp+0A0h],xmm2 + movdqa xmm15,xmm7 + pxor xmm4,xmm4 + movdqa xmm0,xmm8 + psubw xmm0,xmm12 + mov eax,4 + pabsw xmm0,xmm0 + movdqa xmm1,xmm10 + cwde + pcmpgtw xmm15,xmm0 + pcmpgtw xmm1,xmm3 + movdqa xmm3,xmm7 + movdqa xmm7,[rdx] + movdqa xmm0,xmm5 + psubw xmm0,xmm6 + pand xmm15,xmm1 + punpcklbw xmm7,xmm4 + movdqa xmm9,xmm15 + pabsw xmm0,xmm0 + psllw xmm7,1 + pandn xmm9,xmm12 + pcmpgtw xmm3,xmm0 + paddw xmm7,xmm12 + movd xmm0,eax + pand xmm3,xmm1 + paddw xmm7,xmm12 + punpcklwd xmm0,xmm0 + paddw xmm7,xmm12 + pshufd xmm1,xmm0,0 + paddw xmm7,xmm13 + movdqa xmm0,xmm3 + pandn xmm0,xmm6 + paddw xmm7,xmm8 + movdqa [rsp+70h],xmm1 + paddw xmm7,xmm5 + movdqa [rsp+120h],xmm0 + movdqa xmm0,[rcx+rbp] + punpcklbw xmm0,xmm4 + paddw xmm7,xmm1 + movdqa xmm4,xmm15 + psllw xmm0,1 + psraw xmm7,3 + paddw xmm0,xmm6 + pand xmm7,xmm15 + paddw xmm0,xmm6 + paddw xmm0,xmm6 + paddw xmm0,xmm14 + movdqa xmm6,xmm15 + paddw xmm0,xmm5 + pandn xmm6,xmm13 + paddw xmm0,xmm8 + paddw xmm0,xmm1 + psraw xmm0,3 + movdqa xmm1,xmm12 + paddw xmm1,xmm13 + pand xmm0,xmm3 + movdqa [rsp+100h],xmm0 + movdqa xmm0,xmm8 + paddw xmm0,xmm5 + paddw xmm1,xmm0 + movdqa xmm0,xmm3 + paddw xmm1,xmm2 + psraw xmm1,2 + pandn xmm0,xmm14 + pand xmm4,xmm1 + movdqa [rsp+0E0h],xmm0 + movdqa xmm0,xmm5 + paddw xmm0,xmm8 + movdqa xmm1,[rsp+60h] + paddw xmm1,xmm14 + movdqa xmm14,xmm3 + paddw xmm1,xmm0 + movdqa xmm0,xmm8 + paddw xmm0,[rsp+30h] + paddw xmm1,xmm2 + psraw xmm1,2 + pand xmm14,xmm1 + movdqa xmm1,xmm13 + paddw xmm1,xmm13 + paddw xmm1,xmm0 + paddw xmm1,xmm2 + psraw xmm1,2 + movdqa xmm0,[rsp+30h] + movdqa xmm2,xmm13 + movdqa xmm5,xmm15 + paddw xmm0,[rsp+70h] + pandn xmm5,xmm1 + paddw xmm2,xmm8 + movdqa xmm8,[rsp+90h] + movdqa xmm1,xmm12 + paddw xmm2,xmm8 + psllw xmm2,1 + paddw xmm2,xmm0 + paddw xmm1,xmm2 + movdqa xmm0,xmm8 + movdqa xmm8,xmm3 + movdqa xmm2,[rsp+30h] + paddw xmm0,xmm13 + psraw xmm1,3 + pand xmm15,xmm1 + movdqa xmm1,xmm2 + paddw xmm1,xmm2 + paddw xmm2,[rsp+90h] + paddw xmm2,[rsp+0B0h] + paddw xmm1,xmm0 + movdqa xmm0,xmm13 + movdqa xmm13,[r8] + paddw xmm0, [rsp+70h] + paddw xmm1, [rsp+0A0h] + psllw xmm2,1 + paddw xmm2,xmm0 + psraw xmm1,2 + movdqa xmm0, [rdi] + pandn xmm8,xmm1 + movdqa xmm1, [rsp+60h] + paddw xmm1,xmm2 + movdqa xmm2, [rbx] + psraw xmm1,3 + pand xmm3,xmm1 + movdqa xmm1, [rbp] + movdqa [rsp+0D0h],xmm3 + pxor xmm3,xmm3 + punpckhbw xmm0,xmm3 + punpckhbw xmm1,xmm3 + punpckhbw xmm13,xmm3 + movdqa [rsp+0C0h],xmm0 + movdqa xmm0,[r10+rbp] + movdqa [rsp],xmm1 + punpckhbw xmm0,xmm3 + punpckhbw xmm2,xmm3 + movdqa [rsp+80h],xmm0 + movdqa xmm0,[rsi+rbp] + movdqa [rsp+10h],xmm13 + punpckhbw xmm0,xmm3 + movdqa [rsp+50h],xmm0 + movdqa xmm0,xmm1 + movdqa xmm1,xmm13 + psubw xmm0,xmm13 + psubw xmm1,xmm2 + pabsw xmm3,xmm0 + pabsw xmm0,xmm1 + movdqa xmm1,[rsp] + movdqa xmm13,[rsp+40h] + movdqa [rsp+110h],xmm2 + psubw xmm1, [rsp+80h] + pcmpgtw xmm13,xmm0 + pcmpgtw xmm11,xmm3 + pabsw xmm0,xmm1 + pcmpgtw xmm10,xmm3 + movdqa xmm1, [rsp+40h] + movdqa xmm2,xmm1 + movdqa xmm3,xmm1 + pcmpgtw xmm2,xmm0 + movdqa xmm0, [rsp+10h] + pand xmm13,xmm2 + pand xmm13,xmm11 + movdqa xmm11,[rsp+0C0h] + psubw xmm0,xmm11 + pabsw xmm0,xmm0 + pcmpgtw xmm3,xmm0 + pand xmm3,xmm10 + movdqa xmm0,[rsp] + psubw xmm0,[rsp+50h] + movdqa xmm2,[rdx] + pabsw xmm0,xmm0 + por xmm7,xmm9 + movdqa 
xmm9,[rsp+20h] + pcmpgtw xmm1,xmm0 + pand xmm9,xmm7 + movdqa xmm7,[rsp+20h] + movdqa xmm0,xmm7 + pandn xmm0,xmm12 + movdqa xmm12,[rsp+110h] + pand xmm1,xmm10 + movdqa xmm10,[rsp+70h] + movdqa [rsp+40h],xmm1 + movdqa xmm1,xmm13 + por xmm9,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm6 + movdqa xmm6,xmm7 + punpckhbw xmm2,xmm0 + por xmm15,xmm5 + movdqa xmm5,[rsp+20h] + movdqa xmm0,xmm3 + psllw xmm2,1 + pandn xmm0,xmm11 + pand xmm6,xmm4 + movdqa xmm4,[rsp] + paddw xmm2,xmm11 + pand xmm5,xmm15 + movdqa xmm15,[rsp+20h] + paddw xmm2,xmm11 + paddw xmm2,xmm11 + paddw xmm2,xmm12 + paddw xmm2,[rsp+10h] + paddw xmm2,[rsp] + paddw xmm2,xmm10 + psraw xmm2,3 + pand xmm2,xmm3 + por xmm2,xmm0 + pand xmm1,xmm2 + movdqa xmm0,xmm13 + movdqa xmm2,xmm11 + pandn xmm0,xmm11 + paddw xmm2,xmm12 + por xmm1,xmm0 + packuswb xmm9,xmm1 + movdqa xmm0,xmm7 + movdqa xmm7,[rsp+0A0h] + pandn xmm0,[rsp+0F0h] + movdqa xmm1,xmm3 + por xmm6,xmm0 + movdqa xmm0,[rsp+10h] + paddw xmm0,xmm4 + paddw xmm2,xmm0 + paddw xmm2,xmm7 + movdqa xmm0,xmm3 + pandn xmm0,xmm12 + psraw xmm2,2 + pand xmm1,xmm2 + por xmm1,xmm0 + movdqa xmm2,xmm13 + movdqa xmm0,xmm13 + pand xmm2,xmm1 + pandn xmm0,xmm12 + movdqa xmm1,xmm12 + paddw xmm1,[rsp+10h] + por xmm2,xmm0 + movdqa xmm0,xmm15 + pandn xmm0,[rsp+0B0h] + paddw xmm1,xmm4 + packuswb xmm6,xmm2 + movdqa xmm2,xmm3 + psllw xmm1,1 + por xmm5,xmm0 + movdqa xmm0,[rsp+80h] + paddw xmm0,xmm10 + paddw xmm1,xmm0 + paddw xmm11,xmm1 + psraw xmm11,3 + movdqa xmm1,xmm12 + pand xmm2,xmm11 + paddw xmm1,xmm12 + movdqa xmm11,[rsp+80h] + movdqa xmm0, [rsp+10h] + por xmm14,[rsp+0E0h] + paddw xmm0,xmm11 + movdqa xmm4,xmm15 + paddw xmm1,xmm0 + movdqa xmm0,xmm13 + paddw xmm1,xmm7 + psraw xmm1,2 + pandn xmm3,xmm1 + por xmm2,xmm3 + movdqa xmm1,xmm13 + movdqa xmm3,[rsp+10h] + pandn xmm0,xmm3 + pand xmm1,xmm2 + movdqa xmm2,xmm11 + paddw xmm2,[rsp] + por xmm1,xmm0 + movdqa xmm0,[rsp+0D0h] + por xmm0,xmm8 + paddw xmm2,xmm3 + packuswb xmm5,xmm1 + movdqa xmm8,[rsp+40h] + movdqa xmm1,[rsp+50h] + movdqa xmm3,xmm8 + pand xmm4,xmm0 + psllw xmm2,1 + movdqa xmm0,xmm15 + pandn xmm0,[rsp+90h] + por xmm4,xmm0 + movdqa xmm0,xmm12 + paddw xmm0,xmm10 + paddw xmm2,xmm0 + paddw xmm1,xmm2 + movdqa xmm0,[rsp] + movdqa xmm2,xmm11 + paddw xmm0,xmm12 + movdqa xmm12,[rsp] + paddw xmm2,xmm11 + paddw xmm2,xmm0 + psraw xmm1,3 + movdqa xmm0,xmm8 + pand xmm3,xmm1 + paddw xmm2,xmm7 + movdqa xmm1,xmm13 + psraw xmm2,2 + pandn xmm0,xmm2 + por xmm3,xmm0 + movdqa xmm2,[rsp+50h] + movdqa xmm0,xmm13 + pandn xmm0,xmm12 + pand xmm1,xmm3 + paddw xmm2,xmm11 + movdqa xmm3,xmm15 + por xmm1,xmm0 + pand xmm3,xmm14 + movdqa xmm14,[rsp+10h] + movdqa xmm0,xmm15 + pandn xmm0,[rsp+30h] + packuswb xmm4,xmm1 + movdqa xmm1,xmm8 + por xmm3,xmm0 + movdqa xmm0,xmm12 + paddw xmm0,xmm14 + paddw xmm2,xmm0 + paddw xmm2,xmm7 + movdqa xmm0,xmm8 + pandn xmm0,xmm11 + psraw xmm2,2 + pand xmm1,xmm2 + por xmm1,xmm0 + movdqa xmm2,xmm13 + movdqa xmm0,xmm13 + pandn xmm0,xmm11 + pand xmm2,xmm1 + movdqa xmm1,xmm15 + por xmm2,xmm0 + packuswb xmm3,xmm2 + movdqa xmm0,[rsp+100h] + por xmm0,[rsp+120h] + pand xmm1,xmm0 + movdqa xmm2,[rcx+rbp] + movdqa xmm7,[rsp+50h] + pandn xmm15,[rsp+60h] + lea r11,[rsp+1D8h] + pxor xmm0,xmm0 + por xmm1,xmm15 + movaps xmm15,[r11-0A8h] + movdqa [rdi],xmm9 + movaps xmm9,[r11-48h] + punpckhbw xmm2,xmm0 + psllw xmm2,1 + paddw xmm2,xmm7 + paddw xmm2,xmm7 + movdqa [rbx],xmm6 + movaps xmm6,[r11-18h] + paddw xmm2,xmm7 + paddw xmm2,xmm11 + movaps xmm11,[r11-68h] + paddw xmm2,xmm12 + movaps xmm12,[r11-78h] + paddw xmm2,xmm14 + paddw xmm2,xmm10 + psraw xmm2,3 + movaps xmm10,[r11-58h] + 
movaps xmm14,[r11-98h] + movdqa xmm0,xmm13 + pand xmm2,xmm8 + pandn xmm8,xmm7 + pandn xmm13,xmm7 + por xmm2,xmm8 + movaps xmm7,[r11-28h] + movaps xmm8,[r11-38h] + movdqa [r8],xmm5 + pand xmm0,xmm2 + por xmm0,xmm13 + packuswb xmm1,xmm0 + movaps xmm13,[r11-88h] + movdqa [rbp],xmm4 + movdqa [r10+rbp],xmm3 + movdqa [rsi+rbp],xmm1 + mov rsp,r11 + pop rdi + pop rsi + pop rbp + pop rbx + ret WELS_EXTERN DeblockChromaLt4V_ssse3 - mov rax,rsp - push rbx - push rdi - PUSH_XMM 16 - sub rsp,0C8h - mov r10,qword [rax + 30h] ; pTC - pxor xmm1,xmm1 - mov rbx,rcx - movsxd r11,r8d - movsx ecx,byte [r10] - movsx r8d,byte [r10+2] - mov rdi,rdx - movq xmm2,[rbx] - movq xmm9,[r11+rbx] - movsx edx,byte [r10+1] - mov word [rsp+2],cx - mov word [rsp],cx - movsx eax,byte [r10+3] - mov word [rsp+6],dx - mov word [rsp+4],dx - movdqa xmm11,xmm1 - mov word [rsp+0Eh],ax - mov word [rsp+0Ch],ax - lea eax,[r11+r11] - movsxd rcx,eax - mov rax,rbx - mov rdx,rdi - sub rax,rcx - mov word [rsp+0Ah],r8w - mov word [rsp+8],r8w - movdqa xmm6,[rsp] - movdqa xmm7,xmm6 - movq xmm13, [rax] - mov rax,rdi - sub rax,rcx - mov rcx,rbx - pcmpgtw xmm7,xmm1 - psubw xmm11,xmm6 - sub rcx,r11 - sub rdx,r11 - movq xmm0,[rax] - movsx eax,r9w - movq xmm15,[rcx] - punpcklqdq xmm13,xmm0 - movq xmm0, [rdx] - movdqa xmm4,xmm13 - punpcklqdq xmm15,xmm0 - movq xmm0, [rdi] - punpcklbw xmm4,xmm1 - movdqa xmm12,xmm15 - punpcklqdq xmm2,xmm0 - movq xmm0, [r11+rdi] - punpcklbw xmm12,xmm1 - movdqa xmm14,xmm2 - punpcklqdq xmm9,xmm0 - punpckhbw xmm2,xmm1 - punpcklbw xmm14,xmm1 - movd xmm0,eax - movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta - punpckhbw xmm13,xmm1 - punpckhbw xmm15,xmm1 - movdqa xmm3,xmm9 - movdqa [rsp+10h],xmm2 - punpcklwd xmm0,xmm0 - punpckhbw xmm9,xmm1 - punpcklbw xmm3,xmm1 - movdqa xmm1,xmm14 - pshufd xmm10,xmm0,0 - movd xmm0,eax - mov eax,4 - cwde - punpcklwd xmm0,xmm0 - pshufd xmm8,xmm0,0 - movd xmm0,eax - punpcklwd xmm0,xmm0 - pshufd xmm5,xmm0,0 - psubw xmm1,xmm12 - movdqa xmm2,xmm10 - lea r11,[rsp+0C8h] - psllw xmm1,2 - movdqa xmm0,xmm4 - psubw xmm4,xmm12 - psubw xmm0,xmm3 - psubw xmm3,xmm14 - paddw xmm1,xmm0 - paddw xmm1,xmm5 - movdqa xmm0,xmm11 - psraw xmm1,3 - pmaxsw xmm0,xmm1 - pminsw xmm6,xmm0 - movdqa xmm1,xmm8 - movdqa xmm0,xmm12 - psubw xmm0,xmm14 - pabsw xmm0,xmm0 - pcmpgtw xmm2,xmm0 - pabsw xmm0,xmm4 - pcmpgtw xmm1,xmm0 - pabsw xmm0,xmm3 - movdqa xmm3,[rsp] - pand xmm2,xmm1 - movdqa xmm1,xmm8 - pcmpgtw xmm1,xmm0 - movdqa xmm0,xmm13 - pand xmm2,xmm1 - psubw xmm0,xmm9 - psubw xmm13,xmm15 - pand xmm2,xmm7 - pand xmm6,xmm2 - paddw xmm12,xmm6 - psubw xmm14,xmm6 - movdqa xmm2,[rsp+10h] - movaps xmm6,[r11-18h] - movdqa xmm1,xmm2 - psubw xmm1,xmm15 - psubw xmm9,xmm2 - psllw xmm1,2 - paddw xmm1,xmm0 - paddw xmm1,xmm5 - movdqa xmm0,xmm15 - psubw xmm0,xmm2 - psraw xmm1,3 - pmaxsw xmm11,xmm1 - pabsw xmm0,xmm0 - movdqa xmm1,xmm8 - pcmpgtw xmm10,xmm0 - pabsw xmm0,xmm13 - pminsw xmm3,xmm11 - movaps xmm11,[r11-68h] - movaps xmm13,[rsp+40h] - pcmpgtw xmm1,xmm0 - pabsw xmm0,xmm9 - movaps xmm9, [r11-48h] - pand xmm10,xmm1 - pcmpgtw xmm8,xmm0 - pand xmm10,xmm8 - pand xmm10,xmm7 - movaps xmm8,[r11-38h] - movaps xmm7,[r11-28h] - pand xmm3,xmm10 - paddw xmm15,xmm3 - psubw xmm2,xmm3 - movaps xmm10,[r11-58h] - packuswb xmm12,xmm15 - movaps xmm15,[rsp+20h] - packuswb xmm14,xmm2 - movq [rcx],xmm12 - movq [rbx],xmm14 - psrldq xmm12,8 - psrldq xmm14,8 - movq [rdx],xmm12 - movaps xmm12,[r11-78h] - movq [rdi],xmm14 - movaps xmm14,[rsp+30h] - mov rsp,r11 - POP_XMM - pop rdi - pop rbx - ret + mov rax,rsp + push rbx + push rdi + PUSH_XMM 16 + sub rsp,0C8h + 
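
The DeblockLumaEq4V_ssse3 body re-indented above implements the bS==4 ("strong") luma filter. A scalar sketch of the p-side arithmetic, again written from the standard H.264 equations with illustrative names and reusing the includes from the earlier sketch (the q side is the mirror image):

/* p-side of the bS==4 luma filter; pPix points at q0, iStride steps one row. */
static void LumaStrongFilterColP (uint8_t* pPix, int iStride, int iAlpha, int iBeta) {
  int p3 = pPix[-4 * iStride], p2 = pPix[-3 * iStride], p1 = pPix[-2 * iStride], p0 = pPix[-iStride];
  int q0 = pPix[0],            q1 = pPix[ iStride];
  if (abs (p0 - q0) >= iAlpha || abs (p1 - p0) >= iBeta || abs (q1 - q0) >= iBeta)
    return;                                                           /* alpha/beta gate                 */
  if (abs (p0 - q0) < ((iAlpha >> 2) + 2) && abs (p2 - p0) < iBeta) {
    /* flat area: smooth p0..p2 with the long taps */
    pPix[-    iStride] = (uint8_t) ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3);  /* p0' */
    pPix[-2 * iStride] = (uint8_t) ((p2 + p1 + p0 + q0 + 2) >> 2);                   /* p1' */
    pPix[-3 * iStride] = (uint8_t) ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3);      /* p2' */
  } else {
    /* otherwise only p0 is touched, with a short tap */
    pPix[-    iStride] = (uint8_t) ((2 * p1 + p0 + q1 + 2) >> 2);                    /* p0' */
  }
}

The q side applies the same formulas with p and q swapped and the row offsets mirrored, which is why the assembly carries two nearly identical chains of paddw/psraw accumulations and selects between the long-tap and short-tap results with pand/pandn/por masks.
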
mov r10,qword [rax + 30h] ; pTC + pxor xmm1,xmm1 + mov rbx,rcx + movsxd r11,r8d + movsx ecx,byte [r10] + movsx r8d,byte [r10+2] + mov rdi,rdx + movq xmm2,[rbx] + movq xmm9,[r11+rbx] + movsx edx,byte [r10+1] + mov word [rsp+2],cx + mov word [rsp],cx + movsx eax,byte [r10+3] + mov word [rsp+6],dx + mov word [rsp+4],dx + movdqa xmm11,xmm1 + mov word [rsp+0Eh],ax + mov word [rsp+0Ch],ax + lea eax,[r11+r11] + movsxd rcx,eax + mov rax,rbx + mov rdx,rdi + sub rax,rcx + mov word [rsp+0Ah],r8w + mov word [rsp+8],r8w + movdqa xmm6,[rsp] + movdqa xmm7,xmm6 + movq xmm13, [rax] + mov rax,rdi + sub rax,rcx + mov rcx,rbx + pcmpgtw xmm7,xmm1 + psubw xmm11,xmm6 + sub rcx,r11 + sub rdx,r11 + movq xmm0,[rax] + movsx eax,r9w + movq xmm15,[rcx] + punpcklqdq xmm13,xmm0 + movq xmm0, [rdx] + movdqa xmm4,xmm13 + punpcklqdq xmm15,xmm0 + movq xmm0, [rdi] + punpcklbw xmm4,xmm1 + movdqa xmm12,xmm15 + punpcklqdq xmm2,xmm0 + movq xmm0, [r11+rdi] + punpcklbw xmm12,xmm1 + movdqa xmm14,xmm2 + punpcklqdq xmm9,xmm0 + punpckhbw xmm2,xmm1 + punpcklbw xmm14,xmm1 + movd xmm0,eax + movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta + punpckhbw xmm13,xmm1 + punpckhbw xmm15,xmm1 + movdqa xmm3,xmm9 + movdqa [rsp+10h],xmm2 + punpcklwd xmm0,xmm0 + punpckhbw xmm9,xmm1 + punpcklbw xmm3,xmm1 + movdqa xmm1,xmm14 + pshufd xmm10,xmm0,0 + movd xmm0,eax + mov eax,4 + cwde + punpcklwd xmm0,xmm0 + pshufd xmm8,xmm0,0 + movd xmm0,eax + punpcklwd xmm0,xmm0 + pshufd xmm5,xmm0,0 + psubw xmm1,xmm12 + movdqa xmm2,xmm10 + lea r11,[rsp+0C8h] + psllw xmm1,2 + movdqa xmm0,xmm4 + psubw xmm4,xmm12 + psubw xmm0,xmm3 + psubw xmm3,xmm14 + paddw xmm1,xmm0 + paddw xmm1,xmm5 + movdqa xmm0,xmm11 + psraw xmm1,3 + pmaxsw xmm0,xmm1 + pminsw xmm6,xmm0 + movdqa xmm1,xmm8 + movdqa xmm0,xmm12 + psubw xmm0,xmm14 + pabsw xmm0,xmm0 + pcmpgtw xmm2,xmm0 + pabsw xmm0,xmm4 + pcmpgtw xmm1,xmm0 + pabsw xmm0,xmm3 + movdqa xmm3,[rsp] + pand xmm2,xmm1 + movdqa xmm1,xmm8 + pcmpgtw xmm1,xmm0 + movdqa xmm0,xmm13 + pand xmm2,xmm1 + psubw xmm0,xmm9 + psubw xmm13,xmm15 + pand xmm2,xmm7 + pand xmm6,xmm2 + paddw xmm12,xmm6 + psubw xmm14,xmm6 + movdqa xmm2,[rsp+10h] + movaps xmm6,[r11-18h] + movdqa xmm1,xmm2 + psubw xmm1,xmm15 + psubw xmm9,xmm2 + psllw xmm1,2 + paddw xmm1,xmm0 + paddw xmm1,xmm5 + movdqa xmm0,xmm15 + psubw xmm0,xmm2 + psraw xmm1,3 + pmaxsw xmm11,xmm1 + pabsw xmm0,xmm0 + movdqa xmm1,xmm8 + pcmpgtw xmm10,xmm0 + pabsw xmm0,xmm13 + pminsw xmm3,xmm11 + movaps xmm11,[r11-68h] + movaps xmm13,[rsp+40h] + pcmpgtw xmm1,xmm0 + pabsw xmm0,xmm9 + movaps xmm9, [r11-48h] + pand xmm10,xmm1 + pcmpgtw xmm8,xmm0 + pand xmm10,xmm8 + pand xmm10,xmm7 + movaps xmm8,[r11-38h] + movaps xmm7,[r11-28h] + pand xmm3,xmm10 + paddw xmm15,xmm3 + psubw xmm2,xmm3 + movaps xmm10,[r11-58h] + packuswb xmm12,xmm15 + movaps xmm15,[rsp+20h] + packuswb xmm14,xmm2 + movq [rcx],xmm12 + movq [rbx],xmm14 + psrldq xmm12,8 + psrldq xmm14,8 + movq [rdx],xmm12 + movaps xmm12,[r11-78h] + movq [rdi],xmm14 + movaps xmm14,[rsp+30h] + mov rsp,r11 + POP_XMM + pop rdi + pop rbx + ret WELS_EXTERN DeblockChromaEq4V_ssse3 - mov rax,rsp - push rbx - PUSH_XMM 15 - sub rsp,90h - pxor xmm1,xmm1 - mov r11,rcx - mov rbx,rdx - mov r10d,r9d - movq xmm13,[r11] - lea eax,[r8+r8] - movsxd r9,eax - mov rax,rcx - sub rax,r9 - movq xmm14,[rax] - mov rax,rdx - sub rax,r9 - movq xmm0,[rax] - movsxd rax,r8d - sub rcx,rax - sub rdx,rax - movq xmm12,[rax+r11] - movq xmm10,[rcx] - punpcklqdq xmm14,xmm0 - movdqa xmm8,xmm14 - movq xmm0,[rdx] - punpcklbw xmm8,xmm1 - punpckhbw xmm14,xmm1 - punpcklqdq xmm10,xmm0 - movq xmm0,[rbx] - movdqa xmm5,xmm10 - 
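
DeblockChromaLt4V_ssse3 above is the chroma counterpart of the weak filter: only p0 and q0 are modified, and the clipping range is always tc0 + 1. The movq loads from the two pixel pointers followed by punpcklqdq suggest the Cb and Cr rows are packed into one 16-lane register and filtered together. A scalar sketch for one column of one plane, with illustrative names and clip3/clip255 as in the luma sketch earlier:

/* One chroma column of the bS<4 filter; pPix points at q0 of one chroma plane. */
static void ChromaWeakFilterCol (uint8_t* pPix, int iStride, int iAlpha, int iBeta, int iTc0) {
  int p1 = pPix[-2 * iStride], p0 = pPix[-iStride], q0 = pPix[0], q1 = pPix[iStride];
  if (iTc0 < 0)
    return;
  if (abs (p0 - q0) >= iAlpha || abs (p1 - p0) >= iBeta || abs (q1 - q0) >= iBeta)
    return;
  int iTc    = iTc0 + 1;                                              /* chroma always widens by one     */
  int iDelta = clip3 (-iTc, iTc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
  pPix[-iStride] = clip255 (p0 + iDelta);                             /* p0' */
  pPix[0]        = clip255 (q0 - iDelta);                             /* q0' */
}
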
punpcklqdq xmm13,xmm0 - movq xmm0, [rax+rbx] - punpcklbw xmm5,xmm1 - movsx eax,r10w - movdqa xmm9,xmm13 - punpcklqdq xmm12,xmm0 - punpcklbw xmm9,xmm1 - punpckhbw xmm10,xmm1 - movd xmm0,eax - movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta - punpckhbw xmm13,xmm1 - movdqa xmm7,xmm12 - punpcklwd xmm0,xmm0 - punpckhbw xmm12,xmm1 - pshufd xmm11,xmm0,0 - punpcklbw xmm7,xmm1 - movd xmm0,eax - movdqa xmm1,xmm8 - psubw xmm1,xmm5 - punpcklwd xmm0,xmm0 - movdqa xmm6,xmm11 - pshufd xmm3,xmm0,0 - movdqa xmm0,xmm5 - psubw xmm0,xmm9 - movdqa xmm2,xmm3 - pabsw xmm0,xmm0 - pcmpgtw xmm6,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm3 - pcmpgtw xmm2,xmm0 - pand xmm6,xmm2 - movdqa xmm0,xmm7 - movdqa xmm2,xmm3 - psubw xmm0,xmm9 - pabsw xmm0,xmm0 - pcmpgtw xmm1,xmm0 - pand xmm6,xmm1 - movdqa xmm0,xmm10 - movdqa xmm1,xmm14 - psubw xmm0,xmm13 - psubw xmm1,xmm10 - pabsw xmm0,xmm0 - pcmpgtw xmm11,xmm0 - pabsw xmm0,xmm1 - pcmpgtw xmm2,xmm0 - pand xmm11,xmm2 - movdqa xmm0,xmm12 - movdqa xmm4,xmm6 - movdqa xmm1,xmm8 - mov eax,2 - cwde - paddw xmm1,xmm8 - psubw xmm0,xmm13 - paddw xmm1,xmm5 - pabsw xmm0,xmm0 - movdqa xmm2,xmm14 - paddw xmm1,xmm7 - pcmpgtw xmm3,xmm0 - paddw xmm2,xmm14 - movd xmm0,eax - pand xmm11,xmm3 - paddw xmm7,xmm7 - paddw xmm2,xmm10 - punpcklwd xmm0,xmm0 - paddw xmm2,xmm12 - paddw xmm12,xmm12 - pshufd xmm3,xmm0,0 - paddw xmm7,xmm9 - paddw xmm12,xmm13 - movdqa xmm0,xmm6 - paddw xmm1,xmm3 - pandn xmm0,xmm5 - paddw xmm7,xmm8 - psraw xmm1,2 - paddw xmm12,xmm14 - paddw xmm7,xmm3 - movaps xmm14,[rsp] - pand xmm4,xmm1 - paddw xmm12,xmm3 - psraw xmm7,2 - movdqa xmm1,xmm11 - por xmm4,xmm0 - psraw xmm12,2 - paddw xmm2,xmm3 - movdqa xmm0,xmm11 - pandn xmm0,xmm10 - psraw xmm2,2 - pand xmm1,xmm2 - por xmm1,xmm0 - packuswb xmm4,xmm1 - movdqa xmm0,xmm11 - movdqa xmm1,xmm6 - pand xmm1,xmm7 - movaps xmm7,[rsp+70h] - movq [rcx],xmm4 - pandn xmm6,xmm9 - pandn xmm11,xmm13 - pand xmm0,xmm12 - por xmm1,xmm6 - por xmm0,xmm11 - psrldq xmm4,8 - packuswb xmm1,xmm0 - movq [r11],xmm1 - psrldq xmm1,8 - movq [rdx],xmm4 - lea r11,[rsp+90h] - movaps xmm6,[r11-10h] - movaps xmm8,[r11-30h] - movaps xmm9,[r11-40h] - movq [rbx],xmm1 - movaps xmm10,[r11-50h] - movaps xmm11,[r11-60h] - movaps xmm12,[r11-70h] - movaps xmm13,[r11-80h] - mov rsp,r11 - POP_XMM - pop rbx - ret + mov rax,rsp + push rbx + PUSH_XMM 15 + sub rsp,90h + pxor xmm1,xmm1 + mov r11,rcx + mov rbx,rdx + mov r10d,r9d + movq xmm13,[r11] + lea eax,[r8+r8] + movsxd r9,eax + mov rax,rcx + sub rax,r9 + movq xmm14,[rax] + mov rax,rdx + sub rax,r9 + movq xmm0,[rax] + movsxd rax,r8d + sub rcx,rax + sub rdx,rax + movq xmm12,[rax+r11] + movq xmm10,[rcx] + punpcklqdq xmm14,xmm0 + movdqa xmm8,xmm14 + movq xmm0,[rdx] + punpcklbw xmm8,xmm1 + punpckhbw xmm14,xmm1 + punpcklqdq xmm10,xmm0 + movq xmm0,[rbx] + movdqa xmm5,xmm10 + punpcklqdq xmm13,xmm0 + movq xmm0, [rax+rbx] + punpcklbw xmm5,xmm1 + movsx eax,r10w + movdqa xmm9,xmm13 + punpcklqdq xmm12,xmm0 + punpcklbw xmm9,xmm1 + punpckhbw xmm10,xmm1 + movd xmm0,eax + movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta + punpckhbw xmm13,xmm1 + movdqa xmm7,xmm12 + punpcklwd xmm0,xmm0 + punpckhbw xmm12,xmm1 + pshufd xmm11,xmm0,0 + punpcklbw xmm7,xmm1 + movd xmm0,eax + movdqa xmm1,xmm8 + psubw xmm1,xmm5 + punpcklwd xmm0,xmm0 + movdqa xmm6,xmm11 + pshufd xmm3,xmm0,0 + movdqa xmm0,xmm5 + psubw xmm0,xmm9 + movdqa xmm2,xmm3 + pabsw xmm0,xmm0 + pcmpgtw xmm6,xmm0 + pabsw xmm0,xmm1 + movdqa xmm1,xmm3 + pcmpgtw xmm2,xmm0 + pand xmm6,xmm2 + movdqa xmm0,xmm7 + movdqa xmm2,xmm3 + psubw xmm0,xmm9 + pabsw xmm0,xmm0 + pcmpgtw xmm1,xmm0 + pand xmm6,xmm1 + 
movdqa xmm0,xmm10 + movdqa xmm1,xmm14 + psubw xmm0,xmm13 + psubw xmm1,xmm10 + pabsw xmm0,xmm0 + pcmpgtw xmm11,xmm0 + pabsw xmm0,xmm1 + pcmpgtw xmm2,xmm0 + pand xmm11,xmm2 + movdqa xmm0,xmm12 + movdqa xmm4,xmm6 + movdqa xmm1,xmm8 + mov eax,2 + cwde + paddw xmm1,xmm8 + psubw xmm0,xmm13 + paddw xmm1,xmm5 + pabsw xmm0,xmm0 + movdqa xmm2,xmm14 + paddw xmm1,xmm7 + pcmpgtw xmm3,xmm0 + paddw xmm2,xmm14 + movd xmm0,eax + pand xmm11,xmm3 + paddw xmm7,xmm7 + paddw xmm2,xmm10 + punpcklwd xmm0,xmm0 + paddw xmm2,xmm12 + paddw xmm12,xmm12 + pshufd xmm3,xmm0,0 + paddw xmm7,xmm9 + paddw xmm12,xmm13 + movdqa xmm0,xmm6 + paddw xmm1,xmm3 + pandn xmm0,xmm5 + paddw xmm7,xmm8 + psraw xmm1,2 + paddw xmm12,xmm14 + paddw xmm7,xmm3 + movaps xmm14,[rsp] + pand xmm4,xmm1 + paddw xmm12,xmm3 + psraw xmm7,2 + movdqa xmm1,xmm11 + por xmm4,xmm0 + psraw xmm12,2 + paddw xmm2,xmm3 + movdqa xmm0,xmm11 + pandn xmm0,xmm10 + psraw xmm2,2 + pand xmm1,xmm2 + por xmm1,xmm0 + packuswb xmm4,xmm1 + movdqa xmm0,xmm11 + movdqa xmm1,xmm6 + pand xmm1,xmm7 + movaps xmm7,[rsp+70h] + movq [rcx],xmm4 + pandn xmm6,xmm9 + pandn xmm11,xmm13 + pand xmm0,xmm12 + por xmm1,xmm6 + por xmm0,xmm11 + psrldq xmm4,8 + packuswb xmm1,xmm0 + movq [r11],xmm1 + psrldq xmm1,8 + movq [rdx],xmm4 + lea r11,[rsp+90h] + movaps xmm6,[r11-10h] + movaps xmm8,[r11-30h] + movaps xmm9,[r11-40h] + movq [rbx],xmm1 + movaps xmm10,[r11-50h] + movaps xmm11,[r11-60h] + movaps xmm12,[r11-70h] + movaps xmm13,[r11-80h] + mov rsp,r11 + POP_XMM + pop rbx + ret WELS_EXTERN DeblockChromaEq4H_ssse3 - mov rax,rsp - mov [rax+20h],rbx - push rdi - PUSH_XMM 16 - sub rsp,140h - mov rdi,rdx - lea eax,[r8*4] - movsxd r10,eax - mov eax,[rcx-2] - mov [rsp+10h],eax - lea rbx,[r10+rdx-2] - lea r11,[r10+rcx-2] - movdqa xmm5,[rsp+10h] - movsxd r10,r8d - mov eax,[r10+rcx-2] - lea rdx,[r10+r10*2] - mov [rsp+20h],eax - mov eax,[rcx+r10*2-2] - mov [rsp+30h],eax - mov eax,[rdx+rcx-2] - movdqa xmm2,[rsp+20h] - mov [rsp+40h],eax - mov eax, [rdi-2] - movdqa xmm4,[rsp+30h] - mov [rsp+50h],eax - mov eax,[r10+rdi-2] - movdqa xmm3,[rsp+40h] - mov [rsp+60h],eax - mov eax,[rdi+r10*2-2] - punpckldq xmm5,[rsp+50h] - mov [rsp+70h],eax - mov eax, [rdx+rdi-2] - punpckldq xmm2, [rsp+60h] - mov [rsp+80h],eax - mov eax,[r11] - punpckldq xmm4, [rsp+70h] - mov [rsp+50h],eax - mov eax,[rbx] - punpckldq xmm3,[rsp+80h] - mov [rsp+60h],eax - mov eax,[r10+r11] - movdqa xmm0, [rsp+50h] - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm5,xmm0 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax,[r10+rbx] - movdqa xmm0,[rsp+50h] - movdqa xmm1,xmm5 - mov [rsp+60h],eax - mov eax,[r11+r10*2] - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm2,xmm0 - punpcklbw xmm1,xmm2 - punpckhbw xmm5,xmm2 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax,[rbx+r10*2] - movdqa xmm0,[rsp+50h] - mov [rsp+60h],eax - mov eax, [rdx+r11] - movdqa xmm15,xmm1 - punpckldq xmm0,[rsp+60h] - punpcklqdq xmm4,xmm0 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax, [rdx+rbx] - movdqa xmm0,[rsp+50h] - mov [rsp+60h],eax - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm3,xmm0 - movdqa xmm0,xmm4 - punpcklbw xmm0,xmm3 - punpckhbw xmm4,xmm3 - punpcklwd xmm15,xmm0 - punpckhwd xmm1,xmm0 - movdqa xmm0,xmm5 - movdqa xmm12,xmm15 - punpcklwd xmm0,xmm4 - punpckhwd xmm5,xmm4 - punpckldq xmm12,xmm0 - punpckhdq xmm15,xmm0 - movdqa xmm0,xmm1 - movdqa xmm11,xmm12 - punpckldq xmm0,xmm5 - punpckhdq xmm1,xmm5 - punpcklqdq xmm11,xmm0 - punpckhqdq xmm12,xmm0 - movsx eax,r9w - movdqa xmm14,xmm15 - punpcklqdq xmm14,xmm1 - punpckhqdq xmm15,xmm1 - pxor xmm1,xmm1 - movd xmm0,eax - movdqa xmm4,xmm12 - 
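
DeblockChromaEq4V_ssse3 above handles the bS==4 chroma case, which is simply a fixed 3-tap average of the pixels next to the edge with no tc clipping at all. A scalar sketch, names illustrative and helpers as before:

/* bS==4 chroma filter for one column; pPix points at q0. */
static void ChromaStrongFilterCol (uint8_t* pPix, int iStride, int iAlpha, int iBeta) {
  int p1 = pPix[-2 * iStride], p0 = pPix[-iStride], q0 = pPix[0], q1 = pPix[iStride];
  if (abs (p0 - q0) >= iAlpha || abs (p1 - p0) >= iBeta || abs (q1 - q0) >= iBeta)
    return;
  pPix[-iStride] = (uint8_t) ((2 * p1 + p0 + q1 + 2) >> 2);           /* p0' */
  pPix[0]        = (uint8_t) ((2 * q1 + q0 + p1 + 2) >> 2);           /* q0' */
}

Because the averaging taps cannot leave the 0..255 range, the routine needs no saturation beyond the final packuswb.
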
movdqa xmm8,xmm11 - movsx eax,word [rsp+170h + 160] ; iBeta - punpcklwd xmm0,xmm0 - punpcklbw xmm4,xmm1 - punpckhbw xmm12,xmm1 - movdqa xmm9,xmm14 - movdqa xmm7,xmm15 - movdqa xmm10,xmm15 - pshufd xmm13,xmm0,0 - punpcklbw xmm9,xmm1 - punpckhbw xmm14,xmm1 - movdqa xmm6,xmm13 - movd xmm0,eax - movdqa [rsp],xmm11 - mov eax,2 - cwde - punpckhbw xmm11,xmm1 - punpckhbw xmm10,xmm1 - punpcklbw xmm7,xmm1 - punpcklwd xmm0,xmm0 - punpcklbw xmm8,xmm1 - pshufd xmm3,xmm0,0 - movdqa xmm1,xmm8 - movdqa xmm0,xmm4 - psubw xmm0,xmm9 - psubw xmm1,xmm4 - movdqa xmm2,xmm3 - pabsw xmm0,xmm0 - pcmpgtw xmm6,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm3 - pcmpgtw xmm2,xmm0 - pand xmm6,xmm2 - movdqa xmm0,xmm7 - movdqa xmm2,xmm3 - psubw xmm0,xmm9 - pabsw xmm0,xmm0 - pcmpgtw xmm1,xmm0 - pand xmm6,xmm1 - movdqa xmm0,xmm12 - movdqa xmm1,xmm11 - psubw xmm0,xmm14 - psubw xmm1,xmm12 - movdqa xmm5,xmm6 - pabsw xmm0,xmm0 - pcmpgtw xmm13,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm8 - pcmpgtw xmm2,xmm0 - paddw xmm1,xmm8 - movdqa xmm0,xmm10 - pand xmm13,xmm2 - psubw xmm0,xmm14 - paddw xmm1,xmm4 - movdqa xmm2,xmm11 - pabsw xmm0,xmm0 - paddw xmm2,xmm11 - paddw xmm1,xmm7 - pcmpgtw xmm3,xmm0 - paddw xmm2,xmm12 - movd xmm0,eax - pand xmm13,xmm3 - paddw xmm2,xmm10 - punpcklwd xmm0,xmm0 - pshufd xmm3,xmm0,0 - movdqa xmm0,xmm6 - paddw xmm1,xmm3 - pandn xmm0,xmm4 - paddw xmm2,xmm3 - psraw xmm1,2 - pand xmm5,xmm1 - por xmm5,xmm0 - paddw xmm7,xmm7 - paddw xmm10,xmm10 - psraw xmm2,2 - movdqa xmm1,xmm13 - movdqa xmm0,xmm13 - pandn xmm0,xmm12 - pand xmm1,xmm2 - paddw xmm7,xmm9 - por xmm1,xmm0 - paddw xmm10,xmm14 - paddw xmm7,xmm8 - movdqa xmm0,xmm13 - packuswb xmm5,xmm1 - paddw xmm7,xmm3 - paddw xmm10,xmm11 - movdqa xmm1,xmm6 - paddw xmm10,xmm3 - pandn xmm6,xmm9 - psraw xmm7,2 - pand xmm1,xmm7 - psraw xmm10,2 - pandn xmm13,xmm14 - pand xmm0,xmm10 - por xmm1,xmm6 - movdqa xmm6,[rsp] - movdqa xmm4,xmm6 - por xmm0,xmm13 - punpcklbw xmm4,xmm5 - punpckhbw xmm6,xmm5 - movdqa xmm3,xmm4 - packuswb xmm1,xmm0 - movdqa xmm0,xmm1 - punpckhbw xmm1,xmm15 - punpcklbw xmm0,xmm15 - punpcklwd xmm3,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm0,xmm6 - movdqa xmm2,xmm3 - punpcklwd xmm0,xmm1 - punpckhwd xmm6,xmm1 - movdqa xmm1,xmm4 - punpckldq xmm2,xmm0 - punpckhdq xmm3,xmm0 - punpckldq xmm1,xmm6 - movdqa xmm0,xmm2 - punpcklqdq xmm0,xmm1 - punpckhdq xmm4,xmm6 - punpckhqdq xmm2,xmm1 - movdqa [rsp+10h],xmm0 - movdqa [rsp+60h],xmm2 - movdqa xmm0,xmm3 - mov eax,[rsp+10h] - mov [rcx-2],eax - mov eax,[rsp+60h] - punpcklqdq xmm0,xmm4 - punpckhqdq xmm3,xmm4 - mov [r10+rcx-2],eax - movdqa [rsp+20h],xmm0 - mov eax, [rsp+20h] - movdqa [rsp+70h],xmm3 - mov [rcx+r10*2-2],eax - mov eax,[rsp+70h] - mov [rdx+rcx-2],eax - mov eax,[rsp+18h] - mov [r11],eax - mov eax,[rsp+68h] - mov [r10+r11],eax - mov eax,[rsp+28h] - mov [r11+r10*2],eax - mov eax,[rsp+78h] - mov [rdx+r11],eax - mov eax,[rsp+14h] - mov [rdi-2],eax - mov eax,[rsp+64h] - mov [r10+rdi-2],eax - mov eax,[rsp+24h] - mov [rdi+r10*2-2],eax - mov eax, [rsp+74h] - mov [rdx+rdi-2],eax - mov eax, [rsp+1Ch] - mov [rbx],eax - mov eax, [rsp+6Ch] - mov [r10+rbx],eax - mov eax,[rsp+2Ch] - mov [rbx+r10*2],eax - mov eax,[rsp+7Ch] - mov [rdx+rbx],eax - lea rsp,[rsp+140h] - POP_XMM - mov rbx, [rsp+28h] - pop rdi - ret + mov rax,rsp + mov [rax+20h],rbx + push rdi + PUSH_XMM 16 + sub rsp,140h + mov rdi,rdx + lea eax,[r8*4] + movsxd r10,eax + mov eax,[rcx-2] + mov [rsp+10h],eax + lea rbx,[r10+rdx-2] + lea r11,[r10+rcx-2] + movdqa xmm5,[rsp+10h] + movsxd r10,r8d + mov eax,[r10+rcx-2] + lea rdx,[r10+r10*2] + mov [rsp+20h],eax + mov eax,[rcx+r10*2-2] + 
mov [rsp+30h],eax + mov eax,[rdx+rcx-2] + movdqa xmm2,[rsp+20h] + mov [rsp+40h],eax + mov eax, [rdi-2] + movdqa xmm4,[rsp+30h] + mov [rsp+50h],eax + mov eax,[r10+rdi-2] + movdqa xmm3,[rsp+40h] + mov [rsp+60h],eax + mov eax,[rdi+r10*2-2] + punpckldq xmm5,[rsp+50h] + mov [rsp+70h],eax + mov eax, [rdx+rdi-2] + punpckldq xmm2, [rsp+60h] + mov [rsp+80h],eax + mov eax,[r11] + punpckldq xmm4, [rsp+70h] + mov [rsp+50h],eax + mov eax,[rbx] + punpckldq xmm3,[rsp+80h] + mov [rsp+60h],eax + mov eax,[r10+r11] + movdqa xmm0, [rsp+50h] + punpckldq xmm0, [rsp+60h] + punpcklqdq xmm5,xmm0 + movdqa [rsp+50h],xmm0 + mov [rsp+50h],eax + mov eax,[r10+rbx] + movdqa xmm0,[rsp+50h] + movdqa xmm1,xmm5 + mov [rsp+60h],eax + mov eax,[r11+r10*2] + punpckldq xmm0, [rsp+60h] + punpcklqdq xmm2,xmm0 + punpcklbw xmm1,xmm2 + punpckhbw xmm5,xmm2 + movdqa [rsp+50h],xmm0 + mov [rsp+50h],eax + mov eax,[rbx+r10*2] + movdqa xmm0,[rsp+50h] + mov [rsp+60h],eax + mov eax, [rdx+r11] + movdqa xmm15,xmm1 + punpckldq xmm0,[rsp+60h] + punpcklqdq xmm4,xmm0 + movdqa [rsp+50h],xmm0 + mov [rsp+50h],eax + mov eax, [rdx+rbx] + movdqa xmm0,[rsp+50h] + mov [rsp+60h],eax + punpckldq xmm0, [rsp+60h] + punpcklqdq xmm3,xmm0 + movdqa xmm0,xmm4 + punpcklbw xmm0,xmm3 + punpckhbw xmm4,xmm3 + punpcklwd xmm15,xmm0 + punpckhwd xmm1,xmm0 + movdqa xmm0,xmm5 + movdqa xmm12,xmm15 + punpcklwd xmm0,xmm4 + punpckhwd xmm5,xmm4 + punpckldq xmm12,xmm0 + punpckhdq xmm15,xmm0 + movdqa xmm0,xmm1 + movdqa xmm11,xmm12 + punpckldq xmm0,xmm5 + punpckhdq xmm1,xmm5 + punpcklqdq xmm11,xmm0 + punpckhqdq xmm12,xmm0 + movsx eax,r9w + movdqa xmm14,xmm15 + punpcklqdq xmm14,xmm1 + punpckhqdq xmm15,xmm1 + pxor xmm1,xmm1 + movd xmm0,eax + movdqa xmm4,xmm12 + movdqa xmm8,xmm11 + movsx eax,word [rsp+170h + 160] ; iBeta + punpcklwd xmm0,xmm0 + punpcklbw xmm4,xmm1 + punpckhbw xmm12,xmm1 + movdqa xmm9,xmm14 + movdqa xmm7,xmm15 + movdqa xmm10,xmm15 + pshufd xmm13,xmm0,0 + punpcklbw xmm9,xmm1 + punpckhbw xmm14,xmm1 + movdqa xmm6,xmm13 + movd xmm0,eax + movdqa [rsp],xmm11 + mov eax,2 + cwde + punpckhbw xmm11,xmm1 + punpckhbw xmm10,xmm1 + punpcklbw xmm7,xmm1 + punpcklwd xmm0,xmm0 + punpcklbw xmm8,xmm1 + pshufd xmm3,xmm0,0 + movdqa xmm1,xmm8 + movdqa xmm0,xmm4 + psubw xmm0,xmm9 + psubw xmm1,xmm4 + movdqa xmm2,xmm3 + pabsw xmm0,xmm0 + pcmpgtw xmm6,xmm0 + pabsw xmm0,xmm1 + movdqa xmm1,xmm3 + pcmpgtw xmm2,xmm0 + pand xmm6,xmm2 + movdqa xmm0,xmm7 + movdqa xmm2,xmm3 + psubw xmm0,xmm9 + pabsw xmm0,xmm0 + pcmpgtw xmm1,xmm0 + pand xmm6,xmm1 + movdqa xmm0,xmm12 + movdqa xmm1,xmm11 + psubw xmm0,xmm14 + psubw xmm1,xmm12 + movdqa xmm5,xmm6 + pabsw xmm0,xmm0 + pcmpgtw xmm13,xmm0 + pabsw xmm0,xmm1 + movdqa xmm1,xmm8 + pcmpgtw xmm2,xmm0 + paddw xmm1,xmm8 + movdqa xmm0,xmm10 + pand xmm13,xmm2 + psubw xmm0,xmm14 + paddw xmm1,xmm4 + movdqa xmm2,xmm11 + pabsw xmm0,xmm0 + paddw xmm2,xmm11 + paddw xmm1,xmm7 + pcmpgtw xmm3,xmm0 + paddw xmm2,xmm12 + movd xmm0,eax + pand xmm13,xmm3 + paddw xmm2,xmm10 + punpcklwd xmm0,xmm0 + pshufd xmm3,xmm0,0 + movdqa xmm0,xmm6 + paddw xmm1,xmm3 + pandn xmm0,xmm4 + paddw xmm2,xmm3 + psraw xmm1,2 + pand xmm5,xmm1 + por xmm5,xmm0 + paddw xmm7,xmm7 + paddw xmm10,xmm10 + psraw xmm2,2 + movdqa xmm1,xmm13 + movdqa xmm0,xmm13 + pandn xmm0,xmm12 + pand xmm1,xmm2 + paddw xmm7,xmm9 + por xmm1,xmm0 + paddw xmm10,xmm14 + paddw xmm7,xmm8 + movdqa xmm0,xmm13 + packuswb xmm5,xmm1 + paddw xmm7,xmm3 + paddw xmm10,xmm11 + movdqa xmm1,xmm6 + paddw xmm10,xmm3 + pandn xmm6,xmm9 + psraw xmm7,2 + pand xmm1,xmm7 + psraw xmm10,2 + pandn xmm13,xmm14 + pand xmm0,xmm10 + por xmm1,xmm6 + movdqa xmm6,[rsp] + 
movdqa xmm4,xmm6 + por xmm0,xmm13 + punpcklbw xmm4,xmm5 + punpckhbw xmm6,xmm5 + movdqa xmm3,xmm4 + packuswb xmm1,xmm0 + movdqa xmm0,xmm1 + punpckhbw xmm1,xmm15 + punpcklbw xmm0,xmm15 + punpcklwd xmm3,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm0,xmm6 + movdqa xmm2,xmm3 + punpcklwd xmm0,xmm1 + punpckhwd xmm6,xmm1 + movdqa xmm1,xmm4 + punpckldq xmm2,xmm0 + punpckhdq xmm3,xmm0 + punpckldq xmm1,xmm6 + movdqa xmm0,xmm2 + punpcklqdq xmm0,xmm1 + punpckhdq xmm4,xmm6 + punpckhqdq xmm2,xmm1 + movdqa [rsp+10h],xmm0 + movdqa [rsp+60h],xmm2 + movdqa xmm0,xmm3 + mov eax,[rsp+10h] + mov [rcx-2],eax + mov eax,[rsp+60h] + punpcklqdq xmm0,xmm4 + punpckhqdq xmm3,xmm4 + mov [r10+rcx-2],eax + movdqa [rsp+20h],xmm0 + mov eax, [rsp+20h] + movdqa [rsp+70h],xmm3 + mov [rcx+r10*2-2],eax + mov eax,[rsp+70h] + mov [rdx+rcx-2],eax + mov eax,[rsp+18h] + mov [r11],eax + mov eax,[rsp+68h] + mov [r10+r11],eax + mov eax,[rsp+28h] + mov [r11+r10*2],eax + mov eax,[rsp+78h] + mov [rdx+r11],eax + mov eax,[rsp+14h] + mov [rdi-2],eax + mov eax,[rsp+64h] + mov [r10+rdi-2],eax + mov eax,[rsp+24h] + mov [rdi+r10*2-2],eax + mov eax, [rsp+74h] + mov [rdx+rdi-2],eax + mov eax, [rsp+1Ch] + mov [rbx],eax + mov eax, [rsp+6Ch] + mov [r10+rbx],eax + mov eax,[rsp+2Ch] + mov [rbx+r10*2],eax + mov eax,[rsp+7Ch] + mov [rdx+rbx],eax + lea rsp,[rsp+140h] + POP_XMM + mov rbx, [rsp+28h] + pop rdi + ret WELS_EXTERN DeblockChromaLt4H_ssse3 - mov rax,rsp - push rbx - push rbp - push rsi - push rdi - push r12 - PUSH_XMM 16 - sub rsp,170h + mov rax,rsp + push rbx + push rbp + push rsi + push rdi + push r12 + PUSH_XMM 16 + sub rsp,170h - movsxd rsi,r8d - lea eax,[r8*4] - mov r11d,r9d - movsxd r10,eax - mov eax, [rcx-2] - mov r12,rdx - mov [rsp+40h],eax - mov eax, [rsi+rcx-2] - lea rbx,[r10+rcx-2] - movdqa xmm5,[rsp+40h] - mov [rsp+50h],eax - mov eax, [rcx+rsi*2-2] - lea rbp,[r10+rdx-2] - movdqa xmm2, [rsp+50h] - mov [rsp+60h],eax - lea r10,[rsi+rsi*2] - mov rdi,rcx - mov eax,[r10+rcx-2] - movdqa xmm4,[rsp+60h] - mov [rsp+70h],eax - mov eax,[rdx-2] - mov [rsp+80h],eax - mov eax, [rsi+rdx-2] - movdqa xmm3,[rsp+70h] - mov [rsp+90h],eax - mov eax,[rdx+rsi*2-2] - punpckldq xmm5,[rsp+80h] - mov [rsp+0A0h],eax - mov eax, [r10+rdx-2] - punpckldq xmm2,[rsp+90h] - mov [rsp+0B0h],eax - mov eax, [rbx] - punpckldq xmm4,[rsp+0A0h] - mov [rsp+80h],eax - mov eax,[rbp] - punpckldq xmm3,[rsp+0B0h] - mov [rsp+90h],eax - mov eax,[rsi+rbx] - movdqa xmm0,[rsp+80h] - punpckldq xmm0,[rsp+90h] - punpcklqdq xmm5,xmm0 - movdqa [rsp+80h],xmm0 - mov [rsp+80h],eax - mov eax,[rsi+rbp] - movdqa xmm0,[rsp+80h] - movdqa xmm1,xmm5 - mov [rsp+90h],eax - mov eax,[rbx+rsi*2] - punpckldq xmm0,[rsp+90h] - punpcklqdq xmm2,xmm0 - punpcklbw xmm1,xmm2 - punpckhbw xmm5,xmm2 - movdqa [rsp+80h],xmm0 - mov [rsp+80h],eax - mov eax,[rbp+rsi*2] - movdqa xmm0, [rsp+80h] - mov [rsp+90h],eax - mov eax,[r10+rbx] - movdqa xmm7,xmm1 - punpckldq xmm0,[rsp+90h] - punpcklqdq xmm4,xmm0 - movdqa [rsp+80h],xmm0 - mov [rsp+80h],eax - mov eax, [r10+rbp] - movdqa xmm0,[rsp+80h] - mov [rsp+90h],eax - punpckldq xmm0,[rsp+90h] - punpcklqdq xmm3,xmm0 - movdqa xmm0,xmm4 - punpcklbw xmm0,xmm3 - punpckhbw xmm4,xmm3 - punpcklwd xmm7,xmm0 - punpckhwd xmm1,xmm0 - movdqa xmm0,xmm5 - movdqa xmm6,xmm7 - punpcklwd xmm0,xmm4 - punpckhwd xmm5,xmm4 - punpckldq xmm6,xmm0 - punpckhdq xmm7,xmm0 - movdqa xmm0,xmm1 - punpckldq xmm0,xmm5 - mov rax, [rsp+1C8h+160] ; pTC - punpckhdq xmm1,xmm5 - movdqa xmm9,xmm6 - punpckhqdq xmm6,xmm0 - punpcklqdq xmm9,xmm0 - movdqa xmm2,xmm7 - movdqa xmm13,xmm6 - movdqa xmm4,xmm9 - movdqa [rsp+10h],xmm9 - 
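
The *H* routines (DeblockChromaEq4H_ssse3 above and DeblockChromaLt4H_ssse3 below) filter a vertical edge. They introduce no new math: the long runs of mov eax,[...-2] followed by punpckl*/punpckh* read the four pixels straddling the edge from each row and transpose them into column vectors, the same column arithmetic as the *V* routines is applied, and the results are transposed back and scattered out as dwords. A scalar view of the gather step, with illustrative names:

/* Collect p1 p0 q0 q1 for iRows rows around a vertical edge; pPix points at the
 * q0 pixel of the top row. The SSSE3 code replaces this loop with 32-bit loads
 * at column -2 and a punpck-based transpose. */
static void GatherVerticalEdge (const uint8_t* pPix, int iStride, int iRows,
                                uint8_t aP1[], uint8_t aP0[], uint8_t aQ0[], uint8_t aQ1[]) {
  for (int r = 0; r < iRows; ++r) {
    const uint8_t* pRow = pPix + r * iStride - 2;                     /* 4 bytes: p1 p0 q0 q1 */
    aP1[r] = pRow[0];
    aP0[r] = pRow[1];
    aQ0[r] = pRow[2];
    aQ1[r] = pRow[3];
  }
}
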
punpcklqdq xmm2,xmm1 - punpckhqdq xmm7,xmm1 - pxor xmm1,xmm1 - movsx ecx,byte [rax+3] - movsx edx,byte [rax+2] - movsx r8d,byte [rax+1] - movsx r9d,byte [rax] - movdqa xmm10,xmm1 - movdqa xmm15,xmm2 - punpckhbw xmm2,xmm1 - punpckhbw xmm6,xmm1 - punpcklbw xmm4,xmm1 - movsx eax,r11w - mov word [rsp+0Eh],cx - mov word [rsp+0Ch],cx - movdqa xmm3,xmm7 - movdqa xmm8,xmm7 - movdqa [rsp+20h],xmm7 - punpcklbw xmm15,xmm1 - punpcklbw xmm13,xmm1 - punpcklbw xmm3,xmm1 - mov word [rsp+0Ah],dx - mov word [rsp+8],dx - mov word [rsp+6],r8w - movd xmm0,eax - movdqa [rsp+30h],xmm6 - punpckhbw xmm9,xmm1 - punpckhbw xmm8,xmm1 - punpcklwd xmm0,xmm0 - movsx eax,word [rsp+1C0h+160] ; iBeta - mov word [rsp+4],r8w - mov word [rsp+2],r9w - pshufd xmm12,xmm0,0 - mov word [rsp],r9w - movd xmm0,eax - mov eax,4 - cwde - movdqa xmm14, [rsp] - movdqa [rsp],xmm2 - movdqa xmm2,xmm12 - punpcklwd xmm0,xmm0 - pshufd xmm11,xmm0,0 - psubw xmm10,xmm14 - movd xmm0,eax - movdqa xmm7,xmm14 - movdqa xmm6,xmm14 - pcmpgtw xmm7,xmm1 - punpcklwd xmm0,xmm0 - pshufd xmm5,xmm0,0 - movdqa xmm0,xmm4 - movdqa xmm1,xmm15 - psubw xmm4,xmm13 - psubw xmm0,xmm3 - psubw xmm1,xmm13 - psubw xmm3,xmm15 - psllw xmm1,2 - paddw xmm1,xmm0 - paddw xmm1,xmm5 - movdqa xmm0,xmm10 - psraw xmm1,3 - pmaxsw xmm0,xmm1 - pminsw xmm6,xmm0 - movdqa xmm1,xmm11 - movdqa xmm0,xmm13 - psubw xmm0,xmm15 - pabsw xmm0,xmm0 - pcmpgtw xmm2,xmm0 - pabsw xmm0,xmm4 - pcmpgtw xmm1,xmm0 - pabsw xmm0,xmm3 - pand xmm2,xmm1 - movdqa xmm1,xmm11 - movdqa xmm3,[rsp+30h] - pcmpgtw xmm1,xmm0 - movdqa xmm0,xmm9 - pand xmm2,xmm1 - psubw xmm0,xmm8 - psubw xmm9,xmm3 - pand xmm2,xmm7 - pand xmm6,xmm2 - psubw xmm15,xmm6 - paddw xmm13,xmm6 - movdqa xmm2,[rsp] - movdqa xmm1,xmm2 - psubw xmm1,xmm3 - psubw xmm8,xmm2 - psllw xmm1,2 - paddw xmm1,xmm0 - paddw xmm1,xmm5 - movdqa xmm0,xmm3 - movdqa xmm5,[rsp+10h] - psubw xmm0,xmm2 - psraw xmm1,3 - movdqa xmm4,xmm5 - pabsw xmm0,xmm0 - pmaxsw xmm10,xmm1 - movdqa xmm1,xmm11 - pcmpgtw xmm12,xmm0 - pabsw xmm0,xmm9 - pminsw xmm14,xmm10 - pcmpgtw xmm1,xmm0 - pabsw xmm0,xmm8 - pcmpgtw xmm11,xmm0 - pand xmm12,xmm1 - movdqa xmm1,[rsp+20h] - pand xmm12,xmm11 - pand xmm12,xmm7 - pand xmm14,xmm12 - paddw xmm3,xmm14 - psubw xmm2,xmm14 - packuswb xmm13,xmm3 - packuswb xmm15,xmm2 - punpcklbw xmm4,xmm13 - punpckhbw xmm5,xmm13 - movdqa xmm0,xmm15 - punpcklbw xmm0,xmm1 - punpckhbw xmm15,xmm1 - movdqa xmm3,xmm4 - punpcklwd xmm3,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm0,xmm5 - movdqa xmm2,xmm3 - movdqa xmm1,xmm4 - punpcklwd xmm0,xmm15 - punpckhwd xmm5,xmm15 - punpckldq xmm2,xmm0 - punpckhdq xmm3,xmm0 - punpckldq xmm1,xmm5 - movdqa xmm0,xmm2 - punpcklqdq xmm0,xmm1 - punpckhdq xmm4,xmm5 - punpckhqdq xmm2,xmm1 - movdqa [rsp+40h],xmm0 - movdqa xmm0,xmm3 - movdqa [rsp+90h],xmm2 - mov eax,[rsp+40h] - mov [rdi-2],eax - mov eax, [rsp+90h] - punpcklqdq xmm0,xmm4 - punpckhqdq xmm3,xmm4 - mov [rsi+rdi-2],eax - movdqa [rsp+50h],xmm0 - mov eax,[rsp+50h] - movdqa [rsp+0A0h],xmm3 - mov [rdi+rsi*2-2],eax - mov eax,[rsp+0A0h] - mov [r10+rdi-2],eax - mov eax,[rsp+48h] - mov [rbx],eax - mov eax,[rsp+98h] - mov [rsi+rbx],eax - mov eax,[rsp+58h] - mov [rbx+rsi*2],eax - mov eax, [rsp+0A8h] - mov [r10+rbx],eax - mov eax, [rsp+44h] - mov [r12-2],eax - mov eax,[rsp+94h] - mov [rsi+r12-2],eax - mov eax,[rsp+54h] - mov [r12+rsi*2-2],eax - mov eax, [rsp+0A4h] - mov [r10+r12-2],eax - mov eax,[rsp+4Ch] - mov [rbp],eax - mov eax,[rsp+9Ch] - mov [rsi+rbp],eax - mov eax, [rsp+5Ch] - mov [rbp+rsi*2],eax - mov eax,[rsp+0ACh] - mov [r10+rbp],eax - lea r11,[rsp+170h] - mov rsp,r11 - POP_XMM - pop r12 - pop rdi - 
pop rsi - pop rbp - pop rbx - ret + movsxd rsi,r8d + lea eax,[r8*4] + mov r11d,r9d + movsxd r10,eax + mov eax, [rcx-2] + mov r12,rdx + mov [rsp+40h],eax + mov eax, [rsi+rcx-2] + lea rbx,[r10+rcx-2] + movdqa xmm5,[rsp+40h] + mov [rsp+50h],eax + mov eax, [rcx+rsi*2-2] + lea rbp,[r10+rdx-2] + movdqa xmm2, [rsp+50h] + mov [rsp+60h],eax + lea r10,[rsi+rsi*2] + mov rdi,rcx + mov eax,[r10+rcx-2] + movdqa xmm4,[rsp+60h] + mov [rsp+70h],eax + mov eax,[rdx-2] + mov [rsp+80h],eax + mov eax, [rsi+rdx-2] + movdqa xmm3,[rsp+70h] + mov [rsp+90h],eax + mov eax,[rdx+rsi*2-2] + punpckldq xmm5,[rsp+80h] + mov [rsp+0A0h],eax + mov eax, [r10+rdx-2] + punpckldq xmm2,[rsp+90h] + mov [rsp+0B0h],eax + mov eax, [rbx] + punpckldq xmm4,[rsp+0A0h] + mov [rsp+80h],eax + mov eax,[rbp] + punpckldq xmm3,[rsp+0B0h] + mov [rsp+90h],eax + mov eax,[rsi+rbx] + movdqa xmm0,[rsp+80h] + punpckldq xmm0,[rsp+90h] + punpcklqdq xmm5,xmm0 + movdqa [rsp+80h],xmm0 + mov [rsp+80h],eax + mov eax,[rsi+rbp] + movdqa xmm0,[rsp+80h] + movdqa xmm1,xmm5 + mov [rsp+90h],eax + mov eax,[rbx+rsi*2] + punpckldq xmm0,[rsp+90h] + punpcklqdq xmm2,xmm0 + punpcklbw xmm1,xmm2 + punpckhbw xmm5,xmm2 + movdqa [rsp+80h],xmm0 + mov [rsp+80h],eax + mov eax,[rbp+rsi*2] + movdqa xmm0, [rsp+80h] + mov [rsp+90h],eax + mov eax,[r10+rbx] + movdqa xmm7,xmm1 + punpckldq xmm0,[rsp+90h] + punpcklqdq xmm4,xmm0 + movdqa [rsp+80h],xmm0 + mov [rsp+80h],eax + mov eax, [r10+rbp] + movdqa xmm0,[rsp+80h] + mov [rsp+90h],eax + punpckldq xmm0,[rsp+90h] + punpcklqdq xmm3,xmm0 + movdqa xmm0,xmm4 + punpcklbw xmm0,xmm3 + punpckhbw xmm4,xmm3 + punpcklwd xmm7,xmm0 + punpckhwd xmm1,xmm0 + movdqa xmm0,xmm5 + movdqa xmm6,xmm7 + punpcklwd xmm0,xmm4 + punpckhwd xmm5,xmm4 + punpckldq xmm6,xmm0 + punpckhdq xmm7,xmm0 + movdqa xmm0,xmm1 + punpckldq xmm0,xmm5 + mov rax, [rsp+1C8h+160] ; pTC + punpckhdq xmm1,xmm5 + movdqa xmm9,xmm6 + punpckhqdq xmm6,xmm0 + punpcklqdq xmm9,xmm0 + movdqa xmm2,xmm7 + movdqa xmm13,xmm6 + movdqa xmm4,xmm9 + movdqa [rsp+10h],xmm9 + punpcklqdq xmm2,xmm1 + punpckhqdq xmm7,xmm1 + pxor xmm1,xmm1 + movsx ecx,byte [rax+3] + movsx edx,byte [rax+2] + movsx r8d,byte [rax+1] + movsx r9d,byte [rax] + movdqa xmm10,xmm1 + movdqa xmm15,xmm2 + punpckhbw xmm2,xmm1 + punpckhbw xmm6,xmm1 + punpcklbw xmm4,xmm1 + movsx eax,r11w + mov word [rsp+0Eh],cx + mov word [rsp+0Ch],cx + movdqa xmm3,xmm7 + movdqa xmm8,xmm7 + movdqa [rsp+20h],xmm7 + punpcklbw xmm15,xmm1 + punpcklbw xmm13,xmm1 + punpcklbw xmm3,xmm1 + mov word [rsp+0Ah],dx + mov word [rsp+8],dx + mov word [rsp+6],r8w + movd xmm0,eax + movdqa [rsp+30h],xmm6 + punpckhbw xmm9,xmm1 + punpckhbw xmm8,xmm1 + punpcklwd xmm0,xmm0 + movsx eax,word [rsp+1C0h+160] ; iBeta + mov word [rsp+4],r8w + mov word [rsp+2],r9w + pshufd xmm12,xmm0,0 + mov word [rsp],r9w + movd xmm0,eax + mov eax,4 + cwde + movdqa xmm14, [rsp] + movdqa [rsp],xmm2 + movdqa xmm2,xmm12 + punpcklwd xmm0,xmm0 + pshufd xmm11,xmm0,0 + psubw xmm10,xmm14 + movd xmm0,eax + movdqa xmm7,xmm14 + movdqa xmm6,xmm14 + pcmpgtw xmm7,xmm1 + punpcklwd xmm0,xmm0 + pshufd xmm5,xmm0,0 + movdqa xmm0,xmm4 + movdqa xmm1,xmm15 + psubw xmm4,xmm13 + psubw xmm0,xmm3 + psubw xmm1,xmm13 + psubw xmm3,xmm15 + psllw xmm1,2 + paddw xmm1,xmm0 + paddw xmm1,xmm5 + movdqa xmm0,xmm10 + psraw xmm1,3 + pmaxsw xmm0,xmm1 + pminsw xmm6,xmm0 + movdqa xmm1,xmm11 + movdqa xmm0,xmm13 + psubw xmm0,xmm15 + pabsw xmm0,xmm0 + pcmpgtw xmm2,xmm0 + pabsw xmm0,xmm4 + pcmpgtw xmm1,xmm0 + pabsw xmm0,xmm3 + pand xmm2,xmm1 + movdqa xmm1,xmm11 + movdqa xmm3,[rsp+30h] + pcmpgtw xmm1,xmm0 + movdqa xmm0,xmm9 + pand xmm2,xmm1 + psubw 
xmm0,xmm8 + psubw xmm9,xmm3 + pand xmm2,xmm7 + pand xmm6,xmm2 + psubw xmm15,xmm6 + paddw xmm13,xmm6 + movdqa xmm2,[rsp] + movdqa xmm1,xmm2 + psubw xmm1,xmm3 + psubw xmm8,xmm2 + psllw xmm1,2 + paddw xmm1,xmm0 + paddw xmm1,xmm5 + movdqa xmm0,xmm3 + movdqa xmm5,[rsp+10h] + psubw xmm0,xmm2 + psraw xmm1,3 + movdqa xmm4,xmm5 + pabsw xmm0,xmm0 + pmaxsw xmm10,xmm1 + movdqa xmm1,xmm11 + pcmpgtw xmm12,xmm0 + pabsw xmm0,xmm9 + pminsw xmm14,xmm10 + pcmpgtw xmm1,xmm0 + pabsw xmm0,xmm8 + pcmpgtw xmm11,xmm0 + pand xmm12,xmm1 + movdqa xmm1,[rsp+20h] + pand xmm12,xmm11 + pand xmm12,xmm7 + pand xmm14,xmm12 + paddw xmm3,xmm14 + psubw xmm2,xmm14 + packuswb xmm13,xmm3 + packuswb xmm15,xmm2 + punpcklbw xmm4,xmm13 + punpckhbw xmm5,xmm13 + movdqa xmm0,xmm15 + punpcklbw xmm0,xmm1 + punpckhbw xmm15,xmm1 + movdqa xmm3,xmm4 + punpcklwd xmm3,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm0,xmm5 + movdqa xmm2,xmm3 + movdqa xmm1,xmm4 + punpcklwd xmm0,xmm15 + punpckhwd xmm5,xmm15 + punpckldq xmm2,xmm0 + punpckhdq xmm3,xmm0 + punpckldq xmm1,xmm5 + movdqa xmm0,xmm2 + punpcklqdq xmm0,xmm1 + punpckhdq xmm4,xmm5 + punpckhqdq xmm2,xmm1 + movdqa [rsp+40h],xmm0 + movdqa xmm0,xmm3 + movdqa [rsp+90h],xmm2 + mov eax,[rsp+40h] + mov [rdi-2],eax + mov eax, [rsp+90h] + punpcklqdq xmm0,xmm4 + punpckhqdq xmm3,xmm4 + mov [rsi+rdi-2],eax + movdqa [rsp+50h],xmm0 + mov eax,[rsp+50h] + movdqa [rsp+0A0h],xmm3 + mov [rdi+rsi*2-2],eax + mov eax,[rsp+0A0h] + mov [r10+rdi-2],eax + mov eax,[rsp+48h] + mov [rbx],eax + mov eax,[rsp+98h] + mov [rsi+rbx],eax + mov eax,[rsp+58h] + mov [rbx+rsi*2],eax + mov eax, [rsp+0A8h] + mov [r10+rbx],eax + mov eax, [rsp+44h] + mov [r12-2],eax + mov eax,[rsp+94h] + mov [rsi+r12-2],eax + mov eax,[rsp+54h] + mov [r12+rsi*2-2],eax + mov eax, [rsp+0A4h] + mov [r10+r12-2],eax + mov eax,[rsp+4Ch] + mov [rbp],eax + mov eax,[rsp+9Ch] + mov [rsi+rbp],eax + mov eax, [rsp+5Ch] + mov [rbp+rsi*2],eax + mov eax,[rsp+0ACh] + mov [r10+rbp],eax + lea r11,[rsp+170h] + mov rsp,r11 + POP_XMM + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + ret @@ -1638,1591 +1638,1591 @@ WELS_EXTERN DeblockChromaLt4H_ssse3 WELS_EXTERN DeblockLumaLt4V_ssse3 - push rbp - mov r11,r8 ; pTC - sub rsp,1B0h - lea rbp,[rsp+20h] - movd xmm4,edx - movd xmm2,ecx - mov qword [rbp+180h],r12 - mov r10,rdi - movsxd r12,esi - add rsi,rsi - movsxd rdx,esi - sub r10,r12 - movsx r8d,byte [r11] - pxor xmm3,xmm3 - punpcklwd xmm2,xmm2 - movaps [rbp+50h],xmm14 - lea rax,[r12+r12*2] - movdqa xmm14,[rdx+rdi] - neg rax - pshufd xmm0,xmm2,0 - movd xmm2,r8d - movsx rsi,byte [r11+1] - movsx r8d,byte [r11+2] - movsx r11d,byte [r11+3] - movaps [rbp+70h],xmm12 - movd xmm1,esi - movaps [rbp+80h],xmm11 - movd xmm12,r8d - movd xmm11,r11d - movdqa xmm5, [rax+rdi] - lea rax,[r12+r12] - punpcklwd xmm12,xmm12 - neg rax - punpcklwd xmm11,xmm11 - movaps [rbp],xmm8 - movdqa xmm8, [r10] - punpcklwd xmm2,xmm2 - punpcklwd xmm1,xmm1 - punpcklqdq xmm12,xmm12 - punpcklqdq xmm11,xmm11 - punpcklqdq xmm2,xmm2 - punpcklqdq xmm1,xmm1 - shufps xmm12,xmm11,88h - movdqa xmm11,xmm8 - movaps [rbp+30h],xmm9 - movdqa xmm9,[rdi] - shufps xmm2,xmm1,88h - movdqa xmm1,xmm5 - punpcklbw xmm11,xmm3 - movaps [rbp+20h],xmm6 - movaps [rbp+60h],xmm13 - movdqa xmm13,xmm11 - movaps [rbp+90h],xmm10 - movdqa xmm10,xmm9 - movdqa xmm6,[rax+rdi] - punpcklbw xmm1,xmm3 - movaps [rbp+0A0h],xmm12 - psubw xmm13,xmm1 - movaps [rbp+40h],xmm15 - movdqa xmm15,xmm14 - movaps [rbp+10h],xmm7 - movdqa xmm7,xmm6 - punpcklbw xmm10,xmm3 - movdqa xmm12,[r12+rdi] - punpcklbw xmm7,xmm3 - punpcklbw xmm12,xmm3 - punpcklbw xmm15,xmm3 - pabsw xmm3,xmm13 - 
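
The hunk opened by the @@ -1638,1591 +1638,1591 @@ header above re-indents a second copy of the same luma routines. Judging by the prologues (this DeblockLumaLt4V_ssse3 takes its arguments in rdi/rsi/rdx/rcx/r8, and the DeblockLumaEq4V_ssse3 that follows begins by moving rdi/rsi/rdx/rcx into rcx/rdx/r8/r9), these appear to be the System V AMD64 builds of the functions whose Microsoft x64 builds appear earlier in the file; only the whitespace changes in this patch. Both ABI variants presumably back C-side prototypes along these lines, given here as an assumption for orientation rather than copied from the project's headers:

/* Assumed shape of the exported entry points (parameter names illustrative).
 * pTc carries four per-edge tc0 bytes for the bS<4 variant; the Eq4 variant
 * takes no tc argument. Requires <stdint.h>. */
void DeblockLumaLt4V_ssse3 (uint8_t* pPix, int32_t iStride, int32_t iAlpha,
                            int32_t iBeta, int8_t* pTc);
void DeblockLumaEq4V_ssse3 (uint8_t* pPix, int32_t iStride, int32_t iAlpha,
                            int32_t iBeta);
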
movdqa xmm13,xmm10 - psubw xmm13,xmm15 - movdqa [rbp+0F0h],xmm15 - pabsw xmm15,xmm13 - movdqa xmm13,xmm11 - movdqa [rbp+0B0h],xmm1 - movdqa xmm1,xmm0 - pavgw xmm13,xmm10 - pcmpgtw xmm1,xmm3 - movdqa [rbp+120h],xmm13 - movaps xmm13,xmm2 - punpcklwd xmm4,xmm4 - movdqa xmm3,xmm0 - movdqa [rbp+100h],xmm1 - psubw xmm13,xmm1 - movdqa xmm1,xmm10 - pcmpgtw xmm3,xmm15 - pshufd xmm4,xmm4,0 - psubw xmm1,xmm11 - movdqa [rbp+0D0h],xmm10 - psubw xmm13,xmm3 - movdqa [rbp+110h],xmm3 - pabsw xmm15,xmm1 - movdqa xmm3,xmm4 - psubw xmm10,xmm12 - pcmpgtw xmm3,xmm15 - pabsw xmm15,xmm10 - movdqa xmm10,xmm0 - psllw xmm1,2 - movdqa [rbp+0C0h],xmm11 - psubw xmm11,xmm7 - pcmpgtw xmm10,xmm15 - pabsw xmm11,xmm11 - movdqa xmm15,xmm0 - pand xmm3,xmm10 - pcmpgtw xmm15,xmm11 - movaps xmm11,xmm2 - pxor xmm10,xmm10 - pand xmm3,xmm15 - pcmpgtw xmm11,xmm10 - pcmpeqw xmm10,xmm2 - por xmm11,xmm10 - pand xmm3,xmm11 - movdqa xmm11,xmm7 - psubw xmm11,xmm12 - pxor xmm15,xmm15 - paddw xmm11,xmm1 - psubw xmm15,xmm13 - movdqa [rbp+0E0h],xmm12 - paddw xmm11,[FOUR_16B_SSE2] - pxor xmm12,xmm12 - psraw xmm11,3 - punpckhbw xmm8,xmm12 - pmaxsw xmm15,xmm11 - punpckhbw xmm5,xmm12 - movdqa xmm11,xmm8 - pminsw xmm13,xmm15 - psubw xmm11,xmm5 - punpckhbw xmm9,xmm12 - pand xmm13,xmm3 - movdqa [rbp+130h],xmm13 - pabsw xmm13,xmm11 - punpckhbw xmm14,xmm12 - movdqa xmm11,xmm9 - psubw xmm11,xmm14 - movdqa xmm15,xmm0 - movdqa [rbp+140h],xmm14 - pabsw xmm14,xmm11 - movdqa xmm11,xmm8 - pcmpgtw xmm15,xmm14 - movdqa xmm1,[r12+rdi] - pavgw xmm11,xmm9 - movdqa [rbp+170h],xmm11 - movdqa xmm10,xmm9 - punpckhbw xmm6,xmm12 - psubw xmm10,xmm8 - punpckhbw xmm1,xmm12 - movdqa xmm12,xmm0 - movaps xmm11,[rbp+0A0h] - pcmpgtw xmm12,xmm13 - movaps xmm13,xmm11 - psubw xmm13,xmm12 - movdqa [rbp+160h],xmm15 - psubw xmm13,xmm15 - movdqa xmm15,xmm9 - psubw xmm15,xmm1 - movdqa [rbp+150h],xmm12 - pabsw xmm12,xmm10 - pabsw xmm14,xmm15 - movdqa xmm15,xmm8 - pcmpgtw xmm4,xmm12 - movdqa xmm12,xmm0 - psubw xmm15,xmm6 - pcmpgtw xmm12,xmm14 - pabsw xmm14,xmm15 - psllw xmm10,2 - pcmpgtw xmm0,xmm14 - movdqa xmm14,xmm6 - psubw xmm14,xmm1 - pand xmm4,xmm12 - paddw xmm14,xmm10 - pand xmm4,xmm0 - paddw xmm14,[FOUR_16B_SSE2] - pxor xmm15,xmm15 - movaps xmm12,xmm11 - psubw xmm15,xmm13 - pxor xmm0,xmm0 - psraw xmm14,3 - pcmpgtw xmm12,xmm0 - pcmpeqw xmm0,xmm11 - pmaxsw xmm15,xmm14 - por xmm12,xmm0 - movdqa xmm0,[rbp+120h] - pminsw xmm13,xmm15 - movdqa xmm15,[rbp+0B0h] - movdqa xmm10,xmm7 - pand xmm4,xmm12 - paddw xmm15,xmm0 - pxor xmm12,xmm12 - paddw xmm10,xmm7 - movdqa xmm14,xmm12 - psubw xmm15,xmm10 - psubw xmm14,xmm2 - psraw xmm15,1 - pmaxsw xmm15,xmm14 - movdqa xmm10,xmm6 - pminsw xmm15,xmm2 - paddw xmm10,xmm6 - pand xmm15,xmm3 - psubw xmm12,xmm11 - pand xmm15,[rbp+100h] - pand xmm13,xmm4 - paddw xmm7,xmm15 - paddw xmm8,xmm13 - movdqa xmm15,[rbp+170h] - psubw xmm9,xmm13 - paddw xmm5,xmm15 - psubw xmm5,xmm10 - psraw xmm5,1 - pmaxsw xmm5,xmm12 - pminsw xmm5,xmm11 - pand xmm5,xmm4 - pand xmm5,[rbp+150h] - paddw xmm6,xmm5 - movdqa xmm5,[rbp+0C0h] - packuswb xmm7,xmm6 - movdqa xmm6,[rbp+130h] - paddw xmm5,xmm6 - packuswb xmm5,xmm8 - movdqa xmm8,[rbp+0D0h] - psubw xmm8,xmm6 - movdqa xmm6,[rbp+0F0h] - paddw xmm6,xmm0 - movdqa xmm0,[rbp+0E0h] - packuswb xmm8,xmm9 - movdqa xmm9,xmm0 - paddw xmm9,xmm0 - psubw xmm6,xmm9 - psraw xmm6,1 - pmaxsw xmm14,xmm6 - pminsw xmm2,xmm14 - pand xmm2,xmm3 - pand xmm2,[rbp+110h] - paddw xmm0,xmm2 - movdqa xmm2,[rbp+140h] - paddw xmm2,xmm15 - movdqa xmm15,xmm1 - paddw xmm15,xmm1 - psubw xmm2,xmm15 - psraw xmm2,1 - pmaxsw xmm12,xmm2 - pminsw xmm11,xmm12 - pand 
xmm11,xmm4 - pand xmm11,[rbp+160h] - paddw xmm1,xmm11 - movdqa [rax+rdi],xmm7 - movdqa [r10],xmm5 - packuswb xmm0,xmm1 - movdqa [rdi],xmm8 - movdqa [r12+rdi],xmm0 - mov r12,qword [rbp+180h] - lea rsp,[rbp+190h] - pop rbp - ret + push rbp + mov r11,r8 ; pTC + sub rsp,1B0h + lea rbp,[rsp+20h] + movd xmm4,edx + movd xmm2,ecx + mov qword [rbp+180h],r12 + mov r10,rdi + movsxd r12,esi + add rsi,rsi + movsxd rdx,esi + sub r10,r12 + movsx r8d,byte [r11] + pxor xmm3,xmm3 + punpcklwd xmm2,xmm2 + movaps [rbp+50h],xmm14 + lea rax,[r12+r12*2] + movdqa xmm14,[rdx+rdi] + neg rax + pshufd xmm0,xmm2,0 + movd xmm2,r8d + movsx rsi,byte [r11+1] + movsx r8d,byte [r11+2] + movsx r11d,byte [r11+3] + movaps [rbp+70h],xmm12 + movd xmm1,esi + movaps [rbp+80h],xmm11 + movd xmm12,r8d + movd xmm11,r11d + movdqa xmm5, [rax+rdi] + lea rax,[r12+r12] + punpcklwd xmm12,xmm12 + neg rax + punpcklwd xmm11,xmm11 + movaps [rbp],xmm8 + movdqa xmm8, [r10] + punpcklwd xmm2,xmm2 + punpcklwd xmm1,xmm1 + punpcklqdq xmm12,xmm12 + punpcklqdq xmm11,xmm11 + punpcklqdq xmm2,xmm2 + punpcklqdq xmm1,xmm1 + shufps xmm12,xmm11,88h + movdqa xmm11,xmm8 + movaps [rbp+30h],xmm9 + movdqa xmm9,[rdi] + shufps xmm2,xmm1,88h + movdqa xmm1,xmm5 + punpcklbw xmm11,xmm3 + movaps [rbp+20h],xmm6 + movaps [rbp+60h],xmm13 + movdqa xmm13,xmm11 + movaps [rbp+90h],xmm10 + movdqa xmm10,xmm9 + movdqa xmm6,[rax+rdi] + punpcklbw xmm1,xmm3 + movaps [rbp+0A0h],xmm12 + psubw xmm13,xmm1 + movaps [rbp+40h],xmm15 + movdqa xmm15,xmm14 + movaps [rbp+10h],xmm7 + movdqa xmm7,xmm6 + punpcklbw xmm10,xmm3 + movdqa xmm12,[r12+rdi] + punpcklbw xmm7,xmm3 + punpcklbw xmm12,xmm3 + punpcklbw xmm15,xmm3 + pabsw xmm3,xmm13 + movdqa xmm13,xmm10 + psubw xmm13,xmm15 + movdqa [rbp+0F0h],xmm15 + pabsw xmm15,xmm13 + movdqa xmm13,xmm11 + movdqa [rbp+0B0h],xmm1 + movdqa xmm1,xmm0 + pavgw xmm13,xmm10 + pcmpgtw xmm1,xmm3 + movdqa [rbp+120h],xmm13 + movaps xmm13,xmm2 + punpcklwd xmm4,xmm4 + movdqa xmm3,xmm0 + movdqa [rbp+100h],xmm1 + psubw xmm13,xmm1 + movdqa xmm1,xmm10 + pcmpgtw xmm3,xmm15 + pshufd xmm4,xmm4,0 + psubw xmm1,xmm11 + movdqa [rbp+0D0h],xmm10 + psubw xmm13,xmm3 + movdqa [rbp+110h],xmm3 + pabsw xmm15,xmm1 + movdqa xmm3,xmm4 + psubw xmm10,xmm12 + pcmpgtw xmm3,xmm15 + pabsw xmm15,xmm10 + movdqa xmm10,xmm0 + psllw xmm1,2 + movdqa [rbp+0C0h],xmm11 + psubw xmm11,xmm7 + pcmpgtw xmm10,xmm15 + pabsw xmm11,xmm11 + movdqa xmm15,xmm0 + pand xmm3,xmm10 + pcmpgtw xmm15,xmm11 + movaps xmm11,xmm2 + pxor xmm10,xmm10 + pand xmm3,xmm15 + pcmpgtw xmm11,xmm10 + pcmpeqw xmm10,xmm2 + por xmm11,xmm10 + pand xmm3,xmm11 + movdqa xmm11,xmm7 + psubw xmm11,xmm12 + pxor xmm15,xmm15 + paddw xmm11,xmm1 + psubw xmm15,xmm13 + movdqa [rbp+0E0h],xmm12 + paddw xmm11,[FOUR_16B_SSE2] + pxor xmm12,xmm12 + psraw xmm11,3 + punpckhbw xmm8,xmm12 + pmaxsw xmm15,xmm11 + punpckhbw xmm5,xmm12 + movdqa xmm11,xmm8 + pminsw xmm13,xmm15 + psubw xmm11,xmm5 + punpckhbw xmm9,xmm12 + pand xmm13,xmm3 + movdqa [rbp+130h],xmm13 + pabsw xmm13,xmm11 + punpckhbw xmm14,xmm12 + movdqa xmm11,xmm9 + psubw xmm11,xmm14 + movdqa xmm15,xmm0 + movdqa [rbp+140h],xmm14 + pabsw xmm14,xmm11 + movdqa xmm11,xmm8 + pcmpgtw xmm15,xmm14 + movdqa xmm1,[r12+rdi] + pavgw xmm11,xmm9 + movdqa [rbp+170h],xmm11 + movdqa xmm10,xmm9 + punpckhbw xmm6,xmm12 + psubw xmm10,xmm8 + punpckhbw xmm1,xmm12 + movdqa xmm12,xmm0 + movaps xmm11,[rbp+0A0h] + pcmpgtw xmm12,xmm13 + movaps xmm13,xmm11 + psubw xmm13,xmm12 + movdqa [rbp+160h],xmm15 + psubw xmm13,xmm15 + movdqa xmm15,xmm9 + psubw xmm15,xmm1 + movdqa [rbp+150h],xmm12 + pabsw xmm12,xmm10 + pabsw xmm14,xmm15 + movdqa xmm15,xmm8 + 
pcmpgtw xmm4,xmm12 + movdqa xmm12,xmm0 + psubw xmm15,xmm6 + pcmpgtw xmm12,xmm14 + pabsw xmm14,xmm15 + psllw xmm10,2 + pcmpgtw xmm0,xmm14 + movdqa xmm14,xmm6 + psubw xmm14,xmm1 + pand xmm4,xmm12 + paddw xmm14,xmm10 + pand xmm4,xmm0 + paddw xmm14,[FOUR_16B_SSE2] + pxor xmm15,xmm15 + movaps xmm12,xmm11 + psubw xmm15,xmm13 + pxor xmm0,xmm0 + psraw xmm14,3 + pcmpgtw xmm12,xmm0 + pcmpeqw xmm0,xmm11 + pmaxsw xmm15,xmm14 + por xmm12,xmm0 + movdqa xmm0,[rbp+120h] + pminsw xmm13,xmm15 + movdqa xmm15,[rbp+0B0h] + movdqa xmm10,xmm7 + pand xmm4,xmm12 + paddw xmm15,xmm0 + pxor xmm12,xmm12 + paddw xmm10,xmm7 + movdqa xmm14,xmm12 + psubw xmm15,xmm10 + psubw xmm14,xmm2 + psraw xmm15,1 + pmaxsw xmm15,xmm14 + movdqa xmm10,xmm6 + pminsw xmm15,xmm2 + paddw xmm10,xmm6 + pand xmm15,xmm3 + psubw xmm12,xmm11 + pand xmm15,[rbp+100h] + pand xmm13,xmm4 + paddw xmm7,xmm15 + paddw xmm8,xmm13 + movdqa xmm15,[rbp+170h] + psubw xmm9,xmm13 + paddw xmm5,xmm15 + psubw xmm5,xmm10 + psraw xmm5,1 + pmaxsw xmm5,xmm12 + pminsw xmm5,xmm11 + pand xmm5,xmm4 + pand xmm5,[rbp+150h] + paddw xmm6,xmm5 + movdqa xmm5,[rbp+0C0h] + packuswb xmm7,xmm6 + movdqa xmm6,[rbp+130h] + paddw xmm5,xmm6 + packuswb xmm5,xmm8 + movdqa xmm8,[rbp+0D0h] + psubw xmm8,xmm6 + movdqa xmm6,[rbp+0F0h] + paddw xmm6,xmm0 + movdqa xmm0,[rbp+0E0h] + packuswb xmm8,xmm9 + movdqa xmm9,xmm0 + paddw xmm9,xmm0 + psubw xmm6,xmm9 + psraw xmm6,1 + pmaxsw xmm14,xmm6 + pminsw xmm2,xmm14 + pand xmm2,xmm3 + pand xmm2,[rbp+110h] + paddw xmm0,xmm2 + movdqa xmm2,[rbp+140h] + paddw xmm2,xmm15 + movdqa xmm15,xmm1 + paddw xmm15,xmm1 + psubw xmm2,xmm15 + psraw xmm2,1 + pmaxsw xmm12,xmm2 + pminsw xmm11,xmm12 + pand xmm11,xmm4 + pand xmm11,[rbp+160h] + paddw xmm1,xmm11 + movdqa [rax+rdi],xmm7 + movdqa [r10],xmm5 + packuswb xmm0,xmm1 + movdqa [rdi],xmm8 + movdqa [r12+rdi],xmm0 + mov r12,qword [rbp+180h] + lea rsp,[rbp+190h] + pop rbp + ret WELS_EXTERN DeblockLumaEq4V_ssse3 - mov rax,rsp - push rbx - push rbp - mov r8, rdx - mov r9, rcx - mov rcx, rdi - mov rdx, rsi - sub rsp,1D8h - movaps [rax-38h],xmm6 - movaps [rax-48h],xmm7 - movaps [rax-58h],xmm8 - pxor xmm1,xmm1 - movsxd r10,edx - mov rbp,rcx - mov r11d,r8d - mov rdx,rcx - mov rdi,rbp - mov rbx,rbp - movdqa xmm5,[rbp] - movaps [rax-68h],xmm9 - movaps [rax-78h],xmm10 - punpcklbw xmm5,xmm1 - movaps [rax-88h],xmm11 - movaps [rax-98h],xmm12 - movaps [rax-0A8h],xmm13 - movaps [rax-0B8h],xmm14 - movdqa xmm14,[r10+rbp] - movaps [rax-0C8h],xmm15 - lea eax,[r10*4] - movsxd r8,eax - lea eax,[r10+r10*2] - movsxd rcx,eax - lea eax,[r10+r10] - sub rdx,r8 - punpcklbw xmm14,xmm1 - movdqa [rsp+90h],xmm5 - movdqa [rsp+30h],xmm14 - movsxd rsi,eax - movsx eax,r11w - sub rdi,rcx - sub rbx,rsi - mov r8,rbp - sub r8,r10 - movd xmm0,eax - movsx eax,r9w - movdqa xmm12,[rdi] - movdqa xmm6, [rsi+rbp] - movdqa xmm13,[rbx] - punpcklwd xmm0,xmm0 - pshufd xmm11,xmm0,0 - punpcklbw xmm13,xmm1 - punpcklbw xmm6,xmm1 - movdqa xmm8,[r8] - movd xmm0,eax - movdqa xmm10,xmm11 - mov eax,2 - punpcklbw xmm8,xmm1 - punpcklbw xmm12,xmm1 - cwde - punpcklwd xmm0,xmm0 - psraw xmm10,2 - movdqa xmm1,xmm8 - movdqa [rsp+0F0h],xmm13 - movdqa [rsp+0B0h],xmm8 - pshufd xmm7,xmm0,0 - psubw xmm1,xmm13 - movdqa xmm0,xmm5 - movdqa xmm4,xmm7 - movdqa xmm2,xmm7 - psubw xmm0,xmm8 - pabsw xmm3,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm5 - movdqa [rsp+40h],xmm7 - movdqa [rsp+60h],xmm6 - pcmpgtw xmm4,xmm0 - psubw xmm1,xmm14 - pabsw xmm0,xmm1 - pcmpgtw xmm2,xmm0 - pand xmm4,xmm2 - movdqa xmm0,xmm11 - pcmpgtw xmm0,xmm3 - pand xmm4,xmm0 - movd xmm0,eax - movdqa [rsp+20h],xmm4 - punpcklwd xmm0,xmm0 - 
pshufd xmm2,xmm0,0 - paddw xmm10,xmm2 - movdqa [rsp+0A0h],xmm2 - movdqa xmm15,xmm7 - pxor xmm4,xmm4 - movdqa xmm0,xmm8 - psubw xmm0,xmm12 - mov eax,4 - pabsw xmm0,xmm0 - movdqa xmm1,xmm10 - cwde - pcmpgtw xmm15,xmm0 - pcmpgtw xmm1,xmm3 - movdqa xmm3,xmm7 - movdqa xmm7,[rdx] - movdqa xmm0,xmm5 - psubw xmm0,xmm6 - pand xmm15,xmm1 - punpcklbw xmm7,xmm4 - movdqa xmm9,xmm15 - pabsw xmm0,xmm0 - psllw xmm7,1 - pandn xmm9,xmm12 - pcmpgtw xmm3,xmm0 - paddw xmm7,xmm12 - movd xmm0,eax - pand xmm3,xmm1 - paddw xmm7,xmm12 - punpcklwd xmm0,xmm0 - paddw xmm7,xmm12 - pshufd xmm1,xmm0,0 - paddw xmm7,xmm13 - movdqa xmm0,xmm3 - pandn xmm0,xmm6 - paddw xmm7,xmm8 - movdqa [rsp+70h],xmm1 - paddw xmm7,xmm5 - movdqa [rsp+120h],xmm0 - movdqa xmm0,[rcx+rbp] - punpcklbw xmm0,xmm4 - paddw xmm7,xmm1 - movdqa xmm4,xmm15 - psllw xmm0,1 - psraw xmm7,3 - paddw xmm0,xmm6 - pand xmm7,xmm15 - paddw xmm0,xmm6 - paddw xmm0,xmm6 - paddw xmm0,xmm14 - movdqa xmm6,xmm15 - paddw xmm0,xmm5 - pandn xmm6,xmm13 - paddw xmm0,xmm8 - paddw xmm0,xmm1 - psraw xmm0,3 - movdqa xmm1,xmm12 - paddw xmm1,xmm13 - pand xmm0,xmm3 - movdqa [rsp+100h],xmm0 - movdqa xmm0,xmm8 - paddw xmm0,xmm5 - paddw xmm1,xmm0 - movdqa xmm0,xmm3 - paddw xmm1,xmm2 - psraw xmm1,2 - pandn xmm0,xmm14 - pand xmm4,xmm1 - movdqa [rsp+0E0h],xmm0 - movdqa xmm0,xmm5 - paddw xmm0,xmm8 - movdqa xmm1,[rsp+60h] - paddw xmm1,xmm14 - movdqa xmm14,xmm3 - paddw xmm1,xmm0 - movdqa xmm0,xmm8 - paddw xmm0,[rsp+30h] - paddw xmm1,xmm2 - psraw xmm1,2 - pand xmm14,xmm1 - movdqa xmm1,xmm13 - paddw xmm1,xmm13 - paddw xmm1,xmm0 - paddw xmm1,xmm2 - psraw xmm1,2 - movdqa xmm0,[rsp+30h] - movdqa xmm2,xmm13 - movdqa xmm5,xmm15 - paddw xmm0,[rsp+70h] - pandn xmm5,xmm1 - paddw xmm2,xmm8 - movdqa xmm8,[rsp+90h] - movdqa xmm1,xmm12 - paddw xmm2,xmm8 - psllw xmm2,1 - paddw xmm2,xmm0 - paddw xmm1,xmm2 - movdqa xmm0,xmm8 - movdqa xmm8,xmm3 - movdqa xmm2,[rsp+30h] - paddw xmm0,xmm13 - psraw xmm1,3 - pand xmm15,xmm1 - movdqa xmm1,xmm2 - paddw xmm1,xmm2 - paddw xmm2,[rsp+90h] - paddw xmm2,[rsp+0B0h] - paddw xmm1,xmm0 - movdqa xmm0,xmm13 - movdqa xmm13,[r8] - paddw xmm0, [rsp+70h] - paddw xmm1, [rsp+0A0h] - psllw xmm2,1 - paddw xmm2,xmm0 - psraw xmm1,2 - movdqa xmm0, [rdi] - pandn xmm8,xmm1 - movdqa xmm1, [rsp+60h] - paddw xmm1,xmm2 - movdqa xmm2, [rbx] - psraw xmm1,3 - pand xmm3,xmm1 - movdqa xmm1, [rbp] - movdqa [rsp+0D0h],xmm3 - pxor xmm3,xmm3 - punpckhbw xmm0,xmm3 - punpckhbw xmm1,xmm3 - punpckhbw xmm13,xmm3 - movdqa [rsp+0C0h],xmm0 - movdqa xmm0,[r10+rbp] - movdqa [rsp],xmm1 - punpckhbw xmm0,xmm3 - punpckhbw xmm2,xmm3 - movdqa [rsp+80h],xmm0 - movdqa xmm0,[rsi+rbp] - movdqa [rsp+10h],xmm13 - punpckhbw xmm0,xmm3 - movdqa [rsp+50h],xmm0 - movdqa xmm0,xmm1 - movdqa xmm1,xmm13 - psubw xmm0,xmm13 - psubw xmm1,xmm2 - pabsw xmm3,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,[rsp] - movdqa xmm13,[rsp+40h] - movdqa [rsp+110h],xmm2 - psubw xmm1, [rsp+80h] - pcmpgtw xmm13,xmm0 - pcmpgtw xmm11,xmm3 - pabsw xmm0,xmm1 - pcmpgtw xmm10,xmm3 - movdqa xmm1, [rsp+40h] - movdqa xmm2,xmm1 - movdqa xmm3,xmm1 - pcmpgtw xmm2,xmm0 - movdqa xmm0, [rsp+10h] - pand xmm13,xmm2 - pand xmm13,xmm11 - movdqa xmm11,[rsp+0C0h] - psubw xmm0,xmm11 - pabsw xmm0,xmm0 - pcmpgtw xmm3,xmm0 - pand xmm3,xmm10 - movdqa xmm0,[rsp] - psubw xmm0,[rsp+50h] - movdqa xmm2,[rdx] - pabsw xmm0,xmm0 - por xmm7,xmm9 - movdqa xmm9,[rsp+20h] - pcmpgtw xmm1,xmm0 - pand xmm9,xmm7 - movdqa xmm7,[rsp+20h] - movdqa xmm0,xmm7 - pandn xmm0,xmm12 - movdqa xmm12,[rsp+110h] - pand xmm1,xmm10 - movdqa xmm10,[rsp+70h] - movdqa [rsp+40h],xmm1 - movdqa xmm1,xmm13 - por xmm9,xmm0 - 
pxor xmm0,xmm0 - por xmm4,xmm6 - movdqa xmm6,xmm7 - punpckhbw xmm2,xmm0 - por xmm15,xmm5 - movdqa xmm5,[rsp+20h] - movdqa xmm0,xmm3 - psllw xmm2,1 - pandn xmm0,xmm11 - pand xmm6,xmm4 - movdqa xmm4,[rsp] - paddw xmm2,xmm11 - pand xmm5,xmm15 - movdqa xmm15,[rsp+20h] - paddw xmm2,xmm11 - paddw xmm2,xmm11 - paddw xmm2,xmm12 - paddw xmm2,[rsp+10h] - paddw xmm2,[rsp] - paddw xmm2,xmm10 - psraw xmm2,3 - pand xmm2,xmm3 - por xmm2,xmm0 - pand xmm1,xmm2 - movdqa xmm0,xmm13 - movdqa xmm2,xmm11 - pandn xmm0,xmm11 - paddw xmm2,xmm12 - por xmm1,xmm0 - packuswb xmm9,xmm1 - movdqa xmm0,xmm7 - movdqa xmm7,[rsp+0A0h] - pandn xmm0,[rsp+0F0h] - movdqa xmm1,xmm3 - por xmm6,xmm0 - movdqa xmm0,[rsp+10h] - paddw xmm0,xmm4 - paddw xmm2,xmm0 - paddw xmm2,xmm7 - movdqa xmm0,xmm3 - pandn xmm0,xmm12 - psraw xmm2,2 - pand xmm1,xmm2 - por xmm1,xmm0 - movdqa xmm2,xmm13 - movdqa xmm0,xmm13 - pand xmm2,xmm1 - pandn xmm0,xmm12 - movdqa xmm1,xmm12 - paddw xmm1,[rsp+10h] - por xmm2,xmm0 - movdqa xmm0,xmm15 - pandn xmm0,[rsp+0B0h] - paddw xmm1,xmm4 - packuswb xmm6,xmm2 - movdqa xmm2,xmm3 - psllw xmm1,1 - por xmm5,xmm0 - movdqa xmm0,[rsp+80h] - paddw xmm0,xmm10 - paddw xmm1,xmm0 - paddw xmm11,xmm1 - psraw xmm11,3 - movdqa xmm1,xmm12 - pand xmm2,xmm11 - paddw xmm1,xmm12 - movdqa xmm11,[rsp+80h] - movdqa xmm0, [rsp+10h] - por xmm14,[rsp+0E0h] - paddw xmm0,xmm11 - movdqa xmm4,xmm15 - paddw xmm1,xmm0 - movdqa xmm0,xmm13 - paddw xmm1,xmm7 - psraw xmm1,2 - pandn xmm3,xmm1 - por xmm2,xmm3 - movdqa xmm1,xmm13 - movdqa xmm3,[rsp+10h] - pandn xmm0,xmm3 - pand xmm1,xmm2 - movdqa xmm2,xmm11 - paddw xmm2,[rsp] - por xmm1,xmm0 - movdqa xmm0,[rsp+0D0h] - por xmm0,xmm8 - paddw xmm2,xmm3 - packuswb xmm5,xmm1 - movdqa xmm8,[rsp+40h] - movdqa xmm1,[rsp+50h] - movdqa xmm3,xmm8 - pand xmm4,xmm0 - psllw xmm2,1 - movdqa xmm0,xmm15 - pandn xmm0,[rsp+90h] - por xmm4,xmm0 - movdqa xmm0,xmm12 - paddw xmm0,xmm10 - paddw xmm2,xmm0 - paddw xmm1,xmm2 - movdqa xmm0,[rsp] - movdqa xmm2,xmm11 - paddw xmm0,xmm12 - movdqa xmm12,[rsp] - paddw xmm2,xmm11 - paddw xmm2,xmm0 - psraw xmm1,3 - movdqa xmm0,xmm8 - pand xmm3,xmm1 - paddw xmm2,xmm7 - movdqa xmm1,xmm13 - psraw xmm2,2 - pandn xmm0,xmm2 - por xmm3,xmm0 - movdqa xmm2,[rsp+50h] - movdqa xmm0,xmm13 - pandn xmm0,xmm12 - pand xmm1,xmm3 - paddw xmm2,xmm11 - movdqa xmm3,xmm15 - por xmm1,xmm0 - pand xmm3,xmm14 - movdqa xmm14,[rsp+10h] - movdqa xmm0,xmm15 - pandn xmm0,[rsp+30h] - packuswb xmm4,xmm1 - movdqa xmm1,xmm8 - por xmm3,xmm0 - movdqa xmm0,xmm12 - paddw xmm0,xmm14 - paddw xmm2,xmm0 - paddw xmm2,xmm7 - movdqa xmm0,xmm8 - pandn xmm0,xmm11 - psraw xmm2,2 - pand xmm1,xmm2 - por xmm1,xmm0 - movdqa xmm2,xmm13 - movdqa xmm0,xmm13 - pandn xmm0,xmm11 - pand xmm2,xmm1 - movdqa xmm1,xmm15 - por xmm2,xmm0 - packuswb xmm3,xmm2 - movdqa xmm0,[rsp+100h] - por xmm0,[rsp+120h] - pand xmm1,xmm0 - movdqa xmm2,[rcx+rbp] - movdqa xmm7,[rsp+50h] - pandn xmm15,[rsp+60h] - lea r11,[rsp+1D8h] - pxor xmm0,xmm0 - por xmm1,xmm15 - movaps xmm15,[r11-0A8h] - movdqa [rdi],xmm9 - movaps xmm9,[r11-48h] - punpckhbw xmm2,xmm0 - psllw xmm2,1 - paddw xmm2,xmm7 - paddw xmm2,xmm7 - movdqa [rbx],xmm6 - movaps xmm6,[r11-18h] - paddw xmm2,xmm7 - paddw xmm2,xmm11 - movaps xmm11,[r11-68h] - paddw xmm2,xmm12 - movaps xmm12,[r11-78h] - paddw xmm2,xmm14 - paddw xmm2,xmm10 - psraw xmm2,3 - movaps xmm10,[r11-58h] - movaps xmm14,[r11-98h] - movdqa xmm0,xmm13 - pand xmm2,xmm8 - pandn xmm8,xmm7 - pandn xmm13,xmm7 - por xmm2,xmm8 - movaps xmm7,[r11-28h] - movaps xmm8,[r11-38h] - movdqa [r8],xmm5 - pand xmm0,xmm2 - por xmm0,xmm13 - packuswb xmm1,xmm0 - movaps 
xmm13,[r11-88h] - movdqa [rbp],xmm4 - movdqa [r10+rbp],xmm3 - movdqa [rsi+rbp],xmm1 - mov rsp,r11 - pop rbp - pop rbx - ret + mov rax,rsp + push rbx + push rbp + mov r8, rdx + mov r9, rcx + mov rcx, rdi + mov rdx, rsi + sub rsp,1D8h + movaps [rax-38h],xmm6 + movaps [rax-48h],xmm7 + movaps [rax-58h],xmm8 + pxor xmm1,xmm1 + movsxd r10,edx + mov rbp,rcx + mov r11d,r8d + mov rdx,rcx + mov rdi,rbp + mov rbx,rbp + movdqa xmm5,[rbp] + movaps [rax-68h],xmm9 + movaps [rax-78h],xmm10 + punpcklbw xmm5,xmm1 + movaps [rax-88h],xmm11 + movaps [rax-98h],xmm12 + movaps [rax-0A8h],xmm13 + movaps [rax-0B8h],xmm14 + movdqa xmm14,[r10+rbp] + movaps [rax-0C8h],xmm15 + lea eax,[r10*4] + movsxd r8,eax + lea eax,[r10+r10*2] + movsxd rcx,eax + lea eax,[r10+r10] + sub rdx,r8 + punpcklbw xmm14,xmm1 + movdqa [rsp+90h],xmm5 + movdqa [rsp+30h],xmm14 + movsxd rsi,eax + movsx eax,r11w + sub rdi,rcx + sub rbx,rsi + mov r8,rbp + sub r8,r10 + movd xmm0,eax + movsx eax,r9w + movdqa xmm12,[rdi] + movdqa xmm6, [rsi+rbp] + movdqa xmm13,[rbx] + punpcklwd xmm0,xmm0 + pshufd xmm11,xmm0,0 + punpcklbw xmm13,xmm1 + punpcklbw xmm6,xmm1 + movdqa xmm8,[r8] + movd xmm0,eax + movdqa xmm10,xmm11 + mov eax,2 + punpcklbw xmm8,xmm1 + punpcklbw xmm12,xmm1 + cwde + punpcklwd xmm0,xmm0 + psraw xmm10,2 + movdqa xmm1,xmm8 + movdqa [rsp+0F0h],xmm13 + movdqa [rsp+0B0h],xmm8 + pshufd xmm7,xmm0,0 + psubw xmm1,xmm13 + movdqa xmm0,xmm5 + movdqa xmm4,xmm7 + movdqa xmm2,xmm7 + psubw xmm0,xmm8 + pabsw xmm3,xmm0 + pabsw xmm0,xmm1 + movdqa xmm1,xmm5 + movdqa [rsp+40h],xmm7 + movdqa [rsp+60h],xmm6 + pcmpgtw xmm4,xmm0 + psubw xmm1,xmm14 + pabsw xmm0,xmm1 + pcmpgtw xmm2,xmm0 + pand xmm4,xmm2 + movdqa xmm0,xmm11 + pcmpgtw xmm0,xmm3 + pand xmm4,xmm0 + movd xmm0,eax + movdqa [rsp+20h],xmm4 + punpcklwd xmm0,xmm0 + pshufd xmm2,xmm0,0 + paddw xmm10,xmm2 + movdqa [rsp+0A0h],xmm2 + movdqa xmm15,xmm7 + pxor xmm4,xmm4 + movdqa xmm0,xmm8 + psubw xmm0,xmm12 + mov eax,4 + pabsw xmm0,xmm0 + movdqa xmm1,xmm10 + cwde + pcmpgtw xmm15,xmm0 + pcmpgtw xmm1,xmm3 + movdqa xmm3,xmm7 + movdqa xmm7,[rdx] + movdqa xmm0,xmm5 + psubw xmm0,xmm6 + pand xmm15,xmm1 + punpcklbw xmm7,xmm4 + movdqa xmm9,xmm15 + pabsw xmm0,xmm0 + psllw xmm7,1 + pandn xmm9,xmm12 + pcmpgtw xmm3,xmm0 + paddw xmm7,xmm12 + movd xmm0,eax + pand xmm3,xmm1 + paddw xmm7,xmm12 + punpcklwd xmm0,xmm0 + paddw xmm7,xmm12 + pshufd xmm1,xmm0,0 + paddw xmm7,xmm13 + movdqa xmm0,xmm3 + pandn xmm0,xmm6 + paddw xmm7,xmm8 + movdqa [rsp+70h],xmm1 + paddw xmm7,xmm5 + movdqa [rsp+120h],xmm0 + movdqa xmm0,[rcx+rbp] + punpcklbw xmm0,xmm4 + paddw xmm7,xmm1 + movdqa xmm4,xmm15 + psllw xmm0,1 + psraw xmm7,3 + paddw xmm0,xmm6 + pand xmm7,xmm15 + paddw xmm0,xmm6 + paddw xmm0,xmm6 + paddw xmm0,xmm14 + movdqa xmm6,xmm15 + paddw xmm0,xmm5 + pandn xmm6,xmm13 + paddw xmm0,xmm8 + paddw xmm0,xmm1 + psraw xmm0,3 + movdqa xmm1,xmm12 + paddw xmm1,xmm13 + pand xmm0,xmm3 + movdqa [rsp+100h],xmm0 + movdqa xmm0,xmm8 + paddw xmm0,xmm5 + paddw xmm1,xmm0 + movdqa xmm0,xmm3 + paddw xmm1,xmm2 + psraw xmm1,2 + pandn xmm0,xmm14 + pand xmm4,xmm1 + movdqa [rsp+0E0h],xmm0 + movdqa xmm0,xmm5 + paddw xmm0,xmm8 + movdqa xmm1,[rsp+60h] + paddw xmm1,xmm14 + movdqa xmm14,xmm3 + paddw xmm1,xmm0 + movdqa xmm0,xmm8 + paddw xmm0,[rsp+30h] + paddw xmm1,xmm2 + psraw xmm1,2 + pand xmm14,xmm1 + movdqa xmm1,xmm13 + paddw xmm1,xmm13 + paddw xmm1,xmm0 + paddw xmm1,xmm2 + psraw xmm1,2 + movdqa xmm0,[rsp+30h] + movdqa xmm2,xmm13 + movdqa xmm5,xmm15 + paddw xmm0,[rsp+70h] + pandn xmm5,xmm1 + paddw xmm2,xmm8 + movdqa xmm8,[rsp+90h] + movdqa xmm1,xmm12 + paddw xmm2,xmm8 + psllw xmm2,1 + 
paddw xmm2,xmm0 + paddw xmm1,xmm2 + movdqa xmm0,xmm8 + movdqa xmm8,xmm3 + movdqa xmm2,[rsp+30h] + paddw xmm0,xmm13 + psraw xmm1,3 + pand xmm15,xmm1 + movdqa xmm1,xmm2 + paddw xmm1,xmm2 + paddw xmm2,[rsp+90h] + paddw xmm2,[rsp+0B0h] + paddw xmm1,xmm0 + movdqa xmm0,xmm13 + movdqa xmm13,[r8] + paddw xmm0, [rsp+70h] + paddw xmm1, [rsp+0A0h] + psllw xmm2,1 + paddw xmm2,xmm0 + psraw xmm1,2 + movdqa xmm0, [rdi] + pandn xmm8,xmm1 + movdqa xmm1, [rsp+60h] + paddw xmm1,xmm2 + movdqa xmm2, [rbx] + psraw xmm1,3 + pand xmm3,xmm1 + movdqa xmm1, [rbp] + movdqa [rsp+0D0h],xmm3 + pxor xmm3,xmm3 + punpckhbw xmm0,xmm3 + punpckhbw xmm1,xmm3 + punpckhbw xmm13,xmm3 + movdqa [rsp+0C0h],xmm0 + movdqa xmm0,[r10+rbp] + movdqa [rsp],xmm1 + punpckhbw xmm0,xmm3 + punpckhbw xmm2,xmm3 + movdqa [rsp+80h],xmm0 + movdqa xmm0,[rsi+rbp] + movdqa [rsp+10h],xmm13 + punpckhbw xmm0,xmm3 + movdqa [rsp+50h],xmm0 + movdqa xmm0,xmm1 + movdqa xmm1,xmm13 + psubw xmm0,xmm13 + psubw xmm1,xmm2 + pabsw xmm3,xmm0 + pabsw xmm0,xmm1 + movdqa xmm1,[rsp] + movdqa xmm13,[rsp+40h] + movdqa [rsp+110h],xmm2 + psubw xmm1, [rsp+80h] + pcmpgtw xmm13,xmm0 + pcmpgtw xmm11,xmm3 + pabsw xmm0,xmm1 + pcmpgtw xmm10,xmm3 + movdqa xmm1, [rsp+40h] + movdqa xmm2,xmm1 + movdqa xmm3,xmm1 + pcmpgtw xmm2,xmm0 + movdqa xmm0, [rsp+10h] + pand xmm13,xmm2 + pand xmm13,xmm11 + movdqa xmm11,[rsp+0C0h] + psubw xmm0,xmm11 + pabsw xmm0,xmm0 + pcmpgtw xmm3,xmm0 + pand xmm3,xmm10 + movdqa xmm0,[rsp] + psubw xmm0,[rsp+50h] + movdqa xmm2,[rdx] + pabsw xmm0,xmm0 + por xmm7,xmm9 + movdqa xmm9,[rsp+20h] + pcmpgtw xmm1,xmm0 + pand xmm9,xmm7 + movdqa xmm7,[rsp+20h] + movdqa xmm0,xmm7 + pandn xmm0,xmm12 + movdqa xmm12,[rsp+110h] + pand xmm1,xmm10 + movdqa xmm10,[rsp+70h] + movdqa [rsp+40h],xmm1 + movdqa xmm1,xmm13 + por xmm9,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm6 + movdqa xmm6,xmm7 + punpckhbw xmm2,xmm0 + por xmm15,xmm5 + movdqa xmm5,[rsp+20h] + movdqa xmm0,xmm3 + psllw xmm2,1 + pandn xmm0,xmm11 + pand xmm6,xmm4 + movdqa xmm4,[rsp] + paddw xmm2,xmm11 + pand xmm5,xmm15 + movdqa xmm15,[rsp+20h] + paddw xmm2,xmm11 + paddw xmm2,xmm11 + paddw xmm2,xmm12 + paddw xmm2,[rsp+10h] + paddw xmm2,[rsp] + paddw xmm2,xmm10 + psraw xmm2,3 + pand xmm2,xmm3 + por xmm2,xmm0 + pand xmm1,xmm2 + movdqa xmm0,xmm13 + movdqa xmm2,xmm11 + pandn xmm0,xmm11 + paddw xmm2,xmm12 + por xmm1,xmm0 + packuswb xmm9,xmm1 + movdqa xmm0,xmm7 + movdqa xmm7,[rsp+0A0h] + pandn xmm0,[rsp+0F0h] + movdqa xmm1,xmm3 + por xmm6,xmm0 + movdqa xmm0,[rsp+10h] + paddw xmm0,xmm4 + paddw xmm2,xmm0 + paddw xmm2,xmm7 + movdqa xmm0,xmm3 + pandn xmm0,xmm12 + psraw xmm2,2 + pand xmm1,xmm2 + por xmm1,xmm0 + movdqa xmm2,xmm13 + movdqa xmm0,xmm13 + pand xmm2,xmm1 + pandn xmm0,xmm12 + movdqa xmm1,xmm12 + paddw xmm1,[rsp+10h] + por xmm2,xmm0 + movdqa xmm0,xmm15 + pandn xmm0,[rsp+0B0h] + paddw xmm1,xmm4 + packuswb xmm6,xmm2 + movdqa xmm2,xmm3 + psllw xmm1,1 + por xmm5,xmm0 + movdqa xmm0,[rsp+80h] + paddw xmm0,xmm10 + paddw xmm1,xmm0 + paddw xmm11,xmm1 + psraw xmm11,3 + movdqa xmm1,xmm12 + pand xmm2,xmm11 + paddw xmm1,xmm12 + movdqa xmm11,[rsp+80h] + movdqa xmm0, [rsp+10h] + por xmm14,[rsp+0E0h] + paddw xmm0,xmm11 + movdqa xmm4,xmm15 + paddw xmm1,xmm0 + movdqa xmm0,xmm13 + paddw xmm1,xmm7 + psraw xmm1,2 + pandn xmm3,xmm1 + por xmm2,xmm3 + movdqa xmm1,xmm13 + movdqa xmm3,[rsp+10h] + pandn xmm0,xmm3 + pand xmm1,xmm2 + movdqa xmm2,xmm11 + paddw xmm2,[rsp] + por xmm1,xmm0 + movdqa xmm0,[rsp+0D0h] + por xmm0,xmm8 + paddw xmm2,xmm3 + packuswb xmm5,xmm1 + movdqa xmm8,[rsp+40h] + movdqa xmm1,[rsp+50h] + movdqa xmm3,xmm8 + pand xmm4,xmm0 + psllw xmm2,1 + 
movdqa xmm0,xmm15 + pandn xmm0,[rsp+90h] + por xmm4,xmm0 + movdqa xmm0,xmm12 + paddw xmm0,xmm10 + paddw xmm2,xmm0 + paddw xmm1,xmm2 + movdqa xmm0,[rsp] + movdqa xmm2,xmm11 + paddw xmm0,xmm12 + movdqa xmm12,[rsp] + paddw xmm2,xmm11 + paddw xmm2,xmm0 + psraw xmm1,3 + movdqa xmm0,xmm8 + pand xmm3,xmm1 + paddw xmm2,xmm7 + movdqa xmm1,xmm13 + psraw xmm2,2 + pandn xmm0,xmm2 + por xmm3,xmm0 + movdqa xmm2,[rsp+50h] + movdqa xmm0,xmm13 + pandn xmm0,xmm12 + pand xmm1,xmm3 + paddw xmm2,xmm11 + movdqa xmm3,xmm15 + por xmm1,xmm0 + pand xmm3,xmm14 + movdqa xmm14,[rsp+10h] + movdqa xmm0,xmm15 + pandn xmm0,[rsp+30h] + packuswb xmm4,xmm1 + movdqa xmm1,xmm8 + por xmm3,xmm0 + movdqa xmm0,xmm12 + paddw xmm0,xmm14 + paddw xmm2,xmm0 + paddw xmm2,xmm7 + movdqa xmm0,xmm8 + pandn xmm0,xmm11 + psraw xmm2,2 + pand xmm1,xmm2 + por xmm1,xmm0 + movdqa xmm2,xmm13 + movdqa xmm0,xmm13 + pandn xmm0,xmm11 + pand xmm2,xmm1 + movdqa xmm1,xmm15 + por xmm2,xmm0 + packuswb xmm3,xmm2 + movdqa xmm0,[rsp+100h] + por xmm0,[rsp+120h] + pand xmm1,xmm0 + movdqa xmm2,[rcx+rbp] + movdqa xmm7,[rsp+50h] + pandn xmm15,[rsp+60h] + lea r11,[rsp+1D8h] + pxor xmm0,xmm0 + por xmm1,xmm15 + movaps xmm15,[r11-0A8h] + movdqa [rdi],xmm9 + movaps xmm9,[r11-48h] + punpckhbw xmm2,xmm0 + psllw xmm2,1 + paddw xmm2,xmm7 + paddw xmm2,xmm7 + movdqa [rbx],xmm6 + movaps xmm6,[r11-18h] + paddw xmm2,xmm7 + paddw xmm2,xmm11 + movaps xmm11,[r11-68h] + paddw xmm2,xmm12 + movaps xmm12,[r11-78h] + paddw xmm2,xmm14 + paddw xmm2,xmm10 + psraw xmm2,3 + movaps xmm10,[r11-58h] + movaps xmm14,[r11-98h] + movdqa xmm0,xmm13 + pand xmm2,xmm8 + pandn xmm8,xmm7 + pandn xmm13,xmm7 + por xmm2,xmm8 + movaps xmm7,[r11-28h] + movaps xmm8,[r11-38h] + movdqa [r8],xmm5 + pand xmm0,xmm2 + por xmm0,xmm13 + packuswb xmm1,xmm0 + movaps xmm13,[r11-88h] + movdqa [rbp],xmm4 + movdqa [r10+rbp],xmm3 + movdqa [rsi+rbp],xmm1 + mov rsp,r11 + pop rbp + pop rbx + ret WELS_EXTERN DeblockChromaLt4V_ssse3 - mov rax,rsp - push rbx - push rbp - mov r10, rdx - mov r11, rcx - mov rcx, rdi - mov rdx, rsi - mov rsi, r10 - mov r10, r9 - mov rbp, r8 - mov r8, rsi - mov r9, r11 - sub rsp,0C8h - pxor xmm1,xmm1 - mov rbx,rcx - movsxd r11,r8d - movsx ecx,byte [r10] - movsx r8d,byte [r10+2] - mov rdi,rdx - movq xmm2,[rbx] - movq xmm9,[r11+rbx] - movsx edx,byte [r10+1] - mov word [rsp+2],cx - mov word [rsp],cx - movsx eax,byte [r10+3] - mov word [rsp+6],dx - mov word [rsp+4],dx - movdqa xmm11,xmm1 - mov word [rsp+0Eh],ax - mov word [rsp+0Ch],ax - lea eax,[r11+r11] - movsxd rcx,eax - mov rax,rbx - mov rdx,rdi - sub rax,rcx - mov word [rsp+0Ah],r8w - mov word [rsp+8],r8w - movdqa xmm6,[rsp] - movdqa xmm7,xmm6 - movq xmm13, [rax] - mov rax,rdi - sub rax,rcx - mov rcx,rbx - pcmpgtw xmm7,xmm1 - psubw xmm11,xmm6 - sub rcx,r11 - sub rdx,r11 - movq xmm0,[rax] - movsx eax,r9w - movq xmm15,[rcx] - punpcklqdq xmm13,xmm0 - movq xmm0, [rdx] - movdqa xmm4,xmm13 - punpcklqdq xmm15,xmm0 - movq xmm0, [rdi] - punpcklbw xmm4,xmm1 - movdqa xmm12,xmm15 - punpcklqdq xmm2,xmm0 - movq xmm0, [r11+rdi] - punpcklbw xmm12,xmm1 - movdqa xmm14,xmm2 - punpcklqdq xmm9,xmm0 - punpckhbw xmm2,xmm1 - punpcklbw xmm14,xmm1 - movd xmm0,eax - mov eax, ebp ; iBeta - punpckhbw xmm13,xmm1 - punpckhbw xmm15,xmm1 - movdqa xmm3,xmm9 - movdqa [rsp+10h],xmm2 - punpcklwd xmm0,xmm0 - punpckhbw xmm9,xmm1 - punpcklbw xmm3,xmm1 - movdqa xmm1,xmm14 - pshufd xmm10,xmm0,0 - movd xmm0,eax - mov eax,4 - cwde - punpcklwd xmm0,xmm0 - pshufd xmm8,xmm0,0 - movd xmm0,eax - punpcklwd xmm0,xmm0 - pshufd xmm5,xmm0,0 - psubw xmm1,xmm12 - movdqa xmm2,xmm10 - lea r11,[rsp+0C8h] - 
psllw xmm1,2 - movdqa xmm0,xmm4 - psubw xmm4,xmm12 - psubw xmm0,xmm3 - psubw xmm3,xmm14 - paddw xmm1,xmm0 - paddw xmm1,xmm5 - movdqa xmm0,xmm11 - psraw xmm1,3 - pmaxsw xmm0,xmm1 - pminsw xmm6,xmm0 - movdqa xmm1,xmm8 - movdqa xmm0,xmm12 - psubw xmm0,xmm14 - pabsw xmm0,xmm0 - pcmpgtw xmm2,xmm0 - pabsw xmm0,xmm4 - pcmpgtw xmm1,xmm0 - pabsw xmm0,xmm3 - movdqa xmm3,[rsp] - pand xmm2,xmm1 - movdqa xmm1,xmm8 - pcmpgtw xmm1,xmm0 - movdqa xmm0,xmm13 - pand xmm2,xmm1 - psubw xmm0,xmm9 - psubw xmm13,xmm15 - pand xmm2,xmm7 - pand xmm6,xmm2 - paddw xmm12,xmm6 - psubw xmm14,xmm6 - movdqa xmm2,[rsp+10h] - movaps xmm6,[r11-18h] - movdqa xmm1,xmm2 - psubw xmm1,xmm15 - psubw xmm9,xmm2 - psllw xmm1,2 - paddw xmm1,xmm0 - paddw xmm1,xmm5 - movdqa xmm0,xmm15 - psubw xmm0,xmm2 - psraw xmm1,3 - pmaxsw xmm11,xmm1 - pabsw xmm0,xmm0 - movdqa xmm1,xmm8 - pcmpgtw xmm10,xmm0 - pabsw xmm0,xmm13 - pminsw xmm3,xmm11 - movaps xmm11,[r11-68h] - movaps xmm13,[rsp+40h] - pcmpgtw xmm1,xmm0 - pabsw xmm0,xmm9 - movaps xmm9, [r11-48h] - pand xmm10,xmm1 - pcmpgtw xmm8,xmm0 - pand xmm10,xmm8 - pand xmm10,xmm7 - movaps xmm8,[r11-38h] - movaps xmm7,[r11-28h] - pand xmm3,xmm10 - paddw xmm15,xmm3 - psubw xmm2,xmm3 - movaps xmm10,[r11-58h] - packuswb xmm12,xmm15 - movaps xmm15,[rsp+20h] - packuswb xmm14,xmm2 - movq [rcx],xmm12 - movq [rbx],xmm14 - psrldq xmm12,8 - psrldq xmm14,8 - movq [rdx],xmm12 - movaps xmm12,[r11-78h] - movq [rdi],xmm14 - movaps xmm14,[rsp+30h] - mov rsp,r11 - pop rbp - pop rbx - ret + mov rax,rsp + push rbx + push rbp + mov r10, rdx + mov r11, rcx + mov rcx, rdi + mov rdx, rsi + mov rsi, r10 + mov r10, r9 + mov rbp, r8 + mov r8, rsi + mov r9, r11 + sub rsp,0C8h + pxor xmm1,xmm1 + mov rbx,rcx + movsxd r11,r8d + movsx ecx,byte [r10] + movsx r8d,byte [r10+2] + mov rdi,rdx + movq xmm2,[rbx] + movq xmm9,[r11+rbx] + movsx edx,byte [r10+1] + mov word [rsp+2],cx + mov word [rsp],cx + movsx eax,byte [r10+3] + mov word [rsp+6],dx + mov word [rsp+4],dx + movdqa xmm11,xmm1 + mov word [rsp+0Eh],ax + mov word [rsp+0Ch],ax + lea eax,[r11+r11] + movsxd rcx,eax + mov rax,rbx + mov rdx,rdi + sub rax,rcx + mov word [rsp+0Ah],r8w + mov word [rsp+8],r8w + movdqa xmm6,[rsp] + movdqa xmm7,xmm6 + movq xmm13, [rax] + mov rax,rdi + sub rax,rcx + mov rcx,rbx + pcmpgtw xmm7,xmm1 + psubw xmm11,xmm6 + sub rcx,r11 + sub rdx,r11 + movq xmm0,[rax] + movsx eax,r9w + movq xmm15,[rcx] + punpcklqdq xmm13,xmm0 + movq xmm0, [rdx] + movdqa xmm4,xmm13 + punpcklqdq xmm15,xmm0 + movq xmm0, [rdi] + punpcklbw xmm4,xmm1 + movdqa xmm12,xmm15 + punpcklqdq xmm2,xmm0 + movq xmm0, [r11+rdi] + punpcklbw xmm12,xmm1 + movdqa xmm14,xmm2 + punpcklqdq xmm9,xmm0 + punpckhbw xmm2,xmm1 + punpcklbw xmm14,xmm1 + movd xmm0,eax + mov eax, ebp ; iBeta + punpckhbw xmm13,xmm1 + punpckhbw xmm15,xmm1 + movdqa xmm3,xmm9 + movdqa [rsp+10h],xmm2 + punpcklwd xmm0,xmm0 + punpckhbw xmm9,xmm1 + punpcklbw xmm3,xmm1 + movdqa xmm1,xmm14 + pshufd xmm10,xmm0,0 + movd xmm0,eax + mov eax,4 + cwde + punpcklwd xmm0,xmm0 + pshufd xmm8,xmm0,0 + movd xmm0,eax + punpcklwd xmm0,xmm0 + pshufd xmm5,xmm0,0 + psubw xmm1,xmm12 + movdqa xmm2,xmm10 + lea r11,[rsp+0C8h] + psllw xmm1,2 + movdqa xmm0,xmm4 + psubw xmm4,xmm12 + psubw xmm0,xmm3 + psubw xmm3,xmm14 + paddw xmm1,xmm0 + paddw xmm1,xmm5 + movdqa xmm0,xmm11 + psraw xmm1,3 + pmaxsw xmm0,xmm1 + pminsw xmm6,xmm0 + movdqa xmm1,xmm8 + movdqa xmm0,xmm12 + psubw xmm0,xmm14 + pabsw xmm0,xmm0 + pcmpgtw xmm2,xmm0 + pabsw xmm0,xmm4 + pcmpgtw xmm1,xmm0 + pabsw xmm0,xmm3 + movdqa xmm3,[rsp] + pand xmm2,xmm1 + movdqa xmm1,xmm8 + pcmpgtw xmm1,xmm0 + movdqa xmm0,xmm13 + 
pand xmm2,xmm1 + psubw xmm0,xmm9 + psubw xmm13,xmm15 + pand xmm2,xmm7 + pand xmm6,xmm2 + paddw xmm12,xmm6 + psubw xmm14,xmm6 + movdqa xmm2,[rsp+10h] + movaps xmm6,[r11-18h] + movdqa xmm1,xmm2 + psubw xmm1,xmm15 + psubw xmm9,xmm2 + psllw xmm1,2 + paddw xmm1,xmm0 + paddw xmm1,xmm5 + movdqa xmm0,xmm15 + psubw xmm0,xmm2 + psraw xmm1,3 + pmaxsw xmm11,xmm1 + pabsw xmm0,xmm0 + movdqa xmm1,xmm8 + pcmpgtw xmm10,xmm0 + pabsw xmm0,xmm13 + pminsw xmm3,xmm11 + movaps xmm11,[r11-68h] + movaps xmm13,[rsp+40h] + pcmpgtw xmm1,xmm0 + pabsw xmm0,xmm9 + movaps xmm9, [r11-48h] + pand xmm10,xmm1 + pcmpgtw xmm8,xmm0 + pand xmm10,xmm8 + pand xmm10,xmm7 + movaps xmm8,[r11-38h] + movaps xmm7,[r11-28h] + pand xmm3,xmm10 + paddw xmm15,xmm3 + psubw xmm2,xmm3 + movaps xmm10,[r11-58h] + packuswb xmm12,xmm15 + movaps xmm15,[rsp+20h] + packuswb xmm14,xmm2 + movq [rcx],xmm12 + movq [rbx],xmm14 + psrldq xmm12,8 + psrldq xmm14,8 + movq [rdx],xmm12 + movaps xmm12,[r11-78h] + movq [rdi],xmm14 + movaps xmm14,[rsp+30h] + mov rsp,r11 + pop rbp + pop rbx + ret WELS_EXTERN DeblockChromaEq4V_ssse3 - mov rax,rsp - push rbx - push rbp + mov rax,rsp + push rbx + push rbp - mov rbp, r8 - mov r8, rdx - mov r9, rcx - mov rcx, rdi - mov rdx, rsi + mov rbp, r8 + mov r8, rdx + mov r9, rcx + mov rcx, rdi + mov rdx, rsi - sub rsp,90h - pxor xmm1,xmm1 - mov r11,rcx - mov rbx,rdx - mov r10d,r9d - movq xmm13,[r11] - lea eax,[r8+r8] - movsxd r9,eax - mov rax,rcx - sub rax,r9 - movq xmm14,[rax] - mov rax,rdx - sub rax,r9 - movq xmm0,[rax] - movsxd rax,r8d - sub rcx,rax - sub rdx,rax - movq xmm12,[rax+r11] - movq xmm10,[rcx] - punpcklqdq xmm14,xmm0 - movdqa xmm8,xmm14 - movq xmm0,[rdx] - punpcklbw xmm8,xmm1 - punpckhbw xmm14,xmm1 - punpcklqdq xmm10,xmm0 - movq xmm0,[rbx] - movdqa xmm5,xmm10 - punpcklqdq xmm13,xmm0 - movq xmm0, [rax+rbx] - punpcklbw xmm5,xmm1 - movsx eax,r10w - movdqa xmm9,xmm13 - punpcklqdq xmm12,xmm0 - punpcklbw xmm9,xmm1 - punpckhbw xmm10,xmm1 - movd xmm0,eax - mov eax, ebp ; iBeta - punpckhbw xmm13,xmm1 - movdqa xmm7,xmm12 - punpcklwd xmm0,xmm0 - punpckhbw xmm12,xmm1 - pshufd xmm11,xmm0,0 - punpcklbw xmm7,xmm1 - movd xmm0,eax - movdqa xmm1,xmm8 - psubw xmm1,xmm5 - punpcklwd xmm0,xmm0 - movdqa xmm6,xmm11 - pshufd xmm3,xmm0,0 - movdqa xmm0,xmm5 - psubw xmm0,xmm9 - movdqa xmm2,xmm3 - pabsw xmm0,xmm0 - pcmpgtw xmm6,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm3 - pcmpgtw xmm2,xmm0 - pand xmm6,xmm2 - movdqa xmm0,xmm7 - movdqa xmm2,xmm3 - psubw xmm0,xmm9 - pabsw xmm0,xmm0 - pcmpgtw xmm1,xmm0 - pand xmm6,xmm1 - movdqa xmm0,xmm10 - movdqa xmm1,xmm14 - psubw xmm0,xmm13 - psubw xmm1,xmm10 - pabsw xmm0,xmm0 - pcmpgtw xmm11,xmm0 - pabsw xmm0,xmm1 - pcmpgtw xmm2,xmm0 - pand xmm11,xmm2 - movdqa xmm0,xmm12 - movdqa xmm4,xmm6 - movdqa xmm1,xmm8 - mov eax,2 - cwde - paddw xmm1,xmm8 - psubw xmm0,xmm13 - paddw xmm1,xmm5 - pabsw xmm0,xmm0 - movdqa xmm2,xmm14 - paddw xmm1,xmm7 - pcmpgtw xmm3,xmm0 - paddw xmm2,xmm14 - movd xmm0,eax - pand xmm11,xmm3 - paddw xmm7,xmm7 - paddw xmm2,xmm10 - punpcklwd xmm0,xmm0 - paddw xmm2,xmm12 - paddw xmm12,xmm12 - pshufd xmm3,xmm0,0 - paddw xmm7,xmm9 - paddw xmm12,xmm13 - movdqa xmm0,xmm6 - paddw xmm1,xmm3 - pandn xmm0,xmm5 - paddw xmm7,xmm8 - psraw xmm1,2 - paddw xmm12,xmm14 - paddw xmm7,xmm3 - ;movaps xmm14,[rsp] - pand xmm4,xmm1 - paddw xmm12,xmm3 - psraw xmm7,2 - movdqa xmm1,xmm11 - por xmm4,xmm0 - psraw xmm12,2 - paddw xmm2,xmm3 - movdqa xmm0,xmm11 - pandn xmm0,xmm10 - psraw xmm2,2 - pand xmm1,xmm2 - por xmm1,xmm0 - packuswb xmm4,xmm1 - movdqa xmm0,xmm11 - movdqa xmm1,xmm6 - pand xmm1,xmm7 - movq [rcx],xmm4 - pandn 
xmm6,xmm9 - pandn xmm11,xmm13 - pand xmm0,xmm12 - por xmm1,xmm6 - por xmm0,xmm11 - psrldq xmm4,8 - packuswb xmm1,xmm0 - movq [r11],xmm1 - psrldq xmm1,8 - movq [rdx],xmm4 - lea r11,[rsp+90h] - movq [rbx],xmm1 - mov rsp,r11 - pop rbp - pop rbx - ret + sub rsp,90h + pxor xmm1,xmm1 + mov r11,rcx + mov rbx,rdx + mov r10d,r9d + movq xmm13,[r11] + lea eax,[r8+r8] + movsxd r9,eax + mov rax,rcx + sub rax,r9 + movq xmm14,[rax] + mov rax,rdx + sub rax,r9 + movq xmm0,[rax] + movsxd rax,r8d + sub rcx,rax + sub rdx,rax + movq xmm12,[rax+r11] + movq xmm10,[rcx] + punpcklqdq xmm14,xmm0 + movdqa xmm8,xmm14 + movq xmm0,[rdx] + punpcklbw xmm8,xmm1 + punpckhbw xmm14,xmm1 + punpcklqdq xmm10,xmm0 + movq xmm0,[rbx] + movdqa xmm5,xmm10 + punpcklqdq xmm13,xmm0 + movq xmm0, [rax+rbx] + punpcklbw xmm5,xmm1 + movsx eax,r10w + movdqa xmm9,xmm13 + punpcklqdq xmm12,xmm0 + punpcklbw xmm9,xmm1 + punpckhbw xmm10,xmm1 + movd xmm0,eax + mov eax, ebp ; iBeta + punpckhbw xmm13,xmm1 + movdqa xmm7,xmm12 + punpcklwd xmm0,xmm0 + punpckhbw xmm12,xmm1 + pshufd xmm11,xmm0,0 + punpcklbw xmm7,xmm1 + movd xmm0,eax + movdqa xmm1,xmm8 + psubw xmm1,xmm5 + punpcklwd xmm0,xmm0 + movdqa xmm6,xmm11 + pshufd xmm3,xmm0,0 + movdqa xmm0,xmm5 + psubw xmm0,xmm9 + movdqa xmm2,xmm3 + pabsw xmm0,xmm0 + pcmpgtw xmm6,xmm0 + pabsw xmm0,xmm1 + movdqa xmm1,xmm3 + pcmpgtw xmm2,xmm0 + pand xmm6,xmm2 + movdqa xmm0,xmm7 + movdqa xmm2,xmm3 + psubw xmm0,xmm9 + pabsw xmm0,xmm0 + pcmpgtw xmm1,xmm0 + pand xmm6,xmm1 + movdqa xmm0,xmm10 + movdqa xmm1,xmm14 + psubw xmm0,xmm13 + psubw xmm1,xmm10 + pabsw xmm0,xmm0 + pcmpgtw xmm11,xmm0 + pabsw xmm0,xmm1 + pcmpgtw xmm2,xmm0 + pand xmm11,xmm2 + movdqa xmm0,xmm12 + movdqa xmm4,xmm6 + movdqa xmm1,xmm8 + mov eax,2 + cwde + paddw xmm1,xmm8 + psubw xmm0,xmm13 + paddw xmm1,xmm5 + pabsw xmm0,xmm0 + movdqa xmm2,xmm14 + paddw xmm1,xmm7 + pcmpgtw xmm3,xmm0 + paddw xmm2,xmm14 + movd xmm0,eax + pand xmm11,xmm3 + paddw xmm7,xmm7 + paddw xmm2,xmm10 + punpcklwd xmm0,xmm0 + paddw xmm2,xmm12 + paddw xmm12,xmm12 + pshufd xmm3,xmm0,0 + paddw xmm7,xmm9 + paddw xmm12,xmm13 + movdqa xmm0,xmm6 + paddw xmm1,xmm3 + pandn xmm0,xmm5 + paddw xmm7,xmm8 + psraw xmm1,2 + paddw xmm12,xmm14 + paddw xmm7,xmm3 + ;movaps xmm14,[rsp] + pand xmm4,xmm1 + paddw xmm12,xmm3 + psraw xmm7,2 + movdqa xmm1,xmm11 + por xmm4,xmm0 + psraw xmm12,2 + paddw xmm2,xmm3 + movdqa xmm0,xmm11 + pandn xmm0,xmm10 + psraw xmm2,2 + pand xmm1,xmm2 + por xmm1,xmm0 + packuswb xmm4,xmm1 + movdqa xmm0,xmm11 + movdqa xmm1,xmm6 + pand xmm1,xmm7 + movq [rcx],xmm4 + pandn xmm6,xmm9 + pandn xmm11,xmm13 + pand xmm0,xmm12 + por xmm1,xmm6 + por xmm0,xmm11 + psrldq xmm4,8 + packuswb xmm1,xmm0 + movq [r11],xmm1 + psrldq xmm1,8 + movq [rdx],xmm4 + lea r11,[rsp+90h] + movq [rbx],xmm1 + mov rsp,r11 + pop rbp + pop rbx + ret WELS_EXTERN DeblockChromaEq4H_ssse3 - mov rax,rsp - push rbx - push rbp - push r12 + mov rax,rsp + push rbx + push rbp + push r12 - mov rbp, r8 - mov r8, rdx - mov r9, rcx - mov rcx, rdi - mov rdx, rsi - mov rdi, rdx + mov rbp, r8 + mov r8, rdx + mov r9, rcx + mov rcx, rdi + mov rdx, rsi + mov rdi, rdx - sub rsp,140h - lea eax,[r8*4] - movsxd r10,eax - mov eax,[rcx-2] - mov [rsp+10h],eax - lea rbx,[r10+rdx-2] - lea r11,[r10+rcx-2] + sub rsp,140h + lea eax,[r8*4] + movsxd r10,eax + mov eax,[rcx-2] + mov [rsp+10h],eax + lea rbx,[r10+rdx-2] + lea r11,[r10+rcx-2] - movdqa xmm5,[rsp+10h] - movsxd r10,r8d - mov eax,[r10+rcx-2] - lea rdx,[r10+r10*2] - mov [rsp+20h],eax - mov eax,[rcx+r10*2-2] - mov [rsp+30h],eax - mov eax,[rdx+rcx-2] - movdqa xmm2,[rsp+20h] - mov [rsp+40h],eax - mov eax, 
[rdi-2] - movdqa xmm4,[rsp+30h] - mov [rsp+50h],eax - mov eax,[r10+rdi-2] - movdqa xmm3,[rsp+40h] - mov [rsp+60h],eax - mov eax,[rdi+r10*2-2] - punpckldq xmm5,[rsp+50h] - mov [rsp+70h],eax - mov eax, [rdx+rdi-2] - punpckldq xmm2, [rsp+60h] - mov [rsp+80h],eax - mov eax,[r11] - punpckldq xmm4, [rsp+70h] - mov [rsp+50h],eax - mov eax,[rbx] - punpckldq xmm3,[rsp+80h] - mov [rsp+60h],eax - mov eax,[r10+r11] - movdqa xmm0, [rsp+50h] - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm5,xmm0 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax,[r10+rbx] - movdqa xmm0,[rsp+50h] - movdqa xmm1,xmm5 - mov [rsp+60h],eax - mov eax,[r11+r10*2] - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm2,xmm0 - punpcklbw xmm1,xmm2 - punpckhbw xmm5,xmm2 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax,[rbx+r10*2] - movdqa xmm0,[rsp+50h] - mov [rsp+60h],eax - mov eax, [rdx+r11] - movdqa xmm15,xmm1 - punpckldq xmm0,[rsp+60h] - punpcklqdq xmm4,xmm0 - movdqa [rsp+50h],xmm0 - mov [rsp+50h],eax - mov eax, [rdx+rbx] - movdqa xmm0,[rsp+50h] - mov [rsp+60h],eax - punpckldq xmm0, [rsp+60h] - punpcklqdq xmm3,xmm0 - movdqa xmm0,xmm4 - punpcklbw xmm0,xmm3 - punpckhbw xmm4,xmm3 - punpcklwd xmm15,xmm0 - punpckhwd xmm1,xmm0 - movdqa xmm0,xmm5 - movdqa xmm12,xmm15 - punpcklwd xmm0,xmm4 - punpckhwd xmm5,xmm4 - punpckldq xmm12,xmm0 - punpckhdq xmm15,xmm0 - movdqa xmm0,xmm1 - movdqa xmm11,xmm12 - punpckldq xmm0,xmm5 - punpckhdq xmm1,xmm5 - punpcklqdq xmm11,xmm0 - punpckhqdq xmm12,xmm0 - movsx eax,r9w - movdqa xmm14,xmm15 - punpcklqdq xmm14,xmm1 - punpckhqdq xmm15,xmm1 - pxor xmm1,xmm1 - movd xmm0,eax - movdqa xmm4,xmm12 - movdqa xmm8,xmm11 - mov eax, ebp ; iBeta - punpcklwd xmm0,xmm0 - punpcklbw xmm4,xmm1 - punpckhbw xmm12,xmm1 - movdqa xmm9,xmm14 - movdqa xmm7,xmm15 - movdqa xmm10,xmm15 - pshufd xmm13,xmm0,0 - punpcklbw xmm9,xmm1 - punpckhbw xmm14,xmm1 - movdqa xmm6,xmm13 - movd xmm0,eax - movdqa [rsp],xmm11 - mov eax,2 - cwde - punpckhbw xmm11,xmm1 - punpckhbw xmm10,xmm1 - punpcklbw xmm7,xmm1 - punpcklwd xmm0,xmm0 - punpcklbw xmm8,xmm1 - pshufd xmm3,xmm0,0 - movdqa xmm1,xmm8 - movdqa xmm0,xmm4 - psubw xmm0,xmm9 - psubw xmm1,xmm4 - movdqa xmm2,xmm3 - pabsw xmm0,xmm0 - pcmpgtw xmm6,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm3 - pcmpgtw xmm2,xmm0 - pand xmm6,xmm2 - movdqa xmm0,xmm7 - movdqa xmm2,xmm3 - psubw xmm0,xmm9 - pabsw xmm0,xmm0 - pcmpgtw xmm1,xmm0 - pand xmm6,xmm1 - movdqa xmm0,xmm12 - movdqa xmm1,xmm11 - psubw xmm0,xmm14 - psubw xmm1,xmm12 - movdqa xmm5,xmm6 - pabsw xmm0,xmm0 - pcmpgtw xmm13,xmm0 - pabsw xmm0,xmm1 - movdqa xmm1,xmm8 - pcmpgtw xmm2,xmm0 - paddw xmm1,xmm8 - movdqa xmm0,xmm10 - pand xmm13,xmm2 - psubw xmm0,xmm14 - paddw xmm1,xmm4 - movdqa xmm2,xmm11 - pabsw xmm0,xmm0 - paddw xmm2,xmm11 - paddw xmm1,xmm7 - pcmpgtw xmm3,xmm0 - paddw xmm2,xmm12 - movd xmm0,eax - pand xmm13,xmm3 - paddw xmm2,xmm10 - punpcklwd xmm0,xmm0 - pshufd xmm3,xmm0,0 - movdqa xmm0,xmm6 - paddw xmm1,xmm3 - pandn xmm0,xmm4 - paddw xmm2,xmm3 - psraw xmm1,2 - pand xmm5,xmm1 - por xmm5,xmm0 - paddw xmm7,xmm7 - paddw xmm10,xmm10 - psraw xmm2,2 - movdqa xmm1,xmm13 - movdqa xmm0,xmm13 - pandn xmm0,xmm12 - pand xmm1,xmm2 - paddw xmm7,xmm9 - por xmm1,xmm0 - paddw xmm10,xmm14 - paddw xmm7,xmm8 - movdqa xmm0,xmm13 - packuswb xmm5,xmm1 - paddw xmm7,xmm3 - paddw xmm10,xmm11 - movdqa xmm1,xmm6 - paddw xmm10,xmm3 - pandn xmm6,xmm9 - psraw xmm7,2 - pand xmm1,xmm7 - psraw xmm10,2 - pandn xmm13,xmm14 - pand xmm0,xmm10 - por xmm1,xmm6 - movdqa xmm6,[rsp] - movdqa xmm4,xmm6 - por xmm0,xmm13 - punpcklbw xmm4,xmm5 - punpckhbw xmm6,xmm5 - movdqa xmm3,xmm4 - packuswb xmm1,xmm0 
- movdqa xmm0,xmm1 - punpckhbw xmm1,xmm15 - punpcklbw xmm0,xmm15 - punpcklwd xmm3,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm0,xmm6 - movdqa xmm2,xmm3 - punpcklwd xmm0,xmm1 - punpckhwd xmm6,xmm1 - movdqa xmm1,xmm4 - punpckldq xmm2,xmm0 - punpckhdq xmm3,xmm0 - punpckldq xmm1,xmm6 - movdqa xmm0,xmm2 - punpcklqdq xmm0,xmm1 - punpckhdq xmm4,xmm6 - punpckhqdq xmm2,xmm1 - movdqa [rsp+10h],xmm0 - movdqa [rsp+60h],xmm2 - movdqa xmm0,xmm3 - mov eax,[rsp+10h] - mov [rcx-2],eax - mov eax,[rsp+60h] - punpcklqdq xmm0,xmm4 - punpckhqdq xmm3,xmm4 - mov [r10+rcx-2],eax - movdqa [rsp+20h],xmm0 - mov eax, [rsp+20h] - movdqa [rsp+70h],xmm3 - mov [rcx+r10*2-2],eax - mov eax,[rsp+70h] - mov [rdx+rcx-2],eax - mov eax,[rsp+18h] - mov [r11],eax - mov eax,[rsp+68h] - mov [r10+r11],eax - mov eax,[rsp+28h] - mov [r11+r10*2],eax - mov eax,[rsp+78h] - mov [rdx+r11],eax - mov eax,[rsp+14h] - mov [rdi-2],eax - mov eax,[rsp+64h] - mov [r10+rdi-2],eax - mov eax,[rsp+24h] - mov [rdi+r10*2-2],eax - mov eax, [rsp+74h] - mov [rdx+rdi-2],eax - mov eax, [rsp+1Ch] - mov [rbx],eax - mov eax, [rsp+6Ch] - mov [r10+rbx],eax - mov eax,[rsp+2Ch] - mov [rbx+r10*2],eax - mov eax,[rsp+7Ch] - mov [rdx+rbx],eax - lea r11,[rsp+140h] - mov rbx, [r11+28h] - mov rsp,r11 - pop r12 - pop rbp - pop rbx - ret + movdqa xmm5,[rsp+10h] + movsxd r10,r8d + mov eax,[r10+rcx-2] + lea rdx,[r10+r10*2] + mov [rsp+20h],eax + mov eax,[rcx+r10*2-2] + mov [rsp+30h],eax + mov eax,[rdx+rcx-2] + movdqa xmm2,[rsp+20h] + mov [rsp+40h],eax + mov eax, [rdi-2] + movdqa xmm4,[rsp+30h] + mov [rsp+50h],eax + mov eax,[r10+rdi-2] + movdqa xmm3,[rsp+40h] + mov [rsp+60h],eax + mov eax,[rdi+r10*2-2] + punpckldq xmm5,[rsp+50h] + mov [rsp+70h],eax + mov eax, [rdx+rdi-2] + punpckldq xmm2, [rsp+60h] + mov [rsp+80h],eax + mov eax,[r11] + punpckldq xmm4, [rsp+70h] + mov [rsp+50h],eax + mov eax,[rbx] + punpckldq xmm3,[rsp+80h] + mov [rsp+60h],eax + mov eax,[r10+r11] + movdqa xmm0, [rsp+50h] + punpckldq xmm0, [rsp+60h] + punpcklqdq xmm5,xmm0 + movdqa [rsp+50h],xmm0 + mov [rsp+50h],eax + mov eax,[r10+rbx] + movdqa xmm0,[rsp+50h] + movdqa xmm1,xmm5 + mov [rsp+60h],eax + mov eax,[r11+r10*2] + punpckldq xmm0, [rsp+60h] + punpcklqdq xmm2,xmm0 + punpcklbw xmm1,xmm2 + punpckhbw xmm5,xmm2 + movdqa [rsp+50h],xmm0 + mov [rsp+50h],eax + mov eax,[rbx+r10*2] + movdqa xmm0,[rsp+50h] + mov [rsp+60h],eax + mov eax, [rdx+r11] + movdqa xmm15,xmm1 + punpckldq xmm0,[rsp+60h] + punpcklqdq xmm4,xmm0 + movdqa [rsp+50h],xmm0 + mov [rsp+50h],eax + mov eax, [rdx+rbx] + movdqa xmm0,[rsp+50h] + mov [rsp+60h],eax + punpckldq xmm0, [rsp+60h] + punpcklqdq xmm3,xmm0 + movdqa xmm0,xmm4 + punpcklbw xmm0,xmm3 + punpckhbw xmm4,xmm3 + punpcklwd xmm15,xmm0 + punpckhwd xmm1,xmm0 + movdqa xmm0,xmm5 + movdqa xmm12,xmm15 + punpcklwd xmm0,xmm4 + punpckhwd xmm5,xmm4 + punpckldq xmm12,xmm0 + punpckhdq xmm15,xmm0 + movdqa xmm0,xmm1 + movdqa xmm11,xmm12 + punpckldq xmm0,xmm5 + punpckhdq xmm1,xmm5 + punpcklqdq xmm11,xmm0 + punpckhqdq xmm12,xmm0 + movsx eax,r9w + movdqa xmm14,xmm15 + punpcklqdq xmm14,xmm1 + punpckhqdq xmm15,xmm1 + pxor xmm1,xmm1 + movd xmm0,eax + movdqa xmm4,xmm12 + movdqa xmm8,xmm11 + mov eax, ebp ; iBeta + punpcklwd xmm0,xmm0 + punpcklbw xmm4,xmm1 + punpckhbw xmm12,xmm1 + movdqa xmm9,xmm14 + movdqa xmm7,xmm15 + movdqa xmm10,xmm15 + pshufd xmm13,xmm0,0 + punpcklbw xmm9,xmm1 + punpckhbw xmm14,xmm1 + movdqa xmm6,xmm13 + movd xmm0,eax + movdqa [rsp],xmm11 + mov eax,2 + cwde + punpckhbw xmm11,xmm1 + punpckhbw xmm10,xmm1 + punpcklbw xmm7,xmm1 + punpcklwd xmm0,xmm0 + punpcklbw xmm8,xmm1 + pshufd xmm3,xmm0,0 + movdqa 
xmm1,xmm8 + movdqa xmm0,xmm4 + psubw xmm0,xmm9 + psubw xmm1,xmm4 + movdqa xmm2,xmm3 + pabsw xmm0,xmm0 + pcmpgtw xmm6,xmm0 + pabsw xmm0,xmm1 + movdqa xmm1,xmm3 + pcmpgtw xmm2,xmm0 + pand xmm6,xmm2 + movdqa xmm0,xmm7 + movdqa xmm2,xmm3 + psubw xmm0,xmm9 + pabsw xmm0,xmm0 + pcmpgtw xmm1,xmm0 + pand xmm6,xmm1 + movdqa xmm0,xmm12 + movdqa xmm1,xmm11 + psubw xmm0,xmm14 + psubw xmm1,xmm12 + movdqa xmm5,xmm6 + pabsw xmm0,xmm0 + pcmpgtw xmm13,xmm0 + pabsw xmm0,xmm1 + movdqa xmm1,xmm8 + pcmpgtw xmm2,xmm0 + paddw xmm1,xmm8 + movdqa xmm0,xmm10 + pand xmm13,xmm2 + psubw xmm0,xmm14 + paddw xmm1,xmm4 + movdqa xmm2,xmm11 + pabsw xmm0,xmm0 + paddw xmm2,xmm11 + paddw xmm1,xmm7 + pcmpgtw xmm3,xmm0 + paddw xmm2,xmm12 + movd xmm0,eax + pand xmm13,xmm3 + paddw xmm2,xmm10 + punpcklwd xmm0,xmm0 + pshufd xmm3,xmm0,0 + movdqa xmm0,xmm6 + paddw xmm1,xmm3 + pandn xmm0,xmm4 + paddw xmm2,xmm3 + psraw xmm1,2 + pand xmm5,xmm1 + por xmm5,xmm0 + paddw xmm7,xmm7 + paddw xmm10,xmm10 + psraw xmm2,2 + movdqa xmm1,xmm13 + movdqa xmm0,xmm13 + pandn xmm0,xmm12 + pand xmm1,xmm2 + paddw xmm7,xmm9 + por xmm1,xmm0 + paddw xmm10,xmm14 + paddw xmm7,xmm8 + movdqa xmm0,xmm13 + packuswb xmm5,xmm1 + paddw xmm7,xmm3 + paddw xmm10,xmm11 + movdqa xmm1,xmm6 + paddw xmm10,xmm3 + pandn xmm6,xmm9 + psraw xmm7,2 + pand xmm1,xmm7 + psraw xmm10,2 + pandn xmm13,xmm14 + pand xmm0,xmm10 + por xmm1,xmm6 + movdqa xmm6,[rsp] + movdqa xmm4,xmm6 + por xmm0,xmm13 + punpcklbw xmm4,xmm5 + punpckhbw xmm6,xmm5 + movdqa xmm3,xmm4 + packuswb xmm1,xmm0 + movdqa xmm0,xmm1 + punpckhbw xmm1,xmm15 + punpcklbw xmm0,xmm15 + punpcklwd xmm3,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm0,xmm6 + movdqa xmm2,xmm3 + punpcklwd xmm0,xmm1 + punpckhwd xmm6,xmm1 + movdqa xmm1,xmm4 + punpckldq xmm2,xmm0 + punpckhdq xmm3,xmm0 + punpckldq xmm1,xmm6 + movdqa xmm0,xmm2 + punpcklqdq xmm0,xmm1 + punpckhdq xmm4,xmm6 + punpckhqdq xmm2,xmm1 + movdqa [rsp+10h],xmm0 + movdqa [rsp+60h],xmm2 + movdqa xmm0,xmm3 + mov eax,[rsp+10h] + mov [rcx-2],eax + mov eax,[rsp+60h] + punpcklqdq xmm0,xmm4 + punpckhqdq xmm3,xmm4 + mov [r10+rcx-2],eax + movdqa [rsp+20h],xmm0 + mov eax, [rsp+20h] + movdqa [rsp+70h],xmm3 + mov [rcx+r10*2-2],eax + mov eax,[rsp+70h] + mov [rdx+rcx-2],eax + mov eax,[rsp+18h] + mov [r11],eax + mov eax,[rsp+68h] + mov [r10+r11],eax + mov eax,[rsp+28h] + mov [r11+r10*2],eax + mov eax,[rsp+78h] + mov [rdx+r11],eax + mov eax,[rsp+14h] + mov [rdi-2],eax + mov eax,[rsp+64h] + mov [r10+rdi-2],eax + mov eax,[rsp+24h] + mov [rdi+r10*2-2],eax + mov eax, [rsp+74h] + mov [rdx+rdi-2],eax + mov eax, [rsp+1Ch] + mov [rbx],eax + mov eax, [rsp+6Ch] + mov [r10+rbx],eax + mov eax,[rsp+2Ch] + mov [rbx+r10*2],eax + mov eax,[rsp+7Ch] + mov [rdx+rbx],eax + lea r11,[rsp+140h] + mov rbx, [r11+28h] + mov rsp,r11 + pop r12 + pop rbp + pop rbx + ret WELS_EXTERN DeblockChromaLt4H_ssse3 - mov rax,rsp - push rbx - push rbp - push r12 - push r13 - push r14 - sub rsp,170h + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + sub rsp,170h - mov r13, r8 - mov r14, r9 - mov r8, rdx - mov r9, rcx - mov rdx, rdi - mov rcx, rsi + mov r13, r8 + mov r14, r9 + mov r8, rdx + mov r9, rcx + mov rdx, rdi + mov rcx, rsi - movsxd rsi,r8d - lea eax,[r8*4] - mov r11d,r9d - movsxd r10,eax - mov eax, [rcx-2] - mov r12,rdx - mov [rsp+40h],eax - mov eax, [rsi+rcx-2] - lea rbx,[r10+rcx-2] - movdqa xmm5,[rsp+40h] - mov [rsp+50h],eax - mov eax, [rcx+rsi*2-2] - lea rbp,[r10+rdx-2] - movdqa xmm2, [rsp+50h] - mov [rsp+60h],eax - lea r10,[rsi+rsi*2] - mov rdi,rcx - mov eax,[r10+rcx-2] - movdqa xmm4,[rsp+60h] - mov [rsp+70h],eax - mov 
eax,[rdx-2] - mov [rsp+80h],eax - mov eax, [rsi+rdx-2] - movdqa xmm3,[rsp+70h] - mov [rsp+90h],eax - mov eax,[rdx+rsi*2-2] - punpckldq xmm5,[rsp+80h] - mov [rsp+0A0h],eax - mov eax, [r10+rdx-2] - punpckldq xmm2,[rsp+90h] - mov [rsp+0B0h],eax - mov eax, [rbx] - punpckldq xmm4,[rsp+0A0h] - mov [rsp+80h],eax - mov eax,[rbp] - punpckldq xmm3,[rsp+0B0h] - mov [rsp+90h],eax - mov eax,[rsi+rbx] - movdqa xmm0,[rsp+80h] - punpckldq xmm0,[rsp+90h] - punpcklqdq xmm5,xmm0 - movdqa [rsp+80h],xmm0 - mov [rsp+80h],eax - mov eax,[rsi+rbp] - movdqa xmm0,[rsp+80h] - movdqa xmm1,xmm5 - mov [rsp+90h],eax - mov eax,[rbx+rsi*2] - punpckldq xmm0,[rsp+90h] - punpcklqdq xmm2,xmm0 - punpcklbw xmm1,xmm2 - punpckhbw xmm5,xmm2 - movdqa [rsp+80h],xmm0 - mov [rsp+80h],eax - mov eax,[rbp+rsi*2] - movdqa xmm0, [rsp+80h] - mov [rsp+90h],eax - mov eax,[r10+rbx] - movdqa xmm7,xmm1 - punpckldq xmm0,[rsp+90h] - punpcklqdq xmm4,xmm0 - movdqa [rsp+80h],xmm0 - mov [rsp+80h],eax - mov eax, [r10+rbp] - movdqa xmm0,[rsp+80h] - mov [rsp+90h],eax - punpckldq xmm0,[rsp+90h] - punpcklqdq xmm3,xmm0 - movdqa xmm0,xmm4 - punpcklbw xmm0,xmm3 - punpckhbw xmm4,xmm3 - punpcklwd xmm7,xmm0 - punpckhwd xmm1,xmm0 - movdqa xmm0,xmm5 - movdqa xmm6,xmm7 - punpcklwd xmm0,xmm4 - punpckhwd xmm5,xmm4 - punpckldq xmm6,xmm0 - punpckhdq xmm7,xmm0 - movdqa xmm0,xmm1 - punpckldq xmm0,xmm5 - mov rax, r14 ; pTC - punpckhdq xmm1,xmm5 - movdqa xmm9,xmm6 - punpckhqdq xmm6,xmm0 - punpcklqdq xmm9,xmm0 - movdqa xmm2,xmm7 - movdqa xmm13,xmm6 - movdqa xmm4,xmm9 - movdqa [rsp+10h],xmm9 - punpcklqdq xmm2,xmm1 - punpckhqdq xmm7,xmm1 - pxor xmm1,xmm1 - movsx ecx,byte [rax+3] - movsx edx,byte [rax+2] - movsx r8d,byte [rax+1] - movsx r9d,byte [rax] - movdqa xmm10,xmm1 - movdqa xmm15,xmm2 - punpckhbw xmm2,xmm1 - punpckhbw xmm6,xmm1 - punpcklbw xmm4,xmm1 - movsx eax,r11w - mov word [rsp+0Eh],cx - mov word [rsp+0Ch],cx - movdqa xmm3,xmm7 - movdqa xmm8,xmm7 - movdqa [rsp+20h],xmm7 - punpcklbw xmm15,xmm1 - punpcklbw xmm13,xmm1 - punpcklbw xmm3,xmm1 - mov word [rsp+0Ah],dx - mov word [rsp+8],dx - mov word [rsp+6],r8w - movd xmm0,eax - movdqa [rsp+30h],xmm6 - punpckhbw xmm9,xmm1 - punpckhbw xmm8,xmm1 - punpcklwd xmm0,xmm0 - mov eax, r13d ; iBeta - mov word [rsp+4],r8w - mov word [rsp+2],r9w - pshufd xmm12,xmm0,0 - mov word [rsp],r9w - movd xmm0,eax - mov eax,4 - cwde - movdqa xmm14, [rsp] - movdqa [rsp],xmm2 - movdqa xmm2,xmm12 - punpcklwd xmm0,xmm0 - pshufd xmm11,xmm0,0 - psubw xmm10,xmm14 - movd xmm0,eax - movdqa xmm7,xmm14 - movdqa xmm6,xmm14 - pcmpgtw xmm7,xmm1 - punpcklwd xmm0,xmm0 - pshufd xmm5,xmm0,0 - movdqa xmm0,xmm4 - movdqa xmm1,xmm15 - psubw xmm4,xmm13 - psubw xmm0,xmm3 - psubw xmm1,xmm13 - psubw xmm3,xmm15 - psllw xmm1,2 - paddw xmm1,xmm0 - paddw xmm1,xmm5 - movdqa xmm0,xmm10 - psraw xmm1,3 - pmaxsw xmm0,xmm1 - pminsw xmm6,xmm0 - movdqa xmm1,xmm11 - movdqa xmm0,xmm13 - psubw xmm0,xmm15 - pabsw xmm0,xmm0 - pcmpgtw xmm2,xmm0 - pabsw xmm0,xmm4 - pcmpgtw xmm1,xmm0 - pabsw xmm0,xmm3 - pand xmm2,xmm1 - movdqa xmm1,xmm11 - movdqa xmm3,[rsp+30h] - pcmpgtw xmm1,xmm0 - movdqa xmm0,xmm9 - pand xmm2,xmm1 - psubw xmm0,xmm8 - psubw xmm9,xmm3 - pand xmm2,xmm7 - pand xmm6,xmm2 - psubw xmm15,xmm6 - paddw xmm13,xmm6 - movdqa xmm2,[rsp] - movdqa xmm1,xmm2 - psubw xmm1,xmm3 - psubw xmm8,xmm2 - psllw xmm1,2 - paddw xmm1,xmm0 - paddw xmm1,xmm5 - movdqa xmm0,xmm3 - movdqa xmm5,[rsp+10h] - psubw xmm0,xmm2 - psraw xmm1,3 - movdqa xmm4,xmm5 - pabsw xmm0,xmm0 - pmaxsw xmm10,xmm1 - movdqa xmm1,xmm11 - pcmpgtw xmm12,xmm0 - pabsw xmm0,xmm9 - pminsw xmm14,xmm10 - pcmpgtw xmm1,xmm0 - pabsw xmm0,xmm8 
- pcmpgtw xmm11,xmm0 - pand xmm12,xmm1 - movdqa xmm1,[rsp+20h] - pand xmm12,xmm11 - pand xmm12,xmm7 - pand xmm14,xmm12 - paddw xmm3,xmm14 - psubw xmm2,xmm14 - packuswb xmm13,xmm3 - packuswb xmm15,xmm2 - punpcklbw xmm4,xmm13 - punpckhbw xmm5,xmm13 - movdqa xmm0,xmm15 - punpcklbw xmm0,xmm1 - punpckhbw xmm15,xmm1 - movdqa xmm3,xmm4 - punpcklwd xmm3,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm0,xmm5 - movdqa xmm2,xmm3 - movdqa xmm1,xmm4 - punpcklwd xmm0,xmm15 - punpckhwd xmm5,xmm15 - punpckldq xmm2,xmm0 - punpckhdq xmm3,xmm0 - punpckldq xmm1,xmm5 - movdqa xmm0,xmm2 - punpcklqdq xmm0,xmm1 - punpckhdq xmm4,xmm5 - punpckhqdq xmm2,xmm1 - movdqa [rsp+40h],xmm0 - movdqa xmm0,xmm3 - movdqa [rsp+90h],xmm2 - mov eax,[rsp+40h] - mov [rdi-2],eax - mov eax, [rsp+90h] - punpcklqdq xmm0,xmm4 - punpckhqdq xmm3,xmm4 - mov [rsi+rdi-2],eax - movdqa [rsp+50h],xmm0 - mov eax,[rsp+50h] - movdqa [rsp+0A0h],xmm3 - mov [rdi+rsi*2-2],eax - mov eax,[rsp+0A0h] - mov [r10+rdi-2],eax - mov eax,[rsp+48h] - mov [rbx],eax - mov eax,[rsp+98h] - mov [rsi+rbx],eax - mov eax,[rsp+58h] - mov [rbx+rsi*2],eax - mov eax, [rsp+0A8h] - mov [r10+rbx],eax - mov eax, [rsp+44h] - mov [r12-2],eax - mov eax,[rsp+94h] - mov [rsi+r12-2],eax - mov eax,[rsp+54h] - mov [r12+rsi*2-2],eax - mov eax, [rsp+0A4h] - mov [r10+r12-2],eax - mov eax,[rsp+4Ch] - mov [rbp],eax - mov eax,[rsp+9Ch] - mov [rsi+rbp],eax - mov eax, [rsp+5Ch] - mov [rbp+rsi*2],eax - mov eax,[rsp+0ACh] - mov [r10+rbp],eax - lea r11,[rsp+170h] - mov rsp,r11 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - ret + movsxd rsi,r8d + lea eax,[r8*4] + mov r11d,r9d + movsxd r10,eax + mov eax, [rcx-2] + mov r12,rdx + mov [rsp+40h],eax + mov eax, [rsi+rcx-2] + lea rbx,[r10+rcx-2] + movdqa xmm5,[rsp+40h] + mov [rsp+50h],eax + mov eax, [rcx+rsi*2-2] + lea rbp,[r10+rdx-2] + movdqa xmm2, [rsp+50h] + mov [rsp+60h],eax + lea r10,[rsi+rsi*2] + mov rdi,rcx + mov eax,[r10+rcx-2] + movdqa xmm4,[rsp+60h] + mov [rsp+70h],eax + mov eax,[rdx-2] + mov [rsp+80h],eax + mov eax, [rsi+rdx-2] + movdqa xmm3,[rsp+70h] + mov [rsp+90h],eax + mov eax,[rdx+rsi*2-2] + punpckldq xmm5,[rsp+80h] + mov [rsp+0A0h],eax + mov eax, [r10+rdx-2] + punpckldq xmm2,[rsp+90h] + mov [rsp+0B0h],eax + mov eax, [rbx] + punpckldq xmm4,[rsp+0A0h] + mov [rsp+80h],eax + mov eax,[rbp] + punpckldq xmm3,[rsp+0B0h] + mov [rsp+90h],eax + mov eax,[rsi+rbx] + movdqa xmm0,[rsp+80h] + punpckldq xmm0,[rsp+90h] + punpcklqdq xmm5,xmm0 + movdqa [rsp+80h],xmm0 + mov [rsp+80h],eax + mov eax,[rsi+rbp] + movdqa xmm0,[rsp+80h] + movdqa xmm1,xmm5 + mov [rsp+90h],eax + mov eax,[rbx+rsi*2] + punpckldq xmm0,[rsp+90h] + punpcklqdq xmm2,xmm0 + punpcklbw xmm1,xmm2 + punpckhbw xmm5,xmm2 + movdqa [rsp+80h],xmm0 + mov [rsp+80h],eax + mov eax,[rbp+rsi*2] + movdqa xmm0, [rsp+80h] + mov [rsp+90h],eax + mov eax,[r10+rbx] + movdqa xmm7,xmm1 + punpckldq xmm0,[rsp+90h] + punpcklqdq xmm4,xmm0 + movdqa [rsp+80h],xmm0 + mov [rsp+80h],eax + mov eax, [r10+rbp] + movdqa xmm0,[rsp+80h] + mov [rsp+90h],eax + punpckldq xmm0,[rsp+90h] + punpcklqdq xmm3,xmm0 + movdqa xmm0,xmm4 + punpcklbw xmm0,xmm3 + punpckhbw xmm4,xmm3 + punpcklwd xmm7,xmm0 + punpckhwd xmm1,xmm0 + movdqa xmm0,xmm5 + movdqa xmm6,xmm7 + punpcklwd xmm0,xmm4 + punpckhwd xmm5,xmm4 + punpckldq xmm6,xmm0 + punpckhdq xmm7,xmm0 + movdqa xmm0,xmm1 + punpckldq xmm0,xmm5 + mov rax, r14 ; pTC + punpckhdq xmm1,xmm5 + movdqa xmm9,xmm6 + punpckhqdq xmm6,xmm0 + punpcklqdq xmm9,xmm0 + movdqa xmm2,xmm7 + movdqa xmm13,xmm6 + movdqa xmm4,xmm9 + movdqa [rsp+10h],xmm9 + punpcklqdq xmm2,xmm1 + punpckhqdq xmm7,xmm1 + pxor xmm1,xmm1 + movsx 
ecx,byte [rax+3] + movsx edx,byte [rax+2] + movsx r8d,byte [rax+1] + movsx r9d,byte [rax] + movdqa xmm10,xmm1 + movdqa xmm15,xmm2 + punpckhbw xmm2,xmm1 + punpckhbw xmm6,xmm1 + punpcklbw xmm4,xmm1 + movsx eax,r11w + mov word [rsp+0Eh],cx + mov word [rsp+0Ch],cx + movdqa xmm3,xmm7 + movdqa xmm8,xmm7 + movdqa [rsp+20h],xmm7 + punpcklbw xmm15,xmm1 + punpcklbw xmm13,xmm1 + punpcklbw xmm3,xmm1 + mov word [rsp+0Ah],dx + mov word [rsp+8],dx + mov word [rsp+6],r8w + movd xmm0,eax + movdqa [rsp+30h],xmm6 + punpckhbw xmm9,xmm1 + punpckhbw xmm8,xmm1 + punpcklwd xmm0,xmm0 + mov eax, r13d ; iBeta + mov word [rsp+4],r8w + mov word [rsp+2],r9w + pshufd xmm12,xmm0,0 + mov word [rsp],r9w + movd xmm0,eax + mov eax,4 + cwde + movdqa xmm14, [rsp] + movdqa [rsp],xmm2 + movdqa xmm2,xmm12 + punpcklwd xmm0,xmm0 + pshufd xmm11,xmm0,0 + psubw xmm10,xmm14 + movd xmm0,eax + movdqa xmm7,xmm14 + movdqa xmm6,xmm14 + pcmpgtw xmm7,xmm1 + punpcklwd xmm0,xmm0 + pshufd xmm5,xmm0,0 + movdqa xmm0,xmm4 + movdqa xmm1,xmm15 + psubw xmm4,xmm13 + psubw xmm0,xmm3 + psubw xmm1,xmm13 + psubw xmm3,xmm15 + psllw xmm1,2 + paddw xmm1,xmm0 + paddw xmm1,xmm5 + movdqa xmm0,xmm10 + psraw xmm1,3 + pmaxsw xmm0,xmm1 + pminsw xmm6,xmm0 + movdqa xmm1,xmm11 + movdqa xmm0,xmm13 + psubw xmm0,xmm15 + pabsw xmm0,xmm0 + pcmpgtw xmm2,xmm0 + pabsw xmm0,xmm4 + pcmpgtw xmm1,xmm0 + pabsw xmm0,xmm3 + pand xmm2,xmm1 + movdqa xmm1,xmm11 + movdqa xmm3,[rsp+30h] + pcmpgtw xmm1,xmm0 + movdqa xmm0,xmm9 + pand xmm2,xmm1 + psubw xmm0,xmm8 + psubw xmm9,xmm3 + pand xmm2,xmm7 + pand xmm6,xmm2 + psubw xmm15,xmm6 + paddw xmm13,xmm6 + movdqa xmm2,[rsp] + movdqa xmm1,xmm2 + psubw xmm1,xmm3 + psubw xmm8,xmm2 + psllw xmm1,2 + paddw xmm1,xmm0 + paddw xmm1,xmm5 + movdqa xmm0,xmm3 + movdqa xmm5,[rsp+10h] + psubw xmm0,xmm2 + psraw xmm1,3 + movdqa xmm4,xmm5 + pabsw xmm0,xmm0 + pmaxsw xmm10,xmm1 + movdqa xmm1,xmm11 + pcmpgtw xmm12,xmm0 + pabsw xmm0,xmm9 + pminsw xmm14,xmm10 + pcmpgtw xmm1,xmm0 + pabsw xmm0,xmm8 + pcmpgtw xmm11,xmm0 + pand xmm12,xmm1 + movdqa xmm1,[rsp+20h] + pand xmm12,xmm11 + pand xmm12,xmm7 + pand xmm14,xmm12 + paddw xmm3,xmm14 + psubw xmm2,xmm14 + packuswb xmm13,xmm3 + packuswb xmm15,xmm2 + punpcklbw xmm4,xmm13 + punpckhbw xmm5,xmm13 + movdqa xmm0,xmm15 + punpcklbw xmm0,xmm1 + punpckhbw xmm15,xmm1 + movdqa xmm3,xmm4 + punpcklwd xmm3,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm0,xmm5 + movdqa xmm2,xmm3 + movdqa xmm1,xmm4 + punpcklwd xmm0,xmm15 + punpckhwd xmm5,xmm15 + punpckldq xmm2,xmm0 + punpckhdq xmm3,xmm0 + punpckldq xmm1,xmm5 + movdqa xmm0,xmm2 + punpcklqdq xmm0,xmm1 + punpckhdq xmm4,xmm5 + punpckhqdq xmm2,xmm1 + movdqa [rsp+40h],xmm0 + movdqa xmm0,xmm3 + movdqa [rsp+90h],xmm2 + mov eax,[rsp+40h] + mov [rdi-2],eax + mov eax, [rsp+90h] + punpcklqdq xmm0,xmm4 + punpckhqdq xmm3,xmm4 + mov [rsi+rdi-2],eax + movdqa [rsp+50h],xmm0 + mov eax,[rsp+50h] + movdqa [rsp+0A0h],xmm3 + mov [rdi+rsi*2-2],eax + mov eax,[rsp+0A0h] + mov [r10+rdi-2],eax + mov eax,[rsp+48h] + mov [rbx],eax + mov eax,[rsp+98h] + mov [rsi+rbx],eax + mov eax,[rsp+58h] + mov [rbx+rsi*2],eax + mov eax, [rsp+0A8h] + mov [r10+rbx],eax + mov eax, [rsp+44h] + mov [r12-2],eax + mov eax,[rsp+94h] + mov [rsi+r12-2],eax + mov eax,[rsp+54h] + mov [r12+rsi*2-2],eax + mov eax, [rsp+0A4h] + mov [r10+r12-2],eax + mov eax,[rsp+4Ch] + mov [rbp],eax + mov eax,[rsp+9Ch] + mov [rsi+rbp],eax + mov eax, [rsp+5Ch] + mov [rbp+rsi*2],eax + mov eax,[rsp+0ACh] + mov [r10+rbp],eax + lea r11,[rsp+170h] + mov rsp,r11 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret @@ -3233,166 +3233,166 @@ WELS_EXTERN DeblockChromaLt4H_ssse3 
; int32_t iAlpha, int32_t iBeta) ;******************************************************************************** WELS_EXTERN DeblockChromaEq4V_ssse3 - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,68h - mov edx,[ebp+10h] ; iStride - mov eax,[ebp+8] ; pPixCb - mov ecx,[ebp+0Ch] ; pPixCr - movq xmm4,[ecx] - movq xmm5,[edx+ecx] - push esi - push edi - lea esi,[edx+edx] - mov edi,eax - sub edi,esi - movq xmm1,[edi] - mov edi,ecx - sub edi,esi - movq xmm2,[edi] - punpcklqdq xmm1,xmm2 - mov esi,eax - sub esi,edx - movq xmm2,[esi] - mov edi,ecx - sub edi,edx - movq xmm3,[edi] - punpcklqdq xmm2,xmm3 - movq xmm3,[eax] - punpcklqdq xmm3,xmm4 - movq xmm4,[edx+eax] - mov edx, [ebp + 14h] - punpcklqdq xmm4,xmm5 - movd xmm5,edx - mov edx, [ebp + 18h] - pxor xmm0,xmm0 - movdqa xmm6,xmm5 - punpcklwd xmm6,xmm5 - pshufd xmm5,xmm6,0 - movd xmm6,edx - movdqa xmm7,xmm6 - punpcklwd xmm7,xmm6 - pshufd xmm6,xmm7,0 - movdqa xmm7,xmm1 - punpckhbw xmm1,xmm0 - punpcklbw xmm7,xmm0 - movdqa [esp+40h],xmm1 - movdqa [esp+60h],xmm7 - movdqa xmm7,xmm2 - punpcklbw xmm7,xmm0 - movdqa [esp+10h],xmm7 - movdqa xmm7,xmm3 - punpcklbw xmm7,xmm0 - punpckhbw xmm3,xmm0 - movdqa [esp+50h],xmm7 - movdqa xmm7,xmm4 - punpckhbw xmm4,xmm0 - punpckhbw xmm2,xmm0 - punpcklbw xmm7,xmm0 - movdqa [esp+30h],xmm3 - movdqa xmm3,[esp+10h] - movdqa xmm1,xmm3 - psubw xmm1,[esp+50h] - pabsw xmm1,xmm1 - movdqa [esp+20h],xmm4 - movdqa xmm0,xmm5 - pcmpgtw xmm0,xmm1 - movdqa xmm1,[esp+60h] - psubw xmm1,xmm3 - pabsw xmm1,xmm1 - movdqa xmm4,xmm6 - pcmpgtw xmm4,xmm1 - pand xmm0,xmm4 - movdqa xmm1,xmm7 - psubw xmm1,[esp+50h] - pabsw xmm1,xmm1 - movdqa xmm4,xmm6 - pcmpgtw xmm4,xmm1 - movdqa xmm1,xmm2 - psubw xmm1,[esp+30h] - pabsw xmm1,xmm1 - pcmpgtw xmm5,xmm1 - movdqa xmm1,[esp+40h] - pand xmm0,xmm4 - psubw xmm1,xmm2 - pabsw xmm1,xmm1 - movdqa xmm4,xmm6 - pcmpgtw xmm4,xmm1 - movdqa xmm1,[esp+20h] - psubw xmm1,[esp+30h] - pand xmm5,xmm4 - pabsw xmm1,xmm1 - pcmpgtw xmm6,xmm1 - pand xmm5,xmm6 - mov edx,2 - movsx edx,dx - movd xmm1,edx - movdqa xmm4,xmm1 - punpcklwd xmm4,xmm1 - pshufd xmm1,xmm4,0 - movdqa xmm4,[esp+60h] - movdqa xmm6,xmm4 - paddw xmm6,xmm4 - paddw xmm6,xmm3 - paddw xmm6,xmm7 - movdqa [esp+10h],xmm1 - paddw xmm6,[esp+10h] - psraw xmm6,2 - movdqa xmm4,xmm0 - pandn xmm4,xmm3 - movdqa xmm3,[esp+40h] - movdqa xmm1,xmm0 - pand xmm1,xmm6 - por xmm1,xmm4 - movdqa xmm6,xmm3 - paddw xmm6,xmm3 - movdqa xmm3,[esp+10h] - paddw xmm6,xmm2 - paddw xmm6,[esp+20h] - paddw xmm6,xmm3 - psraw xmm6,2 - movdqa xmm4,xmm5 - pand xmm4,xmm6 - movdqa xmm6,xmm5 - pandn xmm6,xmm2 - por xmm4,xmm6 - packuswb xmm1,xmm4 - movdqa xmm4,[esp+50h] - movdqa xmm6,xmm7 - paddw xmm6,xmm7 - paddw xmm6,xmm4 - paddw xmm6,[esp+60h] - paddw xmm6,xmm3 - psraw xmm6,2 - movdqa xmm2,xmm0 - pand xmm2,xmm6 - pandn xmm0,xmm4 - por xmm2,xmm0 - movdqa xmm0,[esp+20h] - movdqa xmm6,xmm0 - paddw xmm6,xmm0 - movdqa xmm0,[esp+30h] - paddw xmm6,xmm0 - paddw xmm6,[esp+40h] - movdqa xmm4,xmm5 - paddw xmm6,xmm3 - movq [esi],xmm1 - psraw xmm6,2 - pand xmm4,xmm6 - pandn xmm5,xmm0 - por xmm4,xmm5 - packuswb xmm2,xmm4 - movq [eax],xmm2 - psrldq xmm1,8 - movq [edi],xmm1 - pop edi - psrldq xmm2,8 - movq [ecx],xmm2 - pop esi - mov esp,ebp - pop ebp - ret + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,68h + mov edx,[ebp+10h] ; iStride + mov eax,[ebp+8] ; pPixCb + mov ecx,[ebp+0Ch] ; pPixCr + movq xmm4,[ecx] + movq xmm5,[edx+ecx] + push esi + push edi + lea esi,[edx+edx] + mov edi,eax + sub edi,esi + movq xmm1,[edi] + mov edi,ecx + sub edi,esi + movq xmm2,[edi] + punpcklqdq xmm1,xmm2 + mov esi,eax + 
sub esi,edx + movq xmm2,[esi] + mov edi,ecx + sub edi,edx + movq xmm3,[edi] + punpcklqdq xmm2,xmm3 + movq xmm3,[eax] + punpcklqdq xmm3,xmm4 + movq xmm4,[edx+eax] + mov edx, [ebp + 14h] + punpcklqdq xmm4,xmm5 + movd xmm5,edx + mov edx, [ebp + 18h] + pxor xmm0,xmm0 + movdqa xmm6,xmm5 + punpcklwd xmm6,xmm5 + pshufd xmm5,xmm6,0 + movd xmm6,edx + movdqa xmm7,xmm6 + punpcklwd xmm7,xmm6 + pshufd xmm6,xmm7,0 + movdqa xmm7,xmm1 + punpckhbw xmm1,xmm0 + punpcklbw xmm7,xmm0 + movdqa [esp+40h],xmm1 + movdqa [esp+60h],xmm7 + movdqa xmm7,xmm2 + punpcklbw xmm7,xmm0 + movdqa [esp+10h],xmm7 + movdqa xmm7,xmm3 + punpcklbw xmm7,xmm0 + punpckhbw xmm3,xmm0 + movdqa [esp+50h],xmm7 + movdqa xmm7,xmm4 + punpckhbw xmm4,xmm0 + punpckhbw xmm2,xmm0 + punpcklbw xmm7,xmm0 + movdqa [esp+30h],xmm3 + movdqa xmm3,[esp+10h] + movdqa xmm1,xmm3 + psubw xmm1,[esp+50h] + pabsw xmm1,xmm1 + movdqa [esp+20h],xmm4 + movdqa xmm0,xmm5 + pcmpgtw xmm0,xmm1 + movdqa xmm1,[esp+60h] + psubw xmm1,xmm3 + pabsw xmm1,xmm1 + movdqa xmm4,xmm6 + pcmpgtw xmm4,xmm1 + pand xmm0,xmm4 + movdqa xmm1,xmm7 + psubw xmm1,[esp+50h] + pabsw xmm1,xmm1 + movdqa xmm4,xmm6 + pcmpgtw xmm4,xmm1 + movdqa xmm1,xmm2 + psubw xmm1,[esp+30h] + pabsw xmm1,xmm1 + pcmpgtw xmm5,xmm1 + movdqa xmm1,[esp+40h] + pand xmm0,xmm4 + psubw xmm1,xmm2 + pabsw xmm1,xmm1 + movdqa xmm4,xmm6 + pcmpgtw xmm4,xmm1 + movdqa xmm1,[esp+20h] + psubw xmm1,[esp+30h] + pand xmm5,xmm4 + pabsw xmm1,xmm1 + pcmpgtw xmm6,xmm1 + pand xmm5,xmm6 + mov edx,2 + movsx edx,dx + movd xmm1,edx + movdqa xmm4,xmm1 + punpcklwd xmm4,xmm1 + pshufd xmm1,xmm4,0 + movdqa xmm4,[esp+60h] + movdqa xmm6,xmm4 + paddw xmm6,xmm4 + paddw xmm6,xmm3 + paddw xmm6,xmm7 + movdqa [esp+10h],xmm1 + paddw xmm6,[esp+10h] + psraw xmm6,2 + movdqa xmm4,xmm0 + pandn xmm4,xmm3 + movdqa xmm3,[esp+40h] + movdqa xmm1,xmm0 + pand xmm1,xmm6 + por xmm1,xmm4 + movdqa xmm6,xmm3 + paddw xmm6,xmm3 + movdqa xmm3,[esp+10h] + paddw xmm6,xmm2 + paddw xmm6,[esp+20h] + paddw xmm6,xmm3 + psraw xmm6,2 + movdqa xmm4,xmm5 + pand xmm4,xmm6 + movdqa xmm6,xmm5 + pandn xmm6,xmm2 + por xmm4,xmm6 + packuswb xmm1,xmm4 + movdqa xmm4,[esp+50h] + movdqa xmm6,xmm7 + paddw xmm6,xmm7 + paddw xmm6,xmm4 + paddw xmm6,[esp+60h] + paddw xmm6,xmm3 + psraw xmm6,2 + movdqa xmm2,xmm0 + pand xmm2,xmm6 + pandn xmm0,xmm4 + por xmm2,xmm0 + movdqa xmm0,[esp+20h] + movdqa xmm6,xmm0 + paddw xmm6,xmm0 + movdqa xmm0,[esp+30h] + paddw xmm6,xmm0 + paddw xmm6,[esp+40h] + movdqa xmm4,xmm5 + paddw xmm6,xmm3 + movq [esi],xmm1 + psraw xmm6,2 + pand xmm4,xmm6 + pandn xmm5,xmm0 + por xmm4,xmm5 + packuswb xmm2,xmm4 + movq [eax],xmm2 + psrldq xmm1,8 + movq [edi],xmm1 + pop edi + psrldq xmm2,8 + movq [ecx],xmm2 + pop esi + mov esp,ebp + pop ebp + ret ;****************************************************************************** ; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, @@ -3400,200 +3400,200 @@ WELS_EXTERN DeblockChromaEq4V_ssse3 ;******************************************************************************* WELS_EXTERN DeblockChromaLt4V_ssse3 - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,0E4h - push ebx - push esi - mov esi, [ebp+1Ch] ; pTC - movsx ebx, byte [esi+2] - push edi - movsx di,byte [esi+3] - mov word [esp+0Ch],bx - movsx bx,byte [esi+1] - movsx esi,byte [esi] - mov word [esp+0Eh],si - movzx esi,di - movd xmm1,esi - movzx esi,di - movd xmm2,esi - mov si,word [esp+0Ch] - mov edx, [ebp + 10h] - mov eax, [ebp + 08h] - movzx edi,si - movzx esi,si - mov ecx, [ebp + 0Ch] - movd xmm4,esi - movzx esi,bx - movd xmm5,esi - movd xmm3,edi - movzx 
esi,bx - movd xmm6,esi - mov si,word [esp+0Eh] - movzx edi,si - movzx esi,si - punpcklwd xmm6,xmm2 - pxor xmm0,xmm0 - movdqa [esp+40h],xmm0 - movd xmm7,edi - movd xmm0,esi - lea esi,[edx+edx] - mov edi,eax - sub edi,esi - punpcklwd xmm5,xmm1 - movdqa xmm1,[esp+40h] - punpcklwd xmm0,xmm4 - movq xmm4,[edx+ecx] - punpcklwd xmm7,xmm3 - movq xmm3,[eax] - punpcklwd xmm0,xmm6 - movq xmm6,[edi] - punpcklwd xmm7,xmm5 - punpcklwd xmm0,xmm7 - mov edi,ecx - sub edi,esi - movdqa xmm2,xmm1 - psubw xmm2,xmm0 - movdqa [esp+60h],xmm2 - movq xmm2, [edi] - punpcklqdq xmm6,xmm2 - mov esi,eax - sub esi,edx - movq xmm7,[esi] - mov edi,ecx - sub edi,edx - movq xmm2,[edi] - punpcklqdq xmm7,xmm2 - movq xmm2,[ecx] - punpcklqdq xmm3,xmm2 - movq xmm2,[edx+eax] - movsx edx,word [ebp + 14h] - punpcklqdq xmm2,xmm4 - movdqa [esp+0E0h],xmm2 - movd xmm2,edx - movsx edx,word [ebp + 18h] - movdqa xmm4,xmm2 - punpcklwd xmm4,xmm2 - movd xmm2,edx - movdqa xmm5,xmm2 - punpcklwd xmm5,xmm2 - pshufd xmm2,xmm5,0 - movdqa [esp+50h],xmm2 - movdqa xmm2,xmm6 - punpcklbw xmm2,xmm1 - movdqa [esp+0D0h],xmm3 - pshufd xmm4,xmm4,0 - movdqa [esp+30h],xmm2 - punpckhbw xmm6,xmm1 - movdqa [esp+80h],xmm6 - movdqa xmm6,[esp+0D0h] - punpckhbw xmm6,xmm1 - movdqa [esp+70h],xmm6 - movdqa xmm6, [esp+0E0h] - punpckhbw xmm6,xmm1 - movdqa [esp+90h],xmm6 - movdqa xmm5, [esp+0E0h] - movdqa xmm2,xmm7 - punpckhbw xmm7,xmm1 - punpcklbw xmm5,xmm1 - movdqa [esp+0A0h],xmm7 - punpcklbw xmm3,xmm1 - mov edx,4 - punpcklbw xmm2,xmm1 - movsx edx,dx - movd xmm6,edx - movdqa xmm7,xmm6 - punpcklwd xmm7,xmm6 - pshufd xmm6,xmm7,0 - movdqa xmm7,[esp+30h] - movdqa [esp+20h],xmm6 - psubw xmm7,xmm5 - movdqa xmm6,xmm0 - pcmpgtw xmm6,xmm1 - movdqa xmm1,[esp+60h] - movdqa [esp+40h],xmm6 - movdqa xmm6,xmm3 - psubw xmm6,xmm2 - psllw xmm6,2 - paddw xmm6,xmm7 - paddw xmm6, [esp+20h] - movdqa xmm7, [esp+50h] - psraw xmm6,3 - pmaxsw xmm1,xmm6 - movdqa [esp+10h],xmm0 - movdqa xmm6, [esp+10h] - pminsw xmm6,xmm1 - movdqa [esp+10h],xmm6 - movdqa xmm1,xmm2 - psubw xmm1,xmm3 - pabsw xmm1,xmm1 - movdqa xmm6,xmm4 - pcmpgtw xmm6,xmm1 - movdqa xmm1, [esp+30h] - psubw xmm1,xmm2 - pabsw xmm1,xmm1 - pcmpgtw xmm7,xmm1 - movdqa xmm1,[esp+50h] - pand xmm6,xmm7 - movdqa xmm7,[esp+50h] - psubw xmm5,xmm3 - pabsw xmm5,xmm5 - pcmpgtw xmm1,xmm5 - movdqa xmm5,[esp+80h] - psubw xmm5,[esp+90h] - pand xmm6,xmm1 - pand xmm6,[esp+40h] - movdqa xmm1,[esp+10h] - pand xmm1,xmm6 - movdqa xmm6,[esp+70h] - movdqa [esp+30h],xmm1 - movdqa xmm1,[esp+0A0h] - psubw xmm6,xmm1 - psllw xmm6,2 - paddw xmm6,xmm5 - paddw xmm6,[esp+20h] - movdqa xmm5,[esp+60h] - psraw xmm6,3 - pmaxsw xmm5,xmm6 - pminsw xmm0,xmm5 - movdqa xmm5,[esp+70h] - movdqa xmm6,xmm1 - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm4,xmm6 - movdqa xmm6,[esp+80h] - psubw xmm6,xmm1 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+90h] - pand xmm4,xmm7 - movdqa xmm7,[esp+50h] - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - pand xmm4,xmm7 - pand xmm4,[esp+40h] - pand xmm0,xmm4 - movdqa xmm4,[esp+30h] - paddw xmm2,xmm4 - paddw xmm1,xmm0 - packuswb xmm2,xmm1 - movq [esi],xmm2 - psubw xmm3,xmm4 - psubw xmm5,xmm0 - packuswb xmm3,xmm5 - movq [eax],xmm3 - psrldq xmm2,8 - movq [edi],xmm2 - pop edi - pop esi - psrldq xmm3,8 - movq [ecx],xmm3 - pop ebx - mov esp,ebp - pop ebp - ret + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,0E4h + push ebx + push esi + mov esi, [ebp+1Ch] ; pTC + movsx ebx, byte [esi+2] + push edi + movsx di,byte [esi+3] + mov word [esp+0Ch],bx + movsx bx,byte [esi+1] + movsx esi,byte [esi] + mov word [esp+0Eh],si + movzx esi,di 
+ movd xmm1,esi + movzx esi,di + movd xmm2,esi + mov si,word [esp+0Ch] + mov edx, [ebp + 10h] + mov eax, [ebp + 08h] + movzx edi,si + movzx esi,si + mov ecx, [ebp + 0Ch] + movd xmm4,esi + movzx esi,bx + movd xmm5,esi + movd xmm3,edi + movzx esi,bx + movd xmm6,esi + mov si,word [esp+0Eh] + movzx edi,si + movzx esi,si + punpcklwd xmm6,xmm2 + pxor xmm0,xmm0 + movdqa [esp+40h],xmm0 + movd xmm7,edi + movd xmm0,esi + lea esi,[edx+edx] + mov edi,eax + sub edi,esi + punpcklwd xmm5,xmm1 + movdqa xmm1,[esp+40h] + punpcklwd xmm0,xmm4 + movq xmm4,[edx+ecx] + punpcklwd xmm7,xmm3 + movq xmm3,[eax] + punpcklwd xmm0,xmm6 + movq xmm6,[edi] + punpcklwd xmm7,xmm5 + punpcklwd xmm0,xmm7 + mov edi,ecx + sub edi,esi + movdqa xmm2,xmm1 + psubw xmm2,xmm0 + movdqa [esp+60h],xmm2 + movq xmm2, [edi] + punpcklqdq xmm6,xmm2 + mov esi,eax + sub esi,edx + movq xmm7,[esi] + mov edi,ecx + sub edi,edx + movq xmm2,[edi] + punpcklqdq xmm7,xmm2 + movq xmm2,[ecx] + punpcklqdq xmm3,xmm2 + movq xmm2,[edx+eax] + movsx edx,word [ebp + 14h] + punpcklqdq xmm2,xmm4 + movdqa [esp+0E0h],xmm2 + movd xmm2,edx + movsx edx,word [ebp + 18h] + movdqa xmm4,xmm2 + punpcklwd xmm4,xmm2 + movd xmm2,edx + movdqa xmm5,xmm2 + punpcklwd xmm5,xmm2 + pshufd xmm2,xmm5,0 + movdqa [esp+50h],xmm2 + movdqa xmm2,xmm6 + punpcklbw xmm2,xmm1 + movdqa [esp+0D0h],xmm3 + pshufd xmm4,xmm4,0 + movdqa [esp+30h],xmm2 + punpckhbw xmm6,xmm1 + movdqa [esp+80h],xmm6 + movdqa xmm6,[esp+0D0h] + punpckhbw xmm6,xmm1 + movdqa [esp+70h],xmm6 + movdqa xmm6, [esp+0E0h] + punpckhbw xmm6,xmm1 + movdqa [esp+90h],xmm6 + movdqa xmm5, [esp+0E0h] + movdqa xmm2,xmm7 + punpckhbw xmm7,xmm1 + punpcklbw xmm5,xmm1 + movdqa [esp+0A0h],xmm7 + punpcklbw xmm3,xmm1 + mov edx,4 + punpcklbw xmm2,xmm1 + movsx edx,dx + movd xmm6,edx + movdqa xmm7,xmm6 + punpcklwd xmm7,xmm6 + pshufd xmm6,xmm7,0 + movdqa xmm7,[esp+30h] + movdqa [esp+20h],xmm6 + psubw xmm7,xmm5 + movdqa xmm6,xmm0 + pcmpgtw xmm6,xmm1 + movdqa xmm1,[esp+60h] + movdqa [esp+40h],xmm6 + movdqa xmm6,xmm3 + psubw xmm6,xmm2 + psllw xmm6,2 + paddw xmm6,xmm7 + paddw xmm6, [esp+20h] + movdqa xmm7, [esp+50h] + psraw xmm6,3 + pmaxsw xmm1,xmm6 + movdqa [esp+10h],xmm0 + movdqa xmm6, [esp+10h] + pminsw xmm6,xmm1 + movdqa [esp+10h],xmm6 + movdqa xmm1,xmm2 + psubw xmm1,xmm3 + pabsw xmm1,xmm1 + movdqa xmm6,xmm4 + pcmpgtw xmm6,xmm1 + movdqa xmm1, [esp+30h] + psubw xmm1,xmm2 + pabsw xmm1,xmm1 + pcmpgtw xmm7,xmm1 + movdqa xmm1,[esp+50h] + pand xmm6,xmm7 + movdqa xmm7,[esp+50h] + psubw xmm5,xmm3 + pabsw xmm5,xmm5 + pcmpgtw xmm1,xmm5 + movdqa xmm5,[esp+80h] + psubw xmm5,[esp+90h] + pand xmm6,xmm1 + pand xmm6,[esp+40h] + movdqa xmm1,[esp+10h] + pand xmm1,xmm6 + movdqa xmm6,[esp+70h] + movdqa [esp+30h],xmm1 + movdqa xmm1,[esp+0A0h] + psubw xmm6,xmm1 + psllw xmm6,2 + paddw xmm6,xmm5 + paddw xmm6,[esp+20h] + movdqa xmm5,[esp+60h] + psraw xmm6,3 + pmaxsw xmm5,xmm6 + pminsw xmm0,xmm5 + movdqa xmm5,[esp+70h] + movdqa xmm6,xmm1 + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm4,xmm6 + movdqa xmm6,[esp+80h] + psubw xmm6,xmm1 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+90h] + pand xmm4,xmm7 + movdqa xmm7,[esp+50h] + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + pand xmm4,xmm7 + pand xmm4,[esp+40h] + pand xmm0,xmm4 + movdqa xmm4,[esp+30h] + paddw xmm2,xmm4 + paddw xmm1,xmm0 + packuswb xmm2,xmm1 + movq [esi],xmm2 + psubw xmm3,xmm4 + psubw xmm5,xmm0 + packuswb xmm3,xmm5 + movq [eax],xmm3 + psrldq xmm2,8 + movq [edi],xmm2 + pop edi + pop esi + psrldq xmm3,8 + movq [ecx],xmm3 + pop ebx + mov esp,ebp + pop ebp + ret 
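For orientation between the vertical and horizontal chroma routines: DeblockChromaEq4V/H_ssse3 and DeblockChromaLt4V/H_ssse3 vectorize the standard H.264 chroma deblocking math over eight pixel pairs at a time (the paddw/psraw 2 sequence is the bS==4 rounding average, the psllw 2 / paddw 4 / psraw 3 plus pmaxsw/pminsw sequence is the bS<4 clipped delta). Below is a minimal scalar sketch of that per-pixel-pair math, assuming the spec formulas; it is a reference sketch only, and the helper names (ChromaLt4Pair, ChromaEq4Pair, Clip255) are illustrative and do not appear in this file.

#include <stdint.h>
#include <stdlib.h>

static inline uint8_t Clip255 (int v) { return (uint8_t) (v < 0 ? 0 : v > 255 ? 255 : v); }

/* bS < 4 chroma core (the per-pair math DeblockChromaLt4V/H_ssse3 vectorize):
 * delta = clip (-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3), tc = tc0 + 1 for chroma. */
static void ChromaLt4Pair (uint8_t* p1, uint8_t* p0, uint8_t* q0, uint8_t* q1,
                           int iAlpha, int iBeta, int8_t iTc0) {
  if (iTc0 < 0) return;                       /* negative tc0 disables filtering (spec) */
  if (abs (*p0 - *q0) >= iAlpha || abs (*p1 - *p0) >= iBeta || abs (*q1 - *q0) >= iBeta)
    return;                                   /* edge activity test: |p0-q0|<alpha, |p1-p0|<beta, |q1-q0|<beta */
  int iTc    = iTc0 + 1;
  int iDelta = (((*q0 - *p0) << 2) + (*p1 - *q1) + 4) >> 3;
  if (iDelta < -iTc) iDelta = -iTc;
  else if (iDelta > iTc) iDelta = iTc;
  *p0 = Clip255 (*p0 + iDelta);
  *q0 = Clip255 (*q0 - iDelta);
}

/* bS == 4 chroma core (the per-pair math DeblockChromaEq4V/H_ssse3 vectorize):
 * p0' = (2*p1 + p0 + q1 + 2) >> 2, q0' = (2*q1 + q0 + p1 + 2) >> 2. */
static void ChromaEq4Pair (uint8_t* p1, uint8_t* p0, uint8_t* q0, uint8_t* q1,
                           int iAlpha, int iBeta) {
  if (abs (*p0 - *q0) >= iAlpha || abs (*p1 - *p0) >= iBeta || abs (*q1 - *q0) >= iBeta)
    return;
  uint8_t uiP0 = (uint8_t) ((2 * *p1 + *p0 + *q1 + 2) >> 2);
  uint8_t uiQ0 = (uint8_t) ((2 * *q1 + *q0 + *p1 + 2) >> 2);
  *p0 = uiP0;
  *q0 = uiQ0;
}

A caller would apply the pair function along the edge, once per row (vertical edge) or once per column (horizontal edge), for both Cb and Cr; the SSSE3 versions do the same work by interleaving Cb and Cr into one 16-lane register.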
;*************************************************************************** ; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, @@ -3601,280 +3601,280 @@ WELS_EXTERN DeblockChromaLt4V_ssse3 ;*************************************************************************** WELS_EXTERN DeblockChromaEq4H_ssse3 - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,0C8h - mov ecx,dword [ebp+8] - mov edx,dword [ebp+0Ch] - mov eax,dword [ebp+10h] - sub ecx,2 - sub edx,2 - push esi - lea esi,[eax+eax*2] - mov dword [esp+18h],ecx - mov dword [esp+4],edx - lea ecx,[ecx+eax*4] - lea edx,[edx+eax*4] - lea eax,[esp+7Ch] - push edi - mov dword [esp+14h],esi - mov dword [esp+18h],ecx - mov dword [esp+0Ch],edx - mov dword [esp+10h],eax - mov esi,dword [esp+1Ch] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+14h] - movd xmm0,dword [esi] - movd xmm1,dword [esi+ecx] - movd xmm2,dword [esi+ecx*2] - movd xmm3,dword [esi+edx] - mov esi,dword [esp+8] - movd xmm4,dword [esi] - movd xmm5,dword [esi+ecx] - movd xmm6,dword [esi+ecx*2] - movd xmm7,dword [esi+edx] - punpckldq xmm0,xmm4 - punpckldq xmm1,xmm5 - punpckldq xmm2,xmm6 - punpckldq xmm3,xmm7 - mov esi,dword [esp+18h] - mov edi,dword [esp+0Ch] - movd xmm4,dword [esi] - movd xmm5,dword [edi] - punpckldq xmm4,xmm5 - punpcklqdq xmm0,xmm4 - movd xmm4,dword [esi+ecx] - movd xmm5,dword [edi+ecx] - punpckldq xmm4,xmm5 - punpcklqdq xmm1,xmm4 - movd xmm4,dword [esi+ecx*2] - movd xmm5,dword [edi+ecx*2] - punpckldq xmm4,xmm5 - punpcklqdq xmm2,xmm4 - movd xmm4,dword [esi+edx] - movd xmm5,dword [edi+edx] - punpckldq xmm4,xmm5 - punpcklqdq xmm3,xmm4 - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov edi,dword [esp+10h] - movdqa [edi],xmm0 - movdqa [edi+10h],xmm5 - movdqa [edi+20h],xmm1 - movdqa [edi+30h],xmm6 - movsx ecx,word [ebp+14h] - movsx edx,word [ebp+18h] - movdqa xmm6,[esp+80h] - movdqa xmm4,[esp+90h] - movdqa xmm5,[esp+0A0h] - movdqa xmm7,[esp+0B0h] - pxor xmm0,xmm0 - movd xmm1,ecx - movdqa xmm2,xmm1 - punpcklwd xmm2,xmm1 - pshufd xmm1,xmm2,0 - movd xmm2,edx - movdqa xmm3,xmm2 - punpcklwd xmm3,xmm2 - pshufd xmm2,xmm3,0 - movdqa xmm3,xmm6 - punpckhbw xmm6,xmm0 - movdqa [esp+60h],xmm6 - movdqa xmm6,[esp+90h] - punpckhbw xmm6,xmm0 - movdqa [esp+30h],xmm6 - movdqa xmm6,[esp+0A0h] - punpckhbw xmm6,xmm0 - movdqa [esp+40h],xmm6 - movdqa xmm6,[esp+0B0h] - punpckhbw xmm6,xmm0 - movdqa [esp+70h],xmm6 - punpcklbw xmm7,xmm0 - punpcklbw xmm4,xmm0 - punpcklbw xmm5,xmm0 - punpcklbw xmm3,xmm0 - movdqa [esp+50h],xmm7 - movdqa xmm6,xmm4 - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - movdqa xmm0,xmm1 - pcmpgtw xmm0,xmm6 - movdqa xmm6,xmm3 - psubw xmm6,xmm4 - pabsw xmm6,xmm6 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+50h] - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pand xmm0,xmm7 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+30h] - psubw xmm6,[esp+40h] - pabsw xmm6,xmm6 - pcmpgtw xmm1,xmm6 - movdqa xmm6,[esp+60h] - psubw xmm6,[esp+30h] - pabsw xmm6,xmm6 - pand xmm0,xmm7 - movdqa xmm7,xmm2 - pcmpgtw xmm7,xmm6 - movdqa xmm6,[esp+70h] - psubw xmm6,[esp+40h] - pabsw xmm6,xmm6 - pand xmm1,xmm7 - 
pcmpgtw xmm2,xmm6 - pand xmm1,xmm2 - mov eax,2 - movsx ecx,ax - movd xmm2,ecx - movdqa xmm6,xmm2 - punpcklwd xmm6,xmm2 - pshufd xmm2,xmm6,0 - movdqa [esp+20h],xmm2 - movdqa xmm2,xmm3 - paddw xmm2,xmm3 - paddw xmm2,xmm4 - paddw xmm2,[esp+50h] - paddw xmm2,[esp+20h] - psraw xmm2,2 - movdqa xmm6,xmm0 - pand xmm6,xmm2 - movdqa xmm2,xmm0 - pandn xmm2,xmm4 - por xmm6,xmm2 - movdqa xmm2,[esp+60h] - movdqa xmm7,xmm2 - paddw xmm7,xmm2 - paddw xmm7,[esp+30h] - paddw xmm7,[esp+70h] - paddw xmm7,[esp+20h] - movdqa xmm4,xmm1 - movdqa xmm2,xmm1 - pandn xmm2,[esp+30h] - psraw xmm7,2 - pand xmm4,xmm7 - por xmm4,xmm2 - movdqa xmm2,[esp+50h] - packuswb xmm6,xmm4 - movdqa [esp+90h],xmm6 - movdqa xmm6,xmm2 - paddw xmm6,xmm2 - movdqa xmm2,[esp+20h] - paddw xmm6,xmm5 - paddw xmm6,xmm3 - movdqa xmm4,xmm0 - pandn xmm0,xmm5 - paddw xmm6,xmm2 - psraw xmm6,2 - pand xmm4,xmm6 - por xmm4,xmm0 - movdqa xmm0,[esp+70h] - movdqa xmm5,xmm0 - paddw xmm5,xmm0 - movdqa xmm0,[esp+40h] - paddw xmm5,xmm0 - paddw xmm5,[esp+60h] - movdqa xmm3,xmm1 - paddw xmm5,xmm2 - psraw xmm5,2 - pand xmm3,xmm5 - pandn xmm1,xmm0 - por xmm3,xmm1 - packuswb xmm4,xmm3 - movdqa [esp+0A0h],xmm4 - mov esi,dword [esp+10h] - movdqa xmm0,[esi] - movdqa xmm1,[esi+10h] - movdqa xmm2,[esi+20h] - movdqa xmm3,[esi+30h] - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov esi,dword [esp+1Ch] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+14h] - mov edi,dword [esp+8] - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov esi,dword [esp+18h] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov edi,dword [esp+0Ch] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - pop edi - pop esi - mov esp,ebp - pop ebp - ret + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,0C8h + mov ecx,dword [ebp+8] + mov edx,dword [ebp+0Ch] + mov eax,dword [ebp+10h] + sub ecx,2 + sub edx,2 + push esi + lea esi,[eax+eax*2] + mov dword [esp+18h],ecx + mov dword [esp+4],edx + lea ecx,[ecx+eax*4] + lea edx,[edx+eax*4] + lea eax,[esp+7Ch] + push edi + mov dword [esp+14h],esi + mov dword [esp+18h],ecx + mov dword [esp+0Ch],edx + mov dword [esp+10h],eax + mov esi,dword [esp+1Ch] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+14h] + movd xmm0,dword [esi] + movd xmm1,dword [esi+ecx] + movd xmm2,dword [esi+ecx*2] + movd xmm3,dword [esi+edx] + mov esi,dword [esp+8] + movd xmm4,dword [esi] + movd xmm5,dword [esi+ecx] + movd xmm6,dword [esi+ecx*2] + movd xmm7,dword [esi+edx] + punpckldq xmm0,xmm4 + punpckldq xmm1,xmm5 + punpckldq xmm2,xmm6 + punpckldq xmm3,xmm7 + mov esi,dword [esp+18h] + mov edi,dword [esp+0Ch] + movd xmm4,dword [esi] + movd xmm5,dword [edi] + 
punpckldq xmm4,xmm5 + punpcklqdq xmm0,xmm4 + movd xmm4,dword [esi+ecx] + movd xmm5,dword [edi+ecx] + punpckldq xmm4,xmm5 + punpcklqdq xmm1,xmm4 + movd xmm4,dword [esi+ecx*2] + movd xmm5,dword [edi+ecx*2] + punpckldq xmm4,xmm5 + punpcklqdq xmm2,xmm4 + movd xmm4,dword [esi+edx] + movd xmm5,dword [edi+edx] + punpckldq xmm4,xmm5 + punpcklqdq xmm3,xmm4 + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov edi,dword [esp+10h] + movdqa [edi],xmm0 + movdqa [edi+10h],xmm5 + movdqa [edi+20h],xmm1 + movdqa [edi+30h],xmm6 + movsx ecx,word [ebp+14h] + movsx edx,word [ebp+18h] + movdqa xmm6,[esp+80h] + movdqa xmm4,[esp+90h] + movdqa xmm5,[esp+0A0h] + movdqa xmm7,[esp+0B0h] + pxor xmm0,xmm0 + movd xmm1,ecx + movdqa xmm2,xmm1 + punpcklwd xmm2,xmm1 + pshufd xmm1,xmm2,0 + movd xmm2,edx + movdqa xmm3,xmm2 + punpcklwd xmm3,xmm2 + pshufd xmm2,xmm3,0 + movdqa xmm3,xmm6 + punpckhbw xmm6,xmm0 + movdqa [esp+60h],xmm6 + movdqa xmm6,[esp+90h] + punpckhbw xmm6,xmm0 + movdqa [esp+30h],xmm6 + movdqa xmm6,[esp+0A0h] + punpckhbw xmm6,xmm0 + movdqa [esp+40h],xmm6 + movdqa xmm6,[esp+0B0h] + punpckhbw xmm6,xmm0 + movdqa [esp+70h],xmm6 + punpcklbw xmm7,xmm0 + punpcklbw xmm4,xmm0 + punpcklbw xmm5,xmm0 + punpcklbw xmm3,xmm0 + movdqa [esp+50h],xmm7 + movdqa xmm6,xmm4 + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + movdqa xmm0,xmm1 + pcmpgtw xmm0,xmm6 + movdqa xmm6,xmm3 + psubw xmm6,xmm4 + pabsw xmm6,xmm6 + movdqa xmm7,xmm2 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+50h] + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pand xmm0,xmm7 + movdqa xmm7,xmm2 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+30h] + psubw xmm6,[esp+40h] + pabsw xmm6,xmm6 + pcmpgtw xmm1,xmm6 + movdqa xmm6,[esp+60h] + psubw xmm6,[esp+30h] + pabsw xmm6,xmm6 + pand xmm0,xmm7 + movdqa xmm7,xmm2 + pcmpgtw xmm7,xmm6 + movdqa xmm6,[esp+70h] + psubw xmm6,[esp+40h] + pabsw xmm6,xmm6 + pand xmm1,xmm7 + pcmpgtw xmm2,xmm6 + pand xmm1,xmm2 + mov eax,2 + movsx ecx,ax + movd xmm2,ecx + movdqa xmm6,xmm2 + punpcklwd xmm6,xmm2 + pshufd xmm2,xmm6,0 + movdqa [esp+20h],xmm2 + movdqa xmm2,xmm3 + paddw xmm2,xmm3 + paddw xmm2,xmm4 + paddw xmm2,[esp+50h] + paddw xmm2,[esp+20h] + psraw xmm2,2 + movdqa xmm6,xmm0 + pand xmm6,xmm2 + movdqa xmm2,xmm0 + pandn xmm2,xmm4 + por xmm6,xmm2 + movdqa xmm2,[esp+60h] + movdqa xmm7,xmm2 + paddw xmm7,xmm2 + paddw xmm7,[esp+30h] + paddw xmm7,[esp+70h] + paddw xmm7,[esp+20h] + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + pandn xmm2,[esp+30h] + psraw xmm7,2 + pand xmm4,xmm7 + por xmm4,xmm2 + movdqa xmm2,[esp+50h] + packuswb xmm6,xmm4 + movdqa [esp+90h],xmm6 + movdqa xmm6,xmm2 + paddw xmm6,xmm2 + movdqa xmm2,[esp+20h] + paddw xmm6,xmm5 + paddw xmm6,xmm3 + movdqa xmm4,xmm0 + pandn xmm0,xmm5 + paddw xmm6,xmm2 + psraw xmm6,2 + pand xmm4,xmm6 + por xmm4,xmm0 + movdqa xmm0,[esp+70h] + movdqa xmm5,xmm0 + paddw xmm5,xmm0 + movdqa xmm0,[esp+40h] + paddw xmm5,xmm0 + paddw xmm5,[esp+60h] + movdqa xmm3,xmm1 + paddw xmm5,xmm2 + psraw xmm5,2 + pand xmm3,xmm5 + pandn xmm1,xmm0 + por xmm3,xmm1 + packuswb xmm4,xmm3 + movdqa [esp+0A0h],xmm4 + mov esi,dword [esp+10h] + movdqa xmm0,[esi] + movdqa xmm1,[esi+10h] + movdqa xmm2,[esi+20h] + movdqa xmm3,[esi+30h] 
+ movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov esi,dword [esp+1Ch] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+14h] + mov edi,dword [esp+8] + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov esi,dword [esp+18h] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov edi,dword [esp+0Ch] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + pop edi + pop esi + mov esp,ebp + pop ebp + ret ;******************************************************************************* ; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, @@ -3882,308 +3882,308 @@ WELS_EXTERN DeblockChromaEq4H_ssse3 ;******************************************************************************* WELS_EXTERN DeblockChromaLt4H_ssse3 - push ebp - mov ebp,esp - and esp,0FFFFFFF0h - sub esp,108h - mov ecx,dword [ebp+8] - mov edx,dword [ebp+0Ch] - mov eax,dword [ebp+10h] - sub ecx,2 - sub edx,2 - push esi - lea esi,[eax+eax*2] - mov dword [esp+10h],ecx - mov dword [esp+4],edx - lea ecx,[ecx+eax*4] - lea edx,[edx+eax*4] - lea eax,[esp+6Ch] - push edi - mov dword [esp+0Ch],esi - mov dword [esp+18h],ecx - mov dword [esp+10h],edx - mov dword [esp+1Ch],eax - mov esi,dword [esp+14h] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+0Ch] - movd xmm0,dword [esi] - movd xmm1,dword [esi+ecx] - movd xmm2,dword [esi+ecx*2] - movd xmm3,dword [esi+edx] - mov esi,dword [esp+8] - movd xmm4,dword [esi] - movd xmm5,dword [esi+ecx] - movd xmm6,dword [esi+ecx*2] - movd xmm7,dword [esi+edx] - punpckldq xmm0,xmm4 - punpckldq xmm1,xmm5 - punpckldq xmm2,xmm6 - punpckldq xmm3,xmm7 - mov esi,dword [esp+18h] - mov edi,dword [esp+10h] - movd xmm4,dword [esi] - movd xmm5,dword [edi] - punpckldq xmm4,xmm5 - punpcklqdq xmm0,xmm4 - movd xmm4,dword [esi+ecx] - movd xmm5,dword [edi+ecx] - punpckldq xmm4,xmm5 - punpcklqdq xmm1,xmm4 - movd xmm4,dword [esi+ecx*2] - movd xmm5,dword [edi+ecx*2] - punpckldq xmm4,xmm5 - punpcklqdq xmm2,xmm4 - movd xmm4,dword [esi+edx] - movd xmm5,dword [edi+edx] - punpckldq xmm4,xmm5 - punpcklqdq xmm3,xmm4 - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov edi,dword [esp+1Ch] - movdqa [edi],xmm0 - movdqa [edi+10h],xmm5 - 
movdqa [edi+20h],xmm1 - movdqa [edi+30h],xmm6 - mov eax,dword [ebp+1Ch] - movsx cx,byte [eax+3] - movsx dx,byte [eax+2] - movsx si,byte [eax+1] - movsx ax,byte [eax] - movzx edi,cx - movzx ecx,cx - movd xmm2,ecx - movzx ecx,dx - movzx edx,dx - movd xmm3,ecx - movd xmm4,edx - movzx ecx,si - movzx edx,si - movd xmm5,ecx - pxor xmm0,xmm0 - movd xmm6,edx - movzx ecx,ax - movdqa [esp+60h],xmm0 - movzx edx,ax - movsx eax,word [ebp+14h] - punpcklwd xmm6,xmm2 - movd xmm1,edi - movd xmm7,ecx - movsx ecx,word [ebp+18h] - movd xmm0,edx - punpcklwd xmm7,xmm3 - punpcklwd xmm5,xmm1 - movdqa xmm1,[esp+60h] - punpcklwd xmm7,xmm5 - movdqa xmm5,[esp+0A0h] - punpcklwd xmm0,xmm4 - punpcklwd xmm0,xmm6 - movdqa xmm6, [esp+70h] - punpcklwd xmm0,xmm7 - movdqa xmm7,[esp+80h] - movdqa xmm2,xmm1 - psubw xmm2,xmm0 - movdqa [esp+0D0h],xmm2 - movd xmm2,eax - movdqa xmm3,xmm2 - punpcklwd xmm3,xmm2 - pshufd xmm4,xmm3,0 - movd xmm2,ecx - movdqa xmm3,xmm2 - punpcklwd xmm3,xmm2 - pshufd xmm2,xmm3,0 - movdqa xmm3, [esp+90h] - movdqa [esp+50h],xmm2 - movdqa xmm2,xmm6 - punpcklbw xmm2,xmm1 - punpckhbw xmm6,xmm1 - movdqa [esp+40h],xmm2 - movdqa [esp+0B0h],xmm6 - movdqa xmm6,[esp+90h] - movdqa xmm2,xmm7 - punpckhbw xmm7,xmm1 - punpckhbw xmm6,xmm1 - punpcklbw xmm2,xmm1 - punpcklbw xmm3,xmm1 - punpcklbw xmm5,xmm1 - movdqa [esp+0F0h],xmm7 - movdqa [esp+0C0h],xmm6 - movdqa xmm6, [esp+0A0h] - punpckhbw xmm6,xmm1 - movdqa [esp+0E0h],xmm6 - mov edx,4 - movsx eax,dx - movd xmm6,eax - movdqa xmm7,xmm6 - punpcklwd xmm7,xmm6 - pshufd xmm6,xmm7,0 - movdqa [esp+30h],xmm6 - movdqa xmm7, [esp+40h] - psubw xmm7,xmm5 - movdqa xmm6,xmm0 - pcmpgtw xmm6,xmm1 - movdqa [esp+60h],xmm6 - movdqa xmm1, [esp+0D0h] - movdqa xmm6,xmm3 - psubw xmm6,xmm2 - psllw xmm6,2 - paddw xmm6,xmm7 - paddw xmm6,[esp+30h] - psraw xmm6,3 - pmaxsw xmm1,xmm6 - movdqa xmm7,[esp+50h] - movdqa [esp+20h],xmm0 - movdqa xmm6, [esp+20h] - pminsw xmm6,xmm1 - movdqa [esp+20h],xmm6 - movdqa xmm6,xmm4 - movdqa xmm1,xmm2 - psubw xmm1,xmm3 - pabsw xmm1,xmm1 - pcmpgtw xmm6,xmm1 - movdqa xmm1, [esp+40h] - psubw xmm1,xmm2 - pabsw xmm1,xmm1 - pcmpgtw xmm7,xmm1 - movdqa xmm1, [esp+50h] - pand xmm6,xmm7 - movdqa xmm7, [esp+50h] - psubw xmm5,xmm3 - pabsw xmm5,xmm5 - pcmpgtw xmm1,xmm5 - movdqa xmm5, [esp+0B0h] - psubw xmm5,[esp+0E0h] - pand xmm6,xmm1 - pand xmm6, [esp+60h] - movdqa xmm1, [esp+20h] - pand xmm1,xmm6 - movdqa xmm6, [esp+0C0h] - movdqa [esp+40h],xmm1 - movdqa xmm1, [esp+0F0h] - psubw xmm6,xmm1 - psllw xmm6,2 - paddw xmm6,xmm5 - paddw xmm6, [esp+30h] - movdqa xmm5, [esp+0D0h] - psraw xmm6,3 - pmaxsw xmm5,xmm6 - pminsw xmm0,xmm5 - movdqa xmm5,[esp+0C0h] - movdqa xmm6,xmm1 - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm4,xmm6 - movdqa xmm6,[esp+0B0h] - psubw xmm6,xmm1 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - movdqa xmm6, [esp+0E0h] - pand xmm4,xmm7 - movdqa xmm7, [esp+50h] - psubw xmm6,xmm5 - pabsw xmm6,xmm6 - pcmpgtw xmm7,xmm6 - pand xmm4,xmm7 - pand xmm4,[esp+60h] - pand xmm0,xmm4 - movdqa xmm4, [esp+40h] - paddw xmm2,xmm4 - paddw xmm1,xmm0 - psubw xmm3,xmm4 - psubw xmm5,xmm0 - packuswb xmm2,xmm1 - packuswb xmm3,xmm5 - movdqa [esp+80h],xmm2 - movdqa [esp+90h],xmm3 - mov esi,dword [esp+1Ch] - movdqa xmm0, [esi] - movdqa xmm1, [esi+10h] - movdqa xmm2, [esi+20h] - movdqa xmm3, [esi+30h] - movdqa xmm6,xmm0 - punpcklbw xmm0,xmm1 - punpckhbw xmm6,xmm1 - movdqa xmm7,xmm2 - punpcklbw xmm2,xmm3 - punpckhbw xmm7,xmm3 - movdqa xmm4,xmm0 - movdqa xmm5,xmm6 - punpcklwd xmm0,xmm2 - punpckhwd xmm4,xmm2 - punpcklwd xmm6,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm1,xmm0 - movdqa xmm2,xmm4 - punpckldq 
xmm0,xmm6 - punpckhdq xmm1,xmm6 - punpckldq xmm4,xmm5 - punpckhdq xmm2,xmm5 - movdqa xmm5,xmm0 - movdqa xmm6,xmm1 - punpcklqdq xmm0,xmm4 - punpckhqdq xmm5,xmm4 - punpcklqdq xmm1,xmm2 - punpckhqdq xmm6,xmm2 - mov esi,dword [esp+14h] - mov ecx,dword [ebp+10h] - mov edx,dword [esp+0Ch] - mov edi,dword [esp+8] - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov esi,dword [esp+18h] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - movd dword [esi],xmm0 - movd dword [esi+ecx],xmm5 - movd dword [esi+ecx*2],xmm1 - movd dword [esi+edx],xmm6 - psrldq xmm0,4 - psrldq xmm5,4 - psrldq xmm1,4 - psrldq xmm6,4 - mov edi,dword [esp+10h] - movd dword [edi],xmm0 - movd dword [edi+ecx],xmm5 - movd dword [edi+ecx*2],xmm1 - movd dword [edi+edx],xmm6 - pop edi - pop esi - mov esp,ebp - pop ebp - ret + push ebp + mov ebp,esp + and esp,0FFFFFFF0h + sub esp,108h + mov ecx,dword [ebp+8] + mov edx,dword [ebp+0Ch] + mov eax,dword [ebp+10h] + sub ecx,2 + sub edx,2 + push esi + lea esi,[eax+eax*2] + mov dword [esp+10h],ecx + mov dword [esp+4],edx + lea ecx,[ecx+eax*4] + lea edx,[edx+eax*4] + lea eax,[esp+6Ch] + push edi + mov dword [esp+0Ch],esi + mov dword [esp+18h],ecx + mov dword [esp+10h],edx + mov dword [esp+1Ch],eax + mov esi,dword [esp+14h] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+0Ch] + movd xmm0,dword [esi] + movd xmm1,dword [esi+ecx] + movd xmm2,dword [esi+ecx*2] + movd xmm3,dword [esi+edx] + mov esi,dword [esp+8] + movd xmm4,dword [esi] + movd xmm5,dword [esi+ecx] + movd xmm6,dword [esi+ecx*2] + movd xmm7,dword [esi+edx] + punpckldq xmm0,xmm4 + punpckldq xmm1,xmm5 + punpckldq xmm2,xmm6 + punpckldq xmm3,xmm7 + mov esi,dword [esp+18h] + mov edi,dword [esp+10h] + movd xmm4,dword [esi] + movd xmm5,dword [edi] + punpckldq xmm4,xmm5 + punpcklqdq xmm0,xmm4 + movd xmm4,dword [esi+ecx] + movd xmm5,dword [edi+ecx] + punpckldq xmm4,xmm5 + punpcklqdq xmm1,xmm4 + movd xmm4,dword [esi+ecx*2] + movd xmm5,dword [edi+ecx*2] + punpckldq xmm4,xmm5 + punpcklqdq xmm2,xmm4 + movd xmm4,dword [esi+edx] + movd xmm5,dword [edi+edx] + punpckldq xmm4,xmm5 + punpcklqdq xmm3,xmm4 + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov edi,dword [esp+1Ch] + movdqa [edi],xmm0 + movdqa [edi+10h],xmm5 + movdqa [edi+20h],xmm1 + movdqa [edi+30h],xmm6 + mov eax,dword [ebp+1Ch] + movsx cx,byte [eax+3] + movsx dx,byte [eax+2] + movsx si,byte [eax+1] + movsx ax,byte [eax] + movzx edi,cx + movzx ecx,cx + movd xmm2,ecx + movzx ecx,dx + movzx edx,dx + movd xmm3,ecx + movd xmm4,edx + movzx ecx,si + movzx edx,si + movd xmm5,ecx + pxor xmm0,xmm0 + movd xmm6,edx + movzx ecx,ax + movdqa [esp+60h],xmm0 + movzx edx,ax + movsx eax,word [ebp+14h] + punpcklwd xmm6,xmm2 + movd xmm1,edi + movd xmm7,ecx + movsx ecx,word [ebp+18h] + movd xmm0,edx + punpcklwd xmm7,xmm3 + punpcklwd xmm5,xmm1 + movdqa xmm1,[esp+60h] + punpcklwd xmm7,xmm5 + movdqa xmm5,[esp+0A0h] + 
punpcklwd xmm0,xmm4 + punpcklwd xmm0,xmm6 + movdqa xmm6, [esp+70h] + punpcklwd xmm0,xmm7 + movdqa xmm7,[esp+80h] + movdqa xmm2,xmm1 + psubw xmm2,xmm0 + movdqa [esp+0D0h],xmm2 + movd xmm2,eax + movdqa xmm3,xmm2 + punpcklwd xmm3,xmm2 + pshufd xmm4,xmm3,0 + movd xmm2,ecx + movdqa xmm3,xmm2 + punpcklwd xmm3,xmm2 + pshufd xmm2,xmm3,0 + movdqa xmm3, [esp+90h] + movdqa [esp+50h],xmm2 + movdqa xmm2,xmm6 + punpcklbw xmm2,xmm1 + punpckhbw xmm6,xmm1 + movdqa [esp+40h],xmm2 + movdqa [esp+0B0h],xmm6 + movdqa xmm6,[esp+90h] + movdqa xmm2,xmm7 + punpckhbw xmm7,xmm1 + punpckhbw xmm6,xmm1 + punpcklbw xmm2,xmm1 + punpcklbw xmm3,xmm1 + punpcklbw xmm5,xmm1 + movdqa [esp+0F0h],xmm7 + movdqa [esp+0C0h],xmm6 + movdqa xmm6, [esp+0A0h] + punpckhbw xmm6,xmm1 + movdqa [esp+0E0h],xmm6 + mov edx,4 + movsx eax,dx + movd xmm6,eax + movdqa xmm7,xmm6 + punpcklwd xmm7,xmm6 + pshufd xmm6,xmm7,0 + movdqa [esp+30h],xmm6 + movdqa xmm7, [esp+40h] + psubw xmm7,xmm5 + movdqa xmm6,xmm0 + pcmpgtw xmm6,xmm1 + movdqa [esp+60h],xmm6 + movdqa xmm1, [esp+0D0h] + movdqa xmm6,xmm3 + psubw xmm6,xmm2 + psllw xmm6,2 + paddw xmm6,xmm7 + paddw xmm6,[esp+30h] + psraw xmm6,3 + pmaxsw xmm1,xmm6 + movdqa xmm7,[esp+50h] + movdqa [esp+20h],xmm0 + movdqa xmm6, [esp+20h] + pminsw xmm6,xmm1 + movdqa [esp+20h],xmm6 + movdqa xmm6,xmm4 + movdqa xmm1,xmm2 + psubw xmm1,xmm3 + pabsw xmm1,xmm1 + pcmpgtw xmm6,xmm1 + movdqa xmm1, [esp+40h] + psubw xmm1,xmm2 + pabsw xmm1,xmm1 + pcmpgtw xmm7,xmm1 + movdqa xmm1, [esp+50h] + pand xmm6,xmm7 + movdqa xmm7, [esp+50h] + psubw xmm5,xmm3 + pabsw xmm5,xmm5 + pcmpgtw xmm1,xmm5 + movdqa xmm5, [esp+0B0h] + psubw xmm5,[esp+0E0h] + pand xmm6,xmm1 + pand xmm6, [esp+60h] + movdqa xmm1, [esp+20h] + pand xmm1,xmm6 + movdqa xmm6, [esp+0C0h] + movdqa [esp+40h],xmm1 + movdqa xmm1, [esp+0F0h] + psubw xmm6,xmm1 + psllw xmm6,2 + paddw xmm6,xmm5 + paddw xmm6, [esp+30h] + movdqa xmm5, [esp+0D0h] + psraw xmm6,3 + pmaxsw xmm5,xmm6 + pminsw xmm0,xmm5 + movdqa xmm5,[esp+0C0h] + movdqa xmm6,xmm1 + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm4,xmm6 + movdqa xmm6,[esp+0B0h] + psubw xmm6,xmm1 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + movdqa xmm6, [esp+0E0h] + pand xmm4,xmm7 + movdqa xmm7, [esp+50h] + psubw xmm6,xmm5 + pabsw xmm6,xmm6 + pcmpgtw xmm7,xmm6 + pand xmm4,xmm7 + pand xmm4,[esp+60h] + pand xmm0,xmm4 + movdqa xmm4, [esp+40h] + paddw xmm2,xmm4 + paddw xmm1,xmm0 + psubw xmm3,xmm4 + psubw xmm5,xmm0 + packuswb xmm2,xmm1 + packuswb xmm3,xmm5 + movdqa [esp+80h],xmm2 + movdqa [esp+90h],xmm3 + mov esi,dword [esp+1Ch] + movdqa xmm0, [esi] + movdqa xmm1, [esi+10h] + movdqa xmm2, [esi+20h] + movdqa xmm3, [esi+30h] + movdqa xmm6,xmm0 + punpcklbw xmm0,xmm1 + punpckhbw xmm6,xmm1 + movdqa xmm7,xmm2 + punpcklbw xmm2,xmm3 + punpckhbw xmm7,xmm3 + movdqa xmm4,xmm0 + movdqa xmm5,xmm6 + punpcklwd xmm0,xmm2 + punpckhwd xmm4,xmm2 + punpcklwd xmm6,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm1,xmm0 + movdqa xmm2,xmm4 + punpckldq xmm0,xmm6 + punpckhdq xmm1,xmm6 + punpckldq xmm4,xmm5 + punpckhdq xmm2,xmm5 + movdqa xmm5,xmm0 + movdqa xmm6,xmm1 + punpcklqdq xmm0,xmm4 + punpckhqdq xmm5,xmm4 + punpcklqdq xmm1,xmm2 + punpckhqdq xmm6,xmm2 + mov esi,dword [esp+14h] + mov ecx,dword [ebp+10h] + mov edx,dword [esp+0Ch] + mov edi,dword [esp+8] + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov esi,dword [esp+18h] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 
+ psrldq xmm1,4 + psrldq xmm6,4 + movd dword [esi],xmm0 + movd dword [esi+ecx],xmm5 + movd dword [esi+ecx*2],xmm1 + movd dword [esi+edx],xmm6 + psrldq xmm0,4 + psrldq xmm5,4 + psrldq xmm1,4 + psrldq xmm6,4 + mov edi,dword [esp+10h] + movd dword [edi],xmm0 + movd dword [edi+ecx],xmm5 + movd dword [edi+ecx*2],xmm1 + movd dword [edi+edx],xmm6 + pop edi + pop esi + mov esp,ebp + pop ebp + ret @@ -4194,385 +4194,385 @@ WELS_EXTERN DeblockChromaLt4H_ssse3 WELS_EXTERN DeblockLumaLt4V_ssse3 - push ebp - mov ebp, esp - and esp, -16 ; fffffff0H - sub esp, 420 ; 000001a4H - mov eax, dword [ebp+8] - mov ecx, dword [ebp+12] + push ebp + mov ebp, esp + and esp, -16 ; fffffff0H + sub esp, 420 ; 000001a4H + mov eax, dword [ebp+8] + mov ecx, dword [ebp+12] - pxor xmm0, xmm0 - push ebx - mov edx, dword [ebp+24] - movdqa [esp+424-384], xmm0 - push esi + pxor xmm0, xmm0 + push ebx + mov edx, dword [ebp+24] + movdqa [esp+424-384], xmm0 + push esi - lea esi, [ecx+ecx*2] - push edi - mov edi, eax - sub edi, esi - movdqa xmm0, [edi] + lea esi, [ecx+ecx*2] + push edi + mov edi, eax + sub edi, esi + movdqa xmm0, [edi] - lea esi, [ecx+ecx] - movdqa [esp+432-208], xmm0 - mov edi, eax - sub edi, esi - movdqa xmm0, [edi] - movdqa [esp+448-208], xmm0 + lea esi, [ecx+ecx] + movdqa [esp+432-208], xmm0 + mov edi, eax + sub edi, esi + movdqa xmm0, [edi] + movdqa [esp+448-208], xmm0 - mov ebx, eax - sub ebx, ecx - movdqa xmm0, [ebx] - movdqa [esp+464-208], xmm0 + mov ebx, eax + sub ebx, ecx + movdqa xmm0, [ebx] + movdqa [esp+464-208], xmm0 - movdqa xmm0, [eax] + movdqa xmm0, [eax] - add ecx, eax - movdqa [esp+480-208], xmm0 - movdqa xmm0, [ecx] - mov dword [esp+432-404], ecx + add ecx, eax + movdqa [esp+480-208], xmm0 + movdqa xmm0, [ecx] + mov dword [esp+432-404], ecx - movsx ecx, word [ebp+16] - movdqa [esp+496-208], xmm0 - movdqa xmm0, [esi+eax] + movsx ecx, word [ebp+16] + movdqa [esp+496-208], xmm0 + movdqa xmm0, [esi+eax] - movsx si, byte [edx] - movdqa [esp+512-208], xmm0 - movd xmm0, ecx - movsx ecx, word [ebp+20] - movdqa xmm1, xmm0 - punpcklwd xmm1, xmm0 - pshufd xmm0, xmm1, 0 - movdqa [esp+432-112], xmm0 - movd xmm0, ecx - movsx cx, byte [edx+1] - movdqa xmm1, xmm0 - punpcklwd xmm1, xmm0 - mov dword [esp+432-408], ebx - movzx ebx, cx - pshufd xmm0, xmm1, 0 - movd xmm1, ebx - movzx ebx, cx - movd xmm2, ebx - movzx ebx, cx - movzx ecx, cx - movd xmm4, ecx - movzx ecx, si - movd xmm5, ecx - movzx ecx, si - movd xmm6, ecx - movzx ecx, si - movd xmm7, ecx - movzx ecx, si - movdqa [esp+432-336], xmm0 - movd xmm0, ecx + movsx si, byte [edx] + movdqa [esp+512-208], xmm0 + movd xmm0, ecx + movsx ecx, word [ebp+20] + movdqa xmm1, xmm0 + punpcklwd xmm1, xmm0 + pshufd xmm0, xmm1, 0 + movdqa [esp+432-112], xmm0 + movd xmm0, ecx + movsx cx, byte [edx+1] + movdqa xmm1, xmm0 + punpcklwd xmm1, xmm0 + mov dword [esp+432-408], ebx + movzx ebx, cx + pshufd xmm0, xmm1, 0 + movd xmm1, ebx + movzx ebx, cx + movd xmm2, ebx + movzx ebx, cx + movzx ecx, cx + movd xmm4, ecx + movzx ecx, si + movd xmm5, ecx + movzx ecx, si + movd xmm6, ecx + movzx ecx, si + movd xmm7, ecx + movzx ecx, si + movdqa [esp+432-336], xmm0 + movd xmm0, ecx - movsx cx, byte [edx+3] - movsx dx, byte [edx+2] - movd xmm3, ebx - punpcklwd xmm0, xmm4 - movzx esi, cx - punpcklwd xmm6, xmm2 - punpcklwd xmm5, xmm1 - punpcklwd xmm0, xmm6 - punpcklwd xmm7, xmm3 - punpcklwd xmm7, xmm5 - punpcklwd xmm0, xmm7 - movdqa [esp+432-400], xmm0 - movd xmm0, esi - movzx esi, cx - movd xmm2, esi - movzx esi, cx - movzx ecx, cx - movd xmm4, ecx - movzx ecx, dx - movd xmm3, esi - movd 
xmm5, ecx - punpcklwd xmm5, xmm0 + movsx cx, byte [edx+3] + movsx dx, byte [edx+2] + movd xmm3, ebx + punpcklwd xmm0, xmm4 + movzx esi, cx + punpcklwd xmm6, xmm2 + punpcklwd xmm5, xmm1 + punpcklwd xmm0, xmm6 + punpcklwd xmm7, xmm3 + punpcklwd xmm7, xmm5 + punpcklwd xmm0, xmm7 + movdqa [esp+432-400], xmm0 + movd xmm0, esi + movzx esi, cx + movd xmm2, esi + movzx esi, cx + movzx ecx, cx + movd xmm4, ecx + movzx ecx, dx + movd xmm3, esi + movd xmm5, ecx + punpcklwd xmm5, xmm0 - movdqa xmm0, [esp+432-384] - movzx ecx, dx - movd xmm6, ecx - movzx ecx, dx - movzx edx, dx - punpcklwd xmm6, xmm2 - movd xmm7, ecx - movd xmm1, edx + movdqa xmm0, [esp+432-384] + movzx ecx, dx + movd xmm6, ecx + movzx ecx, dx + movzx edx, dx + punpcklwd xmm6, xmm2 + movd xmm7, ecx + movd xmm1, edx - movdqa xmm2, [esp+448-208] - punpcklbw xmm2, xmm0 + movdqa xmm2, [esp+448-208] + punpcklbw xmm2, xmm0 - mov ecx, 4 - movsx edx, cx - punpcklwd xmm7, xmm3 - punpcklwd xmm7, xmm5 - movdqa xmm5, [esp+496-208] - movdqa xmm3, [esp+464-208] - punpcklbw xmm5, xmm0 - movdqa [esp+432-240], xmm5 - movdqa xmm5, [esp+512-208] - punpcklbw xmm5, xmm0 - movdqa [esp+432-352], xmm5 - punpcklwd xmm1, xmm4 - movdqa xmm4, [esp+432-208] - punpcklwd xmm1, xmm6 - movdqa xmm6, [esp+480-208] - punpcklwd xmm1, xmm7 - punpcklbw xmm6, xmm0 - punpcklbw xmm3, xmm0 - punpcklbw xmm4, xmm0 - movdqa xmm7, xmm3 - psubw xmm7, xmm4 - pabsw xmm7, xmm7 - movdqa [esp+432-272], xmm4 - movdqa xmm4, [esp+432-336] - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-288], xmm5 - movdqa xmm7, xmm6 - psubw xmm7, [esp+432-352] - pabsw xmm7, xmm7 - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-256], xmm5 - movdqa xmm5, xmm3 - pavgw xmm5, xmm6 - movdqa [esp+432-304], xmm5 - movdqa xmm5, [esp+432-400] - psubw xmm5, [esp+432-288] - psubw xmm5, [esp+432-256] - movdqa [esp+432-224], xmm5 - movdqa xmm5, xmm6 - psubw xmm5, xmm3 - movdqa [esp+432-32], xmm6 - psubw xmm6, [esp+432-240] - movdqa xmm7, xmm5 - movdqa [esp+432-384], xmm5 - movdqa xmm5, [esp+432-112] - pabsw xmm7, xmm7 - pcmpgtw xmm5, xmm7 - pabsw xmm6, xmm6 - movdqa xmm7, xmm4 - pcmpgtw xmm7, xmm6 + mov ecx, 4 + movsx edx, cx + punpcklwd xmm7, xmm3 + punpcklwd xmm7, xmm5 + movdqa xmm5, [esp+496-208] + movdqa xmm3, [esp+464-208] + punpcklbw xmm5, xmm0 + movdqa [esp+432-240], xmm5 + movdqa xmm5, [esp+512-208] + punpcklbw xmm5, xmm0 + movdqa [esp+432-352], xmm5 + punpcklwd xmm1, xmm4 + movdqa xmm4, [esp+432-208] + punpcklwd xmm1, xmm6 + movdqa xmm6, [esp+480-208] + punpcklwd xmm1, xmm7 + punpcklbw xmm6, xmm0 + punpcklbw xmm3, xmm0 + punpcklbw xmm4, xmm0 + movdqa xmm7, xmm3 + psubw xmm7, xmm4 + pabsw xmm7, xmm7 + movdqa [esp+432-272], xmm4 + movdqa xmm4, [esp+432-336] + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-288], xmm5 + movdqa xmm7, xmm6 + psubw xmm7, [esp+432-352] + pabsw xmm7, xmm7 + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-256], xmm5 + movdqa xmm5, xmm3 + pavgw xmm5, xmm6 + movdqa [esp+432-304], xmm5 + movdqa xmm5, [esp+432-400] + psubw xmm5, [esp+432-288] + psubw xmm5, [esp+432-256] + movdqa [esp+432-224], xmm5 + movdqa xmm5, xmm6 + psubw xmm5, xmm3 + movdqa [esp+432-32], xmm6 + psubw xmm6, [esp+432-240] + movdqa xmm7, xmm5 + movdqa [esp+432-384], xmm5 + movdqa xmm5, [esp+432-112] + pabsw xmm7, xmm7 + pcmpgtw xmm5, xmm7 + pabsw xmm6, xmm6 + movdqa xmm7, xmm4 + pcmpgtw xmm7, xmm6 - pand xmm5, xmm7 - movdqa xmm6, xmm3 - psubw xmm6, xmm2 - pabsw xmm6, xmm6 - movdqa xmm7, xmm4 - pcmpgtw xmm7, xmm6 - movdqa xmm6, [esp+432-400] - pand xmm5, xmm7 - movdqa xmm7, xmm6 - 
pcmpeqw xmm6, xmm0 - pcmpgtw xmm7, xmm0 - por xmm7, xmm6 - pand xmm5, xmm7 - movdqa [esp+432-320], xmm5 - movd xmm5, edx - movdqa xmm6, xmm5 - punpcklwd xmm6, xmm5 - pshufd xmm5, xmm6, 0 - movdqa [esp+432-336], xmm5 - movdqa xmm5, [esp+432-224] - movdqa [esp+432-368], xmm5 - movdqa xmm6, xmm0 - psubw xmm6, xmm5 - movdqa xmm5, [esp+432-384] - psllw xmm5, 2 - movdqa xmm7, xmm2 - psubw xmm7, [esp+432-240] - paddw xmm7, xmm5 - paddw xmm7, [esp+432-336] - movdqa xmm5, [esp+432-368] - psraw xmm7, 3 - pmaxsw xmm6, xmm7 - pminsw xmm5, xmm6 + pand xmm5, xmm7 + movdqa xmm6, xmm3 + psubw xmm6, xmm2 + pabsw xmm6, xmm6 + movdqa xmm7, xmm4 + pcmpgtw xmm7, xmm6 + movdqa xmm6, [esp+432-400] + pand xmm5, xmm7 + movdqa xmm7, xmm6 + pcmpeqw xmm6, xmm0 + pcmpgtw xmm7, xmm0 + por xmm7, xmm6 + pand xmm5, xmm7 + movdqa [esp+432-320], xmm5 + movd xmm5, edx + movdqa xmm6, xmm5 + punpcklwd xmm6, xmm5 + pshufd xmm5, xmm6, 0 + movdqa [esp+432-336], xmm5 + movdqa xmm5, [esp+432-224] + movdqa [esp+432-368], xmm5 + movdqa xmm6, xmm0 + psubw xmm6, xmm5 + movdqa xmm5, [esp+432-384] + psllw xmm5, 2 + movdqa xmm7, xmm2 + psubw xmm7, [esp+432-240] + paddw xmm7, xmm5 + paddw xmm7, [esp+432-336] + movdqa xmm5, [esp+432-368] + psraw xmm7, 3 + pmaxsw xmm6, xmm7 + pminsw xmm5, xmm6 - pand xmm5, [esp+432-320] - movdqa xmm6, [esp+432-400] - movdqa [esp+432-64], xmm5 - movdqa [esp+432-384], xmm6 - movdqa xmm5, xmm0 - psubw xmm5, xmm6 - movdqa [esp+432-368], xmm5 - movdqa xmm6, xmm5 - movdqa xmm5, [esp+432-272] - paddw xmm5, [esp+432-304] - movdqa xmm7, xmm2 - paddw xmm7, xmm2 - psubw xmm5, xmm7 - psraw xmm5, 1 - pmaxsw xmm6, xmm5 - movdqa xmm5, [esp+432-384] - pminsw xmm5, xmm6 + pand xmm5, [esp+432-320] + movdqa xmm6, [esp+432-400] + movdqa [esp+432-64], xmm5 + movdqa [esp+432-384], xmm6 + movdqa xmm5, xmm0 + psubw xmm5, xmm6 + movdqa [esp+432-368], xmm5 + movdqa xmm6, xmm5 + movdqa xmm5, [esp+432-272] + paddw xmm5, [esp+432-304] + movdqa xmm7, xmm2 + paddw xmm7, xmm2 + psubw xmm5, xmm7 + psraw xmm5, 1 + pmaxsw xmm6, xmm5 + movdqa xmm5, [esp+432-384] + pminsw xmm5, xmm6 - pand xmm5, [esp+432-320] - pand xmm5, [esp+432-288] - movdqa xmm6, [esp+432-240] - movdqa [esp+432-96], xmm5 - movdqa xmm5, [esp+432-352] - paddw xmm5, [esp+432-304] - movdqa xmm7, xmm6 - paddw xmm7, xmm6 - movdqa xmm6, [esp+432-368] - psubw xmm5, xmm7 + pand xmm5, [esp+432-320] + pand xmm5, [esp+432-288] + movdqa xmm6, [esp+432-240] + movdqa [esp+432-96], xmm5 + movdqa xmm5, [esp+432-352] + paddw xmm5, [esp+432-304] + movdqa xmm7, xmm6 + paddw xmm7, xmm6 + movdqa xmm6, [esp+432-368] + psubw xmm5, xmm7 - movdqa xmm7, [esp+496-208] - psraw xmm5, 1 - pmaxsw xmm6, xmm5 - movdqa xmm5, [esp+432-400] - pminsw xmm5, xmm6 - pand xmm5, [esp+432-320] - pand xmm5, [esp+432-256] - movdqa xmm6, [esp+448-208] - punpckhbw xmm7, xmm0 - movdqa [esp+432-352], xmm7 + movdqa xmm7, [esp+496-208] + psraw xmm5, 1 + pmaxsw xmm6, xmm5 + movdqa xmm5, [esp+432-400] + pminsw xmm5, xmm6 + pand xmm5, [esp+432-320] + pand xmm5, [esp+432-256] + movdqa xmm6, [esp+448-208] + punpckhbw xmm7, xmm0 + movdqa [esp+432-352], xmm7 - movdqa xmm7, [esp+512-208] - punpckhbw xmm6, xmm0 - movdqa [esp+432-48], xmm5 - movdqa xmm5, [esp+432-208] - movdqa [esp+432-368], xmm6 - movdqa xmm6, [esp+464-208] - punpckhbw xmm7, xmm0 - punpckhbw xmm5, xmm0 - movdqa [esp+432-384], xmm7 - punpckhbw xmm6, xmm0 - movdqa [esp+432-400], xmm6 + movdqa xmm7, [esp+512-208] + punpckhbw xmm6, xmm0 + movdqa [esp+432-48], xmm5 + movdqa xmm5, [esp+432-208] + movdqa [esp+432-368], xmm6 + movdqa xmm6, [esp+464-208] + punpckhbw xmm7, 
xmm0 + punpckhbw xmm5, xmm0 + movdqa [esp+432-384], xmm7 + punpckhbw xmm6, xmm0 + movdqa [esp+432-400], xmm6 - movdqa xmm7, [esp+432-400] - movdqa xmm6, [esp+480-208] - psubw xmm7, xmm5 - movdqa [esp+432-16], xmm5 - pabsw xmm7, xmm7 - punpckhbw xmm6, xmm0 - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-288], xmm5 + movdqa xmm7, [esp+432-400] + movdqa xmm6, [esp+480-208] + psubw xmm7, xmm5 + movdqa [esp+432-16], xmm5 + pabsw xmm7, xmm7 + punpckhbw xmm6, xmm0 + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-288], xmm5 - movdqa xmm7, xmm6 - psubw xmm7, [esp+432-384] - pabsw xmm7, xmm7 - movdqa xmm5, xmm4 - pcmpgtw xmm5, xmm7 - movdqa [esp+432-256], xmm5 + movdqa xmm7, xmm6 + psubw xmm7, [esp+432-384] + pabsw xmm7, xmm7 + movdqa xmm5, xmm4 + pcmpgtw xmm5, xmm7 + movdqa [esp+432-256], xmm5 - movdqa xmm5, [esp+432-400] - movdqa [esp+432-80], xmm6 - pavgw xmm5, xmm6 - movdqa [esp+432-304], xmm5 + movdqa xmm5, [esp+432-400] + movdqa [esp+432-80], xmm6 + pavgw xmm5, xmm6 + movdqa [esp+432-304], xmm5 - movdqa xmm5, xmm1 - psubw xmm5, [esp+432-288] - psubw xmm5, [esp+432-256] - movdqa [esp+432-224], xmm5 - movdqa xmm5, xmm6 - psubw xmm5, [esp+432-400] - psubw xmm6, [esp+432-352] - movdqa [esp+432-272], xmm5 - movdqa xmm7, xmm5 - movdqa xmm5, [esp+432-112] - pabsw xmm7, xmm7 - pcmpgtw xmm5, xmm7 - movdqa xmm7, xmm4 - pabsw xmm6, xmm6 - pcmpgtw xmm7, xmm6 - movdqa xmm6, [esp+432-368] + movdqa xmm5, xmm1 + psubw xmm5, [esp+432-288] + psubw xmm5, [esp+432-256] + movdqa [esp+432-224], xmm5 + movdqa xmm5, xmm6 + psubw xmm5, [esp+432-400] + psubw xmm6, [esp+432-352] + movdqa [esp+432-272], xmm5 + movdqa xmm7, xmm5 + movdqa xmm5, [esp+432-112] + pabsw xmm7, xmm7 + pcmpgtw xmm5, xmm7 + movdqa xmm7, xmm4 + pabsw xmm6, xmm6 + pcmpgtw xmm7, xmm6 + movdqa xmm6, [esp+432-368] - pand xmm5, xmm7 - movdqa xmm7, [esp+432-400] - psubw xmm7, xmm6 - psubw xmm6, [esp+432-352] - pabsw xmm7, xmm7 - pcmpgtw xmm4, xmm7 - pand xmm5, xmm4 + pand xmm5, xmm7 + movdqa xmm7, [esp+432-400] + psubw xmm7, xmm6 + psubw xmm6, [esp+432-352] + pabsw xmm7, xmm7 + pcmpgtw xmm4, xmm7 + pand xmm5, xmm4 - paddw xmm2, [esp+432-96] - movdqa xmm4, xmm1 - pcmpgtw xmm4, xmm0 - movdqa xmm7, xmm1 - pcmpeqw xmm7, xmm0 - por xmm4, xmm7 - pand xmm5, xmm4 - movdqa xmm4, [esp+432-224] - movdqa [esp+432-320], xmm5 - movdqa xmm5, [esp+432-272] - movdqa xmm7, xmm0 - psubw xmm7, xmm4 - psubw xmm0, xmm1 - psllw xmm5, 2 - paddw xmm6, xmm5 - paddw xmm6, [esp+432-336] - movdqa xmm5, [esp+432-368] - movdqa [esp+432-336], xmm0 - psraw xmm6, 3 - pmaxsw xmm7, xmm6 - pminsw xmm4, xmm7 - pand xmm4, [esp+432-320] - movdqa xmm6, xmm0 - movdqa xmm0, [esp+432-16] - paddw xmm0, [esp+432-304] - movdqa [esp+432-272], xmm4 - movdqa xmm4, [esp+432-368] - paddw xmm4, xmm4 - psubw xmm0, xmm4 + paddw xmm2, [esp+432-96] + movdqa xmm4, xmm1 + pcmpgtw xmm4, xmm0 + movdqa xmm7, xmm1 + pcmpeqw xmm7, xmm0 + por xmm4, xmm7 + pand xmm5, xmm4 + movdqa xmm4, [esp+432-224] + movdqa [esp+432-320], xmm5 + movdqa xmm5, [esp+432-272] + movdqa xmm7, xmm0 + psubw xmm7, xmm4 + psubw xmm0, xmm1 + psllw xmm5, 2 + paddw xmm6, xmm5 + paddw xmm6, [esp+432-336] + movdqa xmm5, [esp+432-368] + movdqa [esp+432-336], xmm0 + psraw xmm6, 3 + pmaxsw xmm7, xmm6 + pminsw xmm4, xmm7 + pand xmm4, [esp+432-320] + movdqa xmm6, xmm0 + movdqa xmm0, [esp+432-16] + paddw xmm0, [esp+432-304] + movdqa [esp+432-272], xmm4 + movdqa xmm4, [esp+432-368] + paddw xmm4, xmm4 + psubw xmm0, xmm4 - movdqa xmm4, [esp+432-64] - psraw xmm0, 1 - pmaxsw xmm6, xmm0 - movdqa xmm0, [esp+432-400] - movdqa xmm7, xmm1 - 
pminsw xmm7, xmm6 - movdqa xmm6, [esp+432-320] - pand xmm7, xmm6 - pand xmm7, [esp+432-288] - paddw xmm5, xmm7 - packuswb xmm2, xmm5 - movdqa xmm5, [esp+432-272] - paddw xmm0, xmm5 - paddw xmm3, xmm4 - packuswb xmm3, xmm0 + movdqa xmm4, [esp+432-64] + psraw xmm0, 1 + pmaxsw xmm6, xmm0 + movdqa xmm0, [esp+432-400] + movdqa xmm7, xmm1 + pminsw xmm7, xmm6 + movdqa xmm6, [esp+432-320] + pand xmm7, xmm6 + pand xmm7, [esp+432-288] + paddw xmm5, xmm7 + packuswb xmm2, xmm5 + movdqa xmm5, [esp+432-272] + paddw xmm0, xmm5 + paddw xmm3, xmm4 + packuswb xmm3, xmm0 - movdqa xmm0, [esp+432-32] - psubw xmm0, xmm4 - movdqa xmm4, [esp+432-80] - psubw xmm4, xmm5 + movdqa xmm0, [esp+432-32] + psubw xmm0, xmm4 + movdqa xmm4, [esp+432-80] + psubw xmm4, xmm5 - movdqa xmm5, [esp+432-240] - paddw xmm5, [esp+432-48] - packuswb xmm0, xmm4 - movdqa xmm4, [esp+432-384] - paddw xmm4, [esp+432-304] - movdqa [esp+480-208], xmm0 - movdqa xmm0, [esp+432-352] - movdqa xmm7, xmm0 - paddw xmm0, xmm0 + movdqa xmm5, [esp+432-240] + paddw xmm5, [esp+432-48] + packuswb xmm0, xmm4 + movdqa xmm4, [esp+432-384] + paddw xmm4, [esp+432-304] + movdqa [esp+480-208], xmm0 + movdqa xmm0, [esp+432-352] + movdqa xmm7, xmm0 + paddw xmm0, xmm0 - mov ecx, dword [esp+432-408] + mov ecx, dword [esp+432-408] - mov edx, dword [esp+432-404] - psubw xmm4, xmm0 - movdqa xmm0, [esp+432-336] - movdqa [edi], xmm2 - psraw xmm4, 1 - pmaxsw xmm0, xmm4 - pminsw xmm1, xmm0 - movdqa xmm0, [esp+480-208] + mov edx, dword [esp+432-404] + psubw xmm4, xmm0 + movdqa xmm0, [esp+432-336] + movdqa [edi], xmm2 + psraw xmm4, 1 + pmaxsw xmm0, xmm4 + pminsw xmm1, xmm0 + movdqa xmm0, [esp+480-208] - pop edi - pand xmm1, xmm6 - pand xmm1, [esp+428-256] - movdqa [ecx], xmm3 - paddw xmm7, xmm1 - pop esi - packuswb xmm5, xmm7 - movdqa [eax], xmm0 - movdqa [edx], xmm5 - pop ebx - mov esp, ebp - pop ebp - ret + pop edi + pand xmm1, xmm6 + pand xmm1, [esp+428-256] + movdqa [ecx], xmm3 + paddw xmm7, xmm1 + pop esi + packuswb xmm5, xmm7 + movdqa [eax], xmm0 + movdqa [edx], xmm5 + pop ebx + mov esp, ebp + pop ebp + ret ;******************************************************************************* @@ -4583,542 +4583,542 @@ WELS_EXTERN DeblockLumaLt4V_ssse3 WELS_EXTERN DeblockLumaEq4V_ssse3 - push ebp - mov ebp, esp - and esp, -16 ; fffffff0H - sub esp, 628 ; 00000274H - mov eax, dword [ebp+8] - mov ecx, dword [ebp+12] - push ebx - push esi + push ebp + mov ebp, esp + and esp, -16 ; fffffff0H + sub esp, 628 ; 00000274H + mov eax, dword [ebp+8] + mov ecx, dword [ebp+12] + push ebx + push esi - lea edx, [ecx*4] - pxor xmm0, xmm0 - movdqa xmm2, xmm0 + lea edx, [ecx*4] + pxor xmm0, xmm0 + movdqa xmm2, xmm0 - movdqa xmm0, [ecx+eax] - mov esi, eax - sub esi, edx - movdqa xmm3, [esi] - movdqa xmm5, [eax] - push edi - lea edi, [ecx+ecx] - lea ebx, [ecx+ecx*2] - mov dword [esp+640-600], edi - mov esi, eax - sub esi, edi - movdqa xmm1, [esi] - movdqa [esp+720-272], xmm0 - mov edi, eax - sub edi, ecx - movdqa xmm4, [edi] - add ecx, eax - mov dword [esp+640-596], ecx + movdqa xmm0, [ecx+eax] + mov esi, eax + sub esi, edx + movdqa xmm3, [esi] + movdqa xmm5, [eax] + push edi + lea edi, [ecx+ecx] + lea ebx, [ecx+ecx*2] + mov dword [esp+640-600], edi + mov esi, eax + sub esi, edi + movdqa xmm1, [esi] + movdqa [esp+720-272], xmm0 + mov edi, eax + sub edi, ecx + movdqa xmm4, [edi] + add ecx, eax + mov dword [esp+640-596], ecx - mov ecx, dword [esp+640-600] - movdqa xmm0, [ecx+eax] - movdqa [esp+736-272], xmm0 + mov ecx, dword [esp+640-600] + movdqa xmm0, [ecx+eax] + movdqa [esp+736-272], xmm0 - 
movdqa xmm0, [eax+ebx] - mov edx, eax - sub edx, ebx + movdqa xmm0, [eax+ebx] + mov edx, eax + sub edx, ebx - movsx ebx, word [ebp+16] - movdqa xmm6, [edx] - add ecx, eax - movdqa [esp+752-272], xmm0 - movd xmm0, ebx + movsx ebx, word [ebp+16] + movdqa xmm6, [edx] + add ecx, eax + movdqa [esp+752-272], xmm0 + movd xmm0, ebx - movsx ebx, word [ebp+20] - movdqa xmm7, xmm0 - punpcklwd xmm7, xmm0 - pshufd xmm0, xmm7, 0 - movdqa [esp+640-320], xmm0 - movd xmm0, ebx - movdqa xmm7, xmm0 - punpcklwd xmm7, xmm0 - pshufd xmm0, xmm7, 0 + movsx ebx, word [ebp+20] + movdqa xmm7, xmm0 + punpcklwd xmm7, xmm0 + pshufd xmm0, xmm7, 0 + movdqa [esp+640-320], xmm0 + movd xmm0, ebx + movdqa xmm7, xmm0 + punpcklwd xmm7, xmm0 + pshufd xmm0, xmm7, 0 - movdqa xmm7, [esp+736-272] - punpcklbw xmm7, xmm2 - movdqa [esp+640-416], xmm7 - movdqa [esp+640-512], xmm0 - movdqa xmm0, xmm1 - movdqa [esp+672-272], xmm1 - movdqa xmm1, xmm4 - movdqa [esp+704-272], xmm5 - punpcklbw xmm5, xmm2 - punpcklbw xmm1, xmm2 + movdqa xmm7, [esp+736-272] + punpcklbw xmm7, xmm2 + movdqa [esp+640-416], xmm7 + movdqa [esp+640-512], xmm0 + movdqa xmm0, xmm1 + movdqa [esp+672-272], xmm1 + movdqa xmm1, xmm4 + movdqa [esp+704-272], xmm5 + punpcklbw xmm5, xmm2 + punpcklbw xmm1, xmm2 - movdqa xmm7, xmm5 - psubw xmm7, xmm1 - pabsw xmm7, xmm7 - movdqa [esp+640-560], xmm7 - punpcklbw xmm0, xmm2 - movdqa [esp+688-272], xmm4 - movdqa xmm4, [esp+720-272] - movdqa [esp+640-480], xmm0 + movdqa xmm7, xmm5 + psubw xmm7, xmm1 + pabsw xmm7, xmm7 + movdqa [esp+640-560], xmm7 + punpcklbw xmm0, xmm2 + movdqa [esp+688-272], xmm4 + movdqa xmm4, [esp+720-272] + movdqa [esp+640-480], xmm0 - movdqa xmm7, xmm1 - psubw xmm7, xmm0 + movdqa xmm7, xmm1 + psubw xmm7, xmm0 - movdqa xmm0, [esp+640-512] - pabsw xmm7, xmm7 - punpcklbw xmm4, xmm2 - pcmpgtw xmm0, xmm7 - movdqa [esp+640-384], xmm4 - movdqa xmm7, xmm5 - psubw xmm7, xmm4 - movdqa xmm4, [esp+640-512] - movdqa [esp+656-272], xmm6 - punpcklbw xmm6, xmm2 - pabsw xmm7, xmm7 - movdqa [esp+640-48], xmm2 - movdqa [esp+640-368], xmm6 - movdqa [esp+640-144], xmm1 - movdqa [esp+640-400], xmm5 - pcmpgtw xmm4, xmm7 - pand xmm0, xmm4 - movdqa xmm4, [esp+640-320] - pcmpgtw xmm4, [esp+640-560] - pand xmm0, xmm4 + movdqa xmm0, [esp+640-512] + pabsw xmm7, xmm7 + punpcklbw xmm4, xmm2 + pcmpgtw xmm0, xmm7 + movdqa [esp+640-384], xmm4 + movdqa xmm7, xmm5 + psubw xmm7, xmm4 + movdqa xmm4, [esp+640-512] + movdqa [esp+656-272], xmm6 + punpcklbw xmm6, xmm2 + pabsw xmm7, xmm7 + movdqa [esp+640-48], xmm2 + movdqa [esp+640-368], xmm6 + movdqa [esp+640-144], xmm1 + movdqa [esp+640-400], xmm5 + pcmpgtw xmm4, xmm7 + pand xmm0, xmm4 + movdqa xmm4, [esp+640-320] + pcmpgtw xmm4, [esp+640-560] + pand xmm0, xmm4 - mov ebx, 2 - movsx ebx, bx - movd xmm4, ebx - movdqa xmm7, xmm4 - punpcklwd xmm7, xmm4 - movdqa xmm4, [esp+640-320] - psraw xmm4, 2 - pshufd xmm7, xmm7, 0 - paddw xmm4, xmm7 - movdqa [esp+640-576], xmm4 - pcmpgtw xmm4, [esp+640-560] - movdqa [esp+640-560], xmm4 + mov ebx, 2 + movsx ebx, bx + movd xmm4, ebx + movdqa xmm7, xmm4 + punpcklwd xmm7, xmm4 + movdqa xmm4, [esp+640-320] + psraw xmm4, 2 + pshufd xmm7, xmm7, 0 + paddw xmm4, xmm7 + movdqa [esp+640-576], xmm4 + pcmpgtw xmm4, [esp+640-560] + movdqa [esp+640-560], xmm4 - movdqa xmm4, [esp+640-512] - movdqa [esp+640-624], xmm7 - movdqa xmm7, xmm1 - psubw xmm7, xmm6 - pabsw xmm7, xmm7 - pcmpgtw xmm4, xmm7 + movdqa xmm4, [esp+640-512] + movdqa [esp+640-624], xmm7 + movdqa xmm7, xmm1 + psubw xmm7, xmm6 + pabsw xmm7, xmm7 + pcmpgtw xmm4, xmm7 - pand xmm4, [esp+640-560] - movdqa [esp+640-544], 
xmm4 - movdqa xmm4, [esp+640-512] - movdqa xmm7, xmm5 - psubw xmm7, [esp+640-416] - pabsw xmm7, xmm7 - pcmpgtw xmm4, xmm7 + pand xmm4, [esp+640-560] + movdqa [esp+640-544], xmm4 + movdqa xmm4, [esp+640-512] + movdqa xmm7, xmm5 + psubw xmm7, [esp+640-416] + pabsw xmm7, xmm7 + pcmpgtw xmm4, xmm7 - pand xmm4, [esp+640-560] - movdqa [esp+640-560], xmm4 + pand xmm4, [esp+640-560] + movdqa [esp+640-560], xmm4 - movdqa xmm4, [esp+640-544] - pandn xmm4, xmm6 - movdqa [esp+640-16], xmm4 - mov ebx, 4 - movsx ebx, bx - movd xmm4, ebx - movdqa xmm7, xmm4 - punpcklwd xmm7, xmm4 - movdqa xmm4, xmm3 - punpcklbw xmm4, xmm2 - psllw xmm4, 1 - paddw xmm4, xmm6 - paddw xmm4, xmm6 - paddw xmm4, xmm6 - paddw xmm4, [esp+640-480] + movdqa xmm4, [esp+640-544] + pandn xmm4, xmm6 + movdqa [esp+640-16], xmm4 + mov ebx, 4 + movsx ebx, bx + movd xmm4, ebx + movdqa xmm7, xmm4 + punpcklwd xmm7, xmm4 + movdqa xmm4, xmm3 + punpcklbw xmm4, xmm2 + psllw xmm4, 1 + paddw xmm4, xmm6 + paddw xmm4, xmm6 + paddw xmm4, xmm6 + paddw xmm4, [esp+640-480] - movdqa xmm6, [esp+640-560] - pshufd xmm7, xmm7, 0 - paddw xmm4, xmm1 - movdqa [esp+640-592], xmm7 - paddw xmm4, xmm5 - paddw xmm4, xmm7 - movdqa xmm7, [esp+640-416] - pandn xmm6, xmm7 - movdqa [esp+640-80], xmm6 - movdqa xmm6, [esp+752-272] - punpcklbw xmm6, xmm2 - psllw xmm6, 1 - paddw xmm6, xmm7 - paddw xmm6, xmm7 - paddw xmm6, xmm7 - paddw xmm6, [esp+640-384] + movdqa xmm6, [esp+640-560] + pshufd xmm7, xmm7, 0 + paddw xmm4, xmm1 + movdqa [esp+640-592], xmm7 + paddw xmm4, xmm5 + paddw xmm4, xmm7 + movdqa xmm7, [esp+640-416] + pandn xmm6, xmm7 + movdqa [esp+640-80], xmm6 + movdqa xmm6, [esp+752-272] + punpcklbw xmm6, xmm2 + psllw xmm6, 1 + paddw xmm6, xmm7 + paddw xmm6, xmm7 + paddw xmm6, xmm7 + paddw xmm6, [esp+640-384] - movdqa xmm7, [esp+640-480] - paddw xmm6, xmm5 - paddw xmm6, xmm1 - paddw xmm6, [esp+640-592] - psraw xmm6, 3 - pand xmm6, [esp+640-560] - movdqa [esp+640-112], xmm6 - movdqa xmm6, [esp+640-544] - pandn xmm6, xmm7 - movdqa [esp+640-336], xmm6 - movdqa xmm6, [esp+640-544] - movdqa [esp+640-528], xmm6 - movdqa xmm6, [esp+640-368] - paddw xmm6, xmm7 - movdqa xmm7, xmm1 - psraw xmm4, 3 - pand xmm4, [esp+640-544] - paddw xmm7, xmm5 - paddw xmm6, xmm7 - paddw xmm6, [esp+640-624] - movdqa xmm7, [esp+640-528] + movdqa xmm7, [esp+640-480] + paddw xmm6, xmm5 + paddw xmm6, xmm1 + paddw xmm6, [esp+640-592] + psraw xmm6, 3 + pand xmm6, [esp+640-560] + movdqa [esp+640-112], xmm6 + movdqa xmm6, [esp+640-544] + pandn xmm6, xmm7 + movdqa [esp+640-336], xmm6 + movdqa xmm6, [esp+640-544] + movdqa [esp+640-528], xmm6 + movdqa xmm6, [esp+640-368] + paddw xmm6, xmm7 + movdqa xmm7, xmm1 + psraw xmm4, 3 + pand xmm4, [esp+640-544] + paddw xmm7, xmm5 + paddw xmm6, xmm7 + paddw xmm6, [esp+640-624] + movdqa xmm7, [esp+640-528] - paddw xmm5, xmm1 - psraw xmm6, 2 - pand xmm7, xmm6 + paddw xmm5, xmm1 + psraw xmm6, 2 + pand xmm7, xmm6 - movdqa xmm6, [esp+640-384] - movdqa [esp+640-64], xmm7 - movdqa xmm7, [esp+640-560] - pandn xmm7, xmm6 - movdqa [esp+640-304], xmm7 - movdqa xmm7, [esp+640-560] - movdqa [esp+640-528], xmm7 - movdqa xmm7, [esp+640-416] - paddw xmm7, xmm6 - paddw xmm7, xmm5 - paddw xmm7, [esp+640-624] - movdqa xmm5, [esp+640-528] - psraw xmm7, 2 - pand xmm5, xmm7 - movdqa [esp+640-32], xmm5 + movdqa xmm6, [esp+640-384] + movdqa [esp+640-64], xmm7 + movdqa xmm7, [esp+640-560] + pandn xmm7, xmm6 + movdqa [esp+640-304], xmm7 + movdqa xmm7, [esp+640-560] + movdqa [esp+640-528], xmm7 + movdqa xmm7, [esp+640-416] + paddw xmm7, xmm6 + paddw xmm7, xmm5 + paddw xmm7, [esp+640-624] + movdqa 
xmm5, [esp+640-528] + psraw xmm7, 2 + pand xmm5, xmm7 + movdqa [esp+640-32], xmm5 - movdqa xmm5, [esp+640-544] - movdqa [esp+640-528], xmm5 - movdqa xmm5, [esp+640-480] - movdqa xmm7, xmm5 - paddw xmm7, xmm5 - movdqa xmm5, xmm1 - paddw xmm5, xmm6 - paddw xmm6, [esp+640-592] - paddw xmm7, xmm5 - paddw xmm7, [esp+640-624] - movdqa xmm5, [esp+640-528] - psraw xmm7, 2 - pandn xmm5, xmm7 - movdqa xmm7, [esp+640-480] - paddw xmm7, xmm1 - paddw xmm7, [esp+640-400] - movdqa xmm1, [esp+640-544] - movdqa [esp+640-352], xmm5 - movdqa xmm5, [esp+640-368] - psllw xmm7, 1 - paddw xmm7, xmm6 - paddw xmm5, xmm7 + movdqa xmm5, [esp+640-544] + movdqa [esp+640-528], xmm5 + movdqa xmm5, [esp+640-480] + movdqa xmm7, xmm5 + paddw xmm7, xmm5 + movdqa xmm5, xmm1 + paddw xmm5, xmm6 + paddw xmm6, [esp+640-592] + paddw xmm7, xmm5 + paddw xmm7, [esp+640-624] + movdqa xmm5, [esp+640-528] + psraw xmm7, 2 + pandn xmm5, xmm7 + movdqa xmm7, [esp+640-480] + paddw xmm7, xmm1 + paddw xmm7, [esp+640-400] + movdqa xmm1, [esp+640-544] + movdqa [esp+640-352], xmm5 + movdqa xmm5, [esp+640-368] + psllw xmm7, 1 + paddw xmm7, xmm6 + paddw xmm5, xmm7 - movdqa xmm7, [esp+640-400] - psraw xmm5, 3 - pand xmm1, xmm5 - movdqa xmm5, [esp+640-480] - movdqa [esp+640-96], xmm1 - movdqa xmm1, [esp+640-560] - movdqa [esp+640-528], xmm1 - movdqa xmm1, [esp+640-384] - movdqa xmm6, xmm1 - paddw xmm6, xmm1 - paddw xmm1, [esp+640-400] - paddw xmm1, [esp+640-144] - paddw xmm7, xmm5 - paddw xmm5, [esp+640-592] - paddw xmm6, xmm7 - paddw xmm6, [esp+640-624] - movdqa xmm7, [esp+640-528] - psraw xmm6, 2 - psllw xmm1, 1 - paddw xmm1, xmm5 + movdqa xmm7, [esp+640-400] + psraw xmm5, 3 + pand xmm1, xmm5 + movdqa xmm5, [esp+640-480] + movdqa [esp+640-96], xmm1 + movdqa xmm1, [esp+640-560] + movdqa [esp+640-528], xmm1 + movdqa xmm1, [esp+640-384] + movdqa xmm6, xmm1 + paddw xmm6, xmm1 + paddw xmm1, [esp+640-400] + paddw xmm1, [esp+640-144] + paddw xmm7, xmm5 + paddw xmm5, [esp+640-592] + paddw xmm6, xmm7 + paddw xmm6, [esp+640-624] + movdqa xmm7, [esp+640-528] + psraw xmm6, 2 + psllw xmm1, 1 + paddw xmm1, xmm5 - movdqa xmm5, [esp+656-272] - pandn xmm7, xmm6 - movdqa xmm6, [esp+640-416] - paddw xmm6, xmm1 - movdqa xmm1, [esp+640-560] - psraw xmm6, 3 - pand xmm1, xmm6 + movdqa xmm5, [esp+656-272] + pandn xmm7, xmm6 + movdqa xmm6, [esp+640-416] + paddw xmm6, xmm1 + movdqa xmm1, [esp+640-560] + psraw xmm6, 3 + pand xmm1, xmm6 - movdqa xmm6, [esp+704-272] - movdqa [esp+640-128], xmm1 - movdqa xmm1, [esp+672-272] - punpckhbw xmm1, xmm2 - movdqa [esp+640-448], xmm1 - movdqa xmm1, [esp+688-272] - punpckhbw xmm1, xmm2 - punpckhbw xmm6, xmm2 - movdqa [esp+640-288], xmm7 - punpckhbw xmm5, xmm2 - movdqa [esp+640-496], xmm1 - movdqa [esp+640-432], xmm6 + movdqa xmm6, [esp+704-272] + movdqa [esp+640-128], xmm1 + movdqa xmm1, [esp+672-272] + punpckhbw xmm1, xmm2 + movdqa [esp+640-448], xmm1 + movdqa xmm1, [esp+688-272] + punpckhbw xmm1, xmm2 + punpckhbw xmm6, xmm2 + movdqa [esp+640-288], xmm7 + punpckhbw xmm5, xmm2 + movdqa [esp+640-496], xmm1 + movdqa [esp+640-432], xmm6 - movdqa xmm7, [esp+720-272] - punpckhbw xmm7, xmm2 - movdqa [esp+640-464], xmm7 + movdqa xmm7, [esp+720-272] + punpckhbw xmm7, xmm2 + movdqa [esp+640-464], xmm7 - movdqa xmm7, [esp+736-272] - punpckhbw xmm7, xmm2 - movdqa [esp+640-528], xmm7 + movdqa xmm7, [esp+736-272] + punpckhbw xmm7, xmm2 + movdqa [esp+640-528], xmm7 - movdqa xmm7, xmm6 + movdqa xmm7, xmm6 - psubw xmm6, [esp+640-464] - psubw xmm7, xmm1 - pabsw xmm7, xmm7 - movdqa [esp+640-560], xmm7 - por xmm4, [esp+640-16] - pabsw xmm6, xmm6 - movdqa 
xmm7, xmm1 - psubw xmm7, [esp+640-448] + psubw xmm6, [esp+640-464] + psubw xmm7, xmm1 + pabsw xmm7, xmm7 + movdqa [esp+640-560], xmm7 + por xmm4, [esp+640-16] + pabsw xmm6, xmm6 + movdqa xmm7, xmm1 + psubw xmm7, [esp+640-448] - movdqa xmm1, [esp+640-512] - pabsw xmm7, xmm7 - pcmpgtw xmm1, xmm7 - movdqa xmm7, [esp+640-512] - pcmpgtw xmm7, xmm6 - movdqa xmm6, [esp+640-320] - pand xmm1, xmm7 - movdqa xmm7, [esp+640-560] - pcmpgtw xmm6, xmm7 - pand xmm1, xmm6 + movdqa xmm1, [esp+640-512] + pabsw xmm7, xmm7 + pcmpgtw xmm1, xmm7 + movdqa xmm7, [esp+640-512] + pcmpgtw xmm7, xmm6 + movdqa xmm6, [esp+640-320] + pand xmm1, xmm7 + movdqa xmm7, [esp+640-560] + pcmpgtw xmm6, xmm7 + pand xmm1, xmm6 - movdqa xmm6, [esp+640-576] - pcmpgtw xmm6, xmm7 + movdqa xmm6, [esp+640-576] + pcmpgtw xmm6, xmm7 - movdqa xmm7, [esp+640-496] - punpckhbw xmm3, xmm2 - movdqa [esp+640-560], xmm6 - movdqa xmm6, [esp+640-512] - psubw xmm7, xmm5 - pabsw xmm7, xmm7 - pcmpgtw xmm6, xmm7 + movdqa xmm7, [esp+640-496] + punpckhbw xmm3, xmm2 + movdqa [esp+640-560], xmm6 + movdqa xmm6, [esp+640-512] + psubw xmm7, xmm5 + pabsw xmm7, xmm7 + pcmpgtw xmm6, xmm7 - pand xmm6, [esp+640-560] - movdqa xmm7, [esp+640-432] - psubw xmm7, [esp+640-528] + pand xmm6, [esp+640-560] + movdqa xmm7, [esp+640-432] + psubw xmm7, [esp+640-528] - psllw xmm3, 1 - movdqa [esp+640-544], xmm6 - movdqa xmm6, [esp+640-512] + psllw xmm3, 1 + movdqa [esp+640-544], xmm6 + movdqa xmm6, [esp+640-512] - movdqa xmm2, [esp+640-544] - paddw xmm3, xmm5 - paddw xmm3, xmm5 - paddw xmm3, xmm5 - paddw xmm3, [esp+640-448] - paddw xmm3, [esp+640-496] - pabsw xmm7, xmm7 - pcmpgtw xmm6, xmm7 - pand xmm6, [esp+640-560] - movdqa [esp+640-560], xmm6 + movdqa xmm2, [esp+640-544] + paddw xmm3, xmm5 + paddw xmm3, xmm5 + paddw xmm3, xmm5 + paddw xmm3, [esp+640-448] + paddw xmm3, [esp+640-496] + pabsw xmm7, xmm7 + pcmpgtw xmm6, xmm7 + pand xmm6, [esp+640-560] + movdqa [esp+640-560], xmm6 - movdqa xmm6, xmm0 - pand xmm6, xmm4 - movdqa xmm4, xmm0 - pandn xmm4, [esp+640-368] - por xmm6, xmm4 - movdqa xmm4, [esp+640-432] - paddw xmm3, xmm4 - paddw xmm3, [esp+640-592] - psraw xmm3, 3 - pand xmm3, xmm2 - pandn xmm2, xmm5 - por xmm3, xmm2 - movdqa xmm7, xmm1 - pand xmm7, xmm3 - movdqa xmm3, [esp+640-64] - por xmm3, [esp+640-336] - movdqa xmm2, xmm1 - pandn xmm2, xmm5 - por xmm7, xmm2 + movdqa xmm6, xmm0 + pand xmm6, xmm4 + movdqa xmm4, xmm0 + pandn xmm4, [esp+640-368] + por xmm6, xmm4 + movdqa xmm4, [esp+640-432] + paddw xmm3, xmm4 + paddw xmm3, [esp+640-592] + psraw xmm3, 3 + pand xmm3, xmm2 + pandn xmm2, xmm5 + por xmm3, xmm2 + movdqa xmm7, xmm1 + pand xmm7, xmm3 + movdqa xmm3, [esp+640-64] + por xmm3, [esp+640-336] + movdqa xmm2, xmm1 + pandn xmm2, xmm5 + por xmm7, xmm2 - movdqa xmm2, xmm0 - pand xmm2, xmm3 - movdqa xmm3, xmm0 - pandn xmm3, [esp+640-480] - por xmm2, xmm3 - packuswb xmm6, xmm7 - movdqa [esp+640-336], xmm2 - movdqa [esp+656-272], xmm6 - movdqa xmm6, [esp+640-544] - movdqa xmm2, xmm5 - paddw xmm2, [esp+640-448] - movdqa xmm3, xmm1 - movdqa xmm7, [esp+640-496] - paddw xmm7, xmm4 - paddw xmm2, xmm7 - paddw xmm2, [esp+640-624] - movdqa xmm7, [esp+640-544] - psraw xmm2, 2 - pand xmm6, xmm2 - movdqa xmm2, [esp+640-448] - pandn xmm7, xmm2 - por xmm6, xmm7 - pand xmm3, xmm6 - movdqa xmm6, xmm1 - pandn xmm6, xmm2 - paddw xmm2, [esp+640-496] - paddw xmm2, xmm4 - por xmm3, xmm6 - movdqa xmm6, [esp+640-336] - packuswb xmm6, xmm3 - psllw xmm2, 1 - movdqa [esp+672-272], xmm6 - movdqa xmm6, [esp+640-96] - por xmm6, [esp+640-352] + movdqa xmm2, xmm0 + pand xmm2, xmm3 + movdqa xmm3, xmm0 + 
pandn xmm3, [esp+640-480] + por xmm2, xmm3 + packuswb xmm6, xmm7 + movdqa [esp+640-336], xmm2 + movdqa [esp+656-272], xmm6 + movdqa xmm6, [esp+640-544] + movdqa xmm2, xmm5 + paddw xmm2, [esp+640-448] + movdqa xmm3, xmm1 + movdqa xmm7, [esp+640-496] + paddw xmm7, xmm4 + paddw xmm2, xmm7 + paddw xmm2, [esp+640-624] + movdqa xmm7, [esp+640-544] + psraw xmm2, 2 + pand xmm6, xmm2 + movdqa xmm2, [esp+640-448] + pandn xmm7, xmm2 + por xmm6, xmm7 + pand xmm3, xmm6 + movdqa xmm6, xmm1 + pandn xmm6, xmm2 + paddw xmm2, [esp+640-496] + paddw xmm2, xmm4 + por xmm3, xmm6 + movdqa xmm6, [esp+640-336] + packuswb xmm6, xmm3 + psllw xmm2, 1 + movdqa [esp+672-272], xmm6 + movdqa xmm6, [esp+640-96] + por xmm6, [esp+640-352] - movdqa xmm3, xmm0 - pand xmm3, xmm6 - movdqa xmm6, xmm0 - pandn xmm6, [esp+640-144] - por xmm3, xmm6 - movdqa xmm6, [esp+640-544] - movdqa [esp+640-352], xmm3 - movdqa xmm3, [esp+640-464] - paddw xmm3, [esp+640-592] - paddw xmm2, xmm3 - movdqa xmm3, [esp+640-448] - paddw xmm5, xmm2 - movdqa xmm2, [esp+640-496] - psraw xmm5, 3 - pand xmm6, xmm5 - movdqa xmm5, [esp+640-464] - paddw xmm2, xmm5 - paddw xmm5, [esp+640-432] - movdqa xmm4, xmm3 - paddw xmm4, xmm3 - paddw xmm4, xmm2 - paddw xmm4, [esp+640-624] - movdqa xmm2, [esp+640-544] - paddw xmm3, [esp+640-592] - psraw xmm4, 2 - pandn xmm2, xmm4 - por xmm6, xmm2 - movdqa xmm7, xmm1 - pand xmm7, xmm6 - movdqa xmm6, [esp+640-496] - movdqa xmm2, xmm1 - pandn xmm2, xmm6 - por xmm7, xmm2 - movdqa xmm2, [esp+640-352] - packuswb xmm2, xmm7 - movdqa [esp+688-272], xmm2 - movdqa xmm2, [esp+640-128] - por xmm2, [esp+640-288] + movdqa xmm3, xmm0 + pand xmm3, xmm6 + movdqa xmm6, xmm0 + pandn xmm6, [esp+640-144] + por xmm3, xmm6 + movdqa xmm6, [esp+640-544] + movdqa [esp+640-352], xmm3 + movdqa xmm3, [esp+640-464] + paddw xmm3, [esp+640-592] + paddw xmm2, xmm3 + movdqa xmm3, [esp+640-448] + paddw xmm5, xmm2 + movdqa xmm2, [esp+640-496] + psraw xmm5, 3 + pand xmm6, xmm5 + movdqa xmm5, [esp+640-464] + paddw xmm2, xmm5 + paddw xmm5, [esp+640-432] + movdqa xmm4, xmm3 + paddw xmm4, xmm3 + paddw xmm4, xmm2 + paddw xmm4, [esp+640-624] + movdqa xmm2, [esp+640-544] + paddw xmm3, [esp+640-592] + psraw xmm4, 2 + pandn xmm2, xmm4 + por xmm6, xmm2 + movdqa xmm7, xmm1 + pand xmm7, xmm6 + movdqa xmm6, [esp+640-496] + movdqa xmm2, xmm1 + pandn xmm2, xmm6 + por xmm7, xmm2 + movdqa xmm2, [esp+640-352] + packuswb xmm2, xmm7 + movdqa [esp+688-272], xmm2 + movdqa xmm2, [esp+640-128] + por xmm2, [esp+640-288] - movdqa xmm4, xmm0 - pand xmm4, xmm2 - paddw xmm5, xmm6 - movdqa xmm2, xmm0 - pandn xmm2, [esp+640-400] - por xmm4, xmm2 - movdqa xmm2, [esp+640-528] - psllw xmm5, 1 - paddw xmm5, xmm3 - movdqa xmm3, [esp+640-560] - paddw xmm2, xmm5 - psraw xmm2, 3 - movdqa [esp+640-288], xmm4 - movdqa xmm4, [esp+640-560] - pand xmm4, xmm2 - movdqa xmm2, [esp+640-464] - movdqa xmm5, xmm2 - paddw xmm5, xmm2 - movdqa xmm2, [esp+640-432] - paddw xmm2, [esp+640-448] - movdqa xmm7, xmm1 - paddw xmm5, xmm2 - paddw xmm5, [esp+640-624] - movdqa xmm6, [esp+640-560] - psraw xmm5, 2 - pandn xmm3, xmm5 - por xmm4, xmm3 - movdqa xmm3, [esp+640-32] - por xmm3, [esp+640-304] - pand xmm7, xmm4 - movdqa xmm4, [esp+640-432] - movdqa xmm5, [esp+640-464] - movdqa xmm2, xmm1 - pandn xmm2, xmm4 - paddw xmm4, [esp+640-496] - por xmm7, xmm2 - movdqa xmm2, [esp+640-288] - packuswb xmm2, xmm7 - movdqa [esp+704-272], xmm2 + movdqa xmm4, xmm0 + pand xmm4, xmm2 + paddw xmm5, xmm6 + movdqa xmm2, xmm0 + pandn xmm2, [esp+640-400] + por xmm4, xmm2 + movdqa xmm2, [esp+640-528] + psllw xmm5, 1 + paddw xmm5, xmm3 + 
movdqa xmm3, [esp+640-560] + paddw xmm2, xmm5 + psraw xmm2, 3 + movdqa [esp+640-288], xmm4 + movdqa xmm4, [esp+640-560] + pand xmm4, xmm2 + movdqa xmm2, [esp+640-464] + movdqa xmm5, xmm2 + paddw xmm5, xmm2 + movdqa xmm2, [esp+640-432] + paddw xmm2, [esp+640-448] + movdqa xmm7, xmm1 + paddw xmm5, xmm2 + paddw xmm5, [esp+640-624] + movdqa xmm6, [esp+640-560] + psraw xmm5, 2 + pandn xmm3, xmm5 + por xmm4, xmm3 + movdqa xmm3, [esp+640-32] + por xmm3, [esp+640-304] + pand xmm7, xmm4 + movdqa xmm4, [esp+640-432] + movdqa xmm5, [esp+640-464] + movdqa xmm2, xmm1 + pandn xmm2, xmm4 + paddw xmm4, [esp+640-496] + por xmm7, xmm2 + movdqa xmm2, [esp+640-288] + packuswb xmm2, xmm7 + movdqa [esp+704-272], xmm2 - movdqa xmm2, xmm0 - pand xmm2, xmm3 - movdqa xmm3, xmm0 - pandn xmm3, [esp+640-384] - por xmm2, xmm3 - movdqa [esp+640-304], xmm2 - movdqa xmm2, [esp+640-528] - movdqa xmm3, xmm2 - paddw xmm3, [esp+640-464] - paddw xmm3, xmm4 - paddw xmm3, [esp+640-624] - psraw xmm3, 2 - pand xmm6, xmm3 - movdqa xmm3, [esp+640-560] - movdqa xmm4, xmm3 - pandn xmm4, xmm5 - por xmm6, xmm4 - movdqa xmm7, xmm1 - pand xmm7, xmm6 - movdqa xmm6, [esp+640-304] - movdqa xmm4, xmm1 - pandn xmm4, xmm5 - por xmm7, xmm4 + movdqa xmm2, xmm0 + pand xmm2, xmm3 + movdqa xmm3, xmm0 + pandn xmm3, [esp+640-384] + por xmm2, xmm3 + movdqa [esp+640-304], xmm2 + movdqa xmm2, [esp+640-528] + movdqa xmm3, xmm2 + paddw xmm3, [esp+640-464] + paddw xmm3, xmm4 + paddw xmm3, [esp+640-624] + psraw xmm3, 2 + pand xmm6, xmm3 + movdqa xmm3, [esp+640-560] + movdqa xmm4, xmm3 + pandn xmm4, xmm5 + por xmm6, xmm4 + movdqa xmm7, xmm1 + pand xmm7, xmm6 + movdqa xmm6, [esp+640-304] + movdqa xmm4, xmm1 + pandn xmm4, xmm5 + por xmm7, xmm4 - movdqa xmm4, xmm0 - pandn xmm0, [esp+640-416] - packuswb xmm6, xmm7 - movdqa xmm7, [esp+640-112] - por xmm7, [esp+640-80] - pand xmm4, xmm7 - por xmm4, xmm0 - movdqa xmm0, [esp+752-272] - punpckhbw xmm0, [esp+640-48] - psllw xmm0, 1 - paddw xmm0, xmm2 - paddw xmm0, xmm2 - paddw xmm0, xmm2 - paddw xmm0, xmm5 - paddw xmm0, [esp+640-432] - paddw xmm0, [esp+640-496] - paddw xmm0, [esp+640-592] - psraw xmm0, 3 - pand xmm0, xmm3 - movdqa xmm7, xmm1 - pandn xmm3, xmm2 - por xmm0, xmm3 - pand xmm7, xmm0 + movdqa xmm4, xmm0 + pandn xmm0, [esp+640-416] + packuswb xmm6, xmm7 + movdqa xmm7, [esp+640-112] + por xmm7, [esp+640-80] + pand xmm4, xmm7 + por xmm4, xmm0 + movdqa xmm0, [esp+752-272] + punpckhbw xmm0, [esp+640-48] + psllw xmm0, 1 + paddw xmm0, xmm2 + paddw xmm0, xmm2 + paddw xmm0, xmm2 + paddw xmm0, xmm5 + paddw xmm0, [esp+640-432] + paddw xmm0, [esp+640-496] + paddw xmm0, [esp+640-592] + psraw xmm0, 3 + pand xmm0, xmm3 + movdqa xmm7, xmm1 + pandn xmm3, xmm2 + por xmm0, xmm3 + pand xmm7, xmm0 - movdqa xmm0, [esp+656-272] - movdqa [edx], xmm0 + movdqa xmm0, [esp+656-272] + movdqa [edx], xmm0 - movdqa xmm0, [esp+672-272] + movdqa xmm0, [esp+672-272] - mov edx, dword [esp+640-596] - movdqa [esi], xmm0 - movdqa xmm0, [esp+688-272] - movdqa [edi], xmm0 - movdqa xmm0, [esp+704-272] + mov edx, dword [esp+640-596] + movdqa [esi], xmm0 + movdqa xmm0, [esp+688-272] + movdqa [edi], xmm0 + movdqa xmm0, [esp+704-272] - pop edi - pandn xmm1, xmm2 - movdqa [eax], xmm0 - por xmm7, xmm1 - pop esi - packuswb xmm4, xmm7 - movdqa [edx], xmm6 - movdqa [ecx], xmm4 - pop ebx - mov esp, ebp - pop ebp - ret + pop edi + pandn xmm1, xmm2 + movdqa [eax], xmm0 + por xmm7, xmm1 + pop esi + packuswb xmm4, xmm7 + movdqa [edx], xmm6 + movdqa [ecx], xmm4 + pop ebx + mov esp, ebp + pop ebp + ret %endif diff --git a/codec/common/x86/expand_picture.asm 
b/codec/common/x86/expand_picture.asm
index f39442ba..a3402bbe 100644
--- a/codec/common/x86/expand_picture.asm
+++ b/codec/common/x86/expand_picture.asm
@@ -77,280 +77,280 @@ SECTION .text
 ;cccc|ceeeeeeeeeeeeeeeed|dddd
 ;cccc|ceeeeeeeeeeeeeeeed|dddd
-%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
-    movq [%1], %3
-    movq [%1+%2], %3
-    lea %1, [%1+2*%2]
-    movq [%1], %3
-    movq [%1+%2], %3
-    lea %1, [%1+2*%2]
+%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+2*%2]
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+2*%2]
 %endmacro
-%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
-    movq [%1], %3
-    movq [%1+%2], %3
-    lea %1, [%1+2*%2]
-    movq [%1], %3
-    movq [%1+%2], %3
-    lea %1, [%1+%2]
+%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+2*%2]
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+%2]
 %endmacro
-%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
-    movdq%4 [%1], %3 ; top(bottom)_0
-    movdq%4 [%1+%2], %3 ; top(bottom)_1
-    lea %1, [%1+2*%2]
-    movdq%4 [%1], %3 ; top(bottom)_2
-    movdq%4 [%1+%2], %3 ; top(bottom)_3
-    lea %1, [%1+2*%2]
+%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
+    movdq%4 [%1], %3 ; top(bottom)_0
+    movdq%4 [%1+%2], %3 ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdq%4 [%1], %3 ; top(bottom)_2
+    movdq%4 [%1+%2], %3 ; top(bottom)_3
+    lea %1, [%1+2*%2]
 %endmacro
-%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
-    movdq%4 [%1], %3 ; top(bottom)_0
-    movdq%4 [%1+%2], %3 ; top(bottom)_1
-    lea %1, [%1+2*%2]
-    movdq%4 [%1], %3 ; top(bottom)_2
-    movdq%4 [%1+%2], %3 ; top(bottom)_3
-    lea %1, [%1+%2]
+%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
+    movdq%4 [%1], %3 ; top(bottom)_0
+    movdq%4 [%1+%2], %3 ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdq%4 [%1], %3 ; top(bottom)_2
+    movdq%4 [%1+%2], %3 ; top(bottom)_3
+    lea %1, [%1+%2]
 %endmacro
-%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
-    movdqa [%1], %3 ; top(bottom)_0
-    movdqa [%1+16], %3 ; top(bottom)_0
-    movdqa [%1+%2], %3 ; top(bottom)_1
-    movdqa [%1+%2+16], %3 ; top(bottom)_1
-    lea %1, [%1+2*%2]
-    movdqa [%1], %3 ; top(bottom)_2
-    movdqa [%1+16], %3 ; top(bottom)_2
-    movdqa [%1+%2], %3 ; top(bottom)_3
-    movdqa [%1+%2+16], %3 ; top(bottom)_3
-    lea %1, [%1+2*%2]
+%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
+    movdqa [%1], %3 ; top(bottom)_0
+    movdqa [%1+16], %3 ; top(bottom)_0
+    movdqa [%1+%2], %3 ; top(bottom)_1
+    movdqa [%1+%2+16], %3 ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdqa [%1], %3 ; top(bottom)_2
+    movdqa [%1+16], %3 ; top(bottom)_2
+    movdqa [%1+%2], %3 ; top(bottom)_3
+    movdqa [%1+%2+16], %3 ; top(bottom)_3
+    lea %1, [%1+2*%2]
 %endmacro
-%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
-    movdqa [%1], %3 ; top(bottom)_0
-    movdqa [%1+16], %3 ; top(bottom)_0
-    movdqa [%1+%2], %3 ; top(bottom)_1
-    movdqa [%1+%2+16], %3 ; top(bottom)_1
-    lea %1, [%1+2*%2]
-    movdqa [%1], %3 ; top(bottom)_2
-    movdqa [%1+16], %3 ; top(bottom)_2
-    movdqa [%1+%2], %3 ; top(bottom)_3
-    movdqa [%1+%2+16], %3 ; top(bottom)_3
-    lea %1, [%1+%2]
+%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
+ movdqa [%1], %3 ; top(bottom)_0 + movdqa [%1+16], %3 ; top(bottom)_0 + movdqa [%1+%2], %3 ; top(bottom)_1 + movdqa [%1+%2+16], %3 ; top(bottom)_1 + lea %1, [%1+2*%2] + movdqa [%1], %3 ; top(bottom)_2 + movdqa [%1+16], %3 ; top(bottom)_2 + movdqa [%1+%2], %3 ; top(bottom)_3 + movdqa [%1+%2+16], %3 ; top(bottom)_3 + lea %1, [%1+%2] %endmacro -%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)] +%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)] ;r2 [width/16(8)] ;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top ;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom -%if %1 == 32 ; for luma - sar r2, 04h ; width / 16(8) pixels +%if %1 == 32 ; for luma + sar r2, 04h ; width / 16(8) pixels .top_bottom_loops: - ; top - movdqa xmm0, [r0] ; first line of picture pData - mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm? - mov_line_16x4_sse2 r5, r1, xmm0, a - mov_line_16x4_sse2 r5, r1, xmm0, a - mov_line_16x4_sse2 r5, r1, xmm0, a - mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm? - mov_line_16x4_sse2 r5, r1, xmm0, a - mov_line_16x4_sse2 r5, r1, xmm0, a - mov_line_end16x4_sse2 r5, r1, xmm0, a + ; top + movdqa xmm0, [r0] ; first line of picture pData + mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm? + mov_line_16x4_sse2 r5, r1, xmm0, a + mov_line_16x4_sse2 r5, r1, xmm0, a + mov_line_16x4_sse2 r5, r1, xmm0, a + mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm? + mov_line_16x4_sse2 r5, r1, xmm0, a + mov_line_16x4_sse2 r5, r1, xmm0, a + mov_line_end16x4_sse2 r5, r1, xmm0, a - ; bottom - movdqa xmm1, [r3] ; last line of picture pData - mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm? - mov_line_16x4_sse2 r4, r1, xmm1, a - mov_line_16x4_sse2 r4, r1, xmm1, a - mov_line_16x4_sse2 r4, r1, xmm1, a - mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm? - mov_line_16x4_sse2 r4, r1, xmm1, a - mov_line_16x4_sse2 r4, r1, xmm1, a - mov_line_end16x4_sse2 r4, r1, xmm1, a + ; bottom + movdqa xmm1, [r3] ; last line of picture pData + mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm? + mov_line_16x4_sse2 r4, r1, xmm1, a + mov_line_16x4_sse2 r4, r1, xmm1, a + mov_line_16x4_sse2 r4, r1, xmm1, a + mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm? + mov_line_16x4_sse2 r4, r1, xmm1, a + mov_line_16x4_sse2 r4, r1, xmm1, a + mov_line_end16x4_sse2 r4, r1, xmm1, a - lea r0, [r0+16] ; top pSrc - lea r5, [r5+16] ; top dst - lea r3, [r3+16] ; bottom pSrc - lea r4, [r4+16] ; bottom dst - neg r1 ; positive/negative stride need for next loop? + lea r0, [r0+16] ; top pSrc + lea r5, [r5+16] ; top dst + lea r3, [r3+16] ; bottom pSrc + lea r4, [r4+16] ; bottom dst + neg r1 ; positive/negative stride need for next loop? - dec r2 - jnz near .top_bottom_loops -%elif %1 == 16 ; for chroma ?? - mov r6, r2 - sar r2, 04h ; (width / 16) pixels + dec r2 + jnz near .top_bottom_loops +%elif %1 == 16 ; for chroma ?? + mov r6, r2 + sar r2, 04h ; (width / 16) pixels .top_bottom_loops: - ; top - movdqa xmm0, [r0] ; first line of picture pData - mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm? - mov_line_16x4_sse2 r5, r1, xmm0, a - mov_line_16x4_sse2 r5, r1, xmm0, a - mov_line_end16x4_sse2 r5, r1, xmm0, a + ; top + movdqa xmm0, [r0] ; first line of picture pData + mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm? + mov_line_16x4_sse2 r5, r1, xmm0, a + mov_line_16x4_sse2 r5, r1, xmm0, a + mov_line_end16x4_sse2 r5, r1, xmm0, a - ; bottom - movdqa xmm1, [r3] ; last line of picture pData - mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm? 
- mov_line_16x4_sse2 r4, r1, xmm1, a - mov_line_16x4_sse2 r4, r1, xmm1, a - mov_line_end16x4_sse2 r4, r1, xmm1, a + ; bottom + movdqa xmm1, [r3] ; last line of picture pData + mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm? + mov_line_16x4_sse2 r4, r1, xmm1, a + mov_line_16x4_sse2 r4, r1, xmm1, a + mov_line_end16x4_sse2 r4, r1, xmm1, a - lea r0, [r0+16] ; top pSrc - lea r5, [r5+16] ; top dst - lea r3, [r3+16] ; bottom pSrc - lea r4, [r4+16] ; bottom dst - neg r1 ; positive/negative stride need for next loop? + lea r0, [r0+16] ; top pSrc + lea r5, [r5+16] ; top dst + lea r3, [r3+16] ; bottom pSrc + lea r4, [r4+16] ; bottom dst + neg r1 ; positive/negative stride need for next loop? - dec r2 - jnz near .top_bottom_loops + dec r2 + jnz near .top_bottom_loops - ; for remaining 8 bytes - and r6, 0fh ; any 8 bytes left? - test r6, r6 - jz near .to_be_continued ; no left to exit here + ; for remaining 8 bytes + and r6, 0fh ; any 8 bytes left? + test r6, r6 + jz near .to_be_continued ; no left to exit here - ; top - movq mm0, [r0] ; remained 8 byte - mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm? - mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm? - mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm? - mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm? - ; bottom - movq mm1, [r3] - mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm? - mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm? - mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm? - mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm? - WELSEMMS + ; top + movq mm0, [r0] ; remained 8 byte + mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm? + mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm? + mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm? + mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm? + ; bottom + movq mm1, [r3] + mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm? + mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm? + mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm? + mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm? + WELSEMMS .to_be_continued: %endif %endmacro -%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a +%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a ;r6 [height] ;r0 [pSrc+0] r5[pSrc-32] r1[stride] ;r3 [pSrc+(w-1)] r4[pSrc+w] -%if %1 == 32 ; for luma +%if %1 == 32 ; for luma .left_right_loops: - ; left - movzx r2d, byte [r0] ; pixel pData for left border - SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d] - movdqa [r5], xmm0 - movdqa [r5+16], xmm0 + ; left + movzx r2d, byte [r0] ; pixel pData for left border + SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d] + movdqa [r5], xmm0 + movdqa [r5+16], xmm0 - ; right - movzx r2d, byte [r3] - SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d] - movdqa [r4], xmm1 - movdqa [r4+16], xmm1 + ; right + movzx r2d, byte [r3] + SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d] + movdqa [r4], xmm1 + movdqa [r4+16], xmm1 - lea r0, [r0+r1] ; left pSrc - lea r5, [r5+r1] ; left dst - lea r3, [r3+r1] ; right pSrc - lea r4, [r4+r1] ; right dst + lea r0, [r0+r1] ; left pSrc + lea r5, [r5+r1] ; left dst + lea r3, [r3+r1] ; right pSrc + lea r4, [r4+r1] ; right dst - dec r6 - jnz near .left_right_loops -%elif %1 == 16 ; for chroma ?? + dec r6 + jnz near .left_right_loops +%elif %1 == 16 ; for chroma ?? 
.left_right_loops: - ; left - movzx r2d, byte [r0] ; pixel pData for left border - SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d] - movdqa [r5], xmm0 + ; left + movzx r2d, byte [r0] ; pixel pData for left border + SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d] + movdqa [r5], xmm0 - ; right - movzx r2d, byte [r3] - SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d] - movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes + ; right + movzx r2d, byte [r3] + SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d] + movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes - lea r0, [r0+r1] ; left pSrc - lea r5, [r5+r1] ; left dst - lea r3, [r3+r1] ; right pSrc - lea r4, [r4+r1] ; right dst + lea r0, [r0+r1] ; left pSrc + lea r5, [r5+r1] ; left dst + lea r3, [r3+r1] ; right pSrc + lea r4, [r4+r1] ; right dst - dec r6 - jnz near .left_right_loops + dec r6 + jnz near .left_right_loops %endif %endmacro -%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a - ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6 - ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride +%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a + ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6 + ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride ;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride -%if %1 == 32 ; luma - ; TL - mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? - mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? - mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? - mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? - mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? - mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? - mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? - mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? +%if %1 == 32 ; luma + ; TL + mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? + mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? + mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? + mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? + mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? + mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? + mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? + mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm? - ; TR - mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? - mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? - mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? - mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? - mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? - mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? - mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? - mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? + ; TR + mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? + mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? + mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? + mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? + mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? + mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? + mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? + mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm? - ; BL - mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? - mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? - mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? - mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? 
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? - mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? - mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? - mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? + ; BL + mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? + mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? + mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? + mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? + mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? + mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? + mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? + mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm? - ; BR - mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? - mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? - mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? - mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? - mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? - mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? - mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? - mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? -%elif %1 == 16 ; chroma - ; TL - mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm? - mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm? - mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm? - mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm? + ; BR + mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? + mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? + mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? + mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? + mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? + mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? + mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? + mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm? +%elif %1 == 16 ; chroma + ; TL + mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm? + mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm? + mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm? + mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm? - ; TR - mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm? - mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm? - mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm? - mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm? + ; TR + mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm? + mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm? + mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm? + mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm? - ; BL - mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm? - mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm? - mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm? - mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm? + ; BL + mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm? + mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm? + mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm? + mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm? - ; BR - mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm? - mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm? - mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm? - mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm? + ; BR + mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm? + mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm? + mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm? + mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm? 
%endif %endmacro ;***********************************************************************---------------- -; void ExpandPictureLuma_sse2( uint8_t *pDst, -; const int32_t iStride, -; const int32_t iWidth, -; const int32_t iHeight ); +; void ExpandPictureLuma_sse2( uint8_t *pDst, +; const int32_t iStride, +; const int32_t iWidth, +; const int32_t iHeight ); ;***********************************************************************---------------- WELS_EXTERN ExpandPictureLuma_sse2 @@ -403,8 +403,8 @@ WELS_EXTERN ExpandPictureLuma_sse2 exp_top_bottom_sse2 32 - ; for both left and right border - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; for both left and right border + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; pop r2 pop r1 @@ -416,8 +416,8 @@ WELS_EXTERN ExpandPictureLuma_sse2 lea r4,[r3+1] ;right border dst ;prepare for cross border data: top-rigth with xmm4 - movzx r6d,byte [r3] ;top -rigth - SSE2_Copy16Times xmm4,r6d + movzx r6d,byte [r3] ;top -rigth + SSE2_Copy16Times xmm4,r6d neg r1 ;r1 = stride @@ -438,8 +438,8 @@ WELS_EXTERN ExpandPictureLuma_sse2 pop r1 pop r0 - ; for cross border [top-left, top-right, bottom-left, bottom-right] - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; for cross border [top-left, top-right, bottom-left, bottom-right] + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued.. neg r1 ;r1 = -stride @@ -472,13 +472,13 @@ WELS_EXTERN ExpandPictureLuma_sse2 %assign push_num 0 - ret + ret ;***********************************************************************---------------- -; void ExpandPictureChromaAlign_sse2( uint8_t *pDst, -; const int32_t iStride, -; const int32_t iWidth, -; const int32_t iHeight ); +; void ExpandPictureChromaAlign_sse2( uint8_t *pDst, +; const int32_t iStride, +; const int32_t iWidth, +; const int32_t iHeight ); ;***********************************************************************---------------- WELS_EXTERN ExpandPictureChromaAlign_sse2 @@ -531,8 +531,8 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2 exp_top_bottom_sse2 16 - ; for both left and right border - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; for both left and right border + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; pop r2 pop r1 @@ -557,7 +557,7 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2 push r0 push r1 push r2 - push r6 + push r6 exp_left_right_sse2 16,a pop r6 @@ -565,8 +565,8 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2 pop r1 pop r0 - ; for cross border [top-left, top-right, bottom-left, bottom-right] - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; for cross border [top-left, top-right, bottom-left, bottom-right] + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued.. 
neg r1 ;r1 = -stride @@ -599,16 +599,16 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2 %assign push_num 0 - ret + ret ;***********************************************************************---------------- -; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst, -; const int32_t iStride, -; const int32_t iWidth, -; const int32_t iHeight ); +; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst, +; const int32_t iStride, +; const int32_t iWidth, +; const int32_t iHeight ); ;***********************************************************************---------------- WELS_EXTERN ExpandPictureChromaUnalign_sse2 - push r4 + push r4 push r5 push r6 @@ -657,8 +657,8 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2 exp_top_bottom_sse2 16 - ; for both left and right border - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; for both left and right border + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; pop r2 pop r1 @@ -683,7 +683,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2 push r0 push r1 push r2 - push r6 + push r6 exp_left_right_sse2 16,u pop r6 @@ -691,8 +691,8 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2 pop r1 pop r0 - ; for cross border [top-left, top-right, bottom-left, bottom-right] - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; for cross border [top-left, top-right, bottom-left, bottom-right] + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued.. neg r1 ;r1 = -stride @@ -725,4 +725,4 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2 %assign push_num 0 - ret + ret diff --git a/codec/common/x86/mb_copy.asm b/codec/common/x86/mb_copy.asm index 510748fb..dc680893 100644 --- a/codec/common/x86/mb_copy.asm +++ b/codec/common/x86/mb_copy.asm @@ -36,9 +36,9 @@ ;* ;* History ;* 15/09/2009 Created -;* 12/28/2009 Modified with larger throughput -;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2, -;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc; +;* 12/28/2009 Modified with larger throughput +;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2, +;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc; ;* ;* ;*********************************************************************************************/ @@ -56,174 +56,174 @@ SECTION .text ;*********************************************************************** -; void WelsCopy16x16_sse2( uint8_t* Dst, -; int32_t iStrideD, -; uint8_t* Src, -; int32_t iStrideS ) +; void WelsCopy16x16_sse2( uint8_t* Dst, +; int32_t iStrideD, +; uint8_t* Src, +; int32_t iStrideS ) ;*********************************************************************** WELS_EXTERN WelsCopy16x16_sse2 - push r4 - push r5 - %assign push_num 2 + push r4 + push r5 + %assign push_num 2 LOAD_4_PARA PUSH_XMM 8 - lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 - lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 + lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 + lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 - movdqa xmm0, [r2] - movdqa xmm1, [r2+r3] - movdqa xmm2, [r2+2*r3] - movdqa xmm3, [r2+r5] - lea r2, [r2+4*r3] - movdqa xmm4, [r2] - movdqa xmm5, [r2+r3] - movdqa xmm6, [r2+2*r3] - movdqa xmm7, [r2+r5] - lea r2, [r2+4*r3] + movdqa xmm0, [r2] + movdqa xmm1, [r2+r3] + movdqa xmm2, [r2+2*r3] + movdqa xmm3, [r2+r5] + lea r2, [r2+4*r3] + movdqa xmm4, [r2] + movdqa xmm5, [r2+r3] + movdqa xmm6, [r2+2*r3] + movdqa xmm7, [r2+r5] + lea r2, [r2+4*r3] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm2 - movdqa [r0+r4], xmm3 - lea r0, [r0+4*r1] - movdqa [r0], xmm4 - movdqa [r0+r1], xmm5 - movdqa 
[r0+2*r1], xmm6 - movdqa [r0+r4], xmm7 - lea r0, [r0+4*r1] + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm2 + movdqa [r0+r4], xmm3 + lea r0, [r0+4*r1] + movdqa [r0], xmm4 + movdqa [r0+r1], xmm5 + movdqa [r0+2*r1], xmm6 + movdqa [r0+r4], xmm7 + lea r0, [r0+4*r1] - movdqa xmm0, [r2] - movdqa xmm1, [r2+r3] - movdqa xmm2, [r2+2*r3] - movdqa xmm3, [r2+r5] - lea r2, [r2+4*r3] - movdqa xmm4, [r2] - movdqa xmm5, [r2+r3] - movdqa xmm6, [r2+2*r3] - movdqa xmm7, [r2+r5] + movdqa xmm0, [r2] + movdqa xmm1, [r2+r3] + movdqa xmm2, [r2+2*r3] + movdqa xmm3, [r2+r5] + lea r2, [r2+4*r3] + movdqa xmm4, [r2] + movdqa xmm5, [r2+r3] + movdqa xmm6, [r2+2*r3] + movdqa xmm7, [r2+r5] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm2 - movdqa [r0+r4], xmm3 - lea r0, [r0+4*r1] - movdqa [r0], xmm4 - movdqa [r0+r1], xmm5 - movdqa [r0+2*r1], xmm6 - movdqa [r0+r4], xmm7 - POP_XMM - LOAD_4_PARA_POP - pop r5 - pop r4 - ret + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm2 + movdqa [r0+r4], xmm3 + lea r0, [r0+4*r1] + movdqa [r0], xmm4 + movdqa [r0+r1], xmm5 + movdqa [r0+2*r1], xmm6 + movdqa [r0+r4], xmm7 + POP_XMM + LOAD_4_PARA_POP + pop r5 + pop r4 + ret ;*********************************************************************** -; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst, -; int32_t iStrideD, -; uint8_t* Src, -; int32_t iStrideS ) +; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst, +; int32_t iStrideD, +; uint8_t* Src, +; int32_t iStrideS ) ;*********************************************************************** ; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011 WELS_EXTERN WelsCopy16x16NotAligned_sse2 - push r4 - push r5 - %assign push_num 2 + push r4 + push r5 + %assign push_num 2 LOAD_4_PARA PUSH_XMM 8 - lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 - lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 + lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 + lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 - movdqu xmm0, [r2] - movdqu xmm1, [r2+r3] - movdqu xmm2, [r2+2*r3] - movdqu xmm3, [r2+r5] - lea r2, [r2+4*r3] - movdqu xmm4, [r2] - movdqu xmm5, [r2+r3] - movdqu xmm6, [r2+2*r3] - movdqu xmm7, [r2+r5] - lea r2, [r2+4*r3] + movdqu xmm0, [r2] + movdqu xmm1, [r2+r3] + movdqu xmm2, [r2+2*r3] + movdqu xmm3, [r2+r5] + lea r2, [r2+4*r3] + movdqu xmm4, [r2] + movdqu xmm5, [r2+r3] + movdqu xmm6, [r2+2*r3] + movdqu xmm7, [r2+r5] + lea r2, [r2+4*r3] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm2 - movdqa [r0+r4], xmm3 - lea r0, [r0+4*r1] - movdqa [r0], xmm4 - movdqa [r0+r1], xmm5 - movdqa [r0+2*r1], xmm6 - movdqa [r0+r4], xmm7 - lea r0, [r0+4*r1] + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm2 + movdqa [r0+r4], xmm3 + lea r0, [r0+4*r1] + movdqa [r0], xmm4 + movdqa [r0+r1], xmm5 + movdqa [r0+2*r1], xmm6 + movdqa [r0+r4], xmm7 + lea r0, [r0+4*r1] - movdqu xmm0, [r2] - movdqu xmm1, [r2+r3] - movdqu xmm2, [r2+2*r3] - movdqu xmm3, [r2+r5] - lea r2, [r2+4*r3] - movdqu xmm4, [r2] - movdqu xmm5, [r2+r3] - movdqu xmm6, [r2+2*r3] - movdqu xmm7, [r2+r5] + movdqu xmm0, [r2] + movdqu xmm1, [r2+r3] + movdqu xmm2, [r2+2*r3] + movdqu xmm3, [r2+r5] + lea r2, [r2+4*r3] + movdqu xmm4, [r2] + movdqu xmm5, [r2+r3] + movdqu xmm6, [r2+2*r3] + movdqu xmm7, [r2+r5] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm2 - movdqa [r0+r4], xmm3 - lea r0, [r0+4*r1] - movdqa [r0], xmm4 - movdqa [r0+r1], xmm5 - movdqa [r0+2*r1], xmm6 - movdqa [r0+r4], xmm7 - POP_XMM - LOAD_4_PARA_POP - pop r5 - pop r4 - ret + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa 
[r0+2*r1], xmm2 + movdqa [r0+r4], xmm3 + lea r0, [r0+4*r1] + movdqa [r0], xmm4 + movdqa [r0+r1], xmm5 + movdqa [r0+2*r1], xmm6 + movdqa [r0+r4], xmm7 + POP_XMM + LOAD_4_PARA_POP + pop r5 + pop r4 + ret ; , 12/29/2011 ;*********************************************************************** ; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst, -; int32_t iStrideD, -; uint8_t* Src, -; int32_t iStrideS ) +; int32_t iStrideD, +; uint8_t* Src, +; int32_t iStrideS ) ;*********************************************************************** WELS_EXTERN WelsCopy16x8NotAligned_sse2 - push r4 - push r5 - %assign push_num 2 + push r4 + push r5 + %assign push_num 2 LOAD_4_PARA PUSH_XMM 8 - lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 - lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 + lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3 + lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3 - movdqu xmm0, [r2] - movdqu xmm1, [r2+r3] - movdqu xmm2, [r2+2*r3] - movdqu xmm3, [r2+r5] - lea r2, [r2+4*r3] - movdqu xmm4, [r2] - movdqu xmm5, [r2+r3] - movdqu xmm6, [r2+2*r3] - movdqu xmm7, [r2+r5] + movdqu xmm0, [r2] + movdqu xmm1, [r2+r3] + movdqu xmm2, [r2+2*r3] + movdqu xmm3, [r2+r5] + lea r2, [r2+4*r3] + movdqu xmm4, [r2] + movdqu xmm5, [r2+r3] + movdqu xmm6, [r2+2*r3] + movdqu xmm7, [r2+r5] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm2 - movdqa [r0+r4], xmm3 - lea r0, [r0+4*r1] - movdqa [r0], xmm4 - movdqa [r0+r1], xmm5 - movdqa [r0+2*r1], xmm6 - movdqa [r0+r4], xmm7 - POP_XMM - LOAD_4_PARA_POP - pop r5 - pop r4 - ret + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm2 + movdqa [r0+r4], xmm3 + lea r0, [r0+4*r1] + movdqa [r0], xmm4 + movdqa [r0+r1], xmm5 + movdqa [r0+2*r1], xmm6 + movdqa [r0+r4], xmm7 + POP_XMM + LOAD_4_PARA_POP + pop r5 + pop r4 + ret ;*********************************************************************** @@ -233,62 +233,62 @@ WELS_EXTERN WelsCopy16x8NotAligned_sse2 ; int32_t iStrideS ) ;*********************************************************************** WELS_EXTERN WelsCopy8x16_mmx - %assign push_num 0 + %assign push_num 0 LOAD_4_PARA - movq mm0, [r2] - movq mm1, [r2+r3] - lea r2, [r2+2*r3] - movq mm2, [r2] - movq mm3, [r2+r3] - lea r2, [r2+2*r3] - movq mm4, [r2] - movq mm5, [r2+r3] - lea r2, [r2+2*r3] - movq mm6, [r2] - movq mm7, [r2+r3] - lea r2, [r2+2*r3] + movq mm0, [r2] + movq mm1, [r2+r3] + lea r2, [r2+2*r3] + movq mm2, [r2] + movq mm3, [r2+r3] + lea r2, [r2+2*r3] + movq mm4, [r2] + movq mm5, [r2+r3] + lea r2, [r2+2*r3] + movq mm6, [r2] + movq mm7, [r2+r3] + lea r2, [r2+2*r3] - movq [r0], mm0 - movq [r0+r1], mm1 - lea r0, [r0+2*r1] - movq [r0], mm2 - movq [r0+r1], mm3 - lea r0, [r0+2*r1] - movq [r0], mm4 - movq [r0+r1], mm5 - lea r0, [r0+2*r1] - movq [r0], mm6 - movq [r0+r1], mm7 - lea r0, [r0+2*r1] + movq [r0], mm0 + movq [r0+r1], mm1 + lea r0, [r0+2*r1] + movq [r0], mm2 + movq [r0+r1], mm3 + lea r0, [r0+2*r1] + movq [r0], mm4 + movq [r0+r1], mm5 + lea r0, [r0+2*r1] + movq [r0], mm6 + movq [r0+r1], mm7 + lea r0, [r0+2*r1] - movq mm0, [r2] - movq mm1, [r2+r3] - lea r2, [r2+2*r3] - movq mm2, [r2] - movq mm3, [r2+r3] - lea r2, [r2+2*r3] - movq mm4, [r2] - movq mm5, [r2+r3] - lea r2, [r2+2*r3] - movq mm6, [r2] - movq mm7, [r2+r3] + movq mm0, [r2] + movq mm1, [r2+r3] + lea r2, [r2+2*r3] + movq mm2, [r2] + movq mm3, [r2+r3] + lea r2, [r2+2*r3] + movq mm4, [r2] + movq mm5, [r2+r3] + lea r2, [r2+2*r3] + movq mm6, [r2] + movq mm7, [r2+r3] - movq [r0], mm0 - movq [r0+r1], mm1 - lea r0, [r0+2*r1] - movq [r0], mm2 - movq [r0+r1], mm3 - lea r0, [r0+2*r1] - movq [r0], mm4 - movq [r0+r1], mm5 - 
lea r0, [r0+2*r1] - movq [r0], mm6 - movq [r0+r1], mm7 + movq [r0], mm0 + movq [r0+r1], mm1 + lea r0, [r0+2*r1] + movq [r0], mm2 + movq [r0+r1], mm3 + lea r0, [r0+2*r1] + movq [r0], mm4 + movq [r0+r1], mm5 + lea r0, [r0+2*r1] + movq [r0], mm6 + movq [r0+r1], mm7 - WELSEMMS - LOAD_4_PARA_POP - ret + WELSEMMS + LOAD_4_PARA_POP + ret ;*********************************************************************** ; void WelsCopy8x8_mmx( uint8_t* Dst, @@ -297,48 +297,48 @@ WELS_EXTERN WelsCopy8x16_mmx ; int32_t iStrideS ) ;*********************************************************************** WELS_EXTERN WelsCopy8x8_mmx - push r4 - %assign push_num 1 + push r4 + %assign push_num 1 LOAD_4_PARA - lea r4, [r3+2*r3] ;edx, [ebx+2*ebx] + lea r4, [r3+2*r3] ;edx, [ebx+2*ebx] - ; to prefetch next loop - prefetchnta [r2+2*r3] - prefetchnta [r2+r4] - movq mm0, [r2] - movq mm1, [r2+r3] - lea r2, [r2+2*r3] - ; to prefetch next loop - prefetchnta [r2+2*r3] - prefetchnta [r2+r4] - movq mm2, [r2] - movq mm3, [r2+r3] - lea r2, [r2+2*r3] - ; to prefetch next loop - prefetchnta [r2+2*r3] - prefetchnta [r2+r4] - movq mm4, [r2] - movq mm5, [r2+r3] - lea r2, [r2+2*r3] - movq mm6, [r2] - movq mm7, [r2+r3] + ; to prefetch next loop + prefetchnta [r2+2*r3] + prefetchnta [r2+r4] + movq mm0, [r2] + movq mm1, [r2+r3] + lea r2, [r2+2*r3] + ; to prefetch next loop + prefetchnta [r2+2*r3] + prefetchnta [r2+r4] + movq mm2, [r2] + movq mm3, [r2+r3] + lea r2, [r2+2*r3] + ; to prefetch next loop + prefetchnta [r2+2*r3] + prefetchnta [r2+r4] + movq mm4, [r2] + movq mm5, [r2+r3] + lea r2, [r2+2*r3] + movq mm6, [r2] + movq mm7, [r2+r3] - movq [r0], mm0 - movq [r0+r1], mm1 - lea r0, [r0+2*r1] - movq [r0], mm2 - movq [r0+r1], mm3 - lea r0, [r0+2*r1] - movq [r0], mm4 - movq [r0+r1], mm5 - lea r0, [r0+2*r1] - movq [r0], mm6 - movq [r0+r1], mm7 + movq [r0], mm0 + movq [r0+r1], mm1 + lea r0, [r0+2*r1] + movq [r0], mm2 + movq [r0+r1], mm3 + lea r0, [r0+2*r1] + movq [r0], mm4 + movq [r0+r1], mm5 + lea r0, [r0+2*r1] + movq [r0], mm6 + movq [r0+r1], mm7 - WELSEMMS - LOAD_4_PARA_POP - pop r4 - ret + WELSEMMS + LOAD_4_PARA_POP + pop r4 + ret ; (dunhuang@cisco), 12/21/2011 ;*********************************************************************** @@ -349,13 +349,13 @@ WELS_EXTERN UpdateMbMv_sse2 %assign push_num 0 LOAD_2_PARA - movd xmm0, r1d ; _mv - pshufd xmm1, xmm0, $00 - movdqa [r0 ], xmm1 - movdqa [r0+0x10], xmm1 - movdqa [r0+0x20], xmm1 - movdqa [r0+0x30], xmm1 - ret + movd xmm0, r1d ; _mv + pshufd xmm1, xmm0, $00 + movdqa [r0 ], xmm1 + movdqa [r0+0x10], xmm1 + movdqa [r0+0x20], xmm1 + movdqa [r0+0x30], xmm1 + ret ;******************************************************************************* ; Macros and other preprocessor constants @@ -381,14 +381,14 @@ WELS_EXTERN PixelAvgWidthEq4_mmx %assign push_num 0 LOAD_7_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r5, r5d - SIGN_EXTENSION r6, r6d + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d + SIGN_EXTENSION r6, r6d ALIGN 4 .height_loop: - movd mm0, [r4] + movd mm0, [r4] pavgb mm0, [r2] movd [r0], mm0 @@ -398,8 +398,8 @@ ALIGN 4 lea r4, [r4+r5] jne .height_loop - WELSEMMS - LOAD_7_PARA_POP + WELSEMMS + LOAD_7_PARA_POP ret @@ -413,29 +413,29 @@ WELS_EXTERN PixelAvgWidthEq8_mmx %assign push_num 0 LOAD_7_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r5, r5d - SIGN_EXTENSION r6, r6d + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d + SIGN_EXTENSION r6, r6d ALIGN 4 .height_loop: - movq mm0, [r2] + movq 
mm0, [r2] pavgb mm0, [r4] movq [r0], mm0 movq mm0, [r2+r3] pavgb mm0, [r4+r5] - movq [r0+r1], mm0 + movq [r0+r1], mm0 - lea r2, [r2+2*r3] - lea r4, [r4+2*r5] - lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r4, [r4+2*r5] + lea r0, [r0+2*r1] sub r6, 2 jnz .height_loop - WELSEMMS - LOAD_7_PARA_POP + WELSEMMS + LOAD_7_PARA_POP ret @@ -450,46 +450,46 @@ WELS_EXTERN PixelAvgWidthEq16_sse2 %assign push_num 0 LOAD_7_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r5, r5d - SIGN_EXTENSION r6, r6d + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d + SIGN_EXTENSION r6, r6d ALIGN 4 .height_loop: - movdqu xmm0, [r2] - movdqu xmm1, [r4] - pavgb xmm0, xmm1 - ;pavgb xmm0, [r4] + movdqu xmm0, [r2] + movdqu xmm1, [r4] + pavgb xmm0, xmm1 + ;pavgb xmm0, [r4] movdqu [r0], xmm0 - movdqu xmm0, [r2+r3] - movdqu xmm1, [r4+r5] - pavgb xmm0, xmm1 + movdqu xmm0, [r2+r3] + movdqu xmm1, [r4+r5] + pavgb xmm0, xmm1 movdqu [r0+r1], xmm0 - movdqu xmm0, [r2+2*r3] - movdqu xmm1, [r4+2*r5] - pavgb xmm0, xmm1 + movdqu xmm0, [r2+2*r3] + movdqu xmm1, [r4+2*r5] + pavgb xmm0, xmm1 movdqu [r0+2*r1], xmm0 lea r2, [r2+2*r3] - lea r4, [r4+2*r5] - lea r0, [r0+2*r1] + lea r4, [r4+2*r5] + lea r0, [r0+2*r1] - movdqu xmm0, [r2+r3] - movdqu xmm1, [r4+r5] - pavgb xmm0, xmm1 + movdqu xmm0, [r2+r3] + movdqu xmm1, [r4+r5] + pavgb xmm0, xmm1 movdqu [r0+r1], xmm0 lea r2, [r2+2*r3] - lea r4, [r4+2*r5] - lea r0, [r0+2*r1] + lea r4, [r4+2*r5] + lea r0, [r0+2*r1] sub r6, 4 jne .height_loop - WELSEMMS - LOAD_7_PARA_POP + WELSEMMS + LOAD_7_PARA_POP ret ;******************************************************************************* @@ -497,26 +497,26 @@ ALIGN 4 ; uint8_t *pDst, int iDstStride, int iHeight ) ;******************************************************************************* WELS_EXTERN McCopyWidthEq4_mmx - push r5 + push r5 %assign push_num 1 LOAD_5_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d ALIGN 4 .height_loop: - mov r5d, [r0] - mov [r2], r5d + mov r5d, [r0] + mov [r2], r5d - add r0, r1 - add r2, r3 - dec r4 - jnz .height_loop - WELSEMMS + add r0, r1 + add r2, r3 + dec r4 + jnz .height_loop + WELSEMMS LOAD_5_PARA_POP - pop r5 + pop r5 ret ;******************************************************************************* @@ -527,21 +527,21 @@ WELS_EXTERN McCopyWidthEq8_mmx %assign push_num 0 LOAD_5_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d ALIGN 4 .height_loop: - movq mm0, [r0] - movq [r2], mm0 - add r0, r1 - add r2, r3 - dec r4 - jnz .height_loop + movq mm0, [r0] + movq [r2], mm0 + add r0, r1 + add r2, r3 + dec r4 + jnz .height_loop - WELSEMMS - LOAD_5_PARA_POP + WELSEMMS + LOAD_5_PARA_POP ret @@ -550,32 +550,32 @@ ALIGN 4 ;******************************************************************************* ;read unaligned memory %macro SSE_READ_UNA 2 - movq %1, [%2] - movhps %1, [%2+8] + movq %1, [%2] + movhps %1, [%2+8] %endmacro ;write unaligned memory %macro SSE_WRITE_UNA 2 - movq [%1], %2 - movhps [%1+8], %2 + movq [%1], %2 + movhps [%1+8], %2 %endmacro WELS_EXTERN McCopyWidthEq16_sse2 %assign push_num 0 LOAD_5_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d ALIGN 4 .height_loop: - SSE_READ_UNA xmm0, r0 - SSE_READ_UNA xmm1, r0+r1 - SSE_WRITE_UNA r2, xmm0 - 
SSE_WRITE_UNA r2+r3, xmm1 + SSE_READ_UNA xmm0, r0 + SSE_READ_UNA xmm1, r0+r1 + SSE_WRITE_UNA r2, xmm0 + SSE_WRITE_UNA r2+r3, xmm1 - sub r4, 2 + sub r4, 2 lea r0, [r0+r1*2] lea r2, [r2+r3*2] jnz .height_loop - LOAD_5_PARA_POP + LOAD_5_PARA_POP ret diff --git a/codec/common/x86/mc_chroma.asm b/codec/common/x86/mc_chroma.asm index f0c214d4..cf02ef6e 100644 --- a/codec/common/x86/mc_chroma.asm +++ b/codec/common/x86/mc_chroma.asm @@ -53,10 +53,10 @@ SECTION .rodata align=16 ALIGN 16 h264_d0x20_sse2: - dw 32,32,32,32,32,32,32,32 + dw 32,32,32,32,32,32,32,32 ALIGN 16 h264_d0x20_mmx: - dw 32,32,32,32 + dw 32,32,32,32 ;============================================================================= @@ -67,171 +67,171 @@ SECTION .text ;******************************************************************************* ; void McChromaWidthEq4_mmx( const uint8_t *src, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; const uint8_t *pABCD, -; int32_t iHeigh ); +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; const uint8_t *pABCD, +; int32_t iHeigh ); ;******************************************************************************* WELS_EXTERN McChromaWidthEq4_mmx - %assign push_num 0 - LOAD_6_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r5, r5d + %assign push_num 0 + LOAD_6_PARA + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d - movd mm3, [r4]; [eax] - WELS_Zero mm7 - punpcklbw mm3, mm3 - movq mm4, mm3 - punpcklwd mm3, mm3 - punpckhwd mm4, mm4 + movd mm3, [r4]; [eax] + WELS_Zero mm7 + punpcklbw mm3, mm3 + movq mm4, mm3 + punpcklwd mm3, mm3 + punpckhwd mm4, mm4 - movq mm5, mm3 - punpcklbw mm3, mm7 - punpckhbw mm5, mm7 + movq mm5, mm3 + punpcklbw mm3, mm7 + punpckhbw mm5, mm7 - movq mm6, mm4 - punpcklbw mm4, mm7 - punpckhbw mm6, mm7 + movq mm6, mm4 + punpcklbw mm4, mm7 + punpckhbw mm6, mm7 - lea r4, [r0 + r1] ;lea ebx, [esi + eax] - movd mm0, [r0] - movd mm1, [r0+1] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 + lea r4, [r0 + r1] ;lea ebx, [esi + eax] + movd mm0, [r0] + movd mm1, [r0+1] + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 .xloop: - pmullw mm0, mm3 - pmullw mm1, mm5 - paddw mm0, mm1 + pmullw mm0, mm3 + pmullw mm1, mm5 + paddw mm0, mm1 - movd mm1, [r4] - punpcklbw mm1, mm7 - movq mm2, mm1 - pmullw mm1, mm4 - paddw mm0, mm1 + movd mm1, [r4] + punpcklbw mm1, mm7 + movq mm2, mm1 + pmullw mm1, mm4 + paddw mm0, mm1 - movd mm1, [r4+1] - punpcklbw mm1, mm7 - movq mm7, mm1 - pmullw mm1,mm6 - paddw mm0, mm1 - movq mm1,mm7 + movd mm1, [r4+1] + punpcklbw mm1, mm7 + movq mm7, mm1 + pmullw mm1,mm6 + paddw mm0, mm1 + movq mm1,mm7 - paddw mm0, [h264_d0x20_mmx] - psrlw mm0, 6 + paddw mm0, [h264_d0x20_mmx] + psrlw mm0, 6 - WELS_Zero mm7 - packuswb mm0, mm7 - movd [r2], mm0 + WELS_Zero mm7 + packuswb mm0, mm7 + movd [r2], mm0 - movq mm0, mm2 + movq mm0, mm2 - lea r2, [r2 + r3] - lea r4, [r4 + r1] + lea r2, [r2 + r3] + lea r4, [r4 + r1] - dec r5 - jnz near .xloop - WELSEMMS - LOAD_6_PARA_POP - ret + dec r5 + jnz near .xloop + WELSEMMS + LOAD_6_PARA_POP + ret ;******************************************************************************* ; void McChromaWidthEq8_sse2( const uint8_t *pSrc, -; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; const uint8_t *pABCD, -; int32_t iheigh ); +; int32_t iSrcStride, +; uint8_t *pDst, +; int32_t iDstStride, +; const uint8_t *pABCD, +; int32_t iheigh ); ;******************************************************************************* WELS_EXTERN McChromaWidthEq8_sse2 - %assign push_num 0 - 
LOAD_6_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r5, r5d + %assign push_num 0 + LOAD_6_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d - movd xmm3, [r4] - WELS_Zero xmm7 - punpcklbw xmm3, xmm3 - punpcklwd xmm3, xmm3 + movd xmm3, [r4] + WELS_Zero xmm7 + punpcklbw xmm3, xmm3 + punpcklwd xmm3, xmm3 - movdqa xmm4, xmm3 - punpckldq xmm3, xmm3 - punpckhdq xmm4, xmm4 - movdqa xmm5, xmm3 - movdqa xmm6, xmm4 + movdqa xmm4, xmm3 + punpckldq xmm3, xmm3 + punpckhdq xmm4, xmm4 + movdqa xmm5, xmm3 + movdqa xmm6, xmm4 - punpcklbw xmm3, xmm7 - punpckhbw xmm5, xmm7 - punpcklbw xmm4, xmm7 - punpckhbw xmm6, xmm7 + punpcklbw xmm3, xmm7 + punpckhbw xmm5, xmm7 + punpcklbw xmm4, xmm7 + punpckhbw xmm6, xmm7 - lea r4, [r0 + r1] ;lea ebx, [esi + eax] - movq xmm0, [r0] - movq xmm1, [r0+1] - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 + lea r4, [r0 + r1] ;lea ebx, [esi + eax] + movq xmm0, [r0] + movq xmm1, [r0+1] + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 .xloop: - pmullw xmm0, xmm3 - pmullw xmm1, xmm5 - paddw xmm0, xmm1 + pmullw xmm0, xmm3 + pmullw xmm1, xmm5 + paddw xmm0, xmm1 - movq xmm1, [r4] - punpcklbw xmm1, xmm7 - movdqa xmm2, xmm1 - pmullw xmm1, xmm4 - paddw xmm0, xmm1 + movq xmm1, [r4] + punpcklbw xmm1, xmm7 + movdqa xmm2, xmm1 + pmullw xmm1, xmm4 + paddw xmm0, xmm1 - movq xmm1, [r4+1] - punpcklbw xmm1, xmm7 - movdqa xmm7, xmm1 - pmullw xmm1, xmm6 - paddw xmm0, xmm1 - movdqa xmm1,xmm7 + movq xmm1, [r4+1] + punpcklbw xmm1, xmm7 + movdqa xmm7, xmm1 + pmullw xmm1, xmm6 + paddw xmm0, xmm1 + movdqa xmm1,xmm7 - paddw xmm0, [h264_d0x20_sse2] - psrlw xmm0, 6 + paddw xmm0, [h264_d0x20_sse2] + psrlw xmm0, 6 - WELS_Zero xmm7 - packuswb xmm0, xmm7 - movq [r2], xmm0 + WELS_Zero xmm7 + packuswb xmm0, xmm7 + movq [r2], xmm0 - movdqa xmm0, xmm2 + movdqa xmm0, xmm2 - lea r2, [r2 + r3] - lea r4, [r4 + r1] + lea r2, [r2 + r3] + lea r4, [r4 + r1] - dec r5 - jnz near .xloop + dec r5 + jnz near .xloop - POP_XMM - LOAD_6_PARA_POP + POP_XMM + LOAD_6_PARA_POP - ret + ret ;*********************************************************************** ; void McChromaWidthEq8_ssse3( const uint8_t *pSrc, -; int32_t iSrcStride, +; int32_t iSrcStride, ; uint8_t *pDst, ; int32_t iDstStride, ; const uint8_t *pABCD, -; int32_t iHeigh); +; int32_t iHeigh); ;*********************************************************************** WELS_EXTERN McChromaWidthEq8_ssse3 - %assign push_num 0 - LOAD_6_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r5, r5d + %assign push_num 0 + LOAD_6_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d pxor xmm7, xmm7 movd xmm5, [r4] @@ -243,27 +243,27 @@ WELS_EXTERN McChromaWidthEq8_ssse3 sub r2, r3 ;sub esi, edi sub r2, r3 - movdqa xmm7, [h264_d0x20_sse2] + movdqa xmm7, [h264_d0x20_sse2] - movdqu xmm0, [r0] - movdqa xmm1, xmm0 - psrldq xmm1, 1 - punpcklbw xmm0, xmm1 + movdqu xmm0, [r0] + movdqa xmm1, xmm0 + psrldq xmm1, 1 + punpcklbw xmm0, xmm1 .hloop_chroma: - lea r2, [r2+2*r3] + lea r2, [r2+2*r3] - movdqu xmm2, [r0+r1] - movdqa xmm3, xmm2 - psrldq xmm3, 1 - punpcklbw xmm2, xmm3 - movdqa xmm4, xmm2 + movdqu xmm2, [r0+r1] + movdqa xmm3, xmm2 + psrldq xmm3, 1 + punpcklbw xmm2, xmm3 + movdqa xmm4, xmm2 pmaddubsw xmm0, xmm5 pmaddubsw xmm2, xmm6 paddw xmm0, xmm2 paddw xmm0, xmm7 - psrlw xmm0, 6 + psrlw xmm0, 6 packuswb xmm0, xmm0 movq [r2],xmm0 @@ -278,16 +278,16 @@ WELS_EXTERN McChromaWidthEq8_ssse3 pmaddubsw xmm2, xmm6 paddw xmm4, xmm2 paddw xmm4, xmm7 - 
psrlw xmm4, 6 + psrlw xmm4, 6 packuswb xmm4, xmm4 movq [r2+r3],xmm4 - sub r5, 2 - jnz .hloop_chroma + sub r5, 2 + jnz .hloop_chroma - POP_XMM - LOAD_6_PARA_POP + POP_XMM + LOAD_6_PARA_POP - ret + ret diff --git a/codec/common/x86/mc_luma.asm b/codec/common/x86/mc_luma.asm index d2d2131a..122cc06d 100644 --- a/codec/common/x86/mc_luma.asm +++ b/codec/common/x86/mc_luma.asm @@ -52,13 +52,13 @@ SECTION .rodata align=16 ALIGN 16 h264_w0x10: - dw 16, 16, 16, 16 + dw 16, 16, 16, 16 ALIGN 16 h264_w0x10_1: - dw 16, 16, 16, 16, 16, 16, 16, 16 + dw 16, 16, 16, 16, 16, 16, 16, 16 ALIGN 16 h264_mc_hc_32: - dw 32, 32, 32, 32, 32, 32, 32, 32 + dw 32, 32, 32, 32, 32, 32, 32, 32 ;******************************************************************************* @@ -72,55 +72,55 @@ SECTION .text ;******************************************************************************* ; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc, ; int iSrcStride, -; uint8_t *pDst, -; int iDstStride, -; int iHeight) +; uint8_t *pDst, +; int iDstStride, +; int iHeight) ;******************************************************************************* WELS_EXTERN McHorVer20WidthEq4_mmx %assign push_num 0 LOAD_5_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d - sub r0, 2 - WELS_Zero mm7 - movq mm6, [h264_w0x10] + sub r0, 2 + WELS_Zero mm7 + movq mm6, [h264_w0x10] .height_loop: - movd mm0, [r0] - punpcklbw mm0, mm7 - movd mm1, [r0+5] - punpcklbw mm1, mm7 - movd mm2, [r0+1] - punpcklbw mm2, mm7 - movd mm3, [r0+4] - punpcklbw mm3, mm7 - movd mm4, [r0+2] - punpcklbw mm4, mm7 - movd mm5, [r0+3] - punpcklbw mm5, mm7 + movd mm0, [r0] + punpcklbw mm0, mm7 + movd mm1, [r0+5] + punpcklbw mm1, mm7 + movd mm2, [r0+1] + punpcklbw mm2, mm7 + movd mm3, [r0+4] + punpcklbw mm3, mm7 + movd mm4, [r0+2] + punpcklbw mm4, mm7 + movd mm5, [r0+3] + punpcklbw mm5, mm7 - paddw mm2, mm3 - paddw mm4, mm5 - psllw mm4, 2 - psubw mm4, mm2 - paddw mm0, mm1 - paddw mm0, mm4 - psllw mm4, 2 - paddw mm0, mm4 - paddw mm0, mm6 - psraw mm0, 5 - packuswb mm0, mm7 - movd [r2], mm0 + paddw mm2, mm3 + paddw mm4, mm5 + psllw mm4, 2 + psubw mm4, mm2 + paddw mm0, mm1 + paddw mm0, mm4 + psllw mm4, 2 + paddw mm0, mm4 + paddw mm0, mm6 + psraw mm0, 5 + packuswb mm0, mm7 + movd [r2], mm0 - add r0, r1 - add r2, r3 - dec r4 - jnz .height_loop + add r0, r1 + add r2, r3 + dec r4 + jnz .height_loop - WELSEMMS - LOAD_5_PARA_POP - ret + WELSEMMS + LOAD_5_PARA_POP + ret ;******************************************************************************* ; Macros and other preprocessor constants @@ -128,26 +128,26 @@ WELS_EXTERN McHorVer20WidthEq4_mmx %macro SSE_LOAD_8P 3 - movq %1, %3 - punpcklbw %1, %2 + movq %1, %3 + punpcklbw %1, %2 %endmacro %macro FILTER_HV_W8 9 - paddw %1, %6 - movdqa %8, %3 - movdqa %7, %2 - paddw %1, [h264_w0x10_1] - paddw %8, %4 - paddw %7, %5 - psllw %8, 2 - psubw %8, %7 - paddw %1, %8 - psllw %8, 2 - paddw %1, %8 - psraw %1, 5 - WELS_Zero %8 - packuswb %1, %8 - movq %9, %1 + paddw %1, %6 + movdqa %8, %3 + movdqa %7, %2 + paddw %1, [h264_w0x10_1] + paddw %8, %4 + paddw %7, %5 + psllw %8, 2 + psubw %8, %7 + paddw %1, %8 + psllw %8, 2 + paddw %1, %8 + psraw %1, 5 + WELS_Zero %8 + packuswb %1, %8 + movq %9, %1 %endmacro ;******************************************************************************* @@ -159,192 +159,192 @@ SECTION .text ;*********************************************************************** ; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc, ; 
int16_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride -; int32_t iHeight +; uint8_t *pDst, +; int32_t iDstStride +; int32_t iHeight ; ) ;*********************************************************************** WELS_EXTERN McHorVer22Width8HorFirst_sse2 - %assign push_num 0 + %assign push_num 0 LOAD_5_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - pxor xmm7, xmm7 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + pxor xmm7, xmm7 - sub r0, r1 ;;;;;;;;need more 5 lines. - sub r0, r1 + sub r0, r1 ;;;;;;;;need more 5 lines. + sub r0, r1 .yloop_width_8: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - movdqa [r2], xmm0 + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + movdqa [r2], xmm0 - add r0, r1 - add r2, r3 - dec r4 - jnz .yloop_width_8 - POP_XMM - LOAD_5_PARA_POP - ret + add r0, r1 + add r2, r3 + dec r4 + jnz .yloop_width_8 + POP_XMM + LOAD_5_PARA_POP + ret ;******************************************************************************* ; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc, ; int iSrcStride, -; uint8_t *pDst, -; int iDstStride, -; int iHeight, +; uint8_t *pDst, +; int iDstStride, +; int iHeight, ; ); ;******************************************************************************* WELS_EXTERN McHorVer20WidthEq8_sse2 - %assign push_num 0 + %assign push_num 0 LOAD_5_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - lea r0, [r0-2] ;pSrc -= 2; + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + lea r0, [r0-2] ;pSrc -= 2; - pxor xmm7, xmm7 - movdqa xmm6, [h264_w0x10_1] + pxor xmm7, xmm7 + movdqa xmm6, [h264_w0x10_1] .y_loop: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - paddw xmm0, xmm6 - psraw xmm0, 5 + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + paddw xmm0, xmm6 + psraw xmm0, 5 - packuswb xmm0, xmm7 - movq [r2], xmm0 + packuswb xmm0, xmm7 + movq [r2], xmm0 - lea r2, [r2+r3] - lea r0, [r0+r1] - dec r4 - jnz near .y_loop + lea r2, [r2+r3] + lea r0, [r0+r1] + dec r4 + jnz near .y_loop - POP_XMM - LOAD_5_PARA_POP - ret + POP_XMM + 
LOAD_5_PARA_POP + ret ;******************************************************************************* ; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc, ; int iSrcStride, -; uint8_t *pDst, -; int iDstStride, -; int iHeight, +; uint8_t *pDst, +; int iDstStride, +; int iHeight, ; ); ;******************************************************************************* WELS_EXTERN McHorVer20WidthEq16_sse2 - %assign push_num 0 + %assign push_num 0 LOAD_5_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - lea r0, [r0-2] ;pSrc -= 2; + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + lea r0, [r0-2] ;pSrc -= 2; - pxor xmm7, xmm7 - movdqa xmm6, [h264_w0x10_1] + pxor xmm7, xmm7 + movdqa xmm6, [h264_w0x10_1] .y_loop: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - paddw xmm0, xmm6 - psraw xmm0, 5 - packuswb xmm0, xmm7 - movq [r2], xmm0 + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + paddw xmm0, xmm6 + psraw xmm0, 5 + packuswb xmm0, xmm7 + movq [r2], xmm0 - movq xmm0, [r0+8] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5+8] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1+8] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4+8] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2+8] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3+8] - punpcklbw xmm5, xmm7 + movq xmm0, [r0+8] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5+8] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1+8] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4+8] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2+8] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3+8] + punpcklbw xmm5, xmm7 - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - paddw xmm0, xmm6 - psraw xmm0, 5 - packuswb xmm0, xmm7 - movq [r2+8], xmm0 + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + paddw xmm0, xmm6 + psraw xmm0, 5 + packuswb xmm0, xmm7 + movq [r2+8], xmm0 - lea r2, [r2+r3] - lea r0, [r0+r1] - dec r4 - jnz near .y_loop + lea r2, [r2+r3] + lea r0, [r0+r1] + dec r4 + jnz near .y_loop - POP_XMM - LOAD_5_PARA_POP - ret + POP_XMM + LOAD_5_PARA_POP + ret ;******************************************************************************* @@ -355,81 +355,81 @@ WELS_EXTERN McHorVer20WidthEq16_sse2 ; int iHeight ) ;******************************************************************************* WELS_EXTERN McHorVer02WidthEq8_sse2 - %assign push_num 0 + %assign push_num 0 LOAD_5_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - sub r0, r1 - sub r0, r1 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + sub r0, r1 + sub r0, r1 - WELS_Zero xmm7 + WELS_Zero xmm7 - SSE_LOAD_8P xmm0, xmm7, [r0] - SSE_LOAD_8P xmm1, xmm7, [r0+r1] - 
lea r0, [r0+2*r1] - SSE_LOAD_8P xmm2, xmm7, [r0] - SSE_LOAD_8P xmm3, xmm7, [r0+r1] - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm4, xmm7, [r0] - SSE_LOAD_8P xmm5, xmm7, [r0+r1] + SSE_LOAD_8P xmm0, xmm7, [r0] + SSE_LOAD_8P xmm1, xmm7, [r0+r1] + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm2, xmm7, [r0] + SSE_LOAD_8P xmm3, xmm7, [r0+r1] + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm4, xmm7, [r0] + SSE_LOAD_8P xmm5, xmm7, [r0+r1] .start: - FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r4 - jz near .xx_exit + FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r4 + jz near .xx_exit - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm6, xmm7, [r0] - FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3] - dec r4 - jz near .xx_exit + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm6, xmm7, [r0] + FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3] + dec r4 + jz near .xx_exit - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm7, xmm0, [r0+r1] - FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] - dec r4 - jz near .xx_exit + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm7, xmm0, [r0+r1] + FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] + dec r4 + jz near .xx_exit - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm0, xmm1, [r0] - FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3] - dec r4 - jz near .xx_exit + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm0, xmm1, [r0] + FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3] + dec r4 + jz near .xx_exit - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm1, xmm2, [r0+r1] - FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2] - dec r4 - jz near .xx_exit + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm1, xmm2, [r0+r1] + FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2] + dec r4 + jz near .xx_exit - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm2, xmm3, [r0] - FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3] - dec r4 - jz near .xx_exit + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm2, xmm3, [r0] + FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3] + dec r4 + jz near .xx_exit - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm3, xmm4, [r0+r1] - FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2] - dec r4 - jz near .xx_exit + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm3, xmm4, [r0+r1] + FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2] + dec r4 + jz near .xx_exit - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm4, xmm5, [r0] - FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3] - dec r4 - jz near .xx_exit + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm4, xmm5, [r0] + FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3] + dec r4 + jz near .xx_exit - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm5, xmm6, [r0+r1] - jmp near .start + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm5, xmm6, [r0+r1] + jmp near .start .xx_exit: - POP_XMM - LOAD_5_PARA_POP - ret + POP_XMM + LOAD_5_PARA_POP + ret ;*********************************************************************** ; Code @@ -440,725 +440,725 @@ SECTION .text ;*********************************************************************** -; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc, +; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc, ; int32_t iSrcStride, ; uint8_t *pDst, ; int32_t iDstStride, -; int32_t iWidth, +; int32_t iWidth, ; int32_t iHeight ) ;*********************************************************************** WELS_EXTERN McHorVer02Height9Or17_sse2 - %assign push_num 0 + %assign push_num 0 LOAD_6_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - 
SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - SIGN_EXTENSION r5, r5d + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r5, r5d %ifndef X86_32 - push r12 - push r13 - push r14 - mov r12, r0 - mov r13, r2 - mov r14, r5 + push r12 + push r13 + push r14 + mov r12, r0 + mov r13, r2 + mov r14, r5 %endif - shr r4, 3 - sub r0, r1 - sub r0, r1 + shr r4, 3 + sub r0, r1 + sub r0, r1 .xloop: - WELS_Zero xmm7 - SSE_LOAD_8P xmm0, xmm7, [r0] - SSE_LOAD_8P xmm1, xmm7, [r0+r1] - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm2, xmm7, [r0] - SSE_LOAD_8P xmm3, xmm7, [r0+r1] - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm4, xmm7, [r0] - SSE_LOAD_8P xmm5, xmm7, [r0+r1] + WELS_Zero xmm7 + SSE_LOAD_8P xmm0, xmm7, [r0] + SSE_LOAD_8P xmm1, xmm7, [r0+r1] + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm2, xmm7, [r0] + SSE_LOAD_8P xmm3, xmm7, [r0+r1] + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm4, xmm7, [r0] + SSE_LOAD_8P xmm5, xmm7, [r0+r1] - FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm6, xmm7, [r0] - movdqa xmm0,xmm1 - movdqa xmm1,xmm2 - movdqa xmm2,xmm3 - movdqa xmm3,xmm4 - movdqa xmm4,xmm5 - movdqa xmm5,xmm6 - add r2, r3 - sub r0, r1 + FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm6, xmm7, [r0] + movdqa xmm0,xmm1 + movdqa xmm1,xmm2 + movdqa xmm2,xmm3 + movdqa xmm3,xmm4 + movdqa xmm4,xmm5 + movdqa xmm5,xmm6 + add r2, r3 + sub r0, r1 .start: - FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - jz near .x_loop_dec + FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm6, xmm7, [r0] - FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm6, xmm7, [r0] + FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm7, xmm0, [r0+r1] - FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] - dec r5 - jz near .x_loop_dec + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm7, xmm0, [r0+r1] + FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm0, xmm1, [r0] - FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm0, xmm1, [r0] + FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm1, xmm2, [r0+r1] - FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2] - dec r5 - jz near .x_loop_dec + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm1, xmm2, [r0+r1] + FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm2, xmm3, [r0] - FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm2, xmm3, [r0] + FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm3, xmm4, [r0+r1] - FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2] - dec r5 - jz near .x_loop_dec + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm3, xmm4, [r0+r1] + FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - SSE_LOAD_8P xmm4, 
xmm5, [r0] - FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + SSE_LOAD_8P xmm4, xmm5, [r0] + FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - SSE_LOAD_8P xmm5, xmm6, [r0+r1] - jmp near .start + lea r2, [r2+2*r3] + SSE_LOAD_8P xmm5, xmm6, [r0+r1] + jmp near .start .x_loop_dec: - dec r4 - jz near .xx_exit + dec r4 + jz near .xx_exit %ifdef X86_32 - mov r0, arg1 - mov r2, arg3 - mov r5, arg6 + mov r0, arg1 + mov r2, arg3 + mov r5, arg6 %else - mov r0, r12 - mov r2, r13 - mov r5, r14 + mov r0, r12 + mov r2, r13 + mov r5, r14 %endif - sub r0, r1 - sub r0, r1 - add r0, 8 - add r2, 8 - jmp near .xloop + sub r0, r1 + sub r0, r1 + add r0, 8 + add r2, 8 + jmp near .xloop .xx_exit: %ifndef X86_32 - pop r14 - pop r13 - pop r12 + pop r14 + pop r13 + pop r12 %endif - POP_XMM - LOAD_6_PARA_POP - ret + POP_XMM + LOAD_6_PARA_POP + ret ;*********************************************************************** -; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc, +; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc, ; int32_t iSrcStride, -; uint8_t *pDst, -; int32_t iDstStride, -; int32_t iWidth, -; int32_t iHeight +; uint8_t *pDst, +; int32_t iDstStride, +; int32_t iWidth, +; int32_t iHeight ; ); ;*********************************************************************** WELS_EXTERN McHorVer20Width9Or17_sse2 - %assign push_num 0 + %assign push_num 0 LOAD_6_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - SIGN_EXTENSION r5, r5d - sub r0, 2 - pxor xmm7, xmm7 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r5, r5d + sub r0, 2 + pxor xmm7, xmm7 - cmp r4, 9 - jne near .width_17 + cmp r4, 9 + jne near .width_17 .yloop_width_9: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 - movdqa xmm7, xmm2 - paddw xmm7, xmm3 - movdqa xmm6, xmm4 - paddw xmm6, xmm5 - psllw xmm6, 2 - psubw xmm6, xmm7 - paddw xmm0, xmm1 - paddw xmm0, xmm6 - psllw xmm6, 2 - paddw xmm0, xmm6 - paddw xmm0, [h264_w0x10_1] - psraw xmm0, 5 - packuswb xmm0, xmm0 - movd [r2], xmm0 + movdqa xmm7, xmm2 + paddw xmm7, xmm3 + movdqa xmm6, xmm4 + paddw xmm6, xmm5 + psllw xmm6, 2 + psubw xmm6, xmm7 + paddw xmm0, xmm1 + paddw xmm0, xmm6 + psllw xmm6, 2 + paddw xmm0, xmm6 + paddw xmm0, [h264_w0x10_1] + psraw xmm0, 5 + packuswb xmm0, xmm0 + movd [r2], xmm0 - pxor xmm7, xmm7 - movq xmm0, [r0+6] - punpcklbw xmm0, xmm7 + pxor xmm7, xmm7 + movq xmm0, [r0+6] + punpcklbw xmm0, xmm7 - paddw xmm4, xmm1 - paddw xmm5, xmm3 - psllw xmm5, 2 - psubw xmm5, xmm4 - paddw xmm2, xmm0 - paddw xmm2, xmm5 - psllw xmm5, 2 - paddw xmm2, xmm5 - paddw xmm2, [h264_w0x10_1] - psraw xmm2, 5 - packuswb xmm2, xmm2 - movq [r2+1], xmm2 + paddw xmm4, xmm1 + paddw xmm5, xmm3 + psllw xmm5, 2 + psubw xmm5, xmm4 + paddw xmm2, xmm0 + paddw xmm2, xmm5 + psllw xmm5, 2 + paddw xmm2, xmm5 + paddw xmm2, [h264_w0x10_1] + psraw xmm2, 5 + packuswb xmm2, xmm2 + movq [r2+1], xmm2 - add r0, r1 - add r2, r3 - dec r5 - jnz 
.yloop_width_9 - POP_XMM - LOAD_6_PARA_POP - ret + add r0, r1 + add r2, r3 + dec r5 + jnz .yloop_width_9 + POP_XMM + LOAD_6_PARA_POP + ret .width_17: .yloop_width_17: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - paddw xmm0, [h264_w0x10_1] - psraw xmm0, 5 - packuswb xmm0, xmm0 - movq [r2], xmm0 + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + paddw xmm0, [h264_w0x10_1] + psraw xmm0, 5 + packuswb xmm0, xmm0 + movq [r2], xmm0 - movq xmm0, [r0+8] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5+8] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1+8] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4+8] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2+8] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3+8] - punpcklbw xmm5, xmm7 + movq xmm0, [r0+8] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5+8] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1+8] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4+8] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2+8] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3+8] + punpcklbw xmm5, xmm7 - movdqa xmm7, xmm2 - paddw xmm7, xmm3 - movdqa xmm6, xmm4 - paddw xmm6, xmm5 - psllw xmm6, 2 - psubw xmm6, xmm7 - paddw xmm0, xmm1 - paddw xmm0, xmm6 - psllw xmm6, 2 - paddw xmm0, xmm6 - paddw xmm0, [h264_w0x10_1] - psraw xmm0, 5 - packuswb xmm0, xmm0 - movd [r2+8], xmm0 + movdqa xmm7, xmm2 + paddw xmm7, xmm3 + movdqa xmm6, xmm4 + paddw xmm6, xmm5 + psllw xmm6, 2 + psubw xmm6, xmm7 + paddw xmm0, xmm1 + paddw xmm0, xmm6 + psllw xmm6, 2 + paddw xmm0, xmm6 + paddw xmm0, [h264_w0x10_1] + psraw xmm0, 5 + packuswb xmm0, xmm0 + movd [r2+8], xmm0 - pxor xmm7, xmm7 - movq xmm0, [r0+6+8] - punpcklbw xmm0, xmm7 + pxor xmm7, xmm7 + movq xmm0, [r0+6+8] + punpcklbw xmm0, xmm7 - paddw xmm4, xmm1 - paddw xmm5, xmm3 - psllw xmm5, 2 - psubw xmm5, xmm4 - paddw xmm2, xmm0 - paddw xmm2, xmm5 - psllw xmm5, 2 - paddw xmm2, xmm5 - paddw xmm2, [h264_w0x10_1] - psraw xmm2, 5 - packuswb xmm2, xmm2 - movq [r2+9], xmm2 - add r0, r1 - add r2, r3 - dec r5 - jnz .yloop_width_17 - POP_XMM - LOAD_6_PARA_POP - ret + paddw xmm4, xmm1 + paddw xmm5, xmm3 + psllw xmm5, 2 + psubw xmm5, xmm4 + paddw xmm2, xmm0 + paddw xmm2, xmm5 + psllw xmm5, 2 + paddw xmm2, xmm5 + paddw xmm2, [h264_w0x10_1] + psraw xmm2, 5 + packuswb xmm2, xmm2 + movq [r2+9], xmm2 + add r0, r1 + add r2, r3 + dec r5 + jnz .yloop_width_17 + POP_XMM + LOAD_6_PARA_POP + ret ;*********************************************************************** ;void McHorVer22HorFirst_sse2 -; (const uint8_t *pSrc, -; int32_t iSrcStride, -; uint8_t * pTap, -; int32_t iTapStride, -; int32_t iWidth,int32_t iHeight); +; (const uint8_t *pSrc, +; int32_t iSrcStride, +; uint8_t * pTap, +; int32_t iTapStride, +; int32_t iWidth,int32_t iHeight); ;*********************************************************************** WELS_EXTERN McHorVer22HorFirst_sse2 - %assign push_num 0 + %assign push_num 0 LOAD_6_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - 
SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - SIGN_EXTENSION r5, r5d - pxor xmm7, xmm7 - sub r0, r1 ;;;;;;;;need more 5 lines. - sub r0, r1 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r5, r5d + pxor xmm7, xmm7 + sub r0, r1 ;;;;;;;;need more 5 lines. + sub r0, r1 - cmp r4, 9 - jne near .width_17 + cmp r4, 9 + jne near .width_17 .yloop_width_9: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 - movdqa xmm7, xmm2 - paddw xmm7, xmm3 - movdqa xmm6, xmm4 - paddw xmm6, xmm5 - psllw xmm6, 2 - psubw xmm6, xmm7 - paddw xmm0, xmm1 - paddw xmm0, xmm6 - psllw xmm6, 2 - paddw xmm0, xmm6 - movd [r2], xmm0 + movdqa xmm7, xmm2 + paddw xmm7, xmm3 + movdqa xmm6, xmm4 + paddw xmm6, xmm5 + psllw xmm6, 2 + psubw xmm6, xmm7 + paddw xmm0, xmm1 + paddw xmm0, xmm6 + psllw xmm6, 2 + paddw xmm0, xmm6 + movd [r2], xmm0 - pxor xmm7, xmm7 - movq xmm0, [r0+6] - punpcklbw xmm0, xmm7 + pxor xmm7, xmm7 + movq xmm0, [r0+6] + punpcklbw xmm0, xmm7 - paddw xmm4, xmm1 - paddw xmm5, xmm3 - psllw xmm5, 2 - psubw xmm5, xmm4 - paddw xmm2, xmm0 - paddw xmm2, xmm5 - psllw xmm5, 2 - paddw xmm2, xmm5 - movq [r2+2], xmm2 - movhps [r2+2+8], xmm2 + paddw xmm4, xmm1 + paddw xmm5, xmm3 + psllw xmm5, 2 + psubw xmm5, xmm4 + paddw xmm2, xmm0 + paddw xmm2, xmm5 + psllw xmm5, 2 + paddw xmm2, xmm5 + movq [r2+2], xmm2 + movhps [r2+2+8], xmm2 - add r0, r1 - add r2, r3 - dec r5 - jnz .yloop_width_9 - POP_XMM - LOAD_6_PARA_POP - ret + add r0, r1 + add r2, r3 + dec r5 + jnz .yloop_width_9 + POP_XMM + LOAD_6_PARA_POP + ret .width_17: .yloop_width_17: - movq xmm0, [r0] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3] - punpcklbw xmm5, xmm7 + movq xmm0, [r0] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3] + punpcklbw xmm5, xmm7 - paddw xmm2, xmm3 - paddw xmm4, xmm5 - psllw xmm4, 2 - psubw xmm4, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm4 - psllw xmm4, 2 - paddw xmm0, xmm4 - movdqa [r2], xmm0 + paddw xmm2, xmm3 + paddw xmm4, xmm5 + psllw xmm4, 2 + psubw xmm4, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm4 + psllw xmm4, 2 + paddw xmm0, xmm4 + movdqa [r2], xmm0 - movq xmm0, [r0+8] - punpcklbw xmm0, xmm7 - movq xmm1, [r0+5+8] - punpcklbw xmm1, xmm7 - movq xmm2, [r0+1+8] - punpcklbw xmm2, xmm7 - movq xmm3, [r0+4+8] - punpcklbw xmm3, xmm7 - movq xmm4, [r0+2+8] - punpcklbw xmm4, xmm7 - movq xmm5, [r0+3+8] - punpcklbw xmm5, xmm7 + movq xmm0, [r0+8] + punpcklbw xmm0, xmm7 + movq xmm1, [r0+5+8] + punpcklbw xmm1, xmm7 + movq xmm2, [r0+1+8] + punpcklbw xmm2, xmm7 + movq xmm3, [r0+4+8] + punpcklbw xmm3, xmm7 + movq xmm4, [r0+2+8] + punpcklbw xmm4, xmm7 + movq xmm5, [r0+3+8] + punpcklbw xmm5, xmm7 - movdqa xmm7, xmm2 - paddw xmm7, xmm3 - movdqa xmm6, xmm4 - paddw xmm6, xmm5 - psllw xmm6, 2 - psubw xmm6, xmm7 - paddw xmm0, xmm1 - paddw xmm0, 
xmm6 - psllw xmm6, 2 - paddw xmm0, xmm6 - movd [r2+16], xmm0 + movdqa xmm7, xmm2 + paddw xmm7, xmm3 + movdqa xmm6, xmm4 + paddw xmm6, xmm5 + psllw xmm6, 2 + psubw xmm6, xmm7 + paddw xmm0, xmm1 + paddw xmm0, xmm6 + psllw xmm6, 2 + paddw xmm0, xmm6 + movd [r2+16], xmm0 - pxor xmm7, xmm7 - movq xmm0, [r0+6+8] - punpcklbw xmm0, xmm7 + pxor xmm7, xmm7 + movq xmm0, [r0+6+8] + punpcklbw xmm0, xmm7 - paddw xmm4, xmm1 - paddw xmm5, xmm3 - psllw xmm5, 2 - psubw xmm5, xmm4 - paddw xmm2, xmm0 - paddw xmm2, xmm5 - psllw xmm5, 2 - paddw xmm2, xmm5 - movq [r2+18], xmm2 - movhps [r2+18+8], xmm2 + paddw xmm4, xmm1 + paddw xmm5, xmm3 + psllw xmm5, 2 + psubw xmm5, xmm4 + paddw xmm2, xmm0 + paddw xmm2, xmm5 + psllw xmm5, 2 + paddw xmm2, xmm5 + movq [r2+18], xmm2 + movhps [r2+18+8], xmm2 - add r0, r1 - add r2, r3 - dec r5 - jnz .yloop_width_17 - POP_XMM - LOAD_6_PARA_POP - ret + add r0, r1 + add r2, r3 + dec r5 + jnz .yloop_width_17 + POP_XMM + LOAD_6_PARA_POP + ret %macro FILTER_VER 9 - paddw %1, %6 - movdqa %7, %2 - movdqa %8, %3 + paddw %1, %6 + movdqa %7, %2 + movdqa %8, %3 - paddw %7, %5 - paddw %8, %4 + paddw %7, %5 + paddw %8, %4 - psubw %1, %7 - psraw %1, 2 - paddw %1, %8 - psubw %1, %7 - psraw %1, 2 - paddw %8, %1 - paddw %8, [h264_mc_hc_32] - psraw %8, 6 - packuswb %8, %8 - movq %9, %8 + psubw %1, %7 + psraw %1, 2 + paddw %1, %8 + psubw %1, %7 + psraw %1, 2 + paddw %8, %1 + paddw %8, [h264_mc_hc_32] + psraw %8, 6 + packuswb %8, %8 + movq %9, %8 %endmacro ;*********************************************************************** ;void McHorVer22Width8VerLastAlign_sse2( -; const uint8_t *pTap, -; int32_t iTapStride, -; uint8_t * pDst, -; int32_t iDstStride, -; int32_t iWidth, -; int32_t iHeight); +; const uint8_t *pTap, +; int32_t iTapStride, +; uint8_t * pDst, +; int32_t iDstStride, +; int32_t iWidth, +; int32_t iHeight); ;*********************************************************************** WELS_EXTERN McHorVer22Width8VerLastAlign_sse2 - %assign push_num 0 + %assign push_num 0 LOAD_6_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - SIGN_EXTENSION r5, r5d + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r5, r5d %ifndef X86_32 - push r12 - push r13 - push r14 - mov r12, r0 - mov r13, r2 - mov r14, r5 + push r12 + push r13 + push r14 + mov r12, r0 + mov r13, r2 + mov r14, r5 %endif - shr r4, 3 + shr r4, 3 .width_loop: - movdqa xmm0, [r0] - movdqa xmm1, [r0+r1] - lea r0, [r0+2*r1] - movdqa xmm2, [r0] - movdqa xmm3, [r0+r1] - lea r0, [r0+2*r1] - movdqa xmm4, [r0] - movdqa xmm5, [r0+r1] + movdqa xmm0, [r0] + movdqa xmm1, [r0+r1] + lea r0, [r0+2*r1] + movdqa xmm2, [r0] + movdqa xmm3, [r0+r1] + lea r0, [r0+2*r1] + movdqa xmm4, [r0] + movdqa xmm5, [r0+r1] - FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - lea r0, [r0+2*r1] - movdqa xmm6, [r0] + FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + lea r0, [r0+2*r1] + movdqa xmm6, [r0] - movdqa xmm0, xmm1 - movdqa xmm1, xmm2 - movdqa xmm2, xmm3 - movdqa xmm3, xmm4 - movdqa xmm4, xmm5 - movdqa xmm5, xmm6 + movdqa xmm0, xmm1 + movdqa xmm1, xmm2 + movdqa xmm2, xmm3 + movdqa xmm3, xmm4 + movdqa xmm4, xmm5 + movdqa xmm5, xmm6 - add r2, r3 - sub r0, r1 + add r2, r3 + sub r0, r1 .start: - FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - jz near .x_loop_dec + FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - movdqa xmm6, [r0] - FILTER_VER xmm1, 
xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + movdqa xmm6, [r0] + FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - movdqa xmm7, [r0+r1] - FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] - dec r5 - jz near .x_loop_dec + lea r2, [r2+2*r3] + movdqa xmm7, [r0+r1] + FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - movdqa xmm0, [r0] - FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + movdqa xmm0, [r0] + FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - movdqa xmm1, [r0+r1] - FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2] - dec r5 - jz near .x_loop_dec + lea r2, [r2+2*r3] + movdqa xmm1, [r0+r1] + FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - movdqa xmm2, [r0] - FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + movdqa xmm2, [r0] + FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - movdqa xmm3, [r0+r1] - FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2] - dec r5 - jz near .x_loop_dec + lea r2, [r2+2*r3] + movdqa xmm3, [r0+r1] + FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - movdqa xmm4, [r0] - FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + movdqa xmm4, [r0] + FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - movdqa xmm5, [r0+r1] - jmp near .start + lea r2, [r2+2*r3] + movdqa xmm5, [r0+r1] + jmp near .start .x_loop_dec: - dec r4 - jz near .exit + dec r4 + jz near .exit %ifdef X86_32 - mov r0, arg1 - mov r2, arg3 - mov r5, arg6 + mov r0, arg1 + mov r2, arg3 + mov r5, arg6 %else - mov r0, r12 - mov r2, r13 - mov r5, r14 + mov r0, r12 + mov r2, r13 + mov r5, r14 %endif - add r0, 16 - add r2, 8 - jmp .width_loop + add r0, 16 + add r2, 8 + jmp .width_loop .exit: %ifndef X86_32 - pop r14 - pop r13 - pop r12 + pop r14 + pop r13 + pop r12 %endif - POP_XMM - LOAD_6_PARA_POP - ret + POP_XMM + LOAD_6_PARA_POP + ret ;*********************************************************************** ;void McHorVer22Width8VerLastUnAlign_sse2( -; const uint8_t *pTap, -; int32_t iTapStride, -; uint8_t * pDst, -; int32_t iDstStride, -; int32_t iWidth, -; int32_t iHeight); +; const uint8_t *pTap, +; int32_t iTapStride, +; uint8_t * pDst, +; int32_t iDstStride, +; int32_t iWidth, +; int32_t iHeight); ;*********************************************************************** WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2 - %assign push_num 0 + %assign push_num 0 LOAD_6_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - SIGN_EXTENSION r5, r5d + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + SIGN_EXTENSION r5, r5d %ifndef X86_32 - push r12 - push r13 - push r14 - mov r12, r0 - mov r13, r2 - mov r14, r5 + push r12 + push r13 + push r14 + mov r12, r0 + mov r13, r2 + mov r14, r5 %endif - shr r4, 3 + shr r4, 3 .width_loop: - movdqu xmm0, [r0] - movdqu xmm1, [r0+r1] - lea r0, [r0+2*r1] - movdqu xmm2, 
[r0] - movdqu xmm3, [r0+r1] - lea r0, [r0+2*r1] - movdqu xmm4, [r0] - movdqu xmm5, [r0+r1] + movdqu xmm0, [r0] + movdqu xmm1, [r0+r1] + lea r0, [r0+2*r1] + movdqu xmm2, [r0] + movdqu xmm3, [r0+r1] + lea r0, [r0+2*r1] + movdqu xmm4, [r0] + movdqu xmm5, [r0+r1] - FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - lea r0, [r0+2*r1] - movdqu xmm6, [r0] + FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + lea r0, [r0+2*r1] + movdqu xmm6, [r0] - movdqa xmm0, xmm1 - movdqa xmm1, xmm2 - movdqa xmm2, xmm3 - movdqa xmm3, xmm4 - movdqa xmm4, xmm5 - movdqa xmm5, xmm6 + movdqa xmm0, xmm1 + movdqa xmm1, xmm2 + movdqa xmm2, xmm3 + movdqa xmm3, xmm4 + movdqa xmm4, xmm5 + movdqa xmm5, xmm6 - add r2, r3 - sub r0, r1 + add r2, r3 + sub r0, r1 .start: - FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] - dec r5 - jz near .x_loop_dec + FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - movdqu xmm6, [r0] - FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + movdqu xmm6, [r0] + FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - movdqu xmm7, [r0+r1] - FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] - dec r5 - jz near .x_loop_dec + lea r2, [r2+2*r3] + movdqu xmm7, [r0+r1] + FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - movdqu xmm0, [r0] - FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + movdqu xmm0, [r0] + FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - movdqu xmm1, [r0+r1] - FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2] - dec r5 - jz near .x_loop_dec + lea r2, [r2+2*r3] + movdqu xmm1, [r0+r1] + FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - movdqu xmm2, [r0] - FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + movdqu xmm2, [r0] + FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - movdqu xmm3, [r0+r1] - FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2] - dec r5 - jz near .x_loop_dec + lea r2, [r2+2*r3] + movdqu xmm3, [r0+r1] + FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2] + dec r5 + jz near .x_loop_dec - lea r0, [r0+2*r1] - movdqu xmm4, [r0] - FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3] - dec r5 - jz near .x_loop_dec + lea r0, [r0+2*r1] + movdqu xmm4, [r0] + FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3] + dec r5 + jz near .x_loop_dec - lea r2, [r2+2*r3] - movdqu xmm5, [r0+r1] - jmp near .start + lea r2, [r2+2*r3] + movdqu xmm5, [r0+r1] + jmp near .start .x_loop_dec: - dec r4 - jz near .exit + dec r4 + jz near .exit %ifdef X86_32 - mov r0, arg1 - mov r2, arg3 - mov r5, arg6 + mov r0, arg1 + mov r2, arg3 + mov r5, arg6 %else - mov r0, r12 - mov r2, r13 - mov r5, r14 + mov r0, r12 + mov r2, r13 + mov r5, r14 %endif - add r0, 16 - add r2, 8 - jmp .width_loop + add r0, 16 + add r2, 8 + jmp .width_loop .exit: %ifndef X86_32 - pop r14 - pop r13 - pop r12 + pop r14 + pop r13 + pop r12 %endif - POP_XMM - LOAD_6_PARA_POP - ret + POP_XMM + LOAD_6_PARA_POP + ret diff 
--git a/codec/common/x86/satd_sad.asm b/codec/common/x86/satd_sad.asm index 226e579e..dd2a22fa 100644 --- a/codec/common/x86/satd_sad.asm +++ b/codec/common/x86/satd_sad.asm @@ -77,77 +77,77 @@ SECTION .text ; ;*********************************************************************** %macro MMX_DW_1_2REG 2 - pxor %1, %1 - pcmpeqw %2, %2 - psubw %1, %2 + pxor %1, %1 + pcmpeqw %2, %2 + psubw %1, %2 %endmacro %macro SSE2_SumWHorizon1 2 - movdqa %2, %1 - psrldq %2, 8 - paddusw %1, %2 - movdqa %2, %1 - psrldq %2, 4 - paddusw %1, %2 - movdqa %2, %1 - psrldq %2, 2 - paddusw %1, %2 + movdqa %2, %1 + psrldq %2, 8 + paddusw %1, %2 + movdqa %2, %1 + psrldq %2, 4 + paddusw %1, %2 + movdqa %2, %1 + psrldq %2, 2 + paddusw %1, %2 %endmacro %macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3 - SSE2_SumSub %1, %2, %5 - SSE2_SumSub %3, %4, %5 - SSE2_SumSub %2, %4, %5 - SSE2_SumSub %1, %3, %5 + SSE2_SumSub %1, %2, %5 + SSE2_SumSub %3, %4, %5 + SSE2_SumSub %2, %4, %5 + SSE2_SumSub %1, %3, %5 %endmacro %macro SSE2_SumAbs4 7 - WELS_AbsW %1, %3 - WELS_AbsW %2, %3 - WELS_AbsW %4, %6 - WELS_AbsW %5, %6 - paddusw %1, %2 - paddusw %4, %5 - paddusw %7, %1 - paddusw %7, %4 + WELS_AbsW %1, %3 + WELS_AbsW %2, %3 + WELS_AbsW %4, %6 + WELS_AbsW %5, %6 + paddusw %1, %2 + paddusw %4, %5 + paddusw %7, %1 + paddusw %7, %4 %endmacro %macro SSE2_SumWHorizon 3 - movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4 - paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04 - punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 - movhlps %2, %1 ; x2 = xxxx xxxx d37 d26 - paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246 - pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357 - paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 + movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4 + paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04 + punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 + movhlps %2, %1 ; x2 = xxxx xxxx d37 d26 + paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246 + pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357 + paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 %endmacro %macro SSE2_GetSatd8x8 0 - SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2] - SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2] - SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3] + SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2] + SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2] + SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3] - SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4 - SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4 - SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5 - SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6 + SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4 + SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4 + SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5 + SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2] - SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2] - SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2] + SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2] + SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3] - SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4 - SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4 - SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5 - SSE2_SumAbs4 
xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6 + SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4 + SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4 + SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5 + SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6 %endmacro ;*********************************************************************** @@ -156,11 +156,11 @@ SECTION .text ; ;*********************************************************************** WELS_EXTERN WelsSampleSatd4x4_sse2 - %assign push_num 0 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d movd xmm0, [r0] movd xmm1, [r0+r1] lea r0 , [r0+2*r1] @@ -199,14 +199,14 @@ WELS_EXTERN WelsSampleSatd4x4_sse2 punpcklwd xmm0, xmm4 punpckhwd xmm4, xmm2 - SSE2_XSawp dq, xmm0, xmm4, xmm3 - SSE2_XSawp qdq, xmm0, xmm3, xmm5 + SSE2_XSawp dq, xmm0, xmm4, xmm3 + SSE2_XSawp qdq, xmm0, xmm3, xmm5 movdqa xmm7, xmm0 paddw xmm0, xmm5 psubw xmm7, xmm5 - SSE2_XSawp qdq, xmm0, xmm7, xmm1 + SSE2_XSawp qdq, xmm0, xmm7, xmm1 movdqa xmm2, xmm0 paddw xmm0, xmm1 @@ -214,15 +214,15 @@ WELS_EXTERN WelsSampleSatd4x4_sse2 WELS_AbsW xmm0, xmm3 paddusw xmm6, xmm0 - WELS_AbsW xmm2, xmm4 + WELS_AbsW xmm2, xmm4 paddusw xmm6, xmm2 SSE2_SumWHorizon1 xmm6, xmm4 - movd retrd, xmm6 + movd retrd, xmm6 and retrd, 0xffff shr retrd, 1 - POP_XMM - LOAD_4_PARA_POP - ret + POP_XMM + LOAD_4_PARA_POP + ret ;*********************************************************************** ; @@ -230,20 +230,20 @@ WELS_EXTERN WelsSampleSatd4x4_sse2 ; ;*********************************************************************** WELS_EXTERN WelsSampleSatd8x8_sse2 - %assign push_num 0 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - pxor xmm6, xmm6 + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + pxor xmm6, xmm6 pxor xmm7, xmm7 SSE2_GetSatd8x8 psrlw xmm6, 1 - SSE2_SumWHorizon xmm6,xmm4,xmm7 - movd retrd, xmm6 - POP_XMM - LOAD_4_PARA_POP - ret + SSE2_SumWHorizon xmm6,xmm4,xmm7 + movd retrd, xmm6 + POP_XMM + LOAD_4_PARA_POP + ret ;*********************************************************************** ; @@ -251,25 +251,25 @@ WELS_EXTERN WelsSampleSatd8x8_sse2 ; ;*********************************************************************** WELS_EXTERN WelsSampleSatd8x16_sse2 - %assign push_num 0 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - pxor xmm6, xmm6 - pxor xmm7, xmm7 + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + pxor xmm6, xmm6 + pxor xmm7, xmm7 - SSE2_GetSatd8x8 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSatd8x8 + SSE2_GetSatd8x8 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSatd8x8 - psrlw xmm6, 1 - SSE2_SumWHorizon xmm6,xmm4,xmm7 - movd retrd, xmm6 - POP_XMM - LOAD_4_PARA_POP - ret + psrlw xmm6, 1 + SSE2_SumWHorizon xmm6,xmm4,xmm7 + movd retrd, xmm6 + POP_XMM + LOAD_4_PARA_POP + ret ;*********************************************************************** ; @@ -277,30 +277,30 @@ WELS_EXTERN WelsSampleSatd8x16_sse2 ; ;*********************************************************************** WELS_EXTERN WelsSampleSatd16x8_sse2 - %assign push_num 0 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - push r0 - push r2 - pxor xmm6, xmm6 + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + push r0 + push r2 + pxor xmm6, xmm6 pxor xmm7, xmm7 - SSE2_GetSatd8x8 + SSE2_GetSatd8x8 - pop r2 
- pop r0 + pop r2 + pop r0 add r0, 8 add r2, 8 - SSE2_GetSatd8x8 + SSE2_GetSatd8x8 - psrlw xmm6, 1 - SSE2_SumWHorizon xmm6,xmm4,xmm7 - movd retrd, xmm6 - POP_XMM - LOAD_4_PARA_POP - ret + psrlw xmm6, 1 + SSE2_SumWHorizon xmm6,xmm4,xmm7 + movd retrd, xmm6 + POP_XMM + LOAD_4_PARA_POP + ret ;*********************************************************************** ; @@ -308,38 +308,38 @@ WELS_EXTERN WelsSampleSatd16x8_sse2 ; ;*********************************************************************** WELS_EXTERN WelsSampleSatd16x16_sse2 - %assign push_num 0 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - push r0 - push r2 - pxor xmm6, xmm6 + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + push r0 + push r2 + pxor xmm6, xmm6 pxor xmm7, xmm7 - SSE2_GetSatd8x8 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSatd8x8 + SSE2_GetSatd8x8 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSatd8x8 - pop r2 - pop r0 - add r0, 8 - add r2, 8 + pop r2 + pop r0 + add r0, 8 + add r2, 8 - SSE2_GetSatd8x8 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSatd8x8 + SSE2_GetSatd8x8 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSatd8x8 ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first. psrlw xmm6, 1 - SSE2_SumWHorizon xmm6,xmm4,xmm7 - movd retrd, xmm6 - POP_XMM - LOAD_4_PARA_POP - ret + SSE2_SumWHorizon xmm6,xmm4,xmm7 + movd retrd, xmm6 + POP_XMM + LOAD_4_PARA_POP + ret ;*********************************************************************** ; @@ -355,9 +355,9 @@ WELS_EXTERN WelsSampleSatd16x16_sse2 %macro SSE_DB_1_2REG 2 - pxor %1, %1 - pcmpeqw %2, %2 - psubb %1, %2 + pxor %1, %1 + pcmpeqw %2, %2 + psubb %1, %2 %endmacro ;*********************************************************************** @@ -369,668 +369,668 @@ WELS_EXTERN WelsSampleSatd16x16_sse2 WELS_EXTERN WelsSampleSatdThree4x4_sse2 %ifdef X86_32 - push r3 - push r4 - push r5 - push r6 - %assign push_num 4 + push r3 + push r4 + push r5 + push r6 + %assign push_num 4 %else - %assign push_num 0 + %assign push_num 0 %endif - PUSH_XMM 8 + PUSH_XMM 8 - mov r2, arg3 - mov r3, arg4 - SIGN_EXTENSION r3, r3d + mov r2, arg3 + mov r3, arg4 + SIGN_EXTENSION r3, r3d - ; load source 4x4 samples and Hadamard transform - movd xmm0, [r2] - movd xmm1, [r2+r3] - lea r2 , [r2+2*r3] - movd xmm2, [r2] - movd xmm3, [r2+r3] - punpckldq xmm0, xmm2 - punpckldq xmm1, xmm3 + ; load source 4x4 samples and Hadamard transform + movd xmm0, [r2] + movd xmm1, [r2+r3] + lea r2 , [r2+2*r3] + movd xmm2, [r2] + movd xmm3, [r2+r3] + punpckldq xmm0, xmm2 + punpckldq xmm1, xmm3 - pxor xmm6, xmm6 - punpcklbw xmm0, xmm6 - punpcklbw xmm1, xmm6 + pxor xmm6, xmm6 + punpcklbw xmm0, xmm6 + punpcklbw xmm1, xmm6 - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - psubw xmm2, xmm1 - SSE2_XSawp qdq, xmm0, xmm2, xmm3 + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + psubw xmm2, xmm1 + SSE2_XSawp qdq, xmm0, xmm2, xmm3 - movdqa xmm4, xmm0 - paddw xmm0, xmm3 - psubw xmm4, xmm3 + movdqa xmm4, xmm0 + paddw xmm0, xmm3 + psubw xmm4, xmm3 - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm4 - punpckhwd xmm4, xmm2 + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm4 + punpckhwd xmm4, xmm2 - SSE2_XSawp dq, xmm0, xmm4, xmm3 - SSE2_XSawp qdq, xmm0, xmm3, xmm5 + SSE2_XSawp dq, xmm0, xmm4, xmm3 + SSE2_XSawp qdq, xmm0, xmm3, xmm5 - movdqa xmm7, xmm0 - paddw xmm0, xmm5 - psubw xmm7, xmm5 + movdqa xmm7, xmm0 + paddw xmm0, xmm5 + psubw xmm7, xmm5 - SSE2_XSawp qdq, xmm0, xmm7, xmm1 + SSE2_XSawp qdq, xmm0, xmm7, xmm1 - ; Hadamard transform 
results are saved in xmm0 and xmm2 - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - psubw xmm2, xmm1 + ; Hadamard transform results are saved in xmm0 and xmm2 + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + psubw xmm2, xmm1 - ;load top boundary samples: [a b c d] - mov r0, arg1 - mov r1, arg2 - SIGN_EXTENSION r1, r1d - sub r0, r1 + ;load top boundary samples: [a b c d] + mov r0, arg1 + mov r1, arg2 + SIGN_EXTENSION r1, r1d + sub r0, r1 %ifdef UNIX64 - push r4 - push r5 + push r4 + push r5 %endif - movzx r2d, byte [r0] - movzx r3d, byte [r0+1] - movzx r4d, byte [r0+2] - movzx r5d, byte [r0+3] + movzx r2d, byte [r0] + movzx r3d, byte [r0+1] + movzx r4d, byte [r0+2] + movzx r5d, byte [r0+3] - ; get the transform results of top boundary samples: [a b c d] - add r3d, r2d ; r3d = a + b - add r5d, r4d ; r5d = c + d - add r2d, r2d ; r2d = a + a - add r4d, r4d ; r4d = c + c - sub r2d, r3d ; r2d = a + a - a - b = a - b - sub r4d, r5d ; r4d = c + c - c - d = c - d - add r5d, r3d ; r5d = (a + b) + (c + d) - add r3d, r3d - sub r3d, r5d ; r3d = (a + b) - (c + d) - add r4d, r2d ; r4d = (a - b) + (c - d) - add r2d, r2d - sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d] + ; get the transform results of top boundary samples: [a b c d] + add r3d, r2d ; r3d = a + b + add r5d, r4d ; r5d = c + d + add r2d, r2d ; r2d = a + a + add r4d, r4d ; r4d = c + c + sub r2d, r3d ; r2d = a + a - a - b = a - b + sub r4d, r5d ; r4d = c + c - c - d = c - d + add r5d, r3d ; r5d = (a + b) + (c + d) + add r3d, r3d + sub r3d, r5d ; r3d = (a + b) - (c + d) + add r4d, r2d ; r4d = (a - b) + (c - d) + add r2d, r2d + sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d] - movdqa xmm6, xmm0 - movdqa xmm7, xmm2 - movd xmm5, r5d ; store the edi for DC mode - pxor xmm3, xmm3 - pxor xmm4, xmm4 - pinsrw xmm3, r5d, 0 - pinsrw xmm3, r4d, 4 - psllw xmm3, 2 - pinsrw xmm4, r3d, 0 - pinsrw xmm4, r2d, 4 - psllw xmm4, 2 + movdqa xmm6, xmm0 + movdqa xmm7, xmm2 + movd xmm5, r5d ; store the edi for DC mode + pxor xmm3, xmm3 + pxor xmm4, xmm4 + pinsrw xmm3, r5d, 0 + pinsrw xmm3, r4d, 4 + psllw xmm3, 2 + pinsrw xmm4, r3d, 0 + pinsrw xmm4, r2d, 4 + psllw xmm4, 2 - ; get the satd of H - psubw xmm0, xmm3 - psubw xmm2, xmm4 + ; get the satd of H + psubw xmm0, xmm3 + psubw xmm2, xmm4 - WELS_AbsW xmm0, xmm1 - WELS_AbsW xmm2, xmm1 - paddusw xmm0, xmm2 - SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0 + WELS_AbsW xmm0, xmm1 + WELS_AbsW xmm2, xmm1 + paddusw xmm0, xmm2 + SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0 - ;load left boundary samples: [a b c d]' - add r0, r1 + ;load left boundary samples: [a b c d]' + add r0, r1 - movzx r2d, byte [r0-1] - movzx r3d, byte [r0+r1-1] - lea r0 , [r0+2*r1] - movzx r4d, byte [r0-1] - movzx r5d, byte [r0+r1-1] + movzx r2d, byte [r0-1] + movzx r3d, byte [r0+r1-1] + lea r0 , [r0+2*r1] + movzx r4d, byte [r0-1] + movzx r5d, byte [r0+r1-1] - ; get the transform results of left boundary samples: [a b c d]' - add r3d, r2d ; r3d = a + b - add r5d, r4d ; r5d = c + d - add r2d, r2d ; r2d = a + a - add r4d, r4d ; r4d = c + c - sub r2d, r3d ; r2d = a + a - a - b = a - b - sub r4d, r5d ; r4d = c + c - c - d = c - d - add r5d, r3d ; r5d = (a + b) + (c + d) - add r3d, r3d - sub r3d, r5d ; r3d = (a + b) - (c + d) - add r4d, r2d ; r4d = (a - b) + (c - d) - add r2d, r2d - sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d] + ; get the transform results of left boundary samples: [a b c d]' + add r3d, r2d ; r3d = a + b + add r5d, r4d ; r5d = c + d + add r2d, r2d ; r2d = a + a + add r4d, r4d ; r4d = c + c + sub r2d, 
r3d ; r2d = a + a - a - b = a - b + sub r4d, r5d ; r4d = c + c - c - d = c - d + add r5d, r3d ; r5d = (a + b) + (c + d) + add r3d, r3d + sub r3d, r5d ; r3d = (a + b) - (c + d) + add r4d, r2d ; r4d = (a - b) + (c - d) + add r2d, r2d + sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d] - ; store the transform results in xmm3 - movd xmm3, r5d - pinsrw xmm3, r3d, 1 - pinsrw xmm3, r2d, 2 - pinsrw xmm3, r4d, 3 - psllw xmm3, 2 + ; store the transform results in xmm3 + movd xmm3, r5d + pinsrw xmm3, r3d, 1 + pinsrw xmm3, r2d, 2 + pinsrw xmm3, r4d, 3 + psllw xmm3, 2 - ; get the satd of V - movdqa xmm2, xmm6 - movdqa xmm4, xmm7 - psubw xmm2, xmm3 - WELS_AbsW xmm2, xmm1 - WELS_AbsW xmm4, xmm1 - paddusw xmm2, xmm4 - SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2 + ; get the satd of V + movdqa xmm2, xmm6 + movdqa xmm4, xmm7 + psubw xmm2, xmm3 + WELS_AbsW xmm2, xmm1 + WELS_AbsW xmm4, xmm1 + paddusw xmm2, xmm4 + SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2 - ; DC result is stored in xmm1 - add r5d, 4 - movd xmm1, r5d - paddw xmm1, xmm5 - psrlw xmm1, 3 - movdqa xmm5, xmm1 - psllw xmm1, 4 + ; DC result is stored in xmm1 + add r5d, 4 + movd xmm1, r5d + paddw xmm1, xmm5 + psrlw xmm1, 3 + movdqa xmm5, xmm1 + psllw xmm1, 4 - ; get the satd of DC - psubw xmm6, xmm1 - WELS_AbsW xmm6, xmm1 - WELS_AbsW xmm7, xmm1 - paddusw xmm6, xmm7 - SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6 + ; get the satd of DC + psubw xmm6, xmm1 + WELS_AbsW xmm6, xmm1 + WELS_AbsW xmm7, xmm1 + paddusw xmm6, xmm7 + SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6 %ifdef UNIX64 - pop r5 - pop r4 + pop r5 + pop r4 %endif - ; comparing order: DC H V + ; comparing order: DC H V - mov r4, arg5 - movd r2d, xmm6 - movd r3d, xmm2 - movd r6d, xmm0 + mov r4, arg5 + movd r2d, xmm6 + movd r3d, xmm2 + movd r6d, xmm0 - and r2d, 0xffff - shr r2d, 1 - and r3d, 0xffff - shr r3d, 1 - and r6d, 0xffff - shr r6d, 1 - add r2d, dword arg7 - add r3d, dword arg8 - add r6d, dword arg9 - cmp r2w, r3w - jg near not_dc - cmp r2w, r6w - jg near not_dc_h + and r2d, 0xffff + shr r2d, 1 + and r3d, 0xffff + shr r3d, 1 + and r6d, 0xffff + shr r6d, 1 + add r2d, dword arg7 + add r3d, dword arg8 + add r6d, dword arg9 + cmp r2w, r3w + jg near not_dc + cmp r2w, r6w + jg near not_dc_h - ; for DC mode - movd r3d, xmm5 - imul r3d, 0x01010101 - movd xmm5, r3d - pshufd xmm5, xmm5, 0 - movdqa [r4], xmm5 - mov r5, arg6 - mov dword [r5], 0x02 - mov retrd, r2d - POP_XMM + ; for DC mode + movd r3d, xmm5 + imul r3d, 0x01010101 + movd xmm5, r3d + pshufd xmm5, xmm5, 0 + movdqa [r4], xmm5 + mov r5, arg6 + mov dword [r5], 0x02 + mov retrd, r2d + POP_XMM %ifdef X86_32 - pop r6 - pop r5 - pop r4 - pop r3 + pop r6 + pop r5 + pop r4 + pop r3 %endif - ret + ret not_dc: - cmp r3w, r6w - jg near not_dc_h + cmp r3w, r6w + jg near not_dc_h - ; for H mode - SSE_DB_1_2REG xmm6, xmm7 - sub r0, r1 - sub r0, r1 - movzx r6d, byte [r0-1] - movd xmm0, r6d - pmuludq xmm0, xmm6 + ; for H mode + SSE_DB_1_2REG xmm6, xmm7 + sub r0, r1 + sub r0, r1 + movzx r6d, byte [r0-1] + movd xmm0, r6d + pmuludq xmm0, xmm6 - movzx r6d, byte [r0+r1-1] - movd xmm1, r6d - pmuludq xmm1, xmm6 - punpckldq xmm0, xmm1 + movzx r6d, byte [r0+r1-1] + movd xmm1, r6d + pmuludq xmm1, xmm6 + punpckldq xmm0, xmm1 - lea r0, [r0+r1*2] - movzx r6d, byte [r0-1] - movd xmm2, r6d - pmuludq xmm2, xmm6 + lea r0, [r0+r1*2] + movzx r6d, byte [r0-1] + movd xmm2, r6d + pmuludq xmm2, xmm6 - movzx r6d, byte [r0+r1-1] - movd xmm3, r6d - pmuludq xmm3, xmm6 - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 
+ movzx r6d, byte [r0+r1-1] + movd xmm3, r6d + pmuludq xmm3, xmm6 + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 - movdqa [r4],xmm0 + movdqa [r4],xmm0 - mov retrd, r3d - mov r5, arg6 - mov dword [r5], 0x01 - POP_XMM + mov retrd, r3d + mov r5, arg6 + mov dword [r5], 0x01 + POP_XMM %ifdef X86_32 - pop r6 - pop r5 - pop r4 - pop r3 + pop r6 + pop r5 + pop r4 + pop r3 %endif - ret + ret not_dc_h: - sub r0, r1 - sub r0, r1 - sub r0, r1 - movd xmm0, [r0] - pshufd xmm0, xmm0, 0 - movdqa [r4],xmm0 - mov retrd, r6d - mov r5, arg6 - mov dword [r5], 0x00 - POP_XMM + sub r0, r1 + sub r0, r1 + sub r0, r1 + movd xmm0, [r0] + pshufd xmm0, xmm0, 0 + movdqa [r4],xmm0 + mov retrd, r6d + mov r5, arg6 + mov dword [r5], 0x00 + POP_XMM %ifdef X86_32 - pop r6 - pop r5 - pop r4 - pop r3 + pop r6 + pop r5 + pop r4 + pop r3 %endif - ret + ret %macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 - pmaddubsw %1, xmm5 - movdqa %2, %1 - pmaddwd %1, xmm7 - pmaddwd %2, xmm6 - movdqa %3, %1 - punpckldq %1, %2 - punpckhdq %2, %3 - movdqa %3, %1 - punpcklqdq %1, %2 - punpckhqdq %3, %2 - paddd xmm4, %1 ;for dc - paddd xmm4, %3 ;for dc - packssdw %1, %3 - psllw %1, 2 + pmaddubsw %1, xmm5 + movdqa %2, %1 + pmaddwd %1, xmm7 + pmaddwd %2, xmm6 + movdqa %3, %1 + punpckldq %1, %2 + punpckhdq %2, %3 + movdqa %3, %1 + punpcklqdq %1, %2 + punpckhqdq %3, %2 + paddd xmm4, %1 ;for dc + paddd xmm4, %3 ;for dc + packssdw %1, %3 + psllw %1, 2 %endmacro %macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2 - pmaddubsw %1, xmm5 - movdqa %2, %1 - pmaddwd %1, xmm7 - pmaddwd %2, xmm6 - movdqa %3, %1 - punpckldq %1, %2 - punpckhdq %2, %3 - movdqa %3, %1 - punpcklqdq %1, %2 - punpckhqdq %3, %2 + pmaddubsw %1, xmm5 + movdqa %2, %1 + pmaddwd %1, xmm7 + pmaddwd %2, xmm6 + movdqa %3, %1 + punpckldq %1, %2 + punpckhdq %2, %3 + movdqa %3, %1 + punpcklqdq %1, %2 + punpckhqdq %3, %2 ; paddd xmm4, %1 ;for dc -; paddd xmm4, %3 ;for dc - movdqa %4, %1 - punpcklqdq %4, %3 - packssdw %1, %3 - psllw %1, 2 +; paddd xmm4, %3 ;for dc + movdqa %4, %1 + punpcklqdq %4, %3 + packssdw %1, %3 + psllw %1, 2 %endmacro %macro SSE41_GetX38x4SatdDec 0 - pxor xmm7, xmm7 - movq xmm0, [r2] - movq xmm1, [r2+r3] - lea r2, [r2+2*r3] - movq xmm2, [r2] - movq xmm3, [r2+r3] - lea r2, [r2+2*r3] - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 - punpcklbw xmm2, xmm7 - punpcklbw xmm3, xmm7 - SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7 - SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7 - SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2 - ;doesn't need another transpose + pxor xmm7, xmm7 + movq xmm0, [r2] + movq xmm1, [r2+r3] + lea r2, [r2+2*r3] + movq xmm2, [r2] + movq xmm3, [r2+r3] + lea r2, [r2+2*r3] + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7 + SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7 + SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2 + ;doesn't need another transpose %endmacro %macro SSE41_GetX38x4SatdV 2 - pxor xmm0, xmm0 - pinsrw xmm0, word[r6+%2], 0 - pinsrw xmm0, word[r6+%2+8], 4 - psubsw xmm0, xmm7 - pabsw xmm0, xmm0 - paddw xmm4, xmm0 - pxor xmm0, xmm0 - pinsrw xmm0, word[r6+%2+2], 0 - pinsrw xmm0, word[r6+%2+10], 4 - psubsw xmm0, xmm1 - pabsw xmm0, xmm0 - paddw xmm4, xmm0 - pxor xmm0, xmm0 - pinsrw xmm0, word[r6+%2+4], 0 - pinsrw xmm0, word[r6+%2+12], 4 - psubsw xmm0, xmm3 - pabsw xmm0, xmm0 - paddw xmm4, xmm0 - pxor xmm0, xmm0 - pinsrw xmm0, word[r6+%2+6], 0 - 
pinsrw xmm0, word[r6+%2+14], 4 - psubsw xmm0, xmm2 - pabsw xmm0, xmm0 - paddw xmm4, xmm0 + pxor xmm0, xmm0 + pinsrw xmm0, word[r6+%2], 0 + pinsrw xmm0, word[r6+%2+8], 4 + psubsw xmm0, xmm7 + pabsw xmm0, xmm0 + paddw xmm4, xmm0 + pxor xmm0, xmm0 + pinsrw xmm0, word[r6+%2+2], 0 + pinsrw xmm0, word[r6+%2+10], 4 + psubsw xmm0, xmm1 + pabsw xmm0, xmm0 + paddw xmm4, xmm0 + pxor xmm0, xmm0 + pinsrw xmm0, word[r6+%2+4], 0 + pinsrw xmm0, word[r6+%2+12], 4 + psubsw xmm0, xmm3 + pabsw xmm0, xmm0 + paddw xmm4, xmm0 + pxor xmm0, xmm0 + pinsrw xmm0, word[r6+%2+6], 0 + pinsrw xmm0, word[r6+%2+14], 4 + psubsw xmm0, xmm2 + pabsw xmm0, xmm0 + paddw xmm4, xmm0 %endmacro %macro SSE41_GetX38x4SatdH 3 - movq xmm0, [r6+%3+8*%1] - punpcklqdq xmm0, xmm0 - psubsw xmm0, xmm7 - pabsw xmm0, xmm0 - paddw xmm5, xmm0 - pabsw xmm1, xmm1 - pabsw xmm2, xmm2 - pabsw xmm3, xmm3 - paddw xmm2, xmm1;for DC - paddw xmm2, xmm3;for DC - paddw xmm5, xmm2 + movq xmm0, [r6+%3+8*%1] + punpcklqdq xmm0, xmm0 + psubsw xmm0, xmm7 + pabsw xmm0, xmm0 + paddw xmm5, xmm0 + pabsw xmm1, xmm1 + pabsw xmm2, xmm2 + pabsw xmm3, xmm3 + paddw xmm2, xmm1;for DC + paddw xmm2, xmm3;for DC + paddw xmm5, xmm2 %endmacro %macro SSE41_I16X16GetX38x4SatdDC 0 - pxor xmm0, xmm0 - movq2dq xmm0, mm4 - punpcklqdq xmm0, xmm0 - psubsw xmm0, xmm7 - pabsw xmm0, xmm0 - paddw xmm6, xmm0 - paddw xmm6, xmm2 + pxor xmm0, xmm0 + movq2dq xmm0, mm4 + punpcklqdq xmm0, xmm0 + psubsw xmm0, xmm7 + pabsw xmm0, xmm0 + paddw xmm6, xmm0 + paddw xmm6, xmm2 %endmacro %macro SSE41_ChromaGetX38x4SatdDC 1 - shl %1, 4 - movdqa xmm0, [r6+32+%1] - psubsw xmm0, xmm7 - pabsw xmm0, xmm0 - paddw xmm6, xmm0 - paddw xmm6, xmm2 + shl %1, 4 + movdqa xmm0, [r6+32+%1] + psubsw xmm0, xmm7 + pabsw xmm0, xmm0 + paddw xmm6, xmm0 + paddw xmm6, xmm2 %endmacro %macro SSE41_I16x16GetX38x4Satd 2 - SSE41_GetX38x4SatdDec - SSE41_GetX38x4SatdV %1, %2 - SSE41_GetX38x4SatdH %1, %2, 32 - SSE41_I16X16GetX38x4SatdDC + SSE41_GetX38x4SatdDec + SSE41_GetX38x4SatdV %1, %2 + SSE41_GetX38x4SatdH %1, %2, 32 + SSE41_I16X16GetX38x4SatdDC %endmacro %macro SSE41_ChromaGetX38x4Satd 2 - SSE41_GetX38x4SatdDec - SSE41_GetX38x4SatdV %1, %2 - SSE41_GetX38x4SatdH %1, %2, 16 - SSE41_ChromaGetX38x4SatdDC %1 + SSE41_GetX38x4SatdDec + SSE41_GetX38x4SatdV %1, %2 + SSE41_GetX38x4SatdH %1, %2, 16 + SSE41_ChromaGetX38x4SatdDC %1 %endmacro %macro SSE41_HSum8W 3 - pmaddwd %1, %2 - movhlps %3, %1 - paddd %1, %3 - pshuflw %3, %1,0Eh - paddd %1, %3 + pmaddwd %1, %2 + movhlps %3, %1 + paddd %1, %3 + pshuflw %3, %1,0Eh + paddd %1, %3 %endmacro WELS_EXTERN WelsIntra16x16Combined3Satd_sse41 - %assign push_num 0 - LOAD_7_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r5, r5d + %assign push_num 0 + LOAD_7_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d %ifndef X86_32 - push r12 - mov r12, r2 + push r12 + mov r12, r2 %endif - pxor xmm4, xmm4 - movdqa xmm5, [HSumSubDB1] - movdqa xmm6, [HSumSubDW1] - movdqa xmm7, [PDW1] - sub r0, r1 - movdqu xmm0, [r0] - movhlps xmm1, xmm0 - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3 - SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3 - movdqa [r6], xmm0 ;V - movdqa [r6+16], xmm1 - add r0, r1 - pinsrb xmm0, byte[r0-1], 0 - pinsrb xmm0, byte[r0+r1-1], 1 - lea r0, [r0+2*r1] - pinsrb xmm0, byte[r0-1], 2 - pinsrb xmm0, byte[r0+r1-1], 3 - lea r0, [r0+2*r1] - pinsrb xmm0, byte[r0-1], 4 - pinsrb xmm0, byte[r0+r1-1], 5 - lea r0, [r0+2*r1] - pinsrb xmm0, byte[r0-1], 6 - pinsrb xmm0, byte[r0+r1-1], 7 - lea r0, 
[r0+2*r1] - pinsrb xmm0, byte[r0-1], 8 - pinsrb xmm0, byte[r0+r1-1], 9 - lea r0, [r0+2*r1] - pinsrb xmm0, byte[r0-1], 10 - pinsrb xmm0, byte[r0+r1-1], 11 - lea r0, [r0+2*r1] - pinsrb xmm0, byte[r0-1], 12 - pinsrb xmm0, byte[r0+r1-1], 13 - lea r0, [r0+2*r1] - pinsrb xmm0, byte[r0-1], 14 - pinsrb xmm0, byte[r0+r1-1], 15 - movhlps xmm1, xmm0 - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3 - SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3 - movdqa [r6+32], xmm0 ;H - movdqa [r6+48], xmm1 - movd r0d, xmm4 ;dc - add r0d, 16 ;(sum+16) - shr r0d, 5 ;((sum+16)>>5) - shl r0d, 4 ; - movd mm4, r0d ; mm4 copy DC - pxor xmm4, xmm4 ;V - pxor xmm5, xmm5 ;H - pxor xmm6, xmm6 ;DC + pxor xmm4, xmm4 + movdqa xmm5, [HSumSubDB1] + movdqa xmm6, [HSumSubDW1] + movdqa xmm7, [PDW1] + sub r0, r1 + movdqu xmm0, [r0] + movhlps xmm1, xmm0 + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3 + SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3 + movdqa [r6], xmm0 ;V + movdqa [r6+16], xmm1 + add r0, r1 + pinsrb xmm0, byte[r0-1], 0 + pinsrb xmm0, byte[r0+r1-1], 1 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 2 + pinsrb xmm0, byte[r0+r1-1], 3 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 4 + pinsrb xmm0, byte[r0+r1-1], 5 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 6 + pinsrb xmm0, byte[r0+r1-1], 7 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 8 + pinsrb xmm0, byte[r0+r1-1], 9 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 10 + pinsrb xmm0, byte[r0+r1-1], 11 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 12 + pinsrb xmm0, byte[r0+r1-1], 13 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 14 + pinsrb xmm0, byte[r0+r1-1], 15 + movhlps xmm1, xmm0 + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3 + SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3 + movdqa [r6+32], xmm0 ;H + movdqa [r6+48], xmm1 + movd r0d, xmm4 ;dc + add r0d, 16 ;(sum+16) + shr r0d, 5 ;((sum+16)>>5) + shl r0d, 4 ; + movd mm4, r0d ; mm4 copy DC + pxor xmm4, xmm4 ;V + pxor xmm5, xmm5 ;H + pxor xmm6, xmm6 ;DC %ifdef UNIX64 - push r4 + push r4 %endif - mov r0, 0 - mov r4, 0 + mov r0, 0 + mov r4, 0 .loop16x16_get_satd: .loopStart1: - SSE41_I16x16GetX38x4Satd r0, r4 - inc r0 - cmp r0, 4 - jl .loopStart1 - cmp r4, 16 - je .loop16x16_get_satd_end + SSE41_I16x16GetX38x4Satd r0, r4 + inc r0 + cmp r0, 4 + jl .loopStart1 + cmp r4, 16 + je .loop16x16_get_satd_end %ifdef X86_32 - mov r2, arg3 + mov r2, arg3 %else - mov r2, r12 + mov r2, r12 %endif - add r2, 8 - mov r0, 0 - add r4, 16 - jmp .loop16x16_get_satd + add r2, 8 + mov r0, 0 + add r4, 16 + jmp .loop16x16_get_satd .loop16x16_get_satd_end: - MMX_DW_1_2REG xmm0, xmm1 - psrlw xmm4, 1 ;/2 - psrlw xmm5, 1 ;/2 - psrlw xmm6, 1 ;/2 - SSE41_HSum8W xmm4, xmm0, xmm1 - SSE41_HSum8W xmm5, xmm0, xmm1 - SSE41_HSum8W xmm6, xmm0, xmm1 + MMX_DW_1_2REG xmm0, xmm1 + psrlw xmm4, 1 ;/2 + psrlw xmm5, 1 ;/2 + psrlw xmm6, 1 ;/2 + SSE41_HSum8W xmm4, xmm0, xmm1 + SSE41_HSum8W xmm5, xmm0, xmm1 + SSE41_HSum8W xmm6, xmm0, xmm1 %ifdef UNIX64 - pop r4 + pop r4 %endif - ; comparing order: DC H V - movd r3d, xmm6 ;DC - movd r1d, xmm5 ;H - movd r0d, xmm4 ;V + ; comparing order: DC H V + movd r3d, xmm6 ;DC + movd r1d, xmm5 ;H + movd r0d, xmm4 ;V %ifndef X86_32 - pop r12 + pop r12 %endif - shl r5d, 1 - add r1d, r5d - add r3d, r5d - mov r4, arg5 - cmp r3d, r1d - jge near not_dc_16x16 - cmp r3d, r0d - jge near not_dc_h_16x16 + shl r5d, 1 + add r1d, r5d + add r3d, r5d + mov r4, arg5 + cmp r3d, r1d + jge near not_dc_16x16 + cmp r3d, r0d + jge near 
not_dc_h_16x16 - ; for DC mode - mov dword[r4], 2;I16_PRED_DC - mov retrd, r3d - jmp near return_satd_intra_16x16_x3 + ; for DC mode + mov dword[r4], 2;I16_PRED_DC + mov retrd, r3d + jmp near return_satd_intra_16x16_x3 not_dc_16x16: - ; for H mode - cmp r1d, r0d - jge near not_dc_h_16x16 - mov dword[r4], 1;I16_PRED_H - mov retrd, r1d - jmp near return_satd_intra_16x16_x3 + ; for H mode + cmp r1d, r0d + jge near not_dc_h_16x16 + mov dword[r4], 1;I16_PRED_H + mov retrd, r1d + jmp near return_satd_intra_16x16_x3 not_dc_h_16x16: - ; for V mode - mov dword[r4], 0;I16_PRED_V - mov retrd, r0d + ; for V mode + mov dword[r4], 0;I16_PRED_V + mov retrd, r0d return_satd_intra_16x16_x3: - WELSEMMS - POP_XMM - LOAD_7_PARA_POP + WELSEMMS + POP_XMM + LOAD_7_PARA_POP ret %macro SSE41_ChromaGetX38x8Satd 0 - movdqa xmm5, [HSumSubDB1] - movdqa xmm6, [HSumSubDW1] - movdqa xmm7, [PDW1] - sub r0, r1 - movq xmm0, [r0] - punpcklqdq xmm0, xmm0 - SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4 - movdqa [r6], xmm0 ;V - add r0, r1 - pinsrb xmm0, byte[r0-1], 0 - pinsrb xmm0, byte[r0+r1-1], 1 - lea r0, [r0+2*r1] - pinsrb xmm0, byte[r0-1], 2 - pinsrb xmm0, byte[r0+r1-1], 3 - lea r0, [r0+2*r1] - pinsrb xmm0, byte[r0-1], 4 - pinsrb xmm0, byte[r0+r1-1], 5 - lea r0, [r0+2*r1] - pinsrb xmm0, byte[r0-1], 6 - pinsrb xmm0, byte[r0+r1-1], 7 - punpcklqdq xmm0, xmm0 - SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1 - movdqa [r6+16], xmm0 ;H + movdqa xmm5, [HSumSubDB1] + movdqa xmm6, [HSumSubDW1] + movdqa xmm7, [PDW1] + sub r0, r1 + movq xmm0, [r0] + punpcklqdq xmm0, xmm0 + SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4 + movdqa [r6], xmm0 ;V + add r0, r1 + pinsrb xmm0, byte[r0-1], 0 + pinsrb xmm0, byte[r0+r1-1], 1 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 2 + pinsrb xmm0, byte[r0+r1-1], 3 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 4 + pinsrb xmm0, byte[r0+r1-1], 5 + lea r0, [r0+2*r1] + pinsrb xmm0, byte[r0-1], 6 + pinsrb xmm0, byte[r0+r1-1], 7 + punpcklqdq xmm0, xmm0 + SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1 + movdqa [r6+16], xmm0 ;H ;(sum+2)>>2 - movdqa xmm6, [PDQ2] - movdqa xmm5, xmm4 - punpckhqdq xmm5, xmm1 - paddd xmm5, xmm6 - psrld xmm5, 2 + movdqa xmm6, [PDQ2] + movdqa xmm5, xmm4 + punpckhqdq xmm5, xmm1 + paddd xmm5, xmm6 + psrld xmm5, 2 ;(sum1+sum2+4)>>3 - paddd xmm6, xmm6 - paddd xmm4, xmm1 - paddd xmm4, xmm6 - psrld xmm4, 3 + paddd xmm6, xmm6 + paddd xmm4, xmm1 + paddd xmm4, xmm6 + psrld xmm4, 3 ;satd *16 - pslld xmm5, 4 - pslld xmm4, 4 + pslld xmm5, 4 + pslld xmm4, 4 ;temp satd - movdqa xmm6, xmm4 - punpcklqdq xmm4, xmm5 - psllq xmm4, 32 - psrlq xmm4, 32 - movdqa [r6+32], xmm4 - punpckhqdq xmm5, xmm6 - psllq xmm5, 32 - psrlq xmm5, 32 - movdqa [r6+48], xmm5 + movdqa xmm6, xmm4 + punpcklqdq xmm4, xmm5 + psllq xmm4, 32 + psrlq xmm4, 32 + movdqa [r6+32], xmm4 + punpckhqdq xmm5, xmm6 + psllq xmm5, 32 + psrlq xmm5, 32 + movdqa [r6+48], xmm5 - pxor xmm4, xmm4 ;V - pxor xmm5, xmm5 ;H - pxor xmm6, xmm6 ;DC - mov r0, 0 - SSE41_ChromaGetX38x4Satd r0, 0 - inc r0 - SSE41_ChromaGetX38x4Satd r0, 0 + pxor xmm4, xmm4 ;V + pxor xmm5, xmm5 ;H + pxor xmm6, xmm6 ;DC + mov r0, 0 + SSE41_ChromaGetX38x4Satd r0, 0 + inc r0 + SSE41_ChromaGetX38x4Satd r0, 0 %endmacro %macro SSEReg2MMX 3 - movdq2q %2, %1 - movhlps %1, %1 - movdq2q %3, %1 + movdq2q %2, %1 + movhlps %1, %1 + movdq2q %3, %1 %endmacro %macro MMXReg2SSE 4 - movq2dq %1, %3 - movq2dq %2, %4 - punpcklqdq %1, %2 + movq2dq %1, %3 + movq2dq %2, %4 + punpcklqdq %1, %2 %endmacro ;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41 WELS_EXTERN 
WelsIntraChroma8x8Combined3Satd_sse41 - %assign push_num 0 - LOAD_7_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r5, r5d + %assign push_num 0 + LOAD_7_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r5, r5d loop_chroma_satdx3: - SSE41_ChromaGetX38x8Satd - SSEReg2MMX xmm4, mm0,mm1 - SSEReg2MMX xmm5, mm2,mm3 - SSEReg2MMX xmm6, mm5,mm6 - mov r0, arg8 - mov r2, arg9 + SSE41_ChromaGetX38x8Satd + SSEReg2MMX xmm4, mm0,mm1 + SSEReg2MMX xmm5, mm2,mm3 + SSEReg2MMX xmm6, mm5,mm6 + mov r0, arg8 + mov r2, arg9 - SSE41_ChromaGetX38x8Satd + SSE41_ChromaGetX38x8Satd - MMXReg2SSE xmm0, xmm3, mm0, mm1 - MMXReg2SSE xmm1, xmm3, mm2, mm3 - MMXReg2SSE xmm2, xmm3, mm5, mm6 + MMXReg2SSE xmm0, xmm3, mm0, mm1 + MMXReg2SSE xmm1, xmm3, mm2, mm3 + MMXReg2SSE xmm2, xmm3, mm5, mm6 - paddw xmm4, xmm0 - paddw xmm5, xmm1 - paddw xmm6, xmm2 + paddw xmm4, xmm0 + paddw xmm5, xmm1 + paddw xmm6, xmm2 - MMX_DW_1_2REG xmm0, xmm1 - psrlw xmm4, 1 ;/2 - psrlw xmm5, 1 ;/2 - psrlw xmm6, 1 ;/2 - SSE41_HSum8W xmm4, xmm0, xmm1 - SSE41_HSum8W xmm5, xmm0, xmm1 - SSE41_HSum8W xmm6, xmm0, xmm1 - ; comparing order: DC H V - movd r3d, xmm6 ;DC - movd r1d, xmm5 ;H - movd r0d, xmm4 ;V + MMX_DW_1_2REG xmm0, xmm1 + psrlw xmm4, 1 ;/2 + psrlw xmm5, 1 ;/2 + psrlw xmm6, 1 ;/2 + SSE41_HSum8W xmm4, xmm0, xmm1 + SSE41_HSum8W xmm5, xmm0, xmm1 + SSE41_HSum8W xmm6, xmm0, xmm1 + ; comparing order: DC H V + movd r3d, xmm6 ;DC + movd r1d, xmm5 ;H + movd r0d, xmm4 ;V - shl r5d, 1 - add r1d, r5d - add r0d, r5d - cmp r3d, r1d - jge near not_dc_8x8 - cmp r3d, r0d - jge near not_dc_h_8x8 + shl r5d, 1 + add r1d, r5d + add r0d, r5d + cmp r3d, r1d + jge near not_dc_8x8 + cmp r3d, r0d + jge near not_dc_h_8x8 - ; for DC mode - mov dword[r4], 0;I8_PRED_DC - mov retrd, r3d - jmp near return_satd_intra_8x8_x3 + ; for DC mode + mov dword[r4], 0;I8_PRED_DC + mov retrd, r3d + jmp near return_satd_intra_8x8_x3 not_dc_8x8: - ; for H mode - cmp r1d, r0d - jge near not_dc_h_8x8 - mov dword[r4], 1;I8_PRED_H - mov retrd, r1d - jmp near return_satd_intra_8x8_x3 + ; for H mode + cmp r1d, r0d + jge near not_dc_h_8x8 + mov dword[r4], 1;I8_PRED_H + mov retrd, r1d + jmp near return_satd_intra_8x8_x3 not_dc_h_8x8: - ; for V mode - mov dword[r4], 2;I8_PRED_V - mov retrd, r0d + ; for V mode + mov dword[r4], 2;I8_PRED_V + mov retrd, r0d return_satd_intra_8x8_x3: - WELSEMMS - POP_XMM - LOAD_7_PARA_POP + WELSEMMS + POP_XMM + LOAD_7_PARA_POP ret @@ -1040,22 +1040,22 @@ ret ; ;*********************************************************************** %macro SSSE3_Get16BSadHVDC 2 - movd xmm6,%1 - pshufb xmm6,xmm1 - movdqa %1, xmm6 - movdqa xmm0,%2 - psadbw xmm0,xmm7 - paddw xmm4,xmm0 - movdqa xmm0,%2 - psadbw xmm0,xmm5 - paddw xmm2,xmm0 - psadbw xmm6,%2 - paddw xmm3,xmm6 + movd xmm6,%1 + pshufb xmm6,xmm1 + movdqa %1, xmm6 + movdqa xmm0,%2 + psadbw xmm0,xmm7 + paddw xmm4,xmm0 + movdqa xmm0,%2 + psadbw xmm0,xmm5 + paddw xmm2,xmm0 + psadbw xmm6,%2 + paddw xmm3,xmm6 %endmacro %macro WelsAddDCValue 4 - movzx %2, byte %1 - mov %3, %2 - add %4, %2 + movzx %2, byte %1 + mov %3, %2 + add %4, %2 %endmacro ;*********************************************************************** @@ -1064,138 +1064,138 @@ ret ; ;*********************************************************************** WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3 - %assign push_num 0 - LOAD_7_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r5, r5d + %assign push_num 0 + LOAD_7_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION 
r3, r3d + SIGN_EXTENSION r5, r5d - push r5 - push r4 - push r3 + push r5 + push r4 + push r3 - sub r0, r1 - movdqa xmm5,[r0] - pxor xmm0,xmm0 - psadbw xmm0,xmm5 - movhlps xmm1,xmm0 - paddw xmm0,xmm1 - movd r5d, xmm0 + sub r0, r1 + movdqa xmm5,[r0] + pxor xmm0,xmm0 + psadbw xmm0,xmm5 + movhlps xmm1,xmm0 + paddw xmm0,xmm1 + movd r5d, xmm0 - add r0,r1 - lea r3,[r1+2*r1] ;ebx r3 - WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d - WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d - WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d - WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d - lea r0, [r0+4*r1] - add r6, 64 - WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d - WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d - WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d - WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d - lea r0, [r0+4*r1] - add r6, 64 - WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d - WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d - WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d - WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d - lea r0, [r0+4*r1] - add r6, 64 - WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d - WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d - WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d - WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d - sub r6, 192 - add r5d,10h - shr r5d,5 - movd xmm7,r5d - pxor xmm1,xmm1 - pshufb xmm7,xmm1 - pxor xmm4,xmm4 - pxor xmm3,xmm3 - pxor xmm2,xmm2 - ;sad begin - pop r3 - lea r4, [r3+2*r3] ;esi r4 - SSSE3_Get16BSadHVDC [r6], [r2] - SSSE3_Get16BSadHVDC [r6+16], [r2+r3] - SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] - SSSE3_Get16BSadHVDC [r6+48], [r2+r4] - add r6, 64 - lea r2, [r2+4*r3] - SSSE3_Get16BSadHVDC [r6], [r2] - SSSE3_Get16BSadHVDC [r6+16], [r2+r3] - SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] - SSSE3_Get16BSadHVDC [r6+48], [r2+r4] - add r6, 64 - lea r2, [r2+4*r3] - SSSE3_Get16BSadHVDC [r6], [r2] - SSSE3_Get16BSadHVDC [r6+16], [r2+r3] - SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] - SSSE3_Get16BSadHVDC [r6+48], [r2+r4] - add r6, 64 - lea r2, [r2+4*r3] - SSSE3_Get16BSadHVDC [r6], [r2] - SSSE3_Get16BSadHVDC [r6+16], [r2+r3] - SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] - SSSE3_Get16BSadHVDC [r6+48], [r2+r4] + add r0,r1 + lea r3,[r1+2*r1] ;ebx r3 + WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d + WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d + WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d + WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d + lea r0, [r0+4*r1] + add r6, 64 + WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d + WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d + WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d + WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d + lea r0, [r0+4*r1] + add r6, 64 + WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d + WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d + WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d + WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d + lea r0, [r0+4*r1] + add r6, 64 + WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d + WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d + WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d + WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d + sub r6, 192 + add r5d,10h + shr r5d,5 + movd xmm7,r5d + pxor xmm1,xmm1 + pshufb xmm7,xmm1 + pxor xmm4,xmm4 + pxor xmm3,xmm3 + pxor xmm2,xmm2 + ;sad begin + pop r3 + lea r4, [r3+2*r3] ;esi r4 + SSSE3_Get16BSadHVDC [r6], [r2] + SSSE3_Get16BSadHVDC [r6+16], [r2+r3] + SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] + SSSE3_Get16BSadHVDC [r6+48], [r2+r4] + add r6, 64 + lea r2, [r2+4*r3] + SSSE3_Get16BSadHVDC [r6], [r2] + SSSE3_Get16BSadHVDC [r6+16], [r2+r3] + SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] + SSSE3_Get16BSadHVDC [r6+48], [r2+r4] 
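Note on the hunk above: the add r5d,10h / shr r5d,5 pair forms the standard H.264 luma 16x16 DC predictor from the 16 top neighbours (summed with psadbw) and the 16 left neighbours (gathered one byte at a time through WelsAddDCValue, which also stages them for the H-prediction SAD). A minimal scalar sketch of that computation, assuming both neighbour edges are available; the helper name is illustrative and not part of this patch:

#include <stdint.h>

/* Scalar sketch: sum the 16 samples above and the 16 samples to the left of
 * the macroblock, then round and shift, matching add r5d,10h / shr r5d,5. */
static uint8_t Intra16x16DcPredictor(const uint8_t* top, const uint8_t* left,
                                     int left_stride) {
    int32_t sum = 0;
    for (int i = 0; i < 16; ++i) {
        sum += top[i];                 /* row above the macroblock */
        sum += left[i * left_stride];  /* column left of the macroblock */
    }
    return (uint8_t)((sum + 16) >> 5);
}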
+ add r6, 64 + lea r2, [r2+4*r3] + SSSE3_Get16BSadHVDC [r6], [r2] + SSSE3_Get16BSadHVDC [r6+16], [r2+r3] + SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] + SSSE3_Get16BSadHVDC [r6+48], [r2+r4] + add r6, 64 + lea r2, [r2+4*r3] + SSSE3_Get16BSadHVDC [r6], [r2] + SSSE3_Get16BSadHVDC [r6+16], [r2+r3] + SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3] + SSSE3_Get16BSadHVDC [r6+48], [r2+r4] - pop r4 - pop r5 - pslldq xmm3,4 - por xmm3,xmm2 - movhlps xmm1,xmm3 - paddw xmm3,xmm1 - movhlps xmm0,xmm4 - paddw xmm4,xmm0 - ; comparing order: DC H V - movd r1d, xmm4 ;DC ;ebx r1d - movd r0d, xmm3 ;V ;ecx r0d - psrldq xmm3, 4 - movd r2d, xmm3 ;H ;esi r2d + pop r4 + pop r5 + pslldq xmm3,4 + por xmm3,xmm2 + movhlps xmm1,xmm3 + paddw xmm3,xmm1 + movhlps xmm0,xmm4 + paddw xmm4,xmm0 + ; comparing order: DC H V + movd r1d, xmm4 ;DC ;ebx r1d + movd r0d, xmm3 ;V ;ecx r0d + psrldq xmm3, 4 + movd r2d, xmm3 ;H ;esi r2d - ;mov eax, [esp+36] ;lamda ;eax r5 - shl r5d, 1 - add r2d, r5d - add r1d, r5d - ;mov edx, [esp+32] ;edx r4 - cmp r1d, r2d - jge near not_dc_16x16_sad - cmp r1d, r0d - jge near not_dc_h_16x16_sad - ; for DC mode - mov dword[r4], 2;I16_PRED_DC - mov retrd, r1d - sub r6, 192 + ;mov eax, [esp+36] ;lamda ;eax r5 + shl r5d, 1 + add r2d, r5d + add r1d, r5d + ;mov edx, [esp+32] ;edx r4 + cmp r1d, r2d + jge near not_dc_16x16_sad + cmp r1d, r0d + jge near not_dc_h_16x16_sad + ; for DC mode + mov dword[r4], 2;I16_PRED_DC + mov retrd, r1d + sub r6, 192 %assign x 0 %rep 16 - movdqa [r6+16*x], xmm7 + movdqa [r6+16*x], xmm7 %assign x x+1 %endrep - jmp near return_sad_intra_16x16_x3 + jmp near return_sad_intra_16x16_x3 not_dc_16x16_sad: - ; for H mode - cmp r2d, r0d - jge near not_dc_h_16x16_sad - mov dword[r4], 1;I16_PRED_H - mov retrd, r2d - jmp near return_sad_intra_16x16_x3 + ; for H mode + cmp r2d, r0d + jge near not_dc_h_16x16_sad + mov dword[r4], 1;I16_PRED_H + mov retrd, r2d + jmp near return_sad_intra_16x16_x3 not_dc_h_16x16_sad: - ; for V mode - mov dword[r4], 0;I16_PRED_V - mov retrd, r0d - sub r6, 192 + ; for V mode + mov dword[r4], 0;I16_PRED_V + mov retrd, r0d + sub r6, 192 %assign x 0 %rep 16 - movdqa [r6+16*x], xmm5 + movdqa [r6+16*x], xmm5 %assign x x+1 %endrep return_sad_intra_16x16_x3: - POP_XMM - LOAD_7_PARA_POP - ret + POP_XMM + LOAD_7_PARA_POP + ret ;*********************************************************************** ; @@ -1210,63 +1210,63 @@ return_sad_intra_16x16_x3: ;SSE4.1 %macro SSE41_GetSatd8x4 0 - movq xmm0, [r0] - punpcklqdq xmm0, xmm0 - pmaddubsw xmm0, xmm7 - movq xmm1, [r0+r1] - punpcklqdq xmm1, xmm1 - pmaddubsw xmm1, xmm7 - movq xmm2, [r2] - punpcklqdq xmm2, xmm2 - pmaddubsw xmm2, xmm7 - movq xmm3, [r2+r3] - punpcklqdq xmm3, xmm3 - pmaddubsw xmm3, xmm7 - psubsw xmm0, xmm2 - psubsw xmm1, xmm3 - movq xmm2, [r0+2*r1] - punpcklqdq xmm2, xmm2 - pmaddubsw xmm2, xmm7 - movq xmm3, [r0+r4] - punpcklqdq xmm3, xmm3 - pmaddubsw xmm3, xmm7 - movq xmm4, [r2+2*r3] - punpcklqdq xmm4, xmm4 - pmaddubsw xmm4, xmm7 - movq xmm5, [r2+r5] - punpcklqdq xmm5, xmm5 - pmaddubsw xmm5, xmm7 - psubsw xmm2, xmm4 - psubsw xmm3, xmm5 - SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4 - pabsw xmm0, xmm0 - pabsw xmm2, xmm2 - pabsw xmm1, xmm1 - pabsw xmm3, xmm3 - movdqa xmm4, xmm3 - pblendw xmm3, xmm1, 0xAA - pslld xmm1, 16 - psrld xmm4, 16 - por xmm1, xmm4 - pmaxuw xmm1, xmm3 - paddw xmm6, xmm1 - movdqa xmm4, xmm0 - pblendw xmm0, xmm2, 0xAA - pslld xmm2, 16 - psrld xmm4, 16 - por xmm2, xmm4 - pmaxuw xmm0, xmm2 - paddw xmm6, xmm0 + movq xmm0, [r0] + punpcklqdq xmm0, xmm0 + pmaddubsw xmm0, xmm7 + movq xmm1, [r0+r1] + punpcklqdq xmm1, xmm1 
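Note on the "comparing order: DC H V" tail above: it is the actual mode decision of the combined 16x16 routines. A 2*lambda penalty is added to the DC and H costs, then the code branches through not_dc_16x16(_sad) and not_dc_h_16x16(_sad) to keep the cheapest predictor and writes 2/1/0 (I16_PRED_DC/H/V) through r4. A scalar sketch that mirrors that branch order; the function and variable names are mine, not the project's:

/* Scalar sketch of the DC/H/V selection used by the 16x16 combined SAD/SATD
 * routines.  Stores the chosen mode (2 = I16_PRED_DC, 1 = I16_PRED_H,
 * 0 = I16_PRED_V) and returns the winning, lambda-adjusted cost. */
static int PickIntra16x16Mode(int cost_dc, int cost_h, int cost_v,
                              int lambda, int* mode) {
    cost_dc += 2 * lambda;                                  /* shl r5d,1 + add */
    cost_h  += 2 * lambda;
    if (cost_dc >= cost_h) {                                /* not_dc_16x16 */
        if (cost_h >= cost_v) { *mode = 0; return cost_v; } /* V wins */
        *mode = 1; return cost_h;                           /* H wins */
    }
    if (cost_dc >= cost_v) { *mode = 0; return cost_v; }    /* not_dc_h_16x16 */
    *mode = 2; return cost_dc;                              /* DC wins */
}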
+ pmaddubsw xmm1, xmm7 + movq xmm2, [r2] + punpcklqdq xmm2, xmm2 + pmaddubsw xmm2, xmm7 + movq xmm3, [r2+r3] + punpcklqdq xmm3, xmm3 + pmaddubsw xmm3, xmm7 + psubsw xmm0, xmm2 + psubsw xmm1, xmm3 + movq xmm2, [r0+2*r1] + punpcklqdq xmm2, xmm2 + pmaddubsw xmm2, xmm7 + movq xmm3, [r0+r4] + punpcklqdq xmm3, xmm3 + pmaddubsw xmm3, xmm7 + movq xmm4, [r2+2*r3] + punpcklqdq xmm4, xmm4 + pmaddubsw xmm4, xmm7 + movq xmm5, [r2+r5] + punpcklqdq xmm5, xmm5 + pmaddubsw xmm5, xmm7 + psubsw xmm2, xmm4 + psubsw xmm3, xmm5 + SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4 + pabsw xmm0, xmm0 + pabsw xmm2, xmm2 + pabsw xmm1, xmm1 + pabsw xmm3, xmm3 + movdqa xmm4, xmm3 + pblendw xmm3, xmm1, 0xAA + pslld xmm1, 16 + psrld xmm4, 16 + por xmm1, xmm4 + pmaxuw xmm1, xmm3 + paddw xmm6, xmm1 + movdqa xmm4, xmm0 + pblendw xmm0, xmm2, 0xAA + pslld xmm2, 16 + psrld xmm4, 16 + por xmm2, xmm4 + pmaxuw xmm0, xmm2 + paddw xmm6, xmm0 %endmacro %macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE - MMX_DW_1_2REG %3, %4 - pmaddwd %2, %3 - movhlps %4, %2 - paddd %2, %4 - pshuflw %4, %2,0Eh - paddd %2, %4 - movd %1, %2 + MMX_DW_1_2REG %3, %4 + pmaddwd %2, %3 + movhlps %4, %2 + paddd %2, %4 + pshuflw %4, %2,0Eh + paddd %2, %4 + movd %1, %2 %endmacro ;*********************************************************************** ; @@ -1274,53 +1274,53 @@ return_sad_intra_16x16_x3: ; ;*********************************************************************** WELS_EXTERN WelsSampleSatd4x4_sse41 - %assign push_num 0 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - movdqa xmm4,[HSwapSumSubDB1] - movd xmm2,[r2] - movd xmm5,[r2+r3] - shufps xmm2,xmm5,0 - movd xmm3,[r2+r3*2] - lea r2, [r3*2+r2] - movd xmm5,[r2+r3] - shufps xmm3,xmm5,0 - movd xmm0,[r0] - movd xmm5,[r0+r1] - shufps xmm0,xmm5,0 - movd xmm1,[r0+r1*2] - lea r0, [r1*2+r0] - movd xmm5,[r0+r1] - shufps xmm1,xmm5,0 - pmaddubsw xmm0,xmm4 - pmaddubsw xmm1,xmm4 - pmaddubsw xmm2,xmm4 - pmaddubsw xmm3,xmm4 - psubw xmm0,xmm2 - psubw xmm1,xmm3 - movdqa xmm2,xmm0 - paddw xmm0,xmm1 - psubw xmm1,xmm2 - movdqa xmm2,xmm0 - punpcklqdq xmm0,xmm1 - punpckhqdq xmm2,xmm1 - movdqa xmm1,xmm0 - paddw xmm0,xmm2 - psubw xmm2,xmm1 - movdqa xmm1,xmm0 - pblendw xmm0,xmm2,0AAh - pslld xmm2,16 - psrld xmm1,16 - por xmm2,xmm1 - pabsw xmm0,xmm0 - pabsw xmm2,xmm2 - pmaxsw xmm0,xmm2 - SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7 - POP_XMM - LOAD_4_PARA_POP - ret + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + movdqa xmm4,[HSwapSumSubDB1] + movd xmm2,[r2] + movd xmm5,[r2+r3] + shufps xmm2,xmm5,0 + movd xmm3,[r2+r3*2] + lea r2, [r3*2+r2] + movd xmm5,[r2+r3] + shufps xmm3,xmm5,0 + movd xmm0,[r0] + movd xmm5,[r0+r1] + shufps xmm0,xmm5,0 + movd xmm1,[r0+r1*2] + lea r0, [r1*2+r0] + movd xmm5,[r0+r1] + shufps xmm1,xmm5,0 + pmaddubsw xmm0,xmm4 + pmaddubsw xmm1,xmm4 + pmaddubsw xmm2,xmm4 + pmaddubsw xmm3,xmm4 + psubw xmm0,xmm2 + psubw xmm1,xmm3 + movdqa xmm2,xmm0 + paddw xmm0,xmm1 + psubw xmm1,xmm2 + movdqa xmm2,xmm0 + punpcklqdq xmm0,xmm1 + punpckhqdq xmm2,xmm1 + movdqa xmm1,xmm0 + paddw xmm0,xmm2 + psubw xmm2,xmm1 + movdqa xmm1,xmm0 + pblendw xmm0,xmm2,0AAh + pslld xmm2,16 + psrld xmm1,16 + por xmm2,xmm1 + pabsw xmm0,xmm0 + pabsw xmm2,xmm2 + pmaxsw xmm0,xmm2 + SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7 + POP_XMM + LOAD_4_PARA_POP + ret ;*********************************************************************** ; @@ -1329,30 +1329,30 @@ WELS_EXTERN WelsSampleSatd4x4_sse41 ;*********************************************************************** 
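Note: WelsSampleSatd4x4_sse41 above, like the WelsSampleSatd8x8/8x16/16x8/16x16 functions that follow, scores a block by SATD: the residual is run through a 4x4 Hadamard transform and the absolute coefficients are summed, which is what the pmaddubsw/pblendw/pmaxsw sequences compute in packed form. A plain scalar reference for a single 4x4 block is sketched below; it is an illustrative textbook version, so its final scaling is not guaranteed to match what the SSE4.1 code returns:

#include <stdint.h>
#include <stdlib.h>

/* Scalar 4x4 SATD sketch: forward 4x4 Hadamard on the residual, then the sum
 * of absolute coefficients.  The assembly interleaves two 4x4 transforms per
 * call and applies its own normalization, so treat this as illustrative. */
static int Satd4x4_c(const uint8_t* a, int stride_a,
                     const uint8_t* b, int stride_b) {
    int d[16], t[16], sum = 0;
    for (int y = 0; y < 4; ++y)
        for (int x = 0; x < 4; ++x)
            d[4 * y + x] = a[y * stride_a + x] - b[y * stride_b + x];
    for (int y = 0; y < 4; ++y) {       /* horizontal butterflies */
        int s0 = d[4 * y + 0] + d[4 * y + 1], s1 = d[4 * y + 0] - d[4 * y + 1];
        int s2 = d[4 * y + 2] + d[4 * y + 3], s3 = d[4 * y + 2] - d[4 * y + 3];
        t[4 * y + 0] = s0 + s2; t[4 * y + 1] = s1 + s3;
        t[4 * y + 2] = s0 - s2; t[4 * y + 3] = s1 - s3;
    }
    for (int x = 0; x < 4; ++x) {       /* vertical butterflies + accumulate */
        int s0 = t[x] + t[x + 4],      s1 = t[x] - t[x + 4];
        int s2 = t[x + 8] + t[x + 12], s3 = t[x + 8] - t[x + 12];
        sum += abs(s0 + s2) + abs(s1 + s3) + abs(s0 - s2) + abs(s1 - s3);
    }
    return sum >> 1;                    /* common SATD normalization */
}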
WELS_EXTERN WelsSampleSatd8x8_sse41 %ifdef X86_32 - push r4 - push r5 + push r4 + push r5 %endif - %assign push_num 2 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - movdqa xmm7, [HSumSubDB1] - lea r4, [r1+r1*2] - lea r5, [r3+r3*2] - pxor xmm6, xmm6 - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE41_GetSatd8x4 - SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 - POP_XMM - LOAD_4_PARA_POP + %assign push_num 2 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + movdqa xmm7, [HSumSubDB1] + lea r4, [r1+r1*2] + lea r5, [r3+r3*2] + pxor xmm6, xmm6 + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE41_GetSatd8x4 + SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 + POP_XMM + LOAD_4_PARA_POP %ifdef X86_32 - pop r5 - pop r4 + pop r5 + pop r4 %endif - ret + ret ;*********************************************************************** ; @@ -1361,36 +1361,36 @@ WELS_EXTERN WelsSampleSatd8x8_sse41 ;*********************************************************************** WELS_EXTERN WelsSampleSatd8x16_sse41 %ifdef X86_32 - push r4 - push r5 - push r6 + push r4 + push r5 + push r6 %endif - %assign push_num 3 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - movdqa xmm7, [HSumSubDB1] - lea r4, [r1+r1*2] - lea r5, [r3+r3*2] - pxor xmm6, xmm6 - mov r6, 0 + %assign push_num 3 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + movdqa xmm7, [HSumSubDB1] + lea r4, [r1+r1*2] + lea r5, [r3+r3*2] + pxor xmm6, xmm6 + mov r6, 0 loop_get_satd_8x16: - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - inc r6 - cmp r6, 4 - jl loop_get_satd_8x16 - SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 - POP_XMM - LOAD_4_PARA_POP + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + inc r6 + cmp r6, 4 + jl loop_get_satd_8x16 + SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 + POP_XMM + LOAD_4_PARA_POP %ifdef X86_32 - pop r6 - pop r5 - pop r4 + pop r6 + pop r5 + pop r4 %endif - ret + ret ;*********************************************************************** ; @@ -1399,42 +1399,42 @@ loop_get_satd_8x16: ;*********************************************************************** WELS_EXTERN WelsSampleSatd16x8_sse41 %ifdef X86_32 - push r4 - push r5 + push r4 + push r5 %endif - %assign push_num 2 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - push r0 - push r2 + %assign push_num 2 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + push r0 + push r2 - movdqa xmm7, [HSumSubDB1] - lea r4, [r1+r1*2] - lea r5, [r3+r3*2] - pxor xmm6, xmm6 - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE41_GetSatd8x4 + movdqa xmm7, [HSumSubDB1] + lea r4, [r1+r1*2] + lea r5, [r3+r3*2] + pxor xmm6, xmm6 + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE41_GetSatd8x4 - pop r2 - pop r0 - add r0, 8 - add r2, 8 - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE41_GetSatd8x4 - SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 - POP_XMM - LOAD_4_PARA_POP + pop r2 + pop r0 + add r0, 8 + add r2, 8 + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE41_GetSatd8x4 + SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 + POP_XMM + LOAD_4_PARA_POP %ifdef X86_32 - pop r5 - pop r4 + pop r5 + pop r4 %endif - ret + ret ;*********************************************************************** ; @@ -1444,53 +1444,53 @@ WELS_EXTERN WelsSampleSatd16x8_sse41 WELS_EXTERN WelsSampleSatd16x16_sse41 %ifdef X86_32 - push r4 - push r5 - push r6 + push r4 + push 
r5 + push r6 %endif - %assign push_num 3 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d + %assign push_num 3 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d - push r0 - push r2 + push r0 + push r2 - movdqa xmm7, [HSumSubDB1] - lea r4, [r1+r1*2] - lea r5, [r3+r3*2] - pxor xmm6, xmm6 - mov r6, 0 + movdqa xmm7, [HSumSubDB1] + lea r4, [r1+r1*2] + lea r5, [r3+r3*2] + pxor xmm6, xmm6 + mov r6, 0 loop_get_satd_16x16_left: - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - inc r6 - cmp r6, 4 - jl loop_get_satd_16x16_left + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + inc r6 + cmp r6, 4 + jl loop_get_satd_16x16_left - pop r2 - pop r0 - add r0, 8 - add r2, 8 - mov r6, 0 + pop r2 + pop r0 + add r0, 8 + add r2, 8 + mov r6, 0 loop_get_satd_16x16_right: - SSE41_GetSatd8x4 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - inc r6 - cmp r6, 4 - jl loop_get_satd_16x16_right - SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 - POP_XMM - LOAD_4_PARA_POP + SSE41_GetSatd8x4 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + inc r6 + cmp r6, 4 + jl loop_get_satd_16x16_right + SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 + POP_XMM + LOAD_4_PARA_POP %ifdef X86_32 - pop r6 - pop r5 - pop r4 + pop r6 + pop r5 + pop r4 %endif - ret + ret ;*********************************************************************** ; @@ -1505,55 +1505,55 @@ loop_get_satd_16x16_right: ;*********************************************************************** %macro SSE2_GetSad2x16 0 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqu xmm1, [r2] - MOVDQ xmm2, [r0];[eax] must aligned 16 - psadbw xmm1, xmm2 - paddw xmm0, xmm1 - movdqu xmm1, [r2+r3] - MOVDQ xmm2, [r0+r1] - psadbw xmm1, xmm2 - paddw xmm0, xmm1 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqu xmm1, [r2] + MOVDQ xmm2, [r0];[eax] must aligned 16 + psadbw xmm1, xmm2 + paddw xmm0, xmm1 + movdqu xmm1, [r2+r3] + MOVDQ xmm2, [r0+r1] + psadbw xmm1, xmm2 + paddw xmm0, xmm1 %endmacro %macro SSE2_GetSad4x16 0 - movdqu xmm0, [r2] - MOVDQ xmm2, [r0] - psadbw xmm0, xmm2 - paddw xmm7, xmm0 - movdqu xmm1, [r2+r3] - MOVDQ xmm2, [r0+r1] - psadbw xmm1, xmm2 - paddw xmm7, xmm1 - movdqu xmm1, [r2+2*r3] - MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16 - psadbw xmm1, xmm2 - paddw xmm7, xmm1 - movdqu xmm1, [r2+r5] - MOVDQ xmm2, [r0+r4] - psadbw xmm1, xmm2 - paddw xmm7, xmm1 + movdqu xmm0, [r2] + MOVDQ xmm2, [r0] + psadbw xmm0, xmm2 + paddw xmm7, xmm0 + movdqu xmm1, [r2+r3] + MOVDQ xmm2, [r0+r1] + psadbw xmm1, xmm2 + paddw xmm7, xmm1 + movdqu xmm1, [r2+2*r3] + MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16 + psadbw xmm1, xmm2 + paddw xmm7, xmm1 + movdqu xmm1, [r2+r5] + MOVDQ xmm2, [r0+r4] + psadbw xmm1, xmm2 + paddw xmm7, xmm1 %endmacro %macro SSE2_GetSad8x4 0 - movq xmm0, [r0] - movq xmm1, [r0+r1] - lea r0, [r0+2*r1] - movhps xmm0, [r0] - movhps xmm1, [r0+r1] + movq xmm0, [r0] + movq xmm1, [r0+r1] + lea r0, [r0+2*r1] + movhps xmm0, [r0] + movhps xmm1, [r0+r1] - movq xmm2, [r2] - movq xmm3, [r2+r3] - lea r2, [r2+2*r3] - movhps xmm2, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm2 - psadbw xmm1, xmm3 - paddw xmm6, xmm0 - paddw xmm6, xmm1 + movq xmm2, [r2] + movq xmm3, [r2+r3] + lea r2, [r2+2*r3] + movhps xmm2, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm2 + psadbw xmm1, xmm3 + paddw xmm6, xmm0 + paddw xmm6, xmm1 %endmacro ;*********************************************************************** @@ -1565,39 +1565,39 @@ loop_get_satd_16x16_right: ;*********************************************************************** WELS_EXTERN WelsSampleSad16x16_sse2 %ifdef 
X86_32 - push r4 - push r5 + push r4 + push r5 %endif - %assign push_num 2 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - lea r4, [3*r1] - lea r5, [3*r3] + %assign push_num 2 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + lea r4, [3*r1] + lea r5, [3*r3] - pxor xmm7, xmm7 - SSE2_GetSad4x16 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE2_GetSad4x16 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE2_GetSad4x16 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - SSE2_GetSad4x16 - movhlps xmm0, xmm7 - paddw xmm0, xmm7 - movd retrd, xmm0 - POP_XMM - LOAD_4_PARA_POP + pxor xmm7, xmm7 + SSE2_GetSad4x16 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE2_GetSad4x16 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE2_GetSad4x16 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + SSE2_GetSad4x16 + movhlps xmm0, xmm7 + paddw xmm0, xmm7 + movd retrd, xmm0 + POP_XMM + LOAD_4_PARA_POP %ifdef X86_32 - pop r5 - pop r4 + pop r5 + pop r4 %endif - ret + ret ;*********************************************************************** ; @@ -1607,55 +1607,55 @@ WELS_EXTERN WelsSampleSad16x16_sse2 ; ;*********************************************************************** WELS_EXTERN WelsSampleSad16x8_sse2 - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - movdqu xmm0, [r2] - MOVDQ xmm2, [r0] - psadbw xmm0, xmm2 - movdqu xmm1, [r2+r3] - MOVDQ xmm2, [r0+r1] - psadbw xmm1, xmm2 - paddw xmm0, xmm1 + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + movdqu xmm0, [r2] + MOVDQ xmm2, [r0] + psadbw xmm0, xmm2 + movdqu xmm1, [r2+r3] + MOVDQ xmm2, [r0+r1] + psadbw xmm1, xmm2 + paddw xmm0, xmm1 - SSE2_GetSad2x16 - SSE2_GetSad2x16 - SSE2_GetSad2x16 + SSE2_GetSad2x16 + SSE2_GetSad2x16 + SSE2_GetSad2x16 - movhlps xmm1, xmm0 - paddw xmm0, xmm1 - movd retrd, xmm0 - LOAD_4_PARA_POP - ret + movhlps xmm1, xmm0 + paddw xmm0, xmm1 + movd retrd, xmm0 + LOAD_4_PARA_POP + ret WELS_EXTERN WelsSampleSad8x16_sse2 - %assign push_num 0 - LOAD_4_PARA - PUSH_XMM 7 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 7 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d pxor xmm6, xmm6 - SSE2_GetSad8x4 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] SSE2_GetSad8x4 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - SSE2_GetSad8x4 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSad8x4 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + SSE2_GetSad8x4 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] SSE2_GetSad8x4 movhlps xmm0, xmm6 - paddw xmm0, xmm6 - movd retrd, xmm0 - POP_XMM - LOAD_4_PARA_POP - ret + paddw xmm0, xmm6 + movd retrd, xmm0 + POP_XMM + LOAD_4_PARA_POP + ret %macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline @@ -1664,22 +1664,22 @@ cmp %1, (32-%2)|(%3>>1) %endmacro WELS_EXTERN WelsSampleSad8x8_sse21 - %assign push_num 0 - mov r2, arg3 - push r2 - CACHE_SPLIT_CHECK r2, 8, 64 - jle near .pixel_sad_8x8_nsplit - pop r2 + %assign push_num 0 + mov r2, arg3 + push r2 + CACHE_SPLIT_CHECK r2, 8, 64 + jle near .pixel_sad_8x8_nsplit + pop r2 %ifdef X86_32 - push r3 - push r4 - push r5 + push r3 + push r4 + push r5 %endif - %assign push_num 3 - PUSH_XMM 8 - mov r0, arg1 - mov r1, arg2 - SIGN_EXTENSION r1, r1d + %assign push_num 3 + PUSH_XMM 8 + mov r0, arg1 + mov r1, arg2 + SIGN_EXTENSION r1, r1d pxor xmm7, xmm7 ;ecx r2, edx r4, edi r5 @@ -1694,109 +1694,109 @@ WELS_EXTERN WelsSampleSad8x8_sse21 shl r4, 3 movd xmm5, r5d movd xmm6, r4d - mov r5, 8 - add r5, r2 + mov r5, 8 + add r5, r2 mov r3, arg4 - 
SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r3, r3d movq xmm0, [r0] - movhps xmm0, [r0+r1] + movhps xmm0, [r0+r1] - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 + movq xmm1, [r2] + movq xmm2, [r5] + movhps xmm1, [r2+r3] + movhps xmm2, [r5+r3] + psrlq xmm1, xmm5 + psllq xmm2, xmm6 + por xmm1, xmm2 - psadbw xmm0, xmm1 - paddw xmm7, xmm0 + psadbw xmm0, xmm1 + paddw xmm7, xmm0 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - lea r5, [r5+2*r3] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r5, [r5+2*r3] movq xmm0, [r0] - movhps xmm0, [r0+r1] + movhps xmm0, [r0+r1] - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 + movq xmm1, [r2] + movq xmm2, [r5] + movhps xmm1, [r2+r3] + movhps xmm2, [r5+r3] + psrlq xmm1, xmm5 + psllq xmm2, xmm6 + por xmm1, xmm2 - psadbw xmm0, xmm1 - paddw xmm7, xmm0 + psadbw xmm0, xmm1 + paddw xmm7, xmm0 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - lea r5, [r5+2*r3] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r5, [r5+2*r3] movq xmm0, [r0] - movhps xmm0, [r0+r1] + movhps xmm0, [r0+r1] - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 + movq xmm1, [r2] + movq xmm2, [r5] + movhps xmm1, [r2+r3] + movhps xmm2, [r5+r3] + psrlq xmm1, xmm5 + psllq xmm2, xmm6 + por xmm1, xmm2 - psadbw xmm0, xmm1 - paddw xmm7, xmm0 + psadbw xmm0, xmm1 + paddw xmm7, xmm0 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - lea r5, [r5+2*r3] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r5, [r5+2*r3] movq xmm0, [r0] - movhps xmm0, [r0+r1] + movhps xmm0, [r0+r1] - movq xmm1, [r2] - movq xmm2, [r5] - movhps xmm1, [r2+r3] - movhps xmm2, [r5+r3] - psrlq xmm1, xmm5 - psllq xmm2, xmm6 - por xmm1, xmm2 + movq xmm1, [r2] + movq xmm2, [r5] + movhps xmm1, [r2+r3] + movhps xmm2, [r5+r3] + psrlq xmm1, xmm5 + psllq xmm2, xmm6 + por xmm1, xmm2 - psadbw xmm0, xmm1 - paddw xmm7, xmm0 + psadbw xmm0, xmm1 + paddw xmm7, xmm0 movhlps xmm0, xmm7 - paddw xmm0, xmm7 - movd retrd, xmm0 - POP_XMM + paddw xmm0, xmm7 + movd retrd, xmm0 + POP_XMM %ifdef X86_32 - pop r5 - pop r4 - pop r3 + pop r5 + pop r4 + pop r3 %endif - jmp .return + jmp .return .pixel_sad_8x8_nsplit: - pop r2 - %assign push_num 0 - LOAD_4_PARA - PUSH_XMM 7 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - pxor xmm6, xmm6 - SSE2_GetSad8x4 + pop r2 + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 7 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + pxor xmm6, xmm6 + SSE2_GetSad8x4 lea r0, [r0+2*r1] - lea r2, [r2+2*r3] + lea r2, [r2+2*r3] SSE2_GetSad8x4 movhlps xmm0, xmm6 - paddw xmm0, xmm6 - movd retrd, xmm0 - POP_XMM - LOAD_4_PARA_POP + paddw xmm0, xmm6 + movd retrd, xmm0 + POP_XMM + LOAD_4_PARA_POP .return: - ret + ret ;*********************************************************************** @@ -1814,624 +1814,624 @@ WELS_EXTERN WelsSampleSad8x8_sse21 %macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address - psadbw %1, %4 - paddw xmm5, %1 - psadbw %4, %3 - paddw xmm4, %4 - movdqu %4, [%5-1] - psadbw %4, %2 - paddw xmm6, %4 - movdqu %4, [%5+1] - psadbw %4, %2 - paddw xmm7, %4 + psadbw %1, %4 + paddw xmm5, %1 + psadbw %4, %3 + paddw xmm4, %4 + movdqu %4, [%5-1] + psadbw %4, %2 + paddw xmm6, %4 + movdqu %4, [%5+1] + psadbw %4, %2 + paddw xmm7, %4 %endmacro WELS_EXTERN WelsSampleSadFour16x16_sse2 - %assign push_num 0 - LOAD_5_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - pxor xmm4, xmm4 ;sad 
pRefMb-i_stride_ref - pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref - pxor xmm6, xmm6 ;sad pRefMb-1 - pxor xmm7, xmm7 ;sad pRefMb+1 - movdqa xmm0, [r0] - sub r2, r3 - movdqu xmm3, [r2] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + %assign push_num 0 + LOAD_5_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref + pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref + pxor xmm6, xmm6 ;sad pRefMb-1 + pxor xmm7, xmm7 ;sad pRefMb+1 + movdqa xmm0, [r0] + sub r2, r3 + movdqu xmm3, [r2] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movdqa xmm1, [r0+r1] - movdqu xmm3, [r2+r3] - psadbw xmm3, xmm1 - paddw xmm4, xmm3 + movdqa xmm1, [r0+r1] + movdqu xmm3, [r2+r3] + psadbw xmm3, xmm1 + paddw xmm4, xmm3 - movdqu xmm2, [r2+r3-1] - psadbw xmm2, xmm0 - paddw xmm6, xmm2 + movdqu xmm2, [r2+r3-1] + psadbw xmm2, xmm0 + paddw xmm6, xmm2 - movdqu xmm3, [r2+r3+1] - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + movdqu xmm3, [r2+r3+1] + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm2, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 - movdqa xmm0, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm1, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 - movdqa xmm2, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm0, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 - movdqa xmm1, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm2, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 - movdqa xmm0, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm1, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 - movdqa xmm2, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm0, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 - movdqa xmm1, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm2, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 - movdqa xmm0, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 - lea r2, [r2+2*r3] - movdqu xmm3, [r2] - psadbw xmm2, xmm3 - paddw xmm5, xmm2 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm2, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 + movdqa xmm0, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm1, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 + movdqa xmm2, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm0, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 + movdqa xmm1, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm2, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 + movdqa xmm0, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm1, [r0] 
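Note on these hunks: the WelsSampleSadFour16x16/16x8/8x16/8x8/4x4_sse2 routines keep four SAD accumulators live at once, one each for the reference block shifted up a row, down a row, left a pixel and right a pixel (the ";sad pRefMb..." comments), and pack the four 32-bit totals with punpckldq/punpcklqdq before the final movdqa [r4]. A scalar sketch of that contract; the name and the generic width/height parameters are mine:

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of WelsSampleSadFour*_sse2: SAD of the source block against
 * the reference shifted by -stride, +stride, -1 and +1, written to out[] in
 * that order (the order produced by the punpckldq/punpcklqdq tail). */
static void SampleSadFour_c(const uint8_t* src, int src_stride,
                            const uint8_t* ref, int ref_stride,
                            int width, int height, int32_t out[4]) {
    const uint8_t* nbr[4] = { ref - ref_stride, ref + ref_stride, ref - 1, ref + 1 };
    for (int k = 0; k < 4; ++k) {
        int32_t sad = 0;
        for (int y = 0; y < height; ++y)
            for (int x = 0; x < width; ++x)
                sad += abs(src[y * src_stride + x] - nbr[k][y * ref_stride + x]);
        out[k] = sad;
    }
}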
+ movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 + movdqa xmm2, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm0, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 + movdqa xmm1, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm2, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 + movdqa xmm0, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 + lea r2, [r2+2*r3] + movdqu xmm3, [r2] + psadbw xmm2, xmm3 + paddw xmm5, xmm2 - movdqu xmm2, [r2-1] - psadbw xmm2, xmm0 - paddw xmm6, xmm2 + movdqu xmm2, [r2-1] + psadbw xmm2, xmm0 + paddw xmm6, xmm2 - movdqu xmm3, [r2+1] - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + movdqu xmm3, [r2+1] + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movdqu xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movdqu xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movhlps xmm0, xmm4 - paddw xmm4, xmm0 - movhlps xmm0, xmm5 - paddw xmm5, xmm0 - movhlps xmm0, xmm6 - paddw xmm6, xmm0 - movhlps xmm0, xmm7 - paddw xmm7, xmm0 - punpckldq xmm4, xmm5 - punpckldq xmm6, xmm7 - punpcklqdq xmm4, xmm6 - movdqa [r4],xmm4 - POP_XMM - LOAD_5_PARA_POP - ret + movhlps xmm0, xmm4 + paddw xmm4, xmm0 + movhlps xmm0, xmm5 + paddw xmm5, xmm0 + movhlps xmm0, xmm6 + paddw xmm6, xmm0 + movhlps xmm0, xmm7 + paddw xmm7, xmm0 + punpckldq xmm4, xmm5 + punpckldq xmm6, xmm7 + punpcklqdq xmm4, xmm6 + movdqa [r4],xmm4 + POP_XMM + LOAD_5_PARA_POP + ret WELS_EXTERN WelsSampleSadFour16x8_sse2 - %assign push_num 0 - LOAD_5_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref - pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref - pxor xmm6, xmm6 ;sad pRefMb-1 - pxor xmm7, xmm7 ;sad pRefMb+1 - movdqa xmm0, [r0] - sub r2, r3 - movdqu xmm3, [r2] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + %assign push_num 0 + LOAD_5_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref + pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref + pxor xmm6, xmm6 ;sad pRefMb-1 + pxor xmm7, xmm7 ;sad pRefMb+1 + movdqa xmm0, [r0] + sub r2, r3 + movdqu xmm3, [r2] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movdqa xmm1, [r0+r1] - movdqu xmm3, [r2+r3] - psadbw xmm3, xmm1 - paddw xmm4, xmm3 + movdqa xmm1, [r0+r1] + movdqu xmm3, [r2+r3] + psadbw xmm3, xmm1 + paddw xmm4, xmm3 - movdqu xmm2, [r2+r3-1] - psadbw xmm2, xmm0 - paddw xmm6, xmm2 + movdqu xmm2, [r2+r3-1] + psadbw xmm2, xmm0 + paddw xmm6, xmm2 - movdqu xmm3, [r2+r3+1] - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + movdqu xmm3, [r2+r3+1] + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm2, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 - movdqa xmm0, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm1, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 - movdqa xmm2, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movdqa xmm0, [r0] - movdqu xmm3, [r2] - SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 - movdqa xmm1, [r0+r1] - movdqu xmm3, [r2+r3] - SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 - lea r2, [r2+2*r3] - movdqu xmm3, [r2] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa 
xmm2, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2 + movdqa xmm0, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm1, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2 + movdqa xmm2, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movdqa xmm0, [r0] + movdqu xmm3, [r2] + SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2 + movdqa xmm1, [r0+r1] + movdqu xmm3, [r2+r3] + SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3 + lea r2, [r2+2*r3] + movdqu xmm3, [r2] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movdqu xmm0, [r2-1] - psadbw xmm0, xmm1 - paddw xmm6, xmm0 + movdqu xmm0, [r2-1] + psadbw xmm0, xmm1 + paddw xmm6, xmm0 - movdqu xmm3, [r2+1] - psadbw xmm3, xmm1 - paddw xmm7, xmm3 + movdqu xmm3, [r2+1] + psadbw xmm3, xmm1 + paddw xmm7, xmm3 - movdqu xmm3, [r2+r3] - psadbw xmm1, xmm3 - paddw xmm5, xmm1 + movdqu xmm3, [r2+r3] + psadbw xmm1, xmm3 + paddw xmm5, xmm1 - movhlps xmm0, xmm4 - paddw xmm4, xmm0 - movhlps xmm0, xmm5 - paddw xmm5, xmm0 - movhlps xmm0, xmm6 - paddw xmm6, xmm0 - movhlps xmm0, xmm7 - paddw xmm7, xmm0 - punpckldq xmm4, xmm5 - punpckldq xmm6, xmm7 - punpcklqdq xmm4, xmm6 - movdqa [r4],xmm4 - POP_XMM - LOAD_5_PARA_POP - ret + movhlps xmm0, xmm4 + paddw xmm4, xmm0 + movhlps xmm0, xmm5 + paddw xmm5, xmm0 + movhlps xmm0, xmm6 + paddw xmm6, xmm0 + movhlps xmm0, xmm7 + paddw xmm7, xmm0 + punpckldq xmm4, xmm5 + punpckldq xmm6, xmm7 + punpcklqdq xmm4, xmm6 + movdqa [r4],xmm4 + POP_XMM + LOAD_5_PARA_POP + ret WELS_EXTERN WelsSampleSadFour8x16_sse2 - %assign push_num 0 - LOAD_5_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref - pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref - pxor xmm6, xmm6 ;sad pRefMb-1 - pxor xmm7, xmm7 ;sad pRefMb+1 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - sub r2, r3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + %assign push_num 0 + LOAD_5_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref + pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref + pxor xmm6, xmm6 ;sad pRefMb-1 + pxor xmm7, xmm7 ;sad pRefMb+1 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + sub r2, r3 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw 
xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] 
- psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movhlps xmm0, xmm4 - paddw xmm4, xmm0 - movhlps xmm0, xmm5 - paddw xmm5, xmm0 - movhlps xmm0, xmm6 - paddw xmm6, xmm0 - movhlps xmm0, xmm7 - paddw xmm7, xmm0 - punpckldq xmm4, xmm5 - punpckldq xmm6, xmm7 - punpcklqdq xmm4, xmm6 - movdqa [r4],xmm4 - POP_XMM - LOAD_5_PARA_POP - ret + movhlps xmm0, xmm4 + paddw xmm4, xmm0 + movhlps xmm0, xmm5 + paddw xmm5, xmm0 + movhlps xmm0, xmm6 + paddw xmm6, xmm0 + movhlps xmm0, xmm7 + paddw xmm7, xmm0 + punpckldq xmm4, xmm5 + punpckldq xmm6, xmm7 + punpcklqdq xmm4, xmm6 + movdqa [r4],xmm4 + POP_XMM + LOAD_5_PARA_POP + ret WELS_EXTERN WelsSampleSadFour8x8_sse2 - %assign push_num 0 - LOAD_5_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref - pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref - pxor xmm6, xmm6 ;sad pRefMb-1 - pxor xmm7, xmm7 ;sad pRefMb+1 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - sub r2, r3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + %assign push_num 0 + LOAD_5_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref + pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref + pxor xmm6, xmm6 ;sad pRefMb-1 + pxor xmm7, xmm7 ;sad pRefMb+1 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + sub r2, r3 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + 
paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movq xmm0, [r0] - movhps xmm0, [r0+r1] - psadbw xmm3, xmm0 - paddw xmm4, xmm3 + movq xmm0, [r0] + movhps xmm0, [r0+r1] + psadbw xmm3, xmm0 + paddw xmm4, xmm3 - movq xmm1, [r2+r3-1] - movq xmm3, [r2+r3+1] + movq xmm1, [r2+r3-1] + movq xmm3, [r2+r3+1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - movhps xmm1, [r2-1] - movhps xmm3, [r2+1] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + movhps xmm1, [r2-1] + movhps xmm3, [r2+1] - psadbw xmm1, xmm0 - paddw xmm6, xmm1 - psadbw xmm3, xmm0 - paddw xmm7, xmm3 + psadbw xmm1, xmm0 + paddw xmm6, xmm1 + psadbw xmm3, xmm0 + paddw xmm7, xmm3 - movq xmm3, [r2] - movhps xmm3, [r2+r3] - psadbw xmm0, xmm3 - paddw xmm5, xmm0 + movq xmm3, [r2] + movhps xmm3, [r2+r3] + psadbw xmm0, xmm3 + paddw xmm5, xmm0 - movhlps xmm0, xmm4 - paddw xmm4, xmm0 - movhlps xmm0, xmm5 - paddw xmm5, xmm0 - movhlps xmm0, xmm6 - paddw xmm6, xmm0 - movhlps xmm0, xmm7 - paddw xmm7, xmm0 - punpckldq xmm4, xmm5 - punpckldq xmm6, xmm7 - punpcklqdq xmm4, xmm6 - movdqa [r4],xmm4 - POP_XMM - LOAD_5_PARA_POP - ret + movhlps xmm0, xmm4 + paddw xmm4, xmm0 + movhlps xmm0, xmm5 + paddw xmm5, xmm0 + movhlps xmm0, xmm6 + paddw xmm6, xmm0 + movhlps xmm0, xmm7 + paddw xmm7, xmm0 + punpckldq xmm4, xmm5 + punpckldq xmm6, xmm7 + punpcklqdq xmm4, xmm6 + movdqa [r4],xmm4 + POP_XMM + LOAD_5_PARA_POP + ret WELS_EXTERN WelsSampleSadFour4x4_sse2 - %assign push_num 0 - LOAD_5_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - movd xmm0, [r0] - movd xmm1, [r0+r1] - lea r0, [r0+2*r1] - movd xmm2, [r0] - movd xmm3, [r0+r1] - punpckldq xmm0, xmm1 - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - sub r2, r3 - movd xmm1, [r2] - movd xmm2, [r2+r3] - punpckldq xmm1, xmm2 - movd xmm2, [r2+r3-1] - movd xmm3, [r2+r3+1] + %assign push_num 0 + LOAD_5_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + movd xmm0, [r0] + movd xmm1, [r0+r1] + lea r0, [r0+2*r1] + movd xmm2, [r0] + movd xmm3, [r0+r1] + punpckldq xmm0, xmm1 + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + sub r2, r3 + movd xmm1, [r2] + movd xmm2, [r2+r3] + punpckldq xmm1, xmm2 + movd xmm2, [r2+r3-1] + movd xmm3, [r2+r3+1] - lea r2, [r2+2*r3] + lea r2, [r2+2*r3] - movd xmm4, [r2] - movd xmm5, [r2-1] - punpckldq xmm2, xmm5 - movd xmm5, [r2+1] - punpckldq xmm3, xmm5 + movd xmm4, [r2] + movd xmm5, [r2-1] + punpckldq xmm2, xmm5 + movd xmm5, [r2+1] + punpckldq xmm3, xmm5 - movd xmm5, [r2+r3] - punpckldq xmm4, xmm5 + movd xmm5, [r2+r3] + punpckldq xmm4, xmm5 - punpcklqdq xmm1, xmm4 ;-L + punpcklqdq xmm1, xmm4 ;-L - movd xmm5, [r2+r3-1] - movd xmm6, [r2+r3+1] + movd xmm5, [r2+r3-1] + movd xmm6, [r2+r3+1] - lea r2, [r2+2*r3] - movd xmm7, [r2-1] - 
punpckldq xmm5, xmm7 - punpcklqdq xmm2, xmm5 ;-1 - movd xmm7, [r2+1] - punpckldq xmm6, xmm7 - punpcklqdq xmm3, xmm6 ;+1 - movd xmm6, [r2] - movd xmm7, [r2+r3] - punpckldq xmm6, xmm7 - punpcklqdq xmm4, xmm6 ;+L - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - psadbw xmm4, xmm0 + lea r2, [r2+2*r3] + movd xmm7, [r2-1] + punpckldq xmm5, xmm7 + punpcklqdq xmm2, xmm5 ;-1 + movd xmm7, [r2+1] + punpckldq xmm6, xmm7 + punpcklqdq xmm3, xmm6 ;+1 + movd xmm6, [r2] + movd xmm7, [r2+r3] + punpckldq xmm6, xmm7 + punpcklqdq xmm4, xmm6 ;+L + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + psadbw xmm4, xmm0 - movhlps xmm0, xmm1 - paddw xmm1, xmm0 - movhlps xmm0, xmm2 - paddw xmm2, xmm0 - movhlps xmm0, xmm3 - paddw xmm3, xmm0 - movhlps xmm0, xmm4 - paddw xmm4, xmm0 - punpckldq xmm1, xmm4 - punpckldq xmm2, xmm3 - punpcklqdq xmm1, xmm2 - movdqa [r4],xmm1 - POP_XMM - LOAD_5_PARA_POP - ret + movhlps xmm0, xmm1 + paddw xmm1, xmm0 + movhlps xmm0, xmm2 + paddw xmm2, xmm0 + movhlps xmm0, xmm3 + paddw xmm3, xmm0 + movhlps xmm0, xmm4 + paddw xmm4, xmm0 + punpckldq xmm1, xmm4 + punpckldq xmm2, xmm3 + punpcklqdq xmm1, xmm2 + movdqa [r4],xmm1 + POP_XMM + LOAD_5_PARA_POP + ret ;*********************************************************************** ; @@ -2444,33 +2444,33 @@ WELS_EXTERN WelsSampleSadFour4x4_sse2 ;*********************************************************************** WELS_EXTERN WelsSampleSad4x4_mmx %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - movd mm0, [r0] - movd mm1, [r0+r1] - punpckldq mm0, mm1 + LOAD_4_PARA + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + movd mm0, [r0] + movd mm1, [r0+r1] + punpckldq mm0, mm1 - movd mm3, [r2] - movd mm4, [r2+r3] - punpckldq mm3, mm4 - psadbw mm0, mm3 + movd mm3, [r2] + movd mm4, [r2+r3] + punpckldq mm3, mm4 + psadbw mm0, mm3 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] - movd mm1, [r0] - movd mm2, [r0+r1] - punpckldq mm1, mm2 + movd mm1, [r0] + movd mm2, [r0+r1] + punpckldq mm1, mm2 - movd mm3, [r2] - movd mm4, [r2+r3] - punpckldq mm3, mm4 - psadbw mm1, mm3 - paddw mm0, mm1 + movd mm3, [r2] + movd mm4, [r2+r3] + punpckldq mm3, mm4 + psadbw mm1, mm3 + paddw mm0, mm1 movd retrd, mm0 - WELSEMMS + WELSEMMS LOAD_4_PARA_POP ret diff --git a/codec/common/x86/vaa.asm b/codec/common/x86/vaa.asm index ae5a0233..1edb9f6d 100644 --- a/codec/common/x86/vaa.asm +++ b/codec/common/x86/vaa.asm @@ -29,16 +29,16 @@ ;* POSSIBILITY OF SUCH DAMAGE. 
;* ;* -;* vaa.asm +;* vaa.asm ;* -;* Abstract +;* Abstract ;* sse2 for pVaa routines ;* ;* History -;* 04/14/2010 Created -;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3) -;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement -;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2 +;* 04/14/2010 Created +;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3) +;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement +;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2 ;* ;*************************************************************************/ %include "asm_inc.asm" @@ -49,87 +49,87 @@ ;*********************************************************************** ; by comparing it outperforms than phaddw(SSSE3) sets -%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp - ; @sum_8x2 begin - pshufd %2, %1, 04Eh ; 01001110 B - paddw %1, %2 - pshuflw %2, %1, 04Eh ; 01001110 B - paddw %1, %2 - pshuflw %2, %1, 0B1h ; 10110001 B - paddw %1, %2 - ; end of @sum_8x2 -%endmacro ; END of SUM_WORD_8x2_SSE2 +%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp + ; @sum_8x2 begin + pshufd %2, %1, 04Eh ; 01001110 B + paddw %1, %2 + pshuflw %2, %1, 04Eh ; 01001110 B + paddw %1, %2 + pshuflw %2, %1, 0B1h ; 10110001 B + paddw %1, %2 + ; end of @sum_8x2 +%endmacro ; END of SUM_WORD_8x2_SSE2 %macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 - movdqa %1, [r0 ] ; line 0 - movdqa %2, [r0+r1] ; line 1 - movdqa %3, %1 - punpcklbw %1, xmm7 - punpckhbw %3, xmm7 - movdqa %4, %2 - punpcklbw %4, xmm7 - punpckhbw %2, xmm7 - paddw %1, %4 - paddw %2, %3 - movdqa %3, [r0+r2] ; line 2 - movdqa %4, [r0+r3] ; line 3 - movdqa %5, %3 - punpcklbw %3, xmm7 - punpckhbw %5, xmm7 - movdqa %6, %4 - punpcklbw %6, xmm7 - punpckhbw %4, xmm7 - paddw %3, %6 - paddw %4, %5 - paddw %1, %3 ; block 0, 1 - paddw %2, %4 ; block 2, 3 - pshufd %3, %1, 0B1h - pshufd %4, %2, 0B1h - paddw %1, %3 - paddw %2, %4 - movdqa %3, %1 - movdqa %4, %2 - pshuflw %5, %1, 0B1h - pshufhw %6, %3, 0B1h - paddw %1, %5 - paddw %3, %6 - pshuflw %5, %2, 0B1h - pshufhw %6, %4, 0B1h - paddw %2, %5 - paddw %4, %6 - punpcklwd %1, %2 - punpckhwd %3, %4 - punpcklwd %1, %3 - psraw %1, $04 + movdqa %1, [r0 ] ; line 0 + movdqa %2, [r0+r1] ; line 1 + movdqa %3, %1 + punpcklbw %1, xmm7 + punpckhbw %3, xmm7 + movdqa %4, %2 + punpcklbw %4, xmm7 + punpckhbw %2, xmm7 + paddw %1, %4 + paddw %2, %3 + movdqa %3, [r0+r2] ; line 2 + movdqa %4, [r0+r3] ; line 3 + movdqa %5, %3 + punpcklbw %3, xmm7 + punpckhbw %5, xmm7 + movdqa %6, %4 + punpcklbw %6, xmm7 + punpckhbw %4, xmm7 + paddw %3, %6 + paddw %4, %5 + paddw %1, %3 ; block 0, 1 + paddw %2, %4 ; block 2, 3 + pshufd %3, %1, 0B1h + pshufd %4, %2, 0B1h + paddw %1, %3 + paddw %2, %4 + movdqa %3, %1 + movdqa %4, %2 + pshuflw %5, %1, 0B1h + pshufhw %6, %3, 0B1h + paddw %1, %5 + paddw %3, %6 + pshuflw %5, %2, 0B1h + pshufhw %6, %4, 0B1h + paddw %2, %5 + paddw %4, %6 + punpcklwd %1, %2 + punpckhwd %3, %4 + punpcklwd %1, %3 + psraw %1, $04 %endmacro %macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 - movdqa %1, [r0 ] ; line 0 - movdqa %2, [r0+r1] ; line 1 - movdqa %3, %1 - punpcklbw %1, xmm7 - punpckhbw %3, xmm7 - movdqa %4, %2 - punpcklbw %4, xmm7 - punpckhbw %2, xmm7 - paddw %1, %4 - paddw %2, %3 - movdqa %3, [r0+r2] ; line 2 - movdqa %4, [r0+r3] ; line 3 - movdqa %5, %3 - punpcklbw %3, xmm7 - punpckhbw %5, xmm7 - movdqa %6, %4 - punpcklbw %6, xmm7 - punpckhbw %4, xmm7 - paddw %3, %6 - paddw %4, %5 - paddw %1, %3 ; block 0, 1 - paddw %2, %4 ; block 2, 3 - phaddw %1, %2 ; block[0]: 0-15, 16-31; 
block[1]: 32-47, 48-63; .. - phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; .... - psraw %1, $04 + movdqa %1, [r0 ] ; line 0 + movdqa %2, [r0+r1] ; line 1 + movdqa %3, %1 + punpcklbw %1, xmm7 + punpckhbw %3, xmm7 + movdqa %4, %2 + punpcklbw %4, xmm7 + punpckhbw %2, xmm7 + paddw %1, %4 + paddw %2, %3 + movdqa %3, [r0+r2] ; line 2 + movdqa %4, [r0+r3] ; line 3 + movdqa %5, %3 + punpcklbw %3, xmm7 + punpckhbw %5, xmm7 + movdqa %6, %4 + punpcklbw %6, xmm7 + punpckhbw %4, xmm7 + paddw %3, %6 + paddw %4, %5 + paddw %1, %3 ; block 0, 1 + paddw %2, %4 ; block 2, 3 + phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; .. + phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; .... + psraw %1, $04 %endmacro @@ -143,7 +143,7 @@ SECTION .text ; , 6/7/2010 ;*********************************************************************** -; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize ); +; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize ); ;*********************************************************************** WELS_EXTERN AnalysisVaaInfoIntra_sse2 @@ -174,71 +174,71 @@ WELS_EXTERN AnalysisVaaInfoIntra_sse2 mov r4,r2 sal r4,$01 ;r4 = 4*iLineSize - pxor xmm7, xmm7 + pxor xmm7, xmm7 - ; loops - VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - movq [r7], xmm0 + ; loops + VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + movq [r7], xmm0 - lea r0, [r0+r4] - VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - movq [r7+8], xmm0 + lea r0, [r0+r4] + VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + movq [r7+8], xmm0 - lea r0, [r0+r4] - VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - movq [r7+16], xmm0 + lea r0, [r0+r4] + VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + movq [r7+16], xmm0 - lea r0, [r0+r4] - VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - movq [r7+24], xmm0 + lea r0, [r0+r4] + VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + movq [r7+24], xmm0 - movdqa xmm0, [r7] ; block 0~7 - movdqa xmm1, [r7+16] ; block 8~15 - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - SUM_WORD_8x2_SSE2 xmm0, xmm3 + movdqa xmm0, [r7] ; block 0~7 + movdqa xmm1, [r7+16] ; block 8~15 + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + SUM_WORD_8x2_SSE2 xmm0, xmm3 - pmullw xmm1, xmm1 - pmullw xmm2, xmm2 - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - punpcklwd xmm1, xmm7 - punpckhwd xmm3, xmm7 - punpcklwd xmm2, xmm7 - punpckhwd xmm4, xmm7 - paddd xmm1, xmm2 - paddd xmm3, xmm4 - paddd xmm1, xmm3 - pshufd xmm2, xmm1, 01Bh - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0B1h - paddd xmm1, xmm2 + pmullw xmm1, xmm1 + pmullw xmm2, xmm2 + movdqa xmm3, xmm1 + movdqa xmm4, xmm2 + punpcklwd xmm1, xmm7 + punpckhwd xmm3, xmm7 + punpcklwd xmm2, xmm7 + punpckhwd xmm4, xmm7 + paddd xmm1, xmm2 + paddd xmm3, xmm4 + paddd xmm1, xmm3 + pshufd xmm2, xmm1, 01Bh + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0B1h + paddd xmm1, xmm2 - movd r2d, xmm0 - and r2, 0ffffh ; effective low work truncated - mov r3, r2 - imul r2, r3 - sar r2, $04 - movd retrd, xmm1 - sub retrd, r2d + movd r2d, xmm0 + and r2, 0ffffh ; effective low work truncated + mov r3, r2 + imul r2, r3 + sar r2, $04 + movd retrd, xmm1 + sub retrd, r2d - add r7,32 - add r7,r5 + add r7,32 + add r7,r5 %ifdef X86_32 - pop r6 - pop r5 - pop r4 - pop r3 + pop r6 + pop r5 + pop r4 + pop r3 %endif - POP_XMM + POP_XMM - ret + ret ;*********************************************************************** -; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, 
const int32_t iLineSize ); +; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize ); ;*********************************************************************** WELS_EXTERN AnalysisVaaInfoIntra_ssse3 @@ -269,47 +269,47 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3 mov r4,r2 sal r4,$01 ;r4 = 4*iLineSize - pxor xmm7, xmm7 + pxor xmm7, xmm7 - ; loops - VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + ; loops + VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 movq [r7],xmm0 - lea r0,[r0+r4] - VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 + lea r0,[r0+r4] + VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 movq [r7+8],xmm1 - lea r0,[r0+r4] - VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + lea r0,[r0+r4] + VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 movq [r7+16],xmm0 - lea r0,[r0+r4] - VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 + lea r0,[r0+r4] + VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 movq [r7+24],xmm1 - movdqa xmm0,[r7] - movdqa xmm1,[r7+16] - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets + movdqa xmm0,[r7] + movdqa xmm1,[r7+16] + movdqa xmm2, xmm0 + paddw xmm0, xmm1 + SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets - pmullw xmm1, xmm1 - pmullw xmm2, xmm2 - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - punpcklwd xmm1, xmm7 - punpckhwd xmm3, xmm7 - punpcklwd xmm2, xmm7 - punpckhwd xmm4, xmm7 - paddd xmm1, xmm2 - paddd xmm3, xmm4 - paddd xmm1, xmm3 - pshufd xmm2, xmm1, 01Bh - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0B1h - paddd xmm1, xmm2 + pmullw xmm1, xmm1 + pmullw xmm2, xmm2 + movdqa xmm3, xmm1 + movdqa xmm4, xmm2 + punpcklwd xmm1, xmm7 + punpckhwd xmm3, xmm7 + punpcklwd xmm2, xmm7 + punpckhwd xmm4, xmm7 + paddd xmm1, xmm2 + paddd xmm3, xmm4 + paddd xmm1, xmm3 + pshufd xmm2, xmm1, 01Bh + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0B1h + paddd xmm1, xmm2 movd r2d, xmm0 @@ -318,94 +318,94 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3 imul r2, r3 sar r2, $04 movd retrd, xmm1 - sub retrd, r2d + sub retrd, r2d - add r7,32 - add r7,r5 + add r7,32 + add r7,r5 %ifdef X86_32 - pop r6 - pop r5 - pop r4 - pop r3 + pop r6 + pop r5 + pop r4 + pop r3 %endif - POP_XMM + POP_XMM - ret + ret ;*********************************************************************** -; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 ) +; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 ) ;*********************************************************************** WELS_EXTERN MdInterAnalysisVaaInfo_sse41 - %assign push_num 0 - LOAD_1_PARA - movdqa xmm0,[r0] - pshufd xmm1, xmm0, 01Bh - paddd xmm1, xmm0 - pshufd xmm2, xmm1, 0B1h - paddd xmm1, xmm2 - psrad xmm1, 02h ; iAverageSad - movdqa xmm2, xmm1 - psrad xmm2, 06h - movdqa xmm3, xmm0 ; iSadBlock - psrad xmm3, 06h - psubd xmm3, xmm2 - pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets - pshufd xmm4, xmm3, 01Bh - paddd xmm4, xmm3 - pshufd xmm3, xmm4, 0B1h - paddd xmm3, xmm4 - movd r0d, xmm3 - cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD + %assign push_num 0 + LOAD_1_PARA + movdqa xmm0,[r0] + pshufd xmm1, xmm0, 01Bh + paddd xmm1, xmm0 + pshufd xmm2, xmm1, 0B1h + paddd xmm1, xmm2 + psrad xmm1, 02h ; iAverageSad + movdqa xmm2, xmm1 + psrad xmm2, 06h + movdqa xmm3, xmm0 ; iSadBlock + psrad xmm3, 06h + psubd xmm3, xmm2 + pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets + pshufd xmm4, xmm3, 01Bh + paddd xmm4, xmm3 + pshufd xmm3, xmm4, 0B1h + paddd xmm3, xmm4 + movd r0d, xmm3 + 
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD - jb near .threshold_exit - pshufd xmm0, xmm0, 01Bh - pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad - movmskps retrd, xmm0 - ret + jb near .threshold_exit + pshufd xmm0, xmm0, 01Bh + pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad + movmskps retrd, xmm0 + ret .threshold_exit: - mov retrd, 15 - ret + mov retrd, 15 + ret ;*********************************************************************** -; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 ) +; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 ) ;*********************************************************************** WELS_EXTERN MdInterAnalysisVaaInfo_sse2 - %assign push_num 0 - LOAD_1_PARA - movdqa xmm0, [r0] - pshufd xmm1, xmm0, 01Bh - paddd xmm1, xmm0 - pshufd xmm2, xmm1, 0B1h - paddd xmm1, xmm2 - psrad xmm1, 02h ; iAverageSad - movdqa xmm2, xmm1 - psrad xmm2, 06h - movdqa xmm3, xmm0 ; iSadBlock - psrad xmm3, 06h - psubd xmm3, xmm2 + %assign push_num 0 + LOAD_1_PARA + movdqa xmm0, [r0] + pshufd xmm1, xmm0, 01Bh + paddd xmm1, xmm0 + pshufd xmm2, xmm1, 0B1h + paddd xmm1, xmm2 + psrad xmm1, 02h ; iAverageSad + movdqa xmm2, xmm1 + psrad xmm2, 06h + movdqa xmm3, xmm0 ; iSadBlock + psrad xmm3, 06h + psubd xmm3, xmm2 - ; to replace pmulld functionality as below - movdqa xmm2, xmm3 - pmuludq xmm2, xmm3 - pshufd xmm4, xmm3, 0B1h - pmuludq xmm4, xmm4 - movdqa xmm5, xmm2 - punpckldq xmm5, xmm4 - punpckhdq xmm2, xmm4 - punpcklqdq xmm5, xmm2 + ; to replace pmulld functionality as below + movdqa xmm2, xmm3 + pmuludq xmm2, xmm3 + pshufd xmm4, xmm3, 0B1h + pmuludq xmm4, xmm4 + movdqa xmm5, xmm2 + punpckldq xmm5, xmm4 + punpckhdq xmm2, xmm4 + punpcklqdq xmm5, xmm2 - pshufd xmm4, xmm5, 01Bh - paddd xmm4, xmm5 - pshufd xmm5, xmm4, 0B1h - paddd xmm5, xmm4 + pshufd xmm4, xmm5, 01Bh + paddd xmm4, xmm5 + pshufd xmm5, xmm4, 0B1h + paddd xmm5, xmm4 - movd r0d, xmm5 - cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD - jb near .threshold_exit - pshufd xmm0, xmm0, 01Bh - pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad - movmskps retrd, xmm0 - ret + movd r0d, xmm5 + cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD + jb near .threshold_exit + pshufd xmm0, xmm0, 01Bh + pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad + movmskps retrd, xmm0 + ret .threshold_exit: - mov retrd, 15 - ret + mov retrd, 15 + ret diff --git a/codec/decoder/core/arm/block_add_neon.S b/codec/decoder/core/arm/block_add_neon.S index fd9ae417..890654ff 100644 --- a/codec/decoder/core/arm/block_add_neon.S +++ b/codec/decoder/core/arm/block_add_neon.S @@ -36,128 +36,128 @@ #ifdef __APPLE__ .macro ROW_TRANSFORM_1_STEP -// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 - vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; - vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; - vshr.s16 $8, $1, #1 - vshr.s16 $9, $3, #1 - vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3]; - vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1); -// } +// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 + vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; + vshr.s16 $8, $1, #1 + vshr.s16 $9, $3, #1 + vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3]; + vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1); +// } .endm -.macro TRANSFORM_4BYTES // both row & col transform used -// { // output: f_q[0]~[3], input: e_q[0]~[3]; - vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; - vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; - vsub.s32 $2, $5, $6 //int16 
f[i][2] = e[i][1] - e[i][2]; - vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; -// } +.macro TRANSFORM_4BYTES // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } .endm .macro COL_TRANSFORM_1_STEP -// { // input: src_q[0]~[3], output: e_q[0]~[3]; - vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; - vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; - vshr.s32 $6, $1, #1 - vshr.s32 $7, $3, #1 - vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; - vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); -// } +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; + vshr.s32 $6, $1, #1 + vshr.s32 $7, $3, #1 + vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } .endm #else .macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 -// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 - vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; - vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; - vshr.s16 \arg8, \arg1, #1 - vshr.s16 \arg9, \arg3, #1 - vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3]; - vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1); -// } +// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 + vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; + vshr.s16 \arg8, \arg1, #1 + vshr.s16 \arg9, \arg3, #1 + vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3]; + vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1); +// } .endm .macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used -// { // output: f_q[0]~[3], input: e_q[0]~[3]; - vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; - vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; - vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; - vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; -// } +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } .endm .macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_q[0]~[3], output: e_q[0]~[3]; - vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; - vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; - vshr.s32 \arg6, \arg1, #1 - vshr.s32 \arg7, \arg3, #1 - vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; - vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); -// } +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; + vshr.s32 \arg6, \arg1, #1 + vshr.s32 \arg7, \arg3, #1 + 
vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } .endm #endif // r0 int16_t* block, // r1 int8_t* non_zero_count, WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon - vld1.64 {d0-d2}, [r1] + vld1.64 {d0-d2}, [r1] - vceq.s8 q0, q0, #0 - vceq.s8 d2, d2, #0 - vmvn q0, q0 - vmvn d2, d2 - vabs.s8 q0, q0 - vabs.s8 d2, d2 + vceq.s8 q0, q0, #0 + vceq.s8 d2, d2, #0 + vmvn q0, q0 + vmvn d2, d2 + vabs.s8 q0, q0 + vabs.s8 d2, d2 - vst1.64 {d0-d2}, [r1] + vst1.64 {d0-d2}, [r1] WELS_ASM_FUNC_END -// uint8_t *pred, const int32_t stride, int16_t *rs +// uint8_t *pred, const int32_t stride, int16_t *rs WELS_ASM_FUNC_BEGIN IdctResAddPred_neon - vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles! + vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles! - ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5 + ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5 - TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11 + TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11 - // transform element 32bits - vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] - vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] - vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] - vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] + // transform element 32bits + vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] + vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] + vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] + vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] - COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11 + COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11 - TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11 + TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11 - //after clip_table[MAX_NEG_CROP] into [0, 255] - mov r2, r0 - vld1.32 {d20[0]},[r0],r1 - vld1.32 {d20[1]},[r0],r1 - vld1.32 {d22[0]},[r0],r1 - vld1.32 {d22[1]},[r0] + //after clip_table[MAX_NEG_CROP] into [0, 255] + mov r2, r0 + vld1.32 {d20[0]},[r0],r1 + vld1.32 {d20[1]},[r0],r1 + vld1.32 {d22[0]},[r0],r1 + vld1.32 {d22[1]},[r0] - vrshrn.s32 d16, q0, #6 - vrshrn.s32 d17, q1, #6 - vrshrn.s32 d18, q2, #6 - vrshrn.s32 d19, q3, #6 + vrshrn.s32 d16, q0, #6 + vrshrn.s32 d17, q1, #6 + vrshrn.s32 d18, q2, #6 + vrshrn.s32 d19, q3, #6 - vmovl.u8 q0,d20 - vmovl.u8 q1,d22 - vadd.s16 q0,q8 - vadd.s16 q1,q9 + vmovl.u8 q0,d20 + vmovl.u8 q1,d22 + vadd.s16 q0,q8 + vadd.s16 q1,q9 - vqmovun.s16 d20,q0 - vqmovun.s16 d22,q1 + vqmovun.s16 d20,q0 + vqmovun.s16 d22,q1 - vst1.32 {d20[0]},[r2],r1 - vst1.32 {d20[1]},[r2],r1 - vst1.32 {d22[0]},[r2],r1 - vst1.32 {d22[1]},[r2] + vst1.32 {d20[0]},[r2],r1 + vst1.32 {d20[1]},[r2],r1 + vst1.32 {d22[0]},[r2],r1 + vst1.32 {d22[1]},[r2] WELS_ASM_FUNC_END #endif diff --git a/codec/decoder/core/arm/intra_pred_neon.S b/codec/decoder/core/arm/intra_pred_neon.S index 41bf4742..ec2e0672 100644 --- a/codec/decoder/core/arm/intra_pred_neon.S +++ b/codec/decoder/core/arm/intra_pred_neon.S @@ -38,104 +38,104 @@ #ifdef __APPLE__ //Global macro .macro GET_8BYTE_DATA - vld1.8 {$0[0]}, [$1], $2 - vld1.8 {$0[1]}, [$1], $2 - vld1.8 {$0[2]}, [$1], $2 - vld1.8 {$0[3]}, [$1], $2 - vld1.8 {$0[4]}, [$1], $2 - vld1.8 {$0[5]}, [$1], $2 - vld1.8 {$0[6]}, [$1], $2 - vld1.8 {$0[7]}, [$1], $2 + vld1.8 {$0[0]}, [$1], $2 + vld1.8 {$0[1]}, [$1], $2 + vld1.8 {$0[2]}, [$1], $2 + vld1.8 {$0[3]}, [$1], $2 + vld1.8 {$0[4]}, [$1], $2 + vld1.8 {$0[5]}, [$1], $2 + vld1.8 {$0[6]}, 
[$1], $2 + vld1.8 {$0[7]}, [$1], $2 .endmacro #else //Global macro .macro GET_8BYTE_DATA arg0, arg1, arg2 - vld1.8 {\arg0[0]}, [\arg1], \arg2 - vld1.8 {\arg0[1]}, [\arg1], \arg2 - vld1.8 {\arg0[2]}, [\arg1], \arg2 - vld1.8 {\arg0[3]}, [\arg1], \arg2 - vld1.8 {\arg0[4]}, [\arg1], \arg2 - vld1.8 {\arg0[5]}, [\arg1], \arg2 - vld1.8 {\arg0[6]}, [\arg1], \arg2 - vld1.8 {\arg0[7]}, [\arg1], \arg2 + vld1.8 {\arg0[0]}, [\arg1], \arg2 + vld1.8 {\arg0[1]}, [\arg1], \arg2 + vld1.8 {\arg0[2]}, [\arg1], \arg2 + vld1.8 {\arg0[3]}, [\arg1], \arg2 + vld1.8 {\arg0[4]}, [\arg1], \arg2 + vld1.8 {\arg0[5]}, [\arg1], \arg2 + vld1.8 {\arg0[6]}, [\arg1], \arg2 + vld1.8 {\arg0[7]}, [\arg1], \arg2 .endm #endif WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon - //Get the top line data to 'q0' - sub r2, r0, r1 - vldm r2, {d0, d1} + //Get the top line data to 'q0' + sub r2, r0, r1 + vldm r2, {d0, d1} - mov r2, r0 - mov r3, #4 - //Set the top line to the each line of MB(16*16) + mov r2, r0 + mov r3, #4 + //Set the top line to the each line of MB(16*16) loop_0_get_i16x16_luma_pred_v: - vst1.8 {d0,d1}, [r2], r1 - vst1.8 {d0,d1}, [r2], r1 - vst1.8 {d0,d1}, [r2], r1 - vst1.8 {d0,d1}, [r2], r1 - subs r3, #1 - bne loop_0_get_i16x16_luma_pred_v + vst1.8 {d0,d1}, [r2], r1 + vst1.8 {d0,d1}, [r2], r1 + vst1.8 {d0,d1}, [r2], r1 + vst1.8 {d0,d1}, [r2], r1 + subs r3, #1 + bne loop_0_get_i16x16_luma_pred_v WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon - sub r2, r0, #1 - mov r3, #4 + sub r2, r0, #1 + mov r3, #4 loop_0_get_i16x16_luma_pred_h: - //Get one byte data from left side - vld1.8 {d0[],d1[]}, [r2], r1 - vld1.8 {d2[],d3[]}, [r2], r1 - vld1.8 {d4[],d5[]}, [r2], r1 - vld1.8 {d6[],d7[]}, [r2], r1 + //Get one byte data from left side + vld1.8 {d0[],d1[]}, [r2], r1 + vld1.8 {d2[],d3[]}, [r2], r1 + vld1.8 {d4[],d5[]}, [r2], r1 + vld1.8 {d6[],d7[]}, [r2], r1 - //Set the line of MB using the left side byte data - vst1.8 {d0,d1}, [r0], r1 - vst1.8 {d2,d3}, [r0], r1 - vst1.8 {d4,d5}, [r0], r1 - vst1.8 {d6,d7}, [r0], r1 + //Set the line of MB using the left side byte data + vst1.8 {d0,d1}, [r0], r1 + vst1.8 {d2,d3}, [r0], r1 + vst1.8 {d4,d5}, [r0], r1 + vst1.8 {d6,d7}, [r0], r1 - subs r3, #1 - bne loop_0_get_i16x16_luma_pred_h + subs r3, #1 + bne loop_0_get_i16x16_luma_pred_h WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon - //stmdb sp!, { r2-r5, lr} - //Get the left vertical line data - sub r2, r0, #1 - GET_8BYTE_DATA d0, r2, r1 - GET_8BYTE_DATA d1, r2, r1 + //stmdb sp!, { r2-r5, lr} + //Get the left vertical line data + sub r2, r0, #1 + GET_8BYTE_DATA d0, r2, r1 + GET_8BYTE_DATA d1, r2, r1 - //Get the top horizontal line data - sub r2, r0, r1 - vldm r2, {d2, d3} + //Get the top horizontal line data + sub r2, r0, r1 + vldm r2, {d2, d3} - //Calculate the sum of top horizontal line data and vertical line data - vpaddl.u8 q0, q0 - vpaddl.u8 q1, q1 - vadd.u16 q0, q0, q1 - vadd.u16 d0, d0, d1 - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 + //Calculate the sum of top horizontal line data and vertical line data + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 - //Calculate the mean value - vrshr.u16 d0, d0, #5 - vdup.8 q0, d0[0] + //Calculate the mean value + vrshr.u16 d0, d0, #5 + vdup.8 q0, d0[0] - //Set the mean value to the all of member of MB - mov r2, #4 + //Set the mean value to the all of member of MB + mov r2, #4 loop_0_get_i16x16_luma_pred_dc_both: - vst1.8 {d0,d1}, [r0], r1 - vst1.8 {d0,d1}, [r0], r1 - vst1.8 {d0,d1}, 
[r0], r1 - vst1.8 {d0,d1}, [r0], r1 - subs r2, #1 - bne loop_0_get_i16x16_luma_pred_dc_both + vst1.8 {d0,d1}, [r0], r1 + vst1.8 {d0,d1}, [r0], r1 + vst1.8 {d0,d1}, [r0], r1 + vst1.8 {d0,d1}, [r0], r1 + subs r2, #1 + bne loop_0_get_i16x16_luma_pred_dc_both WELS_ASM_FUNC_END @@ -149,386 +149,386 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon - //stmdb sp!, { r2-r5, lr} + //stmdb sp!, { r2-r5, lr} - //Load the table {(8,7,6,5,4,3,2,1) * 5} - adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE - vldr d0, [r2] + //Load the table {(8,7,6,5,4,3,2,1) * 5} + adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE + vldr d0, [r2] - //Pack the top[-1] ~ top[6] to d1 - sub r2, r0, r1 - sub r3, r2, #1 - vld1.8 d1, [r3] + //Pack the top[-1] ~ top[6] to d1 + sub r2, r0, r1 + sub r3, r2, #1 + vld1.8 d1, [r3] - //Pack the top[8] ~ top[15] to d2 - add r3, #9 - vld1.8 d2, [r3] + //Pack the top[8] ~ top[15] to d2 + add r3, #9 + vld1.8 d2, [r3] - //Save the top[15] to d6 for next step - vdup.u8 d6, d2[7] + //Save the top[15] to d6 for next step + vdup.u8 d6, d2[7] - //Get and pack left[-1] ~ left[6] to d4 - sub r3, r2, #1 - GET_8BYTE_DATA d4, r3, r1 + //Get and pack left[-1] ~ left[6] to d4 + sub r3, r2, #1 + GET_8BYTE_DATA d4, r3, r1 - //Get and pack left[8] ~ left[15] to d3 - add r3, r1 - GET_8BYTE_DATA d3, r3, r1 + //Get and pack left[8] ~ left[15] to d3 + add r3, r1 + GET_8BYTE_DATA d3, r3, r1 - //Save the left[15] to d7 for next step - vdup.u8 d7, d3[7] + //Save the left[15] to d7 for next step + vdup.u8 d7, d3[7] - //revert the sequence of d2,d3 - vrev64.8 q1, q1 + //revert the sequence of d2,d3 + vrev64.8 q1, q1 - vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...} - vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...} + vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...} + vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...} - vmovl.u8 q0, d0 - vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5} - vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5} + vmovl.u8 q0, d0 + vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5} + vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5} - //Calculate the sum of items of q1, q2 - vpadd.s16 d0, d2, d3 - vpadd.s16 d1, d4, d5 - vpaddl.s16 q0, q0 - vpaddl.s32 q0, q0 + //Calculate the sum of items of q1, q2 + vpadd.s16 d0, d2, d3 + vpadd.s16 d1, d4, d5 + vpaddl.s16 q0, q0 + vpaddl.s32 q0, q0 - //Get the value of 'b', 'c' and extend to q1, q2. - vrshr.s64 q0, #6 - vdup.s16 q1, d0[0] - vdup.s16 q2, d1[0] + //Get the value of 'b', 'c' and extend to q1, q2. 
+ vrshr.s64 q0, #6 + vdup.s16 q1, d0[0] + vdup.s16 q2, d1[0] - //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0 - adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE - vld1.32 {d0}, [r2] + //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0 + adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE + vld1.32 {d0}, [r2] - //Get the value of 'a' and save to q3 - vaddl.u8 q3, d6, d7 - vshl.u16 q3, #4 + //Get the value of 'a' and save to q3 + vaddl.u8 q3, d6, d7 + vshl.u16 q3, #4 - //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7} - vmovl.s8 q0, d0 - vmla.s16 q3, q0, q1 - vmla.s16 q3, q2, d0[0] + //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7} + vmovl.s8 q0, d0 + vmla.s16 q3, q0, q1 + vmla.s16 q3, q2, d0[0] - //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7} - vshl.s16 q8, q1, #3 - vadd.s16 q8, q3 + //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7} + vshl.s16 q8, q1, #3 + vadd.s16 q8, q3 - //right shift 5 bits and rounding - vqrshrun.s16 d0, q3, #5 - vqrshrun.s16 d1, q8, #5 + //right shift 5 bits and rounding + vqrshrun.s16 d0, q3, #5 + vqrshrun.s16 d1, q8, #5 - //Set the line of MB - vst1.u32 {d0,d1}, [r0], r1 + //Set the line of MB + vst1.u32 {d0,d1}, [r0], r1 - //Do the same processing for setting other lines - mov r2, #15 + //Do the same processing for setting other lines + mov r2, #15 loop_0_get_i16x16_luma_pred_plane: - vadd.s16 q3, q2 - vadd.s16 q8, q2 - vqrshrun.s16 d0, q3, #5 - vqrshrun.s16 d1, q8, #5 - vst1.u32 {d0,d1}, [r0], r1 - subs r2, #1 - bne loop_0_get_i16x16_luma_pred_plane + vadd.s16 q3, q2 + vadd.s16 q8, q2 + vqrshrun.s16 d0, q3, #5 + vqrshrun.s16 d1, q8, #5 + vst1.u32 {d0,d1}, [r0], r1 + subs r2, #1 + bne loop_0_get_i16x16_luma_pred_plane WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row (4 bytes) - sub r2, r0, r1 - ldr r2, [r2] + //stmdb sp!, { r2-r5, lr} + //Load the top row (4 bytes) + sub r2, r0, r1 + ldr r2, [r2] - //Set the luma MB using top line - str r2, [r0], r1 - str r2, [r0], r1 - str r2, [r0], r1 - str r2, [r0] + //Set the luma MB using top line + str r2, [r0], r1 + str r2, [r0], r1 + str r2, [r0], r1 + str r2, [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon - //stmdb sp!, { r2-r5, lr} - //Load the left column (4 bytes) - sub r2, r0, #1 - vld1.8 {d0[]}, [r2], r1 - vld1.8 {d1[]}, [r2], r1 - vld1.8 {d2[]}, [r2], r1 - vld1.8 {d3[]}, [r2] + //stmdb sp!, { r2-r5, lr} + //Load the left column (4 bytes) + sub r2, r0, #1 + vld1.8 {d0[]}, [r2], r1 + vld1.8 {d1[]}, [r2], r1 + vld1.8 {d2[]}, [r2], r1 + vld1.8 {d3[]}, [r2] - //Set the luma MB using the left side byte - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - vst1.32 {d2[0]}, [r0], r1 - vst1.32 {d3[0]}, [r0] + //Set the luma MB using the left side byte + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + vst1.32 {d2[0]}, [r0], r1 + vst1.32 {d3[0]}, [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row data(8 bytes) - sub r2, r0, r1 - vld1.32 {d0}, [r2] + //stmdb sp!, { r2-r5, lr} + //Load the top row data(8 bytes) + sub r2, r0, r1 + vld1.32 {d0}, [r2] - //For "t7 + (t7<<1)" - vdup.8 d1, d0[7] + //For "t7 + (t7<<1)" + vdup.8 d1, d0[7] - //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7" - vext.8 d1, d0, d1, #1 - vaddl.u8 q1, d1, d0 + //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7" + vext.8 d1, d0, d1, #1 + vaddl.u8 q1, d1, d0 - //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7" - vext.8 q2, q1, q1, #14 - vadd.u16 q0, q1, q2 + //calculate 
"x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7" + vext.8 q2, q1, q1, #14 + vadd.u16 q0, q1, q2 - //right shift 2 bits and rounding - vqrshrn.u16 d0, q0, #2 + //right shift 2 bits and rounding + vqrshrn.u16 d0, q0, #2 - //Save "ddl0, ddl1, ddl2, ddl3" - vext.8 d1, d0, d0, #1 - vst1.32 d1[0], [r0], r1 + //Save "ddl0, ddl1, ddl2, ddl3" + vext.8 d1, d0, d0, #1 + vst1.32 d1[0], [r0], r1 - //Save "ddl1, ddl2, ddl3, ddl4" - vext.8 d1, d0, d0, #2 - vst1.32 d1[0], [r0], r1 + //Save "ddl1, ddl2, ddl3, ddl4" + vext.8 d1, d0, d0, #2 + vst1.32 d1[0], [r0], r1 - //Save "ddl2, ddl3, ddl4, ddl5" - vext.8 d1, d0, d0, #3 - vst1.32 d1[0], [r0], r1 + //Save "ddl2, ddl3, ddl4, ddl5" + vext.8 d1, d0, d0, #3 + vst1.32 d1[0], [r0], r1 - //Save "ddl3, ddl4, ddl5, ddl6" - vst1.32 d0[1], [r0] + //Save "ddl3, ddl4, ddl5, ddl6" + vst1.32 d0[1], [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row (4 bytes) - sub r2, r0, r1 - vld1.32 {d0[1]}, [r2] + //stmdb sp!, { r2-r5, lr} + //Load the top row (4 bytes) + sub r2, r0, r1 + vld1.32 {d0[1]}, [r2] - //Load the left column (5 bytes) - sub r2, #1 - vld1.8 {d0[3]}, [r2], r1 - vld1.8 {d0[2]}, [r2], r1 - vld1.8 {d0[1]}, [r2], r1 - vld1.8 {d0[0]}, [r2], r1 - vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing + //Load the left column (5 bytes) + sub r2, #1 + vld1.8 {d0[3]}, [r2], r1 + vld1.8 {d0[2]}, [r2], r1 + vld1.8 {d0[1]}, [r2], r1 + vld1.8 {d0[0]}, [r2], r1 + vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing - vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3} - //d2:{L3,L2,L1,L0,LT,T0,T1,T2} + vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3} + //d2:{L3,L2,L1,L0,LT,T0,T1,T2} - //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3} - vaddl.u8 q2, d2, d0 + //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3} + vaddl.u8 q2, d2, d0 - //q1:{TL0+LT0,LT0+T01,...L12+L23} - vext.8 q3, q3, q2, #14 - vadd.u16 q1, q2, q3 + //q1:{TL0+LT0,LT0+T01,...L12+L23} + vext.8 q3, q3, q2, #14 + vadd.u16 q1, q2, q3 - //right shift 2 bits and rounding - vqrshrn.u16 d0, q1, #2 + //right shift 2 bits and rounding + vqrshrn.u16 d0, q1, #2 - //Adjust the data sequence for setting luma MB of 'pred' - vst1.32 d0[1], [r0], r1 - vext.8 d0, d0, d0, #7 - vst1.32 d0[1], [r0], r1 - vext.8 d0, d0, d0, #7 - vst1.32 d0[1], [r0], r1 - vext.8 d0, d0, d0, #7 - vst1.32 d0[1], [r0] + //Adjust the data sequence for setting luma MB of 'pred' + vst1.32 d0[1], [r0], r1 + vext.8 d0, d0, d0, #7 + vst1.32 d0[1], [r0], r1 + vext.8 d0, d0, d0, #7 + vst1.32 d0[1], [r0], r1 + vext.8 d0, d0, d0, #7 + vst1.32 d0[1], [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row (8 bytes) - sub r2, r0, r1 - vld1.32 {d0}, [r2] + //stmdb sp!, { r2-r5, lr} + //Load the top row (8 bytes) + sub r2, r0, r1 + vld1.32 {d0}, [r2] - vext.8 d1, d0, d0, #1 - vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x} + vext.8 d1, d0, d0, #1 + vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x} - vext.8 q2, q1, q1, #2 - vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x} + vext.8 q2, q1, q1, #2 + vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x} - //calculate the "vl0,vl1,vl2,vl3,vl4" - vqrshrn.u16 d0, q1, #1 + //calculate the "vl0,vl1,vl2,vl3,vl4" + vqrshrn.u16 d0, q1, #1 - //calculate the "vl5,vl6,vl7,vl8,vl9" - vqrshrn.u16 d1, q2, #2 + //calculate the "vl5,vl6,vl7,vl8,vl9" + vqrshrn.u16 d1, q2, #2 - //Adjust the data sequence for setting 
the luma MB - vst1.32 d0[0], [r0], r1 - vst1.32 d1[0], [r0], r1 - vext.8 d0, d0, d0, #1 - vext.8 d1, d1, d1, #1 - vst1.32 d0[0], [r0], r1 - vst1.32 d1[0], [r0] + //Adjust the data sequence for setting the luma MB + vst1.32 d0[0], [r0], r1 + vst1.32 d1[0], [r0], r1 + vext.8 d0, d0, d0, #1 + vext.8 d1, d1, d1, #1 + vst1.32 d0[0], [r0], r1 + vst1.32 d1[0], [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row (4 bytes) - sub r2, r0, r1 - vld1.32 {d0[1]}, [r2] + //stmdb sp!, { r2-r5, lr} + //Load the top row (4 bytes) + sub r2, r0, r1 + vld1.32 {d0[1]}, [r2] - //Load the left column (4 bytes) - sub r2, #1 - vld1.8 {d0[3]}, [r2], r1 - vld1.8 {d0[2]}, [r2], r1 - vld1.8 {d0[1]}, [r2], r1 - vld1.8 {d0[0]}, [r2] + //Load the left column (4 bytes) + sub r2, #1 + vld1.8 {d0[3]}, [r2], r1 + vld1.8 {d0[2]}, [r2], r1 + vld1.8 {d0[1]}, [r2], r1 + vld1.8 {d0[0]}, [r2] - vext.8 d1, d0, d0, #7 - vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3} + vext.8 d1, d0, d0, #7 + vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3} - vext.u8 q2, q1, q1, #14 - vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3} + vext.u8 q2, q1, q1, #14 + vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3} - //Calculate the vr0 ~ vr9 - vqrshrn.u16 d1, q2, #2 - vqrshrn.u16 d0, q1, #1 + //Calculate the vr0 ~ vr9 + vqrshrn.u16 d1, q2, #2 + vqrshrn.u16 d0, q1, #1 - //Adjust the data sequence for setting the luma MB - vst1.32 d0[1], [r0], r1 - vst1.32 d1[1], [r0], r1 - add r2, r0, r1 - vst1.8 d1[3], [r0]! - vst1.16 d0[2], [r0]! - vst1.8 d0[6], [r0]! - vst1.8 d1[2], [r2]! - vst1.16 d1[2], [r2]! - vst1.8 d1[6], [r2] + //Adjust the data sequence for setting the luma MB + vst1.32 d0[1], [r0], r1 + vst1.32 d1[1], [r0], r1 + add r2, r0, r1 + vst1.8 d1[3], [r0]! + vst1.16 d0[2], [r0]! + vst1.8 d0[6], [r0]! + vst1.8 d1[2], [r2]! + vst1.16 d1[2], [r2]! 
+ vst1.8 d1[6], [r2] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon - //stmdb sp!, { r2-r5, lr} - //Load the left column data - sub r2, r0, #1 - mov r3, #3 - mul r3, r1 - add r3, r2 - vld1.8 {d0[]}, [r3] - vld1.8 {d0[4]}, [r2], r1 - vld1.8 {d0[5]}, [r2], r1 - vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3} + //stmdb sp!, { r2-r5, lr} + //Load the left column data + sub r2, r0, #1 + mov r3, #3 + mul r3, r1 + add r3, r2 + vld1.8 {d0[]}, [r3] + vld1.8 {d0[4]}, [r2], r1 + vld1.8 {d0[5]}, [r2], r1 + vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3} - vext.8 d1, d0, d0, #1 - vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3} + vext.8 d1, d0, d0, #1 + vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3} - vext.u8 d2, d5, d4, #2 - vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3} + vext.u8 d2, d5, d4, #2 + vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3} - //Calculate the hu0 ~ hu5 - vqrshrn.u16 d2, q2, #1 - vqrshrn.u16 d1, q1, #2 + //Calculate the hu0 ~ hu5 + vqrshrn.u16 d2, q2, #1 + vqrshrn.u16 d1, q1, #2 - //Adjust the data sequence for setting the luma MB - vzip.8 d2, d1 - vst1.32 d1[0], [r0], r1 - vext.8 d2, d1, d1, #2 - vst1.32 d2[0], [r0], r1 - vst1.32 d1[1], [r0], r1 - vst1.32 d0[0], [r0] + //Adjust the data sequence for setting the luma MB + vzip.8 d2, d1 + vst1.32 d1[0], [r0], r1 + vext.8 d2, d1, d1, #2 + vst1.32 d2[0], [r0], r1 + vst1.32 d1[1], [r0], r1 + vst1.32 d0[0], [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon - //stmdb sp!, { r2-r5, lr} - //Load the data - sub r2, r0, r1 - sub r2, #1 - vld1.32 {d0[1]}, [r2], r1 - vld1.8 {d0[3]}, [r2], r1 - vld1.8 {d0[2]}, [r2], r1 - vld1.8 {d0[1]}, [r2], r1 - vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2} + //stmdb sp!, { r2-r5, lr} + //Load the data + sub r2, r0, r1 + sub r2, #1 + vld1.32 {d0[1]}, [r2], r1 + vld1.8 {d0[3]}, [r2], r1 + vld1.8 {d0[2]}, [r2], r1 + vld1.8 {d0[1]}, [r2], r1 + vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2} - vext.8 d1, d0, d0, #7 - vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2} + vext.8 d1, d0, d0, #7 + vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2} - vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1} - vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2} + vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1} + vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2} - //Calculate the hd0~hd9 - vqrshrn.u16 d1, q3, #2 - vqrshrn.u16 d0, q2, #1 + //Calculate the hd0~hd9 + vqrshrn.u16 d1, q3, #2 + vqrshrn.u16 d0, q2, #1 - //Adjust the data sequence for setting the luma MB - vmov d3, d1 - vtrn.8 d0, d1 - vext.u8 d2, d1, d1, #6 - vst2.16 {d2[3], d3[3]}, [r0], r1 - vst2.16 {d0[2], d1[2]}, [r0], r1 - vmov d3, d0 - vst2.16 {d2[2], d3[2]}, [r0], r1 - vst2.16 {d0[1], d1[1]}, [r0] + //Adjust the data sequence for setting the luma MB + vmov d3, d1 + vtrn.8 d0, d1 + vext.u8 d2, d1, d1, #6 + vst2.16 {d2[3], d3[3]}, [r0], r1 + vst2.16 {d0[2], d1[2]}, [r0], r1 + vmov d3, d0 + vst2.16 {d2[2], d3[2]}, [r0], r1 + vst2.16 {d0[1], d1[1]}, [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon - //stmdb sp!, { r2-r5, lr} - //Get the top row (8 byte) - sub r2, r0, r1 - vldr d0, [r2] + //stmdb sp!, { r2-r5, lr} + //Get the top row (8 byte) + sub r2, r0, r1 + vldr d0, [r2] - 
//Set the chroma MB using top row data - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0] + //Set the chroma MB using top row data + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon - //stmdb sp!, { r2-r5, lr} - ////Get the left column (8 byte) - sub r2, r0, #1 - vld1.8 {d0[]}, [r2], r1 - vld1.8 {d1[]}, [r2], r1 - vld1.8 {d2[]}, [r2], r1 - vld1.8 {d3[]}, [r2], r1 - vld1.8 {d4[]}, [r2], r1 - vld1.8 {d5[]}, [r2], r1 - vld1.8 {d6[]}, [r2], r1 - vld1.8 {d7[]}, [r2] + //stmdb sp!, { r2-r5, lr} + ////Get the left column (8 byte) + sub r2, r0, #1 + vld1.8 {d0[]}, [r2], r1 + vld1.8 {d1[]}, [r2], r1 + vld1.8 {d2[]}, [r2], r1 + vld1.8 {d3[]}, [r2], r1 + vld1.8 {d4[]}, [r2], r1 + vld1.8 {d5[]}, [r2], r1 + vld1.8 {d6[]}, [r2], r1 + vld1.8 {d7[]}, [r2] - //Set the chroma MB using left column data - vst1.8 {d0}, [r0], r1 - vst1.8 {d1}, [r0], r1 - vst1.8 {d2}, [r0], r1 - vst1.8 {d3}, [r0], r1 - vst1.8 {d4}, [r0], r1 - vst1.8 {d5}, [r0], r1 - vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r0] + //Set the chroma MB using left column data + vst1.8 {d0}, [r0], r1 + vst1.8 {d1}, [r0], r1 + vst1.8 {d2}, [r0], r1 + vst1.8 {d3}, [r0], r1 + vst1.8 {d4}, [r0], r1 + vst1.8 {d5}, [r0], r1 + vst1.8 {d6}, [r0], r1 + vst1.8 {d7}, [r0] WELS_ASM_FUNC_END @@ -576,73 +576,73 @@ CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x2823 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003 WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row data - sub r2, r0, #1 - sub r2, r1 - vld1.32 {d1[0]}, [r2] - add r2, #5 - vld1.32 {d0[0]}, [r2] + //stmdb sp!, { r2-r5, lr} + //Load the top row data + sub r2, r0, #1 + sub r2, r1 + vld1.32 {d1[0]}, [r2] + add r2, #5 + vld1.32 {d0[0]}, [r2] - //Load the left column data - sub r2, #5 - vld1.8 {d1[4]}, [r2], r1 - vld1.8 {d1[5]}, [r2], r1 - vld1.8 {d1[6]}, [r2], r1 - vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2} - add r2, r1 - vld1.8 {d0[4]}, [r2], r1 - vld1.8 {d0[5]}, [r2], r1 - vld1.8 {d0[6]}, [r2], r1 - vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7} + //Load the left column data + sub r2, #5 + vld1.8 {d1[4]}, [r2], r1 + vld1.8 {d1[5]}, [r2], r1 + vld1.8 {d1[6]}, [r2], r1 + vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2} + add r2, r1 + vld1.8 {d0[4]}, [r2], r1 + vld1.8 {d0[5]}, [r2], r1 + vld1.8 {d0[6]}, [r2], r1 + vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7} - //Save T7 to d3 for next step - vdup.u8 d3, d0[3] - //Save L7 to d4 for next step - vdup.u8 d4, d0[7] + //Save T7 to d3 for next step + vdup.u8 d3, d0[3] + //Save L7 to d4 for next step + vdup.u8 d4, d0[7] - //Calculate the value of 'a' and save to q2 - vaddl.u8 q2, d3, d4 - vshl.u16 q2, #4 + //Calculate the value of 'a' and save to q2 + vaddl.u8 q2, d3, d4 + vshl.u16 q2, #4 - //Load the table {{1,2,3,4,1,2,3,4}*17} - adr r2, CONST0_GET_I_CHROMA_PRED_PLANE - vld1.32 {d2}, [r2] + //Load the table {{1,2,3,4,1,2,3,4}*17} + adr r2, CONST0_GET_I_CHROMA_PRED_PLANE + vld1.32 {d2}, [r2] - //Calculate the 'b','c', and save to q0 - vrev32.8 d1, d1 - vsubl.u8 q0, d0, d1 - vmovl.u8 q1, d2 - vmul.s16 q0, q1 - vpaddl.s16 q0, q0 - vpaddl.s32 q0, q0 - vrshr.s64 q0, #5 + //Calculate the 
'b','c', and save to q0 + vrev32.8 d1, d1 + vsubl.u8 q0, d0, d1 + vmovl.u8 q1, d2 + vmul.s16 q0, q1 + vpaddl.s16 q0, q0 + vpaddl.s32 q0, q0 + vrshr.s64 q0, #5 - //Load the table {-3,-2,-1,0,1,2,3,4} to q3 - adr r2, CONST1_GET_I_CHROMA_PRED_PLANE - vld1.32 {d6, d7}, [r2] + //Load the table {-3,-2,-1,0,1,2,3,4} to q3 + adr r2, CONST1_GET_I_CHROMA_PRED_PLANE + vld1.32 {d6, d7}, [r2] - //Duplicate the 'b','c' to q0, q1 for SIMD instruction - vdup.s16 q1, d1[0] - vdup.s16 q0, d0[0] + //Duplicate the 'b','c' to q0, q1 for SIMD instruction + vdup.s16 q1, d1[0] + vdup.s16 q0, d0[0] - //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;" - vmla.s16 q2, q0, q3 - vmla.s16 q2, q1, d6[0] - vqrshrun.s16 d0, q2, #5 + //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;" + vmla.s16 q2, q0, q3 + vmla.s16 q2, q1, d6[0] + vqrshrun.s16 d0, q2, #5 - //Set a line of chroma MB - vst1.u32 {d0}, [r0], r1 + //Set a line of chroma MB + vst1.u32 {d0}, [r0], r1 - //Do the same processing for each line. - mov r2, #7 + //Do the same processing for each line. + mov r2, #7 loop_0_get_i_chroma_pred_plane: - vadd.s16 q2, q1 - vqrshrun.s16 d0, q2, #5 - vst1.u32 {d0}, [r0], r1 - subs r2, #1 - bne loop_0_get_i_chroma_pred_plane + vadd.s16 q2, q1 + vqrshrun.s16 d0, q2, #5 + vst1.u32 {d0}, [r0], r1 + subs r2, #1 + bne loop_0_get_i_chroma_pred_plane WELS_ASM_FUNC_END diff --git a/codec/decoder/core/x86/dct.asm b/codec/decoder/core/x86/dct.asm index c55b05a7..d15c95af 100644 --- a/codec/decoder/core/x86/dct.asm +++ b/codec/decoder/core/x86/dct.asm @@ -54,7 +54,7 @@ %endmacro %macro MMX_SumSub 3 - movq %3, %2 + movq %3, %2 psubw %2, %1 paddw %1, %3 %endmacro @@ -62,8 +62,8 @@ %macro MMX_IDCT 6 MMX_SumSub %4, %5, %6 MMX_SumSubDiv2 %3, %2, %1 - MMX_SumSub %1, %4, %6 - MMX_SumSub %3, %5, %6 + MMX_SumSub %1, %4, %6 + MMX_SumSub %3, %5, %6 %endmacro @@ -96,13 +96,13 @@ WELS_EXTERN IdctResAddPred_mmx movq mm2, [r2+16] movq mm3, [r2+24] - MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 - MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 + MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 + MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2 - MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 + MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 - WELS_Zero mm7 - WELS_DW32 mm6 + WELS_Zero mm7 + WELS_DW32 mm6 MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0] MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1] @@ -111,5 +111,5 @@ WELS_EXTERN IdctResAddPred_mmx MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1] - emms + emms ret diff --git a/codec/decoder/core/x86/intra_pred.asm b/codec/decoder/core/x86/intra_pred.asm index 4551d9fb..ca1f1f5f 100644 --- a/codec/decoder/core/x86/intra_pred.asm +++ b/codec/decoder/core/x86/intra_pred.asm @@ -36,10 +36,10 @@ ;* ;* History ;* 18/09/2009 Created -;* 19/11/2010 Added -;* WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2, -;* WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2 -;* and WelsDecoderIChromaPredDcNA_mmx +;* 19/11/2010 Added +;* WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2, +;* WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2 +;* and WelsDecoderIChromaPredDcNA_mmx ;* ;* ;*************************************************************************/ @@ -65,7 +65,7 @@ align 16 sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4 align 16 -mmx_01bytes: times 16 db 1 +mmx_01bytes: times 16 db 1 align 16 mmx_0x02: dw 0x02, 0x00, 0x00, 0x00 @@ -81,86 +81,86 @@ sse2_wd_0x02: times 8 dw 0x02 ;xmm0, xmm1, xmm2, eax, ecx ;lower 64 bits of xmm0 save the result %macro 
SSE2_PRED_H_4X4_TWO_LINE 5 - movd %1, [%4-1] - movdqa %3, %1 - punpcklbw %1, %3 - movdqa %3, %1 - punpcklbw %1, %3 + movd %1, [%4-1] + movdqa %3, %1 + punpcklbw %1, %3 + movdqa %3, %1 + punpcklbw %1, %3 - ;add %4, %5 - movd %2, [%4+%5-1] - movdqa %3, %2 - punpcklbw %2, %3 - movdqa %3, %2 - punpcklbw %2, %3 - punpckldq %1, %2 + ;add %4, %5 + movd %2, [%4+%5-1] + movdqa %3, %2 + punpcklbw %2, %3 + movdqa %3, %2 + punpcklbw %2, %3 + punpckldq %1, %2 %endmacro %macro LOAD_COLUMN 6 - movd %1, [%5] - movd %2, [%5+%6] - punpcklbw %1, %2 - lea %5, [%5+2*%6] - movd %3, [%5] - movd %2, [%5+%6] - punpcklbw %3, %2 - punpcklwd %1, %3 - lea %5, [%5+2*%6] - movd %4, [%5] - movd %2, [%5+%6] - punpcklbw %4, %2 - lea %5, [%5+2*%6] - movd %3, [%5] - movd %2, [%5+%6] - lea %5, [%5+2*%6] - punpcklbw %3, %2 - punpcklwd %4, %3 - punpckhdq %1, %4 + movd %1, [%5] + movd %2, [%5+%6] + punpcklbw %1, %2 + lea %5, [%5+2*%6] + movd %3, [%5] + movd %2, [%5+%6] + punpcklbw %3, %2 + punpcklwd %1, %3 + lea %5, [%5+2*%6] + movd %4, [%5] + movd %2, [%5+%6] + punpcklbw %4, %2 + lea %5, [%5+2*%6] + movd %3, [%5] + movd %2, [%5+%6] + lea %5, [%5+2*%6] + punpcklbw %3, %2 + punpcklwd %4, %3 + punpckhdq %1, %4 %endmacro %macro SUMW_HORIZON 3 - movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4 - paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04 - punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 - movhlps %2, %1 ; x2 = xxxx xxxx d37 d26 - paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246 - pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357 - paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 + movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4 + paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04 + punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 + movhlps %2, %1 ; x2 = xxxx xxxx d37 d26 + paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246 + pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357 + paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 %endmacro %macro COPY_16_TIMES 2 - movdqa %2, [%1-16] - psrldq %2, 15 - pmuludq %2, [mmx_01bytes] - pshufd %2, %2, 0 + movdqa %2, [%1-16] + psrldq %2, 15 + pmuludq %2, [mmx_01bytes] + pshufd %2, %2, 0 %endmacro %macro COPY_16_TIMESS 3 - movdqa %2, [%1+%3-16] - psrldq %2, 15 - pmuludq %2, [mmx_01bytes] - pshufd %2, %2, 0 + movdqa %2, [%1+%3-16] + psrldq %2, 15 + pmuludq %2, [mmx_01bytes] + pshufd %2, %2, 0 %endmacro %macro LOAD_COLUMN_C 6 - movd %1, [%5] - movd %2, [%5+%6] - punpcklbw %1,%2 - lea %5, [%5+2*%6] - movd %3, [%5] - movd %2, [%5+%6] - punpcklbw %3, %2 - punpckhwd %1, %3 - lea %5, [%5+2*%6] + movd %1, [%5] + movd %2, [%5+%6] + punpcklbw %1,%2 + lea %5, [%5+2*%6] + movd %3, [%5] + movd %2, [%5+%6] + punpcklbw %3, %2 + punpckhwd %1, %3 + lea %5, [%5+2*%6] %endmacro %macro LOAD_2_LEFT_AND_ADD 0 - lea r0, [r0+2*r1] - movzx r3, byte [r0-0x01] - add r2, r3 - movzx r3, byte [r0+r1-0x01] - add r2, r3 + lea r0, [r0+2*r1] + movzx r3, byte [r0-0x01] + add r2, r3 + movzx r3, byte [r0+r1-0x01] + add r2, r3 %endmacro ;******************************************************************************* @@ -173,131 +173,131 @@ SECTION .text ;******************************************************************************* ; void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride) ; -; pPred must align to 16 +; pPred must align to 16 ;******************************************************************************* WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2 - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d - movzx r2, byte [r0-1] - movd xmm0, r2d - pmuludq xmm0, [mmx_01bytes] + movzx 
r2, byte [r0-1] + movd xmm0, r2d + pmuludq xmm0, [mmx_01bytes] - movzx r2, byte [r0+r1-1] - movd xmm1, r2d - pmuludq xmm1, [mmx_01bytes] + movzx r2, byte [r0+r1-1] + movd xmm1, r2d + pmuludq xmm1, [mmx_01bytes] - lea r0, [r0+r1] - movzx r2, byte [r0+r1-1] - movd xmm2, r2d - pmuludq xmm2, [mmx_01bytes] + lea r0, [r0+r1] + movzx r2, byte [r0+r1-1] + movd xmm2, r2d + pmuludq xmm2, [mmx_01bytes] - movzx r2, byte [r0+2*r1-1] - movd xmm3, r2d - pmuludq xmm3, [mmx_01bytes] + movzx r2, byte [r0+2*r1-1] + movd xmm3, r2d + pmuludq xmm3, [mmx_01bytes] - sub r0, r1 - movd [r0], xmm0 - movd [r0+r1], xmm1 - lea r0, [r0+2*r1] - movd [r0], xmm2 - movd [r0+r1], xmm3 + sub r0, r1 + movd [r0], xmm0 + movd [r0+r1], xmm1 + lea r0, [r0+2*r1] + movd [r0], xmm2 + movd [r0+r1], xmm3 - ret + ret ;******************************************************************************* ; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride); ;******************************************************************************* WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2 - push r3 - push r4 - %assign push_num 2 - LOAD_2_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - mov r4, r0 ; save r0 in r4 - sub r0, 1 - sub r0, r1 + push r3 + push r4 + %assign push_num 2 + LOAD_2_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + mov r4, r0 ; save r0 in r4 + sub r0, 1 + sub r0, r1 - ;for H - pxor xmm7, xmm7 - movq xmm0, [r0] - movdqa xmm5, [sse2_plane_dec] - punpcklbw xmm0, xmm7 - pmullw xmm0, xmm5 - movq xmm1, [r0 + 9] - movdqa xmm6, [sse2_plane_inc] - punpcklbw xmm1, xmm7 - pmullw xmm1, xmm6 - psubw xmm1, xmm0 + ;for H + pxor xmm7, xmm7 + movq xmm0, [r0] + movdqa xmm5, [sse2_plane_dec] + punpcklbw xmm0, xmm7 + pmullw xmm0, xmm5 + movq xmm1, [r0 + 9] + movdqa xmm6, [sse2_plane_inc] + punpcklbw xmm1, xmm7 + pmullw xmm1, xmm6 + psubw xmm1, xmm0 - SUMW_HORIZON xmm1,xmm0,xmm2 - movd r2d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]); - movsx r2, r2w - imul r2, 5 - add r2, 32 - sar r2, 6 ; b = (5 * H + 32) >> 6; - SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b + SUMW_HORIZON xmm1,xmm0,xmm2 + movd r2d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]); + movsx r2, r2w + imul r2, 5 + add r2, 32 + sar r2, 6 ; b = (5 * H + 32) >> 6; + SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b - movzx r3, BYTE [r0+16] - sub r0, 3 - LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r0, r1 + movzx r3, BYTE [r0+16] + sub r0, 3 + LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r0, r1 - add r0, 3 - movzx r2, BYTE [r0+8*r1] - add r3, r2 - shl r3, 4 ; a = (left[15*kiStride] + top[15]) << 4; + add r0, 3 + movzx r2, BYTE [r0+8*r1] + add r3, r2 + shl r3, 4 ; a = (left[15*kiStride] + top[15]) << 4; - sub r0, 3 - add r0, r1 - LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r0, r1 - pxor xmm4, xmm4 - punpckhbw xmm0, xmm4 - pmullw xmm0, xmm5 - punpckhbw xmm7, xmm4 - pmullw xmm7, xmm6 - psubw xmm7, xmm0 + sub r0, 3 + add r0, r1 + LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r0, r1 + pxor xmm4, xmm4 + punpckhbw xmm0, xmm4 + pmullw xmm0, xmm5 + punpckhbw xmm7, xmm4 + pmullw xmm7, xmm6 + psubw xmm7, xmm0 - SUMW_HORIZON xmm7,xmm0,xmm2 - movd r2d, xmm7 ; V - movsx r2, r2w + SUMW_HORIZON xmm7,xmm0,xmm2 + movd r2d, xmm7 ; V + movsx r2, r2w - imul r2, 5 - add r2, 32 - sar r2, 6 ; c = (5 * V + 32) >> 6; - SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c + imul r2, 5 + add r2, 32 + sar r2, 6 ; c = (5 * V + 32) >> 6; + SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c - mov r0, r4 - add r3, 16 - imul r2, -7 - add r3, r2 ; s = a + 16 + (-7)*c - SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s + mov r0, 
r4 + add r3, 16 + imul r2, -7 + add r3, r2 ; s = a + 16 + (-7)*c + SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s - xor r2, r2 - movdqa xmm5, [sse2_plane_inc_minus] + xor r2, r2 + movdqa xmm5, [sse2_plane_inc_minus] get_i16x16_luma_pred_plane_sse2_1: - movdqa xmm2, xmm1 - pmullw xmm2, xmm5 - paddw xmm2, xmm0 - psraw xmm2, 5 - movdqa xmm3, xmm1 - pmullw xmm3, xmm6 - paddw xmm3, xmm0 - psraw xmm3, 5 - packuswb xmm2, xmm3 - movdqa [r0], xmm2 - paddw xmm0, xmm4 - add r0, r1 - inc r2 - cmp r2, 16 - jnz get_i16x16_luma_pred_plane_sse2_1 + movdqa xmm2, xmm1 + pmullw xmm2, xmm5 + paddw xmm2, xmm0 + psraw xmm2, 5 + movdqa xmm3, xmm1 + pmullw xmm3, xmm6 + paddw xmm3, xmm0 + psraw xmm3, 5 + packuswb xmm2, xmm3 + movdqa [r0], xmm2 + paddw xmm0, xmm4 + add r0, r1 + inc r2 + cmp r2, 16 + jnz get_i16x16_luma_pred_plane_sse2_1 - POP_XMM - pop r4 - pop r3 - ret + POP_XMM + pop r4 + pop r3 + ret @@ -306,31 +306,31 @@ get_i16x16_luma_pred_plane_sse2_1: ;******************************************************************************* %macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2 - lea %1, [%1+%2*2] + lea %1, [%1+%2*2] - COPY_16_TIMES %1, xmm0 - movdqa [%1], xmm0 - COPY_16_TIMESS %1, xmm0, %2 - movdqa [%1+%2], xmm0 + COPY_16_TIMES %1, xmm0 + movdqa [%1], xmm0 + COPY_16_TIMESS %1, xmm0, %2 + movdqa [%1+%2], xmm0 %endmacro WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2 - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d - COPY_16_TIMES r0, xmm0 - movdqa [r0], xmm0 - COPY_16_TIMESS r0, xmm0, r1 - movdqa [r0+r1], xmm0 + COPY_16_TIMES r0, xmm0 + movdqa [r0], xmm0 + COPY_16_TIMESS r0, xmm0, r1 + movdqa [r0+r1], xmm0 - SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 - SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 - SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 - SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 - SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 - SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 - SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 + SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 + SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 + SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 + SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 + SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 + SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 + SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ret @@ -338,9 +338,9 @@ WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2 ; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride); ;******************************************************************************* WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2 - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d sub r0, r1 movdqa xmm0, [r0] @@ -376,252 +376,252 @@ WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2 ; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride); ;******************************************************************************* WELS_EXTERN WelsDecoderIChromaPredPlane_sse2 - push r3 - push r4 - %assign push_num 2 - LOAD_2_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - mov r4, r0 - sub r0, 1 - sub r0, r1 + push r3 + push r4 + %assign push_num 2 + LOAD_2_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + mov r4, r0 + sub r0, 1 + sub r0, r1 - pxor mm7, mm7 - movq mm0, [r0] - movq mm5, [sse2_plane_dec_c] - punpcklbw mm0, mm7 - pmullw mm0, mm5 - movq mm1, [r0 + 5] - movq mm6, [sse2_plane_inc_c] - punpcklbw mm1, mm7 - pmullw mm1, mm6 - psubw mm1, mm0 + pxor mm7, mm7 + movq mm0, [r0] + movq mm5, [sse2_plane_dec_c] + punpcklbw mm0, mm7 + pmullw mm0, mm5 + movq mm1, [r0 + 5] + movq mm6, 
[sse2_plane_inc_c] + punpcklbw mm1, mm7 + pmullw mm1, mm6 + psubw mm1, mm0 - movq2dq xmm1, mm1 - pxor xmm2, xmm2 - SUMW_HORIZON xmm1,xmm0,xmm2 - movd r2d, xmm1 - movsx r2, r2w - imul r2, 17 - add r2, 16 - sar r2, 5 ; b = (17 * H + 16) >> 5; - SSE2_Copy8Times xmm1, r2d ; mm1 = b,b,b,b,b,b,b,b + movq2dq xmm1, mm1 + pxor xmm2, xmm2 + SUMW_HORIZON xmm1,xmm0,xmm2 + movd r2d, xmm1 + movsx r2, r2w + imul r2, 17 + add r2, 16 + sar r2, 5 ; b = (17 * H + 16) >> 5; + SSE2_Copy8Times xmm1, r2d ; mm1 = b,b,b,b,b,b,b,b - movzx r3, BYTE [r0+8] - sub r0, 3 - LOAD_COLUMN_C mm0, mm2, mm3, mm4, r0, r1 + movzx r3, BYTE [r0+8] + sub r0, 3 + LOAD_COLUMN_C mm0, mm2, mm3, mm4, r0, r1 - add r0, 3 - movzx r2, BYTE [r0+4*r1] - add r3, r2 - shl r3, 4 ; a = (left[7*kiStride] + top[7]) << 4; + add r0, 3 + movzx r2, BYTE [r0+4*r1] + add r3, r2 + shl r3, 4 ; a = (left[7*kiStride] + top[7]) << 4; - sub r0, 3 - add r0, r1 - LOAD_COLUMN_C mm7, mm2, mm3, mm4, r0, r1 - pxor mm4, mm4 - punpckhbw mm0, mm4 - pmullw mm0, mm5 - punpckhbw mm7, mm4 - pmullw mm7, mm6 - psubw mm7, mm0 + sub r0, 3 + add r0, r1 + LOAD_COLUMN_C mm7, mm2, mm3, mm4, r0, r1 + pxor mm4, mm4 + punpckhbw mm0, mm4 + pmullw mm0, mm5 + punpckhbw mm7, mm4 + pmullw mm7, mm6 + psubw mm7, mm0 - movq2dq xmm7, mm7 - pxor xmm2, xmm2 - SUMW_HORIZON xmm7,xmm0,xmm2 - movd r2d, xmm7 ; V - movsx r2, r2w + movq2dq xmm7, mm7 + pxor xmm2, xmm2 + SUMW_HORIZON xmm7,xmm0,xmm2 + movd r2d, xmm7 ; V + movsx r2, r2w - imul r2, 17 - add r2, 16 - sar r2, 5 ; c = (17 * V + 16) >> 5; - SSE2_Copy8Times xmm4, r2d ; mm4 = c,c,c,c,c,c,c,c + imul r2, 17 + add r2, 16 + sar r2, 5 ; c = (17 * V + 16) >> 5; + SSE2_Copy8Times xmm4, r2d ; mm4 = c,c,c,c,c,c,c,c - mov r0, r4 - add r3, 16 - imul r2, -3 - add r3, r2 ; s = a + 16 + (-3)*c - SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s + mov r0, r4 + add r3, 16 + imul r2, -3 + add r3, r2 ; s = a + 16 + (-3)*c + SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s - xor r2, r2 - movdqa xmm5, [sse2_plane_mul_b_c] + xor r2, r2 + movdqa xmm5, [sse2_plane_mul_b_c] get_i_chroma_pred_plane_sse2_1: - movdqa xmm2, xmm1 - pmullw xmm2, xmm5 - paddw xmm2, xmm0 - psraw xmm2, 5 - packuswb xmm2, xmm2 - movq [r0], xmm2 - paddw xmm0, xmm4 - add r0, r1 - inc r2 - cmp r2, 8 - jnz get_i_chroma_pred_plane_sse2_1 + movdqa xmm2, xmm1 + pmullw xmm2, xmm5 + paddw xmm2, xmm0 + psraw xmm2, 5 + packuswb xmm2, xmm2 + movq [r0], xmm2 + paddw xmm0, xmm4 + add r0, r1 + inc r2 + cmp r2, 8 + jnz get_i_chroma_pred_plane_sse2_1 - POP_XMM - pop r4 - pop r3 - WELSEMMS - ret + POP_XMM + pop r4 + pop r3 + WELSEMMS + ret ;******************************************************************************* -; 0 |1 |2 |3 |4 | -; 6 |7 |8 |9 |10| -; 11|12|13|14|15| -; 16|17|18|19|20| -; 21|22|23|24|25| -; 7 is the start pixel of current 4x4 block -; pPred[7] = ([6]+[0]*2+[1]+2)/4 +; 0 |1 |2 |3 |4 | +; 6 |7 |8 |9 |10| +; 11|12|13|14|15| +; 16|17|18|19|20| +; 21|22|23|24|25| +; 7 is the start pixel of current 4x4 block +; pPred[7] = ([6]+[0]*2+[1]+2)/4 ; ; void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride) ; ;******************************************************************************* WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - mov r2, r0 + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + mov r2, r0 - movq mm1,[r2+r1-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11 - movq mm2,[r2-8] ;get value of 6 mm2[8] = 6 - sub r2, r1 ;mov eax to above line of current 
block(postion of 1) - punpckhbw mm2,[r2-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6] - movd mm3,[r2] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3] - punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11] - psllq mm3,18h ;mm3[5]=[1] - psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11] - por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11] - movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11] - lea r2,[r2+r1*2-8h] ;set eax point to 12 - movq mm4,[r2+r1] ;get value of 16, mm4[8]=[16] - psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0 - psrlq mm4,38h ;mm4[1]=[16] - por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16] - movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16] - movq mm4,[r2+r1*2] ;mm4[8]=[21] - psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0 - psrlq mm4,38h ;mm4[1]=[21] - por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21] - movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21] - pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2 - pxor mm1,mm4 ;find odd value in the lowest bit of each byte - pand mm1,[mmx_01bytes] ;set the odd bit - psubusb mm3,mm1 ;decrease 1 from odd bytes - pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2 + movq mm1,[r2+r1-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11 + movq mm2,[r2-8] ;get value of 6 mm2[8] = 6 + sub r2, r1 ;mov eax to above line of current block(postion of 1) + punpckhbw mm2,[r2-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6] + movd mm3,[r2] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3] + punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11] + psllq mm3,18h ;mm3[5]=[1] + psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11] + por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11] + movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11] + lea r2,[r2+r1*2-8h] ;set eax point to 12 + movq mm4,[r2+r1] ;get value of 16, mm4[8]=[16] + psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0 + psrlq mm4,38h ;mm4[1]=[16] + por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16] + movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16] + movq mm4,[r2+r1*2] ;mm4[8]=[21] + psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0 + psrlq mm4,38h ;mm4[1]=[21] + por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21] + movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21] + pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2 + pxor mm1,mm4 ;find odd value in the lowest bit of each byte + pand mm1,[mmx_01bytes] ;set the odd bit + psubusb mm3,mm1 ;decrease 1 from odd bytes + pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2 - lea r0,[r0+r1] - movd [r0+2*r1],mm2 - sub r0,r1 - psrlq mm2,8 - movd [r0+2*r1],mm2 - psrlq mm2,8 - movd [r0+r1],mm2 - psrlq mm2,8 - movd [r0],mm2 - WELSEMMS - ret + lea r0,[r0+r1] + movd [r0+2*r1],mm2 + sub r0,r1 + psrlq mm2,8 + movd [r0+2*r1],mm2 + psrlq mm2,8 + movd [r0+r1],mm2 + psrlq mm2,8 + movd [r0],mm2 + WELSEMMS + ret 
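The 4x4 down-right routine above packs the eight boundary samples into one MMX register and realises the 3-tap (x + 2*y + z + 2) >> 2 filter with two pavgb steps, using the odd-LSB correction noted in the comments to keep the rounding exact. Below is a minimal plain-C sketch of the same down-right rule, for reference only; the helper name and the edge[] packing are illustrative rather than OpenH264's reference code, but the samples it produces are the ones the assembly stores.

#include <stdint.h>

/* Illustrative C sketch of 4x4 down-right intra prediction.
 * pPred points at the 4x4 block; neighbours are read relative to it,
 * exactly as the MMX routine does. */
static void I4x4PredDDR_ref (uint8_t* pPred, int32_t kiStride) {
    const uint8_t* pTop = pPred - kiStride;       /* T0..T3 */
    uint8_t edge[9];                              /* L3 L2 L1 L0 LT T0 T1 T2 T3 */
    int x, y, i;
    for (i = 0; i < 4; i++)
        edge[3 - i] = pPred[i * kiStride - 1];    /* left column */
    edge[4] = pTop[-1];                           /* top-left corner */
    for (i = 0; i < 4; i++)
        edge[5 + i] = pTop[i];                    /* top row */
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            int c = 4 + x - y;                    /* centre tap for this pixel */
            pPred[y * kiStride + x] =
                (uint8_t) ((edge[c - 1] + 2 * edge[c] + edge[c + 1] + 2) >> 2);
        }
}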
;******************************************************************************* -; void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride) +; void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride) ; copy 8 pixel of 8 line from left ;******************************************************************************* %macro MMX_PRED_H_8X8_ONE_LINE 4 - movq %1, [%3-8] - psrlq %1, 38h + movq %1, [%3-8] + psrlq %1, 38h - pmullw %1, [mmx_01bytes] - pshufw %1, %1, 0 - movq [%4], %1 + pmullw %1, [mmx_01bytes] + pshufw %1, %1, 0 + movq [%4], %1 %endmacro %macro MMX_PRED_H_8X8_ONE_LINEE 4 - movq %1, [%3+r1-8] - psrlq %1, 38h + movq %1, [%3+r1-8] + psrlq %1, 38h - pmullw %1, [mmx_01bytes] - pshufw %1, %1, 0 - movq [%4], %1 + pmullw %1, [mmx_01bytes] + pshufw %1, %1, 0 + movq [%4], %1 %endmacro WELS_EXTERN WelsDecoderIChromaPredH_mmx - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - mov r2, r0 + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + mov r2, r0 - movq mm0, [r2-8] - psrlq mm0, 38h + movq mm0, [r2-8] + psrlq mm0, 38h - pmullw mm0, [mmx_01bytes] - pshufw mm0, mm0, 0 - movq [r0], mm0 + pmullw mm0, [mmx_01bytes] + pshufw mm0, mm0, 0 + movq [r0], mm0 - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 + MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 - lea r2, [r2+r1*2] - MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 + lea r2, [r2+r1*2] + MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 - lea r0, [r0+2*r1] - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 + lea r0, [r0+2*r1] + MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 - lea r2, [r2+r1*2] - MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 + lea r2, [r2+r1*2] + MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 - lea r0, [r0+2*r1] - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 + lea r0, [r0+2*r1] + MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 - lea r2, [r2+r1*2] - MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 + lea r2, [r2+r1*2] + MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 - lea r0, [r0+2*r1] - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 + lea r0, [r0+2*r1] + MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 - WELSEMMS - ret + WELSEMMS + ret ;******************************************************************************* -; void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride) +; void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride) ; copy 8 pixels from top 8 pixels ;******************************************************************************* WELS_EXTERN WelsDecoderIChromaPredV_mmx - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d - sub r0, r1 - movq mm0, [r0] + sub r0, r1 + movq mm0, [r0] - movq [r0+r1], mm0 - movq [r0+2*r1], mm0 - lea r0, [r0+2*r1] - movq [r0+r1], mm0 - movq [r0+2*r1], mm0 - lea r0, [r0+2*r1] - movq [r0+r1], mm0 - movq [r0+2*r1], mm0 - lea r0, [r0+2*r1] - movq [r0+r1], mm0 - movq [r0+2*r1], mm0 + movq [r0+r1], mm0 + movq [r0+2*r1], mm0 + lea r0, [r0+2*r1] + movq [r0+r1], mm0 + movq [r0+2*r1], mm0 + lea r0, [r0+2*r1] + movq [r0+r1], mm0 + movq [r0+2*r1], mm0 + lea r0, [r0+2*r1] + movq [r0+r1], mm0 + movq [r0+2*r1], mm0 - WELSEMMS - ret + WELSEMMS + ret ;******************************************************************************* -; lt|t0|t1|t2|t3| -; l0| -; l1| -; l2| -; l3| -; t3 will never been used +; lt|t0|t1|t2|t3| +; l0| +; l1| +; l2| +; l3| +; t3 will never been used ; destination: -; |a |b |c |d | -; |e |f |a |b | -; |g |h |e |f | -; |i |j |g |h | +; |a |b |c |d | +; |e |f |a 
|b | +; |g |h |e |f | +; |i |j |g |h | ; a = (1 + lt + l0)>>1 ; e = (1 + l0 + l1)>>1 @@ -640,73 +640,73 @@ WELS_EXTERN WelsDecoderIChromaPredV_mmx ; void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - mov r2, r0 - sub r2, r1 - movd mm0, [r2-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt] - psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx] + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + mov r2, r0 + sub r2, r1 + movd mm0, [r2-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt] + psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx] - movd mm1, [r2+2*r1-4] - punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1 - lea r2, [r2+2*r1] - movd mm2, [r2+2*r1-4] - punpcklbw mm2, [r2+r1-4] ; mm2[7] = l2, mm2[6] = l3 - punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx] - psrlq mm2, 20h - pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3] + movd mm1, [r2+2*r1-4] + punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1 + lea r2, [r2+2*r1] + movd mm2, [r2+2*r1-4] + punpcklbw mm2, [r2+r1-4] ; mm2[7] = l2, mm2[6] = l3 + punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx] + psrlq mm2, 20h + pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3] - movq mm1, mm0 - psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1] - movq mm2, mm0 - psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2] - movq mm3, mm2 - movq mm4, mm1 - pavgb mm1, mm0 + movq mm1, mm0 + psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1] + movq mm2, mm0 + psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2] + movq mm3, mm2 + movq mm4, mm1 + pavgb mm1, mm0 - pxor mm4, mm0 ; find odd value in the lowest bit of each byte - pand mm4, [mmx_01bytes] ; set the odd bit - psubusb mm1, mm4 ; decrease 1 from odd bytes + pxor mm4, mm0 ; find odd value in the lowest bit of each byte + pand mm4, [mmx_01bytes] ; set the odd bit + psubusb mm1, mm4 ; decrease 1 from odd bytes - pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j] + pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j] - movq mm4, mm0 - pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i] - punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i] + movq mm4, mm0 + pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i] + punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i] - psrlq mm2, 20h - psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0] - movq mm4, mm3 - psrlq mm4, 10h ; mm4 = [0 0 b a f e h j] - pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx] - psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a] + psrlq mm2, 20h + psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0] + movq mm4, mm3 + psrlq mm4, 10h ; mm4 = [0 0 b a f e h j] + pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx] + psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a] - movd [r0], mm2 - lea r0, [r0+r1] - movd [r0+2*r1], mm3 - sub r0, r1 - psrlq mm3, 10h - movd [r0+2*r1], mm3 - psrlq mm3, 10h - movd [r0+r1], mm3 - WELSEMMS - ret + movd [r0], mm2 + lea r0, [r0+r1] + movd [r0+2*r1], mm3 + sub r0, r1 + psrlq mm3, 10h + movd [r0+2*r1], mm3 + psrlq mm3, 10h + movd [r0+r1], mm3 + WELSEMMS + ret ;******************************************************************************* -; lt|t0|t1|t2|t3| -; l0| -; l1| -; l2| -; l3| -; t3 will never been used +; lt|t0|t1|t2|t3| +; l0| +; l1| +; l2| +; l3| +; t3 will never been used ; destination: -; |a |b |c |d | -; |c |d |e |f | -; |e |f |g |g | -; |g |g |g |g | +; |a |b |c |d | +; |c |d |e |f | +; |e |f |g |g | +; |g |g |g |g | ; a = (1 + l0 + l1)>>1 ; c = (1 + l1 + l2)>>1 @@ -722,74 +722,74 @@ WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx ; void 
WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - mov r2, r0 + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + mov r2, r0 - movd mm0, [r2-4] ; mm0[3] = l0 - punpcklbw mm0, [r2+r1-4] ; mm0[7] = l1, mm0[6] = l0 - lea r2, [r2+2*r1] - movd mm2, [r2-4] ; mm2[3] = l2 - movd mm4, [r2+r1-4] ; mm4[3] = l3 - punpcklbw mm2, mm4 - punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx] + movd mm0, [r2-4] ; mm0[3] = l0 + punpcklbw mm0, [r2+r1-4] ; mm0[7] = l1, mm0[6] = l0 + lea r2, [r2+2*r1] + movd mm2, [r2-4] ; mm2[3] = l2 + movd mm4, [r2+r1-4] ; mm4[3] = l3 + punpcklbw mm2, mm4 + punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx] - psrlq mm4, 18h - psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx] - psrlq mm0, 8h - pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx] + psrlq mm4, 18h + psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx] + psrlq mm0, 8h + pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx] - movq mm1, mm0 - psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx] - movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx] - pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx] + movq mm1, mm0 + psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx] + movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx] + pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx] - movq mm2, mm0 - psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx] - movq mm5, mm2 - pavgb mm2, mm0 + movq mm2, mm0 + psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx] + movq mm5, mm2 + pavgb mm2, mm0 - pxor mm5, mm0 ; find odd value in the lowest bit of each byte - pand mm5, [mmx_01bytes] ; set the odd bit - psubusb mm2, mm5 ; decrease 1 from odd bytes + pxor mm5, mm0 ; find odd value in the lowest bit of each byte + pand mm5, [mmx_01bytes] ; set the odd bit + psubusb mm2, mm5 ; decrease 1 from odd bytes - pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx] + pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx] - psrlq mm2, 8h - pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx] + psrlq mm2, 8h + pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx] - punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a] - punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx] - punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx] + punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a] + punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx] + punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx] - psrlq mm4, 20h - lea r0, [r0+r1] - movd [r0+2*r1], mm4 + psrlq mm4, 20h + lea r0, [r0+r1] + movd [r0+2*r1], mm4 - sub r0, r1 - movd [r0], mm1 - psrlq mm1, 10h - movd [r0+r1], mm1 - psrlq mm1, 10h - movd [r0+2*r1], mm1 - WELSEMMS - ret + sub r0, r1 + movd [r0], mm1 + psrlq mm1, 10h + movd [r0+r1], mm1 + psrlq mm1, 10h + movd [r0+2*r1], mm1 + WELSEMMS + ret ;******************************************************************************* -; lt|t0|t1|t2|t3| -; l0| -; l1| -; l2| -; l3| -; l3 will never been used +; lt|t0|t1|t2|t3| +; l0| +; l1| +; l2| +; l3| +; l3 will never been used ; destination: -; |a |b |c |d | -; |e |f |g |h | -; |i |a |b |c | -; |j |e |f |g | +; |a |b |c |d | +; |e |f |g |h | +; |i |a |b |c | +; |j |e |f |g | ; a = (1 + lt + t0)>>1 ; b = (1 + t0 + t1)>>1 @@ -807,77 +807,77 @@ WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx ; void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx - %assign push_num 0 - LOAD_2_PARA 
- SIGN_EXTENSION r1, r1d - mov r2, r0 - sub r2, r1 - movq mm0, [r2-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt] - psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx] + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + mov r2, r0 + sub r2, r1 + movq mm0, [r2-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt] + psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx] - movd mm1, [r2+2*r1-4] - punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1 - lea r2, [r2+2*r1] - movq mm2, [r2+r1-8] ; mm2[7] = l2 - punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx] - psrlq mm2, 28h - pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2] + movd mm1, [r2+2*r1-4] + punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1 + lea r2, [r2+2*r1] + movq mm2, [r2+r1-8] ; mm2[7] = l2 + punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx] + psrlq mm2, 28h + pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2] - movq mm1, mm0 - psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx] - pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx] + movq mm1, mm0 + psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx] + pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx] - movq mm2, mm0 - psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx] - movq mm3, mm2 - pavgb mm2, mm0 + movq mm2, mm0 + psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx] + movq mm3, mm2 + pavgb mm2, mm0 - pxor mm3, mm0 ; find odd value in the lowest bit of each byte - pand mm3, [mmx_01bytes] ; set the odd bit - psubusb mm2, mm3 ; decrease 1 from odd bytes + pxor mm3, mm0 ; find odd value in the lowest bit of each byte + pand mm3, [mmx_01bytes] ; set the odd bit + psubusb mm2, mm3 ; decrease 1 from odd bytes - movq mm3, mm0 - psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx] - pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx] - movq mm2, mm3 + movq mm3, mm0 + psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx] + pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx] + movq mm2, mm3 - psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a] - movd [r0], mm1 + psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a] + movd [r0], mm1 - psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e] - movd [r0+r1], mm2 + psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e] + movd [r0+r1], mm2 - movq mm4, mm3 - psllq mm4, 20h - psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i] + movq mm4, mm3 + psllq mm4, 20h + psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i] - movq mm5, mm3 - psllq mm5, 28h - psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j] + movq mm5, mm3 + psllq mm5, 28h + psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j] - psllq mm1, 8h - pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i] - movd [r0+2*r1], mm4 + psllq mm1, 8h + pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i] + movd [r0+2*r1], mm4 - psllq mm2, 8h - pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j] - lea r0, [r0+2*r1] - movd [r0+r1], mm5 - WELSEMMS - ret + psllq mm2, 8h + pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j] + lea r0, [r0+2*r1] + movd [r0+r1], mm5 + WELSEMMS + ret ;******************************************************************************* -; lt|t0|t1|t2|t3|t4|t5|t6|t7 -; l0| -; l1| -; l2| -; l3| -; lt,t0,t1,t2,t3 will never been used +; lt|t0|t1|t2|t3|t4|t5|t6|t7 +; l0| +; l1| +; l2| +; l3| +; lt,t0,t1,t2,t3 will never been used ; destination: -; |a |b |c |d | -; |b |c |d |e | -; |c |d |e |f | -; |d |e |f |g | +; |a |b |c |d | +; |b |c |d |e | +; |c |d |e |f | +; |d |e |f |g | ; a = (2 + t0 + t2 + (t1<<1))>>2 ; b = (2 + t1 + t3 + (t2<<1))>>2 @@ -893,56 +893,56 @@ WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx ; void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride) 
;******************************************************************************* WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - mov r2, r0 - sub r2, r1 - movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] - movq mm1, mm0 - movq mm2, mm0 + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + mov r2, r0 + sub r2, r1 + movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] + movq mm1, mm0 + movq mm2, mm0 - movq mm3, mm0 - psrlq mm3, 38h - psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx] + movq mm3, mm0 + psrlq mm3, 38h + psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx] - psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx] - psrlq mm2, 8h - pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1] + psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx] + psrlq mm2, 8h + pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1] - movq mm3, mm1 - pavgb mm1, mm2 - pxor mm3, mm2 ; find odd value in the lowest bit of each byte - pand mm3, [mmx_01bytes] ; set the odd bit - psubusb mm1, mm3 ; decrease 1 from odd bytes + movq mm3, mm1 + pavgb mm1, mm2 + pxor mm3, mm2 ; find odd value in the lowest bit of each byte + pand mm3, [mmx_01bytes] ; set the odd bit + psubusb mm1, mm3 ; decrease 1 from odd bytes - pavgb mm0, mm1 ; mm0 = [g f e d c b a xx] + pavgb mm0, mm1 ; mm0 = [g f e d c b a xx] - psrlq mm0, 8h - movd [r0], mm0 - psrlq mm0, 8h - movd [r0+r1], mm0 - psrlq mm0, 8h - movd [r0+2*r1], mm0 - psrlq mm0, 8h - lea r0, [r0+2*r1] - movd [r0+r1], mm0 - WELSEMMS - ret + psrlq mm0, 8h + movd [r0], mm0 + psrlq mm0, 8h + movd [r0+r1], mm0 + psrlq mm0, 8h + movd [r0+2*r1], mm0 + psrlq mm0, 8h + lea r0, [r0+2*r1] + movd [r0+r1], mm0 + WELSEMMS + ret ;******************************************************************************* -; lt|t0|t1|t2|t3|t4|t5|t6|t7 -; l0| -; l1| -; l2| -; l3| -; lt,t0,t1,t2,t3 will never been used +; lt|t0|t1|t2|t3|t4|t5|t6|t7 +; l0| +; l1| +; l2| +; l3| +; lt,t0,t1,t2,t3 will never been used ; destination: -; |a |b |c |d | -; |e |f |g |h | -; |b |c |d |i | -; |f |g |h |j | +; |a |b |c |d | +; |e |f |g |h | +; |b |c |d |i | +; |f |g |h |j | ; a = (1 + t0 + t1)>>1 ; b = (1 + t1 + t2)>>1 @@ -961,133 +961,133 @@ WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx ; void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - mov r2, r0 + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + mov r2, r0 - sub r2, r1 - movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] - movq mm1, mm0 - movq mm2, mm0 + sub r2, r1 + movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] + movq mm1, mm0 + movq mm2, mm0 - psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1] - psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2] + psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1] + psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2] - movq mm3, mm1 - pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a] + movq mm3, mm1 + pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a] - movq mm4, mm2 - pavgb mm2, mm0 - pxor mm4, mm0 ; find odd value in the lowest bit of each byte - pand mm4, [mmx_01bytes] ; set the odd bit - psubusb mm2, mm4 ; decrease 1 from odd bytes + movq mm4, mm2 + pavgb mm2, mm0 + pxor mm4, mm0 ; find odd value in the lowest bit of each byte + pand mm4, [mmx_01bytes] ; set the odd bit + psubusb mm2, mm4 ; decrease 1 from odd bytes - pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e] + pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f 
e] - movd [r0], mm3 - psrlq mm3, 8h - movd [r0+2*r1], mm3 + movd [r0], mm3 + psrlq mm3, 8h + movd [r0+2*r1], mm3 - movd [r0+r1], mm2 - psrlq mm2, 8h - lea r0, [r0+2*r1] - movd [r0+r1], mm2 - WELSEMMS - ret + movd [r0+r1], mm2 + psrlq mm2, 8h + lea r0, [r0+2*r1] + movd [r0+r1], mm2 + WELSEMMS + ret ;******************************************************************************* ; ; void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderIChromaPredDc_sse2 - push r3 - push r4 - %assign push_num 2 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - mov r4, r0 + push r3 + push r4 + %assign push_num 2 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + mov r4, r0 - sub r0, r1 - movq mm0, [r0] + sub r0, r1 + movq mm0, [r0] - movzx r2, byte [r0+r1-0x01] ; l1 - lea r0, [r0+2*r1] - movzx r3, byte [r0-0x01] ; l2 - add r2, r3 - movzx r3, byte [r0+r1-0x01] ; l3 - add r2, r3 - lea r0, [r0+2*r1] - movzx r3, byte [r0-0x01] ; l4 - add r2, r3 - movd mm1, r2d ; mm1 = l1+l2+l3+l4 + movzx r2, byte [r0+r1-0x01] ; l1 + lea r0, [r0+2*r1] + movzx r3, byte [r0-0x01] ; l2 + add r2, r3 + movzx r3, byte [r0+r1-0x01] ; l3 + add r2, r3 + lea r0, [r0+2*r1] + movzx r3, byte [r0-0x01] ; l4 + add r2, r3 + movd mm1, r2d ; mm1 = l1+l2+l3+l4 - movzx r2, byte [r0+r1-0x01] ; l5 - lea r0, [r0+2*r1] - movzx r3, byte [r0-0x01] ; l6 - add r2, r3 - movzx r3, byte [r0+r1-0x01] ; l7 - add r2, r3 - lea r0, [r0+2*r1] - movzx r3, byte [r0-0x01] ; l8 - add r2, r3 - movd mm2, r2d ; mm2 = l5+l6+l7+l8 + movzx r2, byte [r0+r1-0x01] ; l5 + lea r0, [r0+2*r1] + movzx r3, byte [r0-0x01] ; l6 + add r2, r3 + movzx r3, byte [r0+r1-0x01] ; l7 + add r2, r3 + lea r0, [r0+2*r1] + movzx r3, byte [r0-0x01] ; l8 + add r2, r3 + movd mm2, r2d ; mm2 = l5+l6+l7+l8 - movq mm3, mm0 - psrlq mm0, 0x20 - psllq mm3, 0x20 - psrlq mm3, 0x20 - pxor mm4, mm4 - psadbw mm0, mm4 - psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2 + movq mm3, mm0 + psrlq mm0, 0x20 + psllq mm3, 0x20 + psrlq mm3, 0x20 + pxor mm4, mm4 + psadbw mm0, mm4 + psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2 - paddq mm3, mm1 - movq mm1, mm2 - paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1 + paddq mm3, mm1 + movq mm1, mm2 + paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1 - movq mm4, [mmx_0x02] + movq mm4, [mmx_0x02] - paddq mm0, mm4 - psrlq mm0, 0x02 + paddq mm0, mm4 + psrlq mm0, 0x02 - paddq mm2, mm4 - psrlq mm2, 0x02 + paddq mm2, mm4 + psrlq mm2, 0x02 - paddq mm3, mm4 - paddq mm3, mm4 - psrlq mm3, 0x03 + paddq mm3, mm4 + paddq mm3, mm4 + psrlq mm3, 0x03 - paddq mm1, mm4 - paddq mm1, mm4 - psrlq mm1, 0x03 + paddq mm1, mm4 + paddq mm1, mm4 + psrlq mm1, 0x03 - pmuludq mm0, [mmx_01bytes] - pmuludq mm3, [mmx_01bytes] - psllq mm0, 0x20 - pxor mm0, mm3 ; mm0 = m_up + pmuludq mm0, [mmx_01bytes] + pmuludq mm3, [mmx_01bytes] + psllq mm0, 0x20 + pxor mm0, mm3 ; mm0 = m_up - pmuludq mm2, [mmx_01bytes] - pmuludq mm1, [mmx_01bytes] - psllq mm1, 0x20 - pxor mm1, mm2 ; mm2 = m_down + pmuludq mm2, [mmx_01bytes] + pmuludq mm1, [mmx_01bytes] + psllq mm1, 0x20 + pxor mm1, mm2 ; mm2 = m_down - movq [r4], mm0 - movq [r4+r1], mm0 - movq [r4+2*r1], mm0 - lea r4, [r4+2*r1] - movq [r4+r1], mm0 + movq [r4], mm0 + movq [r4+r1], mm0 + movq [r4+2*r1], mm0 + lea r4, [r4+2*r1] + movq [r4+r1], mm0 - movq [r4+2*r1], mm1 - lea r4, [r4+2*r1] - movq [r4+r1], mm1 - movq [r4+2*r1], mm1 - lea r4, [r4+2*r1] - movq [r4+r1], mm1 + movq [r4+2*r1], mm1 + lea r4, [r4+2*r1] + movq [r4+r1], mm1 + 
movq [r4+2*r1], mm1 + lea r4, [r4+2*r1] + movq [r4+r1], mm1 - pop r4 - pop r3 - WELSEMMS - ret + pop r4 + pop r3 + WELSEMMS + ret @@ -1096,314 +1096,314 @@ WELS_EXTERN WelsDecoderIChromaPredDc_sse2 ; void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2 - push r3 - push r4 - %assign push_num 2 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - mov r4, r0 - sub r0, r1 - movdqa xmm0, [r0] ; read one row - pxor xmm1, xmm1 - psadbw xmm0, xmm1 - movdqa xmm1, xmm0 - psrldq xmm1, 0x08 - pslldq xmm0, 0x08 - psrldq xmm0, 0x08 - paddw xmm0, xmm1 + push r3 + push r4 + %assign push_num 2 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + mov r4, r0 + sub r0, r1 + movdqa xmm0, [r0] ; read one row + pxor xmm1, xmm1 + psadbw xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 0x08 + pslldq xmm0, 0x08 + psrldq xmm0, 0x08 + paddw xmm0, xmm1 - movzx r2, byte [r0+r1-0x01] - movzx r3, byte [r0+2*r1-0x01] - add r2, r3 - lea r0, [r0+r1] - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - add r2, 0x10 - movd xmm1, r2d - paddw xmm0, xmm1 - psrld xmm0, 0x05 - pmuludq xmm0, [mmx_01bytes] - pshufd xmm0, xmm0, 0 + movzx r2, byte [r0+r1-0x01] + movzx r3, byte [r0+2*r1-0x01] + add r2, r3 + lea r0, [r0+r1] + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + add r2, 0x10 + movd xmm1, r2d + paddw xmm0, xmm1 + psrld xmm0, 0x05 + pmuludq xmm0, [mmx_01bytes] + pshufd xmm0, xmm0, 0 - movdqa [r4], xmm0 - movdqa [r4+r1], xmm0 - movdqa [r4+2*r1], xmm0 - lea r4, [r4+2*r1] + movdqa [r4], xmm0 + movdqa [r4+r1], xmm0 + movdqa [r4+2*r1], xmm0 + lea r4, [r4+2*r1] - movdqa [r4+r1], xmm0 - movdqa [r4+2*r1], xmm0 - lea r4, [r4+2*r1] + movdqa [r4+r1], xmm0 + movdqa [r4+2*r1], xmm0 + lea r4, [r4+2*r1] - movdqa [r4+r1], xmm0 - movdqa [r4+2*r1], xmm0 - lea r4, [r4+2*r1] + movdqa [r4+r1], xmm0 + movdqa [r4+2*r1], xmm0 + lea r4, [r4+2*r1] - movdqa [r4+r1], xmm0 - movdqa [r4+2*r1], xmm0 - lea r4, [r4+2*r1] + movdqa [r4+r1], xmm0 + movdqa [r4+2*r1], xmm0 + lea r4, [r4+2*r1] - movdqa [r4+r1], xmm0 - movdqa [r4+2*r1], xmm0 - lea r4, [r4+2*r1] + movdqa [r4+r1], xmm0 + movdqa [r4+2*r1], xmm0 + lea r4, [r4+2*r1] - movdqa [r4+r1], xmm0 - movdqa [r4+2*r1], xmm0 - lea r4, [r4+2*r1] + movdqa [r4+r1], xmm0 + movdqa [r4+2*r1], xmm0 + lea r4, [r4+2*r1] - movdqa [r4+r1], xmm0 - movdqa [r4+2*r1], xmm0 - lea r4, [r4+2*r1] + movdqa [r4+r1], xmm0 + movdqa [r4+2*r1], xmm0 + lea r4, [r4+2*r1] - movdqa [r4+r1], xmm0 + movdqa [r4+r1], xmm0 - pop r4 - pop r3 + pop r4 + pop r3 - ret + ret ;******************************************************************************* ; for intra prediction as follows, 11/19/2010 ;******************************************************************************* ;******************************************************************************* -; void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride) +; void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2 - %assign push_num 0 - LOAD_2_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - mov r2, r0 - sub r2, r1 - movdqa xmm0, [r2] ; pPred-kiStride, top line - pxor xmm7, xmm7 - psadbw xmm0, 
xmm7 - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddw xmm0, xmm1 - xor r2, r2 - movd r2d, xmm0 - ;movdqa xmm1, xmm0 - ;punpcklbw xmm0, xmm7 - ;punpckhbw xmm1, xmm7 + %assign push_num 0 + LOAD_2_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + mov r2, r0 + sub r2, r1 + movdqa xmm0, [r2] ; pPred-kiStride, top line + pxor xmm7, xmm7 + psadbw xmm0, xmm7 + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddw xmm0, xmm1 + xor r2, r2 + movd r2d, xmm0 + ;movdqa xmm1, xmm0 + ;punpcklbw xmm0, xmm7 + ;punpckhbw xmm1, xmm7 - ;paddw xmm0, xmm1 ; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope - ;pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4 - ;paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 - ;pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 - ;paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6) - ;pshuflw xmm1, xmm0, 0b1h ; 10110001 - ;paddw xmm0, xmm1 ; sum in word unit (x8) - ;xor r3, r3 - ;movd r3d, xmm0 - ;and edx, 0ffffh + ;paddw xmm0, xmm1 ; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope + ;pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4 + ;paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 + ;pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 + ;paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6) + ;pshuflw xmm1, xmm0, 0b1h ; 10110001 + ;paddw xmm0, xmm1 ; sum in word unit (x8) + ;xor r3, r3 + ;movd r3d, xmm0 + ;and edx, 0ffffh - add r2, 8 - sar r2, 4 - SSE2_Copy16Times xmm1, r2d - ;mov dh, dl - ;mov r2, edx - ;shl r2, 010h - ;or edx, r2 - ;movd xmm1, edx - ;pshufd xmm0, xmm1, 00h - ;movdqa xmm1, xmm0 - movdqa xmm0, xmm1 - lea r2, [2*r1+r1] ; 3*kiStride + add r2, 8 + sar r2, 4 + SSE2_Copy16Times xmm1, r2d + ;mov dh, dl + ;mov r2, edx + ;shl r2, 010h + ;or edx, r2 + ;movd xmm1, edx + ;pshufd xmm0, xmm1, 00h + ;movdqa xmm1, xmm0 + movdqa xmm0, xmm1 + lea r2, [2*r1+r1] ; 3*kiStride - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm0 - movdqa [r0+r2], xmm1 + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm0 + movdqa [r0+r2], xmm1 - lea r0, [r0+4*r1] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm0 - movdqa [r0+r2], xmm1 + lea r0, [r0+4*r1] + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm0 + movdqa [r0+r2], xmm1 - lea r0, [r0+4*r1] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm0 - movdqa [r0+r2], xmm1 + lea r0, [r0+4*r1] + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm0 + movdqa [r0+r2], xmm1 - lea r0, [r0+4*r1] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm0 - movdqa [r0+r2], xmm1 + lea r0, [r0+4*r1] + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm0 + movdqa [r0+r2], xmm1 - POP_XMM - ret + POP_XMM + ret ;******************************************************************************* -; void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride) +; void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2 - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - lea r2, [2*r1+r1] ; 3*kiStride + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + lea r2, [2*r1+r1] ; 3*kiStride - movdqa xmm0, [sse2_dc_0x80] - movdqa xmm1, xmm0 - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa 
[r0+2*r1], xmm0 - movdqa [r0+r2], xmm1 - lea r0, [r0+4*r1] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm0 - movdqa [r0+r2], xmm1 - lea r0, [r0+4*r1] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm0 - movdqa [r0+r2], xmm1 - lea r0, [r0+4*r1] - movdqa [r0], xmm0 - movdqa [r0+r1], xmm1 - movdqa [r0+2*r1], xmm0 - movdqa [r0+r2], xmm1 + movdqa xmm0, [sse2_dc_0x80] + movdqa xmm1, xmm0 + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm0 + movdqa [r0+r2], xmm1 + lea r0, [r0+4*r1] + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm0 + movdqa [r0+r2], xmm1 + lea r0, [r0+4*r1] + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm0 + movdqa [r0+r2], xmm1 + lea r0, [r0+4*r1] + movdqa [r0], xmm0 + movdqa [r0+r1], xmm1 + movdqa [r0+2*r1], xmm0 + movdqa [r0+r2], xmm1 - ret + ret ;******************************************************************************* -; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride) +; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx - push r3 - push r4 - %assign push_num 2 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - mov r4, r0 - ; for left - dec r0 - xor r2, r2 - xor r3, r3 - movzx r2, byte [r0] - movzx r3, byte [r0+r1] - add r2, r3 - lea r0, [r0+2*r1] - movzx r3, byte [r0] - add r2, r3 - movzx r3, byte [r0+r1] - add r2, r3 - add r2, 02h - sar r2, 02h - ;SSE2_Copy16Times mm0, r2d - mov r3, r2 - sal r3, 8 - or r2, r3 - movd mm1, r2d - pshufw mm0, mm1, 00h - ;mov bh, bl - ;movd mm1, ebx - ;pshufw mm0, mm1, 00h ; up64 - movq mm1, mm0 - xor r2, r2 - lea r0, [r0+2*r1] - movzx r2, byte [r0] - movzx r3, byte [r0+r1] - add r2, r3 - lea r0, [r0+2*r1] - movzx r3, byte [r0] - add r2, r3 - movzx r3, byte [r0+r1] - add r2, r3 - add r2, 02h - sar r2, 02h - mov r3, r2 - sal r3, 8 - or r2, r3 - movd mm3, r2d - pshufw mm2, mm3, 00h - ;mov bh, bl - ;movd mm3, ebx - ;pshufw mm2, mm3, 00h ; down64 - ;SSE2_Copy16Times mm2, r2d - movq mm3, mm2 - lea r2, [2*r1+r1] - movq [r4], mm0 - movq [r4+r1], mm1 - movq [r4+2*r1], mm0 - movq [r4+r2], mm1 - lea r4, [r4+4*r1] - movq [r4], mm2 - movq [r4+r1], mm3 - movq [r4+2*r1], mm2 - movq [r4+r2], mm3 - pop r4 - pop r3 - emms - ret + push r3 + push r4 + %assign push_num 2 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + mov r4, r0 + ; for left + dec r0 + xor r2, r2 + xor r3, r3 + movzx r2, byte [r0] + movzx r3, byte [r0+r1] + add r2, r3 + lea r0, [r0+2*r1] + movzx r3, byte [r0] + add r2, r3 + movzx r3, byte [r0+r1] + add r2, r3 + add r2, 02h + sar r2, 02h + ;SSE2_Copy16Times mm0, r2d + mov r3, r2 + sal r3, 8 + or r2, r3 + movd mm1, r2d + pshufw mm0, mm1, 00h + ;mov bh, bl + ;movd mm1, ebx + ;pshufw mm0, mm1, 00h ; up64 + movq mm1, mm0 + xor r2, r2 + lea r0, [r0+2*r1] + movzx r2, byte [r0] + movzx r3, byte [r0+r1] + add r2, r3 + lea r0, [r0+2*r1] + movzx r3, byte [r0] + add r2, r3 + movzx r3, byte [r0+r1] + add r2, r3 + add r2, 02h + sar r2, 02h + mov r3, r2 + sal r3, 8 + or r2, r3 + movd mm3, r2d + pshufw mm2, mm3, 00h + ;mov bh, bl + ;movd mm3, ebx + ;pshufw mm2, mm3, 00h ; down64 + ;SSE2_Copy16Times mm2, r2d + movq mm3, mm2 + lea r2, [2*r1+r1] + movq [r4], mm0 + movq [r4+r1], mm1 + movq [r4+2*r1], mm0 + movq [r4+r2], mm1 + lea r4, [r4+4*r1] + movq [r4], mm2 + movq [r4+r1], mm3 + movq [r4+2*r1], mm2 + movq [r4+r2], mm3 + pop r4 + pop r3 + emms + ret 
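The MMX routine above averages the four left neighbours of each half of the 8x8 chroma block, rounds with (+2) >> 2, and replicates the byte across a 64-bit register with pshufw before writing four rows per half. A minimal plain-C sketch of the same left-only DC rule follows; the helper name is illustrative and only shows what the stores above produce.

#include <stdint.h>
#include <string.h>

/* Illustrative C sketch of chroma DC prediction from the left column only.
 * Rows 0..3 take the rounded mean of left[0..3], rows 4..7 that of left[4..7]. */
static void IChromaPredDcLeft_ref (uint8_t* pPred, int32_t kiStride) {
    int i, iSumUp = 0, iSumDown = 0;
    for (i = 0; i < 4; i++)
        iSumUp   += pPred[i * kiStride - 1];
    for (i = 4; i < 8; i++)
        iSumDown += pPred[i * kiStride - 1];
    {
        const uint8_t kuiDcUp   = (uint8_t) ((iSumUp   + 2) >> 2);
        const uint8_t kuiDcDown = (uint8_t) ((iSumDown + 2) >> 2);
        for (i = 0; i < 4; i++)
            memset (pPred + i * kiStride, kuiDcUp,   8);   /* upper half */
        for (i = 4; i < 8; i++)
            memset (pPred + i * kiStride, kuiDcDown, 8);   /* lower half */
    }
}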
;******************************************************************************* -; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride) +; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2 - %assign push_num 0 - LOAD_2_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - mov r2, r0 - sub r2, r1 - movq xmm0, [r2] ; top: 8x1 pixels - pxor xmm7, xmm7 - punpcklbw xmm0, xmm7 ; ext 8x2 words - pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2 - paddw xmm0, xmm1 ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2 - movdqa xmm1, xmm0 - pshuflw xmm2, xmm0, 0B1h ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3 - pshufhw xmm3, xmm1, 0B1h ; 10110001 B, w4+6 w5+7 w4+6 w5+7 .. - paddw xmm0, xmm2 ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3 - paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 .. - punpckhqdq xmm1, xmm7 - punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0 - movdqa xmm6, [sse2_wd_0x02] - paddw xmm0, xmm6 - psraw xmm0, 02h - packuswb xmm0, xmm7 - lea r2, [2*r1+r1] - movq [r0], xmm0 - movq [r0+r1], xmm0 - movq [r0+2*r1], xmm0 - movq [r0+r2], xmm0 - lea r0, [r0+4*r1] - movq [r0], xmm0 - movq [r0+r1], xmm0 - movq [r0+2*r1], xmm0 - movq [r0+r2], xmm0 - POP_XMM - ret + %assign push_num 0 + LOAD_2_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + mov r2, r0 + sub r2, r1 + movq xmm0, [r2] ; top: 8x1 pixels + pxor xmm7, xmm7 + punpcklbw xmm0, xmm7 ; ext 8x2 words + pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2 + paddw xmm0, xmm1 ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2 + movdqa xmm1, xmm0 + pshuflw xmm2, xmm0, 0B1h ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3 + pshufhw xmm3, xmm1, 0B1h ; 10110001 B, w4+6 w5+7 w4+6 w5+7 .. + paddw xmm0, xmm2 ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3 + paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 .. 
+ punpckhqdq xmm1, xmm7 + punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0 + movdqa xmm6, [sse2_wd_0x02] + paddw xmm0, xmm6 + psraw xmm0, 02h + packuswb xmm0, xmm7 + lea r2, [2*r1+r1] + movq [r0], xmm0 + movq [r0+r1], xmm0 + movq [r0+2*r1], xmm0 + movq [r0+r2], xmm0 + lea r0, [r0+4*r1] + movq [r0], xmm0 + movq [r0+r1], xmm0 + movq [r0+2*r1], xmm0 + movq [r0+r2], xmm0 + POP_XMM + ret ;******************************************************************************* -; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride) +; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride) ;******************************************************************************* WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - lea r2, [2*r1+r1] - movq mm0, [sse2_dc_0x80] - movq mm1, mm0 - movq [r0], mm0 - movq [r0+r1], mm1 - movq [r0+2*r1], mm0 - movq [r0+r2], mm1 - lea r0, [r0+4*r1] - movq [r0], mm0 - movq [r0+r1], mm1 - movq [r0+2*r1], mm0 - movq [r0+r2], mm1 - emms - ret + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + lea r2, [2*r1+r1] + movq mm0, [sse2_dc_0x80] + movq mm1, mm0 + movq [r0], mm0 + movq [r0+r1], mm1 + movq [r0+2*r1], mm0 + movq [r0+r2], mm1 + lea r0, [r0+4*r1] + movq [r0], mm0 + movq [r0+r1], mm1 + movq [r0+2*r1], mm0 + movq [r0+r2], mm1 + emms + ret diff --git a/codec/encoder/core/arm/intra_pred_neon.S b/codec/encoder/core/arm/intra_pred_neon.S index b6b3a387..1697fde2 100644 --- a/codec/encoder/core/arm/intra_pred_neon.S +++ b/codec/encoder/core/arm/intra_pred_neon.S @@ -38,107 +38,107 @@ #ifdef __APPLE__ //Global macro .macro GET_8BYTE_DATA - vld1.8 {$0[0]}, [$1], $2 - vld1.8 {$0[1]}, [$1], $2 - vld1.8 {$0[2]}, [$1], $2 - vld1.8 {$0[3]}, [$1], $2 - vld1.8 {$0[4]}, [$1], $2 - vld1.8 {$0[5]}, [$1], $2 - vld1.8 {$0[6]}, [$1], $2 - vld1.8 {$0[7]}, [$1], $2 + vld1.8 {$0[0]}, [$1], $2 + vld1.8 {$0[1]}, [$1], $2 + vld1.8 {$0[2]}, [$1], $2 + vld1.8 {$0[3]}, [$1], $2 + vld1.8 {$0[4]}, [$1], $2 + vld1.8 {$0[5]}, [$1], $2 + vld1.8 {$0[6]}, [$1], $2 + vld1.8 {$0[7]}, [$1], $2 .endm #else //Global macro .macro GET_8BYTE_DATA arg0, arg1, arg2 - vld1.8 {\arg0[0]}, [\arg1], \arg2 - vld1.8 {\arg0[1]}, [\arg1], \arg2 - vld1.8 {\arg0[2]}, [\arg1], \arg2 - vld1.8 {\arg0[3]}, [\arg1], \arg2 - vld1.8 {\arg0[4]}, [\arg1], \arg2 - vld1.8 {\arg0[5]}, [\arg1], \arg2 - vld1.8 {\arg0[6]}, [\arg1], \arg2 - vld1.8 {\arg0[7]}, [\arg1], \arg2 + vld1.8 {\arg0[0]}, [\arg1], \arg2 + vld1.8 {\arg0[1]}, [\arg1], \arg2 + vld1.8 {\arg0[2]}, [\arg1], \arg2 + vld1.8 {\arg0[3]}, [\arg1], \arg2 + vld1.8 {\arg0[4]}, [\arg1], \arg2 + vld1.8 {\arg0[5]}, [\arg1], \arg2 + vld1.8 {\arg0[6]}, [\arg1], \arg2 + vld1.8 {\arg0[7]}, [\arg1], \arg2 .endm #endif WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon - //Get the top line data to 'q0' - sub r3, r1, r2 - vldm r3, {d0, d1} + //Get the top line data to 'q0' + sub r3, r1, r2 + vldm r3, {d0, d1} - //mov r2, #16 - mov r3, #4 - //Set the top line to the each line of MB(16*16) + //mov r2, #16 + mov r3, #4 + //Set the top line to the each line of MB(16*16) loop_0_get_i16x16_luma_pred_v: - vst1.8 {d0,d1}, [r0]! - vst1.8 {d0,d1}, [r0]! - vst1.8 {d0,d1}, [r0]! - vst1.8 {d0,d1}, [r0]! - subs r3, #1 - bne loop_0_get_i16x16_luma_pred_v + vst1.8 {d0,d1}, [r0]! + vst1.8 {d0,d1}, [r0]! + vst1.8 {d0,d1}, [r0]! + vst1.8 {d0,d1}, [r0]! 
+ subs r3, #1 + bne loop_0_get_i16x16_luma_pred_v WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon //stmdb sp!, {r4, lr} - sub r1, r1, #1 - mov r3, #4 + sub r1, r1, #1 + mov r3, #4 loop_0_get_i16x16_luma_pred_h: - //Get one byte data from left side - vld1.8 {d0[],d1[]}, [r1], r2 - vld1.8 {d2[],d3[]}, [r1], r2 - vld1.8 {d4[],d5[]}, [r1], r2 - vld1.8 {d6[],d7[]}, [r1], r2 + //Get one byte data from left side + vld1.8 {d0[],d1[]}, [r1], r2 + vld1.8 {d2[],d3[]}, [r1], r2 + vld1.8 {d4[],d5[]}, [r1], r2 + vld1.8 {d6[],d7[]}, [r1], r2 - //Set the line of MB using the left side byte data - vst1.8 {d0,d1}, [r0]! - //add r0, #16 - vst1.8 {d2,d3}, [r0]! - //add r0, #16 - vst1.8 {d4,d5}, [r0]! - //add r0, #16 - vst1.8 {d6,d7}, [r0]! - //add r0, #16 + //Set the line of MB using the left side byte data + vst1.8 {d0,d1}, [r0]! + //add r0, #16 + vst1.8 {d2,d3}, [r0]! + //add r0, #16 + vst1.8 {d4,d5}, [r0]! + //add r0, #16 + vst1.8 {d6,d7}, [r0]! + //add r0, #16 - subs r3, #1 - bne loop_0_get_i16x16_luma_pred_h + subs r3, #1 + bne loop_0_get_i16x16_luma_pred_h WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon - //stmdb sp!, { r2-r5, lr} - //Get the left vertical line data - sub r3, r1, #1 - GET_8BYTE_DATA d0, r3, r2 - GET_8BYTE_DATA d1, r3, r2 + //stmdb sp!, { r2-r5, lr} + //Get the left vertical line data + sub r3, r1, #1 + GET_8BYTE_DATA d0, r3, r2 + GET_8BYTE_DATA d1, r3, r2 - //Get the top horizontal line data - sub r3, r1, r2 - vldm r3, {d2, d3} + //Get the top horizontal line data + sub r3, r1, r2 + vldm r3, {d2, d3} - //Calculate the sum of top horizontal line data and vertical line data - vpaddl.u8 q0, q0 - vpaddl.u8 q1, q1 - vadd.u16 q0, q0, q1 - vadd.u16 d0, d0, d1 - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 + //Calculate the sum of top horizontal line data and vertical line data + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 - //Calculate the mean value - vrshr.u16 d0, d0, #5 - vdup.8 q0, d0[0] + //Calculate the mean value + vrshr.u16 d0, d0, #5 + vdup.8 q0, d0[0] - //Set the mean value to the all of member of MB - mov r3, #4 + //Set the mean value to the all of member of MB + mov r3, #4 loop_0_get_i16x16_luma_pred_dc_both: - vst1.8 {d0,d1}, [r0]! - vst1.8 {d0,d1}, [r0]! - vst1.8 {d0,d1}, [r0]! - vst1.8 {d0,d1}, [r0]! - subs r3, #1 - bne loop_0_get_i16x16_luma_pred_dc_both + vst1.8 {d0,d1}, [r0]! + vst1.8 {d0,d1}, [r0]! + vst1.8 {d0,d1}, [r0]! + vst1.8 {d0,d1}, [r0]! 
+ subs r3, #1 + bne loop_0_get_i16x16_luma_pred_dc_both WELS_ASM_FUNC_END @@ -151,383 +151,383 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon - //stmdb sp!, { r4, lr} + //stmdb sp!, { r4, lr} - //Load the table {(8,7,6,5,4,3,2,1) * 5} - adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE - vldr d0, [r3] + //Load the table {(8,7,6,5,4,3,2,1) * 5} + adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE + vldr d0, [r3] - //Pack the top[-1] ~ top[6] to d1 - sub r3, r1, r2 - sub r1, r3, #1 - vld1.8 d1, [r1] + //Pack the top[-1] ~ top[6] to d1 + sub r3, r1, r2 + sub r1, r3, #1 + vld1.8 d1, [r1] - //Pack the top[8] ~ top[15] to d2 - add r1, #9 - vld1.8 d2, [r1] + //Pack the top[8] ~ top[15] to d2 + add r1, #9 + vld1.8 d2, [r1] - //Save the top[15] to d6 for next step - vdup.u8 d6, d2[7] + //Save the top[15] to d6 for next step + vdup.u8 d6, d2[7] - //Get and pack left[-1] ~ left[6] to d4 - sub r1, r3, #1 - GET_8BYTE_DATA d4, r1, r2 + //Get and pack left[-1] ~ left[6] to d4 + sub r1, r3, #1 + GET_8BYTE_DATA d4, r1, r2 - //Get and pack left[8] ~ left[15] to d3 - add r1, r2 - GET_8BYTE_DATA d3, r1, r2 + //Get and pack left[8] ~ left[15] to d3 + add r1, r2 + GET_8BYTE_DATA d3, r1, r2 - //Save the left[15] to d7 for next step - vdup.u8 d7, d3[7] + //Save the left[15] to d7 for next step + vdup.u8 d7, d3[7] - //revert the sequence of d2,d3 - vrev64.8 q1, q1 + //revert the sequence of d2,d3 + vrev64.8 q1, q1 - vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...} - vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...} + vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...} + vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...} - vmovl.u8 q0, d0 - vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5} - vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5} + vmovl.u8 q0, d0 + vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5} + vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5} - //Calculate the sum of items of q1, q2 - vpadd.s16 d0, d2, d3 - vpadd.s16 d1, d4, d5 - vpaddl.s16 q0, q0 - vpaddl.s32 q0, q0 + //Calculate the sum of items of q1, q2 + vpadd.s16 d0, d2, d3 + vpadd.s16 d1, d4, d5 + vpaddl.s16 q0, q0 + vpaddl.s32 q0, q0 - //Get the value of 'b', 'c' and extend to q1, q2. - vrshr.s64 q0, #6 - vdup.s16 q1, d0[0] - vdup.s16 q2, d1[0] + //Get the value of 'b', 'c' and extend to q1, q2. + vrshr.s64 q0, #6 + vdup.s16 q1, d0[0] + vdup.s16 q2, d1[0] - //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0 - adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE - vld1.32 {d0}, [r3] + //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0 + adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE + vld1.32 {d0}, [r3] - //Get the value of 'a' and save to q3 - vaddl.u8 q3, d6, d7 - vshl.u16 q3, #4 + //Get the value of 'a' and save to q3 + vaddl.u8 q3, d6, d7 + vshl.u16 q3, #4 - //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7} - vmovl.s8 q0, d0 - vmla.s16 q3, q0, q1 - vmla.s16 q3, q2, d0[0] + //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7} + vmovl.s8 q0, d0 + vmla.s16 q3, q0, q1 + vmla.s16 q3, q2, d0[0] - //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7} - vshl.s16 q8, q1, #3 - vadd.s16 q8, q3 + //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7} + vshl.s16 q8, q1, #3 + vadd.s16 q8, q3 - //right shift 5 bits and rounding - vqrshrun.s16 d0, q3, #5 - vqrshrun.s16 d1, q8, #5 + //right shift 5 bits and rounding + vqrshrun.s16 d0, q3, #5 + vqrshrun.s16 d1, q8, #5 - //Set the line of MB - vst1.u32 {d0,d1}, [r0]! 
+ //Set the line of MB + vst1.u32 {d0,d1}, [r0]! - //Do the same processing for setting other lines - mov r3, #15 + //Do the same processing for setting other lines + mov r3, #15 loop_0_get_i16x16_luma_pred_plane: - vadd.s16 q3, q2 - vadd.s16 q8, q2 - vqrshrun.s16 d0, q3, #5 - vqrshrun.s16 d1, q8, #5 - vst1.u32 {d0,d1}, [r0]! - subs r3, #1 - bne loop_0_get_i16x16_luma_pred_plane + vadd.s16 q3, q2 + vadd.s16 q8, q2 + vqrshrun.s16 d0, q3, #5 + vqrshrun.s16 d1, q8, #5 + vst1.u32 {d0,d1}, [r0]! + subs r3, #1 + bne loop_0_get_i16x16_luma_pred_plane WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row (4 bytes) - sub r3, r1, r2 - ldr r3, [r3] + //stmdb sp!, { r2-r5, lr} + //Load the top row (4 bytes) + sub r3, r1, r2 + ldr r3, [r3] - //Set the luma MB using top line - str r3, [r0], #4 - str r3, [r0], #4 - str r3, [r0], #4 - str r3, [r0] + //Set the luma MB using top line + str r3, [r0], #4 + str r3, [r0], #4 + str r3, [r0], #4 + str r3, [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon - //stmdb sp!, { r2-r5, lr} - //Load the left column (4 bytes) - sub r3, r1, #1 - vld1.8 {d0[]}, [r3], r2 - vld1.8 {d1[]}, [r3], r2 - vld1.8 {d2[]}, [r3], r2 - vld1.8 {d3[]}, [r3] + //stmdb sp!, { r2-r5, lr} + //Load the left column (4 bytes) + sub r3, r1, #1 + vld1.8 {d0[]}, [r3], r2 + vld1.8 {d1[]}, [r3], r2 + vld1.8 {d2[]}, [r3], r2 + vld1.8 {d3[]}, [r3] - //Set the luma MB using the left side byte - vst1.32 {d0[0]}, [r0]! - vst1.32 {d1[0]}, [r0]! - vst1.32 {d2[0]}, [r0]! - vst1.32 {d3[0]}, [r0] + //Set the luma MB using the left side byte + vst1.32 {d0[0]}, [r0]! + vst1.32 {d1[0]}, [r0]! + vst1.32 {d2[0]}, [r0]! + vst1.32 {d3[0]}, [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row data(8 bytes) - sub r3, r1, r2 - vld1.32 {d0}, [r3] + //stmdb sp!, { r2-r5, lr} + //Load the top row data(8 bytes) + sub r3, r1, r2 + vld1.32 {d0}, [r3] - //For "t7 + (t7<<1)" - vdup.8 d1, d0[7] + //For "t7 + (t7<<1)" + vdup.8 d1, d0[7] - //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7" - vext.8 d1, d0, d1, #1 - vaddl.u8 q1, d1, d0 + //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7" + vext.8 d1, d0, d1, #1 + vaddl.u8 q1, d1, d0 - //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7" - vext.8 q2, q1, q1, #14 - vadd.u16 q0, q1, q2 + //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7" + vext.8 q2, q1, q1, #14 + vadd.u16 q0, q1, q2 - //right shift 2 bits and rounding - vqrshrn.u16 d0, q0, #2 + //right shift 2 bits and rounding + vqrshrn.u16 d0, q0, #2 - //Save "ddl0, ddl1, ddl2, ddl3" - vext.8 d1, d0, d0, #1 - vst1.32 d1[0], [r0]! + //Save "ddl0, ddl1, ddl2, ddl3" + vext.8 d1, d0, d0, #1 + vst1.32 d1[0], [r0]! - //Save "ddl1, ddl2, ddl3, ddl4" - vext.8 d1, d0, d0, #2 - vst1.32 d1[0], [r0]! + //Save "ddl1, ddl2, ddl3, ddl4" + vext.8 d1, d0, d0, #2 + vst1.32 d1[0], [r0]! - //Save "ddl2, ddl3, ddl4, ddl5" - vext.8 d1, d0, d0, #3 - vst1.32 d1[0], [r0]! + //Save "ddl2, ddl3, ddl4, ddl5" + vext.8 d1, d0, d0, #3 + vst1.32 d1[0], [r0]! 
- //Save "ddl3, ddl4, ddl5, ddl6" - vst1.32 d0[1], [r0] + //Save "ddl3, ddl4, ddl5, ddl6" + vst1.32 d0[1], [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row (4 bytes) - sub r3, r1, r2 - vld1.32 {d0[1]}, [r3] + //stmdb sp!, { r2-r5, lr} + //Load the top row (4 bytes) + sub r3, r1, r2 + vld1.32 {d0[1]}, [r3] - //Load the left column (5 bytes) - sub r3, #1 - vld1.8 {d0[3]}, [r3], r2 - vld1.8 {d0[2]}, [r3], r2 - vld1.8 {d0[1]}, [r3], r2 - vld1.8 {d0[0]}, [r3], r2 - vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing + //Load the left column (5 bytes) + sub r3, #1 + vld1.8 {d0[3]}, [r3], r2 + vld1.8 {d0[2]}, [r3], r2 + vld1.8 {d0[1]}, [r3], r2 + vld1.8 {d0[0]}, [r3], r2 + vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing - vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3} - //d2:{L3,L2,L1,L0,LT,T0,T1,T2} + vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3} + //d2:{L3,L2,L1,L0,LT,T0,T1,T2} - //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3} - vaddl.u8 q2, d2, d0 + //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3} + vaddl.u8 q2, d2, d0 - //q1:{TL0+LT0,LT0+T01,...L12+L23} - vext.8 q3, q3, q2, #14 - vadd.u16 q1, q2, q3 + //q1:{TL0+LT0,LT0+T01,...L12+L23} + vext.8 q3, q3, q2, #14 + vadd.u16 q1, q2, q3 - //right shift 2 bits and rounding - vqrshrn.u16 d0, q1, #2 + //right shift 2 bits and rounding + vqrshrn.u16 d0, q1, #2 - //Adjust the data sequence for setting luma MB of 'pred' - vst1.32 d0[1], [r0]! - vext.8 d0, d0, d0, #7 - vst1.32 d0[1], [r0]! - vext.8 d0, d0, d0, #7 - vst1.32 d0[1], [r0]! - vext.8 d0, d0, d0, #7 - vst1.32 d0[1], [r0] + //Adjust the data sequence for setting luma MB of 'pred' + vst1.32 d0[1], [r0]! + vext.8 d0, d0, d0, #7 + vst1.32 d0[1], [r0]! + vext.8 d0, d0, d0, #7 + vst1.32 d0[1], [r0]! + vext.8 d0, d0, d0, #7 + vst1.32 d0[1], [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row (8 bytes) - sub r3, r1, r2 - vld1.32 {d0}, [r3] + //stmdb sp!, { r2-r5, lr} + //Load the top row (8 bytes) + sub r3, r1, r2 + vld1.32 {d0}, [r3] - vext.8 d1, d0, d0, #1 - vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x} + vext.8 d1, d0, d0, #1 + vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x} - vext.8 q2, q1, q1, #2 - vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x} + vext.8 q2, q1, q1, #2 + vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x} - //calculate the "vl0,vl1,vl2,vl3,vl4" - vqrshrn.u16 d0, q1, #1 + //calculate the "vl0,vl1,vl2,vl3,vl4" + vqrshrn.u16 d0, q1, #1 - //calculate the "vl5,vl6,vl7,vl8,vl9" - vqrshrn.u16 d1, q2, #2 + //calculate the "vl5,vl6,vl7,vl8,vl9" + vqrshrn.u16 d1, q2, #2 - //Adjust the data sequence for setting the luma MB - vst1.32 d0[0], [r0]! - vst1.32 d1[0], [r0]! - vext.8 d0, d0, d0, #1 - vext.8 d1, d1, d1, #1 - vst1.32 d0[0], [r0]! - vst1.32 d1[0], [r0] + //Adjust the data sequence for setting the luma MB + vst1.32 d0[0], [r0]! + vst1.32 d1[0], [r0]! + vext.8 d0, d0, d0, #1 + vext.8 d1, d1, d1, #1 + vst1.32 d0[0], [r0]! 
+ vst1.32 d1[0], [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row (4 bytes) - sub r3, r1, r2 - vld1.32 {d0[1]}, [r3] + //stmdb sp!, { r2-r5, lr} + //Load the top row (4 bytes) + sub r3, r1, r2 + vld1.32 {d0[1]}, [r3] - //Load the left column (4 bytes) - sub r3, #1 - vld1.8 {d0[3]}, [r3], r2 - vld1.8 {d0[2]}, [r3], r2 - vld1.8 {d0[1]}, [r3], r2 - vld1.8 {d0[0]}, [r3] + //Load the left column (4 bytes) + sub r3, #1 + vld1.8 {d0[3]}, [r3], r2 + vld1.8 {d0[2]}, [r3], r2 + vld1.8 {d0[1]}, [r3], r2 + vld1.8 {d0[0]}, [r3] - vext.8 d1, d0, d0, #7 - vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3} + vext.8 d1, d0, d0, #7 + vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3} - vext.u8 q2, q1, q1, #14 - vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3} + vext.u8 q2, q1, q1, #14 + vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3} - //Calculate the vr0 ~ vr9 - vqrshrn.u16 d1, q2, #2 - vqrshrn.u16 d0, q1, #1 + //Calculate the vr0 ~ vr9 + vqrshrn.u16 d1, q2, #2 + vqrshrn.u16 d0, q1, #1 - //Adjust the data sequence for setting the luma MB - vst1.32 d0[1], [r0]! - vst1.32 d1[1], [r0]! - //add r2, r0, r1 - vst1.8 d1[3], [r0]! - vst1.16 d0[2], [r0]! - vst1.8 d0[6], [r0]! - vst1.8 d1[2], [r0]! - vst1.16 d1[2], [r0]! - vst1.8 d1[6], [r0] + //Adjust the data sequence for setting the luma MB + vst1.32 d0[1], [r0]! + vst1.32 d1[1], [r0]! + //add r2, r0, r1 + vst1.8 d1[3], [r0]! + vst1.16 d0[2], [r0]! + vst1.8 d0[6], [r0]! + vst1.8 d1[2], [r0]! + vst1.16 d1[2], [r0]! + vst1.8 d1[6], [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon - //stmdb sp!, { r4, lr} - //Load the left column data - sub r3, r1, #1 - mov r1, #3 - mul r1, r2 - add r1, r3 - vld1.8 {d0[]}, [r1] - vld1.8 {d0[4]}, [r3], r2 - vld1.8 {d0[5]}, [r3], r2 - vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3} + //stmdb sp!, { r4, lr} + //Load the left column data + sub r3, r1, #1 + mov r1, #3 + mul r1, r2 + add r1, r3 + vld1.8 {d0[]}, [r1] + vld1.8 {d0[4]}, [r3], r2 + vld1.8 {d0[5]}, [r3], r2 + vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3} - vext.8 d1, d0, d0, #1 - vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3} + vext.8 d1, d0, d0, #1 + vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3} - vext.u8 d2, d5, d4, #2 - vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3} + vext.u8 d2, d5, d4, #2 + vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3} - //Calculate the hu0 ~ hu5 - vqrshrn.u16 d2, q2, #1 - vqrshrn.u16 d1, q1, #2 + //Calculate the hu0 ~ hu5 + vqrshrn.u16 d2, q2, #1 + vqrshrn.u16 d1, q1, #2 - //Adjust the data sequence for setting the luma MB - vzip.8 d2, d1 - vst1.32 d1[0], [r0]! - vext.8 d2, d1, d1, #2 - vst1.32 d2[0], [r0]! - vst1.32 d1[1], [r0]! - vst1.32 d0[0], [r0] + //Adjust the data sequence for setting the luma MB + vzip.8 d2, d1 + vst1.32 d1[0], [r0]! + vext.8 d2, d1, d1, #2 + vst1.32 d2[0], [r0]! + vst1.32 d1[1], [r0]! 
+ vst1.32 d0[0], [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon - //stmdb sp!, { r2-r5, lr} - //Load the data - sub r3, r1, r2 - sub r3, #1 - vld1.32 {d0[1]}, [r3], r2 - vld1.8 {d0[3]}, [r3], r2 - vld1.8 {d0[2]}, [r3], r2 - vld1.8 {d0[1]}, [r3], r2 - vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2} + //stmdb sp!, { r2-r5, lr} + //Load the data + sub r3, r1, r2 + sub r3, #1 + vld1.32 {d0[1]}, [r3], r2 + vld1.8 {d0[3]}, [r3], r2 + vld1.8 {d0[2]}, [r3], r2 + vld1.8 {d0[1]}, [r3], r2 + vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2} - vext.8 d1, d0, d0, #7 - vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2} + vext.8 d1, d0, d0, #7 + vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2} - vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1} - vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2} + vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1} + vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2} - //Calculate the hd0~hd9 - vqrshrn.u16 d1, q3, #2 - vqrshrn.u16 d0, q2, #1 + //Calculate the hd0~hd9 + vqrshrn.u16 d1, q3, #2 + vqrshrn.u16 d0, q2, #1 - //Adjust the data sequence for setting the luma MB - vmov d3, d1 - vtrn.8 d0, d1 - vext.u8 d2, d1, d1, #6 - vst2.16 {d2[3], d3[3]}, [r0]! - vst2.16 {d0[2], d1[2]}, [r0]! - vmov d3, d0 - vst2.16 {d2[2], d3[2]}, [r0]! - vst2.16 {d0[1], d1[1]}, [r0] + //Adjust the data sequence for setting the luma MB + vmov d3, d1 + vtrn.8 d0, d1 + vext.u8 d2, d1, d1, #6 + vst2.16 {d2[3], d3[3]}, [r0]! + vst2.16 {d0[2], d1[2]}, [r0]! + vmov d3, d0 + vst2.16 {d2[2], d3[2]}, [r0]! + vst2.16 {d0[1], d1[1]}, [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon - //stmdb sp!, { r2-r5, lr} - //Get the top row (8 byte) - sub r3, r1, r2 - vldr d0, [r3] + //stmdb sp!, { r2-r5, lr} + //Get the top row (8 byte) + sub r3, r1, r2 + vldr d0, [r3] - //Set the chroma MB using top row data - vst1.8 {d0}, [r0]! - vst1.8 {d0}, [r0]! - vst1.8 {d0}, [r0]! - vst1.8 {d0}, [r0]! - vst1.8 {d0}, [r0]! - vst1.8 {d0}, [r0]! - vst1.8 {d0}, [r0]! - vst1.8 {d0}, [r0] + //Set the chroma MB using top row data + vst1.8 {d0}, [r0]! + vst1.8 {d0}, [r0]! + vst1.8 {d0}, [r0]! + vst1.8 {d0}, [r0]! + vst1.8 {d0}, [r0]! + vst1.8 {d0}, [r0]! + vst1.8 {d0}, [r0]! + vst1.8 {d0}, [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon - //stmdb sp!, { r2-r5, lr} - ////Get the left column (8 byte) - sub r3, r1, #1 - vld1.8 {d0[]}, [r3], r2 - vld1.8 {d1[]}, [r3], r2 - vld1.8 {d2[]}, [r3], r2 - vld1.8 {d3[]}, [r3], r2 - vld1.8 {d4[]}, [r3], r2 - vld1.8 {d5[]}, [r3], r2 - vld1.8 {d6[]}, [r3], r2 - vld1.8 {d7[]}, [r3] + //stmdb sp!, { r2-r5, lr} + ////Get the left column (8 byte) + sub r3, r1, #1 + vld1.8 {d0[]}, [r3], r2 + vld1.8 {d1[]}, [r3], r2 + vld1.8 {d2[]}, [r3], r2 + vld1.8 {d3[]}, [r3], r2 + vld1.8 {d4[]}, [r3], r2 + vld1.8 {d5[]}, [r3], r2 + vld1.8 {d6[]}, [r3], r2 + vld1.8 {d7[]}, [r3] - //Set the chroma MB using left column data - vst1.8 {d0}, [r0]! - vst1.8 {d1}, [r0]! - vst1.8 {d2}, [r0]! - vst1.8 {d3}, [r0]! - vst1.8 {d4}, [r0]! - vst1.8 {d5}, [r0]! - vst1.8 {d6}, [r0]! - vst1.8 {d7}, [r0] + //Set the chroma MB using left column data + vst1.8 {d0}, [r0]! + vst1.8 {d1}, [r0]! + vst1.8 {d2}, [r0]! + vst1.8 {d3}, [r0]! + vst1.8 {d4}, [r0]! + vst1.8 {d5}, [r0]! + vst1.8 {d6}, [r0]! 
+ vst1.8 {d7}, [r0] WELS_ASM_FUNC_END @@ -575,73 +575,73 @@ CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x2823 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003 WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon - //stmdb sp!, { r2-r5, lr} - //Load the top row data - sub r3, r1, #1 - sub r3, r2 - vld1.32 {d1[0]}, [r3] - add r3, #5 - vld1.32 {d0[0]}, [r3] + //stmdb sp!, { r2-r5, lr} + //Load the top row data + sub r3, r1, #1 + sub r3, r2 + vld1.32 {d1[0]}, [r3] + add r3, #5 + vld1.32 {d0[0]}, [r3] - //Load the left column data - sub r3, #5 - vld1.8 {d1[4]}, [r3], r2 - vld1.8 {d1[5]}, [r3], r2 - vld1.8 {d1[6]}, [r3], r2 - vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2} - add r3, r2 - vld1.8 {d0[4]}, [r3], r2 - vld1.8 {d0[5]}, [r3], r2 - vld1.8 {d0[6]}, [r3], r2 - vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7} + //Load the left column data + sub r3, #5 + vld1.8 {d1[4]}, [r3], r2 + vld1.8 {d1[5]}, [r3], r2 + vld1.8 {d1[6]}, [r3], r2 + vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2} + add r3, r2 + vld1.8 {d0[4]}, [r3], r2 + vld1.8 {d0[5]}, [r3], r2 + vld1.8 {d0[6]}, [r3], r2 + vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7} - //Save T7 to d3 for next step - vdup.u8 d3, d0[3] - //Save L7 to d4 for next step - vdup.u8 d4, d0[7] + //Save T7 to d3 for next step + vdup.u8 d3, d0[3] + //Save L7 to d4 for next step + vdup.u8 d4, d0[7] - //Calculate the value of 'a' and save to q2 - vaddl.u8 q2, d3, d4 - vshl.u16 q2, #4 + //Calculate the value of 'a' and save to q2 + vaddl.u8 q2, d3, d4 + vshl.u16 q2, #4 - //Load the table {{1,2,3,4,1,2,3,4}*17} - adr r3, CONST0_GET_I_CHROMA_PRED_PLANE - vld1.32 {d2}, [r3] + //Load the table {{1,2,3,4,1,2,3,4}*17} + adr r3, CONST0_GET_I_CHROMA_PRED_PLANE + vld1.32 {d2}, [r3] - //Calculate the 'b','c', and save to q0 - vrev32.8 d1, d1 - vsubl.u8 q0, d0, d1 - vmovl.u8 q1, d2 - vmul.s16 q0, q1 - vpaddl.s16 q0, q0 - vpaddl.s32 q0, q0 - vrshr.s64 q0, #5 + //Calculate the 'b','c', and save to q0 + vrev32.8 d1, d1 + vsubl.u8 q0, d0, d1 + vmovl.u8 q1, d2 + vmul.s16 q0, q1 + vpaddl.s16 q0, q0 + vpaddl.s32 q0, q0 + vrshr.s64 q0, #5 - //Load the table {-3,-2,-1,0,1,2,3,4} to q3 - adr r3, CONST1_GET_I_CHROMA_PRED_PLANE - vld1.32 {d6, d7}, [r3] + //Load the table {-3,-2,-1,0,1,2,3,4} to q3 + adr r3, CONST1_GET_I_CHROMA_PRED_PLANE + vld1.32 {d6, d7}, [r3] - //Duplicate the 'b','c' to q0, q1 for SIMD instruction - vdup.s16 q1, d1[0] - vdup.s16 q0, d0[0] + //Duplicate the 'b','c' to q0, q1 for SIMD instruction + vdup.s16 q1, d1[0] + vdup.s16 q0, d0[0] - //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;" - vmla.s16 q2, q0, q3 - vmla.s16 q2, q1, d6[0] - vqrshrun.s16 d0, q2, #5 + //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;" + vmla.s16 q2, q0, q3 + vmla.s16 q2, q1, d6[0] + vqrshrun.s16 d0, q2, #5 - //Set a line of chroma MB - vst1.u32 {d0}, [r0]! + //Set a line of chroma MB + vst1.u32 {d0}, [r0]! - //Do the same processing for each line. - mov r3, #7 + //Do the same processing for each line. + mov r3, #7 loop_0_get_i_chroma_pred_plane: - vadd.s16 q2, q1 - vqrshrun.s16 d0, q2, #5 - vst1.u32 {d0}, [r0]! - subs r3, #1 - bne loop_0_get_i_chroma_pred_plane + vadd.s16 q2, q1 + vqrshrun.s16 d0, q2, #5 + vst1.u32 {d0}, [r0]! 
+ subs r3, #1 + bne loop_0_get_i_chroma_pred_plane WELS_ASM_FUNC_END diff --git a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S index cdf880d3..8a0215e8 100644 --- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S +++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S @@ -38,117 +38,117 @@ #ifdef __APPLE__ //The data sequence will be used .macro GET_8BYTE_DATA_L0 - vld1.8 {$0[0]}, [$1], $2 - vld1.8 {$0[1]}, [$1], $2 - vld1.8 {$0[2]}, [$1], $2 - vld1.8 {$0[3]}, [$1], $2 - vld1.8 {$0[4]}, [$1], $2 - vld1.8 {$0[5]}, [$1], $2 - vld1.8 {$0[6]}, [$1], $2 - vld1.8 {$0[7]}, [$1], $2 + vld1.8 {$0[0]}, [$1], $2 + vld1.8 {$0[1]}, [$1], $2 + vld1.8 {$0[2]}, [$1], $2 + vld1.8 {$0[3]}, [$1], $2 + vld1.8 {$0[4]}, [$1], $2 + vld1.8 {$0[5]}, [$1], $2 + vld1.8 {$0[6]}, [$1], $2 + vld1.8 {$0[7]}, [$1], $2 .endm .macro HDM_TRANSFORM_4X4_L0 - //Do the vertical transform - vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13} - vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15} - vswp d1, d2 - vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7} - vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11} + //Do the vertical transform + vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13} + vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15} + vswp d1, d2 + vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7} + vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11} - //Do the horizontal transform - vtrn.32 q2, q1 - vadd.s16 q0, q2, q1 - vsub.s16 q1, q2, q1 + //Do the horizontal transform + vtrn.32 q2, q1 + vadd.s16 q0, q2, q1 + vsub.s16 q1, q2, q1 - vtrn.16 q0, q1 - vadd.s16 q2, q0, q1 - vsub.s16 q1, q0, q1 + vtrn.16 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 - vmov.s16 d0, d4 - vmov.s16 d1, d2 + vmov.s16 d0, d4 + vmov.s16 d1, d2 - vabs.s16 d3, d3 + vabs.s16 d3, d3 - //16x16_v - vtrn.32 d0, d1 //{0,1,3,2} - vaba.s16 $5, d0, $2 //16x16_v - vaba.s16 $5, d1, $8 - vaba.s16 $5, d5, $8 - vadd.u16 $5, d3 + //16x16_v + vtrn.32 d0, d1 //{0,1,3,2} + vaba.s16 $5, d0, $2 //16x16_v + vaba.s16 $5, d1, $8 + vaba.s16 $5, d5, $8 + vadd.u16 $5, d3 - //16x16_h - vtrn.16 d4, d5 //{0,4,12,8} - vaba.s16 $6, d4, $3 //16x16_h - vabs.s16 d2, d2 - vabs.s16 d5, d5 - vadd.u16 d2, d3 - vadd.u16 d2, d5 - vadd.u16 $6, d2 + //16x16_h + vtrn.16 d4, d5 //{0,4,12,8} + vaba.s16 $6, d4, $3 //16x16_h + vabs.s16 d2, d2 + vabs.s16 d5, d5 + vadd.u16 d2, d3 + vadd.u16 d2, d5 + vadd.u16 $6, d2 - //16x16_dc_both - vaba.s16 $7, d4, $4 //16x16_dc_both - vadd.u16 $7, d2 + //16x16_dc_both + vaba.s16 $7, d4, $4 //16x16_dc_both + vadd.u16 $7, d2 .endm #else //The data sequence will be used .macro GET_8BYTE_DATA_L0 arg0, arg1, arg2 - vld1.8 {\arg0[0]}, [\arg1], \arg2 - vld1.8 {\arg0[1]}, [\arg1], \arg2 - vld1.8 {\arg0[2]}, [\arg1], \arg2 - vld1.8 {\arg0[3]}, [\arg1], \arg2 - vld1.8 {\arg0[4]}, [\arg1], \arg2 - vld1.8 {\arg0[5]}, [\arg1], \arg2 - vld1.8 {\arg0[6]}, [\arg1], \arg2 - vld1.8 {\arg0[7]}, [\arg1], \arg2 + vld1.8 {\arg0[0]}, [\arg1], \arg2 + vld1.8 {\arg0[1]}, [\arg1], \arg2 + vld1.8 {\arg0[2]}, [\arg1], \arg2 + vld1.8 {\arg0[3]}, [\arg1], \arg2 + vld1.8 {\arg0[4]}, [\arg1], \arg2 + vld1.8 {\arg0[5]}, [\arg1], \arg2 + vld1.8 {\arg0[6]}, [\arg1], \arg2 + vld1.8 {\arg0[7]}, [\arg1], \arg2 .endm .macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2,arg3, arg4, arg5, arg6, arg7, arg8 - //Do the vertical transform - vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13} - vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15} - vswp d1, d2 - vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7} - vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11} + //Do the vertical transform + vaddl.u8 q0, \arg0, \arg1 
//{0,4,8,12,1,5,9,13} + vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15} + vswp d1, d2 + vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7} + vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11} - //Do the horizontal transform - vtrn.32 q2, q1 - vadd.s16 q0, q2, q1 - vsub.s16 q1, q2, q1 + //Do the horizontal transform + vtrn.32 q2, q1 + vadd.s16 q0, q2, q1 + vsub.s16 q1, q2, q1 - vtrn.16 q0, q1 - vadd.s16 q2, q0, q1 - vsub.s16 q1, q0, q1 + vtrn.16 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 - vmov.s16 d0, d4 - vmov.s16 d1, d2 + vmov.s16 d0, d4 + vmov.s16 d1, d2 - vabs.s16 d3, d3 + vabs.s16 d3, d3 - //16x16_v - vtrn.32 d0, d1 //{0,1,3,2} - vaba.s16 \arg5, d0, \arg2 //16x16_v - vaba.s16 \arg5, d1, \arg8 - vaba.s16 \arg5, d5, \arg8 - vadd.u16 \arg5, d3 + //16x16_v + vtrn.32 d0, d1 //{0,1,3,2} + vaba.s16 \arg5, d0, \arg2 //16x16_v + vaba.s16 \arg5, d1, \arg8 + vaba.s16 \arg5, d5, \arg8 + vadd.u16 \arg5, d3 - //16x16_h - vtrn.16 d4, d5 //{0,4,12,8} - vaba.s16 \arg6, d4, \arg3 //16x16_h - vabs.s16 d2, d2 - vabs.s16 d5, d5 - vadd.u16 d2, d3 - vadd.u16 d2, d5 - vadd.u16 \arg6, d2 + //16x16_h + vtrn.16 d4, d5 //{0,4,12,8} + vaba.s16 \arg6, d4, \arg3 //16x16_h + vabs.s16 d2, d2 + vabs.s16 d5, d5 + vadd.u16 d2, d3 + vadd.u16 d2, d5 + vadd.u16 \arg6, d2 - //16x16_dc_both - vaba.s16 \arg7, d4, \arg4 //16x16_dc_both - vadd.u16 \arg7, d2 + //16x16_dc_both + vaba.s16 \arg7, d4, \arg4 //16x16_dc_both + vadd.u16 \arg7, d2 .endm #endif @@ -156,131 +156,131 @@ WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Satd_neon stmdb sp!, {r4-r7, lr} vpush {q4-q7} - //Get the top line data to 'q15'(16 bytes) - sub r7, r0, r1 + //Get the top line data to 'q15'(16 bytes) + sub r7, r0, r1 vld1.8 {q15}, [r7] - //Get the left colume data to 'q14' (16 bytes) - sub r7, r0, #1 - GET_8BYTE_DATA_L0 d28, r7, r1 - GET_8BYTE_DATA_L0 d29, r7, r1 + //Get the left colume data to 'q14' (16 bytes) + sub r7, r0, #1 + GET_8BYTE_DATA_L0 d28, r7, r1 + GET_8BYTE_DATA_L0 d29, r7, r1 - //Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes) - //Calculate the 16x16_dc_both mode SATD - vaddl.u8 q0, d30, d31 - vaddl.u8 q1, d28, d29 - vadd.u16 q0, q1 - vadd.u16 d0, d1 - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 + //Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes) + //Calculate the 16x16_dc_both mode SATD + vaddl.u8 q0, d30, d31 + vaddl.u8 q1, d28, d29 + vadd.u16 q0, q1 + vadd.u16 d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 - //Calculate the mean value - vrshr.u16 d0, #5 - vshl.u16 d27, d0, #4 + //Calculate the mean value + vrshr.u16 d0, #5 + vshl.u16 d27, d0, #4 - //Calculate the 16x16_v mode SATD and save to "q11, 12" - vshll.u8 q0, d30, #2 - vshll.u8 q1, d31, #2 - vtrn.32 q0, q1 - vadd.s16 q2, q0, q1 - vsub.s16 q1, q0, q1 - vtrn.16 q2, q1 - vadd.s16 q12, q2, q1 - vsub.s16 q11, q2, q1 - vtrn.32 q12, q11 //{0,1,3,2, 4,5,7,6} q12 - //{8,9,11,10, 12,13,15,14} q11 + //Calculate the 16x16_v mode SATD and save to "q11, 12" + vshll.u8 q0, d30, #2 + vshll.u8 q1, d31, #2 + vtrn.32 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 + vtrn.16 q2, q1 + vadd.s16 q12, q2, q1 + vsub.s16 q11, q2, q1 + vtrn.32 q12, q11 //{0,1,3,2, 4,5,7,6} q12 + //{8,9,11,10, 12,13,15,14} q11 //Calculate the 16x16_h mode SATD and save to "q9, q10" - vshll.u8 q0, d28, #2 - vshll.u8 q1, d29, #2 - vtrn.32 q0, q1 - vadd.s16 q2, q0, q1 - vsub.s16 q1, q0, q1 - vtrn.16 q2, q1 - vadd.s16 q10, q2, q1 - vsub.s16 q9, q2, q1 - vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10 - //{8,9,11,10, 12,13,15,14} q9 + vshll.u8 q0, d28, #2 + vshll.u8 q1, d29, #2 + vtrn.32 q0, q1 + 
vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 + vtrn.16 q2, q1 + vadd.s16 q10, q2, q1 + vsub.s16 q9, q2, q1 + vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10 + //{8,9,11,10, 12,13,15,14} q9 - vmov.i32 d17, #0//Save the SATD of DC_BOTH - vmov.i32 d16, #0//Save the SATD of H - vmov.i32 d15, #0//Save the SATD of V - vmov.i32 d14, #0//For zero D register - //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes - vld1.32 {q3}, [r2], r3 - vld1.32 {q4}, [r2], r3 - vld1.32 {q5}, [r2], r3 - vld1.32 {q6}, [r2], r3 - vtrn.32 q3, q4 - vtrn.32 q5, q6 + vmov.i32 d17, #0//Save the SATD of DC_BOTH + vmov.i32 d16, #0//Save the SATD of H + vmov.i32 d15, #0//Save the SATD of V + vmov.i32 d14, #0//For zero D register + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {q3}, [r2], r3 + vld1.32 {q4}, [r2], r3 + vld1.32 {q5}, [r2], r3 + vld1.32 {q6}, [r2], r3 + vtrn.32 q3, q4 + vtrn.32 q5, q6 HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14 - //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes - vld1.32 {q3}, [r2], r3 - vld1.32 {q4}, [r2], r3 - vld1.32 {q5}, [r2], r3 - vld1.32 {q6}, [r2], r3 - vtrn.32 q3, q4 - vtrn.32 q5, q6 + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {q3}, [r2], r3 + vld1.32 {q4}, [r2], r3 + vld1.32 {q5}, [r2], r3 + vld1.32 {q6}, [r2], r3 + vtrn.32 q3, q4 + vtrn.32 q5, q6 HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14 - //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes - vld1.32 {q3}, [r2], r3 - vld1.32 {q4}, [r2], r3 - vld1.32 {q5}, [r2], r3 - vld1.32 {q6}, [r2], r3 - vtrn.32 q3, q4 - vtrn.32 q5, q6 + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {q3}, [r2], r3 + vld1.32 {q4}, [r2], r3 + vld1.32 {q5}, [r2], r3 + vld1.32 {q6}, [r2], r3 + vtrn.32 q3, q4 + vtrn.32 q5, q6 HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14 - //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes - vld1.32 {q3}, [r2], r3 - vld1.32 {q4}, [r2], r3 - vld1.32 {q5}, [r2], r3 - vld1.32 {q6}, [r2], r3 - vtrn.32 q3, q4 - vtrn.32 q5, q6 + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {q3}, [r2], r3 + vld1.32 {q4}, [r2], r3 + vld1.32 {q5}, [r2], r3 + vld1.32 {q6}, [r2], r3 + vtrn.32 q3, q4 + vtrn.32 q5, q6 HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14 HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14 - //Get the data from stack - ldr r5, [sp, #84] //the addr of Best_mode - ldr r6, [sp, #88] //the value of i_lambda + //Get the data from stack + ldr r5, [sp, #84] //the addr of Best_mode + ldr r6, [sp, #88] //the value of i_lambda - //vadd.u16 d24, d25 - vrshr.u16 d15, #1 - vpaddl.u16 d15, d15 - vpaddl.u32 d15, d15 - vmov.u32 r0, d15[0] + //vadd.u16 d24, d25 + vrshr.u16 d15, #1 + vpaddl.u16 d15, d15 + vpaddl.u32 d15, d15 + vmov.u32 r0, d15[0] - //vadd.u16 d22, d23 - 
vrshr.u16 d16, #1 - vpaddl.u16 d16, d16 - vpaddl.u32 d16, d16 - vmov.u32 r1, d16[0] - add r1, r1, r6, lsl #1 + //vadd.u16 d22, d23 + vrshr.u16 d16, #1 + vpaddl.u16 d16, d16 + vpaddl.u32 d16, d16 + vmov.u32 r1, d16[0] + add r1, r1, r6, lsl #1 - //vadd.u16 d20, d21 - vrshr.u16 d17, #1 - vpaddl.u16 d17, d17 - vpaddl.u32 d17, d17 - vmov.u32 r2, d17[0] - add r2, r2, r6, lsl #1 + //vadd.u16 d20, d21 + vrshr.u16 d17, #1 + vpaddl.u16 d17, d17 + vpaddl.u32 d17, d17 + vmov.u32 r2, d17[0] + add r2, r2, r6, lsl #1 mov r4, #0 cmp r1, r0 @@ -300,77 +300,77 @@ WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon stmdb sp!, {r4-r7, lr} - //Get the top line data to 'q15'(16 bytes) - sub r4, r0, r1 + //Get the top line data to 'q15'(16 bytes) + sub r4, r0, r1 vld1.8 {q15}, [r4] - //Get the left colume data to 'q14' (16 bytes) - sub r4, r0, #1 - GET_8BYTE_DATA_L0 d28, r4, r1 - GET_8BYTE_DATA_L0 d29, r4, r1 + //Get the left colume data to 'q14' (16 bytes) + sub r4, r0, #1 + GET_8BYTE_DATA_L0 d28, r4, r1 + GET_8BYTE_DATA_L0 d29, r4, r1 - //Calculate the mean value and save to 'q13' (8 bytes) - //Calculate the 16x16_dc_both mode SATD - vaddl.u8 q0, d30, d31 - vaddl.u8 q1, d28, d29 - vadd.u16 q0, q1 - vadd.u16 d0, d1 - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 + //Calculate the mean value and save to 'q13' (8 bytes) + //Calculate the 16x16_dc_both mode SATD + vaddl.u8 q0, d30, d31 + vaddl.u8 q1, d28, d29 + vadd.u16 q0, q1 + vadd.u16 d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 - //Calculate the mean value - vrshr.u16 d0, d0, #5 - vdup.8 q13, d0[0] + //Calculate the mean value + vrshr.u16 d0, d0, #5 + vdup.8 q13, d0[0] - sub r4, r0, #1 + sub r4, r0, #1 - vmov.i32 q12, #0//Save the SATD of DC_BOTH - vmov.i32 q11, #0//Save the SATD of H - vmov.i32 q10, #0//Save the SATD of V + vmov.i32 q12, #0//Save the SATD of DC_BOTH + vmov.i32 q11, #0//Save the SATD of H + vmov.i32 q10, #0//Save the SATD of V - mov lr, #16 + mov lr, #16 sad_intra_16x16_x3_opt_loop0: //Get the left colume data to 'd0' (16 bytes) - vld1.8 {d0[]}, [r4], r1 + vld1.8 {d0[]}, [r4], r1 - //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes - vld1.8 {q1}, [r2], r3 + //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes + vld1.8 {q1}, [r2], r3 - subs lr, #1 - //Do the SAD for top colume - vabal.u8 q12, d30, d2 - vabal.u8 q12, d31, d3 + subs lr, #1 + //Do the SAD for top colume + vabal.u8 q12, d30, d2 + vabal.u8 q12, d31, d3 - //Do the SAD for left colume - vabal.u8 q11, d0, d2 - vabal.u8 q11, d0, d3 + //Do the SAD for left colume + vabal.u8 q11, d0, d2 + vabal.u8 q11, d0, d3 - //Do the SAD for mean value - vabal.u8 q10, d26, d2 - vabal.u8 q10, d26, d3 + //Do the SAD for mean value + vabal.u8 q10, d26, d2 + vabal.u8 q10, d26, d3 - bne sad_intra_16x16_x3_opt_loop0 + bne sad_intra_16x16_x3_opt_loop0 - //Get the data from stack - ldr r5, [sp, #20] //the addr of Best_mode - ldr r6, [sp, #24] //the value of i_lambda + //Get the data from stack + ldr r5, [sp, #20] //the addr of Best_mode + ldr r6, [sp, #24] //the value of i_lambda - vadd.u16 d24, d25 - vpaddl.u16 d24, d24 - vpaddl.u32 d24, d24 - vmov.u32 r0, d24[0] + vadd.u16 d24, d25 + vpaddl.u16 d24, d24 + vpaddl.u32 d24, d24 + vmov.u32 r0, d24[0] - vadd.u16 d22, d23 - vpaddl.u16 d22, d22 - vpaddl.u32 d22, d22 - vmov.u32 r1, d22[0] - add r1, r1, r6, lsl #1 + vadd.u16 d22, d23 + vpaddl.u16 d22, d22 + vpaddl.u32 d22, d22 + vmov.u32 r1, d22[0] + add r1, r1, r6, lsl #1 - vadd.u16 d20, d21 - vpaddl.u16 d20, d20 - vpaddl.u32 d20, d20 - vmov.u32 r2, d20[0] - add r2, r2, r6, lsl #1 + vadd.u16 
d20, d21 + vpaddl.u16 d20, d20 + vpaddl.u32 d20, d20 + vmov.u32 r2, d20[0] + add r2, r2, r6, lsl #1 mov r4, #0 cmp r1, r0 @@ -382,31 +382,31 @@ sad_intra_16x16_x3_opt_loop0: str r4, [r5] - ldmia sp!, {r4-r7, lr} + ldmia sp!, {r4-r7, lr} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon stmdb sp!, {r4-r7, lr} - //Get the data from stack - ldr r4, [sp, #32] //p_dec_cr - ldr r5, [sp, #36] //p_enc_cr + //Get the data from stack + ldr r4, [sp, #32] //p_dec_cr + ldr r5, [sp, #36] //p_enc_cr - //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes) - sub r6, r0, #1 - GET_8BYTE_DATA_L0 d28, r6, r1 - sub r6, r4, #1 - GET_8BYTE_DATA_L0 d30, r6, r1 + //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes) + sub r6, r0, #1 + GET_8BYTE_DATA_L0 d28, r6, r1 + sub r6, r4, #1 + GET_8BYTE_DATA_L0 d30, r6, r1 - //Get the top line data to 'd29(cb), d31(cr)'(16 bytes) - sub r6, r0, r1 + //Get the top line data to 'd29(cb), d31(cr)'(16 bytes) + sub r6, r0, r1 vld1.8 {d29}, [r6] - sub r6, r4, r1 + sub r6, r4, r1 vld1.8 {d31}, [r6] - //Calculate the sum of left column and top row - vmov.i32 q0, q14 + //Calculate the sum of left column and top row + vmov.i32 q0, q14 vpaddl.u8 q0, q0 vpaddl.u16 q0, q0 vadd.u32 d2, d0, d1 //'m1' save to d2 @@ -416,13 +416,13 @@ WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon //duplicate the 'mx' to a vector line vdup.8 d27, d2[0] vdup.8 d26, d1[4] - vtrn.32 d27, d26 + vtrn.32 d27, d26 vdup.8 d26, d0[4] vdup.8 d25, d2[4] vtrn.32 d26, d25 //Save to "d27, d26" - vmov.i32 q0, q15 + vmov.i32 q0, q15 vpaddl.u8 q0, q0 vpaddl.u16 q0, q0 vadd.u32 d2, d0, d1 //'m1' save to d2 @@ -432,94 +432,94 @@ WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon //duplicate the 'mx' to a vector line vdup.8 d25, d2[0] vdup.8 d24, d1[4] - vtrn.32 d25, d24 + vtrn.32 d25, d24 vdup.8 d24, d0[4] vdup.8 d23, d2[4] - vtrn.32 d24, d23 //Save to "d25, d24" + vtrn.32 d24, d23 //Save to "d25, d24" - vmov.i32 q11, #0//Save the SATD of DC_BOTH - vmov.i32 q10, #0//Save the SATD of H - vmov.i32 q9 , #0//Save the SATD of V - sub r6, r0, #1 - sub r7, r4, #1 - mov lr, #4 + vmov.i32 q11, #0//Save the SATD of DC_BOTH + vmov.i32 q10, #0//Save the SATD of H + vmov.i32 q9 , #0//Save the SATD of V + sub r6, r0, #1 + sub r7, r4, #1 + mov lr, #4 sad_intra_8x8_x3_opt_loop0: - //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes - vld1.8 {d0}, [r2], r3 - vld1.8 {d1}, [r5], r3 + //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes + vld1.8 {d0}, [r2], r3 + vld1.8 {d1}, [r5], r3 //Get the left colume data to 'd0' (16 bytes) - vld1.8 {d2[]}, [r6], r1 - vld1.8 {d3[]}, [r7], r1 + vld1.8 {d2[]}, [r6], r1 + vld1.8 {d3[]}, [r7], r1 - subs lr, #1 + subs lr, #1 - //Do the SAD for top colume - vabal.u8 q11, d29, d0 - vabal.u8 q11, d31, d1 + //Do the SAD for top colume + vabal.u8 q11, d29, d0 + vabal.u8 q11, d31, d1 - //Do the SAD for left colume - vabal.u8 q10, d2, d0 - vabal.u8 q10, d3, d1 + //Do the SAD for left colume + vabal.u8 q10, d2, d0 + vabal.u8 q10, d3, d1 - //Do the SAD for mean value - vabal.u8 q9, d27, d0 - vabal.u8 q9, d25, d1 + //Do the SAD for mean value + vabal.u8 q9, d27, d0 + vabal.u8 q9, d25, d1 - bne sad_intra_8x8_x3_opt_loop0 + bne sad_intra_8x8_x3_opt_loop0 - mov lr, #4 + mov lr, #4 sad_intra_8x8_x3_opt_loop1: - //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes - vld1.8 {d0}, [r2], r3 - vld1.8 {d1}, [r5], r3 + //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes + vld1.8 {d0}, [r2], r3 + vld1.8 {d1}, [r5], r3 //Get the left colume data to 'd0' (16 bytes) - vld1.8 {d2[]}, 
[r6], r1 - vld1.8 {d3[]}, [r7], r1 + vld1.8 {d2[]}, [r6], r1 + vld1.8 {d3[]}, [r7], r1 - subs lr, #1 + subs lr, #1 - //Do the SAD for top colume - vabal.u8 q11, d29, d0 - vabal.u8 q11, d31, d1 + //Do the SAD for top colume + vabal.u8 q11, d29, d0 + vabal.u8 q11, d31, d1 - //Do the SAD for left colume - vabal.u8 q10, d2, d0 - vabal.u8 q10, d3, d1 + //Do the SAD for left colume + vabal.u8 q10, d2, d0 + vabal.u8 q10, d3, d1 - //Do the SAD for mean value - vabal.u8 q9, d26, d0 - vabal.u8 q9, d24, d1 + //Do the SAD for mean value + vabal.u8 q9, d26, d0 + vabal.u8 q9, d24, d1 - bne sad_intra_8x8_x3_opt_loop1 - //Get the data from stack - ldr r5, [sp, #20] //the addr of Best_mode - ldr r6, [sp, #24] //the value of i_lambda + bne sad_intra_8x8_x3_opt_loop1 + //Get the data from stack + ldr r5, [sp, #20] //the addr of Best_mode + ldr r6, [sp, #24] //the value of i_lambda - vadd.u16 d22, d23 - vpaddl.u16 d22, d22 - vpaddl.u32 d22, d22 - vmov.u32 r0, d22[0] - add r0, r0, r6, lsl #1 + vadd.u16 d22, d23 + vpaddl.u16 d22, d22 + vpaddl.u32 d22, d22 + vmov.u32 r0, d22[0] + add r0, r0, r6, lsl #1 - vadd.u16 d20, d21 - vpaddl.u16 d20, d20 - vpaddl.u32 d20, d20 - vmov.u32 r1, d20[0] - add r1, r1, r6, lsl #1 + vadd.u16 d20, d21 + vpaddl.u16 d20, d20 + vpaddl.u32 d20, d20 + vmov.u32 r1, d20[0] + add r1, r1, r6, lsl #1 - vadd.u16 d18, d19 - vpaddl.u16 d18, d18 - vpaddl.u32 d18, d18 - vmov.u32 r2, d18[0] + vadd.u16 d18, d19 + vpaddl.u16 d18, d18 + vpaddl.u32 d18, d18 + vmov.u32 r2, d18[0] mov r4, #2 cmp r1, r0 @@ -531,7 +531,7 @@ sad_intra_8x8_x3_opt_loop1: str r4, [r5] - ldmia sp!, {r4-r7, lr} + ldmia sp!, {r4-r7, lr} WELS_ASM_FUNC_END @@ -539,47 +539,47 @@ WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Satd_neon stmdb sp!, {r4-r7, lr} vpush {q4-q7} - //Get the data from stack - ldr r4, [sp, #96] //p_dec_cr - ldr r5, [sp, #100] //p_enc_cr + //Get the data from stack + ldr r4, [sp, #96] //p_dec_cr + ldr r5, [sp, #100] //p_enc_cr - //Get the top line data to 'd29(cb), d31(cr)'(16 bytes) - sub r6, r0, r1 + //Get the top line data to 'd29(cb), d31(cr)'(16 bytes) + sub r6, r0, r1 vld1.8 {d29}, [r6] - sub r6, r4, r1 + sub r6, r4, r1 vld1.8 {d31}, [r6] - //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes) - sub r6, r0, #1 - GET_8BYTE_DATA_L0 d28, r6, r1 - sub r6, r4, #1 - GET_8BYTE_DATA_L0 d30, r6, r1 + //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes) + sub r6, r0, #1 + GET_8BYTE_DATA_L0 d28, r6, r1 + sub r6, r4, #1 + GET_8BYTE_DATA_L0 d30, r6, r1 - //Calculate the 16x16_v mode SATD and save to "q12, 13" - vshll.u8 q0, d29, #2 - vshll.u8 q1, d31, #2 - vtrn.32 q0, q1 - vadd.s16 q2, q0, q1 - vsub.s16 q1, q0, q1 - vtrn.16 q2, q1 - vadd.s16 q13, q2, q1 - vsub.s16 q12, q2, q1 - vtrn.32 q13, q12 //{0,1,3,2, 4,5,7,6} q13 - //{8,9,11,10, 12,13,15,14} q12 + //Calculate the 16x16_v mode SATD and save to "q12, 13" + vshll.u8 q0, d29, #2 + vshll.u8 q1, d31, #2 + vtrn.32 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 + vtrn.16 q2, q1 + vadd.s16 q13, q2, q1 + vsub.s16 q12, q2, q1 + vtrn.32 q13, q12 //{0,1,3,2, 4,5,7,6} q13 + //{8,9,11,10, 12,13,15,14} q12 //Calculate the 16x16_h mode SATD and save to "q10, q11" - vshll.u8 q0, d28, #2 - vshll.u8 q1, d30, #2 - vtrn.32 q0, q1 - vadd.s16 q2, q0, q1 - vsub.s16 q1, q0, q1 - vtrn.16 q2, q1 - vadd.s16 q11, q2, q1 - vsub.s16 q10, q2, q1 - vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11 - //{8,9,11,10, 12,13,15,14} q10 + vshll.u8 q0, d28, #2 + vshll.u8 q1, d30, #2 + vtrn.32 q0, q1 + vadd.s16 q2, q0, q1 + vsub.s16 q1, q0, q1 + vtrn.16 q2, q1 + vadd.s16 q11, q2, q1 + vsub.s16 q10, 
q2, q1 + vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11 + //{8,9,11,10, 12,13,15,14} q10 - //Calculate the sum of left column and top row - //vmov.i32 q0, q14 + //Calculate the sum of left column and top row + //vmov.i32 q0, q14 vpaddl.u8 q0, q14 vpaddl.u16 q0, q0 vadd.u32 d2, d0, d1 @@ -588,77 +588,77 @@ WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Satd_neon vpaddl.u16 q2, q2 vadd.u32 d3, d4, d5 - vtrn.32 q0, q2 - vrshr.u32 q1, #3 - vrshr.u32 q2, #2 - vshll.u32 q9, d4, #4 // {2cb, 2cr} q9 - vshll.u32 q8, d5, #4 // {1cb, 1cr} q8 - vshll.u32 q7, d2, #4 // {0cb, 3cb} q7 - vshll.u32 q6, d3, #4 // {0cr, 3cr} q6 + vtrn.32 q0, q2 + vrshr.u32 q1, #3 + vrshr.u32 q2, #2 + vshll.u32 q9, d4, #4 // {2cb, 2cr} q9 + vshll.u32 q8, d5, #4 // {1cb, 1cr} q8 + vshll.u32 q7, d2, #4 // {0cb, 3cb} q7 + vshll.u32 q6, d3, #4 // {0cr, 3cr} q6 vmov.i32 d28, #0//Save the SATD of DC_BOTH - vmov.i32 d10, #0//Save the SATD of H - vmov.i32 d11, #0//Save the SATD of V - vmov.i32 d30, #0//For zero D register - //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes - vld1.32 {d6}, [r2], r3 - vld1.32 {d7}, [r2], r3 - vld1.32 {d8}, [r2], r3 - vld1.32 {d9}, [r2], r3 - vtrn.32 d6, d7 - vtrn.32 d8, d9 + vmov.i32 d10, #0//Save the SATD of H + vmov.i32 d11, #0//Save the SATD of V + vmov.i32 d30, #0//For zero D register + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {d6}, [r2], r3 + vld1.32 {d7}, [r2], r3 + vld1.32 {d8}, [r2], r3 + vld1.32 {d9}, [r2], r3 + vtrn.32 d6, d7 + vtrn.32 d8, d9 HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30 HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30 - vld1.32 {d6}, [r5], r3 - vld1.32 {d7}, [r5], r3 - vld1.32 {d8}, [r5], r3 - vld1.32 {d9}, [r5], r3 - vtrn.32 d6, d7 - vtrn.32 d8, d9 + vld1.32 {d6}, [r5], r3 + vld1.32 {d7}, [r5], r3 + vld1.32 {d8}, [r5], r3 + vld1.32 {d9}, [r5], r3 + vtrn.32 d6, d7 + vtrn.32 d8, d9 HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30 HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30 - //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes - vld1.32 {d6}, [r2], r3 - vld1.32 {d7}, [r2], r3 - vld1.32 {d8}, [r2], r3 - vld1.32 {d9}, [r2], r3 - vtrn.32 d6, d7 - vtrn.32 d8, d9 + //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes + vld1.32 {d6}, [r2], r3 + vld1.32 {d7}, [r2], r3 + vld1.32 {d8}, [r2], r3 + vld1.32 {d9}, [r2], r3 + vtrn.32 d6, d7 + vtrn.32 d8, d9 HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30 HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30 - vld1.32 {d6}, [r5], r3 - vld1.32 {d7}, [r5], r3 - vld1.32 {d8}, [r5], r3 - vld1.32 {d9}, [r5], r3 - vtrn.32 d6, d7 - vtrn.32 d8, d9 + vld1.32 {d6}, [r5], r3 + vld1.32 {d7}, [r5], r3 + vld1.32 {d8}, [r5], r3 + vld1.32 {d9}, [r5], r3 + vtrn.32 d6, d7 + vtrn.32 d8, d9 HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30 HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30 - //Get the data from stack - ldr r5, [sp, #84] //the addr of Best_mode - ldr r6, [sp, #88] //the value of i_lambda + //Get the data from stack + ldr r5, [sp, #84] //the addr of Best_mode + ldr r6, [sp, #88] //the value of i_lambda - vrshr.u16 d11, #1 - vpaddl.u16 d11, d11 - vpaddl.u32 d11, d11 - vmov.u32 lr, d11[0] - add lr, lr, r6, lsl #1 + vrshr.u16 d11, #1 + vpaddl.u16 d11, d11 + vpaddl.u32 d11, d11 + vmov.u32 lr, d11[0] + add lr, lr, r6, lsl #1 - vrshr.u16 d10, #1 - vpaddl.u16 d10, d10 - vpaddl.u32 d10, d10 - vmov.u32 r3, d10[0] - add r3, r3, r6, lsl #1 + vrshr.u16 d10, #1 + vpaddl.u16 d10, d10 + vpaddl.u32 d10, d10 + 
vmov.u32 r3, d10[0] + add r3, r3, r6, lsl #1 - vrshr.u16 d28, #1 - vpaddl.u16 d28, d28 - vpaddl.u32 d28, d28 - vmov.u32 r2, d28[0] + vrshr.u16 d28, #1 + vpaddl.u16 d28, d28 + vpaddl.u32 d28, d28 + vmov.u32 r2, d28[0] mov r6, #2 cmp r3, lr @@ -671,8 +671,8 @@ WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Satd_neon str r6, [r5] mov r0, lr - vpop {q4-q7} - ldmia sp!, {r4-r7, lr} + vpop {q4-q7} + ldmia sp!, {r4-r7, lr} WELS_ASM_FUNC_END @@ -680,118 +680,118 @@ WELS_ASM_FUNC_BEGIN WelsIntra4x4Combined3Satd_neon stmdb sp!, {r4-r7, lr} //Get the top line data to 'd31[0~3]'(4 bytes) - sub r7, r0, r1 + sub r7, r0, r1 vld1.32 {d31[0]}, [r7] - //Get the left colume data to 'd31[4~7]' (4 bytes) - sub r7, r0, #1 + //Get the left colume data to 'd31[4~7]' (4 bytes) + sub r7, r0, #1 vld1.8 {d31[4]}, [r7], r1 vld1.8 {d31[5]}, [r7], r1 vld1.8 {d31[6]}, [r7], r1 vld1.8 {d31[7]}, [r7], r1 - //Calculate the mean value and save to 'd30' (2 bytes) - vpaddl.u8 d0, d31 - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 - //Calculate the mean value - vrshr.u16 d0, #3 - vshl.u16 d30, d0, #4 + //Calculate the mean value and save to 'd30' (2 bytes) + vpaddl.u8 d0, d31 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + //Calculate the mean value + vrshr.u16 d0, #3 + vshl.u16 d30, d0, #4 - //Calculate the 16x16_v mode SATD and save to "d29" + //Calculate the 16x16_v mode SATD and save to "d29" //Calculate the 16x16_h mode SATD and save to "d28" - vshll.u8 q0, d31, #2 - vtrn.32 d0, d1 - vadd.s16 d2, d0, d1 - vsub.s16 d1, d0, d1 - vtrn.16 d2, d1 - vadd.s16 d29, d2, d1 - vsub.s16 d28, d2, d1 - vtrn.32 d29, d28 //{0,1,3,2 top} d29 - //{0,1,3,2 left} d28 + vshll.u8 q0, d31, #2 + vtrn.32 d0, d1 + vadd.s16 d2, d0, d1 + vsub.s16 d1, d0, d1 + vtrn.16 d2, d1 + vadd.s16 d29, d2, d1 + vsub.s16 d28, d2, d1 + vtrn.32 d29, d28 //{0,1,3,2 top} d29 + //{0,1,3,2 left} d28 vmov.i32 d27, #0//Save the SATD of DC_BOTH - vmov.i32 d26, #0//Save the SATD of H - vmov.i32 d25, #0//Save the SATD of V - vmov.i32 d24, #0//For zero D register + vmov.i32 d26, #0//Save the SATD of H + vmov.i32 d25, #0//Save the SATD of V + vmov.i32 d24, #0//For zero D register - //Load the p_enc data and save to "d22,d23"--- 4X4 bytes - vld1.32 {d23[0]}, [r2], r3 - vld1.32 {d23[1]}, [r2], r3 - vld1.32 {d22[0]}, [r2], r3 - vld1.32 {d22[1]}, [r2], r3 + //Load the p_enc data and save to "d22,d23"--- 4X4 bytes + vld1.32 {d23[0]}, [r2], r3 + vld1.32 {d23[1]}, [r2], r3 + vld1.32 {d22[0]}, [r2], r3 + vld1.32 {d22[1]}, [r2], r3 HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24 - //Get the data from stack - ldr r5, [sp, #28] //the value of lambda2 - ldr r6, [sp, #32] //the value of lambda1 - ldr r7, [sp, #36] //the value of lambda0 + //Get the data from stack + ldr r5, [sp, #28] //the value of lambda2 + ldr r6, [sp, #32] //the value of lambda1 + ldr r7, [sp, #36] //the value of lambda0 - vrshr.u16 d25, #1 - vpaddl.u16 d25, d25 - vpaddl.u32 d25, d25 - vmov.u32 r0, d25[0] - add r0, r7 + vrshr.u16 d25, #1 + vpaddl.u16 d25, d25 + vpaddl.u32 d25, d25 + vmov.u32 r0, d25[0] + add r0, r7 - vrshr.u16 d26, #1 - vpaddl.u16 d26, d26 - vpaddl.u32 d26, d26 - vmov.u32 r1, d26[0] - add r1, r6 + vrshr.u16 d26, #1 + vpaddl.u16 d26, d26 + vpaddl.u32 d26, d26 + vmov.u32 r1, d26[0] + add r1, r6 - vrshr.u16 d27, #1 - vpaddl.u16 d27, d27 - vpaddl.u32 d27, d27 - vmov.u32 r2, d27[0] - add r2, r5 + vrshr.u16 d27, #1 + vpaddl.u16 d27, d27 + vpaddl.u32 d27, d27 + vmov.u32 r2, d27[0] + add r2, r5 - ldr r5, [sp, #20] //p_dst - ldr r6, [sp, #24] //the addr of Best_mode + ldr r5, [sp, #20] //p_dst + ldr r6, [sp, #24] 
//the addr of Best_mode - mov r4, r0 - cmp r1, r4 - movcc r4, r1 - cmp r2, r4 - movcc r4, r2 + mov r4, r0 + cmp r1, r4 + movcc r4, r1 + cmp r2, r4 + movcc r4, r2 - //The compare sequence affect the resule - cmp r4, r2 - bne satd_intra_4x4_x3_opt_jump0 - mov r0, #2 - str r0, [r6] - vshr.u32 d0, d30, #4 // {2cb, 2cr} q9 - vdup.8 q1, d0[0] - vst1.8 {q1}, [r5] - //... - bl satd_intra_4x4_x3_opt_end + //The compare sequence affect the resule + cmp r4, r2 + bne satd_intra_4x4_x3_opt_jump0 + mov r0, #2 + str r0, [r6] + vshr.u32 d0, d30, #4 // {2cb, 2cr} q9 + vdup.8 q1, d0[0] + vst1.8 {q1}, [r5] + //... + bl satd_intra_4x4_x3_opt_end satd_intra_4x4_x3_opt_jump0: - cmp r4, r1 - bne satd_intra_4x4_x3_opt_jump1 - mov r0, #1 - str r0, [r6] - vdup.8 d0, d31[4] - vdup.8 d1, d31[5] - vdup.8 d2, d31[6] - vdup.8 d3, d31[7] - vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5] + cmp r4, r1 + bne satd_intra_4x4_x3_opt_jump1 + mov r0, #1 + str r0, [r6] + vdup.8 d0, d31[4] + vdup.8 d1, d31[5] + vdup.8 d2, d31[6] + vdup.8 d3, d31[7] + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5] - bl satd_intra_4x4_x3_opt_end + bl satd_intra_4x4_x3_opt_end satd_intra_4x4_x3_opt_jump1: - mov r0, #0 - str r0, [r6] - vst1.32 {d31[0]}, [r5]! - vst1.32 {d31[0]}, [r5]! - vst1.32 {d31[0]}, [r5]! - vst1.32 {d31[0]}, [r5]! + mov r0, #0 + str r0, [r6] + vst1.32 {d31[0]}, [r5]! + vst1.32 {d31[0]}, [r5]! + vst1.32 {d31[0]}, [r5]! + vst1.32 {d31[0]}, [r5]! satd_intra_4x4_x3_opt_end: - mov r0, r4 + mov r0, r4 - ldmia sp!, {r4-r7, lr} + ldmia sp!, {r4-r7, lr} WELS_ASM_FUNC_END #endif diff --git a/codec/encoder/core/arm/pixel_neon.S b/codec/encoder/core/arm/pixel_neon.S index 2ee19dd7..c21c3e13 100644 --- a/codec/encoder/core/arm/pixel_neon.S +++ b/codec/encoder/core/arm/pixel_neon.S @@ -66,10 +66,10 @@ vsub.s16 q3, q12, q13 vadd.s16 q8, q10, q11 - vsub.s16 q9, q10, q11 + vsub.s16 q9, q10, q11 vadd.s16 q10, q14, q15 - vsub.s16 q11, q14, q15 + vsub.s16 q11, q14, q15 vadd.s16 q12, q0, q2 vsub.s16 q14, q0, q2 @@ -372,28 +372,28 @@ WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon stmdb sp!, {r4-r5, lr} - //Loading a horizontal line data (4 bytes) - //line 0 - ldr r4, [r0], r1 - ldr r5, [r2], r3 - usad8 lr, r4, r5 + //Loading a horizontal line data (4 bytes) + //line 0 + ldr r4, [r0], r1 + ldr r5, [r2], r3 + usad8 lr, r4, r5 //line 1 - ldr r4, [r0], r1 - ldr r5, [r2], r3 - usada8 lr, r4, r5, lr + ldr r4, [r0], r1 + ldr r5, [r2], r3 + usada8 lr, r4, r5, lr //line 2 - ldr r4, [r0], r1 - ldr r5, [r2], r3 - usada8 lr, r4, r5, lr + ldr r4, [r0], r1 + ldr r5, [r2], r3 + usada8 lr, r4, r5, lr - //line 3 - ldr r4, [r0] - ldr r5, [r2] - usada8 r0, r4, r5, lr + //line 3 + ldr r4, [r0] + ldr r5, [r2] + usada8 r0, r4, r5, lr - ldmia sp!, {r4-r5, lr} + ldmia sp!, {r4-r5, lr} WELS_ASM_FUNC_END @@ -401,340 +401,340 @@ WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x16_neon stmdb sp!, {r4-r5, lr} - //Generate the pix2 start addr - sub r4, r2, #1 - add r5, r2, #1 - sub r2, r3 + //Generate the pix2 start addr + sub r4, r2, #1 + add r5, r2, #1 + sub r2, r3 //Loading a horizontal line data (16 bytes) - vld1.8 {q0}, [r0], r1 //save pix1 + vld1.8 {q0}, [r0], r1 //save pix1 - vld1.8 {q1}, [r2], r3 //save pix2 - stride - vld1.8 {q10}, [r2], r3 //save pix2 - vld1.8 {q2}, [r2], r3 //save pix2 + stride + vld1.8 {q1}, [r2], r3 //save pix2 - stride + vld1.8 {q10}, [r2], r3 //save pix2 + vld1.8 {q2}, [r2], r3 //save pix2 + stride - vld1.8 {q3}, [r4], r3 //save pix2 - 1 - vld1.8 {q8}, [r5], r3 //save pix2 + 1 + vld1.8 {q3}, [r4], r3 //save pix2 - 1 + vld1.8 {q8}, [r5], r3 //save pix2 + 1 - 
//Do the SAD for 16 bytes - vabdl.u8 q15, d0, d2 - vabal.u8 q15, d1, d3 + //Do the SAD for 16 bytes + vabdl.u8 q15, d0, d2 + vabal.u8 q15, d1, d3 - vabdl.u8 q13, d0, d4 - vabal.u8 q13, d1, d5 + vabdl.u8 q13, d0, d4 + vabal.u8 q13, d1, d5 - vabdl.u8 q11, d0, d6 - vabal.u8 q11, d1, d7 + vabdl.u8 q11, d0, d6 + vabal.u8 q11, d1, d7 - vabdl.u8 q9, d0, d16 - vabal.u8 q9, d1, d17 + vabdl.u8 q9, d0, d16 + vabal.u8 q9, d1, d17 - mov lr, #15 + mov lr, #15 pixel_sad_4_16x16_loop_0: //Loading a horizontal line data (16 bytes) - vld1.8 {q0}, [r0], r1 //save pix1 - vmov.8 q1, q10 //save pix2 - stride - vmov.8 q10, q2 - vabal.u8 q15, d0, d2 - vld1.8 {q2}, [r2], r3 //save pix2 + stride - vabal.u8 q15, d1, d3 - vld1.8 {q3}, [r4], r3 //save pix2 - 1 - vabal.u8 q13, d0, d4 - vld1.8 {q8}, [r5], r3 //save pix2 + 1 + vld1.8 {q0}, [r0], r1 //save pix1 + vmov.8 q1, q10 //save pix2 - stride + vmov.8 q10, q2 + vabal.u8 q15, d0, d2 + vld1.8 {q2}, [r2], r3 //save pix2 + stride + vabal.u8 q15, d1, d3 + vld1.8 {q3}, [r4], r3 //save pix2 - 1 + vabal.u8 q13, d0, d4 + vld1.8 {q8}, [r5], r3 //save pix2 + 1 vabal.u8 q13, d1, d5 - subs lr, #1 + subs lr, #1 - vabal.u8 q11, d0, d6 - vabal.u8 q11, d1, d7 + vabal.u8 q11, d0, d6 + vabal.u8 q11, d1, d7 - vabal.u8 q9, d0, d16 - vabal.u8 q9, d1, d17 + vabal.u8 q9, d0, d16 + vabal.u8 q9, d1, d17 - bne pixel_sad_4_16x16_loop_0 + bne pixel_sad_4_16x16_loop_0 //Save SAD to 'r0' - ldr r0, [sp, #12] + ldr r0, [sp, #12] - vadd.u16 d0, d30, d31 - vadd.u16 d1, d26, d27 - vadd.u16 d2, d22, d23 - vadd.u16 d3, d18, d19 + vadd.u16 d0, d30, d31 + vadd.u16 d1, d26, d27 + vadd.u16 d2, d22, d23 + vadd.u16 d3, d18, d19 - vpaddl.u16 q0, q0 - vpaddl.u16 q1, q1 + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 - vpaddl.u32 q0, q0 - vpaddl.u32 q1, q1 + vpaddl.u32 q0, q0 + vpaddl.u32 q1, q1 - vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] - ldmia sp!, {r4-r5, lr} + ldmia sp!, {r4-r5, lr} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon stmdb sp!, {r4-r5, lr} - //Generate the pix2 start addr - sub r4, r2, #1 - add r5, r2, #1 - sub r2, r3 + //Generate the pix2 start addr + sub r4, r2, #1 + add r5, r2, #1 + sub r2, r3 //Loading a horizontal line data (16 bytes) - vld1.8 {q0}, [r0], r1 //save pix1 + vld1.8 {q0}, [r0], r1 //save pix1 - vld1.8 {q1}, [r2], r3 //save pix2 - stride - vld1.8 {q10}, [r2], r3 //save pix2 - vld1.8 {q2}, [r2], r3 //save pix2 + stride + vld1.8 {q1}, [r2], r3 //save pix2 - stride + vld1.8 {q10}, [r2], r3 //save pix2 + vld1.8 {q2}, [r2], r3 //save pix2 + stride - vld1.8 {q3}, [r4], r3 //save pix2 - 1 - vld1.8 {q8}, [r5], r3 //save pix2 + 1 + vld1.8 {q3}, [r4], r3 //save pix2 - 1 + vld1.8 {q8}, [r5], r3 //save pix2 + 1 - //Do the SAD for 16 bytes - vabdl.u8 q15, d0, d2 - vabal.u8 q15, d1, d3 + //Do the SAD for 16 bytes + vabdl.u8 q15, d0, d2 + vabal.u8 q15, d1, d3 - vabdl.u8 q13, d0, d4 - vabal.u8 q13, d1, d5 + vabdl.u8 q13, d0, d4 + vabal.u8 q13, d1, d5 - vabdl.u8 q11, d0, d6 - vabal.u8 q11, d1, d7 + vabdl.u8 q11, d0, d6 + vabal.u8 q11, d1, d7 - vabdl.u8 q9, d0, d16 - vabal.u8 q9, d1, d17 + vabdl.u8 q9, d0, d16 + vabal.u8 q9, d1, d17 - mov lr, #7 + mov lr, #7 pixel_sad_4_16x8_loop_0: //Loading a horizontal line data (16 bytes) - vld1.8 {q0}, [r0], r1 //save pix1 - vmov.8 q1, q10 //save pix2 - stride - vmov.8 q10, q2 - vabal.u8 q15, d0, d2 - vld1.8 {q2}, [r2], r3 //save pix2 + stride - vabal.u8 q15, d1, d3 - vld1.8 {q3}, [r4], r3 //save pix2 - 1 - vabal.u8 q13, d0, d4 - vld1.8 {q8}, [r5], r3 //save pix2 + 1 + vld1.8 {q0}, [r0], r1 //save pix1 + 
vmov.8 q1, q10 //save pix2 - stride + vmov.8 q10, q2 + vabal.u8 q15, d0, d2 + vld1.8 {q2}, [r2], r3 //save pix2 + stride + vabal.u8 q15, d1, d3 + vld1.8 {q3}, [r4], r3 //save pix2 - 1 + vabal.u8 q13, d0, d4 + vld1.8 {q8}, [r5], r3 //save pix2 + 1 vabal.u8 q13, d1, d5 - subs lr, #1 + subs lr, #1 - vabal.u8 q11, d0, d6 - vabal.u8 q11, d1, d7 + vabal.u8 q11, d0, d6 + vabal.u8 q11, d1, d7 - vabal.u8 q9, d0, d16 - vabal.u8 q9, d1, d17 + vabal.u8 q9, d0, d16 + vabal.u8 q9, d1, d17 - bne pixel_sad_4_16x8_loop_0 + bne pixel_sad_4_16x8_loop_0 //Save SAD to 'r0' - ldr r0, [sp, #12] + ldr r0, [sp, #12] - vadd.u16 d0, d30, d31 - vadd.u16 d1, d26, d27 - vadd.u16 d2, d22, d23 - vadd.u16 d3, d18, d19 + vadd.u16 d0, d30, d31 + vadd.u16 d1, d26, d27 + vadd.u16 d2, d22, d23 + vadd.u16 d3, d18, d19 - vpaddl.u16 q0, q0 - vpaddl.u16 q1, q1 + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 - vpaddl.u32 q0, q0 - vpaddl.u32 q1, q1 + vpaddl.u32 q0, q0 + vpaddl.u32 q1, q1 - vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] - ldmia sp!, {r4-r5, lr} + ldmia sp!, {r4-r5, lr} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon stmdb sp!, {r4-r5, lr} - //Generate the pix2 start addr - sub r4, r2, #1 - add r5, r2, #1 - sub r2, r3 + //Generate the pix2 start addr + sub r4, r2, #1 + add r5, r2, #1 + sub r2, r3 //Loading a horizontal line data (8 bytes) - vld1.8 {d0}, [r0], r1 //save pix1 + vld1.8 {d0}, [r0], r1 //save pix1 - vld1.8 {d1}, [r2], r3 //save pix2 - stride - vld1.8 {d6}, [r2], r3 //save pix2 - vld1.8 {d2}, [r2], r3 //save pix2 + stride + vld1.8 {d1}, [r2], r3 //save pix2 - stride + vld1.8 {d6}, [r2], r3 //save pix2 + vld1.8 {d2}, [r2], r3 //save pix2 + stride - vld1.8 {d3}, [r4], r3 //save pix2 - 1 - vld1.8 {d4}, [r5], r3 //save pix2 + 1 + vld1.8 {d3}, [r4], r3 //save pix2 - 1 + vld1.8 {d4}, [r5], r3 //save pix2 + 1 - //Do the SAD for 8 bytes - vabdl.u8 q15, d0, d1 - vabdl.u8 q14, d0, d2 - vabdl.u8 q13, d0, d3 - vabdl.u8 q12, d0, d4 + //Do the SAD for 8 bytes + vabdl.u8 q15, d0, d1 + vabdl.u8 q14, d0, d2 + vabdl.u8 q13, d0, d3 + vabdl.u8 q12, d0, d4 - mov lr, #15 + mov lr, #15 pixel_sad_4_8x16_loop_0: //Loading a horizontal line data (8 bytes) - vld1.8 {d0}, [r0], r1 //save pix1 - vmov.8 d1, d6 //save pix2 - stride - vmov.8 d6, d2 - vld1.8 {d2}, [r2], r3 //save pix2 + stride - vld1.8 {d3}, [r4], r3 //save pix2 - 1 - vabal.u8 q15, d0, d1 + vld1.8 {d0}, [r0], r1 //save pix1 + vmov.8 d1, d6 //save pix2 - stride + vmov.8 d6, d2 + vld1.8 {d2}, [r2], r3 //save pix2 + stride + vld1.8 {d3}, [r4], r3 //save pix2 - 1 + vabal.u8 q15, d0, d1 - vld1.8 {d4}, [r5], r3 //save pix2 + 1 - //Do the SAD for 8 bytes - vabal.u8 q14, d0, d2 - vabal.u8 q13, d0, d3 - vabal.u8 q12, d0, d4 + vld1.8 {d4}, [r5], r3 //save pix2 + 1 + //Do the SAD for 8 bytes + vabal.u8 q14, d0, d2 + vabal.u8 q13, d0, d3 + vabal.u8 q12, d0, d4 subs lr, #1 - bne pixel_sad_4_8x16_loop_0 + bne pixel_sad_4_8x16_loop_0 //Save SAD to 'r0' - ldr r0, [sp, #12] + ldr r0, [sp, #12] - vadd.u16 d0, d30, d31 - vadd.u16 d1, d28, d29 - vadd.u16 d2, d26, d27 - vadd.u16 d3, d24, d25 + vadd.u16 d0, d30, d31 + vadd.u16 d1, d28, d29 + vadd.u16 d2, d26, d27 + vadd.u16 d3, d24, d25 - vpaddl.u16 q0, q0 - vpaddl.u16 q1, q1 + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 - vpaddl.u32 q0, q0 - vpaddl.u32 q1, q1 + vpaddl.u32 q0, q0 + vpaddl.u32 q1, q1 - vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] - ldmia sp!, {r4-r5, lr} + ldmia sp!, {r4-r5, lr} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon - stmdb sp!, 
{r4-r5, lr} + stmdb sp!, {r4-r5, lr} - //Generate the pix2 start addr - sub r4, r2, #1 - add r5, r2, #1 - sub r2, r3 + //Generate the pix2 start addr + sub r4, r2, #1 + add r5, r2, #1 + sub r2, r3 //Loading a horizontal line data (8 bytes) - vld1.8 {d0}, [r0], r1 //save pix1 + vld1.8 {d0}, [r0], r1 //save pix1 - vld1.8 {d1}, [r2], r3 //save pix2 - stride - vld1.8 {d6}, [r2], r3 //save pix2 - vld1.8 {d2}, [r2], r3 //save pix2 + stride + vld1.8 {d1}, [r2], r3 //save pix2 - stride + vld1.8 {d6}, [r2], r3 //save pix2 + vld1.8 {d2}, [r2], r3 //save pix2 + stride - vld1.8 {d3}, [r4], r3 //save pix2 - 1 - vld1.8 {d4}, [r5], r3 //save pix2 + 1 + vld1.8 {d3}, [r4], r3 //save pix2 - 1 + vld1.8 {d4}, [r5], r3 //save pix2 + 1 - //Do the SAD for 8 bytes - vabdl.u8 q15, d0, d1 - vabdl.u8 q14, d0, d2 - vabdl.u8 q13, d0, d3 - vabdl.u8 q12, d0, d4 + //Do the SAD for 8 bytes + vabdl.u8 q15, d0, d1 + vabdl.u8 q14, d0, d2 + vabdl.u8 q13, d0, d3 + vabdl.u8 q12, d0, d4 - mov lr, #7 + mov lr, #7 pixel_sad_4_8x8_loop_0: //Loading a horizontal line data (8 bytes) - vld1.8 {d0}, [r0], r1 //save pix1 - vmov.8 d1, d6 //save pix2 - stride - vmov.8 d6, d2 - vld1.8 {d2}, [r2], r3 //save pix2 + stride - vld1.8 {d3}, [r4], r3 //save pix2 - 1 - vabal.u8 q15, d0, d1 + vld1.8 {d0}, [r0], r1 //save pix1 + vmov.8 d1, d6 //save pix2 - stride + vmov.8 d6, d2 + vld1.8 {d2}, [r2], r3 //save pix2 + stride + vld1.8 {d3}, [r4], r3 //save pix2 - 1 + vabal.u8 q15, d0, d1 - vld1.8 {d4}, [r5], r3 //save pix2 + 1 - //Do the SAD for 8 bytes - vabal.u8 q14, d0, d2 - vabal.u8 q13, d0, d3 - vabal.u8 q12, d0, d4 + vld1.8 {d4}, [r5], r3 //save pix2 + 1 + //Do the SAD for 8 bytes + vabal.u8 q14, d0, d2 + vabal.u8 q13, d0, d3 + vabal.u8 q12, d0, d4 subs lr, #1 - bne pixel_sad_4_8x8_loop_0 + bne pixel_sad_4_8x8_loop_0 //Save SAD to 'r0' - ldr r0, [sp, #12] + ldr r0, [sp, #12] - vadd.u16 d0, d30, d31 - vadd.u16 d1, d28, d29 - vadd.u16 d2, d26, d27 - vadd.u16 d3, d24, d25 + vadd.u16 d0, d30, d31 + vadd.u16 d1, d28, d29 + vadd.u16 d2, d26, d27 + vadd.u16 d3, d24, d25 - vpaddl.u16 q0, q0 - vpaddl.u16 q1, q1 + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 - vpaddl.u32 q0, q0 - vpaddl.u32 q1, q1 + vpaddl.u32 q0, q0 + vpaddl.u32 q1, q1 - vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] - ldmia sp!, {r4-r5, lr} + ldmia sp!, {r4-r5, lr} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon - vld1.32 {d0[0]}, [r0], r1 - vld1.32 {d0[1]}, [r0], r1 - vld1.32 {d1[0]}, [r0], r1 - vld1.32 {d1[1]}, [r0] + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d1[0]}, [r0], r1 + vld1.32 {d1[1]}, [r0] - sub r0, r2, r3 - vld1.32 {d2[0]}, [r0], r3 - vld1.32 {d2[1]}, [r0], r3 - vld1.32 {d3[0]}, [r0], r3 - vld1.32 {d3[1]}, [r0], r3 - vld1.32 {d4[0]}, [r0], r3 - vld1.32 {d4[1]}, [r0] + sub r0, r2, r3 + vld1.32 {d2[0]}, [r0], r3 + vld1.32 {d2[1]}, [r0], r3 + vld1.32 {d3[0]}, [r0], r3 + vld1.32 {d3[1]}, [r0], r3 + vld1.32 {d4[0]}, [r0], r3 + vld1.32 {d4[1]}, [r0] - sub r0, r2, #1 - vld1.32 {d5[0]}, [r0], r3 - vld1.32 {d5[1]}, [r0], r3 - vld1.32 {d6[0]}, [r0], r3 - vld1.32 {d6[1]}, [r0] + sub r0, r2, #1 + vld1.32 {d5[0]}, [r0], r3 + vld1.32 {d5[1]}, [r0], r3 + vld1.32 {d6[0]}, [r0], r3 + vld1.32 {d6[1]}, [r0] - add r0, r2, #1 - vld1.32 {d7[0]}, [r0], r3 - vld1.32 {d7[1]}, [r0], r3 - vld1.32 {d8[0]}, [r0], r3 - vld1.32 {d8[1]}, [r0] + add r0, r2, #1 + vld1.32 {d7[0]}, [r0], r3 + vld1.32 {d7[1]}, [r0], r3 + vld1.32 {d8[0]}, [r0], r3 + vld1.32 {d8[1]}, [r0] - vabdl.u8 q15, d0, d2 - vabdl.u8 q14, d1, d3 + vabdl.u8 q15, d0, d2 + 
vabdl.u8 q14, d1, d3 - vabdl.u8 q13, d0, d3 - vabdl.u8 q12, d1, d4 + vabdl.u8 q13, d0, d3 + vabdl.u8 q12, d1, d4 - vabdl.u8 q11, d0, d5 - vabdl.u8 q10, d1, d6 + vabdl.u8 q11, d0, d5 + vabdl.u8 q10, d1, d6 - vabdl.u8 q9, d0, d7 - vabdl.u8 q8, d1, d8 + vabdl.u8 q9, d0, d7 + vabdl.u8 q8, d1, d8 - //Save SAD to 'r4' - ldr r0, [sp] - vadd.u16 q0, q14, q15 - vadd.u16 q1, q12, q13 - vadd.u16 q2, q10, q11 - vadd.u16 q3, q8 , q9 + //Save SAD to 'r4' + ldr r0, [sp] + vadd.u16 q0, q14, q15 + vadd.u16 q1, q12, q13 + vadd.u16 q2, q10, q11 + vadd.u16 q3, q8 , q9 - vadd.u16 d0, d1 - vadd.u16 d1, d2, d3 - vadd.u16 d2, d4, d5 - vadd.u16 d3, d6, d7 + vadd.u16 d0, d1 + vadd.u16 d1, d2, d3 + vadd.u16 d2, d4, d5 + vadd.u16 d3, d6, d7 - vpaddl.u16 q0, q0 - vpaddl.u16 q1, q1 + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 - vpaddl.u32 q0, q0 - vpaddl.u32 q1, q1 + vpaddl.u32 q0, q0 + vpaddl.u32 q1, q1 - vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0] WELS_ASM_FUNC_END @@ -834,16 +834,16 @@ WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon //Load the pix1 data --- 16 bytes - vld1.32 {d0[0]}, [r0], r1 - vld1.32 {d0[1]}, [r0], r1 - vld1.32 {d1[0]}, [r0], r1 - vld1.32 {d1[1]}, [r0] + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d1[0]}, [r0], r1 + vld1.32 {d1[1]}, [r0] //Load the pix2 data --- 16 bytes - vld1.32 {d2[0]}, [r2], r3 - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d3[0]}, [r2], r3 - vld1.32 {d3[1]}, [r2] + vld1.32 {d2[0]}, [r2], r3 + vld1.32 {d2[1]}, [r2], r3 + vld1.32 {d3[0]}, [r2], r3 + vld1.32 {d3[1]}, [r2] //Get the difference vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7} @@ -874,7 +874,7 @@ WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon vpaddl.u16 d0, d0 vpaddl.u32 d0, d0 - vmov.u32 r0, d0[0] + vmov.u32 r0, d0[0] WELS_ASM_FUNC_END diff --git a/codec/encoder/core/arm/reconstruct_neon.S b/codec/encoder/core/arm/reconstruct_neon.S index 0450a036..b36844ba 100644 --- a/codec/encoder/core/arm/reconstruct_neon.S +++ b/codec/encoder/core/arm/reconstruct_neon.S @@ -36,1115 +36,1115 @@ #ifdef __APPLE__ .macro LOAD_4x4_DATA_FOR_DCT -// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride - vld2.16 {$0[0],$1[0]}, [$4], $5 - vld2.16 {$2[0],$3[0]}, [$6], $7 - vld2.16 {$0[1],$1[1]}, [$4], $5 - vld2.16 {$2[1],$3[1]}, [$6], $7 +// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride + vld2.16 {$0[0],$1[0]}, [$4], $5 + vld2.16 {$2[0],$3[0]}, [$6], $7 + vld2.16 {$0[1],$1[1]}, [$4], $5 + vld2.16 {$2[1],$3[1]}, [$6], $7 - vld2.16 {$0[2],$1[2]}, [$4], $5 - vld2.16 {$2[2],$3[2]}, [$6], $7 - vld2.16 {$0[3],$1[3]}, [$4], $5 - vld2.16 {$2[3],$3[3]}, [$6], $7 -// } + vld2.16 {$0[2],$1[2]}, [$4], $5 + vld2.16 {$2[2],$3[2]}, [$6], $7 + vld2.16 {$0[3],$1[3]}, [$4], $5 + vld2.16 {$2[3],$3[3]}, [$6], $7 +// } .endm .macro LOAD_8x8_DATA_FOR_DCT -// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride - vld1.64 {$0}, [$8], r2 - vld1.64 {$4}, [$9], r4 - vld1.64 {$1}, [$8], r2 - vld1.64 {$5}, [$9], r4 +// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride + vld1.64 {$0}, [$8], r2 + vld1.64 {$4}, [$9], r4 + vld1.64 {$1}, [$8], r2 + vld1.64 {$5}, [$9], r4 - vld1.64 {$2}, [$8], r2 - vld1.64 {$6}, [$9], r4 - vld1.64 {$3}, [$8], r2 - vld1.64 {$7}, [$9], r4 -// } + vld1.64 {$2}, [$8], r2 + vld1.64 {$6}, [$9], r4 + vld1.64 {$3}, [$8], r2 + vld1.64 {$7}, [$9], r4 +// } .endm .macro DCT_ROW_TRANSFORM_TOTAL_16BITS -// { // input: src_d[0]~[3], working: [4]~[7] - vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3]; - vsub.s16 $7, $0, $3 //int16 s[3] = 
data[i] - data[i3]; - vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2]; - vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2]; +// { // input: src_d[0]~[3], working: [4]~[7] + vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3]; + vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3]; + vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2]; + vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2]; - vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1]; - vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1]; - vshl.s16 $1, $7, #1 - vshl.s16 $3, $6, #1 - vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2]; - vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1); -// } + vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1]; + vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1]; + vshl.s16 $1, $7, #1 + vshl.s16 $3, $6, #1 + vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2]; + vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1); +// } .endm .macro MATRIX_TRANSFORM_EACH_16BITS -// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] - vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] - vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] - vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] - vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] -// } +// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] + vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] + vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] + vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] + vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] +// } .endm -.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef; -// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1 - veor.s16 $6, $6 // init 0 , and keep 0; - vaba.s16 $1, $0, $6 // f + abs(coef - 0) - vmull.s16 $7, $2, $4 - vmull.s16 $8, $3, $5 - vshr.s32 $7, #16 - vshr.s32 $8, #16 - vmovn.s32 $2, $7 - vmovn.s32 $3, $8 +.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef; +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1 + veor.s16 $6, $6 // init 0 , and keep 0; + vaba.s16 $1, $0, $6 // f + abs(coef - 0) + vmull.s16 $7, $2, $4 + vmull.s16 $8, $3, $5 + vshr.s32 $7, #16 + vshr.s32 $8, #16 + vmovn.s32 $2, $7 + vmovn.s32 $3, $8 - vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111 - vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 $6, #1 - vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x -// } + vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111 + vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 $6, #1 + vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x +// } .endm -.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef; -// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1 - veor.s16 $6, $6 // init 0 , and keep 0; - vaba.s16 $1, $0, $6 // f + abs(coef - 0) - vmull.s16 $7, $2, $4 - vmull.s16 $8, $3, $5 - vshr.s32 $7, #16 - vshr.s32 $8, #16 - vmovn.s32 $2, $7 - vmovn.s32 $3, $8 +.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef; +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1 + veor.s16 $6, $6 // init 0 , and keep 0; + vaba.s16 $1, $0, $6 // f + abs(coef - 0) + vmull.s16 $7, $2, $4 + vmull.s16 $8, $3, $5 + vshr.s32 $7, #16 + vshr.s32 $8, #16 + vmovn.s32 $2, $7 + vmovn.s32 $3, $8 - vcgt.s16 $7, $0, #0 
// if true, location of coef == 11111111 - vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 $6, #1 - vmax.s16 $9, $2, $3 - vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x -// } + vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111 + vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 $6, #1 + vmax.s16 $9, $2, $3 + vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x +// } .endm -.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef; -// { // input: coef, ff (dst), mf , working_d (all 0), working_q - vaba.s16 $1, $0, $3 // f + abs(coef - 0) - vmull.s16 $4, $1, $2 // *= mf - vshr.s32 $4, #16 - vmovn.s32 $1, $4 // >> 16 +.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef; +// { // input: coef, ff (dst), mf , working_d (all 0), working_q + vaba.s16 $1, $0, $3 // f + abs(coef - 0) + vmull.s16 $4, $1, $2 // *= mf + vshr.s32 $4, #16 + vmovn.s32 $1, $4 // >> 16 - vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111 - vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 $3, #1 - vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x -// } + vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111 + vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 $3, #1 + vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x +// } .endm .macro DC_ZERO_COUNT_IN_DUALWORD -// { // input: coef, dst_d, working_d (all 0x01) - vceq.s16 $1, $0, #0 - vand.s16 $1, $2 - vpadd.s16 $1, $1, $1 - vpadd.s16 $1, $1, $1 -// } +// { // input: coef, dst_d, working_d (all 0x01) + vceq.s16 $1, $0, #0 + vand.s16 $1, $2 + vpadd.s16 $1, $1, $1 + vpadd.s16 $1, $1, $1 +// } .endm .macro SELECT_MAX_IN_ABS_COEF -// { // input: coef_0, coef_1, max_q (identy to follow two) - vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4 - vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3] - vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] -// } +// { // input: coef_0, coef_1, max_q (identy to follow two) + vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4 + vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3] + vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] +// } .endm .macro ZERO_COUNT_IN_2_QUARWORD -// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q - vceq.s16 $0, #0 - vceq.s16 $1, #0 - vand.s16 $0, $2 - vand.s16 $1, $2 +// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q + vceq.s16 $0, #0 + vceq.s16 $1, #0 + vand.s16 $0, $2 + vand.s16 $1, $2 - vpadd.s16 $3, $3, $5 - vpadd.s16 $4, $4, $6 - vpadd.s16 $3, $3, $4 // 8-->4 - vpadd.s16 $3, $3, $3 - vpadd.s16 $3, $3, $3 -// } + vpadd.s16 $3, $3, $5 + vpadd.s16 $4, $4, $6 + vpadd.s16 $3, $3, $4 // 8-->4 + vpadd.s16 $3, $3, $3 + vpadd.s16 $3, $3, $3 +// } .endm .macro HDM_QUANT_2x2_TOTAL_16BITS -// { // input: src_d[0]~[3], working_d, dst_d - vshr.s64 $1, $0, #32 - vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; - vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; - vtrn.s16 $2, $1 - vtrn.s32 $2, $1 -// } +// { // input: src_d[0]~[3], working_d, dst_d + vshr.s64 $1, $0, #32 + vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; + vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; + vtrn.s16 $2, $1 + vtrn.s32 $2, $1 +// } .endm .macro IHDM_4x4_TOTAL_16BITS -// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2 - vshr.s64 $1, $0, #32 - vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] 
+ rs[3]; - vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3]; - vtrn.s16 $2, $1 - vrev32.16 $1, $1 - vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3]; +// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2 + vshr.s64 $1, $0, #32 + vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3]; + vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3]; + vtrn.s16 $2, $1 + vrev32.16 $1, $1 + vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3]; - vrev64.16 $1, $2 - vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2]; - vsub.s16 $1, $2, $1 - vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3]; - vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3]; -// } + vrev64.16 $1, $2 + vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2]; + vsub.s16 $1, $2, $1 + vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3]; + vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3]; +// } .endm .macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP -// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1; - vmovl.u8 $4,$0 - vmovl.u8 $5,$1 - vadd.s16 $4,$2 - vadd.s16 $5,$3 - vqmovun.s16 $0,$4 - vqmovun.s16 $1,$5 -// } +// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1; + vmovl.u8 $4,$0 + vmovl.u8 $5,$1 + vadd.s16 $4,$2 + vadd.s16 $5,$3 + vqmovun.s16 $0,$4 + vqmovun.s16 $1,$5 +// } .endm .macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS -// { // input: src_d[0]~[3], output: e_d[0]~[3]; - vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2]; - vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2]; - vshr.s16 $6, $1, #1 - vshr.s16 $7, $3, #1 - vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3]; - vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1); -// } +// { // input: src_d[0]~[3], output: e_d[0]~[3]; + vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2]; + vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2]; + vshr.s16 $6, $1, #1 + vshr.s16 $7, $3, #1 + vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3]; + vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1); +// } .endm -.macro TRANSFORM_TOTAL_16BITS // both row & col transform used -// { // output: f_q[0]~[3], input: e_q[0]~[3]; - vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; - vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; - vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; - vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; -// } +.macro TRANSFORM_TOTAL_16BITS // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } .endm .macro ROW_TRANSFORM_0_STEP -// { // input: src_d[0]~[3], output: e_q[0]~[3]; - vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; - vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; - vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3]; - vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3]; -// } +// { // input: src_d[0]~[3], output: e_q[0]~[3]; + vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; + vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3]; + vaddl.s16 $7, $1, $3 //int32 
e[i][3] = src[1] + src[3]; +// } .endm .macro ROW_TRANSFORM_1_STEP -// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 - vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; - vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; - vshr.s16 $8, $1, #1 - vshr.s16 $9, $3, #1 - vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3]; - vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1); -// } +// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 + vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; + vshr.s16 $8, $1, #1 + vshr.s16 $9, $3, #1 + vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3]; + vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1); +// } .endm -.macro TRANSFORM_4BYTES // both row & col transform used -// { // output: f_q[0]~[3], input: e_q[0]~[3]; - vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; - vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; - vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; - vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; -// } +.macro TRANSFORM_4BYTES // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } .endm .macro COL_TRANSFORM_0_STEP -// { // input: src_q[0]~[3], output: e_q[0]~[3]; - vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; - vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; - vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; - vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); -// } +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; + vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } .endm .macro COL_TRANSFORM_1_STEP -// { // input: src_q[0]~[3], output: e_q[0]~[3]; - vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; - vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; - vshr.s32 $6, $1, #1 - vshr.s32 $7, $3, #1 - vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; - vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); -// } +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; + vshr.s32 $6, $1, #1 + vshr.s32 $7, $3, #1 + vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } .endm #else .macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride - vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5 - vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7 - vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5 - vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7 +// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride + vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5 + vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7 + vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5 + vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7 - vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5 - vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7 - vld2.16 
{\arg0[3],\arg1[3]}, [\arg4], \arg5 - vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7 -// } + vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5 + vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7 + vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5 + vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7 +// } .endm .macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 -// { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride - vld1.64 {\arg0}, [\arg8], r2 - vld1.64 {\arg4}, [\arg9], r4 - vld1.64 {\arg1}, [\arg8], r2 - vld1.64 {\arg5}, [\arg9], r4 +// { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride + vld1.64 {\arg0}, [\arg8], r2 + vld1.64 {\arg4}, [\arg9], r4 + vld1.64 {\arg1}, [\arg8], r2 + vld1.64 {\arg5}, [\arg9], r4 - vld1.64 {\arg2}, [\arg8], r2 - vld1.64 {\arg6}, [\arg9], r4 - vld1.64 {\arg3}, [\arg8], r2 - vld1.64 {\arg7}, [\arg9], r4 -// } + vld1.64 {\arg2}, [\arg8], r2 + vld1.64 {\arg6}, [\arg9], r4 + vld1.64 {\arg3}, [\arg8], r2 + vld1.64 {\arg7}, [\arg9], r4 +// } .endm .macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_d[0]~[3], working: [4]~[7] - vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3]; - vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3]; - vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2]; - vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2]; +// { // input: src_d[0]~[3], working: [4]~[7] + vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3]; + vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3]; + vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2]; + vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2]; - vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1]; - vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1]; - vshl.s16 \arg1, \arg7, #1 - vshl.s16 \arg3, \arg6, #1 - vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2]; - vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1); -// } + vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1]; + vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1]; + vshl.s16 \arg1, \arg7, #1 + vshl.s16 \arg3, \arg6, #1 + vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2]; + vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1); +// } .endm .macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3 -// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] - vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] - vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] - vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] - vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] -// } +// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15] + vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] + vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] + vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] + vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] +// } .endm .macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 -// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1 - veor.s16 \arg6, \arg6 // init 0 , and keep 0; - vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0) - vmull.s16 \arg7, \arg2, \arg4 - vmull.s16 \arg8, \arg3, 
\arg5 - vshr.s32 \arg7, #16 - vshr.s32 \arg8, #16 - vmovn.s32 \arg2, \arg7 - vmovn.s32 \arg3, \arg8 +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1 + veor.s16 \arg6, \arg6 // init 0 , and keep 0; + vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0) + vmull.s16 \arg7, \arg2, \arg4 + vmull.s16 \arg8, \arg3, \arg5 + vshr.s32 \arg7, #16 + vshr.s32 \arg8, #16 + vmovn.s32 \arg2, \arg7 + vmovn.s32 \arg3, \arg8 - vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111 - vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 \arg6, #1 - vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x -// } + vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111 + vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 \arg6, #1 + vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x +// } .endm .macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 -// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1 - veor.s16 \arg6, \arg6 // init 0 , and keep 0; - vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0) - vmull.s16 \arg7, \arg2, \arg4 - vmull.s16 \arg8, \arg3, \arg5 - vshr.s32 \arg7, #16 - vshr.s32 \arg8, #16 - vmovn.s32 \arg2, \arg7 - vmovn.s32 \arg3, \arg8 +// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1 + veor.s16 \arg6, \arg6 // init 0 , and keep 0; + vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0) + vmull.s16 \arg7, \arg2, \arg4 + vmull.s16 \arg8, \arg3, \arg5 + vshr.s32 \arg7, #16 + vshr.s32 \arg8, #16 + vmovn.s32 \arg2, \arg7 + vmovn.s32 \arg3, \arg8 - vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111 - vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 \arg6, #1 - vmax.s16 \arg9, \arg2, \arg3 - vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x -// } + vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111 + vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 \arg6, #1 + vmax.s16 \arg9, \arg2, \arg3 + vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x +// } .endm .macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4 -// { // input: coef, ff (dst), mf , working_d (all 0), working_q - vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0) - vmull.s16 \arg4, \arg1, \arg2 // *= mf - vshr.s32 \arg4, #16 - vmovn.s32 \arg1, \arg4 // >> 16 +// { // input: coef, ff (dst), mf , working_d (all 0), working_q + vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0) + vmull.s16 \arg4, \arg1, \arg2 // *= mf + vshr.s32 \arg4, #16 + vmovn.s32 \arg1, \arg4 // >> 16 - vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111 - vbif.s16 \arg3, \arg1, \arg2 // if (x<0) reserved part; else keep 0 untouched - vshl.s16 \arg3, #1 - vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x -// } + vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111 + vbif.s16 \arg3, \arg1, \arg2 // if (x<0) reserved part; else keep 0 untouched + vshl.s16 \arg3, #1 + vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x +// } .endm .macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2 -// { // input: coef, dst_d, working_d (all 0x01) - vceq.s16 \arg1, \arg0, #0 - vand.s16 \arg1, \arg2 - vpadd.s16 \arg1, \arg1, \arg1 - vpadd.s16 \arg1, \arg1, \arg1 -// } +// { // input: coef, dst_d, working_d (all 0x01) + vceq.s16 \arg1, \arg0, #0 + vand.s16 \arg1, \arg2 + vpadd.s16 \arg1, \arg1, \arg1 + vpadd.s16 \arg1, \arg1, \arg1 +// 
} .endm .macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4 -// { // input: coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1 - vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4 - vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3] - vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] -// } +// { // input: coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1 + vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4 + vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3] + vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] +// } .endm .macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6 -// { // input: coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q - vceq.s16 \arg0, #0 - vceq.s16 \arg1, #0 - vand.s16 \arg0, \arg2 - vand.s16 \arg1, \arg2 +// { // input: coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q + vceq.s16 \arg0, #0 + vceq.s16 \arg1, #0 + vand.s16 \arg0, \arg2 + vand.s16 \arg1, \arg2 - vpadd.s16 \arg3, \arg3, \arg5 - vpadd.s16 \arg4, \arg4, \arg6 - vpadd.s16 \arg3, \arg3, \arg4 // 8-->4 - vpadd.s16 \arg3, \arg3, \arg3 - vpadd.s16 \arg3, \arg3, \arg3 -// } + vpadd.s16 \arg3, \arg3, \arg5 + vpadd.s16 \arg4, \arg4, \arg6 + vpadd.s16 \arg3, \arg3, \arg4 // 8-->4 + vpadd.s16 \arg3, \arg3, \arg3 + vpadd.s16 \arg3, \arg3, \arg3 +// } .endm .macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2 -// { // input: src_d[0]~[3], working_d, dst_d - vshr.s64 \arg1, \arg0, #32 - vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; - vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; - vtrn.s16 \arg2, \arg1 - vtrn.s32 \arg2, \arg1 -// } +// { // input: src_d[0]~[3], working_d, dst_d + vshr.s64 \arg1, \arg0, #32 + vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48]; + vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48]; + vtrn.s16 \arg2, \arg1 + vtrn.s32 \arg2, \arg1 +// } .endm .macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2 -// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2 - vshr.s64 \arg1, \arg0, #32 - vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3]; - vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3]; - vtrn.s16 \arg2, \arg1 - vrev32.16 \arg1, \arg1 - vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3]; +// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2 + vshr.s64 \arg1, \arg0, #32 + vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3]; + vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3]; + vtrn.s16 \arg2, \arg1 + vrev32.16 \arg1, \arg1 + vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3]; - vrev64.16 \arg1, \arg2 - vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2]; - vsub.s16 \arg1, \arg2, \arg1 - vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3]; - vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3]; -// } + vrev64.16 \arg1, \arg2 + vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2]; + vsub.s16 \arg1, \arg2, \arg1 + vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3]; + vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - 
rs[2];[3] = rs[0] - rs[3]; +// } .endm .macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5 -// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1; - vmovl.u8 \arg4,\arg0 - vmovl.u8 \arg5,\arg1 - vadd.s16 \arg4,\arg2 - vadd.s16 \arg5,\arg3 - vqmovun.s16 \arg0,\arg4 - vqmovun.s16 \arg1,\arg5 -// } +// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1; + vmovl.u8 \arg4,\arg0 + vmovl.u8 \arg5,\arg1 + vadd.s16 \arg4,\arg2 + vadd.s16 \arg5,\arg3 + vqmovun.s16 \arg0,\arg4 + vqmovun.s16 \arg1,\arg5 +// } .endm .macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_d[0]~[3], output: e_d[0]~[3]; - vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2]; - vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2]; - vshr.s16 \arg6, \arg1, #1 - vshr.s16 \arg7, \arg3, #1 - vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3]; - vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1); -// } +// { // input: src_d[0]~[3], output: e_d[0]~[3]; + vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2]; + vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2]; + vshr.s16 \arg6, \arg1, #1 + vshr.s16 \arg7, \arg3, #1 + vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3]; + vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1); +// } .endm -.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used -// { // output: f_q[0]~[3], input: e_q[0]~[3]; - vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; - vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; - vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; - vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; -// } +.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } .endm .macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_d[0]~[3], output: e_q[0]~[3]; - vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; - vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; - vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3]; - vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3]; -// } +// { // input: src_d[0]~[3], output: e_q[0]~[3]; + vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; + vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3]; + vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3]; +// } .endm .macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 -// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9 - vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; - vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; - vshr.s16 \arg8, \arg1, #1 - vshr.s16 \arg9, \arg3, #1 - vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3]; - vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1); -// } +// { // input: src_d[0]~[3], output: 
e_q[0]~[3]; working: \arg8 \arg9 + vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; + vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; + vshr.s16 \arg8, \arg1, #1 + vshr.s16 \arg9, \arg3, #1 + vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3]; + vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1); +// } .endm -.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used -// { // output: f_q[0]~[3], input: e_q[0]~[3]; - vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; - vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; - vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; - vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; -// } +.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used +// { // output: f_q[0]~[3], input: e_q[0]~[3]; + vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; + vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; + vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; + vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; +// } .endm .macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_q[0]~[3], output: e_q[0]~[3]; - vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; - vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; - vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; - vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); -// } +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; + vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } .endm .macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 -// { // input: src_q[0]~[3], output: e_q[0]~[3]; - vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; - vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; - vshr.s32 \arg6, \arg1, #1 - vshr.s32 \arg7, \arg3, #1 - vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; - vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); -// } +// { // input: src_q[0]~[3], output: e_q[0]~[3]; + vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; + vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; + vshr.s32 \arg6, \arg1, #1 + vshr.s32 \arg7, \arg3, #1 + vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; + vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); +// } .endm #endif WELS_ASM_FUNC_BEGIN WelsDctT4_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4 + LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4 - vsubl.u8 q0, d4, d6 - vsubl.u8 q1, d5, d7 - vtrn.s32 q0, q1 - vswp d1, d2 + vsubl.u8 q0, d4, d6 + vsubl.u8 q1, d5, d7 + vtrn.s32 q0, q1 + vswp d1, d2 - // horizontal transform - DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + // horizontal transform + DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - // transform element - MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + // transform element + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 
- // vertical transform - DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + // vertical transform + DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - // transform element - MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + // transform element + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 - vst1.s16 {q0, q1}, [r0]! + vst1.s16 {q0, q1}, [r0]! - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3 + LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3 - vsubl.u8 q0, d16, d20 - vsubl.u8 q1, d17, d21 - vsubl.u8 q2, d18, d22 - vsubl.u8 q3, d19, d23 - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + vsubl.u8 q0, d16, d20 + vsubl.u8 q1, d17, d21 + vsubl.u8 q2, d18, d22 + vsubl.u8 q3, d19, d23 + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - // horizontal transform - DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + // horizontal transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - // transform element - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + // transform element + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - // vertical transform - DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + // vertical transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - vswp d1, d2 - vswp d5, d6 - vswp q1, q2 - vst1.s16 {q0, q1}, [r0]! - vst1.s16 {q2, q3}, [r0]! + vswp d1, d2 + vswp d5, d6 + vswp q1, q2 + vst1.s16 {q0, q1}, [r0]! + vst1.s16 {q2, q3}, [r0]! - //////////////// - LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3 + //////////////// + LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3 - vsubl.u8 q0, d16, d20 - vsubl.u8 q1, d17, d21 - vsubl.u8 q2, d18, d22 - vsubl.u8 q3, d19, d23 - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + vsubl.u8 q0, d16, d20 + vsubl.u8 q1, d17, d21 + vsubl.u8 q2, d18, d22 + vsubl.u8 q3, d19, d23 + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - // horizontal transform - DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + // horizontal transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - // transform element - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + // transform element + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - // vertical transform - DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + // vertical transform + DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - vswp d1, d2 - vswp d5, d6 - vswp q1, q2 - vst1.s16 {q0, q1}, [r0]! - vst1.s16 {q2, q3}, [r0]! + vswp d1, d2 + vswp d5, d6 + vswp q1, q2 + vst1.s16 {q0, q1}, [r0]! + vst1.s16 {q2, q3}, [r0]! - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon - vld1.s16 {q2}, [r1] - vld1.s16 {q0, q1}, [r0] - vld1.s16 {q3}, [r2] + vld1.s16 {q2}, [r1] + vld1.s16 {q0, q1}, [r0] + vld1.s16 {q3}, [r2] - vmov q8, q2 + vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11 - vst1.s16 {q2}, [r0]! + NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11 + vst1.s16 {q2}, [r0]! - NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 - vst1.s16 {q8}, [r0]! + NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 + vst1.s16 {q8}, [r0]! 
WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon - vld1.s16 {q0, q1}, [r0] - vdup.s16 q2, r1 // even ff range [0, 768] - vdup.s16 q3, r2 + vld1.s16 {q0, q1}, [r0] + vdup.s16 q2, r1 // even ff range [0, 768] + vdup.s16 q3, r2 - vmov q8, q2 + vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11 - vst1.s16 {q2}, [r0]! + NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11 + vst1.s16 {q2}, [r0]! - NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 - vst1.s16 {q8}, [r0]! + NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 + vst1.s16 {q8}, [r0]! WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon - vld1.s16 {q2}, [r1] - vld1.s16 {q3}, [r2] - mov r1, r0 + vld1.s16 {q2}, [r1] + vld1.s16 {q3}, [r2] + mov r1, r0 - vld1.s16 {q0, q1}, [r0]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11 - vst1.s16 {q8}, [r1]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 - vst1.s16 {q8}, [r1]! + vld1.s16 {q0, q1}, [r0]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11 + vst1.s16 {q8}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 + vst1.s16 {q8}, [r1]! - vld1.s16 {q0, q1}, [r0]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11 - vst1.s16 {q8}, [r1]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 - vst1.s16 {q8}, [r1]! + vld1.s16 {q0, q1}, [r0]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11 + vst1.s16 {q8}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 + vst1.s16 {q8}, [r1]! - vld1.s16 {q0, q1}, [r0]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11 - vst1.s16 {q8}, [r1]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 - vst1.s16 {q8}, [r1]! + vld1.s16 {q0, q1}, [r0]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11 + vst1.s16 {q8}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 + vst1.s16 {q8}, [r1]! - vld1.s16 {q0, q1}, [r0]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11 - vst1.s16 {q8}, [r1]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 - vst1.s16 {q8}, [r1]! + vld1.s16 {q0, q1}, [r0]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11 + vst1.s16 {q8}, [r1]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11 + vst1.s16 {q8}, [r1]! WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon - vld1.s16 {q2}, [r1] - vld1.s16 {q3}, [r2] - mov r1, r0 + vld1.s16 {q2}, [r1] + vld1.s16 {q3}, [r2] + mov r1, r0 - vld1.s16 {q0, q1}, [r0]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26 - vst1.s16 {q8}, [r1]! - vmov q12, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28 - vst1.s16 {q12}, [r1]! // then 1st 16 elem in d26 & d28 + vld1.s16 {q0, q1}, [r0]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26 + vst1.s16 {q8}, [r1]! + vmov q12, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28 + vst1.s16 {q12}, [r1]! // then 1st 16 elem in d26 & d28 - vld1.s16 {q0, q1}, [r0]! 
- vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27 - vst1.s16 {q8}, [r1]! - vmov q12, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29 - vst1.s16 {q12}, [r1]! // then 2nd 16 elem in d27 & d29 + vld1.s16 {q0, q1}, [r0]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27 + vst1.s16 {q8}, [r1]! + vmov q12, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29 + vst1.s16 {q12}, [r1]! // then 2nd 16 elem in d27 & d29 - SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1 - vst1.s32 {d0[0]}, [r3]! + SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1 + vst1.s32 {d0[0]}, [r3]! - /////////// - vld1.s16 {q0, q1}, [r0]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26 - vst1.s16 {q8}, [r1]! - vmov q12, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28 - vst1.s16 {q12}, [r1]! // then 3rd 16 elem in d26 & d28 + /////////// + vld1.s16 {q0, q1}, [r0]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26 + vst1.s16 {q8}, [r1]! + vmov q12, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28 + vst1.s16 {q12}, [r1]! // then 3rd 16 elem in d26 & d28 - vld1.s16 {q0, q1}, [r0]! - vmov q8, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27 - vst1.s16 {q8}, [r1]! - vmov q12, q2 - NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29 - vst1.s16 {q12}, [r1]! // then 4th 16 elem in d27 & d29 + vld1.s16 {q0, q1}, [r0]! + vmov q8, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27 + vst1.s16 {q8}, [r1]! + vmov q12, q2 + NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29 + vst1.s16 {q12}, [r1]! // then 4th 16 elem in d27 & d29 - SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1 - vst1.s32 {d0[0]}, [r3]! + SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1 + vst1.s32 {d0[0]}, [r3]! 
WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon - push {r2,r3} - mov r2, #64 // 2*16*sizeof(int16_t) - add r3, r1, #32 + push {r2,r3} + mov r2, #64 // 2*16*sizeof(int16_t) + add r3, r1, #32 - vld1.s16 {d0}, [r1], r2 - vld1.s16 {d1}, [r3], r2 - vld1.s16 {d4}, [r1], r2 - vld1.s16 {d5}, [r3], r2 - vld1.s16 {d2}, [r1], r2 - vld1.s16 {d3}, [r3], r2 - vld1.s16 {d6}, [r1], r2 - vld1.s16 {d7}, [r3], r2 - vtrn.16 q0, q2 // d0[0 4], d1[1 5] - vtrn.16 q1, q3 // d2[2 6], d3[3 7] + vld1.s16 {d0}, [r1], r2 + vld1.s16 {d1}, [r3], r2 + vld1.s16 {d4}, [r1], r2 + vld1.s16 {d5}, [r3], r2 + vld1.s16 {d2}, [r1], r2 + vld1.s16 {d3}, [r3], r2 + vld1.s16 {d6}, [r1], r2 + vld1.s16 {d7}, [r3], r2 + vtrn.16 q0, q2 // d0[0 4], d1[1 5] + vtrn.16 q1, q3 // d2[2 6], d3[3 7] - vld1.s16 {d16}, [r1], r2 - vld1.s16 {d17}, [r3], r2 - vld1.s16 {d20}, [r1], r2 - vld1.s16 {d21}, [r3], r2 - vld1.s16 {d18}, [r1], r2 - vld1.s16 {d19}, [r3], r2 - vld1.s16 {d22}, [r1], r2 - vld1.s16 {d23}, [r3], r2 - vtrn.16 q8, q10 //d16[08 12],d17[09 13] - vtrn.16 q9, q11 //d18[10 14],d19[11 15] + vld1.s16 {d16}, [r1], r2 + vld1.s16 {d17}, [r3], r2 + vld1.s16 {d20}, [r1], r2 + vld1.s16 {d21}, [r3], r2 + vld1.s16 {d18}, [r1], r2 + vld1.s16 {d19}, [r3], r2 + vld1.s16 {d22}, [r1], r2 + vld1.s16 {d23}, [r3], r2 + vtrn.16 q8, q10 //d16[08 12],d17[09 13] + vtrn.16 q9, q11 //d18[10 14],d19[11 15] - vtrn.32 q0, q8 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16] - vtrn.32 q1, q9 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80] + vtrn.32 q0, q8 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16] + vtrn.32 q1, q9 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80] - ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q8, q11, q10, q9 + ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q8, q11, q10, q9 - TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9 + TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9 - // transform element 32bits - vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] - vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] - vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] - vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] + // transform element 32bits + vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] + vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] + vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14] + vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] - COL_TRANSFORM_0_STEP q0, q1, q3, q2, q8, q11, q10, q9 + COL_TRANSFORM_0_STEP q0, q1, q3, q2, q8, q11, q10, q9 - TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9 + TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9 - vrshrn.s32 d16, q0, #1 - vrshrn.s32 d17, q1, #1 - vrshrn.s32 d18, q2, #1 - vrshrn.s32 d19, q3, #1 - vst1.16 {q8, q9}, [r0] //store + vrshrn.s32 d16, q0, #1 + vrshrn.s32 d17, q1, #1 + vrshrn.s32 d18, q2, #1 + vrshrn.s32 d19, q3, #1 + vst1.16 {q8, q9}, [r0] //store - pop {r2,r3} + pop {r2,r3} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon - vdup.s16 d1, r1 //ff - vdup.s16 d2, r2 //mf - veor d3, d3 + vdup.s16 d1, r1 //ff + vdup.s16 d2, r2 //mf + veor d3, d3 - mov r1, #32 - mov r2, r0 + mov r1, #32 + mov r2, r0 - vld1.s16 {d0[0]}, [r0], r1 //rs[00] - vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0 - vld1.s16 {d0[1]}, [r0], r1 //rs[16] - vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0 - vld1.s16 {d0[2]}, [r0], r1 //rs[32] - vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0 - vld1.s16 {d0[3]}, [r0], r1 //rs[48] - vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0 + vld1.s16 
{d0[0]}, [r0], r1 //rs[00] + vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0 + vld1.s16 {d0[1]}, [r0], r1 //rs[16] + vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0 + vld1.s16 {d0[2]}, [r0], r1 //rs[32] + vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0 + vld1.s16 {d0[3]}, [r0], r1 //rs[48] + vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0 - HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5 + HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5 - HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0 + HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0 - QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2 + QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2 - vst1.s16 d1, [r3] // store to dct - ldr r2, [sp, #0] - vst1.s16 d1, [r2] // store to block + vst1.s16 d1, [r3] // store to dct + ldr r2, [sp, #0] + vst1.s16 d1, [r2] // store to block - mov r1, #1 - vdup.s16 d3, r1 - DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3 + mov r1, #1 + vdup.s16 d3, r1 + DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3 - vmov r0, r1, d0 - and r0, #0x07 // range [0~4] - rsb r0, #4 + vmov r0, r1, d0 + and r0, #0x07 // range [0~4] + rsb r0, #4 WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon - vdup.s16 d3, r1 - mov r1, #32 - vld1.s16 {d0[0]}, [r0], r1 //rs[00] - vld1.s16 {d0[1]}, [r0], r1 //rs[16] - vld1.s16 {d0[2]}, [r0], r1 //rs[32] - vld1.s16 {d0[3]}, [r0], r1 //rs[48] + vdup.s16 d3, r1 + mov r1, #32 + vld1.s16 {d0[0]}, [r0], r1 //rs[00] + vld1.s16 {d0[1]}, [r0], r1 //rs[16] + vld1.s16 {d0[2]}, [r0], r1 //rs[32] + vld1.s16 {d0[3]}, [r0], r1 //rs[48] - HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2 + HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2 - HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0 + HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0 - vabs.s16 d1, d0 - vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold; - vmov r0, r1, d1 - orr r0, r1 + vabs.s16 d1, d0 + vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold; + vmov r0, r1, d1 + orr r0, r1 WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon - push {r1} - vld1.s16 {q0, q1}, [r0] - vmov.s16 q8, #1 + push {r1} + vld1.s16 {q0, q1}, [r0] + vmov.s16 q8, #1 - ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3 - vmov r0, r1, d0 - and r0, #0x1F // range [0~16] - rsb r0, #16 - pop {r1} + ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3 + vmov r0, r1, d0 + and r0, #0x1F // range [0~16] + rsb r0, #16 + pop {r1} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon - vld1.s16 {q0, q1}, [r0] - vld1.u16 {q2}, [r1] + vld1.s16 {q0, q1}, [r0] + vld1.u16 {q2}, [r1] - vmul.s16 q8, q0, q2 - vmul.s16 q9, q1, q2 + vmul.s16 q8, q0, q2 + vmul.s16 q9, q1, q2 - vst1.s16 {q8, q9}, [r0] + vst1.s16 {q8, q9}, [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon - vld1.u16 {q12}, [r1] - mov r1, r0 - vld1.s16 {q0, q1}, [r0]! - vld1.s16 {q2, q3}, [r0]! - vmul.s16 q0, q0, q12 - vld1.s16 {q8, q9}, [r0]! - vmul.s16 q1, q1, q12 - vld1.s16 {q10, q11}, [r0]! + vld1.u16 {q12}, [r1] + mov r1, r0 + vld1.s16 {q0, q1}, [r0]! + vld1.s16 {q2, q3}, [r0]! + vmul.s16 q0, q0, q12 + vld1.s16 {q8, q9}, [r0]! + vmul.s16 q1, q1, q12 + vld1.s16 {q10, q11}, [r0]! - vst1.s16 {q0, q1}, [r1]! + vst1.s16 {q0, q1}, [r1]! - vmul.s16 q2, q2, q12 - vmul.s16 q3, q3, q12 - vmul.s16 q8, q8, q12 - vst1.s16 {q2, q3}, [r1]! + vmul.s16 q2, q2, q12 + vmul.s16 q3, q3, q12 + vmul.s16 q8, q8, q12 + vst1.s16 {q2, q3}, [r1]! - vmul.s16 q9, q9, q12 - vmul.s16 q10, q10, q12 - vmul.s16 q11, q11, q12 - vst1.s16 {q8, q9}, [r1]! - vst1.s16 {q10, q11}, [r1]! 
+ vmul.s16 q9, q9, q12 + vmul.s16 q10, q10, q12 + vmul.s16 q11, q11, q12 + vst1.s16 {q8, q9}, [r1]! + vst1.s16 {q10, q11}, [r1]! WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon - vld1.s16 {q0, q1}, [r0] - vdup.s16 q8, r1 + vld1.s16 {q0, q1}, [r0] + vdup.s16 q8, r1 - IHDM_4x4_TOTAL_16BITS q0, q2, q3 - IHDM_4x4_TOTAL_16BITS q1, q2, q3 + IHDM_4x4_TOTAL_16BITS q0, q2, q3 + IHDM_4x4_TOTAL_16BITS q1, q2, q3 - MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 - IHDM_4x4_TOTAL_16BITS q0, q2, q3 - vmul.s16 q0, q8 + IHDM_4x4_TOTAL_16BITS q0, q2, q3 + vmul.s16 q0, q8 - IHDM_4x4_TOTAL_16BITS q1, q2, q3 - vmul.s16 q1, q8 + IHDM_4x4_TOTAL_16BITS q1, q2, q3 + vmul.s16 q1, q8 - MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 - vst1.s16 {q0, q1}, [r0] + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + vst1.s16 {q0, q1}, [r0] WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon - vld1.u32 {d16[0]}, [r2], r3 - push {r4} - ldr r4, [sp, #4] - vld1.u32 {d16[1]}, [r2], r3 + vld1.u32 {d16[0]}, [r2], r3 + push {r4} + ldr r4, [sp, #4] + vld1.u32 {d16[1]}, [r2], r3 - vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles! - vld1.u32 {d17[0]}, [r2], r3 - vld1.u32 {d17[1]}, [r2], r3 // q7 is pred + vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles! + vld1.u32 {d17[0]}, [r2], r3 + vld1.u32 {d17[1]}, [r2], r3 // q7 is pred - ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 + MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3 - ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 - vrshr.s16 d0, d0, #6 - vrshr.s16 d1, d1, #6 - vrshr.s16 d2, d2, #6 - vrshr.s16 d3, d3, #6 + TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7 + vrshr.s16 d0, d0, #6 + vrshr.s16 d1, d1, #6 + vrshr.s16 d2, d2, #6 + vrshr.s16 d3, d3, #6 - //after rounding 6, clip into [0, 255] - vmovl.u8 q2,d16 - vadd.s16 q0,q2 - vqmovun.s16 d16,q0 - vst1.32 {d16[0]},[r0],r1 - vst1.32 {d16[1]},[r0],r1 + //after rounding 6, clip into [0, 255] + vmovl.u8 q2,d16 + vadd.s16 q0,q2 + vqmovun.s16 d16,q0 + vst1.32 {d16[0]},[r0],r1 + vst1.32 {d16[1]},[r0],r1 - vmovl.u8 q2,d17 - vadd.s16 q1,q2 - vqmovun.s16 d17,q1 - vst1.32 {d17[0]},[r0],r1 - vst1.32 {d17[1]},[r0] + vmovl.u8 q2,d17 + vadd.s16 q1,q2 + vqmovun.s16 d17,q1 + vst1.32 {d17[0]},[r0],r1 + vst1.32 {d17[1]},[r0] - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon - vld1.u64 {d24}, [r2], r3 - push {r4} - ldr r4, [sp, #4] - vld1.u64 {d25}, [r2], r3 + vld1.u64 {d24}, [r2], r3 + push {r4} + ldr r4, [sp, #4] + vld1.u64 {d25}, [r2], r3 - vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles! - vld1.u64 {d26}, [r2], r3 - vld1.u64 {d27}, [r2], r3 - vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles! - vswp d1, d4 - vswp d3, d6 - vswp q1, q2 // q0~q3 + vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles! + vld1.u64 {d26}, [r2], r3 + vld1.u64 {d27}, [r2], r3 + vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles! 
+ vswp d1, d4 + vswp d3, d6 + vswp q1, q2 // q0~q3 - ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - vrshr.s16 q0, q0, #6 - vrshr.s16 q1, q1, #6 - vrshr.s16 q2, q2, #6 - vrshr.s16 q3, q3, #6 + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + vrshr.s16 q0, q0, #6 + vrshr.s16 q1, q1, #6 + vrshr.s16 q2, q2, #6 + vrshr.s16 q3, q3, #6 - //after rounding 6, clip into [0, 255] - vmovl.u8 q8,d24 - vadd.s16 q0,q8 - vqmovun.s16 d24,q0 - vst1.u8 {d24},[r0],r1 + //after rounding 6, clip into [0, 255] + vmovl.u8 q8,d24 + vadd.s16 q0,q8 + vqmovun.s16 d24,q0 + vst1.u8 {d24},[r0],r1 - vmovl.u8 q8,d25 - vadd.s16 q1,q8 - vqmovun.s16 d25,q1 - vst1.u8 {d25},[r0],r1 + vmovl.u8 q8,d25 + vadd.s16 q1,q8 + vqmovun.s16 d25,q1 + vst1.u8 {d25},[r0],r1 - vmovl.u8 q8,d26 - vadd.s16 q2,q8 - vqmovun.s16 d26,q2 - vst1.u8 {d26},[r0],r1 + vmovl.u8 q8,d26 + vadd.s16 q2,q8 + vqmovun.s16 d26,q2 + vst1.u8 {d26},[r0],r1 - vmovl.u8 q8,d27 - vadd.s16 q3,q8 - vqmovun.s16 d27,q3 - vst1.u8 {d27},[r0],r1 + vmovl.u8 q8,d27 + vadd.s16 q3,q8 + vqmovun.s16 d27,q3 + vst1.u8 {d27},[r0],r1 - vld1.u64 {d24}, [r2], r3 - vld1.u64 {d25}, [r2], r3 + vld1.u64 {d24}, [r2], r3 + vld1.u64 {d25}, [r2], r3 - vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles! - vld1.u64 {d26}, [r2], r3 - vld1.u64 {d27}, [r2], r3 - vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles! - vswp d1, d4 - vswp d3, d6 - vswp q1, q2 // q0~q3 + vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles! + vld1.u64 {d26}, [r2], r3 + vld1.u64 {d27}, [r2], r3 + vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles! 
+ vswp d1, d4 + vswp d3, d6 + vswp q1, q2 // q0~q3 - ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 + MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3 - ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 - vrshr.s16 q0, q0, #6 - vrshr.s16 q1, q1, #6 - vrshr.s16 q2, q2, #6 - vrshr.s16 q3, q3, #6 + TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11 + vrshr.s16 q0, q0, #6 + vrshr.s16 q1, q1, #6 + vrshr.s16 q2, q2, #6 + vrshr.s16 q3, q3, #6 - //after rounding 6, clip into [0, 255] - vmovl.u8 q8,d24 - vadd.s16 q0,q8 - vqmovun.s16 d24,q0 - vst1.u8 {d24},[r0],r1 + //after rounding 6, clip into [0, 255] + vmovl.u8 q8,d24 + vadd.s16 q0,q8 + vqmovun.s16 d24,q0 + vst1.u8 {d24},[r0],r1 - vmovl.u8 q8,d25 - vadd.s16 q1,q8 - vqmovun.s16 d25,q1 - vst1.u8 {d25},[r0],r1 + vmovl.u8 q8,d25 + vadd.s16 q1,q8 + vqmovun.s16 d25,q1 + vst1.u8 {d25},[r0],r1 - vmovl.u8 q8,d26 - vadd.s16 q2,q8 - vqmovun.s16 d26,q2 - vst1.u8 {d26},[r0],r1 + vmovl.u8 q8,d26 + vadd.s16 q2,q8 + vqmovun.s16 d26,q2 + vst1.u8 {d26},[r0],r1 - vmovl.u8 q8,d27 - vadd.s16 q3,q8 - vqmovun.s16 d27,q3 - vst1.u8 {d27},[r0],r1 + vmovl.u8 q8,d27 + vadd.s16 q3,q8 + vqmovun.s16 d27,q3 + vst1.u8 {d27},[r0],r1 - pop {r4} + pop {r4} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon - push {r4} - ldr r4, [sp, #4] + push {r4} + ldr r4, [sp, #4] - vld1.s16 {q8,q9}, [r4] - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 + vld1.s16 {q8,q9}, [r4] + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 - vdup.s16 d20, d16[0] - vdup.s16 d21, d16[1] - vdup.s16 d22, d16[2] - vdup.s16 d23, d16[3] + vdup.s16 d20, d16[0] + vdup.s16 d21, d16[1] + vdup.s16 d22, d16[2] + vdup.s16 d23, d16[3] - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vdup.s16 d20, d17[0] - vdup.s16 d21, d17[1] - vdup.s16 d22, d17[2] - vdup.s16 d23, d17[3] + vdup.s16 d20, d17[0] + vdup.s16 d21, d17[1] + vdup.s16 d22, d17[2] + vdup.s16 d23, d17[3] - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 
{q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vdup.s16 d20, d18[0] - vdup.s16 d21, d18[1] - vdup.s16 d22, d18[2] - vdup.s16 d23, d18[3] + vdup.s16 d20, d18[0] + vdup.s16 d21, d18[1] + vdup.s16 d22, d18[2] + vdup.s16 d23, d18[3] - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vdup.s16 d20, d19[0] - vdup.s16 d21, d19[1] - vdup.s16 d22, d19[2] - vdup.s16 d23, d19[3] + vdup.s16 d20, d19[0] + vdup.s16 d21, d19[1] + vdup.s16 d22, d19[2] + vdup.s16 d23, d19[3] - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - vld1.u8 {q0}, [r2], r3 - MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 - vst1.u8 {q0}, [r0], r1 + vld1.u8 {q0}, [r2], r3 + MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13 + vst1.u8 {q0}, [r0], r1 - pop {r4} + pop {r4} WELS_ASM_FUNC_END #endif diff --git a/codec/encoder/core/x86/coeff.asm b/codec/encoder/core/x86/coeff.asm index 1a637515..ccc9ded9 100644 --- a/codec/encoder/core/x86/coeff.asm +++ b/codec/encoder/core/x86/coeff.asm @@ -55,262 +55,262 @@ sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1 align 16 byte_1pos_table: - db 0,0,0,0,0,0,0,0, ;0 - db 0,0,0,0,0,0,0,1, ;1 - db 1,0,0,0,0,0,0,1, ;2 - db 1,0,0,0,0,0,0,2, ;3 - db 2,0,0,0,0,0,0,1, ;4 - db 2,0,0,0,0,0,0,2, ;5 - db 2,1,0,0,0,0,0,2, ;6 - db 2,1,0,0,0,0,0,3, ;7 - db 3,0,0,0,0,0,0,1, ;8 - db 3,0,0,0,0,0,0,2, ;9 - db 3,1,0,0,0,0,0,2, ;10 - db 3,1,0,0,0,0,0,3, ;11 - db 3,2,0,0,0,0,0,2, ;12 - db 3,2,0,0,0,0,0,3, ;13 - db 3,2,1,0,0,0,0,3, ;14 - db 3,2,1,0,0,0,0,4, ;15 - db 4,0,0,0,0,0,0,1, ;16 - db 4,0,0,0,0,0,0,2, ;17 - db 4,1,0,0,0,0,0,2, ;18 - db 4,1,0,0,0,0,0,3, ;19 - db 4,2,0,0,0,0,0,2, ;20 - db 
4,2,0,0,0,0,0,3, ;21 - db 4,2,1,0,0,0,0,3, ;22 - db 4,2,1,0,0,0,0,4, ;23 - db 4,3,0,0,0,0,0,2, ;24 - db 4,3,0,0,0,0,0,3, ;25 - db 4,3,1,0,0,0,0,3, ;26 - db 4,3,1,0,0,0,0,4, ;27 - db 4,3,2,0,0,0,0,3, ;28 - db 4,3,2,0,0,0,0,4, ;29 - db 4,3,2,1,0,0,0,4, ;30 - db 4,3,2,1,0,0,0,5, ;31 - db 5,0,0,0,0,0,0,1, ;32 - db 5,0,0,0,0,0,0,2, ;33 - db 5,1,0,0,0,0,0,2, ;34 - db 5,1,0,0,0,0,0,3, ;35 - db 5,2,0,0,0,0,0,2, ;36 - db 5,2,0,0,0,0,0,3, ;37 - db 5,2,1,0,0,0,0,3, ;38 - db 5,2,1,0,0,0,0,4, ;39 - db 5,3,0,0,0,0,0,2, ;40 - db 5,3,0,0,0,0,0,3, ;41 - db 5,3,1,0,0,0,0,3, ;42 - db 5,3,1,0,0,0,0,4, ;43 - db 5,3,2,0,0,0,0,3, ;44 - db 5,3,2,0,0,0,0,4, ;45 - db 5,3,2,1,0,0,0,4, ;46 - db 5,3,2,1,0,0,0,5, ;47 - db 5,4,0,0,0,0,0,2, ;48 - db 5,4,0,0,0,0,0,3, ;49 - db 5,4,1,0,0,0,0,3, ;50 - db 5,4,1,0,0,0,0,4, ;51 - db 5,4,2,0,0,0,0,3, ;52 - db 5,4,2,0,0,0,0,4, ;53 - db 5,4,2,1,0,0,0,4, ;54 - db 5,4,2,1,0,0,0,5, ;55 - db 5,4,3,0,0,0,0,3, ;56 - db 5,4,3,0,0,0,0,4, ;57 - db 5,4,3,1,0,0,0,4, ;58 - db 5,4,3,1,0,0,0,5, ;59 - db 5,4,3,2,0,0,0,4, ;60 - db 5,4,3,2,0,0,0,5, ;61 - db 5,4,3,2,1,0,0,5, ;62 - db 5,4,3,2,1,0,0,6, ;63 - db 6,0,0,0,0,0,0,1, ;64 - db 6,0,0,0,0,0,0,2, ;65 - db 6,1,0,0,0,0,0,2, ;66 - db 6,1,0,0,0,0,0,3, ;67 - db 6,2,0,0,0,0,0,2, ;68 - db 6,2,0,0,0,0,0,3, ;69 - db 6,2,1,0,0,0,0,3, ;70 - db 6,2,1,0,0,0,0,4, ;71 - db 6,3,0,0,0,0,0,2, ;72 - db 6,3,0,0,0,0,0,3, ;73 - db 6,3,1,0,0,0,0,3, ;74 - db 6,3,1,0,0,0,0,4, ;75 - db 6,3,2,0,0,0,0,3, ;76 - db 6,3,2,0,0,0,0,4, ;77 - db 6,3,2,1,0,0,0,4, ;78 - db 6,3,2,1,0,0,0,5, ;79 - db 6,4,0,0,0,0,0,2, ;80 - db 6,4,0,0,0,0,0,3, ;81 - db 6,4,1,0,0,0,0,3, ;82 - db 6,4,1,0,0,0,0,4, ;83 - db 6,4,2,0,0,0,0,3, ;84 - db 6,4,2,0,0,0,0,4, ;85 - db 6,4,2,1,0,0,0,4, ;86 - db 6,4,2,1,0,0,0,5, ;87 - db 6,4,3,0,0,0,0,3, ;88 - db 6,4,3,0,0,0,0,4, ;89 - db 6,4,3,1,0,0,0,4, ;90 - db 6,4,3,1,0,0,0,5, ;91 - db 6,4,3,2,0,0,0,4, ;92 - db 6,4,3,2,0,0,0,5, ;93 - db 6,4,3,2,1,0,0,5, ;94 - db 6,4,3,2,1,0,0,6, ;95 - db 6,5,0,0,0,0,0,2, ;96 - db 6,5,0,0,0,0,0,3, ;97 - db 6,5,1,0,0,0,0,3, ;98 - db 6,5,1,0,0,0,0,4, ;99 - db 6,5,2,0,0,0,0,3, ;100 - db 6,5,2,0,0,0,0,4, ;101 - db 6,5,2,1,0,0,0,4, ;102 - db 6,5,2,1,0,0,0,5, ;103 - db 6,5,3,0,0,0,0,3, ;104 - db 6,5,3,0,0,0,0,4, ;105 - db 6,5,3,1,0,0,0,4, ;106 - db 6,5,3,1,0,0,0,5, ;107 - db 6,5,3,2,0,0,0,4, ;108 - db 6,5,3,2,0,0,0,5, ;109 - db 6,5,3,2,1,0,0,5, ;110 - db 6,5,3,2,1,0,0,6, ;111 - db 6,5,4,0,0,0,0,3, ;112 - db 6,5,4,0,0,0,0,4, ;113 - db 6,5,4,1,0,0,0,4, ;114 - db 6,5,4,1,0,0,0,5, ;115 - db 6,5,4,2,0,0,0,4, ;116 - db 6,5,4,2,0,0,0,5, ;117 - db 6,5,4,2,1,0,0,5, ;118 - db 6,5,4,2,1,0,0,6, ;119 - db 6,5,4,3,0,0,0,4, ;120 - db 6,5,4,3,0,0,0,5, ;121 - db 6,5,4,3,1,0,0,5, ;122 - db 6,5,4,3,1,0,0,6, ;123 - db 6,5,4,3,2,0,0,5, ;124 - db 6,5,4,3,2,0,0,6, ;125 - db 6,5,4,3,2,1,0,6, ;126 - db 6,5,4,3,2,1,0,7, ;127 - db 7,0,0,0,0,0,0,1, ;128 - db 7,0,0,0,0,0,0,2, ;129 - db 7,1,0,0,0,0,0,2, ;130 - db 7,1,0,0,0,0,0,3, ;131 - db 7,2,0,0,0,0,0,2, ;132 - db 7,2,0,0,0,0,0,3, ;133 - db 7,2,1,0,0,0,0,3, ;134 - db 7,2,1,0,0,0,0,4, ;135 - db 7,3,0,0,0,0,0,2, ;136 - db 7,3,0,0,0,0,0,3, ;137 - db 7,3,1,0,0,0,0,3, ;138 - db 7,3,1,0,0,0,0,4, ;139 - db 7,3,2,0,0,0,0,3, ;140 - db 7,3,2,0,0,0,0,4, ;141 - db 7,3,2,1,0,0,0,4, ;142 - db 7,3,2,1,0,0,0,5, ;143 - db 7,4,0,0,0,0,0,2, ;144 - db 7,4,0,0,0,0,0,3, ;145 - db 7,4,1,0,0,0,0,3, ;146 - db 7,4,1,0,0,0,0,4, ;147 - db 7,4,2,0,0,0,0,3, ;148 - db 7,4,2,0,0,0,0,4, ;149 - db 7,4,2,1,0,0,0,4, ;150 - db 7,4,2,1,0,0,0,5, ;151 - db 7,4,3,0,0,0,0,3, ;152 - db 7,4,3,0,0,0,0,4, ;153 - db 7,4,3,1,0,0,0,4, ;154 - db 
7,4,3,1,0,0,0,5, ;155 - db 7,4,3,2,0,0,0,4, ;156 - db 7,4,3,2,0,0,0,5, ;157 - db 7,4,3,2,1,0,0,5, ;158 - db 7,4,3,2,1,0,0,6, ;159 - db 7,5,0,0,0,0,0,2, ;160 - db 7,5,0,0,0,0,0,3, ;161 - db 7,5,1,0,0,0,0,3, ;162 - db 7,5,1,0,0,0,0,4, ;163 - db 7,5,2,0,0,0,0,3, ;164 - db 7,5,2,0,0,0,0,4, ;165 - db 7,5,2,1,0,0,0,4, ;166 - db 7,5,2,1,0,0,0,5, ;167 - db 7,5,3,0,0,0,0,3, ;168 - db 7,5,3,0,0,0,0,4, ;169 - db 7,5,3,1,0,0,0,4, ;170 - db 7,5,3,1,0,0,0,5, ;171 - db 7,5,3,2,0,0,0,4, ;172 - db 7,5,3,2,0,0,0,5, ;173 - db 7,5,3,2,1,0,0,5, ;174 - db 7,5,3,2,1,0,0,6, ;175 - db 7,5,4,0,0,0,0,3, ;176 - db 7,5,4,0,0,0,0,4, ;177 - db 7,5,4,1,0,0,0,4, ;178 - db 7,5,4,1,0,0,0,5, ;179 - db 7,5,4,2,0,0,0,4, ;180 - db 7,5,4,2,0,0,0,5, ;181 - db 7,5,4,2,1,0,0,5, ;182 - db 7,5,4,2,1,0,0,6, ;183 - db 7,5,4,3,0,0,0,4, ;184 - db 7,5,4,3,0,0,0,5, ;185 - db 7,5,4,3,1,0,0,5, ;186 - db 7,5,4,3,1,0,0,6, ;187 - db 7,5,4,3,2,0,0,5, ;188 - db 7,5,4,3,2,0,0,6, ;189 - db 7,5,4,3,2,1,0,6, ;190 - db 7,5,4,3,2,1,0,7, ;191 - db 7,6,0,0,0,0,0,2, ;192 - db 7,6,0,0,0,0,0,3, ;193 - db 7,6,1,0,0,0,0,3, ;194 - db 7,6,1,0,0,0,0,4, ;195 - db 7,6,2,0,0,0,0,3, ;196 - db 7,6,2,0,0,0,0,4, ;197 - db 7,6,2,1,0,0,0,4, ;198 - db 7,6,2,1,0,0,0,5, ;199 - db 7,6,3,0,0,0,0,3, ;200 - db 7,6,3,0,0,0,0,4, ;201 - db 7,6,3,1,0,0,0,4, ;202 - db 7,6,3,1,0,0,0,5, ;203 - db 7,6,3,2,0,0,0,4, ;204 - db 7,6,3,2,0,0,0,5, ;205 - db 7,6,3,2,1,0,0,5, ;206 - db 7,6,3,2,1,0,0,6, ;207 - db 7,6,4,0,0,0,0,3, ;208 - db 7,6,4,0,0,0,0,4, ;209 - db 7,6,4,1,0,0,0,4, ;210 - db 7,6,4,1,0,0,0,5, ;211 - db 7,6,4,2,0,0,0,4, ;212 - db 7,6,4,2,0,0,0,5, ;213 - db 7,6,4,2,1,0,0,5, ;214 - db 7,6,4,2,1,0,0,6, ;215 - db 7,6,4,3,0,0,0,4, ;216 - db 7,6,4,3,0,0,0,5, ;217 - db 7,6,4,3,1,0,0,5, ;218 - db 7,6,4,3,1,0,0,6, ;219 - db 7,6,4,3,2,0,0,5, ;220 - db 7,6,4,3,2,0,0,6, ;221 - db 7,6,4,3,2,1,0,6, ;222 - db 7,6,4,3,2,1,0,7, ;223 - db 7,6,5,0,0,0,0,3, ;224 - db 7,6,5,0,0,0,0,4, ;225 - db 7,6,5,1,0,0,0,4, ;226 - db 7,6,5,1,0,0,0,5, ;227 - db 7,6,5,2,0,0,0,4, ;228 - db 7,6,5,2,0,0,0,5, ;229 - db 7,6,5,2,1,0,0,5, ;230 - db 7,6,5,2,1,0,0,6, ;231 - db 7,6,5,3,0,0,0,4, ;232 - db 7,6,5,3,0,0,0,5, ;233 - db 7,6,5,3,1,0,0,5, ;234 - db 7,6,5,3,1,0,0,6, ;235 - db 7,6,5,3,2,0,0,5, ;236 - db 7,6,5,3,2,0,0,6, ;237 - db 7,6,5,3,2,1,0,6, ;238 - db 7,6,5,3,2,1,0,7, ;239 - db 7,6,5,4,0,0,0,4, ;240 - db 7,6,5,4,0,0,0,5, ;241 - db 7,6,5,4,1,0,0,5, ;242 - db 7,6,5,4,1,0,0,6, ;243 - db 7,6,5,4,2,0,0,5, ;244 - db 7,6,5,4,2,0,0,6, ;245 - db 7,6,5,4,2,1,0,6, ;246 - db 7,6,5,4,2,1,0,7, ;247 - db 7,6,5,4,3,0,0,5, ;248 - db 7,6,5,4,3,0,0,6, ;249 - db 7,6,5,4,3,1,0,6, ;250 - db 7,6,5,4,3,1,0,7, ;251 - db 7,6,5,4,3,2,0,6, ;252 - db 7,6,5,4,3,2,0,7, ;253 - db 7,6,5,4,3,2,1,7, ;254 - db 7,6,5,4,3,2,1,8, ;255 + db 0,0,0,0,0,0,0,0, ;0 + db 0,0,0,0,0,0,0,1, ;1 + db 1,0,0,0,0,0,0,1, ;2 + db 1,0,0,0,0,0,0,2, ;3 + db 2,0,0,0,0,0,0,1, ;4 + db 2,0,0,0,0,0,0,2, ;5 + db 2,1,0,0,0,0,0,2, ;6 + db 2,1,0,0,0,0,0,3, ;7 + db 3,0,0,0,0,0,0,1, ;8 + db 3,0,0,0,0,0,0,2, ;9 + db 3,1,0,0,0,0,0,2, ;10 + db 3,1,0,0,0,0,0,3, ;11 + db 3,2,0,0,0,0,0,2, ;12 + db 3,2,0,0,0,0,0,3, ;13 + db 3,2,1,0,0,0,0,3, ;14 + db 3,2,1,0,0,0,0,4, ;15 + db 4,0,0,0,0,0,0,1, ;16 + db 4,0,0,0,0,0,0,2, ;17 + db 4,1,0,0,0,0,0,2, ;18 + db 4,1,0,0,0,0,0,3, ;19 + db 4,2,0,0,0,0,0,2, ;20 + db 4,2,0,0,0,0,0,3, ;21 + db 4,2,1,0,0,0,0,3, ;22 + db 4,2,1,0,0,0,0,4, ;23 + db 4,3,0,0,0,0,0,2, ;24 + db 4,3,0,0,0,0,0,3, ;25 + db 4,3,1,0,0,0,0,3, ;26 + db 4,3,1,0,0,0,0,4, ;27 + db 4,3,2,0,0,0,0,3, ;28 + db 4,3,2,0,0,0,0,4, ;29 + db 4,3,2,1,0,0,0,4, ;30 + db 4,3,2,1,0,0,0,5, ;31 + db 
5,0,0,0,0,0,0,1, ;32 + db 5,0,0,0,0,0,0,2, ;33 + db 5,1,0,0,0,0,0,2, ;34 + db 5,1,0,0,0,0,0,3, ;35 + db 5,2,0,0,0,0,0,2, ;36 + db 5,2,0,0,0,0,0,3, ;37 + db 5,2,1,0,0,0,0,3, ;38 + db 5,2,1,0,0,0,0,4, ;39 + db 5,3,0,0,0,0,0,2, ;40 + db 5,3,0,0,0,0,0,3, ;41 + db 5,3,1,0,0,0,0,3, ;42 + db 5,3,1,0,0,0,0,4, ;43 + db 5,3,2,0,0,0,0,3, ;44 + db 5,3,2,0,0,0,0,4, ;45 + db 5,3,2,1,0,0,0,4, ;46 + db 5,3,2,1,0,0,0,5, ;47 + db 5,4,0,0,0,0,0,2, ;48 + db 5,4,0,0,0,0,0,3, ;49 + db 5,4,1,0,0,0,0,3, ;50 + db 5,4,1,0,0,0,0,4, ;51 + db 5,4,2,0,0,0,0,3, ;52 + db 5,4,2,0,0,0,0,4, ;53 + db 5,4,2,1,0,0,0,4, ;54 + db 5,4,2,1,0,0,0,5, ;55 + db 5,4,3,0,0,0,0,3, ;56 + db 5,4,3,0,0,0,0,4, ;57 + db 5,4,3,1,0,0,0,4, ;58 + db 5,4,3,1,0,0,0,5, ;59 + db 5,4,3,2,0,0,0,4, ;60 + db 5,4,3,2,0,0,0,5, ;61 + db 5,4,3,2,1,0,0,5, ;62 + db 5,4,3,2,1,0,0,6, ;63 + db 6,0,0,0,0,0,0,1, ;64 + db 6,0,0,0,0,0,0,2, ;65 + db 6,1,0,0,0,0,0,2, ;66 + db 6,1,0,0,0,0,0,3, ;67 + db 6,2,0,0,0,0,0,2, ;68 + db 6,2,0,0,0,0,0,3, ;69 + db 6,2,1,0,0,0,0,3, ;70 + db 6,2,1,0,0,0,0,4, ;71 + db 6,3,0,0,0,0,0,2, ;72 + db 6,3,0,0,0,0,0,3, ;73 + db 6,3,1,0,0,0,0,3, ;74 + db 6,3,1,0,0,0,0,4, ;75 + db 6,3,2,0,0,0,0,3, ;76 + db 6,3,2,0,0,0,0,4, ;77 + db 6,3,2,1,0,0,0,4, ;78 + db 6,3,2,1,0,0,0,5, ;79 + db 6,4,0,0,0,0,0,2, ;80 + db 6,4,0,0,0,0,0,3, ;81 + db 6,4,1,0,0,0,0,3, ;82 + db 6,4,1,0,0,0,0,4, ;83 + db 6,4,2,0,0,0,0,3, ;84 + db 6,4,2,0,0,0,0,4, ;85 + db 6,4,2,1,0,0,0,4, ;86 + db 6,4,2,1,0,0,0,5, ;87 + db 6,4,3,0,0,0,0,3, ;88 + db 6,4,3,0,0,0,0,4, ;89 + db 6,4,3,1,0,0,0,4, ;90 + db 6,4,3,1,0,0,0,5, ;91 + db 6,4,3,2,0,0,0,4, ;92 + db 6,4,3,2,0,0,0,5, ;93 + db 6,4,3,2,1,0,0,5, ;94 + db 6,4,3,2,1,0,0,6, ;95 + db 6,5,0,0,0,0,0,2, ;96 + db 6,5,0,0,0,0,0,3, ;97 + db 6,5,1,0,0,0,0,3, ;98 + db 6,5,1,0,0,0,0,4, ;99 + db 6,5,2,0,0,0,0,3, ;100 + db 6,5,2,0,0,0,0,4, ;101 + db 6,5,2,1,0,0,0,4, ;102 + db 6,5,2,1,0,0,0,5, ;103 + db 6,5,3,0,0,0,0,3, ;104 + db 6,5,3,0,0,0,0,4, ;105 + db 6,5,3,1,0,0,0,4, ;106 + db 6,5,3,1,0,0,0,5, ;107 + db 6,5,3,2,0,0,0,4, ;108 + db 6,5,3,2,0,0,0,5, ;109 + db 6,5,3,2,1,0,0,5, ;110 + db 6,5,3,2,1,0,0,6, ;111 + db 6,5,4,0,0,0,0,3, ;112 + db 6,5,4,0,0,0,0,4, ;113 + db 6,5,4,1,0,0,0,4, ;114 + db 6,5,4,1,0,0,0,5, ;115 + db 6,5,4,2,0,0,0,4, ;116 + db 6,5,4,2,0,0,0,5, ;117 + db 6,5,4,2,1,0,0,5, ;118 + db 6,5,4,2,1,0,0,6, ;119 + db 6,5,4,3,0,0,0,4, ;120 + db 6,5,4,3,0,0,0,5, ;121 + db 6,5,4,3,1,0,0,5, ;122 + db 6,5,4,3,1,0,0,6, ;123 + db 6,5,4,3,2,0,0,5, ;124 + db 6,5,4,3,2,0,0,6, ;125 + db 6,5,4,3,2,1,0,6, ;126 + db 6,5,4,3,2,1,0,7, ;127 + db 7,0,0,0,0,0,0,1, ;128 + db 7,0,0,0,0,0,0,2, ;129 + db 7,1,0,0,0,0,0,2, ;130 + db 7,1,0,0,0,0,0,3, ;131 + db 7,2,0,0,0,0,0,2, ;132 + db 7,2,0,0,0,0,0,3, ;133 + db 7,2,1,0,0,0,0,3, ;134 + db 7,2,1,0,0,0,0,4, ;135 + db 7,3,0,0,0,0,0,2, ;136 + db 7,3,0,0,0,0,0,3, ;137 + db 7,3,1,0,0,0,0,3, ;138 + db 7,3,1,0,0,0,0,4, ;139 + db 7,3,2,0,0,0,0,3, ;140 + db 7,3,2,0,0,0,0,4, ;141 + db 7,3,2,1,0,0,0,4, ;142 + db 7,3,2,1,0,0,0,5, ;143 + db 7,4,0,0,0,0,0,2, ;144 + db 7,4,0,0,0,0,0,3, ;145 + db 7,4,1,0,0,0,0,3, ;146 + db 7,4,1,0,0,0,0,4, ;147 + db 7,4,2,0,0,0,0,3, ;148 + db 7,4,2,0,0,0,0,4, ;149 + db 7,4,2,1,0,0,0,4, ;150 + db 7,4,2,1,0,0,0,5, ;151 + db 7,4,3,0,0,0,0,3, ;152 + db 7,4,3,0,0,0,0,4, ;153 + db 7,4,3,1,0,0,0,4, ;154 + db 7,4,3,1,0,0,0,5, ;155 + db 7,4,3,2,0,0,0,4, ;156 + db 7,4,3,2,0,0,0,5, ;157 + db 7,4,3,2,1,0,0,5, ;158 + db 7,4,3,2,1,0,0,6, ;159 + db 7,5,0,0,0,0,0,2, ;160 + db 7,5,0,0,0,0,0,3, ;161 + db 7,5,1,0,0,0,0,3, ;162 + db 7,5,1,0,0,0,0,4, ;163 + db 7,5,2,0,0,0,0,3, ;164 + db 7,5,2,0,0,0,0,4, ;165 + db 
7,5,2,1,0,0,0,4, ;166 + db 7,5,2,1,0,0,0,5, ;167 + db 7,5,3,0,0,0,0,3, ;168 + db 7,5,3,0,0,0,0,4, ;169 + db 7,5,3,1,0,0,0,4, ;170 + db 7,5,3,1,0,0,0,5, ;171 + db 7,5,3,2,0,0,0,4, ;172 + db 7,5,3,2,0,0,0,5, ;173 + db 7,5,3,2,1,0,0,5, ;174 + db 7,5,3,2,1,0,0,6, ;175 + db 7,5,4,0,0,0,0,3, ;176 + db 7,5,4,0,0,0,0,4, ;177 + db 7,5,4,1,0,0,0,4, ;178 + db 7,5,4,1,0,0,0,5, ;179 + db 7,5,4,2,0,0,0,4, ;180 + db 7,5,4,2,0,0,0,5, ;181 + db 7,5,4,2,1,0,0,5, ;182 + db 7,5,4,2,1,0,0,6, ;183 + db 7,5,4,3,0,0,0,4, ;184 + db 7,5,4,3,0,0,0,5, ;185 + db 7,5,4,3,1,0,0,5, ;186 + db 7,5,4,3,1,0,0,6, ;187 + db 7,5,4,3,2,0,0,5, ;188 + db 7,5,4,3,2,0,0,6, ;189 + db 7,5,4,3,2,1,0,6, ;190 + db 7,5,4,3,2,1,0,7, ;191 + db 7,6,0,0,0,0,0,2, ;192 + db 7,6,0,0,0,0,0,3, ;193 + db 7,6,1,0,0,0,0,3, ;194 + db 7,6,1,0,0,0,0,4, ;195 + db 7,6,2,0,0,0,0,3, ;196 + db 7,6,2,0,0,0,0,4, ;197 + db 7,6,2,1,0,0,0,4, ;198 + db 7,6,2,1,0,0,0,5, ;199 + db 7,6,3,0,0,0,0,3, ;200 + db 7,6,3,0,0,0,0,4, ;201 + db 7,6,3,1,0,0,0,4, ;202 + db 7,6,3,1,0,0,0,5, ;203 + db 7,6,3,2,0,0,0,4, ;204 + db 7,6,3,2,0,0,0,5, ;205 + db 7,6,3,2,1,0,0,5, ;206 + db 7,6,3,2,1,0,0,6, ;207 + db 7,6,4,0,0,0,0,3, ;208 + db 7,6,4,0,0,0,0,4, ;209 + db 7,6,4,1,0,0,0,4, ;210 + db 7,6,4,1,0,0,0,5, ;211 + db 7,6,4,2,0,0,0,4, ;212 + db 7,6,4,2,0,0,0,5, ;213 + db 7,6,4,2,1,0,0,5, ;214 + db 7,6,4,2,1,0,0,6, ;215 + db 7,6,4,3,0,0,0,4, ;216 + db 7,6,4,3,0,0,0,5, ;217 + db 7,6,4,3,1,0,0,5, ;218 + db 7,6,4,3,1,0,0,6, ;219 + db 7,6,4,3,2,0,0,5, ;220 + db 7,6,4,3,2,0,0,6, ;221 + db 7,6,4,3,2,1,0,6, ;222 + db 7,6,4,3,2,1,0,7, ;223 + db 7,6,5,0,0,0,0,3, ;224 + db 7,6,5,0,0,0,0,4, ;225 + db 7,6,5,1,0,0,0,4, ;226 + db 7,6,5,1,0,0,0,5, ;227 + db 7,6,5,2,0,0,0,4, ;228 + db 7,6,5,2,0,0,0,5, ;229 + db 7,6,5,2,1,0,0,5, ;230 + db 7,6,5,2,1,0,0,6, ;231 + db 7,6,5,3,0,0,0,4, ;232 + db 7,6,5,3,0,0,0,5, ;233 + db 7,6,5,3,1,0,0,5, ;234 + db 7,6,5,3,1,0,0,6, ;235 + db 7,6,5,3,2,0,0,5, ;236 + db 7,6,5,3,2,0,0,6, ;237 + db 7,6,5,3,2,1,0,6, ;238 + db 7,6,5,3,2,1,0,7, ;239 + db 7,6,5,4,0,0,0,4, ;240 + db 7,6,5,4,0,0,0,5, ;241 + db 7,6,5,4,1,0,0,5, ;242 + db 7,6,5,4,1,0,0,6, ;243 + db 7,6,5,4,2,0,0,5, ;244 + db 7,6,5,4,2,0,0,6, ;245 + db 7,6,5,4,2,1,0,6, ;246 + db 7,6,5,4,2,1,0,7, ;247 + db 7,6,5,4,3,0,0,5, ;248 + db 7,6,5,4,3,0,0,6, ;249 + db 7,6,5,4,3,1,0,6, ;250 + db 7,6,5,4,3,1,0,7, ;251 + db 7,6,5,4,3,2,0,6, ;252 + db 7,6,5,4,3,2,0,7, ;253 + db 7,6,5,4,3,2,1,7, ;254 + db 7,6,5,4,3,2,1,8, ;255 ;*********************************************************************** ; Code @@ -323,43 +323,43 @@ SECTION .text ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx); ;*********************************************************************** WELS_EXTERN CavlcParamCal_sse2 - push ebx - push edi - push esi + push ebx + push edi + push esi - mov eax, [esp+16] ;coffLevel - mov edi, [esp+24] ;Level - mov ebx, [esp+32] ;endIdx - cmp ebx, 3 - jne .Level16 - pxor xmm1, xmm1 - movq xmm0, [eax] ; removed QWORD - jmp .Cal_begin + mov eax, [esp+16] ;coffLevel + mov edi, [esp+24] ;Level + mov ebx, [esp+32] ;endIdx + cmp ebx, 3 + jne .Level16 + pxor xmm1, xmm1 + movq xmm0, [eax] ; removed QWORD + jmp .Cal_begin .Level16: - movdqa xmm0, [eax] - movdqa xmm1, [eax+16] + movdqa xmm0, [eax] + movdqa xmm1, [eax+16] .Cal_begin: - movdqa xmm2, xmm0 - packsswb xmm0, xmm1 - movdqa xmm4, xmm0 - pxor xmm3, xmm3 - pcmpgtb xmm0, xmm3 - pcmpgtb xmm3, xmm4 - por xmm0, xmm3 - pmovmskb edx, xmm0 - cmp edx, 0 - je near .return - movdqa xmm6, [sse2_b_1] - pcmpeqw xmm7, xmm7 
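The byte_1pos_table above is indexed by the pmovmskb byte that CavlcParamCal_sse2 derives from the non-zero-coefficient mask. Reading the entries, each row appears to list the set-bit positions of the index from highest to lowest (at most seven fit) with the popcount in the last slot. A minimal C sketch of building and inspecting such a table, with illustrative names, assuming exactly that layout:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical reconstruction of the table's contents: entries [0..6] hold the
 * set-bit positions of the index in descending order (the 8th position, always 0,
 * does not fit and is handled separately in the assembly), entry [7] is the popcount. */
static uint8_t pos_table[256][8];

static void build_pos_table(void) {
    for (int b = 0; b < 256; ++b) {
        int n = 0;
        for (int bit = 7; bit >= 0; --bit) {
            if (b & (1 << bit)) {
                if (n < 7)
                    pos_table[b][n] = (uint8_t)bit;
                ++n;
            }
        }
        pos_table[b][7] = (uint8_t)n;   /* number of set bits in the last slot */
    }
}

int main(void) {
    build_pos_table();
    const uint8_t *e = pos_table[0xB2];          /* 10110010b: bits 7,5,4,1 set */
    printf("%d %d %d %d count=%d\n", e[0], e[1], e[2], e[3], e[7]);  /* 7 5 4 1 count=4 */
    return 0;
}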
;generate -1 - mov ebx, 0xff - ;pinsrw xmm6, ebx, 3 + movdqa xmm2, xmm0 + packsswb xmm0, xmm1 + movdqa xmm4, xmm0 + pxor xmm3, xmm3 + pcmpgtb xmm0, xmm3 + pcmpgtb xmm3, xmm4 + por xmm0, xmm3 + pmovmskb edx, xmm0 + cmp edx, 0 + je near .return + movdqa xmm6, [sse2_b_1] + pcmpeqw xmm7, xmm7 ;generate -1 + mov ebx, 0xff + ;pinsrw xmm6, ebx, 3 mov bl, dh - lea ebx, [byte_1pos_table+8*ebx] - movq xmm0, [ebx] - pextrw ecx, xmm0, 3 - shr ecx, 8 + lea ebx, [byte_1pos_table+8*ebx] + movq xmm0, [ebx] + pextrw ecx, xmm0, 3 + shr ecx, 8 mov dh, cl .loopHighFind0: @@ -367,19 +367,19 @@ WELS_EXTERN CavlcParamCal_sse2 je .loopHighFind0End ;mov esi, [ebx] ;and esi, 0xff - movzx esi, byte [ebx] + movzx esi, byte [ebx] add esi, 8 mov esi, [eax+2*esi] mov [edi], si add edi, 2 ;add ebx, 1 - inc ebx + inc ebx dec ecx - jmp .loopHighFind0 + jmp .loopHighFind0 .loopHighFind0End: mov cl, dh cmp cl, 8 - pand xmm0, xmm6 + pand xmm0, xmm6 jne .LowByteFind0 sub edi, 2 mov esi, [eax+16] @@ -387,8 +387,8 @@ WELS_EXTERN CavlcParamCal_sse2 add edi, 2 .LowByteFind0: and edx, 0xff - lea ebx, [byte_1pos_table+8*edx] - movq xmm1, [ebx] + lea ebx, [byte_1pos_table+8*edx] + movq xmm1, [ebx] pextrw esi, xmm1, 3 or esi, 0xff or ecx, 0xff00 @@ -398,16 +398,16 @@ WELS_EXTERN CavlcParamCal_sse2 .loopLowFind0: cmp esi, 0 je .loopLowFind0End - ;mov edx, [ebx] - ;and edx, 0xff - movzx edx, byte [ebx] - mov edx, [eax+2*edx] - mov [edi], dx - add edi, 2 - ;add ebx, 1 - inc ebx + ;mov edx, [ebx] + ;and edx, 0xff + movzx edx, byte [ebx] + mov edx, [eax+2*edx] + mov [edi], dx + add edi, 2 + ;add ebx, 1 + inc ebx dec esi - jmp .loopLowFind0 + jmp .loopLowFind0 .loopLowFind0End: cmp ch, 8 jne .getLevelEnd @@ -415,12 +415,12 @@ WELS_EXTERN CavlcParamCal_sse2 mov edx, [eax] mov [edi], dx .getLevelEnd: - mov edx, [esp+28] ;total_coeffs + mov edx, [esp+28] ;total_coeffs ;mov ebx, ecx ;and ebx, 0xff - movzx ebx, byte cl + movzx ebx, byte cl add cl, ch - mov [edx], cl + mov [edx], cl ;getRun movq xmm5, [sse2_b8] paddb xmm0, xmm5 @@ -430,7 +430,7 @@ WELS_EXTERN CavlcParamCal_sse2 sub eax, ebx shl eax, 3 shl ebx, 3 - pinsrw xmm2, ebx, 0 + pinsrw xmm2, ebx, 0 pinsrw xmm3, eax, 0 psllq xmm0, xmm3 psrlq xmm0, xmm3 @@ -441,19 +441,19 @@ WELS_EXTERN CavlcParamCal_sse2 por xmm0, xmm1 pextrw eax, xmm0, 0 - and eax, 0xff + and eax, 0xff inc eax sub al, cl - movdqa xmm1, xmm0 - paddb xmm1, xmm7 - psrldq xmm0, 1 - psubb xmm1, xmm0 + movdqa xmm1, xmm0 + paddb xmm1, xmm7 + psrldq xmm0, 1 + psubb xmm1, xmm0 mov ecx, [esp+20] ;run - movdqa [ecx], xmm1 + movdqa [ecx], xmm1 ;getRunEnd .return: - pop esi - pop edi - pop ebx - ret + pop esi + pop edi + pop ebx + ret %endif diff --git a/codec/encoder/core/x86/dct.asm b/codec/encoder/core/x86/dct.asm index a47de8a6..e5738053 100644 --- a/codec/encoder/core/x86/dct.asm +++ b/codec/encoder/core/x86/dct.asm @@ -50,17 +50,17 @@ SECTION .rodata align=16 align 16 SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16, - dw 10, 13, 10, 13, 13, 16, 13, 16, + dw 10, 13, 10, 13, 13, 16, 13, 16, dw 11, 14, 11, 14, 14, 18, 14, 18, - dw 11, 14, 11, 14, 14, 18, 14, 18, - dw 13, 16, 13, 16, 16, 20, 16, 20, - dw 13, 16, 13, 16, 16, 20, 16, 20, + dw 11, 14, 11, 14, 14, 18, 14, 18, + dw 13, 16, 13, 16, 16, 20, 16, 20, + dw 13, 16, 13, 16, 16, 20, 16, 20, dw 14, 18, 14, 18, 18, 23, 18, 23, - dw 14, 18, 14, 18, 18, 23, 18, 23, - dw 16, 20, 16, 20, 20, 25, 20, 25, - dw 16, 20, 16, 20, 20, 25, 20, 25, + dw 14, 18, 14, 18, 18, 23, 18, 23, + dw 16, 20, 16, 20, 20, 25, 20, 25, + dw 16, 20, 16, 20, 20, 25, 20, 25, dw 18, 23, 18, 23, 23, 29, 23, 29, - 
dw 18, 23, 18, 23, 23, 29, 23, 29 + dw 18, 23, 18, 23, 23, 29, 23, 29 ;*********************************************************************** @@ -68,27 +68,27 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16, ;*********************************************************************** %macro MMX_LoadDiff4P 5 - movd %1, [%3] - movd %2, [%4] - punpcklbw %1, %5 - punpcklbw %2, %5 - psubw %1, %2 + movd %1, [%3] + movd %2, [%4] + punpcklbw %1, %5 + punpcklbw %2, %5 + psubw %1, %2 %endmacro %macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm) - MMX_LoadDiff4P %1, %9, %5, %7, %10 - MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10 - lea %5, [%5+2*%6] - lea %7, [%7+2*%8] - MMX_LoadDiff4P %3, %9, %5, %7, %10 - MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10 + MMX_LoadDiff4P %1, %9, %5, %7, %10 + MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10 + lea %5, [%5+2*%6] + lea %7, [%7+2*%8] + MMX_LoadDiff4P %3, %9, %5, %7, %10 + MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10 %endmacro %macro MMX_SumSubMul2 3 - movq %3, %1 - psllw %1, $01 - paddw %1, %2 - psllw %2, $01 + movq %3, %1 + psllw %1, $01 + paddw %1, %2 + psllw %2, $01 psubw %3, %2 %endmacro @@ -101,23 +101,23 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16, %endmacro %macro MMX_SumSub 3 - movq %3, %2 + movq %3, %2 psubw %2, %1 paddw %1, %3 %endmacro %macro MMX_DCT 6 - MMX_SumSub %4, %1, %6 - MMX_SumSub %3, %2, %6 - MMX_SumSub %3, %4, %6 + MMX_SumSub %4, %1, %6 + MMX_SumSub %3, %2, %6 + MMX_SumSub %3, %4, %6 MMX_SumSubMul2 %1, %2, %5 %endmacro %macro MMX_IDCT 6 MMX_SumSub %4, %5, %6 MMX_SumSubDiv2 %3, %2, %1 - MMX_SumSub %1, %4, %6 - MMX_SumSub %3, %5, %6 + MMX_SumSub %1, %4, %6 + MMX_SumSub %3, %5, %6 %endmacro %macro MMX_StoreDiff4P 6 @@ -142,11 +142,11 @@ WELS_EXTERN WelsDctT4_mmx MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7 - MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6 - MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2 + MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6 + MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2 - MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6 - MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5 + MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6 + MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5 movq [r0+ 0], mm2 movq [r0+ 8], mm1 @@ -170,22 +170,22 @@ WELS_EXTERN WelsIDctT4Rec_mmx movq mm2, [r4+16] movq mm3, [r4+24] - MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 - MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 - MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2 - MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 + MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 + MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 + MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2 + MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 - WELS_Zero mm7 - WELS_DW32 mm6 + WELS_Zero mm7 + WELS_DW32 mm6 - MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2] - MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3] + MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2] + MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3] lea r0, [r0+2*r1] lea r2, [r2+2*r3] - MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2] - MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3] + MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2] + MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3] - WELSEMMS + WELSEMMS LOAD_5_PARA_POP ret @@ -194,21 +194,21 @@ WELS_EXTERN WelsIDctT4Rec_mmx ; SSE2 functions ;*********************************************************************** %macro SSE2_Store4x8p 6 - SSE2_XSawp qdq, %2, %3, %6 - SSE2_XSawp qdq, %4, %5, %3 - MOVDQ [%1+0x00], %2 - MOVDQ [%1+0x10], %4 - MOVDQ [%1+0x20], %6 - MOVDQ [%1+0x30], %3 + SSE2_XSawp qdq, %2, %3, %6 + SSE2_XSawp qdq, %4, %5, %3 + MOVDQ [%1+0x00], 
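The MMX_SumSub / MMX_SumSubMul2 butterflies that WelsDctT4_mmx chains together are the standard H.264 4x4 integer forward transform applied to the residual. A scalar sketch of the same computation is given below; the parameter layout (separate source and prediction pointers with their own strides, packed 4x4 output) mirrors what MMX_LoadDiff4x4P suggests but is an assumption here, not the exported signature.

#include <stdint.h>

static void dct4x4_c(int16_t dct[16], const uint8_t *org, int org_stride,
                     const uint8_t *pred, int pred_stride) {
    int16_t diff[16], tmp[16];
    for (int y = 0; y < 4; ++y)
        for (int x = 0; x < 4; ++x)
            diff[y * 4 + x] = (int16_t)(org[y * org_stride + x] - pred[y * pred_stride + x]);

    for (int y = 0; y < 4; ++y) {                 /* horizontal butterfly pass */
        const int16_t *d = diff + y * 4;
        int s0 = d[0] + d[3], s3 = d[0] - d[3];
        int s1 = d[1] + d[2], s2 = d[1] - d[2];
        tmp[y * 4 + 0] = (int16_t)(s0 + s1);
        tmp[y * 4 + 2] = (int16_t)(s0 - s1);
        tmp[y * 4 + 1] = (int16_t)(2 * s3 + s2);
        tmp[y * 4 + 3] = (int16_t)(s3 - 2 * s2);
    }
    for (int x = 0; x < 4; ++x) {                 /* vertical butterfly pass */
        int s0 = tmp[x] + tmp[12 + x], s3 = tmp[x] - tmp[12 + x];
        int s1 = tmp[4 + x] + tmp[8 + x], s2 = tmp[4 + x] - tmp[8 + x];
        dct[x]      = (int16_t)(s0 + s1);
        dct[8 + x]  = (int16_t)(s0 - s1);
        dct[4 + x]  = (int16_t)(2 * s3 + s2);
        dct[12 + x] = (int16_t)(s3 - 2 * s2);
    }
}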
%2 + MOVDQ [%1+0x10], %4 + MOVDQ [%1+0x20], %6 + MOVDQ [%1+0x30], %3 %endmacro %macro SSE2_Load4x8p 6 - MOVDQ %2, [%1+0x00] - MOVDQ %4, [%1+0x10] - MOVDQ %6, [%1+0x20] - MOVDQ %3, [%1+0x30] - SSE2_XSawp qdq, %4, %3, %5 - SSE2_XSawp qdq, %2, %6, %3 + MOVDQ %2, [%1+0x00] + MOVDQ %4, [%1+0x10] + MOVDQ %6, [%1+0x20] + MOVDQ %3, [%1+0x30] + SSE2_XSawp qdq, %4, %3, %5 + SSE2_XSawp qdq, %2, %6, %3 %endmacro %macro SSE2_SumSubMul2 3 @@ -231,57 +231,57 @@ WELS_EXTERN WelsIDctT4Rec_mmx %macro SSE2_StoreDiff8p 6 paddw %1, %3 psraw %1, $06 - movq %2, %6 + movq %2, %6 punpcklbw %2, %4 paddsw %2, %1 packuswb %2, %2 - movq %5, %2 + movq %5, %2 %endmacro %macro SSE2_StoreDiff8p 5 - movq %2, %5 + movq %2, %5 punpcklbw %2, %3 paddsw %2, %1 packuswb %2, %2 - movq %4, %2 + movq %4, %2 %endmacro -%macro SSE2_Load8DC 6 - movdqa %1, %6 ; %1 = dc0 dc1 - paddw %1, %5 - psraw %1, $06 ; (dc + 32) >> 6 +%macro SSE2_Load8DC 6 + movdqa %1, %6 ; %1 = dc0 dc1 + paddw %1, %5 + psraw %1, $06 ; (dc + 32) >> 6 - movdqa %2, %1 - psrldq %2, 4 - punpcklwd %2, %2 - punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3 + movdqa %2, %1 + psrldq %2, 4 + punpcklwd %2, %2 + punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3 - movdqa %3, %1 - psrldq %3, 8 - punpcklwd %3, %3 - punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5 + movdqa %3, %1 + psrldq %3, 8 + punpcklwd %3, %3 + punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5 - movdqa %4, %1 - psrldq %4, 12 - punpcklwd %4, %4 - punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7 + movdqa %4, %1 + psrldq %4, 12 + punpcklwd %4, %4 + punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7 - punpcklwd %1, %1 - punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1 + punpcklwd %1, %1 + punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1 %endmacro %macro SSE2_DCT 6 - SSE2_SumSub %6, %3, %5 - SSE2_SumSub %1, %2, %5 - SSE2_SumSub %3, %2, %5 - SSE2_SumSubMul2 %6, %1, %4 + SSE2_SumSub %6, %3, %5 + SSE2_SumSub %1, %2, %5 + SSE2_SumSub %3, %2, %5 + SSE2_SumSubMul2 %6, %1, %4 %endmacro %macro SSE2_IDCT 7 SSE2_SumSub %7, %2, %6 SSE2_SumSubDiv2 %1, %3, %5, %4 - SSE2_SumSub %2, %1, %5 - SSE2_SumSub %7, %4, %5 + SSE2_SumSub %2, %1, %5 + SSE2_SumSub %7, %4, %5 %endmacro ;*********************************************************************** @@ -294,42 +294,42 @@ WELS_EXTERN WelsDctFourT4_sse2 SIGN_EXTENSION r2, r2d SIGN_EXTENSION r4, r4d pxor xmm7, xmm7 - ;Load 4x8 - SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3] + ;Load 4x8 + SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3] SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4] - lea r1, [r1 + 2 * r2] - lea r3, [r3 + 2 * r4] - SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3] - SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4] - - SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0 - SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1 - SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2 - SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0 - - SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5 - - lea r1, [r1 + 2 * r2] - lea r3, [r3 + 2 * r4] - - ;Load 4x8 - SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ] - SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4] - lea r1, [r1 + 2 * r2] - lea r3, [r3 + 2 * r4] + lea r1, [r1 + 2 * r2] + lea r3, [r3 + 2 * r4] SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3] SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4] - SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0 - SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1 - SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2 - SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0 + SSE2_DCT xmm1, xmm2, xmm3, 
xmm4, xmm5, xmm0 + SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1 + SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2 + SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0 - lea r0, [r0+64] - SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5 + SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5 - POP_XMM - LOAD_5_PARA_POP + lea r1, [r1 + 2 * r2] + lea r3, [r3 + 2 * r4] + + ;Load 4x8 + SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ] + SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4] + lea r1, [r1 + 2 * r2] + lea r3, [r3 + 2 * r4] + SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3] + SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4] + + SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0 + SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1 + SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2 + SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0 + + lea r0, [r0+64] + SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5 + + POP_XMM + LOAD_5_PARA_POP ret @@ -337,168 +337,168 @@ WELS_EXTERN WelsDctFourT4_sse2 ; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs); ;*********************************************************************** WELS_EXTERN WelsIDctFourT4Rec_sse2 - %assign push_num 0 - LOAD_5_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - ;Load 4x8 - SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5 + %assign push_num 0 + LOAD_5_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + ;Load 4x8 + SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5 - SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3 - SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0 - SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3 - SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1 - - WELS_Zero xmm7 - WELS_DW32 xmm6 - - SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2] - SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3] - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2] - SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3] - - add r4, 64 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5 - - SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3 - SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0 + SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3 + SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0 SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3 - SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1 + SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1 - WELS_Zero xmm7 - WELS_DW32 xmm6 + WELS_Zero xmm7 + WELS_DW32 xmm6 - SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2] - SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3] - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2] - SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3] - POP_XMM - LOAD_5_PARA_POP - ; pop esi - ; pop ebx + SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2] + SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2] + SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3] + + add r4, 64 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5 + + SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3 + SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0 + SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3 + SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1 + + WELS_Zero xmm7 + WELS_DW32 xmm6 
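For the reconstruction path, SSE2_StoreDiff8p applies the (r + 32) >> 6 rounding and clips through packuswb after adding the prediction. A scalar sketch of one 4x4 inverse core transform plus that store step, assuming a packed 4x4 coefficient block and byte strides as in the C prototypes quoted in the banners:

#include <stdint.h>

static uint8_t clip255(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

static void idct4x4_rec_c(uint8_t *rec, int rec_stride,
                          const uint8_t *pred, int pred_stride,
                          const int16_t rs[16]) {
    int tmp[16], r[16];
    for (int y = 0; y < 4; ++y) {                 /* horizontal pass */
        const int16_t *d = rs + y * 4;
        int p0 = d[0] + d[2],        p1 = d[0] - d[2];
        int p2 = (d[1] >> 1) - d[3], p3 = d[1] + (d[3] >> 1);
        tmp[y * 4 + 0] = p0 + p3;
        tmp[y * 4 + 3] = p0 - p3;
        tmp[y * 4 + 1] = p1 + p2;
        tmp[y * 4 + 2] = p1 - p2;
    }
    for (int x = 0; x < 4; ++x) {                 /* vertical pass */
        int p0 = tmp[x] + tmp[8 + x],            p1 = tmp[x] - tmp[8 + x];
        int p2 = (tmp[4 + x] >> 1) - tmp[12 + x], p3 = tmp[4 + x] + (tmp[12 + x] >> 1);
        r[x]      = p0 + p3;
        r[12 + x] = p0 - p3;
        r[4 + x]  = p1 + p2;
        r[8 + x]  = p1 - p2;
    }
    for (int y = 0; y < 4; ++y)                   /* rec = clip(pred + (r + 32) >> 6) */
        for (int x = 0; x < 4; ++x)
            rec[y * rec_stride + x] =
                clip255(pred[y * pred_stride + x] + ((r[y * 4 + x] + 32) >> 6));
}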
+ + SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2] + SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2] + SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3] + POP_XMM + LOAD_5_PARA_POP + ; pop esi + ; pop ebx ret %macro SSE2_StoreDiff4x8p 8 - SSE2_StoreDiff8p %1, %3, %4, [%5], [%6] - SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8] - SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8] - SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8] + SSE2_StoreDiff8p %1, %3, %4, [%5], [%6] + SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8] + SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8] + SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8] %endmacro ;*********************************************************************** ; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc) ;*********************************************************************** WELS_EXTERN WelsIDctRecI16x16Dc_sse2 - %assign push_num 0 - LOAD_5_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - pxor xmm7, xmm7 - WELS_DW32 xmm6 + %assign push_num 0 + LOAD_5_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + pxor xmm7, xmm7 + WELS_DW32 xmm6 - SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4] - SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 + SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4] + SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 - SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16] - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 + SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 - POP_XMM - LOAD_5_PARA_POP + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 + POP_XMM + LOAD_5_PARA_POP ret %macro SSE2_SumSubD 3 - movdqa %3, %2 + movdqa %3, %2 paddd %2, %1 psubd %1, %3 %endmacro %macro SSE2_SumSubDiv2D 4 - paddd %1, %2 - paddd %1, %3 - psrad %1, 1 - movdqa %4, %1 - psubd %4, %2 + paddd %1, %2 + paddd %1, %3 + psrad %1, 1 + movdqa %4, %1 + psubd %4, %2 %endmacro -%macro SSE2_Load4Col 
5 - movsx r2, WORD[%5] - movd %1, r2d - movsx r2, WORD[%5 + 0x20] - movd %2, r2d - punpckldq %1, %2 - movsx r2, WORD[%5 + 0x80] - movd %3, r2d - movsx r2, WORD[%5 + 0xa0] - movd %4, r2d - punpckldq %3, %4 - punpcklqdq %1, %3 +%macro SSE2_Load4Col 5 + movsx r2, WORD[%5] + movd %1, r2d + movsx r2, WORD[%5 + 0x20] + movd %2, r2d + punpckldq %1, %2 + movsx r2, WORD[%5 + 0x80] + movd %3, r2d + movsx r2, WORD[%5 + 0xa0] + movd %4, r2d + punpckldq %3, %4 + punpcklqdq %1, %3 %endmacro ;*********************************************************************** ;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct) ;*********************************************************************** WELS_EXTERN WelsHadamardT4Dc_sse2 - %assign push_num 0 - LOAD_2_PARA - PUSH_XMM 8 - SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1 - SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40 - SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100 - SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140 + %assign push_num 0 + LOAD_2_PARA + PUSH_XMM 8 + SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1 + SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40 + SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100 + SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140 - SSE2_SumSubD xmm1, xmm2, xmm7 - SSE2_SumSubD xmm3, xmm4, xmm7 - SSE2_SumSubD xmm2, xmm4, xmm7 - SSE2_SumSubD xmm1, xmm3, xmm7 + SSE2_SumSubD xmm1, xmm2, xmm7 + SSE2_SumSubD xmm3, xmm4, xmm7 + SSE2_SumSubD xmm2, xmm4, xmm7 + SSE2_SumSubD xmm1, xmm3, xmm7 - SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1 + SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1 - SSE2_SumSubD xmm4, xmm3, xmm7 - SSE2_SumSubD xmm5, xmm1, xmm7 + SSE2_SumSubD xmm4, xmm3, xmm7 + SSE2_SumSubD xmm5, xmm1, xmm7 - WELS_DD1 xmm6 - SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2 - SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2 - SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1 + WELS_DD1 xmm6 + SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2 + SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2 + SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1 - packssdw xmm3, xmm4 - packssdw xmm2, xmm1 - movdqa [r0+ 0], xmm3 - movdqa [r0+16], xmm2 + packssdw xmm3, xmm4 + packssdw xmm2, xmm1 + movdqa [r0+ 0], xmm3 + movdqa [r0+16], xmm2 - POP_XMM - ret + POP_XMM + ret diff --git a/codec/encoder/core/x86/intra_pred.asm b/codec/encoder/core/x86/intra_pred.asm index 16eb8bc3..acbc265d 100644 --- a/codec/encoder/core/x86/intra_pred.asm +++ b/codec/encoder/core/x86/intra_pred.asm @@ -61,7 +61,7 @@ align 16 sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4 align 16 -mmx_01bytes: times 16 db 1 +mmx_01bytes: times 16 db 1 align 16 mmx_0x02: dw 0x02, 0x00, 0x00, 0x00 @@ -73,106 +73,106 @@ mmx_0x02: dw 0x02, 0x00, 0x00, 0x00 ;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 ;%1 will keep the last result %macro SSE_DB_1_2REG 2 - pxor %1, %1 - pcmpeqw %2, %2 - psubb %1, %2 + pxor %1, %1 + pcmpeqw %2, %2 + psubb %1, %2 %endmacro ;xmm0, xmm1, xmm2, eax, ecx ;lower 64 bits of xmm0 save the result %macro SSE2_PRED_H_4X4_TWO_LINE 5 - movd %1, [%4-1] - movdqa %3, %1 - punpcklbw %1, %3 - movdqa %3, %1 - punpcklbw %1, %3 + movd %1, [%4-1] + movdqa %3, %1 + punpcklbw %1, %3 + movdqa %3, %1 + punpcklbw %1, %3 - ;add %4, %5 - movd %2, [%4+%5-1] - movdqa %3, %2 - punpcklbw %2, %3 - movdqa %3, %2 - punpcklbw %2, %3 - 
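WelsHadamardT4Dc_sse2 above gathers the sixteen per-block DC coefficients of a luma macroblock and puts them through a 4x4 Hadamard with a final halving; the (x + y + 1)/2 rounding is taken from the SSE2_SumSubDiv2D comments. The scalar sketch below uses row-major output order for readability; the exact ordering produced by the assembly's transpose/pack sequence is not reproduced and should be checked against the dequant path.

#include <stdint.h>

static void hadamard4x4_dc_c(int16_t out[16], const int16_t dc[16]) {
    int tmp[16];
    for (int i = 0; i < 4; ++i) {                 /* first, un-scaled pass */
        int s01 = dc[i * 4 + 0] + dc[i * 4 + 1];
        int d01 = dc[i * 4 + 0] - dc[i * 4 + 1];
        int s23 = dc[i * 4 + 2] + dc[i * 4 + 3];
        int d23 = dc[i * 4 + 2] - dc[i * 4 + 3];
        tmp[0 * 4 + i] = s01 + s23;
        tmp[1 * 4 + i] = s01 - s23;
        tmp[2 * 4 + i] = d01 - d23;
        tmp[3 * 4 + i] = d01 + d23;
    }
    for (int i = 0; i < 4; ++i) {                 /* second pass, halved with rounding */
        int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1];
        int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1];
        int s23 = tmp[i * 4 + 2] + tmp[i * 4 + 3];
        int d23 = tmp[i * 4 + 2] - tmp[i * 4 + 3];
        out[i * 4 + 0] = (int16_t)((s01 + s23 + 1) >> 1);
        out[i * 4 + 1] = (int16_t)((s01 - s23 + 1) >> 1);
        out[i * 4 + 2] = (int16_t)((d01 - d23 + 1) >> 1);
        out[i * 4 + 3] = (int16_t)((d01 + d23 + 1) >> 1);
    }
}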
punpckldq %1, %2 + ;add %4, %5 + movd %2, [%4+%5-1] + movdqa %3, %2 + punpcklbw %2, %3 + movdqa %3, %2 + punpcklbw %2, %3 + punpckldq %1, %2 %endmacro %macro SUMW_HORIZON1 2 - movdqa %2, %1 - psrldq %2, 8 - paddusw %1, %2 - movdqa %2, %1 - psrldq %2, 4 - paddusw %1, %2 - movdqa %2, %1 - psrldq %2, 2 - paddusw %1, %2 + movdqa %2, %1 + psrldq %2, 8 + paddusw %1, %2 + movdqa %2, %1 + psrldq %2, 4 + paddusw %1, %2 + movdqa %2, %1 + psrldq %2, 2 + paddusw %1, %2 %endmacro %macro LOAD_COLUMN 6 - movd %1, [%5] - movd %2, [%5+%6] - punpcklbw %1, %2 - lea %5, [%5+2*%6] - movd %3, [%5] - movd %2, [%5+%6] - punpcklbw %3, %2 - punpcklwd %1, %3 - lea %5, [%5+2*%6] - movd %4, [%5] - movd %2, [%5+%6] - punpcklbw %4, %2 - lea %5, [%5+2*%6] - movd %3, [%5] - movd %2, [%5+%6] - lea %5, [%5+2*%6] - punpcklbw %3, %2 - punpcklwd %4, %3 - punpckhdq %1, %4 + movd %1, [%5] + movd %2, [%5+%6] + punpcklbw %1, %2 + lea %5, [%5+2*%6] + movd %3, [%5] + movd %2, [%5+%6] + punpcklbw %3, %2 + punpcklwd %1, %3 + lea %5, [%5+2*%6] + movd %4, [%5] + movd %2, [%5+%6] + punpcklbw %4, %2 + lea %5, [%5+2*%6] + movd %3, [%5] + movd %2, [%5+%6] + lea %5, [%5+2*%6] + punpcklbw %3, %2 + punpcklwd %4, %3 + punpckhdq %1, %4 %endmacro %macro SUMW_HORIZON 3 - movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4 - paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04 - punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 - movhlps %2, %1 ; x2 = xxxx xxxx d37 d26 - paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246 - pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357 - paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 + movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4 + paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04 + punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 + movhlps %2, %1 ; x2 = xxxx xxxx d37 d26 + paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246 + pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357 + paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 %endmacro %macro COPY_16_TIMES 2 - movdqa %2, [%1-16] - psrldq %2, 15 - pmuludq %2, [mmx_01bytes] - pshufd %2, %2, 0 + movdqa %2, [%1-16] + psrldq %2, 15 + pmuludq %2, [mmx_01bytes] + pshufd %2, %2, 0 %endmacro %macro COPY_16_TIMESS 3 - movdqa %2, [%1+%3-16] - psrldq %2, 15 - pmuludq %2, [mmx_01bytes] - pshufd %2, %2, 0 + movdqa %2, [%1+%3-16] + psrldq %2, 15 + pmuludq %2, [mmx_01bytes] + pshufd %2, %2, 0 %endmacro %macro LOAD_COLUMN_C 6 - movd %1, [%5] - movd %2, [%5+%6] - punpcklbw %1,%2 - lea %5, [%5+2*%6] - movd %3, [%5] - movd %2, [%5+%6] - punpcklbw %3, %2 - punpckhwd %1, %3 - lea %5, [%5+2*%6] + movd %1, [%5] + movd %2, [%5+%6] + punpcklbw %1,%2 + lea %5, [%5+2*%6] + movd %3, [%5] + movd %2, [%5+%6] + punpcklbw %3, %2 + punpckhwd %1, %3 + lea %5, [%5+2*%6] %endmacro %macro LOAD_2_LEFT_AND_ADD 0 - lea r1, [r1+2*r2] - movzx r4, byte [r1-0x01] - add r3, r4 - movzx r4, byte [r1+r2-0x01] - add r3, r4 + lea r1, [r1+2*r2] + movzx r4, byte [r1-0x01] + add r3, r4 + movzx r4, byte [r1+r2-0x01] + add r3, r4 %endmacro ;*********************************************************************** @@ -184,165 +184,165 @@ SECTION .text ;*********************************************************************** ; void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) ; -; pred must align to 16 +; pred must align to 16 ;*********************************************************************** WELS_EXTERN WelsI4x4LumaPredH_sse2 - push r3 - %assign push_num 1 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - movzx r3, byte [r1-1] - movd xmm0, r3d - pmuludq xmm0, [mmx_01bytes] + push r3 + %assign push_num 1 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + movzx r3, 
byte [r1-1] + movd xmm0, r3d + pmuludq xmm0, [mmx_01bytes] - movzx r3, byte [r1+r2-1] - movd xmm1, r3d - pmuludq xmm1, [mmx_01bytes] + movzx r3, byte [r1+r2-1] + movd xmm1, r3d + pmuludq xmm1, [mmx_01bytes] - unpcklps xmm0, xmm1 + unpcklps xmm0, xmm1 - lea r1, [r1+r2*2] - movzx r3, byte [r1-1] - movd xmm2, r3d - pmuludq xmm2, [mmx_01bytes] + lea r1, [r1+r2*2] + movzx r3, byte [r1-1] + movd xmm2, r3d + pmuludq xmm2, [mmx_01bytes] - movzx r3, byte [r1+r2-1] - movd xmm3, r3d - pmuludq xmm3, [mmx_01bytes] + movzx r3, byte [r1+r2-1] + movd xmm3, r3d + pmuludq xmm3, [mmx_01bytes] - unpcklps xmm2, xmm3 - unpcklpd xmm0, xmm2 + unpcklps xmm2, xmm3 + unpcklpd xmm0, xmm2 - movdqa [r0], xmm0 - pop r3 - ret + movdqa [r0], xmm0 + pop r3 + ret ;*********************************************************************** ; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); ;*********************************************************************** WELS_EXTERN WelsI16x16LumaPredPlane_sse2 - push r3 - push r4 - %assign push_num 2 - LOAD_3_PARA - PUSH_XMM 8 - SIGN_EXTENSION r2, r2d - sub r1, 1 - sub r1, r2 + push r3 + push r4 + %assign push_num 2 + LOAD_3_PARA + PUSH_XMM 8 + SIGN_EXTENSION r2, r2d + sub r1, 1 + sub r1, r2 - ;for H - pxor xmm7, xmm7 - movq xmm0, [r1] - movdqa xmm5, [sse2_plane_dec] - punpcklbw xmm0, xmm7 - pmullw xmm0, xmm5 - movq xmm1, [r1 + 9] - movdqa xmm6, [sse2_plane_inc] - punpcklbw xmm1, xmm7 - pmullw xmm1, xmm6 - psubw xmm1, xmm0 + ;for H + pxor xmm7, xmm7 + movq xmm0, [r1] + movdqa xmm5, [sse2_plane_dec] + punpcklbw xmm0, xmm7 + pmullw xmm0, xmm5 + movq xmm1, [r1 + 9] + movdqa xmm6, [sse2_plane_inc] + punpcklbw xmm1, xmm7 + pmullw xmm1, xmm6 + psubw xmm1, xmm0 - SUMW_HORIZON xmm1,xmm0,xmm2 - movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]); - movsx r3, r3w - imul r3, 5 - add r3, 32 - sar r3, 6 ; b = (5 * H + 32) >> 6; - SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b + SUMW_HORIZON xmm1,xmm0,xmm2 + movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]); + movsx r3, r3w + imul r3, 5 + add r3, 32 + sar r3, 6 ; b = (5 * H + 32) >> 6; + SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b - movzx r4, BYTE [r1+16] - sub r1, 3 - LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2 + movzx r4, BYTE [r1+16] + sub r1, 3 + LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2 - add r1, 3 - movzx r3, BYTE [r1+8*r2] - add r4, r3 - shl r4, 4 ; a = (left[15*stride] + top[15]) << 4; + add r1, 3 + movzx r3, BYTE [r1+8*r2] + add r4, r3 + shl r4, 4 ; a = (left[15*stride] + top[15]) << 4; - sub r1, 3 - add r1, r2 - LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2 - pxor xmm4, xmm4 - punpckhbw xmm0, xmm4 - pmullw xmm0, xmm5 - punpckhbw xmm7, xmm4 - pmullw xmm7, xmm6 - psubw xmm7, xmm0 + sub r1, 3 + add r1, r2 + LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2 + pxor xmm4, xmm4 + punpckhbw xmm0, xmm4 + pmullw xmm0, xmm5 + punpckhbw xmm7, xmm4 + pmullw xmm7, xmm6 + psubw xmm7, xmm0 - SUMW_HORIZON xmm7,xmm0,xmm2 - movd r3d, xmm7 ; V - movsx r3, r3w - imul r3, 5 - add r3, 32 - sar r3, 6 ; c = (5 * V + 32) >> 6; - SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c + SUMW_HORIZON xmm7,xmm0,xmm2 + movd r3d, xmm7 ; V + movsx r3, r3w + imul r3, 5 + add r3, 32 + sar r3, 6 ; c = (5 * V + 32) >> 6; + SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c - add r4, 16 - imul r3, -7 - add r3, r4 ; s = a + 16 + (-7)*c - SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s + add r4, 16 + imul r3, -7 + add r3, r4 ; s = a + 16 + (-7)*c + SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s - xor r3, r3 - movdqa xmm5, 
[sse2_plane_inc_minus] + xor r3, r3 + movdqa xmm5, [sse2_plane_inc_minus] get_i16x16_luma_pred_plane_sse2_1: - movdqa xmm2, xmm1 - pmullw xmm2, xmm5 - paddw xmm2, xmm0 - psraw xmm2, 5 - movdqa xmm3, xmm1 - pmullw xmm3, xmm6 - paddw xmm3, xmm0 - psraw xmm3, 5 - packuswb xmm2, xmm3 - movdqa [r0], xmm2 - paddw xmm0, xmm4 - add r0, 16 - inc r3 - cmp r3, 16 - jnz get_i16x16_luma_pred_plane_sse2_1 - POP_XMM - pop r4 - pop r3 - ret + movdqa xmm2, xmm1 + pmullw xmm2, xmm5 + paddw xmm2, xmm0 + psraw xmm2, 5 + movdqa xmm3, xmm1 + pmullw xmm3, xmm6 + paddw xmm3, xmm0 + psraw xmm3, 5 + packuswb xmm2, xmm3 + movdqa [r0], xmm2 + paddw xmm0, xmm4 + add r0, 16 + inc r3 + cmp r3, 16 + jnz get_i16x16_luma_pred_plane_sse2_1 + POP_XMM + pop r4 + pop r3 + ret ;*********************************************************************** ; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); ;*********************************************************************** %macro SSE2_PRED_H_16X16_ONE_LINE 0 - add r0, 16 - add r1, r2 - movzx r3, byte [r1] - SSE2_Copy16Times xmm0, r3d - movdqa [r0], xmm0 + add r0, 16 + add r1, r2 + movzx r3, byte [r1] + SSE2_Copy16Times xmm0, r3d + movdqa [r0], xmm0 %endmacro WELS_EXTERN WelsI16x16LumaPredH_sse2 - push r3 - %assign push_num 1 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - dec r1 - movzx r3, byte [r1] - SSE2_Copy16Times xmm0, r3d - movdqa [r0], xmm0 - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - SSE2_PRED_H_16X16_ONE_LINE - pop r3 + push r3 + %assign push_num 1 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + dec r1 + movzx r3, byte [r1] + SSE2_Copy16Times xmm0, r3d + movdqa [r0], xmm0 + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + SSE2_PRED_H_16X16_ONE_LINE + pop r3 ret ;*********************************************************************** @@ -378,289 +378,289 @@ WELS_EXTERN WelsI16x16LumaPredV_sse2 ; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride); ;*********************************************************************** WELS_EXTERN WelsIChromaPredPlane_sse2 - push r3 - push r4 - %assign push_num 2 - LOAD_3_PARA - PUSH_XMM 8 - SIGN_EXTENSION r2, r2d - sub r1, 1 - sub r1, r2 + push r3 + push r4 + %assign push_num 2 + LOAD_3_PARA + PUSH_XMM 8 + SIGN_EXTENSION r2, r2d + sub r1, 1 + sub r1, r2 - pxor mm7, mm7 - movq mm0, [r1] - movq mm5, [sse2_plane_dec_c] - punpcklbw mm0, mm7 - pmullw mm0, mm5 - movq mm1, [r1 + 5] - movq mm6, [sse2_plane_inc_c] - punpcklbw mm1, mm7 - pmullw mm1, mm6 - psubw mm1, mm0 + pxor mm7, mm7 + movq mm0, [r1] + movq mm5, [sse2_plane_dec_c] + punpcklbw mm0, mm7 + pmullw mm0, mm5 + movq mm1, [r1 + 5] + movq mm6, [sse2_plane_inc_c] + punpcklbw mm1, mm7 + pmullw mm1, mm6 + psubw mm1, mm0 - movq2dq xmm1, mm1 - pxor xmm2, xmm2 - SUMW_HORIZON xmm1,xmm0,xmm2 - movd r3d, xmm1 - movsx r3, r3w - imul r3, 17 - add r3, 16 
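The plane predictor above follows the formulas spelled out in its comments: H += (i + 1) * (top[8 + i] - top[6 - i]), b = (5 * H + 32) >> 6, c likewise from the left column, a = (left[15] + top[15]) << 4, and every sample is clip((a + b*(x - 7) + c*(y - 7) + 16) >> 5). A scalar sketch with the neighbours passed as explicit arrays (an assumption for readability; the assembly reads them straight from pRef with the given stride). The 8x8 chroma variant that follows uses 17 and a >> 5 in place of 5 and >> 6, per its own comments.

#include <stdint.h>

static uint8_t clip255(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

static void i16x16_pred_plane_c(uint8_t pred[256], const uint8_t top[16],
                                const uint8_t left[16], uint8_t lt) {
    int H = 0, V = 0;
    for (int i = 0; i < 8; ++i) {
        H += (i + 1) * (top[8 + i]  - (i == 7 ? lt : top[6 - i]));
        V += (i + 1) * (left[8 + i] - (i == 7 ? lt : left[6 - i]));
    }
    int a = (left[15] + top[15]) << 4;
    int b = (5 * H + 32) >> 6;
    int c = (5 * V + 32) >> 6;
    for (int y = 0; y < 16; ++y)                  /* pred is the packed 16x16 buffer */
        for (int x = 0; x < 16; ++x)
            pred[y * 16 + x] = clip255((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
}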
- sar r3, 5 ; b = (17 * H + 16) >> 5; - SSE2_Copy8Times xmm1, r3d ; mm1 = b,b,b,b,b,b,b,b + movq2dq xmm1, mm1 + pxor xmm2, xmm2 + SUMW_HORIZON xmm1,xmm0,xmm2 + movd r3d, xmm1 + movsx r3, r3w + imul r3, 17 + add r3, 16 + sar r3, 5 ; b = (17 * H + 16) >> 5; + SSE2_Copy8Times xmm1, r3d ; mm1 = b,b,b,b,b,b,b,b - movzx r3, BYTE [r1+8] - sub r1, 3 - LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2 + movzx r3, BYTE [r1+8] + sub r1, 3 + LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2 - add r1, 3 - movzx r4, BYTE [r1+4*r2] - add r4, r3 - shl r4, 4 ; a = (left[7*stride] + top[7]) << 4; + add r1, 3 + movzx r4, BYTE [r1+4*r2] + add r4, r3 + shl r4, 4 ; a = (left[7*stride] + top[7]) << 4; - sub r1, 3 - add r1, r2 - LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2 - pxor mm4, mm4 - punpckhbw mm0, mm4 - pmullw mm0, mm5 - punpckhbw mm7, mm4 - pmullw mm7, mm6 - psubw mm7, mm0 + sub r1, 3 + add r1, r2 + LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2 + pxor mm4, mm4 + punpckhbw mm0, mm4 + pmullw mm0, mm5 + punpckhbw mm7, mm4 + pmullw mm7, mm6 + psubw mm7, mm0 - movq2dq xmm7, mm7 - pxor xmm2, xmm2 - SUMW_HORIZON xmm7,xmm0,xmm2 - movd r3d, xmm7 ; V - movsx r3, r3w - imul r3, 17 - add r3, 16 - sar r3, 5 ; c = (17 * V + 16) >> 5; - SSE2_Copy8Times xmm4, r3d ; mm4 = c,c,c,c,c,c,c,c + movq2dq xmm7, mm7 + pxor xmm2, xmm2 + SUMW_HORIZON xmm7,xmm0,xmm2 + movd r3d, xmm7 ; V + movsx r3, r3w + imul r3, 17 + add r3, 16 + sar r3, 5 ; c = (17 * V + 16) >> 5; + SSE2_Copy8Times xmm4, r3d ; mm4 = c,c,c,c,c,c,c,c - add r4, 16 - imul r3, -3 - add r3, r4 ; s = a + 16 + (-3)*c - SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s + add r4, 16 + imul r3, -3 + add r3, r4 ; s = a + 16 + (-3)*c + SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s - xor r3, r3 - movdqa xmm5, [sse2_plane_mul_b_c] + xor r3, r3 + movdqa xmm5, [sse2_plane_mul_b_c] get_i_chroma_pred_plane_sse2_1: - movdqa xmm2, xmm1 - pmullw xmm2, xmm5 - paddw xmm2, xmm0 - psraw xmm2, 5 - packuswb xmm2, xmm2 - movq [r0], xmm2 - paddw xmm0, xmm4 - add r0, 8 - inc r3 - cmp r3, 8 - jnz get_i_chroma_pred_plane_sse2_1 - POP_XMM - pop r4 - pop r3 - WELSEMMS - ret + movdqa xmm2, xmm1 + pmullw xmm2, xmm5 + paddw xmm2, xmm0 + psraw xmm2, 5 + packuswb xmm2, xmm2 + movq [r0], xmm2 + paddw xmm0, xmm4 + add r0, 8 + inc r3 + cmp r3, 8 + jnz get_i_chroma_pred_plane_sse2_1 + POP_XMM + pop r4 + pop r3 + WELSEMMS + ret ;*********************************************************************** -; 0 |1 |2 |3 |4 | -; 6 |7 |8 |9 |10| -; 11|12|13|14|15| -; 16|17|18|19|20| -; 21|22|23|24|25| -; 7 is the start pixel of current 4x4 block -; pred[7] = ([6]+[0]*2+[1]+2)/4 +; 0 |1 |2 |3 |4 | +; 6 |7 |8 |9 |10| +; 11|12|13|14|15| +; 16|17|18|19|20| +; 21|22|23|24|25| +; 7 is the start pixel of current 4x4 block +; pred[7] = ([6]+[0]*2+[1]+2)/4 ; ; void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) ; ;*********************************************************************** WELS_EXTERN WelsI4x4LumaPredDDR_mmx - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11 - movq mm2,[r1-8] ;get value of 6 mm2[8] = 6 - sub r1, r2 ;mov eax to above line of current block(postion of 1) - punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6] - movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3] - punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11] - psllq mm3,18h ;mm3[5]=[1] - psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11] - por mm3,mm1 
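The MMX 4x4 predictors in this file keep re-using one trick: pavgb rounds up, (a + b + 1) >> 1, so subtracting (a ^ b) & 1 afterwards (the pxor / pand [mmx_01bytes] / psubusb sequence) yields the floor average (a + b) >> 1, and one more pavgb against the centre sample then gives exactly the 3-tap value (l + 2*m + r + 2) >> 2 used by the diagonal predictors. A scalar check of that identity, with illustrative helper names:

#include <stdint.h>
#include <assert.h>

static uint8_t avg_up(uint8_t a, uint8_t b)    { return (uint8_t)((a + b + 1) >> 1); } /* pavgb */
static uint8_t avg_floor(uint8_t a, uint8_t b) { return (uint8_t)(avg_up(a, b) - ((a ^ b) & 1)); }
static uint8_t filter3(uint8_t l, uint8_t m, uint8_t r) { return avg_up(m, avg_floor(l, r)); }

int main(void) {
    for (int l = 0; l < 256; ++l)
        for (int m = 0; m < 256; ++m)
            for (int r = 0; r < 256; ++r)
                assert(filter3((uint8_t)l, (uint8_t)m, (uint8_t)r) ==
                       (uint8_t)((l + 2 * m + r + 2) >> 2));
    return 0;
}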
;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11] - movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11] - lea r1,[r1+r2*2-8h] ;set eax point to 12 - movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16] - psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0 - psrlq mm4,38h ;mm4[1]=[16] - por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16] - movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16] - movq mm4,[r1+r2*2] ;mm4[8]=[21] - psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0 - psrlq mm4,38h ;mm4[1]=[21] - por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21] - movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21] - pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2 - pxor mm1,mm4 ;find odd value in the lowest bit of each byte - pand mm1,[mmx_01bytes] ;set the odd bit - psubusb mm3,mm1 ;decrease 1 from odd bytes - pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2 + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11 + movq mm2,[r1-8] ;get value of 6 mm2[8] = 6 + sub r1, r2 ;mov eax to above line of current block(postion of 1) + punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6] + movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3] + punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11] + psllq mm3,18h ;mm3[5]=[1] + psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11] + por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11] + movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11] + lea r1,[r1+r2*2-8h] ;set eax point to 12 + movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16] + psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0 + psrlq mm4,38h ;mm4[1]=[16] + por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16] + movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16] + movq mm4,[r1+r2*2] ;mm4[8]=[21] + psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0 + psrlq mm4,38h ;mm4[1]=[21] + por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21] + movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21] + pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2 + pxor mm1,mm4 ;find odd value in the lowest bit of each byte + pand mm1,[mmx_01bytes] ;set the odd bit + psubusb mm3,mm1 ;decrease 1 from odd bytes + pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2 - movd [r0+12],mm2 - psrlq mm2,8 - movd [r0+8],mm2 - psrlq mm2,8 - movd [r0+4],mm2 - psrlq mm2,8 - movd [r0],mm2 - WELSEMMS - ret + movd [r0+12],mm2 + psrlq mm2,8 + movd [r0+8],mm2 + psrlq mm2,8 + movd [r0+4],mm2 + psrlq mm2,8 + movd [r0],mm2 + WELSEMMS + ret ;*********************************************************************** -; 0 |1 |2 |3 |4 | -; 5 |6 |7 |8 |9 | -; 10|11|12|13|14| -; 15|16|17|18|19| -; 20|21|22|23|24| -; 6 is the start pixel of current 4x4 block -; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8 +; 0 |1 |2 |3 |4 | +; 5 |6 |7 |8 |9 | +; 10|11|12|13|14| +; 15|16|17|18|19| +; 20|21|22|23|24| +; 6 is the start pixel 
of current 4x4 block +; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8 ; ; void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride) ; ;*********************************************************************** WELS_EXTERN WelsI4x4LumaPredDc_sse2 - push r3 - push r4 - %assign push_num 2 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - movzx r4, byte [r1-1h] - sub r1, r2 - movd xmm0, [r1] - pxor xmm1, xmm1 - psadbw xmm0, xmm1 - xor r3, r3 - movd r3d, xmm0 - add r3, r4 - movzx r4, byte [r1+r2*2-1h] - add r3, r4 + push r3 + push r4 + %assign push_num 2 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + movzx r4, byte [r1-1h] + sub r1, r2 + movd xmm0, [r1] + pxor xmm1, xmm1 + psadbw xmm0, xmm1 + xor r3, r3 + movd r3d, xmm0 + add r3, r4 + movzx r4, byte [r1+r2*2-1h] + add r3, r4 - lea r1, [r1+r2*2-1] - movzx r4, byte [r1+r2] - add r3, r4 + lea r1, [r1+r2*2-1] + movzx r4, byte [r1+r2] + add r3, r4 - movzx r4, byte [r1+r2*2] - add r3, r4 - add r3, 4 - sar r3, 3 - imul r3, 0x01010101 + movzx r4, byte [r1+r2*2] + add r3, r4 + add r3, 4 + sar r3, 3 + imul r3, 0x01010101 - movd xmm0, r3d - pshufd xmm0, xmm0, 0 - movdqa [r0], xmm0 - pop r4 - pop r3 - ret + movd xmm0, r3d + pshufd xmm0, xmm0, 0 + movdqa [r0], xmm0 + pop r4 + pop r3 + ret ;*********************************************************************** -; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride) +; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride) ; copy 8 pixel of 8 line from left ;*********************************************************************** %macro MMX_PRED_H_8X8_ONE_LINE 4 - movq %1, [%3-8] - psrlq %1, 38h + movq %1, [%3-8] + psrlq %1, 38h - ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes - pmullw %1, [mmx_01bytes] - pshufw %1, %1, 0 - movq [%4], %1 + ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes + pmullw %1, [mmx_01bytes] + pshufw %1, %1, 0 + movq [%4], %1 %endmacro %macro MMX_PRED_H_8X8_ONE_LINEE 4 - movq %1, [%3+r2-8] - psrlq %1, 38h + movq %1, [%3+r2-8] + psrlq %1, 38h - ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes - pmullw %1, [mmx_01bytes] - pshufw %1, %1, 0 - movq [%4], %1 + ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes + pmullw %1, [mmx_01bytes] + pshufw %1, %1, 0 + movq [%4], %1 %endmacro WELS_EXTERN WelsIChromaPredH_mmx - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - movq mm0, [r1-8] - psrlq mm0, 38h + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + movq mm0, [r1-8] + psrlq mm0, 38h - ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes - pmullw mm0, [mmx_01bytes] - pshufw mm0, mm0, 0 - movq [r0], mm0 + ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes + pmullw mm0, [mmx_01bytes] + pshufw mm0, mm0, 0 + movq [r0], mm0 - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8 + MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8 - lea r1,[r1+r2*2] - MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16 + lea r1,[r1+r2*2] + MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16 - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24 + MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24 - lea r1,[r1+r2*2] - MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32 + lea r1,[r1+r2*2] + MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32 - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40 + MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40 - lea r1,[r1+r2*2] - MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48 + lea r1,[r1+r2*2] + MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48 - MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56 - WELSEMMS - ret + MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56 + WELSEMMS + ret ;*********************************************************************** 
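WelsI4x4LumaPredDc_sse2 above sums the four samples above and the four to the left of the block, rounds with +4, shifts by 3, and splats the byte with imul 0x01010101 / pshufd. A scalar sketch, assuming pRef points at the block's top-left reconstructed sample (the same convention the assembly's [r1-1] and [r1-stride] accesses imply) and a packed 4x4 prediction buffer:

#include <stdint.h>

static void i4x4_pred_dc_c(uint8_t pred[16], const uint8_t *ref, int stride) {
    const uint8_t *top = ref - stride;
    int sum = 4;                                  /* rounding term */
    for (int i = 0; i < 4; ++i) {
        sum += top[i];                            /* 4 samples above */
        sum += ref[i * stride - 1];               /* 4 samples to the left */
    }
    uint8_t dc = (uint8_t)(sum >> 3);
    for (int i = 0; i < 16; ++i)                  /* fill the packed 4x4 block */
        pred[i] = dc;
}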
-; void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) +; void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) ; copy pixels from top 4 pixels ;*********************************************************************** WELS_EXTERN WelsI4x4LumaPredV_sse2 - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - sub r1, r2 - movd xmm0, [r1] - pshufd xmm0, xmm0, 0 - movdqa [r0], xmm0 - ret + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + movd xmm0, [r1] + pshufd xmm0, xmm0, 0 + movdqa [r0], xmm0 + ret ;*********************************************************************** -; void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) +; void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) ; copy 8 pixels from top 8 pixels ;*********************************************************************** WELS_EXTERN WelsIChromaPredV_sse2 - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - sub r1, r2 - movq xmm0, [r1] - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm1 - movdqa [r0], xmm0 - movdqa [r0+16], xmm0 - movdqa [r0+32], xmm0 - movdqa [r0+48], xmm0 - ret + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + movq xmm0, [r1] + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm1 + movdqa [r0], xmm0 + movdqa [r0+16], xmm0 + movdqa [r0+32], xmm0 + movdqa [r0+48], xmm0 + ret ;*********************************************************************** -; lt|t0|t1|t2|t3| -; l0| -; l1| -; l2| -; l3| -; t3 will never been used +; lt|t0|t1|t2|t3| +; l0| +; l1| +; l2| +; l3| +; t3 will never been used ; destination: -; |a |b |c |d | -; |e |f |a |b | -; |g |h |e |f | -; |i |j |g |h | +; |a |b |c |d | +; |e |f |a |b | +; |g |h |e |f | +; |i |j |g |h | ; a = (1 + lt + l0)>>1 ; e = (1 + l0 + l1)>>1 @@ -679,68 +679,68 @@ WELS_EXTERN WelsIChromaPredV_sse2 ; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) ;*********************************************************************** WELS_EXTERN WelsI4x4LumaPredHD_mmx - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - sub r1, r2 - movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt] - psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx] + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt] + psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx] - movd mm1, [r1+2*r2-4] - punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1 - lea r1, [r1+2*r2] - movd mm2, [r1+2*r2-4] - punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3 - punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx] - psrlq mm2, 20h - pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3] + movd mm1, [r1+2*r2-4] + punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1 + lea r1, [r1+2*r2] + movd mm2, [r1+2*r2-4] + punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3 + punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx] + psrlq mm2, 20h + pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3] - movq mm1, mm0 - psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1] - movq mm2, mm0 - psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2] - movq mm3, mm2 - movq mm4, mm1 - pavgb mm1, mm0 + movq mm1, mm0 + psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1] + movq mm2, mm0 + psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2] + movq mm3, mm2 + movq mm4, mm1 + pavgb mm1, mm0 - pxor mm4, mm0 ; find odd value in the lowest bit of each byte - pand mm4, [mmx_01bytes] ; set the odd bit - psubusb mm1, mm4 ; decrease 1 from odd bytes + pxor mm4, mm0 ; find odd 
value in the lowest bit of each byte + pand mm4, [mmx_01bytes] ; set the odd bit + psubusb mm1, mm4 ; decrease 1 from odd bytes - pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j] + pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j] - movq mm4, mm0 - pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i] - punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i] + movq mm4, mm0 + pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i] + punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i] - psrlq mm2, 20h - psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0] - movq mm4, mm3 - psrlq mm4, 10h ; mm4 = [0 0 b a f e h j] - pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx] - psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a] + psrlq mm2, 20h + psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0] + movq mm4, mm3 + psrlq mm4, 10h ; mm4 = [0 0 b a f e h j] + pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx] + psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a] - movd [r0], mm2 - movd [r0+12], mm3 - psrlq mm3, 10h - movd [r0+8], mm3 - psrlq mm3, 10h - movd [r0+4], mm3 - WELSEMMS - ret + movd [r0], mm2 + movd [r0+12], mm3 + psrlq mm3, 10h + movd [r0+8], mm3 + psrlq mm3, 10h + movd [r0+4], mm3 + WELSEMMS + ret ;*********************************************************************** -; lt|t0|t1|t2|t3| -; l0| -; l1| -; l2| -; l3| -; t3 will never been used +; lt|t0|t1|t2|t3| +; l0| +; l1| +; l2| +; l3| +; t3 will never been used ; destination: -; |a |b |c |d | -; |c |d |e |f | -; |e |f |g |g | -; |g |g |g |g | +; |a |b |c |d | +; |c |d |e |f | +; |e |f |g |g | +; |g |g |g |g | ; a = (1 + l0 + l1)>>1 ; c = (1 + l1 + l2)>>1 @@ -756,70 +756,70 @@ WELS_EXTERN WelsI4x4LumaPredHD_mmx ; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) ;*********************************************************************** WELS_EXTERN WelsI4x4LumaPredHU_mmx - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - movd mm0, [r1-4] ; mm0[3] = l0 - punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0 - lea r1, [r1+2*r2] - movd mm2, [r1-4] ; mm2[3] = l2 - movd mm4, [r1+r2-4] ; mm4[3] = l3 - punpcklbw mm2, mm4 - punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx] + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + movd mm0, [r1-4] ; mm0[3] = l0 + punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0 + lea r1, [r1+2*r2] + movd mm2, [r1-4] ; mm2[3] = l2 + movd mm4, [r1+r2-4] ; mm4[3] = l3 + punpcklbw mm2, mm4 + punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx] - psrlq mm4, 18h - psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx] - psrlq mm0, 8h - pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx] + psrlq mm4, 18h + psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx] + psrlq mm0, 8h + pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx] - movq mm1, mm0 - psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx] - movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx] - pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx] + movq mm1, mm0 + psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx] + movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx] + pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx] - movq mm2, mm0 - psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx] - movq mm5, mm2 - pavgb mm2, mm0 + movq mm2, mm0 + psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx] + movq mm5, mm2 + pavgb mm2, mm0 - pxor mm5, mm0 ; find odd value in the lowest bit of each byte - pand mm5, [mmx_01bytes] ; set the odd bit - psubusb mm2, mm5 ; decrease 1 from odd bytes + pxor mm5, mm0 ; find odd value in the lowest bit of each byte + pand mm5, [mmx_01bytes] ; set the odd bit + psubusb mm2, mm5 ; decrease 1 from odd bytes - pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx] + pavgb 
mm2, mm3 ; mm2 = [f d b xx xx xx xx xx] - psrlq mm2, 8h - pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx] + psrlq mm2, 8h + pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx] - punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a] - punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx] - punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx] + punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a] + punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx] + punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx] - psrlq mm4, 20h - movd [r0+12], mm4 + psrlq mm4, 20h + movd [r0+12], mm4 - movd [r0], mm1 - psrlq mm1, 10h - movd [r0+4], mm1 - psrlq mm1, 10h - movd [r0+8], mm1 - WELSEMMS - ret + movd [r0], mm1 + psrlq mm1, 10h + movd [r0+4], mm1 + psrlq mm1, 10h + movd [r0+8], mm1 + WELSEMMS + ret ;*********************************************************************** -; lt|t0|t1|t2|t3| -; l0| -; l1| -; l2| -; l3| -; l3 will never been used +; lt|t0|t1|t2|t3| +; l0| +; l1| +; l2| +; l3| +; l3 will never been used ; destination: -; |a |b |c |d | -; |e |f |g |h | -; |i |a |b |c | -; |j |e |f |g | +; |a |b |c |d | +; |e |f |g |h | +; |i |a |b |c | +; |j |e |f |g | ; a = (1 + lt + t0)>>1 ; b = (1 + t0 + t1)>>1 @@ -837,75 +837,75 @@ WELS_EXTERN WelsI4x4LumaPredHU_mmx ; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) ;*********************************************************************** WELS_EXTERN WelsI4x4LumaPredVR_mmx - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - sub r1, r2 - movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt] - psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx] + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt] + psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx] - movd mm1, [r1+2*r2-4] - punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1 - lea r1, [r1+2*r2] - movq mm2, [r1+r2-8] ; mm2[7] = l2 - punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx] - psrlq mm2, 28h - pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2] + movd mm1, [r1+2*r2-4] + punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1 + lea r1, [r1+2*r2] + movq mm2, [r1+r2-8] ; mm2[7] = l2 + punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx] + psrlq mm2, 28h + pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2] - movq mm1, mm0 - psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx] - pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx] + movq mm1, mm0 + psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx] + pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx] - movq mm2, mm0 - psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx] - movq mm3, mm2 - pavgb mm2, mm0 + movq mm2, mm0 + psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx] + movq mm3, mm2 + pavgb mm2, mm0 - pxor mm3, mm0 ; find odd value in the lowest bit of each byte - pand mm3, [mmx_01bytes] ; set the odd bit - psubusb mm2, mm3 ; decrease 1 from odd bytes + pxor mm3, mm0 ; find odd value in the lowest bit of each byte + pand mm3, [mmx_01bytes] ; set the odd bit + psubusb mm2, mm3 ; decrease 1 from odd bytes - movq mm3, mm0 - psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx] - pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx] - movq mm2, mm3 + movq mm3, mm0 + psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx] + pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx] + movq mm2, mm3 - psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a] - movd [r0], mm1 + psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a] + movd [r0], mm1 - psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e] - movd [r0+4], mm2 + psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e] + movd [r0+4], mm2 - movq mm4, mm3 - psllq mm4, 
20h - psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i] + movq mm4, mm3 + psllq mm4, 20h + psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i] - movq mm5, mm3 - psllq mm5, 28h - psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j] + movq mm5, mm3 + psllq mm5, 28h + psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j] - psllq mm1, 8h - pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i] - movd [r0+8], mm4 + psllq mm1, 8h + pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i] + movd [r0+8], mm4 - psllq mm2, 8h - pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j] - movd [r0+12], mm5 - WELSEMMS - ret + psllq mm2, 8h + pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j] + movd [r0+12], mm5 + WELSEMMS + ret ;*********************************************************************** -; lt|t0|t1|t2|t3|t4|t5|t6|t7 -; l0| -; l1| -; l2| -; l3| -; lt,t0,t1,t2,t3 will never been used +; lt|t0|t1|t2|t3|t4|t5|t6|t7 +; l0| +; l1| +; l2| +; l3| +; lt,t0,t1,t2,t3 will never been used ; destination: -; |a |b |c |d | -; |b |c |d |e | -; |c |d |e |f | -; |d |e |f |g | +; |a |b |c |d | +; |b |c |d |e | +; |c |d |e |f | +; |d |e |f |g | ; a = (2 + t0 + t2 + (t1<<1))>>2 ; b = (2 + t1 + t3 + (t2<<1))>>2 @@ -921,54 +921,54 @@ WELS_EXTERN WelsI4x4LumaPredVR_mmx ; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) ;*********************************************************************** WELS_EXTERN WelsI4x4LumaPredDDL_mmx - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - sub r1, r2 - movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] - movq mm1, mm0 - movq mm2, mm0 + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] + movq mm1, mm0 + movq mm2, mm0 - movq mm3, mm0 - psrlq mm3, 38h - psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx] + movq mm3, mm0 + psrlq mm3, 38h + psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx] - psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx] - psrlq mm2, 8h - pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1] + psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx] + psrlq mm2, 8h + pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1] - movq mm3, mm1 - pavgb mm1, mm2 - pxor mm3, mm2 ; find odd value in the lowest bit of each byte - pand mm3, [mmx_01bytes] ; set the odd bit - psubusb mm1, mm3 ; decrease 1 from odd bytes + movq mm3, mm1 + pavgb mm1, mm2 + pxor mm3, mm2 ; find odd value in the lowest bit of each byte + pand mm3, [mmx_01bytes] ; set the odd bit + psubusb mm1, mm3 ; decrease 1 from odd bytes - pavgb mm0, mm1 ; mm0 = [g f e d c b a xx] + pavgb mm0, mm1 ; mm0 = [g f e d c b a xx] - psrlq mm0, 8h - movd [r0], mm0 - psrlq mm0, 8h - movd [r0+4], mm0 - psrlq mm0, 8h - movd [r0+8], mm0 - psrlq mm0, 8h - movd [r0+12], mm0 - WELSEMMS - ret + psrlq mm0, 8h + movd [r0], mm0 + psrlq mm0, 8h + movd [r0+4], mm0 + psrlq mm0, 8h + movd [r0+8], mm0 + psrlq mm0, 8h + movd [r0+12], mm0 + WELSEMMS + ret ;*********************************************************************** -; lt|t0|t1|t2|t3|t4|t5|t6|t7 -; l0| -; l1| -; l2| -; l3| -; lt,t0,t1,t2,t3 will never been used +; lt|t0|t1|t2|t3|t4|t5|t6|t7 +; l0| +; l1| +; l2| +; l3| +; lt,t0,t1,t2,t3 will never been used ; destination: -; |a |b |c |d | -; |e |f |g |h | -; |b |c |d |i | -; |f |g |h |j | +; |a |b |c |d | +; |e |f |g |h | +; |b |c |d |i | +; |f |g |h |j | ; a = (1 + t0 + t1)>>1 ; b = (1 + t1 + t2)>>1 @@ -987,125 +987,125 @@ WELS_EXTERN WelsI4x4LumaPredDDL_mmx ; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride) ;*********************************************************************** 
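
The WelsI4x4LumaPredVR_mmx and WelsI4x4LumaPredDDL_mmx routines above, and WelsI4x4LumaPredVL_mmx below, all build the H.264 three-tap filter (x + 2y + z + 2) >> 2 out of pavgb plus the "decrease 1 from odd bytes" correction. The scalar sketch below (plain C; the helper and variable names are made up for illustration and are not part of the patch) checks the identity those routines rely on over all byte values: subtracting the low bit of (a ^ c) turns pavgb's round-up average into a floor average, and a second round-up average against the centre sample then equals the three-tap filter.

    #include <assert.h>
    #include <stdint.h>

    /* Rounding byte average, the operation pavgb performs per byte. */
    static uint8_t avg_round(uint8_t x, uint8_t y) {
        return (uint8_t)((x + y + 1) >> 1);
    }

    int main(void) {
        for (int a = 0; a < 256; ++a)
            for (int b = 0; b < 256; ++b)
                for (int c = 0; c < 256; ++c) {
                    /* pavgb rounds up; clearing that rounding via the low bit of
                       (a ^ c) gives the floor average, as the pxor/pand/psubusb
                       sequence does per byte. */
                    uint8_t floor_ac = (uint8_t)(avg_round((uint8_t)a, (uint8_t)c) - ((a ^ c) & 1));
                    uint8_t three_tap = avg_round((uint8_t)b, floor_ac);
                    assert(three_tap == ((a + 2 * b + c + 2) >> 2));
                }
        return 0;
    }
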
WELS_EXTERN WelsI4x4LumaPredVL_mmx - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - sub r1, r2 - movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] - movq mm1, mm0 - movq mm2, mm0 + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] + movq mm1, mm0 + movq mm2, mm0 - psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1] - psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2] + psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1] + psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2] - movq mm3, mm1 - pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a] + movq mm3, mm1 + pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a] - movq mm4, mm2 - pavgb mm2, mm0 - pxor mm4, mm0 ; find odd value in the lowest bit of each byte - pand mm4, [mmx_01bytes] ; set the odd bit - psubusb mm2, mm4 ; decrease 1 from odd bytes + movq mm4, mm2 + pavgb mm2, mm0 + pxor mm4, mm0 ; find odd value in the lowest bit of each byte + pand mm4, [mmx_01bytes] ; set the odd bit + psubusb mm2, mm4 ; decrease 1 from odd bytes - pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e] + pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e] - movd [r0], mm3 - psrlq mm3, 8h - movd [r0+8], mm3 + movd [r0], mm3 + psrlq mm3, 8h + movd [r0+8], mm3 - movd [r0+4], mm2 - psrlq mm2, 8h - movd [r0+12], mm2 - WELSEMMS - ret + movd [r0+4], mm2 + psrlq mm2, 8h + movd [r0+12], mm2 + WELSEMMS + ret ;*********************************************************************** ; ; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) ;*********************************************************************** WELS_EXTERN WelsIChromaPredDc_sse2 - push r3 - push r4 - %assign push_num 2 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - sub r1, r2 - movq mm0, [r1] + push r3 + push r4 + %assign push_num 2 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + movq mm0, [r1] - movzx r3, byte [r1+r2-0x01] ; l1 - lea r1, [r1+2*r2] - movzx r4, byte [r1-0x01] ; l2 - add r3, r4 - movzx r4, byte [r1+r2-0x01] ; l3 - add r3, r4 - lea r1, [r1+2*r2] - movzx r4, byte [r1-0x01] ; l4 - add r3, r4 - movd mm1, r3d ; mm1 = l1+l2+l3+l4 + movzx r3, byte [r1+r2-0x01] ; l1 + lea r1, [r1+2*r2] + movzx r4, byte [r1-0x01] ; l2 + add r3, r4 + movzx r4, byte [r1+r2-0x01] ; l3 + add r3, r4 + lea r1, [r1+2*r2] + movzx r4, byte [r1-0x01] ; l4 + add r3, r4 + movd mm1, r3d ; mm1 = l1+l2+l3+l4 - movzx r3, byte [r1+r2-0x01] ; l5 - lea r1, [r1+2*r2] - movzx r4, byte [r1-0x01] ; l6 - add r3, r4 - movzx r4, byte [r1+r2-0x01] ; l7 - add r3, r4 - lea r1, [r1+2*r2] - movzx r4, byte [r1-0x01] ; l8 - add r3, r4 - movd mm2, r3d ; mm2 = l5+l6+l7+l8 + movzx r3, byte [r1+r2-0x01] ; l5 + lea r1, [r1+2*r2] + movzx r4, byte [r1-0x01] ; l6 + add r3, r4 + movzx r4, byte [r1+r2-0x01] ; l7 + add r3, r4 + lea r1, [r1+2*r2] + movzx r4, byte [r1-0x01] ; l8 + add r3, r4 + movd mm2, r3d ; mm2 = l5+l6+l7+l8 - movq mm3, mm0 - psrlq mm0, 0x20 - psllq mm3, 0x20 - psrlq mm3, 0x20 - pxor mm4, mm4 - psadbw mm0, mm4 - psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2 + movq mm3, mm0 + psrlq mm0, 0x20 + psllq mm3, 0x20 + psrlq mm3, 0x20 + pxor mm4, mm4 + psadbw mm0, mm4 + psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2 - paddq mm3, mm1 - movq mm1, mm2 - paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1 + paddq mm3, mm1 + movq mm1, mm2 + paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1 - movq mm4, [mmx_0x02] + movq mm4, [mmx_0x02] - paddq mm0, mm4 - psrlq mm0, 0x02 + paddq mm0, mm4 + psrlq mm0, 0x02 - paddq mm2, mm4 - psrlq mm2, 0x02 + paddq mm2, 
mm4 + psrlq mm2, 0x02 - paddq mm3, mm4 - paddq mm3, mm4 - psrlq mm3, 0x03 + paddq mm3, mm4 + paddq mm3, mm4 + psrlq mm3, 0x03 - paddq mm1, mm4 - paddq mm1, mm4 - psrlq mm1, 0x03 + paddq mm1, mm4 + paddq mm1, mm4 + psrlq mm1, 0x03 - pmuludq mm0, [mmx_01bytes] - pmuludq mm3, [mmx_01bytes] - psllq mm0, 0x20 - pxor mm0, mm3 ; mm0 = m_up + pmuludq mm0, [mmx_01bytes] + pmuludq mm3, [mmx_01bytes] + psllq mm0, 0x20 + pxor mm0, mm3 ; mm0 = m_up - pmuludq mm2, [mmx_01bytes] - pmuludq mm1, [mmx_01bytes] - psllq mm1, 0x20 - pxor mm1, mm2 ; mm2 = m_down + pmuludq mm2, [mmx_01bytes] + pmuludq mm1, [mmx_01bytes] + psllq mm1, 0x20 + pxor mm1, mm2 ; mm2 = m_down - movq [r0], mm0 - movq [r0+0x08], mm0 - movq [r0+0x10], mm0 - movq [r0+0x18], mm0 + movq [r0], mm0 + movq [r0+0x08], mm0 + movq [r0+0x10], mm0 + movq [r0+0x18], mm0 - movq [r0+0x20], mm1 - movq [r0+0x28], mm1 - movq [r0+0x30], mm1 - movq [r0+0x38], mm1 + movq [r0+0x20], mm1 + movq [r0+0x28], mm1 + movq [r0+0x30], mm1 + movq [r0+0x38], mm1 - pop r4 - pop r3 - WELSEMMS - ret + pop r4 + pop r3 + WELSEMMS + ret @@ -1114,56 +1114,56 @@ WELS_EXTERN WelsIChromaPredDc_sse2 ; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride) ;*********************************************************************** WELS_EXTERN WelsI16x16LumaPredDc_sse2 - push r3 - push r4 - %assign push_num 2 - LOAD_3_PARA - SIGN_EXTENSION r2, r2d - sub r1, r2 - movdqa xmm0, [r1] ; read one row - pxor xmm1, xmm1 - psadbw xmm0, xmm1 - movdqa xmm1, xmm0 - psrldq xmm1, 0x08 - pslldq xmm0, 0x08 - psrldq xmm0, 0x08 - paddw xmm0, xmm1 + push r3 + push r4 + %assign push_num 2 + LOAD_3_PARA + SIGN_EXTENSION r2, r2d + sub r1, r2 + movdqa xmm0, [r1] ; read one row + pxor xmm1, xmm1 + psadbw xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 0x08 + pslldq xmm0, 0x08 + psrldq xmm0, 0x08 + paddw xmm0, xmm1 - movzx r3, byte [r1+r2-0x01] - movzx r4, byte [r1+2*r2-0x01] - add r3, r4 - lea r1, [r1+r2] - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - LOAD_2_LEFT_AND_ADD - add r3, 0x10 - movd xmm1, r3d - paddw xmm0, xmm1 - psrld xmm0, 0x05 - pmuludq xmm0, [mmx_01bytes] - pshufd xmm0, xmm0, 0 + movzx r3, byte [r1+r2-0x01] + movzx r4, byte [r1+2*r2-0x01] + add r3, r4 + lea r1, [r1+r2] + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + LOAD_2_LEFT_AND_ADD + add r3, 0x10 + movd xmm1, r3d + paddw xmm0, xmm1 + psrld xmm0, 0x05 + pmuludq xmm0, [mmx_01bytes] + pshufd xmm0, xmm0, 0 - movdqa [r0], xmm0 - movdqa [r0+0x10], xmm0 - movdqa [r0+0x20], xmm0 - movdqa [r0+0x30], xmm0 - movdqa [r0+0x40], xmm0 - movdqa [r0+0x50], xmm0 - movdqa [r0+0x60], xmm0 - movdqa [r0+0x70], xmm0 - movdqa [r0+0x80], xmm0 - movdqa [r0+0x90], xmm0 - movdqa [r0+0xa0], xmm0 - movdqa [r0+0xb0], xmm0 - movdqa [r0+0xc0], xmm0 - movdqa [r0+0xd0], xmm0 - movdqa [r0+0xe0], xmm0 - movdqa [r0+0xf0], xmm0 + movdqa [r0], xmm0 + movdqa [r0+0x10], xmm0 + movdqa [r0+0x20], xmm0 + movdqa [r0+0x30], xmm0 + movdqa [r0+0x40], xmm0 + movdqa [r0+0x50], xmm0 + movdqa [r0+0x60], xmm0 + movdqa [r0+0x70], xmm0 + movdqa [r0+0x80], xmm0 + movdqa [r0+0x90], xmm0 + movdqa [r0+0xa0], xmm0 + movdqa [r0+0xb0], xmm0 + movdqa [r0+0xc0], xmm0 + movdqa [r0+0xd0], xmm0 + movdqa [r0+0xe0], xmm0 + movdqa [r0+0xf0], xmm0 - pop r4 - pop r3 - ret \ No newline at end of file + pop r4 + pop r3 + ret \ No newline at end of file diff --git a/codec/encoder/core/x86/matrix_transpose.asm 
b/codec/encoder/core/x86/matrix_transpose.asm index d187a9fc..98fe000b 100644 --- a/codec/encoder/core/x86/matrix_transpose.asm +++ b/codec/encoder/core/x86/matrix_transpose.asm @@ -34,362 +34,362 @@ ;in: m0, m1, m2, m3, m4, m5, m6, m7 ;out: m0, m3, m5, m2, m7, m1, m6, m4 %macro TRANSPOSE_8x8B_MMX 10 - MMX_XSwap bw, %1, %2, %8 - MMX_XSwap bw, %3, %4, %2 - MMX_XSwap bw, %5, %6, %4 - movq %6, %9 - movq %10, %4 - MMX_XSwap bw, %7, %6, %4 + MMX_XSwap bw, %1, %2, %8 + MMX_XSwap bw, %3, %4, %2 + MMX_XSwap bw, %5, %6, %4 + movq %6, %9 + movq %10, %4 + MMX_XSwap bw, %7, %6, %4 - MMX_XSwap wd, %1, %3, %6 - MMX_XSwap wd, %8, %2, %3 - MMX_XSwap wd, %5, %7, %2 - movq %7, %10 - movq %10, %3 - MMX_XSwap wd, %7, %4, %3 + MMX_XSwap wd, %1, %3, %6 + MMX_XSwap wd, %8, %2, %3 + MMX_XSwap wd, %5, %7, %2 + movq %7, %10 + movq %10, %3 + MMX_XSwap wd, %7, %4, %3 - MMX_XSwap dq, %1, %5, %4 - MMX_XSwap dq, %6, %2, %5 - MMX_XSwap dq, %8, %7, %2 - movq %7, %10 - movq %10, %5 - MMX_XSwap dq, %7, %3, %5 + MMX_XSwap dq, %1, %5, %4 + MMX_XSwap dq, %6, %2, %5 + MMX_XSwap dq, %8, %7, %2 + movq %7, %10 + movq %10, %5 + MMX_XSwap dq, %7, %3, %5 - movq %3, %10 + movq %3, %10 %endmacro ;in: m0, m3, m5, m2, m7, m1, m6, m4 -%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride - movq [%1], mm0 ; result of line 1, x8 bytes - movq [%1+%2], mm3 ; result of line 2 - lea %1, [%1+2*%2] - movq [%1], mm5 ; result of line 3 - movq [%1+%2], mm2 ; result of line 4 - lea %1, [%1+2*%2] - movq [%1], mm7 ; result of line 5 - movq [%1+%2], mm1 ; result of line 6 - lea %1, [%1+2*%2] - movq [%1], mm6 ; result of line 7 - movq [%1+%2], mm4 ; result of line 8 +%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride + movq [%1], mm0 ; result of line 1, x8 bytes + movq [%1+%2], mm3 ; result of line 2 + lea %1, [%1+2*%2] + movq [%1], mm5 ; result of line 3 + movq [%1+%2], mm2 ; result of line 4 + lea %1, [%1+2*%2] + movq [%1], mm7 ; result of line 5 + movq [%1+%2], mm1 ; result of line 6 + lea %1, [%1+2*%2] + movq [%1], mm6 ; result of line 7 + movq [%1+%2], mm4 ; result of line 8 %endmacro ;in: m0, m3, m5, m2, m7, m1, m6, m4 -%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32 - movq [%1], mm0 ; result of line 1, x8 bytes - movq [%1+%2], mm3 ; result of line 2 - lea %3, [%1+2*%2] - movq [%3], mm5 ; result of line 3 - movq [%3+%2], mm2 ; result of line 4 - lea %3, [%3+2*%2] - movq [%3], mm7 ; result of line 5 - movq [%3+%2], mm1 ; result of line 6 - lea %3, [%3+2*%2] - movq [%3], mm6 ; result of line 7 - movq [%3+%2], mm4 ; result of line 8 -%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX +%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32 + movq [%1], mm0 ; result of line 1, x8 bytes + movq [%1+%2], mm3 ; result of line 2 + lea %3, [%1+2*%2] + movq [%3], mm5 ; result of line 3 + movq [%3+%2], mm2 ; result of line 4 + lea %3, [%3+2*%2] + movq [%3], mm7 ; result of line 5 + movq [%3+%2], mm1 ; result of line 6 + lea %3, [%3+2*%2] + movq [%3], mm6 ; result of line 7 + movq [%3+%2], mm4 ; result of line 8 +%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX ; for transpose 16x8 ;in: m0, m1, m2, m3, m4, m5, m6, m7 ;out: m4, m2, m3, m7, m5, m1, m6, m0 -%macro TRANSPOSE_8x16B_SSE2 10 - SSE2_XSawp bw, %1, %2, %8 - SSE2_XSawp bw, %3, %4, %2 - SSE2_XSawp bw, %5, %6, %4 - movdqa %6, %9 - movdqa %10, %4 - SSE2_XSawp bw, %7, %6, %4 +%macro TRANSPOSE_8x16B_SSE2 10 + SSE2_XSawp bw, %1, %2, %8 + SSE2_XSawp bw, %3, %4, %2 + SSE2_XSawp bw, %5, %6, %4 + movdqa %6, %9 + movdqa %10, %4 + SSE2_XSawp bw, %7, %6, %4 - SSE2_XSawp wd, %1, %3, %6 - SSE2_XSawp wd, %8, %2, %3 
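; Note on the transpose macros in this file: each MMX_XSwap / SSE2_XSawp pass
; interleaves progressively wider elements (bytes, then words, then dwords, and
; for the SSE2 8x16 variant finally qwords). %9 supplies the eighth input row
; and %10 is a memory spill slot for the temporary that would otherwise need a
; ninth register. The resulting row order (m4, m2, m3, m7, m5, m1, m6, m0 for
; the SSE2 macro) is exactly the order the TRANSPOSE8x16_WRITE_* macros below
; store the lines in.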
- SSE2_XSawp wd, %5, %7, %2 - movdqa %7, %10 - movdqa %10, %3 - SSE2_XSawp wd, %7, %4, %3 + SSE2_XSawp wd, %1, %3, %6 + SSE2_XSawp wd, %8, %2, %3 + SSE2_XSawp wd, %5, %7, %2 + movdqa %7, %10 + movdqa %10, %3 + SSE2_XSawp wd, %7, %4, %3 - SSE2_XSawp dq, %1, %5, %4 - SSE2_XSawp dq, %6, %2, %5 - SSE2_XSawp dq, %8, %7, %2 - movdqa %7, %10 - movdqa %10, %5 - SSE2_XSawp dq, %7, %3, %5 + SSE2_XSawp dq, %1, %5, %4 + SSE2_XSawp dq, %6, %2, %5 + SSE2_XSawp dq, %8, %7, %2 + movdqa %7, %10 + movdqa %10, %5 + SSE2_XSawp dq, %7, %3, %5 - SSE2_XSawp qdq, %1, %8, %3 - SSE2_XSawp qdq, %4, %2, %8 - SSE2_XSawp qdq, %6, %7, %2 - movdqa %7, %10 - movdqa %10, %1 - SSE2_XSawp qdq, %7, %5, %1 - movdqa %5, %10 -%endmacro ; end of TRANSPOSE_8x16B_SSE2 + SSE2_XSawp qdq, %1, %8, %3 + SSE2_XSawp qdq, %4, %2, %8 + SSE2_XSawp qdq, %6, %7, %2 + movdqa %7, %10 + movdqa %10, %1 + SSE2_XSawp qdq, %7, %5, %1 + movdqa %5, %10 +%endmacro ; end of TRANSPOSE_8x16B_SSE2 -%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride - movq [%1], xmm4 ; result of line 1, x8 bytes - movq [%1+%2], xmm2 ; result of line 2 - lea %1, [%1+2*%2] - movq [%1], xmm3 ; result of line 3 - movq [%1+%2], xmm7 ; result of line 4 +%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride + movq [%1], xmm4 ; result of line 1, x8 bytes + movq [%1+%2], xmm2 ; result of line 2 + lea %1, [%1+2*%2] + movq [%1], xmm3 ; result of line 3 + movq [%1+%2], xmm7 ; result of line 4 - lea %1, [%1+2*%2] - movq [%1], xmm5 ; result of line 5 - movq [%1+%2], xmm1 ; result of line 6 - lea %1, [%1+2*%2] - movq [%1], xmm6 ; result of line 7 - movq [%1+%2], xmm0 ; result of line 8 + lea %1, [%1+2*%2] + movq [%1], xmm5 ; result of line 5 + movq [%1+%2], xmm1 ; result of line 6 + lea %1, [%1+2*%2] + movq [%1], xmm6 ; result of line 7 + movq [%1+%2], xmm0 ; result of line 8 - lea %1, [%1+2*%2] - movhpd [%1], xmm4 ; result of line 9 - movhpd [%1+%2], xmm2 ; result of line 10 - lea %1, [%1+2*%2] - movhpd [%1], xmm3 ; result of line 11 - movhpd [%1+%2], xmm7 ; result of line 12 + lea %1, [%1+2*%2] + movhpd [%1], xmm4 ; result of line 9 + movhpd [%1+%2], xmm2 ; result of line 10 + lea %1, [%1+2*%2] + movhpd [%1], xmm3 ; result of line 11 + movhpd [%1+%2], xmm7 ; result of line 12 - lea %1, [%1+2*%2] - movhpd [%1], xmm5 ; result of line 13 - movhpd [%1+%2], xmm1 ; result of line 14 - lea %1, [%1+2*%2] - movhpd [%1], xmm6 ; result of line 15 - movhpd [%1+%2], xmm0 ; result of line 16 -%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2 + lea %1, [%1+2*%2] + movhpd [%1], xmm5 ; result of line 13 + movhpd [%1+%2], xmm1 ; result of line 14 + lea %1, [%1+2*%2] + movhpd [%1], xmm6 ; result of line 15 + movhpd [%1+%2], xmm0 ; result of line 16 +%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2 -%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32 - movq [%1], xmm4 ; result of line 1, x8 bytes - movq [%1+%2], xmm2 ; result of line 2 - lea %3, [%1+2*%2] - movq [%3], xmm3 ; result of line 3 - movq [%3+%2], xmm7 ; result of line 4 +%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32 + movq [%1], xmm4 ; result of line 1, x8 bytes + movq [%1+%2], xmm2 ; result of line 2 + lea %3, [%1+2*%2] + movq [%3], xmm3 ; result of line 3 + movq [%3+%2], xmm7 ; result of line 4 - lea %3, [%3+2*%2] - movq [%3], xmm5 ; result of line 5 - movq [%3+%2], xmm1 ; result of line 6 - lea %3, [%3+2*%2] - movq [%3], xmm6 ; result of line 7 - movq [%3+%2], xmm0 ; result of line 8 + lea %3, [%3+2*%2] + movq [%3], xmm5 ; result of line 5 + movq [%3+%2], xmm1 ; result of line 6 + lea %3, [%3+2*%2] + movq [%3], xmm6 ; 
result of line 7 + movq [%3+%2], xmm0 ; result of line 8 - lea %3, [%3+2*%2] - movhpd [%3], xmm4 ; result of line 9 - movhpd [%3+%2], xmm2 ; result of line 10 - lea %3, [%3+2*%2] - movhpd [%3], xmm3 ; result of line 11 - movhpd [%3+%2], xmm7 ; result of line 12 + lea %3, [%3+2*%2] + movhpd [%3], xmm4 ; result of line 9 + movhpd [%3+%2], xmm2 ; result of line 10 + lea %3, [%3+2*%2] + movhpd [%3], xmm3 ; result of line 11 + movhpd [%3+%2], xmm7 ; result of line 12 - lea %3, [%3+2*%2] - movhpd [%3], xmm5 ; result of line 13 - movhpd [%3+%2], xmm1 ; result of line 14 - lea %3, [%3+2*%2] - movhpd [%3], xmm6 ; result of line 15 - movhpd [%3+%2], xmm0 ; result of line 16 -%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2 + lea %3, [%3+2*%2] + movhpd [%3], xmm5 ; result of line 13 + movhpd [%3+%2], xmm1 ; result of line 14 + lea %3, [%3+2*%2] + movhpd [%3], xmm6 ; result of line 15 + movhpd [%3+%2], xmm0 ; result of line 16 +%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2 SECTION .text WELS_EXTERN TransposeMatrixBlock16x16_sse2 ; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride ); - push r4 - push r5 - %assign push_num 2 - LOAD_4_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d + push r4 + push r5 + %assign push_num 2 + LOAD_4_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d - mov r4, r7 - and r4, 0Fh - sub r7, 10h - sub r7, r4 - lea r5, [r3+r3*2] - ; top 8x16 block - movdqa xmm0, [r2] - movdqa xmm1, [r2+r3] - movdqa xmm2, [r2+r3*2] - movdqa xmm3, [r2+r5] - lea r2, [r2+r3*4] - movdqa xmm4, [r2] - movdqa xmm5, [r2+r3] - movdqa xmm6, [r2+r3*2] + mov r4, r7 + and r4, 0Fh + sub r7, 10h + sub r7, r4 + lea r5, [r3+r3*2] + ; top 8x16 block + movdqa xmm0, [r2] + movdqa xmm1, [r2+r3] + movdqa xmm2, [r2+r3*2] + movdqa xmm3, [r2+r5] + lea r2, [r2+r3*4] + movdqa xmm4, [r2] + movdqa xmm5, [r2+r3] + movdqa xmm6, [r2+r3*2] - ;in: m0, m1, m2, m3, m4, m5, m6, m7 - ;out: m4, m2, m3, m7, m5, m1, m6, m0 - TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7] + ;in: m0, m1, m2, m3, m4, m5, m6, m7 + ;out: m4, m2, m3, m7, m5, m1, m6, m0 + TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7] - TRANSPOSE8x16_WRITE_SSE2 r0, r1 + TRANSPOSE8x16_WRITE_SSE2 r0, r1 - ; bottom 8x16 block - lea r2, [r2+r3*4] - movdqa xmm0, [r2] - movdqa xmm1, [r2+r3] - movdqa xmm2, [r2+r3*2] - movdqa xmm3, [r2+r5] - lea r2, [r2+r3*4] - movdqa xmm4, [r2] - movdqa xmm5, [r2+r3] - movdqa xmm6, [r2+r3*2] + ; bottom 8x16 block + lea r2, [r2+r3*4] + movdqa xmm0, [r2] + movdqa xmm1, [r2+r3] + movdqa xmm2, [r2+r3*2] + movdqa xmm3, [r2+r5] + lea r2, [r2+r3*4] + movdqa xmm4, [r2] + movdqa xmm5, [r2+r3] + movdqa xmm6, [r2+r3*2] - ;in: m0, m1, m2, m3, m4, m5, m6, m7 - ;out: m4, m2, m3, m7, m5, m1, m6, m0 - TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7] + ;in: m0, m1, m2, m3, m4, m5, m6, m7 + ;out: m4, m2, m3, m7, m5, m1, m6, m0 + TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7] - mov r5, r1 - sal r5, 4 - sub r0, r5 - lea r0, [r0+r1*2+8] - TRANSPOSE8x16_WRITE_SSE2 r0, r1 + mov r5, r1 + sal r5, 4 + sub r0, r5 + lea r0, [r0+r1*2+8] + TRANSPOSE8x16_WRITE_SSE2 r0, r1 - add r7, r4 - add r7, 10h - POP_XMM - LOAD_4_PARA_POP - pop r5 - pop r4 - ret + add r7, r4 + add r7, 10h + POP_XMM + LOAD_4_PARA_POP + pop r5 + pop r4 + ret WELS_EXTERN TransposeMatrixBlocksx16_sse2 ; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, 
const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks ); - push r5 - push r6 - %assign push_num 2 - LOAD_5_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - mov r5, r7 - and r5, 0Fh - sub r7, 10h - sub r7, r5 + push r5 + push r6 + %assign push_num 2 + LOAD_5_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + mov r5, r7 + and r5, 0Fh + sub r7, 10h + sub r7, r5 TRANSPOSE_LOOP_SSE2: - ; explictly loading next loop data - lea r6, [r2+r3*8] - push r4 + ; explictly loading next loop data + lea r6, [r2+r3*8] + push r4 %rep 8 - mov r4, [r6] - mov r4, [r6+r3] - lea r6, [r6+r3*2] + mov r4, [r6] + mov r4, [r6+r3] + lea r6, [r6+r3*2] %endrep - pop r4 - ; top 8x16 block - movdqa xmm0, [r2] - movdqa xmm1, [r2+r3] - lea r2, [r2+r3*2] - movdqa xmm2, [r2] - movdqa xmm3, [r2+r3] - lea r2, [r2+r3*2] - movdqa xmm4, [r2] - movdqa xmm5, [r2+r3] - lea r2, [r2+r3*2] - movdqa xmm6, [r2] + pop r4 + ; top 8x16 block + movdqa xmm0, [r2] + movdqa xmm1, [r2+r3] + lea r2, [r2+r3*2] + movdqa xmm2, [r2] + movdqa xmm3, [r2+r3] + lea r2, [r2+r3*2] + movdqa xmm4, [r2] + movdqa xmm5, [r2+r3] + lea r2, [r2+r3*2] + movdqa xmm6, [r2] - ;in: m0, m1, m2, m3, m4, m5, m6, m7 - ;out: m4, m2, m3, m7, m5, m1, m6, m0 - TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7] - TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6 - lea r2, [r2+r3*2] + ;in: m0, m1, m2, m3, m4, m5, m6, m7 + ;out: m4, m2, m3, m7, m5, m1, m6, m0 + TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7] + TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6 + lea r2, [r2+r3*2] - ; bottom 8x16 block - movdqa xmm0, [r2] - movdqa xmm1, [r2+r3] - lea r2, [r2+r3*2] - movdqa xmm2, [r2] - movdqa xmm3, [r2+r3] - lea r2, [r2+r3*2] - movdqa xmm4, [r2] - movdqa xmm5, [r2+r3] - lea r2, [r2+r3*2] - movdqa xmm6, [r2] + ; bottom 8x16 block + movdqa xmm0, [r2] + movdqa xmm1, [r2+r3] + lea r2, [r2+r3*2] + movdqa xmm2, [r2] + movdqa xmm3, [r2+r3] + lea r2, [r2+r3*2] + movdqa xmm4, [r2] + movdqa xmm5, [r2+r3] + lea r2, [r2+r3*2] + movdqa xmm6, [r2] - ;in: m0, m1, m2, m3, m4, m5, m6, m7 - ;out: m4, m2, m3, m7, m5, m1, m6, m0 - TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7] - TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6 - lea r2, [r2+r3*2] - lea r0, [r0+16] - dec r4 - jg near TRANSPOSE_LOOP_SSE2 + ;in: m0, m1, m2, m3, m4, m5, m6, m7 + ;out: m4, m2, m3, m7, m5, m1, m6, m0 + TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7] + TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6 + lea r2, [r2+r3*2] + lea r0, [r0+16] + dec r4 + jg near TRANSPOSE_LOOP_SSE2 - add r7, r5 - add r7, 10h - POP_XMM - LOAD_5_PARA_POP - pop r6 - pop r5 - ret + add r7, r5 + add r7, 10h + POP_XMM + LOAD_5_PARA_POP + pop r6 + pop r5 + ret WELS_EXTERN TransposeMatrixBlock8x8_mmx ; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride ); - %assign push_num 0 - LOAD_4_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - sub r7, 8 + %assign push_num 0 + LOAD_4_PARA + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + sub r7, 8 - movq mm0, [r2] - movq mm1, [r2+r3] - lea r2, [r2+2*r3] - movq mm2, [r2] - movq mm3, [r2+r3] - lea r2, [r2+2*r3] - movq mm4, [r2] - movq mm5, [r2+r3] - lea r2, [r2+2*r3] - movq mm6, [r2] + movq mm0, [r2] + movq mm1, [r2+r3] + lea r2, [r2+2*r3] + movq mm2, [r2] + movq mm3, [r2+r3] + lea r2, [r2+2*r3] + movq mm4, 
[r2] + movq mm5, [r2+r3] + lea r2, [r2+2*r3] + movq mm6, [r2] - ;in: m0, m1, m2, m3, m4, m5, m6, m7 - ;out: m0, m3, m5, m2, m7, m1, m6, m4 - TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7] + ;in: m0, m1, m2, m3, m4, m5, m6, m7 + ;out: m0, m3, m5, m2, m7, m1, m6, m4 + TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7] - TRANSPOSE8x8_WRITE_MMX r0, r1 + TRANSPOSE8x8_WRITE_MMX r0, r1 - emms - add r7, 8 - LOAD_4_PARA_POP - ret + emms + add r7, 8 + LOAD_4_PARA_POP + ret WELS_EXTERN TransposeMatrixBlocksx8_mmx ; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks ); - push r5 - push r6 - %assign push_num 2 - LOAD_5_PARA - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - SIGN_EXTENSION r4, r4d - sub r7, 8 + push r5 + push r6 + %assign push_num 2 + LOAD_5_PARA + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + SIGN_EXTENSION r4, r4d + sub r7, 8 - lea r5, [r2+r3*8] + lea r5, [r2+r3*8] TRANSPOSE_BLOCKS_X8_LOOP_MMX: - ; explictly loading next loop data + ; explictly loading next loop data %rep 4 - mov r6, [r5] - mov r6, [r5+r3] - lea r5, [r5+r3*2] + mov r6, [r5] + mov r6, [r5+r3] + lea r5, [r5+r3*2] %endrep - movq mm0, [r2] - movq mm1, [r2+r3] - lea r2, [r2+2*r3] - movq mm2, [r2] - movq mm3, [r2+r3] - lea r2, [r2+2*r3] - movq mm4, [r2] - movq mm5, [r2+r3] - lea r2, [r2+2*r3] - movq mm6, [r2] + movq mm0, [r2] + movq mm1, [r2+r3] + lea r2, [r2+2*r3] + movq mm2, [r2] + movq mm3, [r2+r3] + lea r2, [r2+2*r3] + movq mm4, [r2] + movq mm5, [r2+r3] + lea r2, [r2+2*r3] + movq mm6, [r2] - ;in: m0, m1, m2, m3, m4, m5, m6, m7 - ;out: m0, m3, m5, m2, m7, m1, m6, m4 - TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7] + ;in: m0, m1, m2, m3, m4, m5, m6, m7 + ;out: m0, m3, m5, m2, m7, m1, m6, m4 + TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7] - TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6 - lea r0, [r0+8] - lea r2, [r2+2*r3] - dec r4 - jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX + TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6 + lea r0, [r0+8] + lea r2, [r2+2*r3] + dec r4 + jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX - emms - add r7, 8 - LOAD_5_PARA_POP - pop r6 - pop r5 - ret + emms + add r7, 8 + LOAD_5_PARA_POP + pop r6 + pop r5 + ret diff --git a/codec/encoder/core/x86/memzero.asm b/codec/encoder/core/x86/memzero.asm index f6e94207..a95c6497 100644 --- a/codec/encoder/core/x86/memzero.asm +++ b/codec/encoder/core/x86/memzero.asm @@ -51,10 +51,10 @@ SECTION .text ;void WelsPrefetchZero_mmx(int8_t const*_A); ;*********************************************************************** WELS_EXTERN WelsPrefetchZero_mmx - %assign push_num 0 - LOAD_1_PARA - prefetchnta [r0] - ret + %assign push_num 0 + LOAD_1_PARA + prefetchnta [r0] + ret ;*********************************************************************** @@ -62,71 +62,71 @@ WELS_EXTERN WelsPrefetchZero_mmx ;*********************************************************************** WELS_EXTERN WelsSetMemZeroAligned64_sse2 - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - neg r1 + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + neg r1 - pxor xmm0, xmm0 + pxor xmm0, xmm0 .memzeroa64_sse2_loops: - movdqa [r0], xmm0 - movdqa [r0+16], xmm0 - movdqa [r0+32], xmm0 - movdqa [r0+48], xmm0 - add r0, 0x40 + movdqa [r0], xmm0 + movdqa [r0+16], xmm0 + movdqa [r0+32], xmm0 + movdqa [r0+48], xmm0 + add r0, 0x40 - add r1, 0x40 - jnz near .memzeroa64_sse2_loops + add r1, 0x40 + jnz near 
.memzeroa64_sse2_loops - ret + ret ;*********************************************************************** ; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size) ;*********************************************************************** WELS_EXTERN WelsSetMemZeroSize64_mmx - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - neg r1 + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + neg r1 - pxor mm0, mm0 + pxor mm0, mm0 .memzero64_mmx_loops: - movq [r0], mm0 - movq [r0+8], mm0 - movq [r0+16], mm0 - movq [r0+24], mm0 - movq [r0+32], mm0 - movq [r0+40], mm0 - movq [r0+48], mm0 - movq [r0+56], mm0 - add r0, 0x40 + movq [r0], mm0 + movq [r0+8], mm0 + movq [r0+16], mm0 + movq [r0+24], mm0 + movq [r0+32], mm0 + movq [r0+40], mm0 + movq [r0+48], mm0 + movq [r0+56], mm0 + add r0, 0x40 - add r1, 0x40 - jnz near .memzero64_mmx_loops + add r1, 0x40 + jnz near .memzero64_mmx_loops - WELSEMMS - ret + WELSEMMS + ret ;*********************************************************************** ; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size) ;*********************************************************************** WELS_EXTERN WelsSetMemZeroSize8_mmx - %assign push_num 0 - LOAD_2_PARA - SIGN_EXTENSION r1, r1d - neg r1 - pxor mm0, mm0 + %assign push_num 0 + LOAD_2_PARA + SIGN_EXTENSION r1, r1d + neg r1 + pxor mm0, mm0 .memzero8_mmx_loops: - movq [r0], mm0 - add r0, 0x08 + movq [r0], mm0 + add r0, 0x08 - add r1, 0x08 - jnz near .memzero8_mmx_loops + add r1, 0x08 + jnz near .memzero8_mmx_loops - WELSEMMS - ret + WELSEMMS + ret diff --git a/codec/encoder/core/x86/quant.asm b/codec/encoder/core/x86/quant.asm index a9145265..e9f3167a 100644 --- a/codec/encoder/core/x86/quant.asm +++ b/codec/encoder/core/x86/quant.asm @@ -49,241 +49,241 @@ SECTION .text ;************************************************ %macro SSE2_Quant8 5 - MOVDQ %1, %5 - pxor %2, %2 - pcmpgtw %2, %1 - pxor %1, %2 - psubw %1, %2 - paddusw %1, %3 - pmulhuw %1, %4 - pxor %1, %2 - psubw %1, %2 - MOVDQ %5, %1 + MOVDQ %1, %5 + pxor %2, %2 + pcmpgtw %2, %1 + pxor %1, %2 + psubw %1, %2 + paddusw %1, %3 + pmulhuw %1, %4 + pxor %1, %2 + psubw %1, %2 + MOVDQ %5, %1 %endmacro %macro SSE2_QuantMax8 6 - MOVDQ %1, %5 - pxor %2, %2 - pcmpgtw %2, %1 - pxor %1, %2 - psubw %1, %2 - paddusw %1, %3 - pmulhuw %1, %4 - pmaxsw %6, %1 - pxor %1, %2 - psubw %1, %2 - MOVDQ %5, %1 + MOVDQ %1, %5 + pxor %2, %2 + pcmpgtw %2, %1 + pxor %1, %2 + psubw %1, %2 + paddusw %1, %3 + pmulhuw %1, %4 + pmaxsw %6, %1 + pxor %1, %2 + psubw %1, %2 + MOVDQ %5, %1 %endmacro -%define pDct esp + 4 -%define ff esp + 8 -%define mf esp + 12 -%define max esp + 16 +%define pDct esp + 4 +%define ff esp + 8 +%define mf esp + 12 +%define max esp + 16 ;*********************************************************************** -; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf); +; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf); ;*********************************************************************** WELS_EXTERN WelsQuant4x4_sse2 - %assign push_num 0 - LOAD_3_PARA - movdqa xmm2, [r1] - movdqa xmm3, [r2] + %assign push_num 0 + LOAD_3_PARA + movdqa xmm2, [r1] + movdqa xmm3, [r2] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] - ret + ret ;*********************************************************************** ;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf); 
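
For readers following the quantization code above, the SSE2_Quant8 macro reduces, per 16-bit coefficient, to a sign-magnitude multiply-and-shift. The scalar model below is a sketch in plain C (the function name quant_coeff is made up for illustration), not code from the repository.

    #include <stdint.h>

    /* Scalar model of one coefficient as processed by the SSE2_Quant8 macro:
     * take |x|, add the rounding offset ff with unsigned saturation (paddusw),
     * multiply by mf keeping only the high 16 bits (pmulhuw), then restore
     * the original sign. */
    static int16_t quant_coeff(int16_t x, uint16_t ff, uint16_t mf) {
        uint32_t a = (uint32_t)(x < 0 ? -(int32_t)x : x) + ff;
        if (a > 0xFFFF) a = 0xFFFF;          /* paddusw saturates at 65535 */
        uint32_t level = (a * mf) >> 16;     /* pmulhuw: high half of u16*u16 */
        return (int16_t)(x < 0 ? -(int32_t)level : (int32_t)level);
    }

WelsQuant4x4Dc_sse2 below performs the same operation, with ff and mf broadcast from scalar arguments instead of loaded from tables.
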
;*********************************************************************** WELS_EXTERN WelsQuant4x4Dc_sse2 - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSIONW r1, r1w - SIGN_EXTENSIONW r2, r2w - SSE2_Copy8Times xmm3, r2d + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSIONW r1, r1w + SIGN_EXTENSIONW r2, r2w + SSE2_Copy8Times xmm3, r2d - SSE2_Copy8Times xmm2, r1d + SSE2_Copy8Times xmm2, r1d - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] - ret + ret ;*********************************************************************** -; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf); +; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf); ;*********************************************************************** WELS_EXTERN WelsQuantFour4x4_sse2 - %assign push_num 0 - LOAD_3_PARA - MOVDQ xmm2, [r1] - MOVDQ xmm3, [r2] + %assign push_num 0 + LOAD_3_PARA + MOVDQ xmm2, [r1] + MOVDQ xmm3, [r2] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60] - SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60] + SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70] - ret + ret ;*********************************************************************** -; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max); +; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max); ;*********************************************************************** WELS_EXTERN WelsQuantFour4x4Max_sse2 - %assign push_num 0 - LOAD_4_PARA - PUSH_XMM 8 - MOVDQ xmm2, [r1] - MOVDQ xmm3, [r2] + %assign push_num 0 + LOAD_4_PARA + PUSH_XMM 8 + MOVDQ xmm2, [r1] + MOVDQ xmm3, [r2] - pxor xmm4, xmm4 - pxor xmm5, xmm5 - pxor xmm6, xmm6 - pxor xmm7, xmm7 - SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4 - SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4 - SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5 - SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5 - SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6 - SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6 - SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7 - SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4 + SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4 + SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5 + SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5 + SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6 + SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6 + SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7 + SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7 - SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0 - pmaxsw xmm0, xmm4 - pmaxsw xmm0, xmm5 - 
pmaxsw xmm0, xmm7 - movdqa xmm1, xmm0 - punpckhqdq xmm0, xmm1 - pmaxsw xmm0, xmm1 + SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0 + pmaxsw xmm0, xmm4 + pmaxsw xmm0, xmm5 + pmaxsw xmm0, xmm7 + movdqa xmm1, xmm0 + punpckhqdq xmm0, xmm1 + pmaxsw xmm0, xmm1 - movq [r3], xmm0 - POP_XMM - LOAD_4_PARA_POP - ret + movq [r3], xmm0 + POP_XMM + LOAD_4_PARA_POP + ret %macro MMX_Copy4Times 2 - movd %1, %2 - punpcklwd %1, %1 - punpckldq %1, %1 + movd %1, %2 + punpcklwd %1, %1 + punpckldq %1, %1 %endmacro SECTION .text %macro MMX_Quant4 4 - pxor %2, %2 - pcmpgtw %2, %1 - pxor %1, %2 - psubw %1, %2 - paddusw %1, %3 - pmulhuw %1, %4 - pxor %1, %2 - psubw %1, %2 + pxor %2, %2 + pcmpgtw %2, %1 + pxor %1, %2 + psubw %1, %2 + paddusw %1, %3 + pmulhuw %1, %4 + pxor %1, %2 + psubw %1, %2 %endmacro ;*********************************************************************** ;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block); ;*********************************************************************** WELS_EXTERN WelsHadamardQuant2x2_mmx - %assign push_num 0 - LOAD_5_PARA - SIGN_EXTENSIONW r1, r1w - SIGN_EXTENSIONW r2, r2w - movd mm0, [r0] - movd mm1, [r0 + 0x20] - punpcklwd mm0, mm1 - movd mm3, [r0 + 0x40] - movd mm1, [r0 + 0x60] - punpcklwd mm3, mm1 + %assign push_num 0 + LOAD_5_PARA + SIGN_EXTENSIONW r1, r1w + SIGN_EXTENSIONW r2, r2w + movd mm0, [r0] + movd mm1, [r0 + 0x20] + punpcklwd mm0, mm1 + movd mm3, [r0 + 0x40] + movd mm1, [r0 + 0x60] + punpcklwd mm3, mm1 - ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3 - movq mm5, mm3 - paddw mm3, mm0 - psubw mm0, mm5 - punpcklwd mm3, mm0 - movq mm1, mm3 - psrlq mm1, 32 - movq mm5, mm1 - paddw mm1, mm3 - psubw mm3, mm5 - punpcklwd mm1, mm3 + ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3 + movq mm5, mm3 + paddw mm3, mm0 + psubw mm0, mm5 + punpcklwd mm3, mm0 + movq mm1, mm3 + psrlq mm1, 32 + movq mm5, mm1 + paddw mm1, mm3 + psubw mm3, mm5 + punpcklwd mm1, mm3 - ;quant_2x2_dc - MMX_Copy4Times mm3, r2d - MMX_Copy4Times mm2, r1d - MMX_Quant4 mm1, mm0, mm2, mm3 + ;quant_2x2_dc + MMX_Copy4Times mm3, r2d + MMX_Copy4Times mm2, r1d + MMX_Quant4 mm1, mm0, mm2, mm3 - ; store dct_2x2 - movq [r3], mm1 - movq [r4], mm1 + ; store dct_2x2 + movq [r3], mm1 + movq [r4], mm1 - ; pNonZeroCount of dct_2x2 - pcmpeqb mm2, mm2 ; mm2 = FF - pxor mm3, mm3 - packsswb mm1, mm3 - pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal - psubsb mm1, mm2 ; set 0 if equal, 1 if not equal - psadbw mm1, mm3 ; - mov r1w, 0 - mov [r0], r1w - mov [r0 + 0x20], r1w - mov [r0 + 0x40], r1w - mov [r0 + 0x60], r1w + ; pNonZeroCount of dct_2x2 + pcmpeqb mm2, mm2 ; mm2 = FF + pxor mm3, mm3 + packsswb mm1, mm3 + pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal + psubsb mm1, mm2 ; set 0 if equal, 1 if not equal + psadbw mm1, mm3 ; + mov r1w, 0 + mov [r0], r1w + mov [r0 + 0x20], r1w + mov [r0 + 0x40], r1w + mov [r0 + 0x60], r1w - movd retrd, mm1 + movd retrd, mm1 - WELSEMMS - LOAD_5_PARA_POP - ret + WELSEMMS + LOAD_5_PARA_POP + ret ;*********************************************************************** ;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf); ;*********************************************************************** WELS_EXTERN WelsHadamardQuant2x2Skip_mmx - %assign push_num 0 - LOAD_3_PARA - SIGN_EXTENSIONW r1, r1w - SIGN_EXTENSIONW r2, r2w - movd mm0, [r0] - movd mm1, [r0 + 0x20] - punpcklwd mm0, mm1 - movd mm3, [r0 + 0x40] - movd mm1, [r0 + 0x60] - punpcklwd mm3, mm1 + %assign push_num 0 + LOAD_3_PARA + SIGN_EXTENSIONW r1, r1w + 
SIGN_EXTENSIONW r2, r2w + movd mm0, [r0] + movd mm1, [r0 + 0x20] + punpcklwd mm0, mm1 + movd mm3, [r0 + 0x40] + movd mm1, [r0 + 0x60] + punpcklwd mm3, mm1 - ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3 - movq mm5, mm3 - paddw mm3, mm0 - psubw mm0, mm5 - punpcklwd mm3, mm0 - movq mm1, mm3 - psrlq mm1, 32 - movq mm5, mm1 - paddw mm1, mm3 - psubw mm3, mm5 - punpcklwd mm1, mm3 + ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3 + movq mm5, mm3 + paddw mm3, mm0 + psubw mm0, mm5 + punpcklwd mm3, mm0 + movq mm1, mm3 + psrlq mm1, 32 + movq mm5, mm1 + paddw mm1, mm3 + psubw mm3, mm5 + punpcklwd mm1, mm3 - ;quant_2x2_dc - MMX_Copy4Times mm3, r2d - MMX_Copy4Times mm2, r1d - MMX_Quant4 mm1, mm0, mm2, mm3 + ;quant_2x2_dc + MMX_Copy4Times mm3, r2d + MMX_Copy4Times mm2, r1d + MMX_Quant4 mm1, mm0, mm2, mm3 - ; pNonZeroCount of dct_2x2 - pcmpeqb mm2, mm2 ; mm2 = FF - pxor mm3, mm3 - packsswb mm1, mm3 - pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal - psubsb mm1, mm2 ; set 0 if equal, 1 if not equal - psadbw mm1, mm3 ; - movd retrd, mm1 + ; pNonZeroCount of dct_2x2 + pcmpeqb mm2, mm2 ; mm2 = FF + pxor mm3, mm3 + packsswb mm1, mm3 + pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal + psubsb mm1, mm2 ; set 0 if equal, 1 if not equal + psadbw mm1, mm3 ; + movd retrd, mm1 - WELSEMMS - ret + WELSEMMS + ret %macro SSE2_DeQuant8 3 @@ -297,12 +297,12 @@ WELS_EXTERN WelsHadamardQuant2x2Skip_mmx ; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf); ;*********************************************************************** WELS_EXTERN WelsDequant4x4_sse2 - %assign push_num 0 - LOAD_2_PARA + %assign push_num 0 + LOAD_2_PARA - movdqa xmm1, [r1] - SSE2_DeQuant8 [r0 ], xmm0, xmm1 - SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1 + movdqa xmm1, [r1] + SSE2_DeQuant8 [r0 ], xmm0, xmm1 + SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1 ret @@ -311,18 +311,18 @@ WELS_EXTERN WelsDequant4x4_sse2 ;***********************************************************************==== WELS_EXTERN WelsDequantFour4x4_sse2 - %assign push_num 0 - LOAD_2_PARA + %assign push_num 0 + LOAD_2_PARA - movdqa xmm1, [r1] - SSE2_DeQuant8 [r0 ], xmm0, xmm1 - SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1 - SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1 - SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1 - SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1 - SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1 - SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1 - SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1 + movdqa xmm1, [r1] + SSE2_DeQuant8 [r0 ], xmm0, xmm1 + SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1 + SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1 + SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1 + SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1 + SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1 + SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1 + SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1 ret @@ -330,41 +330,41 @@ WELS_EXTERN WelsDequantFour4x4_sse2 ;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf); ;*********************************************************************** WELS_EXTERN WelsDequantIHadamard4x4_sse2 - %assign push_num 0 - LOAD_2_PARA - %ifndef X86_32 - movzx r1, r1w - %endif + %assign push_num 0 + LOAD_2_PARA + %ifndef X86_32 + movzx r1, r1w + %endif - ; WelsDequantLumaDc4x4 - SSE2_Copy8Times xmm1, r1d - ;psrlw xmm1, 2 ; for the (>>2) in ihdm - MOVDQ xmm0, [r0] - MOVDQ xmm2, [r0+0x10] - pmullw xmm0, xmm1 - pmullw xmm2, xmm1 + ; WelsDequantLumaDc4x4 + SSE2_Copy8Times xmm1, r1d + ;psrlw xmm1, 2 ; for the (>>2) in ihdm + MOVDQ xmm0, [r0] + MOVDQ xmm2, [r0+0x10] + pmullw xmm0, xmm1 + pmullw xmm2, xmm1 - ; ihdm_4x4 - movdqa xmm1, xmm0 - psrldq xmm1, 8 - movdqa xmm3, xmm2 - psrldq xmm3, 8 + ; 
ihdm_4x4 + movdqa xmm1, xmm0 + psrldq xmm1, 8 + movdqa xmm3, xmm2 + psrldq xmm3, 8 - SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3 - SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2 - SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2 - SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1 + SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3 + SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2 + SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2 + SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1 - SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4 - SSE2_SumSub xmm2, xmm4, xmm5 - SSE2_SumSub xmm1, xmm0, xmm5 - SSE2_SumSub xmm4, xmm0, xmm5 - SSE2_SumSub xmm2, xmm1, xmm5 - SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3 + SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4 + SSE2_SumSub xmm2, xmm4, xmm5 + SSE2_SumSub xmm1, xmm0, xmm5 + SSE2_SumSub xmm4, xmm0, xmm5 + SSE2_SumSub xmm2, xmm1, xmm5 + SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3 - punpcklqdq xmm0, xmm1 - MOVDQ [r0], xmm0 + punpcklqdq xmm0, xmm1 + MOVDQ [r0], xmm0 - punpcklqdq xmm2, xmm3 - MOVDQ [r0+16], xmm2 - ret + punpcklqdq xmm2, xmm3 + MOVDQ [r0+16], xmm2 + ret diff --git a/codec/encoder/core/x86/sample_sc.asm b/codec/encoder/core/x86/sample_sc.asm index 0d93aa01..2add1655 100644 --- a/codec/encoder/core/x86/sample_sc.asm +++ b/codec/encoder/core/x86/sample_sc.asm @@ -35,189 +35,189 @@ SECTION .text ;********************************************************************************************************************************** ; -; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost ) +; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost ) ; -; \note: -; src need align with 16 bytes, ref is optional -; \return value: -; return minimal SAD cost, according index carried by index_min_cost +; \note: +; src need align with 16 bytes, ref is optional +; \return value: +; return minimal SAD cost, according index carried by index_min_cost ;********************************************************************************************************************************** ; try 8 mv via offset ; xmm7 store sad costs -%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref - movdqa xmm0, [%1] - movdqu xmm1, [%2] - movdqu xmm2, [%2+8h] - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 +%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref + movdqa xmm0, [%1] + movdqu xmm1, [%2] + movdqu xmm2, [%2+8h] + movdqa xmm3, xmm1 + movdqa xmm4, xmm2 - mpsadbw xmm1, xmm0, 0 ; 000 B - paddw xmm7, xmm1 ; accumulate cost + mpsadbw xmm1, xmm0, 0 ; 000 B + paddw xmm7, xmm1 ; accumulate cost - mpsadbw xmm3, xmm0, 5 ; 101 B - paddw xmm7, xmm3 ; accumulate cost + mpsadbw xmm3, xmm0, 5 ; 101 B + paddw xmm7, xmm3 ; accumulate cost - mpsadbw xmm2, xmm0, 2 ; 010 B - paddw xmm7, xmm2 ; accumulate cost + mpsadbw xmm2, xmm0, 2 ; 010 B + paddw xmm7, xmm2 ; accumulate cost - mpsadbw xmm4, xmm0, 7 ; 111 B - paddw xmm7, xmm4 ; accumulate cost + mpsadbw xmm4, xmm0, 7 ; 111 B + paddw xmm7, xmm4 ; accumulate cost - add %1, %3 - add %2, %4 -%endmacro ; end of SAD_16x16_LINE_SSE41 -%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref - movdqa xmm0, [%1] - movdqu xmm1, [%2] - movdqu xmm2, [%2+8h] - movdqa xmm3, 
xmm1 - movdqa xmm4, xmm2 + add %1, %3 + add %2, %4 +%endmacro ; end of SAD_16x16_LINE_SSE41 +%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref + movdqa xmm0, [%1] + movdqu xmm1, [%2] + movdqu xmm2, [%2+8h] + movdqa xmm3, xmm1 + movdqa xmm4, xmm2 - mpsadbw xmm1, xmm0, 0 ; 000 B - paddw xmm7, xmm1 ; accumulate cost + mpsadbw xmm1, xmm0, 0 ; 000 B + paddw xmm7, xmm1 ; accumulate cost - mpsadbw xmm3, xmm0, 5 ; 101 B - paddw xmm7, xmm3 ; accumulate cost + mpsadbw xmm3, xmm0, 5 ; 101 B + paddw xmm7, xmm3 ; accumulate cost - mpsadbw xmm2, xmm0, 2 ; 010 B - paddw xmm7, xmm2 ; accumulate cost + mpsadbw xmm2, xmm0, 2 ; 010 B + paddw xmm7, xmm2 ; accumulate cost - mpsadbw xmm4, xmm0, 7 ; 111 B - paddw xmm7, xmm4 ; accumulate cost -%endmacro ; end of SAD_16x16_LINE_SSE41E + mpsadbw xmm4, xmm0, 7 ; 111 B + paddw xmm7, xmm4 ; accumulate cost +%endmacro ; end of SAD_16x16_LINE_SSE41E WELS_EXTERN SampleSad16x16Hor8_sse41 ;push ebx ;push esi - ;mov eax, [esp+12] ; src - ;mov ecx, [esp+16] ; stride_src - ;mov ebx, [esp+20] ; ref - ;mov edx, [esp+24] ; stride_ref - ;mov esi, [esp+28] ; base_cost + ;mov eax, [esp+12] ; src + ;mov ecx, [esp+16] ; stride_src + ;mov ebx, [esp+20] ; ref + ;mov edx, [esp+24] ; stride_ref + ;mov esi, [esp+28] ; base_cost %assign push_num 0 LOAD_6_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - pxor xmm7, xmm7 + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + pxor xmm7, xmm7 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41 r0, r2, r1, r3 - SAD_16x16_LINE_SSE41E r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41 r0, r2, r1, r3 + SAD_16x16_LINE_SSE41E r0, r2, r1, r3 - pxor xmm0, xmm0 - movdqa xmm6, xmm7 - punpcklwd xmm6, xmm0 - punpckhwd xmm7, xmm0 + pxor xmm0, xmm0 + movdqa xmm6, xmm7 + punpcklwd xmm6, xmm0 + punpckhwd xmm7, xmm0 - movdqa xmm5, [r4] - movdqa xmm4, xmm5 - punpcklwd xmm4, xmm0 - punpckhwd xmm5, xmm0 + movdqa xmm5, [r4] + movdqa xmm4, xmm5 + punpcklwd xmm4, xmm0 + punpckhwd xmm5, xmm0 - paddd xmm4, xmm6 - paddd xmm5, xmm7 - movdqa xmm3, xmm4 - pminud xmm3, xmm5 - pshufd xmm2, xmm3, 01001110B - pminud xmm2, xmm3 - pshufd xmm3, xmm2, 10110001B - pminud xmm2, xmm3 - movd retrd, xmm2 - pcmpeqd xmm4, xmm2 - movmskps r2d, xmm4 - bsf r1d, r2d - jnz near WRITE_INDEX + paddd xmm4, xmm6 + paddd xmm5, xmm7 + movdqa xmm3, xmm4 + pminud xmm3, xmm5 + pshufd xmm2, xmm3, 01001110B + pminud xmm2, xmm3 + pshufd xmm3, xmm2, 10110001B + pminud xmm2, xmm3 + movd retrd, xmm2 + pcmpeqd xmm4, xmm2 + movmskps r2d, xmm4 + bsf r1d, r2d + jnz near 
WRITE_INDEX - pcmpeqd xmm5, xmm2 - movmskps r2d, xmm5 - bsf r1d, r2d - add r1d, 4 + pcmpeqd xmm5, xmm2 + movmskps r2d, xmm5 + bsf r1d, r2d + add r1d, 4 WRITE_INDEX: - mov [r5], r1d + mov [r5], r1d POP_XMM LOAD_6_PARA_POP ret ;********************************************************************************************************************************** ; -; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost ) +; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost ) ; -; \note: -; src and ref is optional to align with 16 due inter 8x8 -; \return value: -; return minimal SAD cost, according index carried by index_min_cost +; \note: +; src and ref is optional to align with 16 due inter 8x8 +; \return value: +; return minimal SAD cost, according index carried by index_min_cost ; ;********************************************************************************************************************************** ; try 8 mv via offset ; xmm7 store sad costs -%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref - movdqu xmm0, [%1] - movdqu xmm1, [%2] - movdqa xmm2, xmm1 +%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref + movdqu xmm0, [%1] + movdqu xmm1, [%2] + movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0 ; 000 B - paddw xmm7, xmm1 ; accumulate cost + mpsadbw xmm1, xmm0, 0 ; 000 B + paddw xmm7, xmm1 ; accumulate cost - mpsadbw xmm2, xmm0, 5 ; 101 B - paddw xmm7, xmm2 ; accumulate cost + mpsadbw xmm2, xmm0, 5 ; 101 B + paddw xmm7, xmm2 ; accumulate cost - add %1, %3 - add %2, %4 -%endmacro ; end of SAD_8x8_LINE_SSE41 -%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref - movdqu xmm0, [%1] - movdqu xmm1, [%2] - movdqa xmm2, xmm1 + add %1, %3 + add %2, %4 +%endmacro ; end of SAD_8x8_LINE_SSE41 +%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref + movdqu xmm0, [%1] + movdqu xmm1, [%2] + movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0 ; 000 B - paddw xmm7, xmm1 ; accumulate cost + mpsadbw xmm1, xmm0, 0 ; 000 B + paddw xmm7, xmm1 ; accumulate cost - mpsadbw xmm2, xmm0, 5 ; 101 B - paddw xmm7, xmm2 ; accumulate cost -%endmacro ; end of SAD_8x8_LINE_SSE41E + mpsadbw xmm2, xmm0, 5 ; 101 B + paddw xmm7, xmm2 ; accumulate cost +%endmacro ; end of SAD_8x8_LINE_SSE41E WELS_EXTERN SampleSad8x8Hor8_sse41 %assign push_num 0 LOAD_6_PARA PUSH_XMM 8 - SIGN_EXTENSION r1, r1d - SIGN_EXTENSION r3, r3d - movdqa xmm7, [r4] ; load base cost list + SIGN_EXTENSION r1, r1d + SIGN_EXTENSION r3, r3d + movdqa xmm7, [r4] ; load base cost list - SAD_8x8_LINE_SSE41 r0, r2, r1, r3 - SAD_8x8_LINE_SSE41 r0, r2, r1, r3 - SAD_8x8_LINE_SSE41 r0, r2, r1, r3 - SAD_8x8_LINE_SSE41 r0, r2, r1, r3 + SAD_8x8_LINE_SSE41 r0, r2, r1, r3 + SAD_8x8_LINE_SSE41 r0, r2, r1, r3 + SAD_8x8_LINE_SSE41 r0, r2, r1, r3 + SAD_8x8_LINE_SSE41 r0, r2, r1, r3 - SAD_8x8_LINE_SSE41 r0, r2, r1, r3 - SAD_8x8_LINE_SSE41 r0, r2, r1, r3 - SAD_8x8_LINE_SSE41 r0, r2, r1, r3 - SAD_8x8_LINE_SSE41E r0, r2, r1, r3 + SAD_8x8_LINE_SSE41 r0, r2, r1, r3 + SAD_8x8_LINE_SSE41 r0, r2, r1, r3 + SAD_8x8_LINE_SSE41 r0, r2, r1, r3 + SAD_8x8_LINE_SSE41E r0, r2, r1, r3 - phminposuw xmm0, xmm7 ; horizon search the minimal sad cost and its index - movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX - mov r1d, retrd - and retrd, 0xFFFF - sar r1d, 16 - mov [r5], r1d + phminposuw xmm0, xmm7 ; horizon search the minimal sad cost and its index + 
movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX + mov r1d, retrd + and retrd, 0xFFFF + sar r1d, 16 + mov [r5], r1d POP_XMM LOAD_6_PARA_POP diff --git a/codec/encoder/core/x86/score.asm b/codec/encoder/core/x86/score.asm index 98a7a497..fa9651c9 100644 --- a/codec/encoder/core/x86/score.asm +++ b/codec/encoder/core/x86/score.asm @@ -104,32 +104,32 @@ db 6,7,6,7,7,8 align 16 high_mask_table: - db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2 - db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5 - db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8 - db 9,12, 0, 1, 1, 4, 2, 4, 5, 8 - db 2, 4, 4, 7, 5, 7, 8,11, 3, 4 - db 5, 8, 5, 7, 8,11, 6, 8, 8,11 - db 9,11,12,15, 0, 1, 1, 4, 1, 3 - db 4, 7, 2, 4, 4, 7, 5, 7, 8,11 - db 2, 3, 4, 7, 4, 6, 7,10, 5, 7 - db 7,10, 8,10,11,14, 3, 4, 4, 7 - db 5, 7, 8,11, 5, 7, 7,10, 8,10 - db 11,14, 6, 7, 8,11, 8,10,11,14 - db 9,11,11,14,12,14,15,18, 0, 0 - db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6 - db 4, 6, 7,10, 2, 3, 4, 7, 4, 6 - db 7,10, 5, 7, 7,10, 8,10,11,14 - db 2, 3, 3, 6, 4, 6, 7,10, 4, 6 - db 6, 9, 7, 9,10,13, 5, 6, 7,10 - db 7, 9,10,13, 8,10,10,13,11,13 - db 14,17, 3, 4, 4, 7, 4, 6, 7,10 - db 5, 7, 7,10, 8,10,11,14, 5, 6 - db 7,10, 7, 9,10,13, 8,10,10,13 - db 11,13,14,17, 6, 7, 7,10, 8,10 - db 11,14, 8,10,10,13,11,13,14,17 - db 9,10,11,14,11,13,14,17,12,14 - db 14,17,15,17,18,21 + db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2 + db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5 + db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8 + db 9,12, 0, 1, 1, 4, 2, 4, 5, 8 + db 2, 4, 4, 7, 5, 7, 8,11, 3, 4 + db 5, 8, 5, 7, 8,11, 6, 8, 8,11 + db 9,11,12,15, 0, 1, 1, 4, 1, 3 + db 4, 7, 2, 4, 4, 7, 5, 7, 8,11 + db 2, 3, 4, 7, 4, 6, 7,10, 5, 7 + db 7,10, 8,10,11,14, 3, 4, 4, 7 + db 5, 7, 8,11, 5, 7, 7,10, 8,10 + db 11,14, 6, 7, 8,11, 8,10,11,14 + db 9,11,11,14,12,14,15,18, 0, 0 + db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6 + db 4, 6, 7,10, 2, 3, 4, 7, 4, 6 + db 7,10, 5, 7, 7,10, 8,10,11,14 + db 2, 3, 3, 6, 4, 6, 7,10, 4, 6 + db 6, 9, 7, 9,10,13, 5, 6, 7,10 + db 7, 9,10,13, 8,10,10,13,11,13 + db 14,17, 3, 4, 4, 7, 4, 6, 7,10 + db 5, 7, 7,10, 8,10,11,14, 5, 6 + db 7,10, 7, 9,10,13, 8,10,10,13 + db 11,13,14,17, 6, 7, 7,10, 8,10 + db 11,14, 8,10,10,13,11,13,14,17 + db 9,10,11,14,11,13,14,17,12,14 + db 14,17,15,17,18,21 align 16 low_mask_table: @@ -167,173 +167,173 @@ SECTION .text ;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct ) ;*********************************************************************** WELS_EXTERN WelsScan4x4DcAc_sse2 - %ifdef X86_32 - push r3 - %assign push_num 1 - %else - %assign push_num 0 - %endif - LOAD_2_PARA - movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0 - movdqa xmm1, [r1+16] ; f e d c b a 9 8 - pextrw r2d, xmm0, 7 ; ecx = 7 - pextrw r3d, xmm1, 2 ; edx = a - pextrw r1d, xmm0, 5 ; eax = 5 - pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8 - pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0 - pextrw r2d, xmm1, 0 ; ecx = 8 - pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0 - pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a - pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0 - pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a - pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0 - pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9 - movdqa [r0],xmm0 - movdqa [r0+16], xmm1 - %ifdef X86_32 - pop r3 - %endif - ret + %ifdef X86_32 + push r3 + %assign push_num 1 + %else + %assign push_num 0 + %endif + LOAD_2_PARA + movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0 + movdqa xmm1, [r1+16] ; f e d c b a 9 8 + pextrw r2d, xmm0, 7 ; ecx = 7 + pextrw r3d, xmm1, 2 ; edx = a + pextrw r1d, xmm0, 5 ; eax = 5 + pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8 + pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0 + pextrw r2d, xmm1, 0 ; ecx = 8 + pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0 
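; Note: the pextrw/pinsrw/pshufd/pshufhw/pshuflw sequence here reorders the 16
; coefficients into the standard H.264 4x4 zigzag scan, i.e. source indices
; 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15; the hex digits in the per-line
; comments track those source positions, with the lowest word on the right.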
+ pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a + pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0 + pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a + pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0 + pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9 + movdqa [r0],xmm0 + movdqa [r0+16], xmm1 + %ifdef X86_32 + pop r3 + %endif + ret ;*********************************************************************** ;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct ) ;*********************************************************************** WELS_EXTERN WelsScan4x4DcAc_ssse3 - %assign push_num 0 - LOAD_2_PARA - movdqa xmm0, [r1] - movdqa xmm1, [r1+16] - pextrw r2d, xmm0, 7 ; ecx = [7] - pextrw r1d, xmm1, 0 ; eax = [8] - pinsrw xmm0, r1d, 7 ; xmm0[7] = [8] - pinsrw xmm1, r2d, 0 ; xmm1[0] = [7] - pshufb xmm1, [pb_scanacdc_maskb] - pshufb xmm0, [pb_scanacdc_maska] + %assign push_num 0 + LOAD_2_PARA + movdqa xmm0, [r1] + movdqa xmm1, [r1+16] + pextrw r2d, xmm0, 7 ; ecx = [7] + pextrw r1d, xmm1, 0 ; eax = [8] + pinsrw xmm0, r1d, 7 ; xmm0[7] = [8] + pinsrw xmm1, r2d, 0 ; xmm1[0] = [7] + pshufb xmm1, [pb_scanacdc_maskb] + pshufb xmm0, [pb_scanacdc_maska] - movdqa [r0],xmm0 - movdqa [r0+16], xmm1 - ret + movdqa [r0],xmm0 + movdqa [r0+16], xmm1 + ret ;*********************************************************************** ;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct ) ;*********************************************************************** WELS_EXTERN WelsScan4x4Ac_sse2 - %assign push_num 0 - LOAD_2_PARA - movdqa xmm0, [r1] - movdqa xmm1, [r1+16] - movdqa xmm2, xmm0 - punpcklqdq xmm0, xmm1 - punpckhqdq xmm2, xmm1 + %assign push_num 0 + LOAD_2_PARA + movdqa xmm0, [r1] + movdqa xmm1, [r1+16] + movdqa xmm2, xmm0 + punpcklqdq xmm0, xmm1 + punpckhqdq xmm2, xmm1 - movdqa xmm3, xmm0 - punpckldq xmm0, xmm2 - punpckhdq xmm3, xmm2 - pextrw r1d , xmm0, 3 - pextrw r2d , xmm0, 7 - pinsrw xmm0, r1d, 7 - pextrw r1d, xmm3, 4 - pinsrw xmm3, r2d, 4 - pextrw r2d, xmm3, 0 - pinsrw xmm3, r1d, 0 - pinsrw xmm0, r2d, 3 + movdqa xmm3, xmm0 + punpckldq xmm0, xmm2 + punpckhdq xmm3, xmm2 + pextrw r1d , xmm0, 3 + pextrw r2d , xmm0, 7 + pinsrw xmm0, r1d, 7 + pextrw r1d, xmm3, 4 + pinsrw xmm3, r2d, 4 + pextrw r2d, xmm3, 0 + pinsrw xmm3, r1d, 0 + pinsrw xmm0, r2d, 3 - pshufhw xmm1, xmm0, 0x93 - pshuflw xmm2, xmm3, 0x39 + pshufhw xmm1, xmm0, 0x93 + pshuflw xmm2, xmm3, 0x39 movdqa xmm3, xmm2 psrldq xmm1, 2 pslldq xmm3, 14 por xmm1, xmm3 psrldq xmm2, 2 - movdqa [r0],xmm1 - movdqa [r0+16], xmm2 - ret + movdqa [r0],xmm1 + movdqa [r0+16], xmm2 + ret ;*********************************************************************** ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct ); ;*********************************************************************** WELS_EXTERN WelsCalculateSingleCtr4x4_sse2 - %ifdef X86_32 - push r3 - %assign push_num 1 - %else - %assign push_num 0 - %endif - LOAD_1_PARA - movdqa xmm0, [r0] - movdqa xmm1, [r0+16] + %ifdef X86_32 + push r3 + %assign push_num 1 + %else + %assign push_num 0 + %endif + LOAD_1_PARA + movdqa xmm0, [r0] + movdqa xmm1, [r0+16] - packsswb xmm0, xmm1 - ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx - xor r3, r3 + packsswb xmm0, xmm1 + ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx + xor r3, r3 pxor xmm3, xmm3 pcmpeqb xmm0, xmm3 pmovmskb r3d, xmm0 xor r3, 0xffff - xor r0, r0 - mov r2, 7 - mov r1, 8 + xor r0, r0 + mov r2, 7 + mov r1, 8 .loop_low8_find1: - bt r3, r2 - jc .loop_high8_find1 - dec r2 - jnz .loop_low8_find1 + bt r3, r2 + jc .loop_high8_find1 + dec r2 + jnz 
.loop_low8_find1 .loop_high8_find1: - bt r3, r1 - jc .find1end - inc r1 - cmp r1,16 - jb .loop_high8_find1 + bt r3, r1 + jc .find1end + inc r1 + cmp r1,16 + jb .loop_high8_find1 .find1end: - sub r1, r2 - sub r1, 1 - lea r2, [i_ds_table] - add r0b, [r2+r1] - mov r1, r3 - and r3, 0xff - shr r1, 8 - and r1, 0xff - lea r2 , [low_mask_table] - add r0b, [r2 +r3] - lea r2, [high_mask_table] - add r0b, [r2+r1] - %ifdef X86_32 - pop r3 - %else - mov retrd, r0d - %endif - ret + sub r1, r2 + sub r1, 1 + lea r2, [i_ds_table] + add r0b, [r2+r1] + mov r1, r3 + and r3, 0xff + shr r1, 8 + and r1, 0xff + lea r2 , [low_mask_table] + add r0b, [r2 +r3] + lea r2, [high_mask_table] + add r0b, [r2+r1] + %ifdef X86_32 + pop r3 + %else + mov retrd, r0d + %endif + ret ;*********************************************************************** ; int32_t WelsGetNoneZeroCount_sse2(int16_t* level); ;*********************************************************************** WELS_EXTERN WelsGetNoneZeroCount_sse2 - %assign push_num 0 - LOAD_1_PARA - movdqa xmm0, [r0] - movdqa xmm1, [r0+16] - pxor xmm2, xmm2 - pcmpeqw xmm0, xmm2 - pcmpeqw xmm1, xmm2 - packsswb xmm1, xmm0 - xor r1, r1 - pmovmskb r1d, xmm1 - xor r1d, 0xffff - mov r2, r1 - and r1, 0xff - shr r2, 8 -; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet -; xor retr, retr - ;add al, [nozero_count_table+r2] - lea r0 , [nozero_count_table] - movzx r2, byte [r0+r2] - movzx r1, byte [r0+r1] - mov retrq, r2 - add retrq, r1 - ;add al, [nozero_count_table+r1] - ret + %assign push_num 0 + LOAD_1_PARA + movdqa xmm0, [r0] + movdqa xmm1, [r0+16] + pxor xmm2, xmm2 + pcmpeqw xmm0, xmm2 + pcmpeqw xmm1, xmm2 + packsswb xmm1, xmm0 + xor r1, r1 + pmovmskb r1d, xmm1 + xor r1d, 0xffff + mov r2, r1 + and r1, 0xff + shr r2, 8 +; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet +; xor retr, retr + ;add al, [nozero_count_table+r2] + lea r0 , [nozero_count_table] + movzx r2, byte [r0+r2] + movzx r1, byte [r0+r1] + mov retrq, r2 + add retrq, r1 + ;add al, [nozero_count_table+r1] + ret diff --git a/codec/processing/src/arm/adaptive_quantization.S b/codec/processing/src/arm/adaptive_quantization.S index ee1aca61..70091062 100644 --- a/codec/processing/src/arm/adaptive_quantization.S +++ b/codec/processing/src/arm/adaptive_quantization.S @@ -36,17 +36,17 @@ #ifdef __APPLE__ .macro SQR_ADD_16BYTES - vmull.u8 q3, $0, $0 - vmull.u8 q8, $1, $1 - vpadal.u16 $2, q3 - vpadal.u16 $2, q8 + vmull.u8 q3, $0, $0 + vmull.u8 q8, $1, $1 + vpadal.u16 $2, q3 + vpadal.u16 $2, q8 .endm #else .macro SQR_ADD_16BYTES arg0, arg1, arg2 - vmull.u8 q3, \arg0, \arg0 - vmull.u8 q8, \arg1, \arg1 - vpadal.u16 \arg2, q3 - vpadal.u16 \arg2, q8 + vmull.u8 q3, \arg0, \arg0 + vmull.u8 q8, \arg1, \arg1 + vpadal.u16 \arg2, q3 + vpadal.u16 \arg2, q8 .endm #endif @@ -54,66 +54,66 @@ WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon stmdb sp!, {r4} - vld1.8 {q15}, [r0], r1 //save the ref data (16bytes) - vld1.8 {q14}, [r2], r3 //save the src data (16bytes) + vld1.8 {q15}, [r0], r1 //save the ref data (16bytes) + vld1.8 {q14}, [r2], r3 //save the src data (16bytes) - vabd.u8 q13, q14, q15 - vmull.u8 q12, d27, d27 - vmull.u8 q11, d26, d26 - vaddl.u16 q12, d24, d25 - vpadal.u16 q12, q11 //sqr + vabd.u8 q13, q14, q15 + vmull.u8 q12, d27, d27 + vmull.u8 q11, d26, d26 + vaddl.u16 q12, d24, d25 + vpadal.u16 q12, q11 //sqr vaddl.u8 q13, d26, d27 //sum - vaddl.u8 q10, d28, d29 //sum_cur + vaddl.u8 q10, d28, d29 //sum_cur - vmull.u8 q9, d29, d29 - vmull.u8 q8, d28, d28 - vaddl.u16 q9, d18, d19 //sqr_cur - 
vpadal.u16 q9, q8 + vmull.u8 q9, d29, d29 + vmull.u8 q8, d28, d28 + vaddl.u16 q9, d18, d19 //sqr_cur + vpadal.u16 q9, q8 - mov r4, #15 + mov r4, #15 pixel_var_16x16_loop0: - vld1.8 {q0}, [r0], r1 //save the ref data (16bytes) - vld1.8 {q1}, [r2], r3 //save the src data (16bytes) + vld1.8 {q0}, [r0], r1 //save the ref data (16bytes) + vld1.8 {q1}, [r2], r3 //save the src data (16bytes) - vabd.u8 q2, q0, q1 + vabd.u8 q2, q0, q1 - //q10 save sum_cur - vpadal.u8 q10, q1 + //q10 save sum_cur + vpadal.u8 q10, q1 - //q12 save sqr - SQR_ADD_16BYTES d4, d5, q12 + //q12 save sqr + SQR_ADD_16BYTES d4, d5, q12 //q13 save sum - vpadal.u8 q13, q2 + vpadal.u8 q13, q2 - subs r4, #1 + subs r4, #1 - //q9 save sqr_cur - SQR_ADD_16BYTES d2, d3, q9 + //q9 save sqr_cur + SQR_ADD_16BYTES d2, d3, q9 - bne pixel_var_16x16_loop0 + bne pixel_var_16x16_loop0 - vadd.u16 d0, d26, d27 //sum - vadd.u16 d1, d20, d21 //sum_cur - vpaddl.u16 q0, q0 - vadd.u32 d2, d24, d25 //sqr - vadd.u32 d3, d18, d19 //sqr_cur - vpadd.u32 d0, d0, d1 - vpadd.u32 d1, d2, d3 + vadd.u16 d0, d26, d27 //sum + vadd.u16 d1, d20, d21 //sum_cur + vpaddl.u16 q0, q0 + vadd.u32 d2, d24, d25 //sqr + vadd.u32 d3, d18, d19 //sqr_cur + vpadd.u32 d0, d0, d1 + vpadd.u32 d1, d2, d3 - ldr r4, [sp, #4] + ldr r4, [sp, #4] - vshr.u32 q0, q0, #8 - vmul.u32 d0, d0 - vsub.u32 d0, d1, d0 + vshr.u32 q0, q0, #8 + vmul.u32 d0, d0 + vsub.u32 d0, d1, d0 vmovl.u32 q0, d0 - vst2.16 {d0[0], d1[0]}, [r4] + vst2.16 {d0[0], d1[0]}, [r4] - ldmia sp!, {r4} + ldmia sp!, {r4} WELS_ASM_FUNC_END diff --git a/codec/processing/src/arm/down_sample_neon.S b/codec/processing/src/arm/down_sample_neon.S index ad28e834..da7aca56 100644 --- a/codec/processing/src/arm/down_sample_neon.S +++ b/codec/processing/src/arm/down_sample_neon.S @@ -30,313 +30,313 @@ * */ -#ifdef HAVE_NEON +#ifdef HAVE_NEON .text #include "arm_arch_common_macro.S" -WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon - stmdb sp!, {r4-r8, lr} +WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon + stmdb sp!, {r4-r8, lr} - //Get the width and height - ldr r4, [sp, #24] //src_width - ldr r5, [sp, #28] //src_height + //Get the width and height + ldr r4, [sp, #24] //src_width + ldr r5, [sp, #28] //src_height - //Initialize the register - mov r6, r2 - mov r8, r0 - mov lr, #0 - lsr r5, #1 + //Initialize the register + mov r6, r2 + mov r8, r0 + mov lr, #0 + lsr r5, #1 - //Save the tailer for the unasigned size - mla r7, r1, r5, r0 - vld1.32 {q15}, [r7] + //Save the tailer for the unasigned size + mla r7, r1, r5, r0 + vld1.32 {q15}, [r7] - add r7, r2, r3 - //processing a colume data + add r7, r2, r3 + //processing a colume data comp_ds_bilinear_loop0: - vld1.8 {q0,q1}, [r2]! - vld1.8 {q2,q3}, [r7]! - vpaddl.u8 q0, q0 - vpaddl.u8 q1, q1 - vpaddl.u8 q2, q2 - vpaddl.u8 q3, q3 - vrshr.u16 q0, #1 - vrshr.u16 q1, #1 - vrshr.u16 q2, #1 - vrshr.u16 q3, #1 - vrhadd.u16 q0, q2 - vrhadd.u16 q1, q3 - vmovn.u16 d0, q0 - vmovn.u16 d1, q1 - vst1.32 {q0}, [r0]! - add lr, #32 + vld1.8 {q0,q1}, [r2]! + vld1.8 {q2,q3}, [r7]! + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vrshr.u16 q0, #1 + vrshr.u16 q1, #1 + vrshr.u16 q2, #1 + vrshr.u16 q3, #1 + vrhadd.u16 q0, q2 + vrhadd.u16 q1, q3 + vmovn.u16 d0, q0 + vmovn.u16 d1, q1 + vst1.32 {q0}, [r0]! 
+ add lr, #32 - cmp lr, r4 - movcs lr, #0 - addcs r6, r6, r3, lsl #1 - movcs r2, r6 - addcs r7, r2, r3 - addcs r8, r1 - movcs r0, r8 - subscs r5, #1 - bne comp_ds_bilinear_loop0 + cmp lr, r4 + movcs lr, #0 + addcs r6, r6, r3, lsl #1 + movcs r2, r6 + addcs r7, r2, r3 + addcs r8, r1 + movcs r0, r8 + subscs r5, #1 + bne comp_ds_bilinear_loop0 - //restore the tailer for the unasigned size - vst1.32 {q15}, [r0] + //restore the tailer for the unasigned size + vst1.32 {q15}, [r0] - ldmia sp!, {r4-r8,lr} + ldmia sp!, {r4-r8,lr} WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon - stmdb sp!, {r4-r7, lr} +WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon + stmdb sp!, {r4-r7, lr} - //Get the width and height - ldr r4, [sp, #20] //src_width - ldr r5, [sp, #24] //src_height + //Get the width and height + ldr r4, [sp, #20] //src_width + ldr r5, [sp, #24] //src_height - //Get the difference - sub lr, r3, r4 - sub r1, r1, r4, lsr #1 + //Get the difference + sub lr, r3, r4 + sub r1, r1, r4, lsr #1 - lsr r5, #1 + lsr r5, #1 - //processing a colume data + //processing a colume data comp_ds_bilinear_w_x8_loop0: - lsr r6, r4, #3 - add r7, r2, r3 - //processing a line data + lsr r6, r4, #3 + add r7, r2, r3 + //processing a line data comp_ds_bilinear_w_x8_loop1: - vld1.8 {d0}, [r2]! - vld1.8 {d1}, [r7]! - vpaddl.u8 q0, q0 - vrshr.u16 q0, #1 - vrhadd.u16 d0, d1 + vld1.8 {d0}, [r2]! + vld1.8 {d1}, [r7]! + vpaddl.u8 q0, q0 + vrshr.u16 q0, #1 + vrhadd.u16 d0, d1 - vmovn.u16 d0, q0 - vst1.32 {d0[0]}, [r0]! - subs r6, #1 - bne comp_ds_bilinear_w_x8_loop1 + vmovn.u16 d0, q0 + vst1.32 {d0[0]}, [r0]! + subs r6, #1 + bne comp_ds_bilinear_w_x8_loop1 - add r2, r7, lr - add r0, r1 - subs r5, #1 - bne comp_ds_bilinear_w_x8_loop0 + add r2, r7, lr + add r0, r1 + subs r5, #1 + bne comp_ds_bilinear_w_x8_loop0 - ldmia sp!, {r4-r7,lr} + ldmia sp!, {r4-r7,lr} WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon - stmdb sp!, {r4-r7, lr} +WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon + stmdb sp!, {r4-r7, lr} - //Get the width and height - ldr r4, [sp, #20] //src_width - ldr r5, [sp, #24] //src_height + //Get the width and height + ldr r4, [sp, #20] //src_width + ldr r5, [sp, #24] //src_height - //Get the difference - sub lr, r3, r4 - sub r1, r1, r4, lsr #1 + //Get the difference + sub lr, r3, r4 + sub r1, r1, r4, lsr #1 - lsr r5, #1 + lsr r5, #1 - //processing a colume data + //processing a colume data comp_ds_bilinear_w_x16_loop0: - lsr r6, r4, #4 - add r7, r2, r3 - //processing a line data + lsr r6, r4, #4 + add r7, r2, r3 + //processing a line data comp_ds_bilinear_w_x16_loop1: - vld1.8 {q0}, [r2]! - vld1.8 {q1}, [r7]! - vpaddl.u8 q0, q0 - vpaddl.u8 q1, q1 - vrshr.u16 q0, #1 - vrshr.u16 q1, #1 - vrhadd.u16 q0, q1 + vld1.8 {q0}, [r2]! + vld1.8 {q1}, [r7]! + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vrshr.u16 q0, #1 + vrshr.u16 q1, #1 + vrhadd.u16 q0, q1 - vmovn.u16 d0, q0 - vst1.32 {d0}, [r0]! - subs r6, #1 - bne comp_ds_bilinear_w_x16_loop1 + vmovn.u16 d0, q0 + vst1.32 {d0}, [r0]! 
+ subs r6, #1 + bne comp_ds_bilinear_w_x16_loop1 - add r2, r7, lr - add r0, r1 - subs r5, #1 - bne comp_ds_bilinear_w_x16_loop0 + add r2, r7, lr + add r0, r1 + subs r5, #1 + bne comp_ds_bilinear_w_x16_loop0 - ldmia sp!, {r4-r7,lr} + ldmia sp!, {r4-r7,lr} WELS_ASM_FUNC_END -WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon - stmdb sp!, {r4-r7, lr} +WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon + stmdb sp!, {r4-r7, lr} - //Get the width and height - ldr r4, [sp, #20] //src_width - ldr r5, [sp, #24] //src_height + //Get the width and height + ldr r4, [sp, #20] //src_width + ldr r5, [sp, #24] //src_height - //Get the difference - sub lr, r3, r4 - sub r1, r1, r4, lsr #1 + //Get the difference + sub lr, r3, r4 + sub r1, r1, r4, lsr #1 - lsr r5, #1 + lsr r5, #1 - //processing a colume data + //processing a colume data comp_ds_bilinear_w_x32_loop0: - lsr r6, r4, #5 - add r7, r2, r3 - //processing a line data + lsr r6, r4, #5 + add r7, r2, r3 + //processing a line data comp_ds_bilinear_w_x32_loop1: - vld1.8 {q0,q1}, [r2]! - vld1.8 {q2,q3}, [r7]! - vpaddl.u8 q0, q0 - vpaddl.u8 q1, q1 - vpaddl.u8 q2, q2 - vpaddl.u8 q3, q3 - vrshr.u16 q0, #1 - vrshr.u16 q1, #1 - vrshr.u16 q2, #1 - vrshr.u16 q3, #1 - vrhadd.u16 q0, q2 - vrhadd.u16 q1, q3 + vld1.8 {q0,q1}, [r2]! + vld1.8 {q2,q3}, [r7]! + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vrshr.u16 q0, #1 + vrshr.u16 q1, #1 + vrshr.u16 q2, #1 + vrshr.u16 q3, #1 + vrhadd.u16 q0, q2 + vrhadd.u16 q1, q3 - vmovn.u16 d0, q0 - vmovn.u16 d1, q1 - vst1.32 {q0}, [r0]! - subs r6, #1 - bne comp_ds_bilinear_w_x32_loop1 + vmovn.u16 d0, q0 + vmovn.u16 d1, q1 + vst1.32 {q0}, [r0]! + subs r6, #1 + bne comp_ds_bilinear_w_x32_loop1 - add r2, r7, lr - add r0, r1 - subs r5, #1 - bne comp_ds_bilinear_w_x32_loop0 + add r2, r7, lr + add r0, r1 + subs r5, #1 + bne comp_ds_bilinear_w_x32_loop0 - ldmia sp!, {r4-r7,lr} + ldmia sp!, {r4-r7,lr} WELS_ASM_FUNC_END WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon stmdb sp!, {r4-r12, lr} - //Get the data from stack - ldr r4, [sp, #40] //the addr of src - ldr r5, [sp, #44] //the value of src_stride + //Get the data from stack + ldr r4, [sp, #40] //the addr of src + ldr r5, [sp, #44] //the value of src_stride ldr r6, [sp, #48] //the value of scaleX ldr r7, [sp, #52] //the value of scaleY mov r10, #32768 sub r10, #1 - and r8, r6, r10 // r8 uinc(scaleX mod 32767) + and r8, r6, r10 // r8 uinc(scaleX mod 32767) mov r11, #-1 - mul r11, r8 // r11 -uinc + mul r11, r8 // r11 -uinc vdup.s16 d2, r8 vdup.s16 d0, r11 vzip.s16 d0, d2 // uinc -uinc uinc -uinc - and r9, r7, r10 // r9 vinc(scaleY mod 32767) + and r9, r7, r10 // r9 vinc(scaleY mod 32767) mov r11, #-1 - mul r11, r9 // r11 -vinc + mul r11, r9 // r11 -vinc - vdup.s16 d2, r9 - vdup.s16 d3, r11 - vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc + vdup.s16 d2, r9 + vdup.s16 d3, r11 + vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc - mov r11, #0x40000000 + mov r11, #0x40000000 mov r12, #0x4000 sub r12, #1 add r11, r12 - vdup.s32 d1, r11; //init u 16384 16383 16384 16383 + vdup.s32 d1, r11; //init u 16384 16383 16384 16383 - mov r11, #16384 + mov r11, #16384 vdup.s16 d16, r11 sub r11, #1 - vdup.s16 d17, r11 - vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383 + vdup.s16 d17, r11 + vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383 - veor q14, q14 - sub r1, r2 // stride - width - mov r8, #16384 // yInverse - sub r3, #1 + veor q14, q14 + sub r1, r2 // stride - width + mov r8, #16384 // yInverse + sub r3, #1 _HEIGHT: ldr r4, [sp, 
#40] //the addr of src - mov r11, r8 - lsr r11, #15 - mul r11, r5 - add r11, r4 // get current row address - mov r12, r11 - add r12, r5 + mov r11, r8 + lsr r11, #15 + mul r11, r5 + add r11, r4 // get current row address + mov r12, r11 + add r12, r5 - mov r9, #16384 // xInverse - sub r10, r2, #1 + mov r9, #16384 // xInverse + sub r10, r2, #1 vmov.s16 d6, d1 _WIDTH: - mov lr, r9 - lsr lr, #15 + mov lr, r9 + lsr lr, #15 add r4, r11,lr - vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a; + vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a; add r4, r12,lr - vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a; - vzip.32 d28, d29 //q14: 000d000c000b000a; + vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a; + vzip.32 d28, d29 //q14: 000d000c000b000a; - vmull.u16 q13, d6, d7 //q13: init u * init v - vmull.u32 q12, d26,d28 - vmlal.u32 q12, d27,d29 - vqadd.u64 d24, d24,d25 - vrshr.u64 d24, #30 + vmull.u16 q13, d6, d7 //q13: init u * init v + vmull.u32 q12, d26,d28 + vmlal.u32 q12, d27,d29 + vqadd.u64 d24, d24,d25 + vrshr.u64 d24, #30 - vst1.8 {d24[0]}, [r0]! - add r9, r6 - vadd.u16 d6, d0 // inc u - vshl.u16 d6, #1 - vshr.u16 d6, #1 - subs r10, #1 - bne _WIDTH + vst1.8 {d24[0]}, [r0]! + add r9, r6 + vadd.u16 d6, d0 // inc u + vshl.u16 d6, #1 + vshr.u16 d6, #1 + subs r10, #1 + bne _WIDTH WIDTH_END: - lsr r9, #15 + lsr r9, #15 add r4,r11,r9 - vld1.8 {d24[0]}, [r4] - vst1.8 {d24[0]}, [r0] - add r0, #1 - add r8, r7 - add r0, r1 - vadd.s16 d7, d5 // inc v - vshl.u16 d7, #1 - vshr.u16 d7, #1 - subs r3, #1 - bne _HEIGHT + vld1.8 {d24[0]}, [r4] + vst1.8 {d24[0]}, [r0] + add r0, #1 + add r8, r7 + add r0, r1 + vadd.s16 d7, d5 // inc v + vshl.u16 d7, #1 + vshr.u16 d7, #1 + subs r3, #1 + bne _HEIGHT LAST_ROW: ldr r4, [sp, #40] //the addr of src - lsr r8, #15 - mul r8, r5 - add r4, r8 // get current row address - mov r9, #16384 + lsr r8, #15 + mul r8, r5 + add r4, r8 // get current row address + mov r9, #16384 _LAST_ROW_WIDTH: - mov r11, r9 - lsr r11, #15 + mov r11, r9 + lsr r11, #15 - add r3, r4,r11 - vld1.8 {d0[0]}, [r3] - vst1.8 {d0[0]}, [r0] - add r0, #1 - add r9, r6 - subs r2, #1 - bne _LAST_ROW_WIDTH + add r3, r4,r11 + vld1.8 {d0[0]}, [r3] + vst1.8 {d0[0]}, [r0] + add r0, #1 + add r9, r6 + subs r2, #1 + bne _LAST_ROW_WIDTH - ldmia sp!, {r4-r12, lr} + ldmia sp!, {r4-r12, lr} WELS_ASM_FUNC_END #endif diff --git a/codec/processing/src/arm/pixel_sad_neon.S b/codec/processing/src/arm/pixel_sad_neon.S index 95c42294..da0190ed 100644 --- a/codec/processing/src/arm/pixel_sad_neon.S +++ b/codec/processing/src/arm/pixel_sad_neon.S @@ -37,32 +37,32 @@ WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon stmdb sp!, {lr} - //Loading a horizontal line data (8 bytes) - vld1.8 {d0}, [r0], r1 - vld1.8 {d1}, [r2], r3 + //Loading a horizontal line data (8 bytes) + vld1.8 {d0}, [r0], r1 + vld1.8 {d1}, [r2], r3 - //Do the SAD for 8 bytes - vabdl.u8 q1, d0, d1 + //Do the SAD for 8 bytes + vabdl.u8 q1, d0, d1 - mov lr, #7 + mov lr, #7 pixel_sad_8x8_loop0: //Loading a horizontal line data (8 bytes) - vld1.8 {d0}, [r0], r1 - vld1.8 {d1}, [r2], r3 + vld1.8 {d0}, [r0], r1 + vld1.8 {d1}, [r2], r3 - subs lr, #1 + subs lr, #1 - //Do the SAD for 8 bytes - vabal.u8 q1, d0, d1 - bne pixel_sad_8x8_loop0 + //Do the SAD for 8 bytes + vabal.u8 q1, d0, d1 + bne pixel_sad_8x8_loop0 - vadd.u16 d2, d3 - vpaddl.u16 d2, d2 - vpaddl.u32 d2, d2 - vmov.u32 r0, d2[0]//TBO... + vadd.u16 d2, d3 + vpaddl.u16 d2, d2 + vpaddl.u32 d2, d2 + vmov.u32 r0, d2[0]//TBO... 
- ldmia sp!, {lr} + ldmia sp!, {lr} WELS_ASM_FUNC_END #endif diff --git a/codec/processing/src/arm/vaa_calc_neon.S b/codec/processing/src/arm/vaa_calc_neon.S index 1c8dcff2..73d3708f 100644 --- a/codec/processing/src/arm/vaa_calc_neon.S +++ b/codec/processing/src/arm/vaa_calc_neon.S @@ -37,61 +37,61 @@ #ifdef __APPLE__ .macro ABS_SUB_SUM_16BYTES - vld1.32 {q15}, [$0], $2 - vld1.32 {q14}, [$1], $2 - vabal.u8 $3, d30, d28 - vabal.u8 $4, d31, d29 + vld1.32 {q15}, [$0], $2 + vld1.32 {q14}, [$1], $2 + vabal.u8 $3, d30, d28 + vabal.u8 $4, d31, d29 .endm .macro ABS_SUB_SUM_8x16BYTES - vld1.32 {q15}, [$0], $2 - vld1.32 {q14}, [$1], $2 - vabdl.u8 $3, d30, d28 - vabdl.u8 $4, d31, d29 + vld1.32 {q15}, [$0], $2 + vld1.32 {q14}, [$1], $2 + vabdl.u8 $3, d30, d28 + vabdl.u8 $4, d31, d29 - ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 - ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 - ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 - ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 - ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 - ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 - ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 + ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 + ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 + ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 + ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 + ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 + ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 + ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4 .endm .macro SAD_8X16BITS - vadd.u16 d31, $0, $1 - vpaddl.u16 d31, d31 - vpaddl.u32 $2, d31 + vadd.u16 d31, $0, $1 + vpaddl.u16 d31, d31 + vpaddl.u32 $2, d31 .endm #else .macro ABS_SUB_SUM_16BYTES arg0, arg1, arg2, arg3, arg4 - vld1.32 {q15}, [\arg0], \arg2 - vld1.32 {q14}, [\arg1], \arg2 - vabal.u8 \arg3, d30, d28 - vabal.u8 \arg4, d31, d29 + vld1.32 {q15}, [\arg0], \arg2 + vld1.32 {q14}, [\arg1], \arg2 + vabal.u8 \arg3, d30, d28 + vabal.u8 \arg4, d31, d29 .endm .macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4 - vld1.32 {q15}, [\arg0], \arg2 - vld1.32 {q14}, [\arg1], \arg2 - vabdl.u8 \arg3, d30, d28 - vabdl.u8 \arg4, d31, d29 + vld1.32 {q15}, [\arg0], \arg2 + vld1.32 {q14}, [\arg1], \arg2 + vabdl.u8 \arg3, d30, d28 + vabdl.u8 \arg4, d31, d29 - ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 - ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 - ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 - ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 - ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 - ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 - ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 + ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 + ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 + ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 + ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 + ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 + ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 + ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4 .endm .macro SAD_8X16BITS arg0, arg1, arg2 - vadd.u16 d31, \arg0, \arg1 - vpaddl.u16 d31, d31 - vpaddl.u32 \arg2, d31 + vadd.u16 d31, \arg0, \arg1 + vpaddl.u16 d31, d31 + vpaddl.u32 \arg2, d31 .endm #endif @@ -100,16 +100,16 @@ WELS_ASM_FUNC_BEGIN VAACalcSad_neon stmdb sp!, {r4-r8} - ldr r4, [sp, #20] //load pic_stride - ldr r5, [sp, #28] //load psad8x8 + ldr r4, [sp, #20] //load pic_stride + ldr r5, [sp, #28] //load psad8x8 - //Initial the Q8 register for save the "psadframe" - vmov.s64 q8, #0 + //Initial the Q8 register for save the "psadframe" + vmov.s64 q8, #0 - //Get the jump distance to use on loop codes - lsl r8, r4, #4 - sub r7, r8, 
#16 //R7 keep the 16*pic_stride-16 - sub r8, r2 //R8 keep the 16*pic_stride-pic_width + //Get the jump distance to use on loop codes + lsl r8, r4, #4 + sub r7, r8, #16 //R7 keep the 16*pic_stride-16 + sub r8, r2 //R8 keep the 16*pic_stride-pic_width vaa_calc_sad_loop0: @@ -118,70 +118,70 @@ vaa_calc_sad_loop0: vaa_calc_sad_loop1: - //Process the 16x16 bytes - ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1 - ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3 + //Process the 16x16 bytes + ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1 + ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3 - //Do the SAD - SAD_8X16BITS d0, d1, d0 - SAD_8X16BITS d2, d3, d1 - SAD_8X16BITS d4, d5, d2 - SAD_8X16BITS d6, d7, d3 + //Do the SAD + SAD_8X16BITS d0, d1, d0 + SAD_8X16BITS d2, d3, d1 + SAD_8X16BITS d4, d5, d2 + SAD_8X16BITS d6, d7, d3 - //Write to "psad8x8" buffer - vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]! + //Write to "psad8x8" buffer + vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]! - //Adjust the input address - sub r0, r7 - sub r1, r7 + //Adjust the input address + sub r0, r7 + sub r1, r7 - subs r6, #16 + subs r6, #16 - //Save to calculate "psadframe" - vadd.u32 q0, q1 - vadd.u32 q8, q0 + //Save to calculate "psadframe" + vadd.u32 q0, q1 + vadd.u32 q8, q0 - bne vaa_calc_sad_loop1 + bne vaa_calc_sad_loop1 - //Adjust the input address - add r0, r8 - add r1, r8 + //Adjust the input address + add r0, r8 + add r1, r8 subs r3, #16 - bne vaa_calc_sad_loop0 + bne vaa_calc_sad_loop0 - ldr r6, [sp, #24] //load psadframe - vadd.u32 d16, d17 - vst1.32 {d16[0]}, [r6] + ldr r6, [sp, #24] //load psadframe + vadd.u32 d16, d17 + vst1.32 {d16[0]}, [r6] - ldmia sp!, {r4-r8} + ldmia sp!, {r4-r8} WELS_ASM_FUNC_END #ifdef __APPLE__ .macro SAD_SD_MAD_16BYTES - vld1.32 {q0}, [$0], $2 - vld1.32 {q1}, [$1], $2 + vld1.32 {q0}, [$0], $2 + vld1.32 {q1}, [$1], $2 - vpadal.u8 $3, q0 - vpadal.u8 $4, q1 + vpadal.u8 $3, q0 + vpadal.u8 $4, q1 - vabd.u8 q0, q0, q1 - vmax.u8 $5, q0 - vpadal.u8 $6, q0 + vabd.u8 q0, q0, q1 + vmax.u8 $5, q0 + vpadal.u8 $6, q0 .endm .macro SAD_SD_MAD_8x16BYTES - vld1.32 {q0}, [$0], $2 - vld1.32 {q1}, [$1], $2 + vld1.32 {q0}, [$0], $2 + vld1.32 {q1}, [$1], $2 - vpaddl.u8 q2, q0 - vpaddl.u8 q3, q1 + vpaddl.u8 q2, q0 + vpaddl.u8 q3, q1 - vabd.u8 $3, q0, q1 - vpaddl.u8 $4, $3 //abs_diff + vabd.u8 $3, q0, q1 + vpaddl.u8 $4, $3 //abs_diff SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4 @@ -192,41 +192,41 @@ WELS_ASM_FUNC_END SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4 SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4 - vsub.u16 $5, q2, q3 + vsub.u16 $5, q2, q3 .endm .macro SAD_SD_MAD_CALC - vpmax.u8 d0, $0, $1 //8bytes - vpmax.u8 d0, d0, d0 //4bytes - vpmax.u8 $2, d0, d0 //2bytes + vpmax.u8 d0, $0, $1 //8bytes + vpmax.u8 d0, d0, d0 //4bytes + vpmax.u8 $2, d0, d0 //2bytes - vpaddl.u16 $3, $3 - vpaddl.u32 $3, $3 - vpaddl.s16 $4, $4 - vpaddl.s32 $4, $4 + vpaddl.u16 $3, $3 + vpaddl.u32 $3, $3 + vpaddl.s16 $4, $4 + vpaddl.s32 $4, $4 .endm #else .macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6 - vld1.32 {q0}, [\arg0], \arg2 - vld1.32 {q1}, [\arg1], \arg2 + vld1.32 {q0}, [\arg0], \arg2 + vld1.32 {q1}, [\arg1], \arg2 - vpadal.u8 \arg3, q0 - vpadal.u8 \arg4, q1 + vpadal.u8 \arg3, q0 + vpadal.u8 \arg4, q1 - vabd.u8 q0, q0, q1 - vmax.u8 \arg5, q0 - vpadal.u8 \arg6, q0 + vabd.u8 q0, q0, q1 + vmax.u8 \arg5, q0 + vpadal.u8 \arg6, q0 .endm .macro SAD_SD_MAD_8x16BYTES arg0, arg1, arg2, arg3, arg4, arg5 - vld1.32 {q0}, [\arg0], \arg2 - vld1.32 {q1}, [\arg1], \arg2 + vld1.32 {q0}, [\arg0], \arg2 + vld1.32 {q1}, [\arg1], \arg2 - vpaddl.u8 q2, q0 - vpaddl.u8 q3, q1 + vpaddl.u8 
q2, q0 + vpaddl.u8 q3, q1 - vabd.u8 \arg3, q0, q1 - vpaddl.u8 \arg4, \arg3 //abs_diff + vabd.u8 \arg3, q0, q1 + vpaddl.u8 \arg4, \arg3 //abs_diff SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4 @@ -237,18 +237,18 @@ WELS_ASM_FUNC_END SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4 SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4 - vsub.u16 \arg5, q2, q3 + vsub.u16 \arg5, q2, q3 .endm .macro SAD_SD_MAD_CALC arg0, arg1, arg2, arg3, arg4 - vpmax.u8 d0, \arg0, \arg1 //8bytes - vpmax.u8 d0, d0, d0 //4bytes - vpmax.u8 \arg2, d0, d0 //2bytes + vpmax.u8 d0, \arg0, \arg1 //8bytes + vpmax.u8 d0, d0, d0 //4bytes + vpmax.u8 \arg2, d0, d0 //2bytes - vpaddl.u16 \arg3, \arg3 - vpaddl.u32 \arg3, \arg3 - vpaddl.s16 \arg4, \arg4 - vpaddl.s32 \arg4, \arg4 + vpaddl.u16 \arg3, \arg3 + vpaddl.u32 \arg3, \arg3 + vpaddl.s16 \arg4, \arg4 + vpaddl.s32 \arg4, \arg4 .endm #endif @@ -256,18 +256,18 @@ WELS_ASM_FUNC_BEGIN VAACalcSadBgd_neon stmdb sp!, {r4-r10} - ldr r4, [sp, #28] //load pic_stride - ldr r5, [sp, #36] //load psad8x8 + ldr r4, [sp, #28] //load pic_stride + ldr r5, [sp, #36] //load psad8x8 ldr r6, [sp, #40] //load psd8x8 ldr r7, [sp, #44] //load pmad8x8 - //Initial the Q4 register for save the "psadframe" - vmov.s64 q15, #0 + //Initial the Q4 register for save the "psadframe" + vmov.s64 q15, #0 - //Get the jump distance to use on loop codes - lsl r10, r4, #4 - sub r9, r10, #16 //R9 keep the 16*pic_stride-16 - sub r10, r2 //R10 keep the 16*pic_stride-pic_width + //Get the jump distance to use on loop codes + lsl r10, r4, #4 + sub r9, r10, #16 //R9 keep the 16*pic_stride-16 + sub r10, r2 //R10 keep the 16*pic_stride-pic_width vaa_calc_sad_bgd_loop0: @@ -276,391 +276,391 @@ vaa_calc_sad_bgd_loop0: vaa_calc_sad_bgd_loop1: - //Process the 16x16 bytes pmad psad psd - SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9 - SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10 + //Process the 16x16 bytes pmad psad psd + SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9 + SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10 SAD_SD_MAD_CALC d26, d27, d16, q11, q9 SAD_SD_MAD_CALC d28, d29, d17, q12, q10 - //Write to "psad8x8" buffer - vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]! - //Adjust the input address - sub r0, r9 - sub r1, r9 - //Write to "psd8x8" buffer - vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]! - subs r8, #16 - //Write to "pmad8x8" buffer - vst2.16 {d16[0],d17[0]}, [r7]! - //Save to calculate "psadframe" - vadd.u32 q11, q12 - vadd.u32 q15, q11 + //Write to "psad8x8" buffer + vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]! + //Adjust the input address + sub r0, r9 + sub r1, r9 + //Write to "psd8x8" buffer + vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]! + subs r8, #16 + //Write to "pmad8x8" buffer + vst2.16 {d16[0],d17[0]}, [r7]! 
+ //Save to calculate "psadframe" + vadd.u32 q11, q12 + vadd.u32 q15, q11 - bne vaa_calc_sad_bgd_loop1 + bne vaa_calc_sad_bgd_loop1 - //Adjust the input address - add r0, r10 - add r1, r10 + //Adjust the input address + add r0, r10 + add r1, r10 subs r3, #16 - bne vaa_calc_sad_bgd_loop0 + bne vaa_calc_sad_bgd_loop0 - ldr r8, [sp, #32] //load psadframe - vadd.u32 d30, d31 - vst1.32 {d30[0]}, [r8] - ldmia sp!, {r4-r10} + ldr r8, [sp, #32] //load psadframe + vadd.u32 d30, d31 + vst1.32 {d30[0]}, [r8] + ldmia sp!, {r4-r10} WELS_ASM_FUNC_END #ifdef __APPLE__ .macro SSD_MUL_SUM_16BYTES_RESET - vmull.u8 $3, $0, $0 - vpaddl.u16 $2, $3 + vmull.u8 $3, $0, $0 + vpaddl.u16 $2, $3 - vmull.u8 $3, $1, $1 - vpadal.u16 $2, $3 + vmull.u8 $3, $1, $1 + vpadal.u16 $2, $3 .endm .macro SSD_MUL_SUM_16BYTES - vmull.u8 $3, $0, $0 - vpadal.u16 $2, $3 + vmull.u8 $3, $0, $0 + vpadal.u16 $2, $3 - vmull.u8 $3, $1, $1 - vpadal.u16 $2, $3 + vmull.u8 $3, $1, $1 + vpadal.u16 $2, $3 .endm .macro SAD_SSD_BGD_16 - vld1.8 {q0}, [$0], $2 //load cur_row + vld1.8 {q0}, [$0], $2 //load cur_row - vpadal.u8 q3, q0 //add cur_row together - vpadal.u8 q4, q1 //add ref_row together + vpadal.u8 q3, q0 //add cur_row together + vpadal.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 + vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 - vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 + vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - vld1.8 {q1}, [$1], $2 //load ref_row - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vld1.8 {q1}, [$1], $2 //load ref_row + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm //the last row of a 16x16 block .macro SAD_SSD_BGD_16_end - vld1.8 {q0}, [$0], $1 //load cur_row + vld1.8 {q0}, [$0], $1 //load cur_row - vpadal.u8 q3, q0 //add cur_row together - vpadal.u8 q4, q1 //add ref_row together + vpadal.u8 q3, q0 //add cur_row together + vpadal.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 + vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 - vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16 + vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16 - SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm //for the begin of a 8x16 block, use some instructions to reset the register .macro SAD_SSD_BGD_16_RESET_8x8 - vld1.8 {q0}, [$0], $2 //load cur_row + vld1.8 {q0}, [$0], $2 //load cur_row - vpaddl.u8 q3, q0 //add cur_row together - vpaddl.u8 q4, q1 //add ref_row together + vpaddl.u8 q3, q0 //add cur_row together + vpaddl.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vmov q5,q2 //calculate max and avoid reset to zero, l_mad 
for 16 bytes reset for every 8x16 + vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16 - vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 + vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - vld1.8 {q1}, [$1], $2 //load ref_row + vld1.8 {q1}, [$1], $2 //load ref_row - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm //for the begin of a 16x16 block, use some instructions to reset the register .macro SAD_SSD_BGD_16_RESET_16x16 - vld1.8 {q0}, [$0], $2 //load cur_row - vld1.8 {q1}, [$1], $2 //load ref_row + vld1.8 {q0}, [$0], $2 //load cur_row + vld1.8 {q1}, [$1], $2 //load ref_row - vpaddl.u8 q3, q0 //add cur_row together - vpaddl.u8 q4, q1 //add ref_row together + vpaddl.u8 q3, q0 //add cur_row together + vpaddl.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16 + vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16 - vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 + vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 - vld1.8 {q1}, [$1], $2 //load ref_row + vld1.8 {q1}, [$1], $2 //load ref_row - vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16 .endm //for each 8x16 block .macro SAD_SSD_BGD_CALC_8x16 - vpmax.u8 d10, d10, d11 //4 numbers - vpmax.u8 d10, d10, d10 //2 numbers - vpmax.u8 d10, d10, d10 //1 number1 + vpmax.u8 d10, d10, d11 //4 numbers + vpmax.u8 d10, d10, d10 //2 numbers + vpmax.u8 d10, d10, d10 //1 number1 - vmov $0, d10 //d26 d27 keeps the l_mad + vmov $0, d10 //d26 d27 keeps the l_mad - //p_sd8x8 fix me - vpaddl.u16 q3, q3 - vpaddl.u16 q4, q4 + //p_sd8x8 fix me + vpaddl.u16 q3, q3 + vpaddl.u16 q4, q4 - vsub.i32 $1, q3, q4 - vpaddl.u32 $1, $1 + vsub.i32 $1, q3, q4 + vpaddl.u32 $1, $1 - //psad8x8 - vpaddl.u16 $2, $2 - vpaddl.u32 $2, $2 + //psad8x8 + vpaddl.u16 $2, $2 + vpaddl.u32 $2, $2 - //psadframe - vadd.i32 q12, $2 + //psadframe + vadd.i32 q12, $2 .endm .macro SAD_SSD_BGD_16x16 - //for one 8x16 - SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6 - SAD_SSD_BGD_16 $0, $1, $2, q6 - SAD_SSD_BGD_16 $0, $1, $2, q6 - SAD_SSD_BGD_16 $0, $1, $2, q6 - SAD_SSD_BGD_16 $0, $1, $2, q6 - SAD_SSD_BGD_16 $0, $1, $2, q6 - SAD_SSD_BGD_16 $0, $1, $2, q6 - SAD_SSD_BGD_16 $0, $1, $2, q6 + //for one 8x16 + SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6 + SAD_SSD_BGD_16 $0, $1, $2, q6 + SAD_SSD_BGD_16 $0, $1, $2, q6 + SAD_SSD_BGD_16 $0, $1, $2, q6 + SAD_SSD_BGD_16 $0, $1, $2, q6 + SAD_SSD_BGD_16 $0, $1, $2, q6 + SAD_SSD_BGD_16 $0, $1, $2, q6 + SAD_SSD_BGD_16 $0, $1, $2, q6 - SAD_SSD_BGD_CALC_8x16 d26, q14, q6 + SAD_SSD_BGD_CALC_8x16 d26, q14, q6 - //for another 8x16 - SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7 - 
SAD_SSD_BGD_16 $0, $1, $2, q7 - SAD_SSD_BGD_16 $0, $1, $2, q7 - SAD_SSD_BGD_16 $0, $1, $2, q7 - SAD_SSD_BGD_16 $0, $1, $2, q7 - SAD_SSD_BGD_16 $0, $1, $2, q7 - SAD_SSD_BGD_16 $0, $1, $2, q7 - SAD_SSD_BGD_16_end $0, $2, q7 + //for another 8x16 + SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7 + SAD_SSD_BGD_16 $0, $1, $2, q7 + SAD_SSD_BGD_16 $0, $1, $2, q7 + SAD_SSD_BGD_16 $0, $1, $2, q7 + SAD_SSD_BGD_16 $0, $1, $2, q7 + SAD_SSD_BGD_16 $0, $1, $2, q7 + SAD_SSD_BGD_16 $0, $1, $2, q7 + SAD_SSD_BGD_16_end $0, $2, q7 - SAD_SSD_BGD_CALC_8x16 d27, q15, q7 + SAD_SSD_BGD_CALC_8x16 d27, q15, q7 .endm .macro SSD_SAD_SD_MAD_PADDL - vpaddl.s16 $0, $0 - vpaddl.s32 $0, $0 - vadd.i32 $1, $1, $2 + vpaddl.s16 $0, $0 + vpaddl.s32 $0, $0 + vadd.i32 $1, $1, $2 .endm #else .macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3 - vmull.u8 \arg3, \arg0, \arg0 - vpaddl.u16 \arg2, \arg3 + vmull.u8 \arg3, \arg0, \arg0 + vpaddl.u16 \arg2, \arg3 - vmull.u8 \arg3, \arg1, \arg1 - vpadal.u16 \arg2, \arg3 + vmull.u8 \arg3, \arg1, \arg1 + vpadal.u16 \arg2, \arg3 .endm .macro SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3 - vmull.u8 \arg3, \arg0, \arg0 - vpadal.u16 \arg2, \arg3 + vmull.u8 \arg3, \arg0, \arg0 + vpadal.u16 \arg2, \arg3 - vmull.u8 \arg3, \arg1, \arg1 - vpadal.u16 \arg2, \arg3 + vmull.u8 \arg3, \arg1, \arg1 + vpadal.u16 \arg2, \arg3 .endm .macro SAD_SSD_BGD_16 arg0, arg1, arg2, arg3 - vld1.8 {q0}, [\arg0], \arg2 //load cur_row + vld1.8 {q0}, [\arg0], \arg2 //load cur_row - vpadal.u8 q3, q0 //add cur_row together - vpadal.u8 q4, q1 //add ref_row together + vpadal.u8 q3, q0 //add cur_row together + vpadal.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 + vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 - vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 + vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - vld1.8 {q1}, [\arg1], \arg2 //load ref_row - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vld1.8 {q1}, [\arg1], \arg2 //load ref_row + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm //the last row of a 16x16 block .macro SAD_SSD_BGD_16_end arg0, arg1, arg2 - vld1.8 {q0}, [\arg0], \arg1 //load cur_row + vld1.8 {q0}, [\arg0], \arg1 //load cur_row - vpadal.u8 q3, q0 //add cur_row together - vpadal.u8 q4, q1 //add ref_row together + vpadal.u8 q3, q0 //add cur_row together + vpadal.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 + vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16 - vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16 + vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16 - SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm //for the begin of a 8x16 block, 
use some instructions to reset the register .macro SAD_SSD_BGD_16_RESET_8x8 arg0, arg1, arg2, arg3 - vld1.8 {q0}, [\arg0], \arg2 //load cur_row + vld1.8 {q0}, [\arg0], \arg2 //load cur_row - vpaddl.u8 q3, q0 //add cur_row together - vpaddl.u8 q4, q1 //add ref_row together + vpaddl.u8 q3, q0 //add cur_row together + vpaddl.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16 + vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16 - vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 + vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16 - vld1.8 {q1}, [\arg1], \arg2 //load ref_row + vld1.8 {q1}, [\arg1], \arg2 //load ref_row - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm //for the begin of a 16x16 block, use some instructions to reset the register .macro SAD_SSD_BGD_16_RESET_16x16 arg0, arg1, arg2, arg3 - vld1.8 {q0}, [\arg0], \arg2 //load cur_row - vld1.8 {q1}, [\arg1], \arg2 //load ref_row + vld1.8 {q0}, [\arg0], \arg2 //load cur_row + vld1.8 {q1}, [\arg1], \arg2 //load ref_row - vpaddl.u8 q3, q0 //add cur_row together - vpaddl.u8 q4, q1 //add ref_row together + vpaddl.u8 q3, q0 //add cur_row together + vpaddl.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16 + vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16 - vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 + vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 - vld1.8 {q1}, [\arg1], \arg2 //load ref_row + vld1.8 {q1}, [\arg1], \arg2 //load ref_row - vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16 .endm //for each 8x16 block .macro SAD_SSD_BGD_CALC_8x16 arg0, arg1, arg2 - vpmax.u8 d10, d10, d11 //4 numbers - vpmax.u8 d10, d10, d10 //2 numbers - vpmax.u8 d10, d10, d10 //1 number1 + vpmax.u8 d10, d10, d11 //4 numbers + vpmax.u8 d10, d10, d10 //2 numbers + vpmax.u8 d10, d10, d10 //1 number1 - vmov \arg0, d10 //d26 d27 keeps the l_mad + vmov \arg0, d10 //d26 d27 keeps the l_mad - //p_sd8x8 - vpaddl.u16 q3, q3 - vpaddl.u16 q4, q4 + //p_sd8x8 + vpaddl.u16 q3, q3 + vpaddl.u16 q4, q4 - vsub.i32 \arg1, q3, q4 - vpaddl.u32 \arg1, \arg1 + vsub.i32 \arg1, q3, q4 + vpaddl.u32 \arg1, \arg1 - //psad8x8 - vpaddl.u16 \arg2, \arg2 - vpaddl.u32 \arg2, \arg2 + //psad8x8 + vpaddl.u16 \arg2, \arg2 + vpaddl.u32 \arg2, \arg2 - //psadframe - vadd.i32 q12, \arg2 + //psadframe + vadd.i32 q12, \arg2 .endm .macro SAD_SSD_BGD_16x16 arg0, arg1, arg2 - //for one 8x16 - SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6 - 
SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 + //for one 8x16 + SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_BGD_CALC_8x16 d26, q14, q6 + SAD_SSD_BGD_CALC_8x16 d26, q14, q6 - //for another 8x16 - SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_BGD_16_end \arg0, \arg2, q7 + //for another 8x16 + SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_BGD_16_end \arg0, \arg2, q7 - SAD_SSD_BGD_CALC_8x16 d27, q15, q7 + SAD_SSD_BGD_CALC_8x16 d27, q15, q7 .endm .macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2 - vpaddl.s16 \arg0, \arg0 - vpaddl.s32 \arg0, \arg0 - vadd.i32 \arg1, \arg1, \arg2 + vpaddl.s16 \arg0, \arg0 + vpaddl.s32 \arg0, \arg0 + vadd.i32 \arg1, \arg1, \arg2 .endm #endif WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon - stmdb sp!, {r0-r12, r14} - vpush {q4-q7} + stmdb sp!, {r0-r12, r14} + vpush {q4-q7} - ldr r4, [sp, #120] //r4 keeps the pic_stride + ldr r4, [sp, #120] //r4 keeps the pic_stride - sub r5, r4, #1 - lsl r5, r5, #4 //r5 keeps the little step + sub r5, r4, #1 + lsl r5, r5, #4 //r5 keeps the little step - lsl r6, r4, #4 - sub r6, r2, r6 //r6 keeps the big step + lsl r6, r4, #4 + sub r6, r2, r6 //r6 keeps the big step - ldr r8, [sp, #128]//psad8x8 - ldr r9, [sp, #132]//psum16x16 - ldr r10, [sp, #136]//psqsum16x16 - ldr r11, [sp, #140]//psqdiff16x16 - ldr r12, [sp, #144]//p_sd8x8 - ldr r14, [sp, #148]//p_mad8x8 + ldr r8, [sp, #128]//psad8x8 + ldr r9, [sp, #132]//psum16x16 + ldr r10, [sp, #136]//psqsum16x16 + ldr r11, [sp, #140]//psqdiff16x16 + ldr r12, [sp, #144]//p_sd8x8 + ldr r14, [sp, #148]//p_mad8x8 - vmov.i8 q12, #0 + vmov.i8 q12, #0 vaa_calc_sad_ssd_bgd_height_loop: mov r7, r2 vaa_calc_sad_ssd_bgd_width_loop: - //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff q8, l_sum q9, l_sqsum q10 + //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff q8, l_sum q9, l_sqsum q10 SAD_SSD_BGD_16x16 r0,r1,r4 //psad8x8 @@ -694,243 +694,243 @@ vaa_calc_sad_ssd_bgd_width_loop: bne vaa_calc_sad_ssd_bgd_width_loop - sub r0, r0, r6 //jump to next 16 x width - sub r1, r1, r6 //jump to next 16 x width + sub r0, r0, r6 //jump to next 16 x width + sub r1, r1, r6 //jump to next 16 x width subs r3, #16 bne vaa_calc_sad_ssd_bgd_height_loop - //psadframe - ldr r7, [sp, #124]//psadframe + //psadframe + ldr r7, [sp, #124]//psadframe - vadd.i32 d24, d24, d25 - vst1.32 {d24[0]}, [r7] + vadd.i32 d24, d24, d25 + vst1.32 {d24[0]}, [r7] - vpop {q4-q7} - ldmia sp!, {r0-r12, r14} + vpop {q4-q7} + ldmia sp!, {r0-r12, r14} WELS_ASM_FUNC_END #ifdef __APPLE__ .macro SAD_VAR_16 - vld1.8 {q0}, 
[$0], $2 //load cur_row + vld1.8 {q0}, [$0], $2 //load cur_row - vpadal.u8 q3, q0 //add cur_row together - vpadal.u8 q4, q1 //add ref_row together + vpadal.u8 q3, q0 //add cur_row together + vpadal.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 + vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - vld1.8 {q1}, [$1], $2 + vld1.8 {q1}, [$1], $2 - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm .macro SAD_VAR_16_END - vld1.8 {q0}, [$0], $1 //load cur_row + vld1.8 {q0}, [$0], $1 //load cur_row - vpadal.u8 q3, q0 //add cur_row together - vpadal.u8 q4, q1 //add ref_row together + vpadal.u8 q3, q0 //add cur_row together + vpadal.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16 + vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16 - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm .macro SAD_VAR_16_RESET_16x16 - vld1.8 {q0}, [$0], $2 //load cur_row - vld1.8 {q1}, [$1], $2 + vld1.8 {q0}, [$0], $2 //load cur_row + vld1.8 {q1}, [$1], $2 - vpaddl.u8 q3, q0 //add cur_row together - vpaddl.u8 q4, q1 //add ref_row together + vpaddl.u8 q3, q0 //add cur_row together + vpaddl.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 + vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - vld1.8 {q1}, [$1], $2 + vld1.8 {q1}, [$1], $2 - vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11 + SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11 .endm .macro SAD_VAR_16_RESET_8x8 - vld1.8 {q0}, [$0], $2 //load cur_row + vld1.8 {q0}, [$0], $2 //load cur_row - vpaddl.u8 q3, q0 //add cur_row together - vpaddl.u8 q4, q1 //add ref_row together + vpaddl.u8 q3, q0 //add cur_row together + vpaddl.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 + vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16 - vld1.8 {q1}, [$1], $2 + vld1.8 {q1}, [$1], $2 - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm .macro SAD_VAR_16x16 - //for one 8x16 - SAD_VAR_16_RESET_16x16 $0, $1, $2, q6 - SAD_VAR_16 $0, $1, $2, q6 - SAD_VAR_16 $0, $1, $2, q6 - SAD_VAR_16 $0, $1, $2, q6 - SAD_VAR_16 $0, $1, $2, q6 - SAD_VAR_16 $0, $1, $2, q6 - SAD_VAR_16 $0, $1, $2, q6 - SAD_VAR_16 $0, $1, $2, q6 + //for one 8x16 + SAD_VAR_16_RESET_16x16 $0, $1, $2, q6 + SAD_VAR_16 $0, $1, $2, q6 + SAD_VAR_16 $0, $1, $2, q6 + SAD_VAR_16 $0, $1, $2, q6 + SAD_VAR_16 $0, $1, $2, q6 + SAD_VAR_16 $0, $1, $2, q6 + SAD_VAR_16 $0, $1, $2, q6 + SAD_VAR_16 $0, $1, $2, q6 - vpaddl.u16 q6, q6 
- vpaddl.u32 q6, q6 - vadd.i32 q12, q6 + vpaddl.u16 q6, q6 + vpaddl.u32 q6, q6 + vadd.i32 q12, q6 - //for another 8x16 - SAD_VAR_16_RESET_8x8 $0, $1, $2, q7 - SAD_VAR_16 $0, $1, $2, q7 - SAD_VAR_16 $0, $1, $2, q7 - SAD_VAR_16 $0, $1, $2, q7 - SAD_VAR_16 $0, $1, $2, q7 - SAD_VAR_16 $0, $1, $2, q7 - SAD_VAR_16 $0, $1, $2, q7 - SAD_VAR_16_END $0, $2, q7 + //for another 8x16 + SAD_VAR_16_RESET_8x8 $0, $1, $2, q7 + SAD_VAR_16 $0, $1, $2, q7 + SAD_VAR_16 $0, $1, $2, q7 + SAD_VAR_16 $0, $1, $2, q7 + SAD_VAR_16 $0, $1, $2, q7 + SAD_VAR_16 $0, $1, $2, q7 + SAD_VAR_16 $0, $1, $2, q7 + SAD_VAR_16_END $0, $2, q7 - vpaddl.u16 q7, q7 - vpaddl.u32 q7, q7 + vpaddl.u16 q7, q7 + vpaddl.u32 q7, q7 - vadd.i32 q12, q7 + vadd.i32 q12, q7 .endm #else .macro SAD_VAR_16 arg0, arg1, arg2, arg3 - vld1.8 {q0}, [\arg0], \arg2 //load cur_row + vld1.8 {q0}, [\arg0], \arg2 //load cur_row - vpadal.u8 q3, q0 //add cur_row together - vpadal.u8 q4, q1 //add ref_row together + vpadal.u8 q3, q0 //add cur_row together + vpadal.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 + vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - vld1.8 {q1}, [\arg1], \arg2 + vld1.8 {q1}, [\arg1], \arg2 - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm .macro SAD_VAR_16_END arg0, arg1, arg2 - vld1.8 {q0}, [\arg0], \arg1 //load cur_row + vld1.8 {q0}, [\arg0], \arg1 //load cur_row - vpadal.u8 q3, q0 //add cur_row together - vpadal.u8 q4, q1 //add ref_row together + vpadal.u8 q3, q0 //add cur_row together + vpadal.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16 + vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16 - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm .macro SAD_VAR_16_RESET_16x16 arg0, arg1, arg2, arg3 - vld1.8 {q0}, [\arg0], \arg2 //load cur_row - vld1.8 {q1}, [\arg1], \arg2 + vld1.8 {q0}, [\arg0], \arg2 //load cur_row + vld1.8 {q1}, [\arg1], \arg2 - vpaddl.u8 q3, q0 //add cur_row together - vpaddl.u8 q4, q1 //add ref_row together + vpaddl.u8 q3, q0 //add cur_row together + vpaddl.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 + vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - vld1.8 {q1}, [\arg1], \arg2 + vld1.8 {q1}, [\arg1], \arg2 - vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11 + SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11 .endm .macro SAD_VAR_16_RESET_8x8 arg0, arg1, arg2, arg3 - vld1.8 {q0}, [\arg0], \arg2 //load cur_row + vld1.8 {q0}, [\arg0], \arg2 //load cur_row - vpaddl.u8 q3, q0 //add cur_row together - vpaddl.u8 q4, q1 //add ref_row together + vpaddl.u8 q3, q0 //add cur_row together + vpaddl.u8 q4, q1 //add ref_row together - vabd.u8 q2, q0, q1 //abs_diff + vabd.u8 q2, q0, q1 //abs_diff - vpaddl.u8 \arg3, q2 //l_sad 
for 16 bytes reset for every 8x16 + vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16 - vld1.8 {q1}, [\arg1], \arg2 + vld1.8 {q1}, [\arg1], \arg2 - vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 + vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16 - SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 + SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16 .endm .macro SAD_VAR_16x16 arg0, arg1, arg2 - //for one 8x16 - SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6 - SAD_VAR_16 \arg0, \arg1, \arg2, q6 - SAD_VAR_16 \arg0, \arg1, \arg2, q6 - SAD_VAR_16 \arg0, \arg1, \arg2, q6 - SAD_VAR_16 \arg0, \arg1, \arg2, q6 - SAD_VAR_16 \arg0, \arg1, \arg2, q6 - SAD_VAR_16 \arg0, \arg1, \arg2, q6 - SAD_VAR_16 \arg0, \arg1, \arg2, q6 + //for one 8x16 + SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6 + SAD_VAR_16 \arg0, \arg1, \arg2, q6 + SAD_VAR_16 \arg0, \arg1, \arg2, q6 + SAD_VAR_16 \arg0, \arg1, \arg2, q6 + SAD_VAR_16 \arg0, \arg1, \arg2, q6 + SAD_VAR_16 \arg0, \arg1, \arg2, q6 + SAD_VAR_16 \arg0, \arg1, \arg2, q6 + SAD_VAR_16 \arg0, \arg1, \arg2, q6 - vpaddl.u16 q6, q6 - vpaddl.u32 q6, q6 - vadd.i32 q12, q6 + vpaddl.u16 q6, q6 + vpaddl.u32 q6, q6 + vadd.i32 q12, q6 - //for another 8x16 - SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7 - SAD_VAR_16 \arg0, \arg1, \arg2, q7 - SAD_VAR_16 \arg0, \arg1, \arg2, q7 - SAD_VAR_16 \arg0, \arg1, \arg2, q7 - SAD_VAR_16 \arg0, \arg1, \arg2, q7 - SAD_VAR_16 \arg0, \arg1, \arg2, q7 - SAD_VAR_16 \arg0, \arg1, \arg2, q7 - SAD_VAR_16_END \arg0, \arg2, q7 + //for another 8x16 + SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7 + SAD_VAR_16 \arg0, \arg1, \arg2, q7 + SAD_VAR_16 \arg0, \arg1, \arg2, q7 + SAD_VAR_16 \arg0, \arg1, \arg2, q7 + SAD_VAR_16 \arg0, \arg1, \arg2, q7 + SAD_VAR_16 \arg0, \arg1, \arg2, q7 + SAD_VAR_16 \arg0, \arg1, \arg2, q7 + SAD_VAR_16_END \arg0, \arg2, q7 - vpaddl.u16 q7, q7 - vpaddl.u32 q7, q7 + vpaddl.u16 q7, q7 + vpaddl.u32 q7, q7 - vadd.i32 q12, q7 + vadd.i32 q12, q7 .endm #endif WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon - stmdb sp!, {r4-r11} - vpush {q4} - vpush {q6-q7} + stmdb sp!, {r4-r11} + vpush {q4} + vpush {q6-q7} - ldr r4, [sp, #80] //r4 keeps the pic_stride + ldr r4, [sp, #80] //r4 keeps the pic_stride - sub r5, r4, #1 - lsl r5, r5, #4 //r5 keeps the little step + sub r5, r4, #1 + lsl r5, r5, #4 //r5 keeps the little step - lsl r6, r4, #4 - sub r6, r2, r6 //r6 keeps the big step + lsl r6, r4, #4 + sub r6, r2, r6 //r6 keeps the big step - ldr r7, [sp, #84] //psadframe - ldr r8, [sp, #88] //psad8x8 - ldr r9, [sp, #92] //psum16x16 - ldr r10, [sp, #96] //psqsum16x16 + ldr r7, [sp, #84] //psadframe + ldr r8, [sp, #88] //psad8x8 + ldr r9, [sp, #92] //psum16x16 + ldr r10, [sp, #96] //psqsum16x16 - vmov.i8 q12, #0 + vmov.i8 q12, #0 vaa_calc_sad_var_height_loop: mov r11, r2 @@ -956,154 +956,154 @@ vaa_calc_sad_var_width_loop: bne vaa_calc_sad_var_width_loop - sub r0, r0, r6 //jump to next 16 x width - sub r1, r1, r6 //jump to next 16 x width + sub r0, r0, r6 //jump to next 16 x width + sub r1, r1, r6 //jump to next 16 x width subs r3, #16 bne vaa_calc_sad_var_height_loop - vadd.i32 d24, d24, d25 - vst1.32 {d24[0]}, [r7] + vadd.i32 d24, d24, d25 + vst1.32 {d24[0]}, [r7] - vpop {q6-q7} - vpop {q4} - ldmia sp!, {r4-r11} + vpop {q6-q7} + vpop {q4} + ldmia sp!, {r4-r11} WELS_ASM_FUNC_END #ifdef __APPLE__ .macro SAD_SSD_16 - SAD_VAR_16 $0, $1, $2, $3 + SAD_VAR_16 $0, $1, $2, $3 - SSD_MUL_SUM_16BYTES d4,d5,q8, q11 + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 .endm .macro SAD_SSD_16_END - SAD_VAR_16_END $0, 
$1, $2 + SAD_VAR_16_END $0, $1, $2 - SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm .macro SAD_SSD_16_RESET_16x16 - SAD_VAR_16_RESET_16x16 $0, $1, $2, $3 + SAD_VAR_16_RESET_16x16 $0, $1, $2, $3 - SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm .macro SAD_SSD_16_RESET_8x8 - SAD_VAR_16_RESET_8x8 $0, $1, $2, $3 + SAD_VAR_16_RESET_8x8 $0, $1, $2, $3 - SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm .macro SAD_SSD_16x16 - //for one 8x16 - SAD_SSD_16_RESET_16x16 $0, $1, $2, q6 - SAD_SSD_16 $0, $1, $2, q6 - SAD_SSD_16 $0, $1, $2, q6 - SAD_SSD_16 $0, $1, $2, q6 - SAD_SSD_16 $0, $1, $2, q6 - SAD_SSD_16 $0, $1, $2, q6 - SAD_SSD_16 $0, $1, $2, q6 - SAD_SSD_16 $0, $1, $2, q6 + //for one 8x16 + SAD_SSD_16_RESET_16x16 $0, $1, $2, q6 + SAD_SSD_16 $0, $1, $2, q6 + SAD_SSD_16 $0, $1, $2, q6 + SAD_SSD_16 $0, $1, $2, q6 + SAD_SSD_16 $0, $1, $2, q6 + SAD_SSD_16 $0, $1, $2, q6 + SAD_SSD_16 $0, $1, $2, q6 + SAD_SSD_16 $0, $1, $2, q6 - vpaddl.u16 q6, q6 - vpaddl.u32 q6, q6 - vadd.i32 q12, q6 + vpaddl.u16 q6, q6 + vpaddl.u32 q6, q6 + vadd.i32 q12, q6 - //for another 8x16 - SAD_SSD_16_RESET_8x8 $0, $1, $2, q7 - SAD_SSD_16 $0, $1, $2, q7 - SAD_SSD_16 $0, $1, $2, q7 - SAD_SSD_16 $0, $1, $2, q7 - SAD_SSD_16 $0, $1, $2, q7 - SAD_SSD_16 $0, $1, $2, q7 - SAD_SSD_16 $0, $1, $2, q7 - SAD_SSD_16_END $0, $2, q7 + //for another 8x16 + SAD_SSD_16_RESET_8x8 $0, $1, $2, q7 + SAD_SSD_16 $0, $1, $2, q7 + SAD_SSD_16 $0, $1, $2, q7 + SAD_SSD_16 $0, $1, $2, q7 + SAD_SSD_16 $0, $1, $2, q7 + SAD_SSD_16 $0, $1, $2, q7 + SAD_SSD_16 $0, $1, $2, q7 + SAD_SSD_16_END $0, $2, q7 - vpaddl.u16 q7, q7 - vpaddl.u32 q7, q7 + vpaddl.u16 q7, q7 + vpaddl.u32 q7, q7 - vadd.i32 q12, q7 + vadd.i32 q12, q7 .endm #else .macro SAD_SSD_16 arg0, arg1, arg2, arg3 - SAD_VAR_16 \arg0, \arg1, \arg2, \arg3 + SAD_VAR_16 \arg0, \arg1, \arg2, \arg3 - SSD_MUL_SUM_16BYTES d4,d5,q8, q11 + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 .endm .macro SAD_SSD_16_END arg0, arg1, arg2 - SAD_VAR_16_END \arg0, \arg1, \arg2 + SAD_VAR_16_END \arg0, \arg1, \arg2 - SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm .macro SAD_SSD_16_RESET_16x16 arg0, arg1, arg2, arg3 - SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3 + SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3 - SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm .macro SAD_SSD_16_RESET_8x8 arg0, arg1, arg2, arg3 - SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3 + SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3 - SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 + SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16 .endm .macro SAD_SSD_16x16 arg0, arg1, arg2 - //for one 8x16 - SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6 - SAD_SSD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_16 \arg0, \arg1, \arg2, q6 - SAD_SSD_16 \arg0, \arg1, \arg2, q6 + //for one 8x16 + SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6 + 
SAD_SSD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_16 \arg0, \arg1, \arg2, q6 + SAD_SSD_16 \arg0, \arg1, \arg2, q6 - vpaddl.u16 q6, q6 - vpaddl.u32 q6, q6 - vadd.i32 q12, q6 + vpaddl.u16 q6, q6 + vpaddl.u32 q6, q6 + vadd.i32 q12, q6 - //for another 8x16 - SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7 - SAD_SSD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_16 \arg0, \arg1, \arg2, q7 - SAD_SSD_16_END \arg0, \arg2, q7 + //for another 8x16 + SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7 + SAD_SSD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_16 \arg0, \arg1, \arg2, q7 + SAD_SSD_16_END \arg0, \arg2, q7 - vpaddl.u16 q7, q7 - vpaddl.u32 q7, q7 + vpaddl.u16 q7, q7 + vpaddl.u32 q7, q7 - vadd.i32 q12, q7 + vadd.i32 q12, q7 .endm #endif WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon - stmdb sp!, {r4-r12} - vpush {q4} - vpush {q6-q7} + stmdb sp!, {r4-r12} + vpush {q4} + vpush {q6-q7} - ldr r4, [sp, #84] //r4 keeps the pic_stride + ldr r4, [sp, #84] //r4 keeps the pic_stride - sub r5, r4, #1 - lsl r5, r5, #4 //r5 keeps the little step + sub r5, r4, #1 + lsl r5, r5, #4 //r5 keeps the little step - lsl r6, r4, #4 - sub r6, r2, r6 //r6 keeps the big step + lsl r6, r4, #4 + sub r6, r2, r6 //r6 keeps the big step - ldr r7, [sp, #88] //psadframe - ldr r8, [sp, #92] //psad8x8 - ldr r9, [sp, #96] //psum16x16 - ldr r10, [sp, #100] //psqsum16x16 - ldr r11, [sp, #104] //psqdiff16x16 + ldr r7, [sp, #88] //psadframe + ldr r8, [sp, #92] //psad8x8 + ldr r9, [sp, #96] //psum16x16 + ldr r10, [sp, #100] //psqsum16x16 + ldr r11, [sp, #104] //psqdiff16x16 - vmov.i8 q12, #0 + vmov.i8 q12, #0 vaa_calc_sad_ssd_height_loop: mov r12, r2 @@ -1136,18 +1136,18 @@ vaa_calc_sad_ssd_width_loop: bne vaa_calc_sad_ssd_width_loop - sub r0, r0, r6 //jump to next 16 x width - sub r1, r1, r6 //jump to next 16 x width + sub r0, r0, r6 //jump to next 16 x width + sub r1, r1, r6 //jump to next 16 x width subs r3, #16 - bne vaa_calc_sad_ssd_height_loop + bne vaa_calc_sad_ssd_height_loop - vadd.i32 d24, d24, d25 - vst1.32 {d24[0]}, [r7] + vadd.i32 d24, d24, d25 + vst1.32 {d24[0]}, [r7] - vpop {q6-q7} - vpop {q4} - ldmia sp!, {r4-r12} + vpop {q6-q7} + vpop {q4} + ldmia sp!, {r4-r12} WELS_ASM_FUNC_END #endif diff --git a/codec/processing/src/x86/denoisefilter.asm b/codec/processing/src/x86/denoisefilter.asm index 0914bacc..ec10ca3d 100644 --- a/codec/processing/src/x86/denoisefilter.asm +++ b/codec/processing/src/x86/denoisefilter.asm @@ -56,217 +56,217 @@ sse2_20 times 8 dw 20 ;*********************************************************************** SECTION .text -%macro WEIGHT_LINE 9 - movq %2, %9 - punpcklbw %2, %7 - movdqa %8, %2 +%macro WEIGHT_LINE 9 + movq %2, %9 + punpcklbw %2, %7 + movdqa %8, %2 - movdqa %1, %6 - psubusb %1, %8 - psubusb %8, %6 - por %8, %1 ; ABS(curPixel - centerPixel); + movdqa %1, %6 + psubusb %1, %8 + psubusb %8, %6 + por %8, %1 ; ABS(curPixel - centerPixel); - movdqa %1, %3 - psubusb %1, %8 + movdqa %1, %3 + psubusb %1, %8 - pmullw %1, %1 - psrlw %1, 5 - pmullw %2, %1 - paddusw %4, %1 - paddusw %5, %2 + pmullw %1, %1 + psrlw %1, 5 + pmullw %2, %1 + paddusw %4, %1 + paddusw %5, %2 %endmacro -%macro WEIGHT_LINE1_UV 4 
- movdqa %2, %1 - punpcklbw %2, %4 - paddw %3, %2 +%macro WEIGHT_LINE1_UV 4 + movdqa %2, %1 + punpcklbw %2, %4 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 1 - punpcklbw %2, %4 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 1 + punpcklbw %2, %4 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 2 - punpcklbw %2, %4 - psllw %2, 1 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 2 + punpcklbw %2, %4 + psllw %2, 1 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 3 - punpcklbw %2, %4 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 3 + punpcklbw %2, %4 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 4 - punpcklbw %2, %4 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 4 + punpcklbw %2, %4 + paddw %3, %2 %endmacro -%macro WEIGHT_LINE2_UV 4 - movdqa %2, %1 - punpcklbw %2, %4 - paddw %3, %2 +%macro WEIGHT_LINE2_UV 4 + movdqa %2, %1 + punpcklbw %2, %4 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 1 - punpcklbw %2, %4 - psllw %2, 1 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 1 + punpcklbw %2, %4 + psllw %2, 1 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 2 - punpcklbw %2, %4 - psllw %2, 2 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 2 + punpcklbw %2, %4 + psllw %2, 2 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 3 - punpcklbw %2, %4 - psllw %2, 1 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 3 + punpcklbw %2, %4 + psllw %2, 1 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 4 - punpcklbw %2, %4 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 4 + punpcklbw %2, %4 + paddw %3, %2 %endmacro -%macro WEIGHT_LINE3_UV 4 - movdqa %2, %1 - punpcklbw %2, %4 - psllw %2, 1 - paddw %3, %2 +%macro WEIGHT_LINE3_UV 4 + movdqa %2, %1 + punpcklbw %2, %4 + psllw %2, 1 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 1 - punpcklbw %2, %4 - psllw %2, 2 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 1 + punpcklbw %2, %4 + psllw %2, 2 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 2 - punpcklbw %2, %4 - pmullw %2, [sse2_20] - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 2 + punpcklbw %2, %4 + pmullw %2, [sse2_20] + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 3 - punpcklbw %2, %4 - psllw %2, 2 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 3 + punpcklbw %2, %4 + psllw %2, 2 + paddw %3, %2 - movdqa %2, %1 - psrldq %2, 4 - punpcklbw %2, %4 - psllw %2, 1 - paddw %3, %2 + movdqa %2, %1 + psrldq %2, 4 + punpcklbw %2, %4 + psllw %2, 1 + paddw %3, %2 %endmacro ;*********************************************************************** ; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride); ;*********************************************************************** -; 1 2 3 -; 4 0 5 -; 6 7 8 -; 0: the center point +; 1 2 3 +; 4 0 5 +; 6 7 8 +; 0: the center point WELS_EXTERN BilateralLumaFilter8_sse2 - push r3 - %assign push_num 1 - LOAD_2_PARA - PUSH_XMM 8 + push r3 + %assign push_num 1 + LOAD_2_PARA + PUSH_XMM 8 - pxor xmm7, xmm7 + pxor xmm7, xmm7 - mov r3, r0 + mov r3, r0 - movq xmm6, [r0] - punpcklbw xmm6, xmm7 - movdqa xmm3, [sse2_32] - pxor xmm4, xmm4 ; nTotWeight - pxor xmm5, xmm5 ; nSum + movq xmm6, [r0] + punpcklbw xmm6, xmm7 + movdqa xmm3, [sse2_32] + pxor xmm4, xmm4 ; nTotWeight + pxor xmm5, xmm5 ; nSum - dec r0 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5 + dec r0 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5 - sub r0, r1 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; 
pixel 2 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3 + sub r0, r1 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3 - lea r0, [r0 + r1 * 2] - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7 - WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8 + lea r0, [r0 + r1 * 2] + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7 + WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8 - pcmpeqw xmm0, xmm0 - psrlw xmm0, 15 - psllw xmm0, 8 - psubusw xmm0, xmm4 - pmullw xmm0, xmm6 - paddusw xmm5, xmm0 - psrlw xmm5, 8 - packuswb xmm5, xmm5 - movq [r3], xmm5 + pcmpeqw xmm0, xmm0 + psrlw xmm0, 15 + psllw xmm0, 8 + psubusw xmm0, xmm4 + pmullw xmm0, xmm6 + paddusw xmm5, xmm0 + psrlw xmm5, 8 + packuswb xmm5, xmm5 + movq [r3], xmm5 - POP_XMM - pop r3 - %assign push_num 0 + POP_XMM + pop r3 + %assign push_num 0 - ret + ret ;*********************************************************************** -; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride); +; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride); ;*********************************************************************** ;5x5 filter: -;1 1 2 1 1 -;1 2 4 2 1 -;2 4 20 4 2 -;1 2 4 2 1 -;1 1 2 1 1 +;1 1 2 1 1 +;1 2 4 2 1 +;2 4 20 4 2 +;1 2 4 2 1 +;1 1 2 1 1 WELS_EXTERN WaverageChromaFilter8_sse2 - push r3 + push r3 - %assign push_num 1 + %assign push_num 1 - LOAD_2_PARA + LOAD_2_PARA - mov r3, r1 - add r3, r3 - sub r0, r3 ; pixels - 2 * stride - sub r0, 2 + mov r3, r1 + add r3, r3 + sub r0, r3 ; pixels - 2 * stride + sub r0, 2 - pxor xmm0, xmm0 - pxor xmm3, xmm3 + pxor xmm0, xmm0 + pxor xmm3, xmm3 - movdqu xmm1, [r0] - WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0 + movdqu xmm1, [r0] + WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0 - movdqu xmm1, [r0 + r1] - WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0 + movdqu xmm1, [r0 + r1] + WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0 - add r0, r3 - movdqu xmm1, [r0] - WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0 + add r0, r3 + movdqu xmm1, [r0] + WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0 - movdqu xmm1, [r0 + r1] - WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0 + movdqu xmm1, [r0 + r1] + WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0 - movdqu xmm1, [r0 + r1 * 2] - WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0 + movdqu xmm1, [r0 + r1 * 2] + WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0 - psrlw xmm3, 6 - packuswb xmm3, xmm3 - movq [r0 + 2], xmm3 + psrlw xmm3, 6 + packuswb xmm3, xmm3 + movq [r0 + 2], xmm3 - pop r3 + pop r3 - %assign push_num 0 - ret + %assign push_num 0 + ret diff --git a/codec/processing/src/x86/downsample_bilinear.asm b/codec/processing/src/x86/downsample_bilinear.asm index 70d1b8a2..cbed9254 100644 --- a/codec/processing/src/x86/downsample_bilinear.asm +++ b/codec/processing/src/x86/downsample_bilinear.asm @@ -29,13 +29,13 @@ ;* POSSIBILITY OF SUCH DAMAGE. 
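The two SSE2 kernels from denoisefilter.asm above are easier to follow against a scalar reference: BilateralLumaFilter8_sse2 weights each of the eight 3x3 neighbours by the squared, clamped closeness to the centre pixel (the 32 comes from sse2_32, the >>5 from the psrlw) and gives whatever weight is left out of 256 back to the centre, while WaverageChromaFilter8_sse2 is a plain 5x5 weighted average whose kernel is listed in its comment and sums to 64 (hence the final psrlw by 6). The C below is a rough per-pixel sketch of that behaviour, not code from the project; the function and parameter names are illustrative, and the 16-bit saturating-arithmetic corner cases of the SIMD version are ignored.

#include <stdlib.h>

/* Illustrative sketch: one output pixel of the bilateral luma filter,
   given the centre pixel and its eight 3x3 neighbours. */
static unsigned char BilateralLumaPixel_ref(unsigned char center,
                                            const unsigned char neighbour[8]) {
    unsigned int sum = 0, tot_weight = 0;
    for (int i = 0; i < 8; ++i) {
        int w = 32 - abs((int)center - (int)neighbour[i]);
        if (w < 0) w = 0;
        w = (w * w) >> 5;                       /* pmullw + psrlw 5: weight in 0..32 */
        tot_weight += (unsigned int)w;
        sum += (unsigned int)w * neighbour[i];
    }
    sum += (256 - tot_weight) * center;         /* centre keeps the remaining weight */
    return (unsigned char)(sum >> 8);
}

/* Illustrative sketch: one output pixel of the 5x5 weighted chroma average;
   p points at the centre pixel of a plane with the given stride. */
static unsigned char WaverageChromaPixel_ref(const unsigned char* p, int stride) {
    static const int k[5][5] = {
        { 1, 1,  2, 1, 1 },
        { 1, 2,  4, 2, 1 },
        { 2, 4, 20, 4, 2 },
        { 1, 2,  4, 2, 1 },
        { 1, 1,  2, 1, 1 },
    };
    unsigned int sum = 0;
    for (int dy = -2; dy <= 2; ++dy)
        for (int dx = -2; dx <= 2; ++dx)
            sum += (unsigned int)k[dy + 2][dx + 2] * p[dy * stride + dx];
    return (unsigned char)(sum >> 6);           /* weights sum to 64 */
}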
;* ;* -;* upsampling.asm +;* upsampling.asm ;* ;* Abstract -;* SIMD for pixel domain down sampling +;* SIMD for pixel domain down sampling ;* ;* History -;* 10/22/2009 Created +;* 10/22/2009 Created ;* ;*************************************************************************/ %include "asm_inc.asm" @@ -61,9 +61,9 @@ SECTION .rodata align=16 ALIGN 16 shufb_mask_low: - db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h + db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h shufb_mask_high: - db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h + db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h ;*********************************************************************** @@ -73,737 +73,737 @@ shufb_mask_high: SECTION .text ;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); +; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); ;*********************************************************************** WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse - push ebx - push edx - push esi - push edi - push ebp + push ebx + push edx + push esi + push edi + push ebp - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight - sar ebp, $01 ; iSrcHeight >> 1 + sar ebp, $01 ; iSrcHeight >> 1 .yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $01 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 32 bytes + mov eax, [esp+40] ; iSrcWidth + sar eax, $01 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 32 bytes .xloops: - ; 1st part horizonal loop: x16 bytes - ; mem hi<- ->lo - ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E - ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M - ;=> target: - ;: H G F E D C B A, P O N M L K J I - ;: h g f e d c b a, p o n m l k j i - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movq mm0, [esi] ; 1st pSrc line - movq mm1, [esi+8] ; 1st pSrc line + 8 - movq mm2, [esi+ecx] ; 2nd pSrc line - movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8 + ; 1st part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E + ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M + ;=> target: + ;: H G F E D C B A, P O N M L K J I + ;: h g f e d c b a, p o n m l k j i + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movq mm0, [esi] ; 1st pSrc line + movq mm1, [esi+8] ; 1st pSrc line + 8 + movq mm2, [esi+ecx] ; 2nd pSrc line + movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8 - ; to handle mm0, mm1, mm2, mm3 - pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B - pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B - punpcklbw mm4, mm5 ; d c D C b a B A - pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 
11011000 B: mm4 + ; to handle mm0, mm1, mm2, mm3 + pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B + pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B + punpcklbw mm4, mm5 ; d c D C b a B A + pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 - pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B - pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B - punpcklbw mm5, mm6 ; h g H G f e F E - pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 + pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B + pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B + punpcklbw mm5, mm6 ; h g H G f e F E + pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 - pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B - pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B - punpcklbw mm6, mm7 ; l k L K j i J I - pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 + pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B + pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B + punpcklbw mm6, mm7 ; l k L K j i J I + pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 - pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B - pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B - punpcklbw mm7, mm0 ; p o P O n m N M - pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 + pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B + pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B + punpcklbw mm7, mm0 ; p o P O n m N M + pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 - ; to handle mm4, mm5, mm6, mm7 - movq mm0, mm4 ; - punpckldq mm0, mm5 ; H G F E D C B A - punpckhdq mm4, mm5 ; h g f e d c b a + ; to handle mm4, mm5, mm6, mm7 + movq mm0, mm4 ; + punpckldq mm0, mm5 ; H G F E D C B A + punpckhdq mm4, mm5 ; h g f e d c b a - movq mm1, mm6 - punpckldq mm1, mm7 ; P O N M L K J I - punpckhdq mm6, mm7 ; p o n m l k j i + movq mm1, mm6 + punpckldq mm1, mm7 ; P O N M L K J I + punpckhdq mm6, mm7 ; p o n m l k j i - ; avg within MB horizon width (16 x 2 lines) - pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 - pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 - pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once + ; avg within MB horizon width (16 x 2 lines) + pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 + pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 + pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once - ; 2nd part horizonal loop: x16 bytes - ; mem hi<- ->lo - ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E - ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M - ;=> target: - ;: H G F E D C B A, P O N M L K J I - ;: h g f e d c b a, p o n m l k j i - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movq mm1, [esi+16] ; 1st pSrc line + 16 - movq mm2, [esi+24] ; 1st pSrc line + 24 - movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16 - movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24 + ; 2nd part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E + ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M + ;=> target: + ;: H G F E D C B A, P O N M L K J I + ;: h g f e d c b a, p o n m l k j i + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movq mm1, [esi+16] ; 1st pSrc line + 16 + movq mm2, [esi+24] ; 1st pSrc line + 24 + movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16 + movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24 - ; to handle mm1, mm2, mm3, mm4 - pshufw mm5, mm1, 0d8h ; d D b B c 
C a A ; 11011000 B - pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B - punpcklbw mm5, mm6 ; d c D C b a B A - pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5 + ; to handle mm1, mm2, mm3, mm4 + pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B + pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B + punpcklbw mm5, mm6 ; d c D C b a B A + pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5 - pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B - pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B - punpcklbw mm6, mm7 ; h g H G f e F E - pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6 + pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B + pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B + punpcklbw mm6, mm7 ; h g H G f e F E + pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6 - pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B - pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B - punpcklbw mm7, mm1 ; l k L K j i J I - pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7 + pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B + pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B + punpcklbw mm7, mm1 ; l k L K j i J I + pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7 - pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B - pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B - punpcklbw mm1, mm2 ; p o P O n m N M - pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1 + pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B + pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B + punpcklbw mm1, mm2 ; p o P O n m N M + pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1 - ; to handle mm5, mm6, mm7, mm1 - movq mm2, mm5 - punpckldq mm2, mm6 ; H G F E D C B A - punpckhdq mm5, mm6 ; h g f e d c b a + ; to handle mm5, mm6, mm7, mm1 + movq mm2, mm5 + punpckldq mm2, mm6 ; H G F E D C B A + punpckhdq mm5, mm6 ; h g f e d c b a - movq mm3, mm7 - punpckldq mm3, mm1 ; P O N M L K J I - punpckhdq mm7, mm1 ; p o n m l k j i + movq mm3, mm7 + punpckldq mm3, mm1 ; P O N M L K J I + punpckhdq mm7, mm1 ; p o n m l k j i - ; avg within MB horizon width (16 x 2 lines) - pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 - pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 - pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part + ; avg within MB horizon width (16 x 2 lines) + pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 + pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 + pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part - movq [edi ], mm0 - movq [edi+8], mm2 + movq [edi ], mm0 + movq [edi+8], mm2 - ; next SMB - lea esi, [esi+32] - lea edi, [edi+16] + ; next SMB + lea esi, [esi+32] + lea edi, [edi+16] - dec eax - jg near .xloops + dec eax + jg near .xloops - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - dec ebp - jg near .yloops + dec ebp + jg near .yloops - WELSEMMS - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret + WELSEMMS + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret ;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const 
int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); +; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); ;*********************************************************************** WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse - push ebx - push edx - push esi - push edi - push ebp + push ebx + push edx + push esi + push edi + push ebp - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight - sar ebp, $01 ; iSrcHeight >> 1 + sar ebp, $01 ; iSrcHeight >> 1 .yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $01 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 16 bytes + mov eax, [esp+40] ; iSrcWidth + sar eax, $01 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 16 bytes .xloops: - ; 1st part horizonal loop: x16 bytes - ; mem hi<- ->lo - ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E - ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M - ;=> target: - ;: H G F E D C B A, P O N M L K J I - ;: h g f e d c b a, p o n m l k j i - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movq mm0, [esi] ; 1st pSrc line - movq mm1, [esi+8] ; 1st pSrc line + 8 - movq mm2, [esi+ecx] ; 2nd pSrc line - movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8 + ; 1st part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E + ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M + ;=> target: + ;: H G F E D C B A, P O N M L K J I + ;: h g f e d c b a, p o n m l k j i + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movq mm0, [esi] ; 1st pSrc line + movq mm1, [esi+8] ; 1st pSrc line + 8 + movq mm2, [esi+ecx] ; 2nd pSrc line + movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8 - ; to handle mm0, mm1, mm2, mm3 - pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B - pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B - punpcklbw mm4, mm5 ; d c D C b a B A - pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 + ; to handle mm0, mm1, mm2, mm3 + pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B + pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B + punpcklbw mm4, mm5 ; d c D C b a B A + pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 - pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B - pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B - punpcklbw mm5, mm6 ; h g H G f e F E - pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 + pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B + pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B + punpcklbw mm5, mm6 ; h g H G f e F E + pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 - pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B - pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B - punpcklbw mm6, mm7 ; l k L K j i J I - pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 + pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B + pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B + 
punpcklbw mm6, mm7 ; l k L K j i J I + pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 - pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B - pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B - punpcklbw mm7, mm0 ; p o P O n m N M - pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 + pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B + pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B + punpcklbw mm7, mm0 ; p o P O n m N M + pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 - ; to handle mm4, mm5, mm6, mm7 - movq mm0, mm4 ; - punpckldq mm0, mm5 ; H G F E D C B A - punpckhdq mm4, mm5 ; h g f e d c b a + ; to handle mm4, mm5, mm6, mm7 + movq mm0, mm4 ; + punpckldq mm0, mm5 ; H G F E D C B A + punpckhdq mm4, mm5 ; h g f e d c b a - movq mm1, mm6 - punpckldq mm1, mm7 ; P O N M L K J I - punpckhdq mm6, mm7 ; p o n m l k j i + movq mm1, mm6 + punpckldq mm1, mm7 ; P O N M L K J I + punpckhdq mm6, mm7 ; p o n m l k j i - ; avg within MB horizon width (16 x 2 lines) - pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 - pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 - pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once + ; avg within MB horizon width (16 x 2 lines) + pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 + pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 + pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once - movq [edi ], mm0 + movq [edi ], mm0 - ; next SMB - lea esi, [esi+16] - lea edi, [edi+8] + ; next SMB + lea esi, [esi+16] + lea edi, [edi+8] - dec eax - jg near .xloops + dec eax + jg near .xloops - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - dec ebp - jg near .yloops + dec ebp + jg near .yloops - WELSEMMS - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret + WELSEMMS + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret ;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); +; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); ;*********************************************************************** WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse - push ebx - push edx - push esi - push edi - push ebp + push ebx + push edx + push esi + push edi + push ebp - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight - sar ebp, $01 ; iSrcHeight >> 1 + sar ebp, $01 ; iSrcHeight >> 1 .yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $01 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $02 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each 
loop = source bandwidth: 8 bytes + mov eax, [esp+40] ; iSrcWidth + sar eax, $01 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $02 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 8 bytes .xloops: - ; 1st part horizonal loop: x8 bytes - ; mem hi<- ->lo - ;1st Line Src: mm0: d D c C b B a A - ;2nd Line Src: mm1: h H g G f F e E - ;=> target: - ;: H G F E D C B A - ;: h g f e d c b a - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movq mm0, [esi] ; 1st pSrc line - movq mm1, [esi+ecx] ; 2nd pSrc line + ; 1st part horizonal loop: x8 bytes + ; mem hi<- ->lo + ;1st Line Src: mm0: d D c C b B a A + ;2nd Line Src: mm1: h H g G f F e E + ;=> target: + ;: H G F E D C B A + ;: h g f e d c b a + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movq mm0, [esi] ; 1st pSrc line + movq mm1, [esi+ecx] ; 2nd pSrc line - ; to handle mm0, mm1, mm2, mm3 - pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B - pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B - punpcklbw mm2, mm3 ; d c D C b a B A - pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4 + ; to handle mm0, mm1, mm2, mm3 + pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B + pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B + punpcklbw mm2, mm3 ; d c D C b a B A + pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4 - pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B - pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B - punpcklbw mm4, mm5 ; h g H G f e F E - pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5 + pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B + pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B + punpcklbw mm4, mm5 ; h g H G f e F E + pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5 - ; to handle mm2, mm4 - movq mm0, mm2 ; - punpckldq mm0, mm4 ; H G F E D C B A - punpckhdq mm2, mm4 ; h g f e d c b a + ; to handle mm2, mm4 + movq mm0, mm2 ; + punpckldq mm0, mm4 ; H G F E D C B A + punpckhdq mm2, mm4 ; h g f e d c b a - ; avg within MB horizon width (16 x 2 lines) - pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2 - pshufw mm1, mm0, 04eh ; 01001110 B - pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once + ; avg within MB horizon width (16 x 2 lines) + pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2 + pshufw mm1, mm0, 04eh ; 01001110 B + pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once - movd [edi], mm0 + movd [edi], mm0 - ; next unit - lea esi, [esi+8] - lea edi, [edi+4] + ; next unit + lea esi, [esi+8] + lea edi, [edi+4] - dec eax - jg near .xloops + dec eax + jg near .xloops - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - dec ebp - jg near .yloops + dec ebp + jg near .yloops - WELSEMMS - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret + WELSEMMS + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret ; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse ;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int 
iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); +; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); ;*********************************************************************** WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3 - push ebx - push edx - push esi - push edi - push ebp + push ebx + push edx + push esi + push edi + push ebp - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight - sar ebp, $01 ; iSrcHeight >> 1 + sar ebp, $01 ; iSrcHeight >> 1 - movdqa xmm7, [shufb_mask_low] ; mask low - movdqa xmm6, [shufb_mask_high] ; mask high + movdqa xmm7, [shufb_mask_low] ; mask low + movdqa xmm6, [shufb_mask_high] ; mask high .yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $01 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 32 bytes + mov eax, [esp+40] ; iSrcWidth + sar eax, $01 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 32 bytes .xloops: - ; 1st part horizonal loop: x16 bytes - ; mem hi<- ->lo - ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A - ; xmm1: p P o O n N m M l L k K j J i I - ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A - ; xmm3: p P o O n N m M l L k K j J i I - ;=> target: - ;: P O N M L K J I H G F E D C B A - ;: p o n m l k j i h g f e d c b a - ;: P .. A - ;: p .. a + ; 1st part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A + ; xmm1: p P o O n N m M l L k K j J i I + ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A + ; xmm3: p P o O n N m M l L k K j J i I + ;=> target: + ;: P O N M L K J I H G F E D C B A + ;: p o n m l k j i h g f e d c b a + ;: P .. A + ;: p .. 
a - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movdqa xmm0, [esi] ; 1st_src_line - movdqa xmm1, [esi+16] ; 1st_src_line + 16 - movdqa xmm2, [esi+ecx] ; 2nd_src_line - movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa xmm0, [esi] ; 1st_src_line + movdqa xmm1, [esi+16] ; 1st_src_line + 16 + movdqa xmm2, [esi+ecx] ; 2nd_src_line + movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16 - ; packing & avg - movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A - pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A - pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - ; another implementation for xmm4 high bits -; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 -; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - pavgb xmm0, xmm4 + ; packing & avg + movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A + pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A + pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + ; another implementation for xmm4 high bits +; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 +; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + pavgb xmm0, xmm4 - movdqa xmm5, xmm1 - pshufb xmm1, xmm7 - pshufb xmm5, xmm6 -; psubb xmm5, xmm1 -; psrlw xmm5, 8 - pavgb xmm1, xmm5 + movdqa xmm5, xmm1 + pshufb xmm1, xmm7 + pshufb xmm5, xmm6 +; psubb xmm5, xmm1 +; psrlw xmm5, 8 + pavgb xmm1, xmm5 - movdqa xmm4, xmm2 - pshufb xmm2, xmm7 - pshufb xmm4, xmm6 -; psubb xmm4, xmm2 -; psrlw xmm4, 8 - pavgb xmm2, xmm4 + movdqa xmm4, xmm2 + pshufb xmm2, xmm7 + pshufb xmm4, xmm6 +; psubb xmm4, xmm2 +; psrlw xmm4, 8 + pavgb xmm2, xmm4 - movdqa xmm5, xmm3 - pshufb xmm3, xmm7 - pshufb xmm5, xmm6 -; psubb xmm5, xmm3 -; psrlw xmm5, 8 - pavgb xmm3, xmm5 + movdqa xmm5, xmm3 + pshufb xmm3, xmm7 + pshufb xmm5, xmm6 +; psubb xmm5, xmm3 +; psrlw xmm5, 8 + pavgb xmm3, xmm5 - packuswb xmm0, xmm1 - packuswb xmm2, xmm3 - pavgb xmm0, xmm2 + packuswb xmm0, xmm1 + packuswb xmm2, xmm3 + pavgb xmm0, xmm2 - ; write pDst - movdqa [edi], xmm0 + ; write pDst + movdqa [edi], xmm0 - ; next SMB - lea esi, [esi+32] - lea edi, [edi+16] + ; next SMB + lea esi, [esi+32] + lea edi, [edi+16] - dec eax - jg near .xloops + dec eax + jg near .xloops - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - dec ebp - jg near .yloops + dec ebp + jg near .yloops - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret ;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); +; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); ;*********************************************************************** WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3 - push ebx - push edx - push esi - push edi - push ebp + push ebx + push edx + push esi + push edi + push ebp - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; 
iSrcHeight + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight - sar ebp, $01 ; iSrcHeight >> 1 - movdqa xmm7, [shufb_mask_low] ; mask low - movdqa xmm6, [shufb_mask_high] ; mask high + sar ebp, $01 ; iSrcHeight >> 1 + movdqa xmm7, [shufb_mask_low] ; mask low + movdqa xmm6, [shufb_mask_high] ; mask high .yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $01 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 16 bytes + mov eax, [esp+40] ; iSrcWidth + sar eax, $01 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 16 bytes .xloops: - ; horizonal loop: x16 bytes by source - ; mem hi<- ->lo - ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A - ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I - ;=> target: - ;: H G F E D C B A, P O N M L K J I - ;: h g f e d c b a, p o n m l k j i + ; horizonal loop: x16 bytes by source + ; mem hi<- ->lo + ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A + ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I + ;=> target: + ;: H G F E D C B A, P O N M L K J I + ;: h g f e d c b a, p o n m l k j i - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movdqa xmm0, [esi] ; 1st_src_line - movdqa xmm1, [esi+ecx] ; 2nd_src_line + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa xmm0, [esi] ; 1st_src_line + movdqa xmm1, [esi+ecx] ; 2nd_src_line - ; packing & avg - movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A - pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A - pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - ; another implementation for xmm2 high bits -; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 -; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - pavgb xmm0, xmm2 + ; packing & avg + movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A + pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A + pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + ; another implementation for xmm2 high bits +; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 +; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + pavgb xmm0, xmm2 - movdqa xmm3, xmm1 - pshufb xmm1, xmm7 - pshufb xmm3, xmm6 -; psubb xmm3, xmm1 -; psrlw xmm3, 8 - pavgb xmm1, xmm3 + movdqa xmm3, xmm1 + pshufb xmm1, xmm7 + pshufb xmm3, xmm6 +; psubb xmm3, xmm1 +; psrlw xmm3, 8 + pavgb xmm1, xmm3 - pavgb xmm0, xmm1 - packuswb xmm0, xmm1 + pavgb xmm0, xmm1 + packuswb xmm0, xmm1 - ; write pDst - movq [edi], xmm0 + ; write pDst + movq [edi], xmm0 - ; next SMB - lea esi, [esi+16] - lea edi, [edi+8] + ; next SMB + lea esi, [esi+16] + lea edi, [edi+8] - dec eax - jg near .xloops + dec eax + jg near .xloops - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - dec ebp - jg near .yloops + dec ebp + jg near .yloops - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret ; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse 
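Every DyadicBilinearDownsampler* routine above, and the SSE4 variants that follow, compute the same result with different instruction mixes: each destination pixel is the rounded average of a 2x2 source block, built from three pavgb steps (average the horizontal pair in each source row, then average the two row results). The scalar sketch below mirrors that order; the function and variable names are illustrative rather than taken from the project, and note that chaining two rounded averages can differ by one LSB from a single (a+b+c+d+2)>>2, so the sketch keeps the pavgb order on purpose.

/* Same rounding as pavgb: (a + b + 1) >> 1. */
static unsigned char avg_round(unsigned char a, unsigned char b) {
    return (unsigned char)(((unsigned int)a + b + 1) >> 1);
}

/* Illustrative scalar sketch of the 2:1 dyadic bilinear downsample. */
void DyadicBilinearDownsample_ref(unsigned char* dst, int dst_stride,
                                  const unsigned char* src, int src_stride,
                                  int src_width, int src_height) {
    for (int y = 0; y < (src_height >> 1); ++y) {
        const unsigned char* row0 = src + 2 * y * src_stride;
        const unsigned char* row1 = row0 + src_stride;
        unsigned char* d = dst + y * dst_stride;
        for (int x = 0; x < (src_width >> 1); ++x) {
            /* horizontal pairs first, then the two row results */
            unsigned char top = avg_round(row0[2 * x], row0[2 * x + 1]);
            unsigned char bot = avg_round(row1[2 * x], row1[2 * x + 1]);
            d[x] = avg_round(top, bot);
        }
    }
}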
;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); +; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); ;*********************************************************************** WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4 - push ebx - push edx - push esi - push edi - push ebp + push ebx + push edx + push esi + push edi + push ebp - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, [esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight - sar ebp, $01 ; iSrcHeight >> 1 + sar ebp, $01 ; iSrcHeight >> 1 - movdqa xmm7, [shufb_mask_low] ; mask low - movdqa xmm6, [shufb_mask_high] ; mask high + movdqa xmm7, [shufb_mask_low] ; mask low + movdqa xmm6, [shufb_mask_high] ; mask high .yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $01 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 32 bytes + mov eax, [esp+40] ; iSrcWidth + sar eax, $01 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 32 bytes .xloops: - ; 1st part horizonal loop: x16 bytes - ; mem hi<- ->lo - ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A - ; xmm1: p P o O n N m M l L k K j J i I - ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A - ; xmm3: p P o O n N m M l L k K j J i I - ;=> target: - ;: P O N M L K J I H G F E D C B A - ;: p o n m l k j i h g f e d c b a - ;: P .. A - ;: p .. a + ; 1st part horizonal loop: x16 bytes + ; mem hi<- ->lo + ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A + ; xmm1: p P o O n N m M l L k K j J i I + ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A + ; xmm3: p P o O n N m M l L k K j J i I + ;=> target: + ;: P O N M L K J I H G F E D C B A + ;: p o n m l k j i h g f e d c b a + ;: P .. A + ;: p .. 
a - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movntdqa xmm0, [esi] ; 1st_src_line - movntdqa xmm1, [esi+16] ; 1st_src_line + 16 - movntdqa xmm2, [esi+ecx] ; 2nd_src_line - movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movntdqa xmm0, [esi] ; 1st_src_line + movntdqa xmm1, [esi+16] ; 1st_src_line + 16 + movntdqa xmm2, [esi+ecx] ; 2nd_src_line + movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16 - ; packing & avg - movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A - pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A - pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a -; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 -; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - pavgb xmm0, xmm4 + ; packing & avg + movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A + pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A + pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a +; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 +; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + pavgb xmm0, xmm4 - movdqa xmm5, xmm1 - pshufb xmm1, xmm7 - pshufb xmm5, xmm6 -; psubb xmm5, xmm1 -; psrlw xmm5, 8 - pavgb xmm1, xmm5 + movdqa xmm5, xmm1 + pshufb xmm1, xmm7 + pshufb xmm5, xmm6 +; psubb xmm5, xmm1 +; psrlw xmm5, 8 + pavgb xmm1, xmm5 - movdqa xmm4, xmm2 - pshufb xmm2, xmm7 - pshufb xmm4, xmm6 -; psubb xmm4, xmm2 -; psrlw xmm4, 8 - pavgb xmm2, xmm4 + movdqa xmm4, xmm2 + pshufb xmm2, xmm7 + pshufb xmm4, xmm6 +; psubb xmm4, xmm2 +; psrlw xmm4, 8 + pavgb xmm2, xmm4 - movdqa xmm5, xmm3 - pshufb xmm3, xmm7 - pshufb xmm5, xmm6 -; psubb xmm5, xmm3 -; psrlw xmm5, 8 - pavgb xmm3, xmm5 + movdqa xmm5, xmm3 + pshufb xmm3, xmm7 + pshufb xmm5, xmm6 +; psubb xmm5, xmm3 +; psrlw xmm5, 8 + pavgb xmm3, xmm5 - packuswb xmm0, xmm1 - packuswb xmm2, xmm3 - pavgb xmm0, xmm2 + packuswb xmm0, xmm1 + packuswb xmm2, xmm3 + pavgb xmm0, xmm2 - ; write pDst - movdqa [edi], xmm0 + ; write pDst + movdqa [edi], xmm0 - ; next SMB - lea esi, [esi+32] - lea edi, [edi+16] + ; next SMB + lea esi, [esi+32] + lea edi, [edi+16] - dec eax - jg near .xloops + dec eax + jg near .xloops - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - dec ebp - jg near .yloops + dec ebp + jg near .yloops - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret ;*********************************************************************** -; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride, -; unsigned char* pSrc, const int iSrcStride, -; const int iSrcWidth, const int iSrcHeight ); +; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride, +; unsigned char* pSrc, const int iSrcStride, +; const int iSrcWidth, const int iSrcHeight ); ;*********************************************************************** WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4 - push ebx - push edx - push esi - push edi - push ebp + push ebx + push edx + push esi + push edi + push ebp - mov edi, [esp+24] ; pDst - mov edx, [esp+28] ; iDstStride - mov esi, [esp+32] ; pSrc - mov ecx, [esp+36] ; iSrcStride - mov ebp, [esp+44] ; iSrcHeight + mov edi, [esp+24] ; pDst + mov edx, [esp+28] ; iDstStride + mov esi, 
[esp+32] ; pSrc + mov ecx, [esp+36] ; iSrcStride + mov ebp, [esp+44] ; iSrcHeight - sar ebp, $01 ; iSrcHeight >> 1 - movdqa xmm7, [shufb_mask_low] ; mask low - movdqa xmm6, [shufb_mask_high] ; mask high + sar ebp, $01 ; iSrcHeight >> 1 + movdqa xmm7, [shufb_mask_low] ; mask low + movdqa xmm6, [shufb_mask_high] ; mask high .yloops: - mov eax, [esp+40] ; iSrcWidth - sar eax, $01 ; iSrcWidth >> 1 - mov ebx, eax ; iDstWidth restored at ebx - sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb - neg ebx ; - (iSrcWidth >> 1) - ; each loop = source bandwidth: 16 bytes + mov eax, [esp+40] ; iSrcWidth + sar eax, $01 ; iSrcWidth >> 1 + mov ebx, eax ; iDstWidth restored at ebx + sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb + neg ebx ; - (iSrcWidth >> 1) + ; each loop = source bandwidth: 16 bytes .xloops: - ; horizonal loop: x16 bytes by source - ; mem hi<- ->lo - ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A - ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I - ;=> target: - ;: H G F E D C B A, P O N M L K J I - ;: h g f e d c b a, p o n m l k j i + ; horizonal loop: x16 bytes by source + ; mem hi<- ->lo + ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A + ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I + ;=> target: + ;: H G F E D C B A, P O N M L K J I + ;: h g f e d c b a, p o n m l k j i - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - movntdqa xmm0, [esi] ; 1st_src_line - movntdqa xmm1, [esi+ecx] ; 2nd_src_line + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movntdqa xmm0, [esi] ; 1st_src_line + movntdqa xmm1, [esi+ecx] ; 2nd_src_line - ; packing & avg - movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A - pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A - pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a -; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 -; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a - pavgb xmm0, xmm2 + ; packing & avg + movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A + pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A + pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a +; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0 +; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a + pavgb xmm0, xmm2 - movdqa xmm3, xmm1 - pshufb xmm1, xmm7 - pshufb xmm3, xmm6 -; psubb xmm3, xmm1 -; psrlw xmm3, 8 - pavgb xmm1, xmm3 + movdqa xmm3, xmm1 + pshufb xmm1, xmm7 + pshufb xmm3, xmm6 +; psubb xmm3, xmm1 +; psrlw xmm3, 8 + pavgb xmm1, xmm3 - pavgb xmm0, xmm1 - packuswb xmm0, xmm1 + pavgb xmm0, xmm1 + packuswb xmm0, xmm1 - ; write pDst - movq [edi], xmm0 + ; write pDst + movq [edi], xmm0 - ; next SMB - lea esi, [esi+16] - lea edi, [edi+8] + ; next SMB + lea esi, [esi+16] + lea edi, [edi+8] - dec eax - jg near .xloops + dec eax + jg near .xloops - ; next line - lea esi, [esi+2*ecx] ; next end of lines - lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] - lea edi, [edi+edx] - lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] + ; next line + lea esi, [esi+2*ecx] ; next end of lines + lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth] + lea edi, [edi+edx] + lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth] - dec ebp - jg near .yloops + dec ebp + jg near .yloops - pop ebp - pop edi - pop esi - pop edx - pop ebx - ret + pop ebp + pop edi + pop esi + pop edx + pop ebx + ret @@ -811,395 +811,395 @@ WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4 ;************************************************************************************************************** ;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const 
int iDstStride, const int iDstWidth, const int iDstHeight, -; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight, +; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight, ; unsigned int uiScaleX, unsigned int uiScaleY ); ;{ ;************************************************************************************************************** -WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2 - push ebp - push esi - push edi - push ebx -%define pushsize 16 -%define localsize 28 -%define pDstData esp + pushsize + localsize + 4 -%define dwDstStride esp + pushsize + localsize + 8 -%define dwDstWidth esp + pushsize + localsize + 12 -%define dwDstHeight esp + pushsize + localsize + 16 -%define pSrcData esp + pushsize + localsize + 20 -%define dwSrcStride esp + pushsize + localsize + 24 -%define dwSrcWidth esp + pushsize + localsize + 28 -%define dwSrcHeight esp + pushsize + localsize + 32 -%define scale esp + 0 -%define uiScaleX esp + pushsize + localsize + 36 -%define uiScaleY esp + pushsize + localsize + 40 -%define tmpHeight esp + 12 -%define yInverse esp + 16 -%define xInverse esp + 20 -%define dstStep esp + 24 - sub esp, localsize +WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2 + push ebp + push esi + push edi + push ebx +%define pushsize 16 +%define localsize 28 +%define pDstData esp + pushsize + localsize + 4 +%define dwDstStride esp + pushsize + localsize + 8 +%define dwDstWidth esp + pushsize + localsize + 12 +%define dwDstHeight esp + pushsize + localsize + 16 +%define pSrcData esp + pushsize + localsize + 20 +%define dwSrcStride esp + pushsize + localsize + 24 +%define dwSrcWidth esp + pushsize + localsize + 28 +%define dwSrcHeight esp + pushsize + localsize + 32 +%define scale esp + 0 +%define uiScaleX esp + pushsize + localsize + 36 +%define uiScaleY esp + pushsize + localsize + 40 +%define tmpHeight esp + 12 +%define yInverse esp + 16 +%define xInverse esp + 20 +%define dstStep esp + 24 + sub esp, localsize - pxor xmm0, xmm0 - mov edx, 32767 - mov eax, [uiScaleX] - and eax, 32767 - mov ebx, eax - neg ebx - and ebx, 32767 - movd xmm1, eax ; uinc(uiScaleX mod 32767) - movd xmm2, ebx ; -uinc - psllq xmm1, 32 - por xmm1, xmm2 ; 0 0 uinc -uinc (dword) - pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc + pxor xmm0, xmm0 + mov edx, 32767 + mov eax, [uiScaleX] + and eax, 32767 + mov ebx, eax + neg ebx + and ebx, 32767 + movd xmm1, eax ; uinc(uiScaleX mod 32767) + movd xmm2, ebx ; -uinc + psllq xmm1, 32 + por xmm1, xmm2 ; 0 0 uinc -uinc (dword) + pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc - mov eax, [uiScaleY] - and eax, 32767 - mov ebx, eax - neg ebx - and ebx, 32767 - movd xmm6, eax ; vinc(uiScaleY mod 32767) - movd xmm2, ebx ; -vinc - psllq xmm6, 32 - por xmm6, xmm2 ; 0 0 vinc -vinc (dword) - pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc + mov eax, [uiScaleY] + and eax, 32767 + mov ebx, eax + neg ebx + and ebx, 32767 + movd xmm6, eax ; vinc(uiScaleY mod 32767) + movd xmm2, ebx ; -vinc + psllq xmm6, 32 + por xmm6, xmm2 ; 0 0 vinc -vinc (dword) + pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc - mov edx, 40003fffh - movd xmm5, edx - punpcklwd xmm5, xmm0 ; 16384 16383 - pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383 + mov edx, 40003fffh + movd xmm5, edx + punpcklwd xmm5, xmm0 ; 16384 16383 + pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383 DOWNSAMPLE: - mov eax, [dwDstHeight] - mov edi, [pDstData] - mov edx, [dwDstStride] - mov ecx, [dwDstWidth] - sub 
edx, ecx - mov [dstStep], edx ; stride - width - dec eax - mov [tmpHeight], eax - mov eax, 16384 - mov [yInverse], eax + mov eax, [dwDstHeight] + mov edi, [pDstData] + mov edx, [dwDstStride] + mov ecx, [dwDstWidth] + sub edx, ecx + mov [dstStep], edx ; stride - width + dec eax + mov [tmpHeight], eax + mov eax, 16384 + mov [yInverse], eax - pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383 + pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383 HEIGHT: - mov eax, [yInverse] - mov esi, [pSrcData] - shr eax, 15 - mul dword [dwSrcStride] - add esi, eax ; get current row address - mov ebp, esi - add ebp, [dwSrcStride] + mov eax, [yInverse] + mov esi, [pSrcData] + shr eax, 15 + mul dword [dwSrcStride] + add esi, eax ; get current row address + mov ebp, esi + add ebp, [dwSrcStride] - mov eax, 16384 - mov [xInverse], eax - mov ecx, [dwDstWidth] - dec ecx + mov eax, 16384 + mov [xInverse], eax + mov ecx, [dwDstWidth] + dec ecx - movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383 + movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383 WIDTH: - mov eax, [xInverse] - shr eax, 15 + mov eax, [xInverse] + shr eax, 15 - movd xmm1, [esi+eax] ; xxxxxxba - movd xmm2, [ebp+eax] ; xxxxxxdc - pxor xmm0, xmm0 - punpcklwd xmm1, xmm2 ; xxxxdcba - punpcklbw xmm1, xmm0 ; 0d0c0b0a - punpcklwd xmm1, xmm0 ; 000d000c000b000a + movd xmm1, [esi+eax] ; xxxxxxba + movd xmm2, [ebp+eax] ; xxxxxxdc + pxor xmm0, xmm0 + punpcklwd xmm1, xmm2 ; xxxxdcba + punpcklbw xmm1, xmm0 ; 0d0c0b0a + punpcklwd xmm1, xmm0 ; 000d000c000b000a - movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv - pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 - movdqa xmm0, xmm2 - pmuludq xmm2, xmm1 - psrlq xmm0, 32 - psrlq xmm1, 32 - pmuludq xmm0, xmm1 - paddq xmm2, xmm0 - pshufd xmm1, xmm2, 00001110b - paddq xmm2, xmm1 - psrlq xmm2, 29 + movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv + pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 + movdqa xmm0, xmm2 + pmuludq xmm2, xmm1 + psrlq xmm0, 32 + psrlq xmm1, 32 + pmuludq xmm0, xmm1 + paddq xmm2, xmm0 + pshufd xmm1, xmm2, 00001110b + paddq xmm2, xmm1 + psrlq xmm2, 29 - movd eax, xmm2 - inc eax - shr eax, 1 - mov [edi], al - inc edi + movd eax, xmm2 + inc eax + shr eax, 1 + mov [edi], al + inc edi - mov eax, [uiScaleX] - add [xInverse], eax + mov eax, [uiScaleX] + add [xInverse], eax - paddw xmm3, xmm7 ; inc u - psllw xmm3, 1 - psrlw xmm3, 1 + paddw xmm3, xmm7 ; inc u + psllw xmm3, 1 + psrlw xmm3, 1 - loop WIDTH + loop WIDTH WIDTH_END: - mov eax, [xInverse] - shr eax, 15 - mov cl, [esi+eax] - mov [edi], cl - inc edi + mov eax, [xInverse] + shr eax, 15 + mov cl, [esi+eax] + mov [edi], cl + inc edi - mov eax, [uiScaleY] - add [yInverse], eax - add edi, [dstStep] + mov eax, [uiScaleY] + add [yInverse], eax + add edi, [dstStep] - paddw xmm4, xmm6 ; inc v - psllw xmm4, 1 - psrlw xmm4, 1 + paddw xmm4, xmm6 ; inc v + psllw xmm4, 1 + psrlw xmm4, 1 - dec dword [tmpHeight] - jg HEIGHT + dec dword [tmpHeight] + jg HEIGHT LAST_ROW: - mov eax, [yInverse] - mov esi, [pSrcData] - shr eax, 15 - mul dword [dwSrcStride] - add esi, eax ; get current row address + mov eax, [yInverse] + mov esi, [pSrcData] + shr eax, 15 + mul dword [dwSrcStride] + add esi, eax ; get current row address - mov eax, 16384 - mov [xInverse], eax - mov ecx, [dwDstWidth] + mov eax, 16384 + mov [xInverse], eax + mov ecx, [dwDstWidth] LAST_ROW_WIDTH: - mov eax, [xInverse] - shr eax, 15 + mov eax, [xInverse] + shr eax, 15 - mov al, [esi+eax] - mov [edi], al - inc edi + mov al, [esi+eax] + mov [edi], al + inc edi - mov eax, [uiScaleX] - 
add [xInverse], eax + mov eax, [uiScaleX] + add [xInverse], eax - loop LAST_ROW_WIDTH + loop LAST_ROW_WIDTH LAST_ROW_END: - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp -%undef pushsize -%undef localsize -%undef pSrcData -%undef dwSrcWidth -%undef dwSrcHeight -%undef dwSrcStride -%undef pDstData -%undef dwDstWidth -%undef dwDstHeight -%undef dwDstStride -%undef scale -%undef uiScaleX -%undef uiScaleY -%undef tmpHeight -%undef yInverse -%undef xInverse -%undef dstStep - ret + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp +%undef pushsize +%undef localsize +%undef pSrcData +%undef dwSrcWidth +%undef dwSrcHeight +%undef dwSrcStride +%undef pDstData +%undef dwDstWidth +%undef dwDstHeight +%undef dwDstStride +%undef scale +%undef uiScaleX +%undef uiScaleY +%undef tmpHeight +%undef yInverse +%undef xInverse +%undef dstStep + ret ;************************************************************************************************************** ;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, -; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight, +; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight, ; unsigned int uiScaleX, unsigned int uiScaleY ); ;{ ;************************************************************************************************************** -WELS_EXTERN GeneralBilinearFastDownsampler_sse2 - push ebp - push esi - push edi - push ebx -%define pushsize 16 -%define localsize 28 -%define pDstData esp + pushsize + localsize + 4 -%define dwDstStride esp + pushsize + localsize + 8 -%define dwDstWidth esp + pushsize + localsize + 12 -%define dwDstHeight esp + pushsize + localsize + 16 -%define pSrcData esp + pushsize + localsize + 20 -%define dwSrcStride esp + pushsize + localsize + 24 -%define dwSrcWidth esp + pushsize + localsize + 28 -%define dwSrcHeight esp + pushsize + localsize + 32 -%define scale esp + 0 -%define uiScaleX esp + pushsize + localsize + 36 -%define uiScaleY esp + pushsize + localsize + 40 -%define tmpHeight esp + 12 -%define yInverse esp + 16 -%define xInverse esp + 20 -%define dstStep esp + 24 - sub esp, localsize +WELS_EXTERN GeneralBilinearFastDownsampler_sse2 + push ebp + push esi + push edi + push ebx +%define pushsize 16 +%define localsize 28 +%define pDstData esp + pushsize + localsize + 4 +%define dwDstStride esp + pushsize + localsize + 8 +%define dwDstWidth esp + pushsize + localsize + 12 +%define dwDstHeight esp + pushsize + localsize + 16 +%define pSrcData esp + pushsize + localsize + 20 +%define dwSrcStride esp + pushsize + localsize + 24 +%define dwSrcWidth esp + pushsize + localsize + 28 +%define dwSrcHeight esp + pushsize + localsize + 32 +%define scale esp + 0 +%define uiScaleX esp + pushsize + localsize + 36 +%define uiScaleY esp + pushsize + localsize + 40 +%define tmpHeight esp + 12 +%define yInverse esp + 16 +%define xInverse esp + 20 +%define dstStep esp + 24 + sub esp, localsize - pxor xmm0, xmm0 - mov edx, 65535 - mov eax, [uiScaleX] - and eax, edx - mov ebx, eax - neg ebx - and ebx, 65535 - movd xmm1, eax ; uinc(uiScaleX mod 65536) - movd xmm2, ebx ; -uinc - psllq xmm1, 32 - por xmm1, xmm2 ; 0 uinc 0 -uinc - pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc + pxor xmm0, xmm0 + mov edx, 65535 + mov eax, [uiScaleX] + and eax, edx + mov ebx, eax + neg ebx + and ebx, 65535 + movd xmm1, eax ; uinc(uiScaleX mod 65536) + movd xmm2, ebx ; -uinc + psllq xmm1, 32 + por 
xmm1, xmm2 ; 0 uinc 0 -uinc + pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc - mov eax, [uiScaleY] - and eax, 32767 - mov ebx, eax - neg ebx - and ebx, 32767 - movd xmm6, eax ; vinc(uiScaleY mod 32767) - movd xmm2, ebx ; -vinc - psllq xmm6, 32 - por xmm6, xmm2 ; 0 vinc 0 -vinc - pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc + mov eax, [uiScaleY] + and eax, 32767 + mov ebx, eax + neg ebx + and ebx, 32767 + movd xmm6, eax ; vinc(uiScaleY mod 32767) + movd xmm2, ebx ; -vinc + psllq xmm6, 32 + por xmm6, xmm2 ; 0 vinc 0 -vinc + pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc - mov edx, 80007fffh ; 32768 32767 - movd xmm5, edx - pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767 - mov ebx, 16384 + mov edx, 80007fffh ; 32768 32767 + movd xmm5, edx + pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767 + mov ebx, 16384 FAST_DOWNSAMPLE: - mov eax, [dwDstHeight] - mov edi, [pDstData] - mov edx, [dwDstStride] - mov ecx, [dwDstWidth] - sub edx, ecx - mov [dstStep], edx ; stride - width - dec eax - mov [tmpHeight], eax - mov eax, 16384 - mov [yInverse], eax + mov eax, [dwDstHeight] + mov edi, [pDstData] + mov edx, [dwDstStride] + mov ecx, [dwDstWidth] + sub edx, ecx + mov [dstStep], edx ; stride - width + dec eax + mov [tmpHeight], eax + mov eax, 16384 + mov [yInverse], eax - pshuflw xmm4, xmm5, 01010000b - psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383 + pshuflw xmm4, xmm5, 01010000b + psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383 FAST_HEIGHT: - mov eax, [yInverse] - mov esi, [pSrcData] - shr eax, 15 - mul dword [dwSrcStride] - add esi, eax ; get current row address - mov ebp, esi - add ebp, [dwSrcStride] + mov eax, [yInverse] + mov esi, [pSrcData] + shr eax, 15 + mul dword [dwSrcStride] + add esi, eax ; get current row address + mov ebp, esi + add ebp, [dwSrcStride] - mov eax, 32768 - mov [xInverse], eax - mov ecx, [dwDstWidth] - dec ecx + mov eax, 32768 + mov [xInverse], eax + mov ecx, [dwDstWidth] + dec ecx - movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767 + movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767 FAST_WIDTH: - mov eax, [xInverse] - shr eax, 16 + mov eax, [xInverse] + shr eax, 16 - movd xmm1, [esi+eax] ; xxxxxxba - movd xmm2, [ebp+eax] ; xxxxxxdc - punpcklwd xmm1, xmm2 ; xxxxdcba - punpcklbw xmm1, xmm0 ; 0d0c0b0a + movd xmm1, [esi+eax] ; xxxxxxba + movd xmm2, [ebp+eax] ; xxxxxxdc + punpcklwd xmm1, xmm2 ; xxxxdcba + punpcklbw xmm1, xmm0 ; 0d0c0b0a - movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv - pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 - pmaddwd xmm2, xmm1 - pshufd xmm1, xmm2, 00000001b - paddd xmm2, xmm1 - movd xmm1, ebx - paddd xmm2, xmm1 - psrld xmm2, 15 + movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv + pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 + pmaddwd xmm2, xmm1 + pshufd xmm1, xmm2, 00000001b + paddd xmm2, xmm1 + movd xmm1, ebx + paddd xmm2, xmm1 + psrld xmm2, 15 - packuswb xmm2, xmm0 - movd eax, xmm2 - mov [edi], al - inc edi + packuswb xmm2, xmm0 + movd eax, xmm2 + mov [edi], al + inc edi - mov eax, [uiScaleX] - add [xInverse], eax + mov eax, [uiScaleX] + add [xInverse], eax - paddw xmm3, xmm7 ; inc u + paddw xmm3, xmm7 ; inc u - loop FAST_WIDTH + loop FAST_WIDTH FAST_WIDTH_END: - mov eax, [xInverse] - shr eax, 16 - mov cl, [esi+eax] - mov [edi], cl - inc edi + mov eax, [xInverse] + shr eax, 16 + mov cl, [esi+eax] + mov [edi], cl + inc edi - mov eax, [uiScaleY] - add [yInverse], eax - add edi, [dstStep] + mov eax, [uiScaleY] + add [yInverse], eax + add edi, [dstStep] - paddw xmm4, xmm6 ; inc v - psllw 
xmm4, 1 - psrlw xmm4, 1 + paddw xmm4, xmm6 ; inc v + psllw xmm4, 1 + psrlw xmm4, 1 - dec dword [tmpHeight] - jg FAST_HEIGHT + dec dword [tmpHeight] + jg FAST_HEIGHT FAST_LAST_ROW: - mov eax, [yInverse] - mov esi, [pSrcData] - shr eax, 15 - mul dword [dwSrcStride] - add esi, eax ; get current row address + mov eax, [yInverse] + mov esi, [pSrcData] + shr eax, 15 + mul dword [dwSrcStride] + add esi, eax ; get current row address - mov eax, 32768 - mov [xInverse], eax - mov ecx, [dwDstWidth] + mov eax, 32768 + mov [xInverse], eax + mov ecx, [dwDstWidth] FAST_LAST_ROW_WIDTH: - mov eax, [xInverse] - shr eax, 16 + mov eax, [xInverse] + shr eax, 16 - mov al, [esi+eax] - mov [edi], al - inc edi + mov al, [esi+eax] + mov [edi], al + inc edi - mov eax, [uiScaleX] - add [xInverse], eax + mov eax, [uiScaleX] + add [xInverse], eax - loop FAST_LAST_ROW_WIDTH + loop FAST_LAST_ROW_WIDTH FAST_LAST_ROW_END: - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp -%undef pushsize -%undef localsize -%undef pSrcData -%undef dwSrcWidth -%undef dwSrcHeight -%undef dwSrcStride -%undef pDstData -%undef dwDstWidth -%undef dwDstHeight -%undef dwDstStride -%undef scale -%undef uiScaleX -%undef uiScaleY -%undef tmpHeight -%undef yInverse -%undef xInverse -%undef dstStep - ret + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp +%undef pushsize +%undef localsize +%undef pSrcData +%undef dwSrcWidth +%undef dwSrcHeight +%undef dwSrcStride +%undef pDstData +%undef dwDstWidth +%undef dwDstHeight +%undef dwDstStride +%undef scale +%undef uiScaleX +%undef uiScaleY +%undef tmpHeight +%undef yInverse +%undef xInverse +%undef dstStep + ret %endif diff --git a/codec/processing/src/x86/vaa.asm b/codec/processing/src/x86/vaa.asm index 4dea8f08..6741fb9b 100644 --- a/codec/processing/src/x86/vaa.asm +++ b/codec/processing/src/x86/vaa.asm @@ -48,100 +48,100 @@ ; Macros and other preprocessor constants ;*********************************************************************** %macro SUM_SQR_SSE2 3 ; dst, pSrc, zero - movdqa %1, %2 - punpcklbw %1, %3 - punpckhbw %2, %3 - pmaddwd %1, %1 - pmaddwd %2, %2 - paddd %1, %2 - pshufd %2, %1, 04Eh ; 01001110 B - paddd %1, %2 - pshufd %2, %1, 0B1h ; 10110001 B - paddd %1, %2 + movdqa %1, %2 + punpcklbw %1, %3 + punpckhbw %2, %3 + pmaddwd %1, %1 + pmaddwd %2, %2 + paddd %1, %2 + pshufd %2, %1, 04Eh ; 01001110 B + paddd %1, %2 + pshufd %2, %1, 0B1h ; 10110001 B + paddd %1, %2 %endmacro ; END OF SUM_SQR_SSE2 %macro WELS_SAD_16x2_SSE2 3 ;esi :%1 edi:%2 ebx:%3 - movdqa xmm1, [%1] - movdqa xmm2, [%2] - movdqa xmm3, [%1+%3] - movdqa xmm4, [%2+%3] - psadbw xmm1, xmm2 - psadbw xmm3, xmm4 - paddd xmm6, xmm1 - paddd xmm6, xmm3 - lea %1, [%1+%3*2] - lea %2, [%2+%3*2] + movdqa xmm1, [%1] + movdqa xmm2, [%2] + movdqa xmm3, [%1+%3] + movdqa xmm4, [%2+%3] + psadbw xmm1, xmm2 + psadbw xmm3, xmm4 + paddd xmm6, xmm1 + paddd xmm6, xmm3 + lea %1, [%1+%3*2] + lea %2, [%2+%3*2] %endmacro ; by comparing it outperforms than phaddw(SSSE3) sets %macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp - ; @sum_8x2 begin - pshufd %2, %1, 04Eh ; 01001110 B - paddw %1, %2 - pshuflw %2, %1, 04Eh ; 01001110 B - paddw %1, %2 - pshuflw %2, %1, 0B1h ; 10110001 B - paddw %1, %2 - ; end of @sum_8x2 + ; @sum_8x2 begin + pshufd %2, %1, 04Eh ; 01001110 B + paddw %1, %2 + pshuflw %2, %1, 04Eh ; 01001110 B + paddw %1, %2 + pshuflw %2, %1, 0B1h ; 10110001 B + paddw %1, %2 + ; end of @sum_8x2 %endmacro ; END of SUM_WORD_8x2_SSE2 %macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3 - movdqa xmm1, [%1] - movdqa xmm2, [%2] - movdqa 
xmm3, xmm1 - psadbw xmm3, xmm2 - paddd xmm6, xmm3 + movdqa xmm1, [%1] + movdqa xmm2, [%2] + movdqa xmm3, xmm1 + psadbw xmm3, xmm2 + paddd xmm6, xmm3 - movdqa xmm3, xmm1 - psadbw xmm3, xmm0 - paddd xmm5, xmm3 + movdqa xmm3, xmm1 + psadbw xmm3, xmm0 + paddd xmm5, xmm3 - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm4, xmm1 - paddd xmm4, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm4, xmm1 + paddd xmm4, xmm2 - add %1, %3 - add %2, %3 + add %1, %3 + add %2, %3 %endmacro %macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3 - movdqa xmm1, [%1] - movdqa xmm2, [%2] - movdqa xmm3, xmm1 - psadbw xmm3, xmm2 - paddd xmm7, xmm3 ; sad + movdqa xmm1, [%1] + movdqa xmm2, [%2] + movdqa xmm3, xmm1 + psadbw xmm3, xmm2 + paddd xmm7, xmm3 ; sad - movdqa xmm3, xmm1 - pmaxub xmm3, xmm2 - pminub xmm2, xmm1 - psubb xmm3, xmm2 ; diff + movdqa xmm3, xmm1 + pmaxub xmm3, xmm2 + pminub xmm2, xmm1 + psubb xmm3, xmm2 ; diff - movdqa xmm2, xmm1 - psadbw xmm2, xmm0 - paddd xmm6, xmm2 ; sum + movdqa xmm2, xmm1 + psadbw xmm2, xmm0 + paddd xmm6, xmm2 ; sum - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm5, xmm1 - paddd xmm5, xmm2 ; sqsum + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm5, xmm1 + paddd xmm5, xmm2 ; sqsum - movdqa xmm1, xmm3 - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm3, xmm3 - paddd xmm4, xmm1 - paddd xmm4, xmm3 ; sqdiff + movdqa xmm1, xmm3 + punpcklbw xmm1, xmm0 + punpckhbw xmm3, xmm0 + pmaddwd xmm1, xmm1 + pmaddwd xmm3, xmm3 + paddd xmm4, xmm1 + paddd xmm4, xmm3 ; sqdiff - add %1, %3 - add %2, %3 + add %1, %3 + add %2, %3 %endmacro %macro WELS_SAD_SD_MAD_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7 @@ -149,40 +149,40 @@ %define sum_cur_reg %2 %define sum_ref_reg %3 %define mad_reg %4 - movdqa xmm1, [%5] - movdqa xmm2, [%6] - movdqa xmm3, xmm1 - psadbw xmm3, xmm0 - paddd sum_cur_reg, xmm3 ; sum_cur - movdqa xmm3, xmm2 - psadbw xmm3, xmm0 - paddd sum_ref_reg, xmm3 ; sum_ref + movdqa xmm1, [%5] + movdqa xmm2, [%6] + movdqa xmm3, xmm1 + psadbw xmm3, xmm0 + paddd sum_cur_reg, xmm3 ; sum_cur + movdqa xmm3, xmm2 + psadbw xmm3, xmm0 + paddd sum_ref_reg, xmm3 ; sum_ref - movdqa xmm3, xmm1 - pmaxub xmm3, xmm2 - pminub xmm2, xmm1 - psubb xmm3, xmm2 ; abs diff - pmaxub mad_reg, xmm3 ; max abs diff + movdqa xmm3, xmm1 + pmaxub xmm3, xmm2 + pminub xmm2, xmm1 + psubb xmm3, xmm2 ; abs diff + pmaxub mad_reg, xmm3 ; max abs diff - psadbw xmm3, xmm0 - paddd sad_reg, xmm3 ; sad + psadbw xmm3, xmm0 + paddd sad_reg, xmm3 ; sad - add %5, %7 - add %6, %7 + add %5, %7 + add %6, %7 %endmacro %macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used %define max_reg %1 - movdqa xmm1, max_reg - psrldq xmm1, 4 - pmaxub max_reg, xmm1 - movdqa xmm1, max_reg - psrldq xmm1, 2 - pmaxub max_reg, xmm1 - movdqa xmm1, max_reg - psrldq xmm1, 1 - pmaxub max_reg, xmm1 + movdqa xmm1, max_reg + psrldq xmm1, 4 + pmaxub max_reg, xmm1 + movdqa xmm1, max_reg + psrldq xmm1, 2 + pmaxub max_reg, xmm1 + movdqa xmm1, max_reg + psrldq xmm1, 1 + pmaxub max_reg, xmm1 %endmacro %macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7 @@ -190,50 +190,50 @@ %define sum_reg %2 %define mad_reg %3 %define sqdiff_reg %4 - movdqa xmm1, [%5] - movdqa xmm2, xmm1 - movdqa xmm3, xmm1 - punpcklbw xmm2, xmm0 - punpckhbw xmm3, xmm0 - pmaddwd xmm2, 
xmm2 - pmaddwd xmm3, xmm3 - paddd xmm2, xmm3 - movdqa xmm3, xmm2 - psllq xmm2, 32 - psrlq xmm3, 32 - psllq xmm3, 32 - paddd xmm2, xmm3 - paddd sad_reg, xmm2 ; sqsum + movdqa xmm1, [%5] + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 + punpcklbw xmm2, xmm0 + punpckhbw xmm3, xmm0 + pmaddwd xmm2, xmm2 + pmaddwd xmm3, xmm3 + paddd xmm2, xmm3 + movdqa xmm3, xmm2 + psllq xmm2, 32 + psrlq xmm3, 32 + psllq xmm3, 32 + paddd xmm2, xmm3 + paddd sad_reg, xmm2 ; sqsum - movdqa xmm2, [%6] - movdqa xmm3, xmm1 - psadbw xmm3, xmm0 - paddd sum_reg, xmm3 ; sum_cur - movdqa xmm3, xmm2 - psadbw xmm3, xmm0 - pslldq xmm3, 4 - paddd sum_reg, xmm3 ; sum_ref + movdqa xmm2, [%6] + movdqa xmm3, xmm1 + psadbw xmm3, xmm0 + paddd sum_reg, xmm3 ; sum_cur + movdqa xmm3, xmm2 + psadbw xmm3, xmm0 + pslldq xmm3, 4 + paddd sum_reg, xmm3 ; sum_ref - movdqa xmm3, xmm1 - pmaxub xmm3, xmm2 - pminub xmm2, xmm1 - psubb xmm3, xmm2 ; abs diff - pmaxub mad_reg, xmm3 ; max abs diff + movdqa xmm3, xmm1 + pmaxub xmm3, xmm2 + pminub xmm2, xmm1 + psubb xmm3, xmm2 ; abs diff + pmaxub mad_reg, xmm3 ; max abs diff - movdqa xmm1, xmm3 - psadbw xmm3, xmm0 - paddd sad_reg, xmm3 ; sad + movdqa xmm1, xmm3 + psadbw xmm3, xmm0 + paddd sad_reg, xmm3 ; sad - movdqa xmm3, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm3, xmm3 - paddd sqdiff_reg, xmm1 - paddd sqdiff_reg, xmm3 ; sqdiff + movdqa xmm3, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm3, xmm0 + pmaddwd xmm1, xmm1 + pmaddwd xmm3, xmm3 + paddd sqdiff_reg, xmm1 + paddd sqdiff_reg, xmm3 ; sqdiff - add %5, %7 - add %6, %7 + add %5, %7 + add %6, %7 %endmacro @@ -249,99 +249,99 @@ SECTION .text ; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture ); ;*********************************************************************** WELS_EXTERN SampleVariance16x16_sse2 - push esi - push edi - push ebx + push esi + push edi + push ebx - sub esp, 16 - %define SUM [esp] - %define SUM_CUR [esp+4] - %define SQR [esp+8] - %define SQR_CUR [esp+12] - %define PUSH_SIZE 28 ; 12 + 16 + sub esp, 16 + %define SUM [esp] + %define SUM_CUR [esp+4] + %define SQR [esp+8] + %define SQR_CUR [esp+12] + %define PUSH_SIZE 28 ; 12 + 16 - mov edi, [esp+PUSH_SIZE+4] ; y_ref - mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride - mov esi, [esp+PUSH_SIZE+12] ; y_src - mov eax, [esp+PUSH_SIZE+16] ; y_src_stride - mov ecx, 010h ; height = 16 + mov edi, [esp+PUSH_SIZE+4] ; y_ref + mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride + mov esi, [esp+PUSH_SIZE+12] ; y_src + mov eax, [esp+PUSH_SIZE+16] ; y_src_stride + mov ecx, 010h ; height = 16 - pxor xmm7, xmm7 - movdqu SUM, xmm7 + pxor xmm7, xmm7 + movdqu SUM, xmm7 .hloops: - movdqa xmm0, [edi] ; y_ref - movdqa xmm1, [esi] ; y_src - movdqa xmm2, xmm0 ; store first for future process - movdqa xmm3, xmm1 - ; sum += diff; - movdqa xmm4, xmm0 - psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79] - ; to be continued for sum - pshufd xmm5, xmm4, 0C6h ; 11000110 B - paddw xmm4, xmm5 - movd ebx, xmm4 - add SUM, ebx + movdqa xmm0, [edi] ; y_ref + movdqa xmm1, [esi] ; y_src + movdqa xmm2, xmm0 ; store first for future process + movdqa xmm3, xmm1 + ; sum += diff; + movdqa xmm4, xmm0 + psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79] + ; to be continued for sum + pshufd xmm5, xmm4, 0C6h ; 11000110 B + paddw xmm4, xmm5 + movd ebx, xmm4 + add SUM, ebx - ; sqr += diff * diff; - pmaxub xmm0, xmm1 - pminub xmm1, xmm2 - psubb xmm0, xmm1 ; diff - SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero - movd ebx, xmm1 - 
add SQR, ebx + ; sqr += diff * diff; + pmaxub xmm0, xmm1 + pminub xmm1, xmm2 + psubb xmm0, xmm1 ; diff + SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero + movd ebx, xmm1 + add SQR, ebx - ; sum_cur += y_src[x]; - movdqa xmm0, xmm3 ; cur_orig - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 - punpckhbw xmm1, xmm7 - paddw xmm0, xmm1 ; 8x2 - SUM_WORD_8x2_SSE2 xmm0, xmm1 - movd ebx, xmm0 - and ebx, 0ffffh - add SUM_CUR, ebx + ; sum_cur += y_src[x]; + movdqa xmm0, xmm3 ; cur_orig + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 + punpckhbw xmm1, xmm7 + paddw xmm0, xmm1 ; 8x2 + SUM_WORD_8x2_SSE2 xmm0, xmm1 + movd ebx, xmm0 + and ebx, 0ffffh + add SUM_CUR, ebx - ; sqr_cur += y_src[x] * y_src[x]; - SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero - movd ebx, xmm0 - add SQR_CUR, ebx + ; sqr_cur += y_src[x] * y_src[x]; + SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero + movd ebx, xmm0 + add SQR_CUR, ebx - lea edi, [edi+edx] - lea esi, [esi+eax] - dec ecx - jnz near .hloops + lea edi, [edi+edx] + lea esi, [esi+eax] + dec ecx + jnz near .hloops - mov ebx, 0 - mov bx, word SUM - sar ebx, 8 - imul ebx, ebx - mov ecx, SQR - sar ecx, 8 - sub ecx, ebx - mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture - mov [edi], cx ; to store uiMotionIndex - mov ebx, 0 - mov bx, word SUM_CUR - sar ebx, 8 - imul ebx, ebx - mov ecx, SQR_CUR - sar ecx, 8 - sub ecx, ebx - mov [edi+2], cx ; to store uiTextureIndex + mov ebx, 0 + mov bx, word SUM + sar ebx, 8 + imul ebx, ebx + mov ecx, SQR + sar ecx, 8 + sub ecx, ebx + mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture + mov [edi], cx ; to store uiMotionIndex + mov ebx, 0 + mov bx, word SUM_CUR + sar ebx, 8 + imul ebx, ebx + mov ecx, SQR_CUR + sar ecx, 8 + sub ecx, ebx + mov [edi+2], cx ; to store uiTextureIndex - %undef SUM - %undef SUM_CUR - %undef SQR - %undef SQR_CUR - %undef PUSH_SIZE + %undef SUM + %undef SUM_CUR + %undef SQR + %undef SQR_CUR + %undef PUSH_SIZE - add esp, 16 - pop ebx - pop edi - pop esi + add esp, 16 + pop ebx + pop edi + pop esi - ret + ret @@ -360,67 +360,67 @@ WELS_EXTERN VAACalcSad_sse2 %define psadframe esp + pushsize + 24 %define psad8x8 esp + pushsize + 28 %define pushsize 12 - push esi - push edi - push ebx - mov esi, [cur_data] - mov edi, [ref_data] - mov ebx, [iPicStride] - mov edx, [psad8x8] - mov eax, ebx + push esi + push edi + push ebx + mov esi, [cur_data] + mov edi, [ref_data] + mov ebx, [iPicStride] + mov edx, [psad8x8] + mov eax, ebx - shr dword [iPicWidth], 4 ; iPicWidth/16 - shr dword [iPicHeight], 4 ; iPicHeight/16 - shl eax, 4 ; iPicStride*16 - pxor xmm0, xmm0 - pxor xmm7, xmm7 ; iFrameSad + shr dword [iPicWidth], 4 ; iPicWidth/16 + shr dword [iPicHeight], 4 ; iPicHeight/16 + shl eax, 4 ; iPicStride*16 + pxor xmm0, xmm0 + pxor xmm7, xmm7 ; iFrameSad height_loop: - mov ecx, dword [iPicWidth] - push esi - push edi + mov ecx, dword [iPicWidth] + push esi + push edi width_loop: - pxor xmm6, xmm6 ; - WELS_SAD_16x2_SSE2 esi,edi,ebx - WELS_SAD_16x2_SSE2 esi,edi,ebx - WELS_SAD_16x2_SSE2 esi,edi,ebx - WELS_SAD_16x2_SSE2 esi,edi,ebx - paddd xmm7, xmm6 - movd [edx], xmm6 - psrldq xmm6, 8 - movd [edx+4], xmm6 + pxor xmm6, xmm6 ; + WELS_SAD_16x2_SSE2 esi,edi,ebx + WELS_SAD_16x2_SSE2 esi,edi,ebx + WELS_SAD_16x2_SSE2 esi,edi,ebx + WELS_SAD_16x2_SSE2 esi,edi,ebx + paddd xmm7, xmm6 + movd [edx], xmm6 + psrldq xmm6, 8 + movd [edx+4], xmm6 - pxor xmm6, xmm6 - WELS_SAD_16x2_SSE2 esi,edi,ebx - WELS_SAD_16x2_SSE2 esi,edi,ebx - WELS_SAD_16x2_SSE2 esi,edi,ebx - WELS_SAD_16x2_SSE2 esi,edi,ebx - paddd xmm7, xmm6 - movd [edx+8], xmm6 - psrldq xmm6, 8 - movd [edx+12], xmm6 + pxor 
xmm6, xmm6 + WELS_SAD_16x2_SSE2 esi,edi,ebx + WELS_SAD_16x2_SSE2 esi,edi,ebx + WELS_SAD_16x2_SSE2 esi,edi,ebx + WELS_SAD_16x2_SSE2 esi,edi,ebx + paddd xmm7, xmm6 + movd [edx+8], xmm6 + psrldq xmm6, 8 + movd [edx+12], xmm6 - add edx, 16 - sub esi, eax - sub edi, eax - add esi, 16 - add edi, 16 + add edx, 16 + sub esi, eax + sub edi, eax + add esi, 16 + add edi, 16 - dec ecx - jnz width_loop + dec ecx + jnz width_loop - pop edi - pop esi - add esi, eax - add edi, eax + pop edi + pop esi + add esi, eax + add edi, eax - dec dword [iPicHeight] - jnz height_loop + dec dword [iPicHeight] + jnz height_loop - mov edx, [psadframe] - movdqa xmm5, xmm7 - psrldq xmm7, 8 - paddd xmm7, xmm5 - movd [edx], xmm7 + mov edx, [psadframe] + movdqa xmm5, xmm7 + psrldq xmm7, 8 + paddd xmm7, xmm5 + movd [edx], xmm7 %undef cur_data %undef ref_data @@ -430,10 +430,10 @@ width_loop: %undef psadframe %undef psad8x8 %undef pushsize - pop ebx - pop edi - pop esi - ret + pop ebx + pop edi + pop esi + ret %else ;64-bit @@ -441,98 +441,98 @@ width_loop: ; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture ); ;*********************************************************************** WELS_EXTERN SampleVariance16x16_sse2 - %define SUM r10;[esp] - %define SUM_CUR r11;[esp+4] - %define SQR r13;[esp+8] - %define SQR_CUR r15;[esp+12] + %define SUM r10;[esp] + %define SUM_CUR r11;[esp+4] + %define SQR r13;[esp+8] + %define SQR_CUR r15;[esp+12] - push r12 - push r13 - push r14 - push r15 - %assign push_num 4 - LOAD_5_PARA - PUSH_XMM 8 - SIGN_EXTENSION r1,r1d - SIGN_EXTENSION r3,r3d + push r12 + push r13 + push r14 + push r15 + %assign push_num 4 + LOAD_5_PARA + PUSH_XMM 8 + SIGN_EXTENSION r1,r1d + SIGN_EXTENSION r3,r3d - mov r12,010h - pxor xmm7, xmm7 - movq SUM, xmm7 - movq SUM_CUR,xmm7 - movq SQR,xmm7 - movq SQR_CUR,xmm7 + mov r12,010h + pxor xmm7, xmm7 + movq SUM, xmm7 + movq SUM_CUR,xmm7 + movq SQR,xmm7 + movq SQR_CUR,xmm7 .hloops: - mov r14,0 - movdqa xmm0, [r0] ; y_ref - movdqa xmm1, [r2] ; y_src - movdqa xmm2, xmm0 ; store first for future process - movdqa xmm3, xmm1 - ; sum += diff; - movdqa xmm4, xmm0 - psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79] - ; to be continued for sum - pshufd xmm5, xmm4, 0C6h ; 11000110 B - paddw xmm4, xmm5 - movd r14d, xmm4 - add SUM, r14 + mov r14,0 + movdqa xmm0, [r0] ; y_ref + movdqa xmm1, [r2] ; y_src + movdqa xmm2, xmm0 ; store first for future process + movdqa xmm3, xmm1 + ; sum += diff; + movdqa xmm4, xmm0 + psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79] + ; to be continued for sum + pshufd xmm5, xmm4, 0C6h ; 11000110 B + paddw xmm4, xmm5 + movd r14d, xmm4 + add SUM, r14 - ; sqr += diff * diff; - pmaxub xmm0, xmm1 - pminub xmm1, xmm2 - psubb xmm0, xmm1 ; diff - SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero - movd r14d, xmm1 - add SQR, r14 + ; sqr += diff * diff; + pmaxub xmm0, xmm1 + pminub xmm1, xmm2 + psubb xmm0, xmm1 ; diff + SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero + movd r14d, xmm1 + add SQR, r14 - ; sum_cur += y_src[x]; - movdqa xmm0, xmm3 ; cur_orig - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 - punpckhbw xmm1, xmm7 - paddw xmm0, xmm1 ; 8x2 - SUM_WORD_8x2_SSE2 xmm0, xmm1 - movd r14d, xmm0 - and r14, 0ffffh - add SUM_CUR, r14 + ; sum_cur += y_src[x]; + movdqa xmm0, xmm3 ; cur_orig + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 + punpckhbw xmm1, xmm7 + paddw xmm0, xmm1 ; 8x2 + SUM_WORD_8x2_SSE2 xmm0, xmm1 + movd r14d, xmm0 + and r14, 0ffffh + add SUM_CUR, r14 - ; sqr_cur += y_src[x] * 
y_src[x]; - SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero - movd r14d, xmm0 - add SQR_CUR, r14 + ; sqr_cur += y_src[x] * y_src[x]; + SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero + movd r14d, xmm0 + add SQR_CUR, r14 - lea r0, [r0+r1] - lea r2, [r2+r3] - dec r12 - jnz near .hloops + lea r0, [r0+r1] + lea r2, [r2+r3] + dec r12 + jnz near .hloops - mov r0, SUM - sar r0, 8 - imul r0, r0 - mov r1, SQR - sar r1, 8 - sub r1, r0 - mov [r4], r1w ; to store uiMotionIndex - mov r0, SUM_CUR - sar r0, 8 - imul r0, r0 - mov r1, SQR_CUR - sar r1, 8 - sub r1, r0 - mov [r4+2], r1w ; to store uiTextureIndex + mov r0, SUM + sar r0, 8 + imul r0, r0 + mov r1, SQR + sar r1, 8 + sub r1, r0 + mov [r4], r1w ; to store uiMotionIndex + mov r0, SUM_CUR + sar r0, 8 + imul r0, r0 + mov r1, SQR_CUR + sar r1, 8 + sub r1, r0 + mov [r4+2], r1w ; to store uiTextureIndex - POP_XMM - LOAD_5_PARA_POP - pop r15 - pop r14 - pop r13 - pop r12 + POP_XMM + LOAD_5_PARA_POP + pop r15 + pop r14 + pop r13 + pop r12 - %assign push_num 0 + %assign push_num 0 - ret + ret ;************************************************************************************************************* @@ -550,69 +550,69 @@ WELS_EXTERN VAACalcSad_sse2 %define psadframe r5 %define psad8x8 r6 - push r12 - push r13 - %assign push_num 2 - LOAD_7_PARA - PUSH_XMM 8 - SIGN_EXTENSION r2,r2d - SIGN_EXTENSION r3,r3d - SIGN_EXTENSION r4,r4d + push r12 + push r13 + %assign push_num 2 + LOAD_7_PARA + PUSH_XMM 8 + SIGN_EXTENSION r2,r2d + SIGN_EXTENSION r3,r3d + SIGN_EXTENSION r4,r4d - mov r12,r4 - shr r2, 4 ; iPicWidth/16 - shr r3, 4 ; iPicHeight/16 + mov r12,r4 + shr r2, 4 ; iPicWidth/16 + shr r3, 4 ; iPicHeight/16 - shl r12, 4 ; iPicStride*16 - pxor xmm0, xmm0 - pxor xmm7, xmm7 ; iFrameSad + shl r12, 4 ; iPicStride*16 + pxor xmm0, xmm0 + pxor xmm7, xmm7 ; iFrameSad height_loop: - mov r13, r2 - push r0 - push r1 + mov r13, r2 + push r0 + push r1 width_loop: - pxor xmm6, xmm6 - WELS_SAD_16x2_SSE2 r0,r1,r4 - WELS_SAD_16x2_SSE2 r0,r1,r4 - WELS_SAD_16x2_SSE2 r0,r1,r4 - WELS_SAD_16x2_SSE2 r0,r1,r4 - paddd xmm7, xmm6 - movd [r6], xmm6 - psrldq xmm6, 8 - movd [r6+4], xmm6 + pxor xmm6, xmm6 + WELS_SAD_16x2_SSE2 r0,r1,r4 + WELS_SAD_16x2_SSE2 r0,r1,r4 + WELS_SAD_16x2_SSE2 r0,r1,r4 + WELS_SAD_16x2_SSE2 r0,r1,r4 + paddd xmm7, xmm6 + movd [r6], xmm6 + psrldq xmm6, 8 + movd [r6+4], xmm6 - pxor xmm6, xmm6 - WELS_SAD_16x2_SSE2 r0,r1,r4 - WELS_SAD_16x2_SSE2 r0,r1,r4 - WELS_SAD_16x2_SSE2 r0,r1,r4 - WELS_SAD_16x2_SSE2 r0,r1,r4 - paddd xmm7, xmm6 - movd [r6+8], xmm6 - psrldq xmm6, 8 - movd [r6+12], xmm6 + pxor xmm6, xmm6 + WELS_SAD_16x2_SSE2 r0,r1,r4 + WELS_SAD_16x2_SSE2 r0,r1,r4 + WELS_SAD_16x2_SSE2 r0,r1,r4 + WELS_SAD_16x2_SSE2 r0,r1,r4 + paddd xmm7, xmm6 + movd [r6+8], xmm6 + psrldq xmm6, 8 + movd [r6+12], xmm6 - add r6, 16 - sub r0, r12 - sub r1, r12 - add r0, 16 - add r1, 16 + add r6, 16 + sub r0, r12 + sub r1, r12 + add r0, 16 + add r1, 16 - dec r13 - jnz width_loop + dec r13 + jnz width_loop - pop r1 - pop r0 - add r0, r12 - add r1, r12 + pop r1 + pop r0 + add r0, r12 + add r1, r12 - dec r3 - jnz height_loop + dec r3 + jnz height_loop - ;mov r13, [psadframe] - movdqa xmm5, xmm7 - psrldq xmm7, 8 - paddd xmm7, xmm5 - movd [psadframe], xmm7 + ;mov r13, [psadframe] + movdqa xmm5, xmm7 + psrldq xmm7, 8 + paddd xmm7, xmm5 + movd [psadframe], xmm7 %undef cur_data %undef ref_data @@ -622,12 +622,12 @@ width_loop: %undef psadframe %undef psad8x8 %undef pushsize - POP_XMM - LOAD_7_PARA_POP - pop r13 - pop r12 - %assign push_num 0 - ret + POP_XMM + LOAD_7_PARA_POP + pop r13 + pop r12 + %assign 
push_num 0 + ret %endif @@ -653,103 +653,103 @@ WELS_EXTERN VAACalcSadVar_sse2 %define tmp_esi esp + 0 %define tmp_edi esp + 4 %define pushsize 16 - push ebp - push esi - push edi - push ebx - sub esp, localsize - mov esi, [cur_data] - mov edi, [ref_data] - mov ebx, [iPicStride] - mov edx, [psad8x8] - mov eax, ebx + push ebp + push esi + push edi + push ebx + sub esp, localsize + mov esi, [cur_data] + mov edi, [ref_data] + mov ebx, [iPicStride] + mov edx, [psad8x8] + mov eax, ebx - shr dword [iPicWidth], 4 ; iPicWidth/16 - shr dword [iPicHeight], 4 ; iPicHeight/16 - shl eax, 4 ; iPicStride*16 - pxor xmm0, xmm0 - pxor xmm7, xmm7 ; iFrameSad + shr dword [iPicWidth], 4 ; iPicWidth/16 + shr dword [iPicHeight], 4 ; iPicHeight/16 + shl eax, 4 ; iPicStride*16 + pxor xmm0, xmm0 + pxor xmm7, xmm7 ; iFrameSad var_height_loop: - mov ecx, dword [iPicWidth] - mov [tmp_esi], esi - mov [tmp_edi], edi + mov ecx, dword [iPicWidth] + mov [tmp_esi], esi + mov [tmp_edi], edi var_width_loop: - pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8 - pxor xmm5, xmm5 ; pSum16x16 - pxor xmm4, xmm4 ; sqsum_16x16 - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - paddd xmm7, xmm6 - movd [edx], xmm6 - psrldq xmm6, 8 - movd [edx+4], xmm6 + pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8 + pxor xmm5, xmm5 ; pSum16x16 + pxor xmm4, xmm4 ; sqsum_16x16 + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + paddd xmm7, xmm6 + movd [edx], xmm6 + psrldq xmm6, 8 + movd [edx+4], xmm6 - pxor xmm6, xmm6 - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx - paddd xmm7, xmm6 - movd [edx+8], xmm6 - psrldq xmm6, 8 - movd [edx+12], xmm6 + pxor xmm6, xmm6 + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx + paddd xmm7, xmm6 + movd [edx+8], xmm6 + psrldq xmm6, 8 + movd [edx+12], xmm6 - mov ebp, [psum16x16] - movdqa xmm1, xmm5 - psrldq xmm1, 8 - paddd xmm5, xmm1 - movd [ebp], xmm5 - add dword [psum16x16], 4 + mov ebp, [psum16x16] + movdqa xmm1, xmm5 + psrldq xmm1, 8 + paddd xmm5, xmm1 + movd [ebp], xmm5 + add dword [psum16x16], 4 - movdqa xmm5, xmm4 - psrldq xmm5, 8 - paddd xmm4, xmm5 - movdqa xmm3, xmm4 - psrldq xmm3, 4 - paddd xmm4, xmm3 + movdqa xmm5, xmm4 + psrldq xmm5, 8 + paddd xmm4, xmm5 + movdqa xmm3, xmm4 + psrldq xmm3, 4 + paddd xmm4, xmm3 - mov ebp, [psqsum16x16] - movd [ebp], xmm4 - add dword [psqsum16x16], 4 + mov ebp, [psqsum16x16] + movd [ebp], xmm4 + add dword [psqsum16x16], 4 - add edx, 16 - 
sub esi, eax - sub edi, eax - add esi, 16 - add edi, 16 + add edx, 16 + sub esi, eax + sub edi, eax + add esi, 16 + add edi, 16 - dec ecx - jnz var_width_loop + dec ecx + jnz var_width_loop - mov esi, [tmp_esi] - mov edi, [tmp_edi] - add esi, eax - add edi, eax + mov esi, [tmp_esi] + mov edi, [tmp_edi] + add esi, eax + add edi, eax - dec dword [iPicHeight] - jnz var_height_loop + dec dword [iPicHeight] + jnz var_height_loop - mov edx, [psadframe] - movdqa xmm5, xmm7 - psrldq xmm7, 8 - paddd xmm7, xmm5 - movd [edx], xmm7 + mov edx, [psadframe] + movdqa xmm5, xmm7 + psrldq xmm7, 8 + paddd xmm7, xmm5 + movd [edx], xmm7 - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp %undef cur_data %undef ref_data %undef iPicWidth @@ -763,7 +763,7 @@ var_width_loop: %undef tmp_edi %undef pushsize %undef localsize - ret + ret %else ;64-bit @@ -784,112 +784,112 @@ WELS_EXTERN VAACalcSadVar_sse2 %define psum16x16 arg8 %define psqsum16x16 arg9 - push r12 - push r13 - push r14 - push r15 - %assign push_num 4 - PUSH_XMM 8 + push r12 + push r13 + push r14 + push r15 + %assign push_num 4 + PUSH_XMM 8 %ifdef WIN64 - mov r4, arg5 ;iPicStride - mov r5, arg6 ;psad8x8 + mov r4, arg5 ;iPicStride + mov r5, arg6 ;psad8x8 %endif - mov r14,arg7 - SIGN_EXTENSION r2,r2d - SIGN_EXTENSION r3,r3d - SIGN_EXTENSION r4,r4d + mov r14,arg7 + SIGN_EXTENSION r2,r2d + SIGN_EXTENSION r3,r3d + SIGN_EXTENSION r4,r4d - mov r13,r4 - shr r2,4 - shr r3,4 + mov r13,r4 + shr r2,4 + shr r3,4 - shl r13,4 ; iPicStride*16 - pxor xmm0, xmm0 - pxor xmm7, xmm7 ; iFrameSad + shl r13,4 ; iPicStride*16 + pxor xmm0, xmm0 + pxor xmm7, xmm7 ; iFrameSad var_height_loop: - push r2 - %assign push_num push_num+1 - mov r11, r0 - mov r12, r1 + push r2 + %assign push_num push_num+1 + mov r11, r0 + mov r12, r1 var_width_loop: - pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8 - pxor xmm5, xmm5 ; pSum16x16 - pxor xmm4, xmm4 ; sqsum_16x16 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - paddd xmm7, xmm6 - movd [r14], xmm6 - psrldq xmm6, 8 - movd [r14+4], xmm6 + pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8 + pxor xmm5, xmm5 ; pSum16x16 + pxor xmm4, xmm4 ; sqsum_16x16 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + paddd xmm7, xmm6 + movd [r14], xmm6 + psrldq xmm6, 8 + movd [r14+4], xmm6 - pxor xmm6, xmm6 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 - paddd xmm7, xmm6 - movd [r14+8], xmm6 - psrldq xmm6, 8 - movd [r14+12], xmm6 + pxor xmm6, xmm6 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + 
WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 + paddd xmm7, xmm6 + movd [r14+8], xmm6 + psrldq xmm6, 8 + movd [r14+12], xmm6 - mov r15, psum16x16 - movdqa xmm1, xmm5 - psrldq xmm1, 8 - paddd xmm5, xmm1 - movd [r15], xmm5 - add dword psum16x16, 4 + mov r15, psum16x16 + movdqa xmm1, xmm5 + psrldq xmm1, 8 + paddd xmm5, xmm1 + movd [r15], xmm5 + add dword psum16x16, 4 - movdqa xmm5, xmm4 - psrldq xmm5, 8 - paddd xmm4, xmm5 - movdqa xmm3, xmm4 - psrldq xmm3, 4 - paddd xmm4, xmm3 + movdqa xmm5, xmm4 + psrldq xmm5, 8 + paddd xmm4, xmm5 + movdqa xmm3, xmm4 + psrldq xmm3, 4 + paddd xmm4, xmm3 - mov r15, psqsum16x16 - movd [r15], xmm4 - add dword psqsum16x16, 4 + mov r15, psqsum16x16 + movd [r15], xmm4 + add dword psqsum16x16, 4 - add r14,16 - sub r0, r13 - sub r1, r13 - add r0, 16 - add r1, 16 + add r14,16 + sub r0, r13 + sub r1, r13 + add r0, 16 + add r1, 16 - dec r2 - jnz var_width_loop + dec r2 + jnz var_width_loop - pop r2 - %assign push_num push_num-1 - mov r0, r11 - mov r1, r12 - add r0, r13 - add r1, r13 - dec r3 - jnz var_height_loop + pop r2 + %assign push_num push_num-1 + mov r0, r11 + mov r1, r12 + add r0, r13 + add r1, r13 + dec r3 + jnz var_height_loop - mov r15, psadframe - movdqa xmm5, xmm7 - psrldq xmm7, 8 - paddd xmm7, xmm5 - movd [r15], xmm7 + mov r15, psadframe + movdqa xmm5, xmm7 + psrldq xmm7, 8 + paddd xmm7, xmm5 + movd [r15], xmm7 - POP_XMM - pop r15 - pop r14 - pop r13 - pop r12 + POP_XMM + pop r15 + pop r14 + pop r13 + pop r12 %assign push_num 0 %undef cur_data %undef ref_data @@ -904,7 +904,7 @@ var_width_loop: %undef tmp_edi %undef pushsize %undef localsize - ret + ret %endif @@ -932,118 +932,118 @@ WELS_EXTERN VAACalcSadSsd_sse2 %define tmp_edi esp + 4 %define tmp_sadframe esp + 8 %define pushsize 16 - push ebp - push esi - push edi - push ebx - sub esp, localsize + push ebp + push esi + push edi + push ebx + sub esp, localsize - mov ecx, [iPicWidth] - mov ecx, [iPicHeight] - mov esi, [cur_data] - mov edi, [ref_data] - mov ebx, [iPicStride] - mov edx, [psad8x8] - mov eax, ebx + mov ecx, [iPicWidth] + mov ecx, [iPicHeight] + mov esi, [cur_data] + mov edi, [ref_data] + mov ebx, [iPicStride] + mov edx, [psad8x8] + mov eax, ebx - shr dword [iPicWidth], 4 ; iPicWidth/16 - shr dword [iPicHeight], 4 ; iPicHeight/16 - shl eax, 4 ; iPicStride*16 - mov ecx, [iPicWidth] - mov ecx, [iPicHeight] - pxor xmm0, xmm0 - movd [tmp_sadframe], xmm0 + shr dword [iPicWidth], 4 ; iPicWidth/16 + shr dword [iPicHeight], 4 ; iPicHeight/16 + shl eax, 4 ; iPicStride*16 + mov ecx, [iPicWidth] + mov ecx, [iPicHeight] + pxor xmm0, xmm0 + movd [tmp_sadframe], xmm0 sqdiff_height_loop: - mov ecx, dword [iPicWidth] - mov [tmp_esi], esi - mov [tmp_edi], edi + mov ecx, dword [iPicWidth] + mov [tmp_esi], esi + mov [tmp_edi], edi sqdiff_width_loop: - pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8 - pxor xmm6, xmm6 ; pSum16x16 - pxor xmm5, xmm5 ; sqsum_16x16 four dword - pxor xmm4, xmm4 ; sqdiff_16x16 four Dword - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - movdqa xmm1, xmm7 - movd [edx], xmm7 - psrldq xmm7, 8 - paddd xmm1, xmm7 - movd [edx+4], xmm7 - movd ebp, xmm1 - add [tmp_sadframe], ebp + pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8 + 
pxor xmm6, xmm6 ; pSum16x16 + pxor xmm5, xmm5 ; sqsum_16x16 four dword + pxor xmm4, xmm4 ; sqdiff_16x16 four Dword + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + movdqa xmm1, xmm7 + movd [edx], xmm7 + psrldq xmm7, 8 + paddd xmm1, xmm7 + movd [edx+4], xmm7 + movd ebp, xmm1 + add [tmp_sadframe], ebp - pxor xmm7, xmm7 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx - movdqa xmm1, xmm7 - movd [edx+8], xmm7 - psrldq xmm7, 8 - paddd xmm1, xmm7 - movd [edx+12], xmm7 - movd ebp, xmm1 - add [tmp_sadframe], ebp + pxor xmm7, xmm7 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx + movdqa xmm1, xmm7 + movd [edx+8], xmm7 + psrldq xmm7, 8 + paddd xmm1, xmm7 + movd [edx+12], xmm7 + movd ebp, xmm1 + add [tmp_sadframe], ebp - mov ebp, [psum16x16] - movdqa xmm1, xmm6 - psrldq xmm1, 8 - paddd xmm6, xmm1 - movd [ebp], xmm6 - add dword [psum16x16], 4 + mov ebp, [psum16x16] + movdqa xmm1, xmm6 + psrldq xmm1, 8 + paddd xmm6, xmm1 + movd [ebp], xmm6 + add dword [psum16x16], 4 - mov ebp, [psqsum16x16] - pshufd xmm6, xmm5, 14 ;00001110 - paddd xmm6, xmm5 - pshufd xmm5, xmm6, 1 ;00000001 - paddd xmm5, xmm6 - movd [ebp], xmm5 - add dword [psqsum16x16], 4 + mov ebp, [psqsum16x16] + pshufd xmm6, xmm5, 14 ;00001110 + paddd xmm6, xmm5 + pshufd xmm5, xmm6, 1 ;00000001 + paddd xmm5, xmm6 + movd [ebp], xmm5 + add dword [psqsum16x16], 4 - mov ebp, [psqdiff16x16] - pshufd xmm5, xmm4, 14 ; 00001110 - paddd xmm5, xmm4 - pshufd xmm4, xmm5, 1 ; 00000001 - paddd xmm4, xmm5 - movd [ebp], xmm4 - add dword [psqdiff16x16], 4 + mov ebp, [psqdiff16x16] + pshufd xmm5, xmm4, 14 ; 00001110 + paddd xmm5, xmm4 + pshufd xmm4, xmm5, 1 ; 00000001 + paddd xmm4, xmm5 + movd [ebp], xmm4 + add dword [psqdiff16x16], 4 - add edx, 16 - sub esi, eax - sub edi, eax - add esi, 16 - add edi, 16 + add edx, 16 + sub esi, eax + sub edi, eax + add esi, 16 + add edi, 16 - dec ecx - jnz sqdiff_width_loop + dec ecx + jnz sqdiff_width_loop - mov esi, [tmp_esi] - mov edi, [tmp_edi] - add esi, eax - add edi, eax + mov esi, [tmp_esi] + mov edi, [tmp_edi] + add esi, eax + add edi, eax - dec dword [iPicHeight] - jnz sqdiff_height_loop + dec dword [iPicHeight] + jnz sqdiff_height_loop - mov ebx, [tmp_sadframe] - mov eax, [psadframe] - mov [eax], ebx + mov ebx, [tmp_sadframe] + mov eax, [psadframe] + mov [eax], ebx - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp %undef cur_data %undef ref_data %undef iPicWidth @@ -1059,7 +1059,7 @@ sqdiff_width_loop: %undef 
tmp_sadframe %undef pushsize %undef localsize - ret + ret %else @@ -1083,128 +1083,128 @@ WELS_EXTERN VAACalcSadSsd_sse2 %define psqsum16x16 arg9; %define psqdiff16x16 arg10 - push r12 - push r13 - push r14 - push r15 - %assign push_num 4 - PUSH_XMM 10 + push r12 + push r13 + push r14 + push r15 + %assign push_num 4 + PUSH_XMM 10 %ifdef WIN64 - mov r4,arg5 + mov r4,arg5 %endif - mov r14,arg7 - SIGN_EXTENSION r2,r2d - SIGN_EXTENSION r3,r3d - SIGN_EXTENSION r4,r4d + mov r14,arg7 + SIGN_EXTENSION r2,r2d + SIGN_EXTENSION r3,r3d + SIGN_EXTENSION r4,r4d - mov r13,r4 - shr r2,4 ; iPicWidth/16 - shr r3,4 ; iPicHeight/16 - shl r13,4 ; iPicStride*16 - pxor xmm0, xmm0 - pxor xmm8, xmm8 ;framesad - pxor xmm9, xmm9 + mov r13,r4 + shr r2,4 ; iPicWidth/16 + shr r3,4 ; iPicHeight/16 + shl r13,4 ; iPicStride*16 + pxor xmm0, xmm0 + pxor xmm8, xmm8 ;framesad + pxor xmm9, xmm9 sqdiff_height_loop: - ;mov ecx, dword [iPicWidth] - ;mov r14,r2 - push r2 - %assign push_num push_num +1 - mov r10, r0 - mov r11, r1 + ;mov ecx, dword [iPicWidth] + ;mov r14,r2 + push r2 + %assign push_num push_num +1 + mov r10, r0 + mov r11, r1 sqdiff_width_loop: - pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8 - pxor xmm6, xmm6 ; pSum16x16 - pxor xmm5, xmm5 ; sqsum_16x16 four dword - pxor xmm4, xmm4 ; sqdiff_16x16 four Dword - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - movdqa xmm1, xmm7 - movd [r14], xmm7 - psrldq xmm7, 8 - paddd xmm1, xmm7 - movd [r14+4], xmm7 - movd r15d, xmm1 - movd xmm9, r15d - paddd xmm8,xmm9 + pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8 + pxor xmm6, xmm6 ; pSum16x16 + pxor xmm5, xmm5 ; sqsum_16x16 four dword + pxor xmm4, xmm4 ; sqdiff_16x16 four Dword + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + movdqa xmm1, xmm7 + movd [r14], xmm7 + psrldq xmm7, 8 + paddd xmm1, xmm7 + movd [r14+4], xmm7 + movd r15d, xmm1 + movd xmm9, r15d + paddd xmm8,xmm9 - pxor xmm7, xmm7 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 - movdqa xmm1, xmm7 - movd [r14+8], xmm7 - psrldq xmm7, 8 - paddd xmm1, xmm7 - movd [r14+12], xmm7 - movd r15d, xmm1 - movd xmm9, r15d - paddd xmm8,xmm9 + pxor xmm7, xmm7 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 + movdqa xmm1, xmm7 + movd [r14+8], xmm7 + psrldq xmm7, 8 + paddd xmm1, xmm7 + movd [r14+12], xmm7 + movd 
r15d, xmm1 + movd xmm9, r15d + paddd xmm8,xmm9 - mov r15, psum16x16 - movdqa xmm1, xmm6 - psrldq xmm1, 8 - paddd xmm6, xmm1 - movd [r15], xmm6 - add dword psum16x16, 4 + mov r15, psum16x16 + movdqa xmm1, xmm6 + psrldq xmm1, 8 + paddd xmm6, xmm1 + movd [r15], xmm6 + add dword psum16x16, 4 - mov r15, psqsum16x16 - pshufd xmm6, xmm5, 14 ;00001110 - paddd xmm6, xmm5 - pshufd xmm5, xmm6, 1 ;00000001 - paddd xmm5, xmm6 - movd [r15], xmm5 - add dword psqsum16x16, 4 + mov r15, psqsum16x16 + pshufd xmm6, xmm5, 14 ;00001110 + paddd xmm6, xmm5 + pshufd xmm5, xmm6, 1 ;00000001 + paddd xmm5, xmm6 + movd [r15], xmm5 + add dword psqsum16x16, 4 - mov r15, psqdiff16x16 - pshufd xmm5, xmm4, 14 ; 00001110 - paddd xmm5, xmm4 - pshufd xmm4, xmm5, 1 ; 00000001 - paddd xmm4, xmm5 - movd [r15], xmm4 - add dword psqdiff16x16, 4 + mov r15, psqdiff16x16 + pshufd xmm5, xmm4, 14 ; 00001110 + paddd xmm5, xmm4 + pshufd xmm4, xmm5, 1 ; 00000001 + paddd xmm4, xmm5 + movd [r15], xmm4 + add dword psqdiff16x16, 4 - add r14,16 - sub r0, r13 - sub r1, r13 - add r0, 16 - add r1, 16 + add r14,16 + sub r0, r13 + sub r1, r13 + add r0, 16 + add r1, 16 - dec r2 - jnz sqdiff_width_loop + dec r2 + jnz sqdiff_width_loop - pop r2 - %assign push_num push_num -1 + pop r2 + %assign push_num push_num -1 - mov r0, r10 - mov r1, r11 - add r0, r13 - add r1, r13 + mov r0, r10 + mov r1, r11 + add r0, r13 + add r1, r13 - dec r3 - jnz sqdiff_height_loop + dec r3 + jnz sqdiff_height_loop - mov r13, psadframe - movd [r13], xmm8 + mov r13, psadframe + movd [r13], xmm8 - POP_XMM - pop r15 - pop r14 - pop r13 - pop r12 - %assign push_num 0 + POP_XMM + pop r15 + pop r14 + pop r13 + pop r12 + %assign push_num 0 %undef cur_data %undef ref_data @@ -1221,7 +1221,7 @@ sqdiff_width_loop: %undef tmp_sadframe %undef pushsize %undef localsize - ret + ret @@ -1249,145 +1249,145 @@ WELS_EXTERN VAACalcSadBgd_sse2 %define tmp_edi esp + 4 %define tmp_ecx esp + 8 %define pushsize 16 - push ebp - push esi - push edi - push ebx - sub esp, localsize - mov esi, [cur_data] - mov edi, [ref_data] - mov ebx, [iPicStride] - mov eax, ebx + push ebp + push esi + push edi + push ebx + sub esp, localsize + mov esi, [cur_data] + mov edi, [ref_data] + mov ebx, [iPicStride] + mov eax, ebx - shr dword [iPicWidth], 4 ; iPicWidth/16 - shr dword [iPicHeight], 4 ; iPicHeight/16 - shl eax, 4 ; iPicStride*16 - xor ebp, ebp - pxor xmm0, xmm0 + shr dword [iPicWidth], 4 ; iPicWidth/16 + shr dword [iPicHeight], 4 ; iPicHeight/16 + shl eax, 4 ; iPicStride*16 + xor ebp, ebp + pxor xmm0, xmm0 bgd_height_loop: - mov ecx, dword [iPicWidth] - mov [tmp_esi], esi - mov [tmp_edi], edi + mov ecx, dword [iPicWidth] + mov [tmp_esi], esi + mov [tmp_edi], edi bgd_width_loop: - pxor xmm7, xmm7 ; pSad8x8 - pxor xmm6, xmm6 ; sum_cur_8x8 - pxor xmm5, xmm5 ; sum_ref_8x8 - pxor xmm4, xmm4 ; pMad8x8 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + pxor xmm7, xmm7 ; pSad8x8 + pxor xmm6, xmm6 ; sum_cur_8x8 + pxor xmm5, xmm5 ; sum_ref_8x8 + pxor xmm4, xmm4 ; pMad8x8 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + 
WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - mov edx, [p_mad8x8] - WELS_MAX_REG_SSE2 xmm4 + mov edx, [p_mad8x8] + WELS_MAX_REG_SSE2 xmm4 - ;movdqa xmm1, xmm4 - ;punpcklbw xmm1, xmm0 - ;punpcklwd xmm1, xmm0 - ;movd [edx], xmm1 - ;punpckhbw xmm4, xmm0 - ;punpcklwd xmm4, xmm0 - ;movd [edx+4], xmm4 - ;add edx, 8 - ;mov [p_mad8x8], edx - mov [tmp_ecx], ecx - movhlps xmm1, xmm4 - movd ecx, xmm4 - mov [edx], cl - movd ecx, xmm1 - mov [edx+1],cl - add edx, 2 - mov [p_mad8x8], edx + ;movdqa xmm1, xmm4 + ;punpcklbw xmm1, xmm0 + ;punpcklwd xmm1, xmm0 + ;movd [edx], xmm1 + ;punpckhbw xmm4, xmm0 + ;punpcklwd xmm4, xmm0 + ;movd [edx+4], xmm4 + ;add edx, 8 + ;mov [p_mad8x8], edx + mov [tmp_ecx], ecx + movhlps xmm1, xmm4 + movd ecx, xmm4 + mov [edx], cl + movd ecx, xmm1 + mov [edx+1],cl + add edx, 2 + mov [p_mad8x8], edx - pslldq xmm7, 4 - pslldq xmm6, 4 - pslldq xmm5, 4 + pslldq xmm7, 4 + pslldq xmm6, 4 + pslldq xmm5, 4 - pxor xmm4, xmm4 ; pMad8x8 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + pxor xmm4, xmm4 ; pMad8x8 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx - mov edx, [p_mad8x8] - WELS_MAX_REG_SSE2 xmm4 + mov edx, [p_mad8x8] + WELS_MAX_REG_SSE2 xmm4 - ;movdqa xmm1, xmm4 - ;punpcklbw xmm1, xmm0 - ;punpcklwd xmm1, xmm0 - ;movd [edx], xmm1 - ;punpckhbw xmm4, xmm0 - ;punpcklwd xmm4, xmm0 - ;movd [edx+4], xmm4 - ;add edx, 8 - ;mov [p_mad8x8], edx - movhlps xmm1, xmm4 - movd ecx, xmm4 - mov [edx], cl - movd ecx, xmm1 - mov [edx+1],cl - add edx, 2 - mov [p_mad8x8], edx + ;movdqa xmm1, xmm4 + ;punpcklbw xmm1, xmm0 + ;punpcklwd xmm1, xmm0 + ;movd [edx], xmm1 + ;punpckhbw xmm4, xmm0 + ;punpcklwd xmm4, xmm0 + ;movd [edx+4], xmm4 + ;add edx, 8 + ;mov [p_mad8x8], edx + movhlps xmm1, xmm4 + movd ecx, xmm4 + mov [edx], cl + movd ecx, xmm1 + mov [edx+1],cl + add edx, 2 + mov [p_mad8x8], edx - ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2 + ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2 - mov edx, [psad8x8] - pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0 - movdqa [edx], xmm1 - add edx, 16 - mov [psad8x8], edx ; sad8x8 + mov edx, [psad8x8] + pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0 + movdqa [edx], xmm1 + add edx, 16 + mov [psad8x8], edx ; sad8x8 - paddd xmm1, xmm7 ; D1+3 
D3+2 D0+1 D2+0 - pshufd xmm2, xmm1, 00000011b - paddd xmm1, xmm2 - movd edx, xmm1 - add ebp, edx ; sad frame + paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0 + pshufd xmm2, xmm1, 00000011b + paddd xmm1, xmm2 + movd edx, xmm1 + add ebp, edx ; sad frame - mov edx, [p_sd8x8] - psubd xmm6, xmm5 - pshufd xmm1, xmm6, 10001101b - movdqa [edx], xmm1 - add edx, 16 - mov [p_sd8x8], edx + mov edx, [p_sd8x8] + psubd xmm6, xmm5 + pshufd xmm1, xmm6, 10001101b + movdqa [edx], xmm1 + add edx, 16 + mov [p_sd8x8], edx - add edx, 16 - sub esi, eax - sub edi, eax - add esi, 16 - add edi, 16 + add edx, 16 + sub esi, eax + sub edi, eax + add esi, 16 + add edi, 16 - mov ecx, [tmp_ecx] - dec ecx - jnz bgd_width_loop + mov ecx, [tmp_ecx] + dec ecx + jnz bgd_width_loop - mov esi, [tmp_esi] - mov edi, [tmp_edi] - add esi, eax - add edi, eax + mov esi, [tmp_esi] + mov edi, [tmp_edi] + add esi, eax + add edi, eax - dec dword [iPicHeight] - jnz bgd_height_loop + dec dword [iPicHeight] + jnz bgd_height_loop - mov edx, [psadframe] - mov [edx], ebp + mov edx, [psadframe] + mov [edx], ebp - add esp, localsize - pop ebx - pop edi - pop esi - pop ebp + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp %undef cur_data %undef ref_data %undef iPicWidth @@ -1401,7 +1401,7 @@ bgd_width_loop: %undef tmp_edi %undef pushsize %undef localsize - ret + ret @@ -1431,190 +1431,190 @@ WELS_EXTERN VAACalcSadSsdBgd_sse2 %define tmp_sadframe esp + 8 %define tmp_ecx esp + 12 %define pushsize 16 - push ebp - push esi - push edi - push ebx - sub esp, localsize - mov esi, [cur_data] - mov edi, [ref_data] - mov ebx, [iPicStride] - mov eax, ebx + push ebp + push esi + push edi + push ebx + sub esp, localsize + mov esi, [cur_data] + mov edi, [ref_data] + mov ebx, [iPicStride] + mov eax, ebx - shr dword [iPicWidth], 4 ; iPicWidth/16 - shr dword [iPicHeight], 4 ; iPicHeight/16 - shl eax, 4 ; iPicStride*16 - pxor xmm0, xmm0 - movd [tmp_sadframe], xmm0 + shr dword [iPicWidth], 4 ; iPicWidth/16 + shr dword [iPicHeight], 4 ; iPicHeight/16 + shl eax, 4 ; iPicStride*16 + pxor xmm0, xmm0 + movd [tmp_sadframe], xmm0 sqdiff_bgd_height_loop: - mov ecx, dword [iPicWidth] - mov [tmp_esi], esi - mov [tmp_edi], edi + mov ecx, dword [iPicWidth] + mov [tmp_esi], esi + mov [tmp_edi], edi sqdiff_bgd_width_loop: - pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0 - pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 - pxor xmm5, xmm5 ; pMad8x8 - pxor xmm4, xmm4 ; sqdiff_16x16 four Dword - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0 + pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 + pxor xmm5, xmm5 ; pMad8x8 + pxor xmm4, xmm4 ; sqdiff_16x16 four Dword + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + 
WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - mov edx, [psad8x8] - movdqa xmm2, xmm7 - pshufd xmm1, xmm2, 00001110b - movd [edx], xmm2 - movd [edx+4], xmm1 - add edx, 8 - mov [psad8x8], edx ; sad8x8 + mov edx, [psad8x8] + movdqa xmm2, xmm7 + pshufd xmm1, xmm2, 00001110b + movd [edx], xmm2 + movd [edx+4], xmm1 + add edx, 8 + mov [psad8x8], edx ; sad8x8 - paddd xmm1, xmm2 - movd edx, xmm1 - add [tmp_sadframe], edx ; iFrameSad + paddd xmm1, xmm2 + movd edx, xmm1 + add [tmp_sadframe], edx ; iFrameSad - mov edx, [psum16x16] - movdqa xmm1, xmm6 - pshufd xmm2, xmm1, 00001110b - paddd xmm1, xmm2 - movd [edx], xmm1 ; sum + mov edx, [psum16x16] + movdqa xmm1, xmm6 + pshufd xmm2, xmm1, 00001110b + paddd xmm1, xmm2 + movd [edx], xmm1 ; sum - mov edx, [p_sd8x8] - pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 - psubd xmm6, xmm1 ; 00 diff1 00 diff0 - pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 - movq [edx], xmm1 - add edx, 8 - mov [p_sd8x8], edx + mov edx, [p_sd8x8] + pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 + psubd xmm6, xmm1 ; 00 diff1 00 diff0 + pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 + movq [edx], xmm1 + add edx, 8 + mov [p_sd8x8], edx - mov edx, [p_mad8x8] - WELS_MAX_REG_SSE2 xmm5 - ;movdqa xmm1, xmm5 - ;punpcklbw xmm1, xmm0 - ;punpcklwd xmm1, xmm0 - ;movd [edx], xmm1 - ;punpckhbw xmm5, xmm0 - ;punpcklwd xmm5, xmm0 - ;movd [edx+4], xmm5 - ;add edx, 8 - ;mov [p_mad8x8], edx - mov [tmp_ecx], ecx - movhlps xmm1, xmm5 - movd ecx, xmm5 - mov [edx], cl - movd ecx, xmm1 - mov [edx+1],cl - add edx, 2 - mov [p_mad8x8], edx + mov edx, [p_mad8x8] + WELS_MAX_REG_SSE2 xmm5 + ;movdqa xmm1, xmm5 + ;punpcklbw xmm1, xmm0 + ;punpcklwd xmm1, xmm0 + ;movd [edx], xmm1 + ;punpckhbw xmm5, xmm0 + ;punpcklwd xmm5, xmm0 + ;movd [edx+4], xmm5 + ;add edx, 8 + ;mov [p_mad8x8], edx + mov [tmp_ecx], ecx + movhlps xmm1, xmm5 + movd ecx, xmm5 + mov [edx], cl + movd ecx, xmm1 + mov [edx+1],cl + add edx, 2 + mov [p_mad8x8], edx - psrlq xmm7, 32 - psllq xmm7, 32 ; clear sad - pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 - pxor xmm5, xmm5 ; pMad8x8 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + psrlq xmm7, 32 + psllq xmm7, 32 ; clear sad + pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 + pxor xmm5, xmm5 ; pMad8x8 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + 
WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx - mov edx, [psad8x8] - movdqa xmm2, xmm7 - pshufd xmm1, xmm2, 00001110b - movd [edx], xmm2 - movd [edx+4], xmm1 - add edx, 8 - mov [psad8x8], edx ; sad8x8 + mov edx, [psad8x8] + movdqa xmm2, xmm7 + pshufd xmm1, xmm2, 00001110b + movd [edx], xmm2 + movd [edx+4], xmm1 + add edx, 8 + mov [psad8x8], edx ; sad8x8 - paddd xmm1, xmm2 - movd edx, xmm1 - add [tmp_sadframe], edx ; iFrameSad + paddd xmm1, xmm2 + movd edx, xmm1 + add [tmp_sadframe], edx ; iFrameSad - mov edx, [psum16x16] - movdqa xmm1, xmm6 - pshufd xmm2, xmm1, 00001110b - paddd xmm1, xmm2 - movd ebp, xmm1 ; sum - add [edx], ebp - add edx, 4 - mov [psum16x16], edx + mov edx, [psum16x16] + movdqa xmm1, xmm6 + pshufd xmm2, xmm1, 00001110b + paddd xmm1, xmm2 + movd ebp, xmm1 ; sum + add [edx], ebp + add edx, 4 + mov [psum16x16], edx - mov edx, [psqsum16x16] - psrlq xmm7, 32 - pshufd xmm2, xmm7, 00001110b - paddd xmm2, xmm7 - movd [edx], xmm2 ; sqsum - add edx, 4 - mov [psqsum16x16], edx + mov edx, [psqsum16x16] + psrlq xmm7, 32 + pshufd xmm2, xmm7, 00001110b + paddd xmm2, xmm7 + movd [edx], xmm2 ; sqsum + add edx, 4 + mov [psqsum16x16], edx - mov edx, [p_sd8x8] - pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 - psubd xmm6, xmm1 ; 00 diff1 00 diff0 - pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 - movq [edx], xmm1 - add edx, 8 - mov [p_sd8x8], edx + mov edx, [p_sd8x8] + pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 + psubd xmm6, xmm1 ; 00 diff1 00 diff0 + pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 + movq [edx], xmm1 + add edx, 8 + mov [p_sd8x8], edx - mov edx, [p_mad8x8] - WELS_MAX_REG_SSE2 xmm5 - ;movdqa xmm1, xmm5 - ;punpcklbw xmm1, xmm0 - ;punpcklwd xmm1, xmm0 - ;movd [edx], xmm1 - ;punpckhbw xmm5, xmm0 - ;punpcklwd xmm5, xmm0 - ;movd [edx+4], xmm5 - ;add edx, 8 - ;mov [p_mad8x8], edx - movhlps xmm1, xmm5 - movd ecx, xmm5 - mov [edx], cl - movd ecx, xmm1 - mov [edx+1],cl - add edx, 2 - mov [p_mad8x8], edx + mov edx, [p_mad8x8] + WELS_MAX_REG_SSE2 xmm5 + ;movdqa xmm1, xmm5 + ;punpcklbw xmm1, xmm0 + ;punpcklwd xmm1, xmm0 + ;movd [edx], xmm1 + ;punpckhbw xmm5, xmm0 + ;punpcklwd xmm5, xmm0 + ;movd [edx+4], xmm5 + ;add edx, 8 + ;mov [p_mad8x8], edx + movhlps xmm1, xmm5 + movd ecx, xmm5 + mov [edx], cl + movd ecx, xmm1 + mov [edx+1],cl + add edx, 2 + mov [p_mad8x8], edx - mov edx, [psqdiff16x16] - pshufd xmm1, xmm4, 00001110b - paddd xmm4, xmm1 - pshufd xmm1, xmm4, 00000001b - paddd xmm4, xmm1 - movd [edx], xmm4 - add edx, 4 - mov [psqdiff16x16], edx + mov edx, [psqdiff16x16] + pshufd xmm1, xmm4, 00001110b + paddd xmm4, xmm1 + pshufd xmm1, xmm4, 00000001b + paddd xmm4, xmm1 + movd [edx], xmm4 + add edx, 4 + mov [psqdiff16x16], edx - add edx, 16 - sub esi, eax - sub edi, eax - add esi, 16 - add edi, 16 + add edx, 16 + sub esi, eax + sub edi, eax + add esi, 16 + add edi, 16 - mov ecx, [tmp_ecx] - dec ecx - jnz sqdiff_bgd_width_loop + mov ecx, [tmp_ecx] + dec ecx + jnz sqdiff_bgd_width_loop - mov esi, [tmp_esi] - mov edi, [tmp_edi] - add esi, eax - add edi, eax + mov esi, [tmp_esi] + mov edi, [tmp_edi] + add esi, eax + add edi, eax - dec dword [iPicHeight] - jnz sqdiff_bgd_height_loop + dec dword [iPicHeight] + jnz sqdiff_bgd_height_loop - mov edx, [psadframe] - mov ebp, [tmp_sadframe] - mov [edx], ebp + mov edx, [psadframe] + mov ebp, [tmp_sadframe] + mov [edx], ebp - add esp, localsize - pop ebx - pop edi - 
pop esi - pop ebp + add esp, localsize + pop ebx + pop edi + pop esi + pop ebp %undef cur_data %undef ref_data %undef iPicWidth @@ -1631,7 +1631,7 @@ sqdiff_bgd_width_loop: %undef tmp_edi %undef pushsize %undef localsize - ret + ret %else ;************************************************************************************************************* @@ -1651,142 +1651,142 @@ WELS_EXTERN VAACalcSadBgd_sse2 %define p_sd8x8 arg8; %define p_mad8x8 arg9; - push r12 - push r13 - push r14 - push r15 + push r12 + push r13 + push r14 + push r15 %assign push_num 4 - PUSH_XMM 10 + PUSH_XMM 10 %ifdef WIN64 - mov r4,arg5 - ; mov r5,arg6 + mov r4,arg5 + ; mov r5,arg6 %endif - mov r14,arg7 - SIGN_EXTENSION r2,r2d - SIGN_EXTENSION r3,r3d - SIGN_EXTENSION r4,r4d + mov r14,arg7 + SIGN_EXTENSION r2,r2d + SIGN_EXTENSION r3,r3d + SIGN_EXTENSION r4,r4d - mov r13,r4 - mov r15,r0 - shr r2,4 - shr r3,4 - shl r13,4 - pxor xmm0, xmm0 - pxor xmm8, xmm8 - pxor xmm9, xmm9 + mov r13,r4 + mov r15,r0 + shr r2,4 + shr r3,4 + shl r13,4 + pxor xmm0, xmm0 + pxor xmm8, xmm8 + pxor xmm9, xmm9 bgd_height_loop: - ;mov ecx, dword [iPicWidth] - push r2 - %assign push_num push_num+1 - mov r10, r15 - mov r11, r1 + ;mov ecx, dword [iPicWidth] + push r2 + %assign push_num push_num+1 + mov r10, r15 + mov r11, r1 bgd_width_loop: - pxor xmm7, xmm7 ; pSad8x8 - pxor xmm6, xmm6 ; sum_cur_8x8 - pxor xmm5, xmm5 ; sum_ref_8x8 - pxor xmm4, xmm4 ; pMad8x8 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + pxor xmm7, xmm7 ; pSad8x8 + pxor xmm6, xmm6 ; sum_cur_8x8 + pxor xmm5, xmm5 ; sum_ref_8x8 + pxor xmm4, xmm4 ; pMad8x8 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - mov r14, p_mad8x8 - WELS_MAX_REG_SSE2 xmm4 + mov r14, p_mad8x8 + WELS_MAX_REG_SSE2 xmm4 - ;mov [tmp_ecx], ecx - movhlps xmm1, xmm4 - movd r0d, xmm4 + ;mov [tmp_ecx], ecx + movhlps xmm1, xmm4 + movd r0d, xmm4 - mov [r14], r0b - movd r0d, xmm1 - mov [r14+1],r0b - add r14, 2 - ;mov p_mad8x8, r14 + mov [r14], r0b + movd r0d, xmm1 + mov [r14+1],r0b + add r14, 2 + ;mov p_mad8x8, r14 - pslldq xmm7, 4 - pslldq xmm6, 4 - pslldq xmm5, 4 + pslldq xmm7, 4 + pslldq xmm6, 4 + pslldq xmm5, 4 - pxor xmm4, xmm4 ; pMad8x8 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - WELS_SAD_SD_MAD_16x1_SSE2 
xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + pxor xmm4, xmm4 ; pMad8x8 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 + WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 - ;mov r14, [p_mad8x8] - WELS_MAX_REG_SSE2 xmm4 + ;mov r14, [p_mad8x8] + WELS_MAX_REG_SSE2 xmm4 - movhlps xmm1, xmm4 - movd r0d, xmm4 - mov [r14], r0b - movd r0d, xmm1 - mov [r14+1],r0b - add r14, 2 - mov p_mad8x8, r14 + movhlps xmm1, xmm4 + movd r0d, xmm4 + mov [r14], r0b + movd r0d, xmm1 + mov [r14+1],r0b + add r14, 2 + mov p_mad8x8, r14 - ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2 + ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2 - mov r14, psad8x8 - pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0 - movdqa [r14], xmm1 - add r14, 16 - mov psad8x8, r14 ; sad8x8 + mov r14, psad8x8 + pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0 + movdqa [r14], xmm1 + add r14, 16 + mov psad8x8, r14 ; sad8x8 - paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0 - pshufd xmm2, xmm1, 00000011b - paddd xmm1, xmm2 - movd r14d, xmm1 - movd xmm9, r14d - paddd xmm8, xmm9 ; sad frame + paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0 + pshufd xmm2, xmm1, 00000011b + paddd xmm1, xmm2 + movd r14d, xmm1 + movd xmm9, r14d + paddd xmm8, xmm9 ; sad frame - mov r14, p_sd8x8 - psubd xmm6, xmm5 - pshufd xmm1, xmm6, 10001101b - movdqa [r14], xmm1 - add r14, 16 - mov p_sd8x8, r14 + mov r14, p_sd8x8 + psubd xmm6, xmm5 + pshufd xmm1, xmm6, 10001101b + movdqa [r14], xmm1 + add r14, 16 + mov p_sd8x8, r14 - ;add edx, 16 - sub r15, r13 - sub r1, r13 - add r15, 16 - add r1, 16 + ;add edx, 16 + sub r15, r13 + sub r1, r13 + add r15, 16 + add r1, 16 - dec r2 - jnz bgd_width_loop - pop r2 + dec r2 + jnz bgd_width_loop + pop r2 %assign push_num push_num-1 - mov r15, r10 - mov r1, r11 - add r15, r13 - add r1, r13 + mov r15, r10 + mov r1, r11 + add r15, r13 + add r1, r13 - dec r3 - jnz bgd_height_loop + dec r3 + jnz bgd_height_loop - mov r13, psadframe - movd [r13], xmm8 + mov r13, psadframe + movd [r13], xmm8 - POP_XMM - pop r15 - pop r14 - pop r13 - pop r12 + POP_XMM + pop r15 + pop r14 + pop r13 + pop r12 %assign push_num 0 %undef cur_data %undef ref_data @@ -1801,7 +1801,7 @@ bgd_width_loop: %undef tmp_edi %undef pushsize %undef localsize - ret + ret @@ -1826,189 +1826,189 @@ WELS_EXTERN VAACalcSadSsdBgd_sse2 %define p_sd8x8 arg11 %define p_mad8x8 arg12 - push r12 - push r13 - push r14 - push r15 + push r12 + push r13 + push r14 + push r15 %assign push_num 4 - PUSH_XMM 10 + PUSH_XMM 10 %ifdef WIN64 - mov r4,arg5 - ;mov r5,arg6 + mov r4,arg5 + ;mov r5,arg6 %endif - SIGN_EXTENSION r2,r2d - SIGN_EXTENSION r3,r3d - SIGN_EXTENSION r4,r4d + SIGN_EXTENSION r2,r2d + SIGN_EXTENSION r3,r3d + SIGN_EXTENSION r4,r4d - mov r13,r4 - shr r2, 4 ; iPicWidth/16 - shr r3, 4 ; iPicHeight/16 - shl r13, 4 ; iPicStride*16 - pxor xmm0, xmm0 - pxor xmm8, xmm8 - pxor xmm9, xmm9 + mov r13,r4 + shr r2, 4 ; iPicWidth/16 + shr r3, 4 ; iPicHeight/16 + shl r13, 4 ; iPicStride*16 + pxor xmm0, xmm0 + pxor xmm8, xmm8 + pxor xmm9, xmm9 sqdiff_bgd_height_loop: - mov r10, r0 - mov r11, r1 - push r2 + mov r10, r0 + mov r11, r1 + push r2 %assign push_num push_num+1 sqdiff_bgd_width_loop: - pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 
sqsum0 sad0 - pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 - pxor xmm5, xmm5 ; pMad8x8 - pxor xmm4, xmm4 ; sqdiff_16x16 four Dword - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0 + pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 + pxor xmm5, xmm5 ; pMad8x8 + pxor xmm4, xmm4 ; sqdiff_16x16 four Dword + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - mov r14, psad8x8 - movdqa xmm2, xmm7 - pshufd xmm1, xmm2, 00001110b - movd [r14], xmm2 - movd [r14+4], xmm1 - add r14, 8 - mov psad8x8, r14 ; sad8x8 + mov r14, psad8x8 + movdqa xmm2, xmm7 + pshufd xmm1, xmm2, 00001110b + movd [r14], xmm2 + movd [r14+4], xmm1 + add r14, 8 + mov psad8x8, r14 ; sad8x8 - paddd xmm1, xmm2 - movd r14d, xmm1 - movd xmm9,r14d - paddd xmm8, xmm9 ; iFrameSad + paddd xmm1, xmm2 + movd r14d, xmm1 + movd xmm9,r14d + paddd xmm8, xmm9 ; iFrameSad - mov r14, psum16x16 - movdqa xmm1, xmm6 - pshufd xmm2, xmm1, 00001110b - paddd xmm1, xmm2 - movd [r14], xmm1 ; sum + mov r14, psum16x16 + movdqa xmm1, xmm6 + pshufd xmm2, xmm1, 00001110b + paddd xmm1, xmm2 + movd [r14], xmm1 ; sum - mov r14, p_sd8x8 - pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 - psubd xmm6, xmm1 ; 00 diff1 00 diff0 - pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 - movq [r14], xmm1 - add r14, 8 - mov p_sd8x8, r14 + mov r14, p_sd8x8 + pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 + psubd xmm6, xmm1 ; 00 diff1 00 diff0 + pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 + movq [r14], xmm1 + add r14, 8 + mov p_sd8x8, r14 - mov r14, p_mad8x8 - WELS_MAX_REG_SSE2 xmm5 + mov r14, p_mad8x8 + WELS_MAX_REG_SSE2 xmm5 - movhlps xmm1, xmm5 - push r0 - movd r0d, xmm5 - mov [r14], r0b - movd r0d, xmm1 - mov [r14+1],r0b - pop r0 - add r14, 2 - mov p_mad8x8, r14 + movhlps xmm1, xmm5 + push r0 + movd r0d, xmm5 + mov [r14], r0b + movd r0d, xmm1 + mov [r14+1],r0b + pop r0 + add r14, 2 + mov p_mad8x8, r14 - psrlq xmm7, 32 - psllq xmm7, 32 ; clear sad - pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 - pxor xmm5, xmm5 ; pMad8x8 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - 
WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + psrlq xmm7, 32 + psllq xmm7, 32 ; clear sad + pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 + pxor xmm5, xmm5 ; pMad8x8 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 + WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 - mov r14, psad8x8 - movdqa xmm2, xmm7 - pshufd xmm1, xmm2, 00001110b - movd [r14], xmm2 - movd [r14+4], xmm1 - add r14, 8 - mov psad8x8, r14 ; sad8x8 + mov r14, psad8x8 + movdqa xmm2, xmm7 + pshufd xmm1, xmm2, 00001110b + movd [r14], xmm2 + movd [r14+4], xmm1 + add r14, 8 + mov psad8x8, r14 ; sad8x8 - paddd xmm1, xmm2 - movd r14d, xmm1 - movd xmm9, r14d - paddd xmm8, xmm9 ; iFrameSad + paddd xmm1, xmm2 + movd r14d, xmm1 + movd xmm9, r14d + paddd xmm8, xmm9 ; iFrameSad - mov r14, psum16x16 - movdqa xmm1, xmm6 - pshufd xmm2, xmm1, 00001110b - paddd xmm1, xmm2 - movd r15d, xmm1 ; sum - add [r14], r15d - add r14, 4 - mov psum16x16, r14 + mov r14, psum16x16 + movdqa xmm1, xmm6 + pshufd xmm2, xmm1, 00001110b + paddd xmm1, xmm2 + movd r15d, xmm1 ; sum + add [r14], r15d + add r14, 4 + mov psum16x16, r14 - mov r14, psqsum16x16 - psrlq xmm7, 32 - pshufd xmm2, xmm7, 00001110b - paddd xmm2, xmm7 - movd [r14], xmm2 ; sqsum - add r14, 4 - mov psqsum16x16, r14 + mov r14, psqsum16x16 + psrlq xmm7, 32 + pshufd xmm2, xmm7, 00001110b + paddd xmm2, xmm7 + movd [r14], xmm2 ; sqsum + add r14, 4 + mov psqsum16x16, r14 - mov r14, p_sd8x8 - pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 - psubd xmm6, xmm1 ; 00 diff1 00 diff0 - pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 - movq [r14], xmm1 - add r14, 8 - mov p_sd8x8, r14 + mov r14, p_sd8x8 + pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 + psubd xmm6, xmm1 ; 00 diff1 00 diff0 + pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 + movq [r14], xmm1 + add r14, 8 + mov p_sd8x8, r14 - mov r14, p_mad8x8 - WELS_MAX_REG_SSE2 xmm5 + mov r14, p_mad8x8 + WELS_MAX_REG_SSE2 xmm5 - movhlps xmm1, xmm5 - push r0 - movd r0d, xmm5 - mov [r14], r0b - movd r0d, xmm1 - mov [r14+1],r0b - pop r0 - add r14, 2 - mov p_mad8x8, r14 + movhlps xmm1, xmm5 + push r0 + movd r0d, xmm5 + mov [r14], r0b + movd r0d, xmm1 + mov [r14+1],r0b + pop r0 + add r14, 2 + mov p_mad8x8, r14 - mov r14, psqdiff16x16 - pshufd xmm1, xmm4, 00001110b - paddd xmm4, xmm1 - pshufd xmm1, xmm4, 00000001b - paddd xmm4, xmm1 - movd [r14], xmm4 - add r14, 4 - mov psqdiff16x16, r14 + mov r14, psqdiff16x16 + pshufd xmm1, xmm4, 00001110b + paddd xmm4, xmm1 + pshufd xmm1, xmm4, 00000001b + paddd xmm4, xmm1 + movd [r14], xmm4 + add r14, 4 + mov psqdiff16x16, r14 - add r14, 16 - sub r0, r13 - sub r1, r13 - add r0, 16 - add r1, 16 + add r14, 16 + sub r0, r13 + sub r1, r13 + add r0, 16 + add r1, 16 - dec r2 - jnz sqdiff_bgd_width_loop - pop r2 - %assign push_num push_num-1 - mov r0, r10 - mov r1, r11 - add r0, r13 - add r1, r13 + dec r2 + jnz sqdiff_bgd_width_loop + pop r2 + %assign push_num push_num-1 + mov r0, 
r10
+    mov r1, r11
+    add r0, r13
+    add r1, r13

-    dec r3
-    jnz sqdiff_bgd_height_loop
+    dec r3
+    jnz sqdiff_bgd_height_loop

-    mov r14, psadframe
-    movd [r14], xmm8
+    mov r14, psadframe
+    movd [r14], xmm8

-    POP_XMM
-    pop r15
-    pop r14
-    pop r13
-    pop r12
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
 %assign push_num 0
 %undef cur_data
 %undef ref_data
@@ -2026,5 +2026,5 @@ sqdiff_bgd_width_loop:
 %undef tmp_edi
 %undef pushsize
 %undef localsize
-    ret
+    ret
 %endif
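
For orientation, the two SSE2 routines touched above accumulate per-macroblock statistics whose meaning is only hinted at by the register comments (pSad8x8, sum_cur_8x8/sum_ref_8x8, pMad8x8, sqsum16x16, sqdiff_16x16). The scalar sketch below restates those quantities for a single 16x16 block under stated assumptions: it is not the project's C reference implementation, the function and parameter names are illustrative only, and it ignores the output packing the assembly performs (pshufd shuffles, byte-sized MAD stores) and its two-8x8-blocks-per-row-pass traversal.

/*
 * Illustrative scalar sketch of the per-16x16-block quantities computed by
 * the SSE2 routines in this file, as inferred from their register comments:
 *   sad8x8[]   - SAD of each 8x8 quarter
 *   sd8x8[]    - sum(cur) - sum(ref) of each 8x8 quarter
 *   mad8x8[]   - max |cur - ref| of each 8x8 quarter
 *   sum16x16   - sum of cur pixels over the 16x16 block
 *   sqsum16x16 - sum of squared cur pixels over the 16x16 block
 *   sqdiff16x16- sum of (cur - ref)^2 over the 16x16 block
 * Quarter ordering here is row-major and need not match the asm's packing.
 */
#include <stdint.h>
#include <stdlib.h>

static void MbSadBgdSketch(const uint8_t* cur, const uint8_t* ref, int stride,
                           uint32_t sad8x8[4], int32_t sd8x8[4], uint8_t mad8x8[4],
                           uint32_t* sum16x16, uint32_t* sqsum16x16,
                           uint32_t* sqdiff16x16) {
  *sum16x16 = *sqsum16x16 = *sqdiff16x16 = 0;
  for (int b = 0; b < 4; ++b) {                 /* the four 8x8 quarters */
    const uint8_t* c = cur + (b / 2) * 8 * stride + (b % 2) * 8;
    const uint8_t* r = ref + (b / 2) * 8 * stride + (b % 2) * 8;
    uint32_t sad = 0, sum_c = 0, sum_r = 0;
    uint8_t mad = 0;
    for (int y = 0; y < 8; ++y) {
      for (int x = 0; x < 8; ++x) {
        int d = c[x] - r[x];
        int ad = abs(d);
        sad += (uint32_t)ad;
        if (ad > mad) mad = (uint8_t)ad;
        sum_c += c[x];
        sum_r += r[x];
        *sqsum16x16 += (uint32_t)c[x] * c[x];
        *sqdiff16x16 += (uint32_t)(d * d);
      }
      c += stride;
      r += stride;
    }
    sad8x8[b] = sad;                            /* per-quarter SAD */
    sd8x8[b]  = (int32_t)sum_c - (int32_t)sum_r; /* per-quarter sum difference */
    mad8x8[b] = mad;                            /* per-quarter max abs diff */
    *sum16x16 += sum_c;
  }
}

In the assembly, the frame-level SAD written to psadframe is simply the running sum of every sad8x8 value; it is kept in ebp or [tmp_sadframe] in the 32-bit paths and in xmm8 in the 64-bit paths until the epilogue stores it.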