mips msa vp9 updated idct 8x8, 16x16 and 32x32 module

Updated sources to use the improved version of the common MSA macros.
Enabled idct MSA hooks and tests.
Overall, this just upgrades the code with styling changes.

Change-Id: I1f488ab2c741f6c622b7a855388a202168082209
Parag Salasakar 2015-06-01 09:19:01 +05:30
parent 71e88f903d
commit 6af9d7f2e2
10 changed files with 1227 additions and 1879 deletions
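
The mechanical pattern of the update is easiest to see in the vp9_idct8x8_msa.c diff below: the old macros named every destination operand a second time, while the improved vp9_macros_msa.h versions take a count suffix and update their operands in place. A representative before/after pair from vp9_idct8x8_64_add_msa:

  /* old style */
  LOAD_8VECS_SH(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  SRARI_H_4VECS_SH(in0, in1, in2, in3, in0, in1, in2, in3, 5);

  /* new style */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  SRARI_H4_SH(in0, in1, in2, in3, 5);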

test/dct16x16_test.cc

@@ -929,8 +929,7 @@ INSTANTIATE_TEST_CASE_P(
&idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if 0 // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
// TODO(parag): enable when function hooks are added
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
MSA, Trans16x16DCT,
::testing::Values(

test/dct32x32_test.cc

@@ -382,8 +382,7 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if 0 // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
// TODO(parag): enable when function hooks are added
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
MSA, Trans32x32Test,
::testing::Values(

test/fdct8x8_test.cc

@@ -782,8 +782,7 @@ INSTANTIATE_TEST_CASE_P(
VPX_BITS_8)));
#endif
#if 0 // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
// TODO(parag): enable when function hooks are added
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
MSA, FwdTrans8x8DCT,
::testing::Values(

test/partial_idct_test.cc

@@ -305,8 +305,7 @@ INSTANTIATE_TEST_CASE_P(
TX_8X8, 12)));
#endif
#if 0 // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
// TODO(parag): enable when function hooks are added
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
MSA, PartialIDctTest,
::testing::Values(

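Each of the four test hunks above replaces the placeholder #if 0 guard (and its TODO) with the real #if HAVE_MSA guard. The parameter tuples themselves fall outside the diff context; purely as an illustration of the shape an enabled MSA entry takes (the exact entries live in the full test files, and the use of the C forward transform here is an assumption), compare with the SSE2 instantiations above:

  INSTANTIATE_TEST_CASE_P(
      MSA, FwdTrans8x8DCT,
      ::testing::Values(
          make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_msa, 0,
                     VPX_BITS_8)));
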
vp9/common/mips/msa/vp9_idct16x16_msa.c: file diff suppressed because it is too large

vp9/common/mips/msa/vp9_idct32x32_msa.c: file diff suppressed because it is too large

vp9/common/mips/msa/vp9_idct8x8_msa.c

@@ -8,531 +8,131 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include "vp9/common/mips/msa/vp9_idct_msa.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#define SET_COSPI_PAIR(c0_h, c1_h) ({ \
v8i16 out0, r0_m, r1_m; \
\
r0_m = __msa_fill_h(c0_h); \
r1_m = __msa_fill_h(c1_h); \
out0 = __msa_ilvev_h(r1_m, r0_m); \
\
out0; \
})
#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({ \
v8i16 c0_m, c1_m; \
\
c0_m = __msa_splati_h((mask_h), (idx1_h)); \
c1_m = __msa_splati_h((mask_h), (idx2_h)); \
c0_m = __msa_ilvev_h(c1_m, c0_m); \
\
c0_m; \
})
#define VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, \
in0, in1, in2, in3) { \
uint64_t out0_m, out1_m, out2_m, out3_m; \
v8i16 res0_m, res1_m, res2_m, res3_m; \
v16u8 dest0_m, dest1_m, dest2_m, dest3_m; \
v16i8 tmp0_m, tmp1_m; \
v16i8 zero_m = { 0 }; \
uint8_t *dst_m = (uint8_t *)(dest); \
\
LOAD_4VECS_UB(dst_m, (dest_stride), \
dest0_m, dest1_m, dest2_m, dest3_m); \
\
res0_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest0_m); \
res1_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest1_m); \
res2_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest2_m); \
res3_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest3_m); \
\
res0_m += (v8i16)(in0); \
res1_m += (v8i16)(in1); \
res2_m += (v8i16)(in2); \
res3_m += (v8i16)(in3); \
\
res0_m = CLIP_UNSIGNED_CHAR_H(res0_m); \
res1_m = CLIP_UNSIGNED_CHAR_H(res1_m); \
res2_m = CLIP_UNSIGNED_CHAR_H(res2_m); \
res3_m = CLIP_UNSIGNED_CHAR_H(res3_m); \
\
tmp0_m = __msa_pckev_b((v16i8)res1_m, (v16i8)res0_m); \
tmp1_m = __msa_pckev_b((v16i8)res3_m, (v16i8)res2_m); \
\
out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
\
STORE_DWORD(dst_m, out0_m); \
dst_m += (dest_stride); \
STORE_DWORD(dst_m, out1_m); \
dst_m += (dest_stride); \
STORE_DWORD(dst_m, out2_m); \
dst_m += (dest_stride); \
STORE_DWORD(dst_m, out3_m); \
}
/* multiply and add macro */
#define VP9_MADD(inp0, inp1, inp2, inp3, \
cst0, cst1, cst2, cst3, \
out0, out1, out2, out3) { \
v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\
ILV_H_LRLR_SH(inp0, inp1, inp2, inp3, \
madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m); \
\
DOTP_S_W_4VECS_SW(madd_s1_m, cst0, madd_s0_m, cst0, \
madd_s1_m, cst1, madd_s0_m, cst1, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
\
SRARI_W_4VECS_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
DCT_CONST_BITS); \
\
PCKEV_H_2VECS_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, \
out0, out1); \
\
DOTP_S_W_4VECS_SW(madd_s3_m, cst2, madd_s2_m, cst2, \
madd_s3_m, cst3, madd_s2_m, cst3, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
\
SRARI_W_4VECS_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
DCT_CONST_BITS); \
\
PCKEV_H_2VECS_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, \
out2, out3); \
}
/* idct 8x8 macro */
#define VP9_IDCT8x8_1D_ODD(in1, in3, in5, in7, \
k0, k1, k2, k3, mask, \
out0, out1, out2, out3) { \
v8i16 res0_m, res1_m, res2_m, res3_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\
VP9_MADD(in1, in7, in3, in5, k0, k1, k2, k3, \
in1, in7, in3, in5); \
\
res0_m = in1 - in3; \
res1_m = in7 - in5; \
\
k0 = VP9_SET_CONST_PAIR(mask, 4, 7); \
k1 = __msa_splati_h(mask, 4); \
\
res2_m = __msa_ilvr_h(res0_m, res1_m); \
res3_m = __msa_ilvl_h(res0_m, res1_m); \
\
DOTP_S_W_4VECS_SW(res2_m, k0, res3_m, k0, \
res2_m, k1, res3_m, k1, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
\
SRARI_W_4VECS_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
DCT_CONST_BITS); \
out0 = in1 + in3; \
PCKEV_H_2VECS_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, \
out1, out2); \
out3 = in7 + in5; \
}
#define VP9_IDCT8x8_1D_EVEN(in0, in2, in4, in6, \
k0, k1, k2, k3, \
out0, out1, out2, out3) { \
k2 = SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
k3 = SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
\
VP9_MADD(in0, in4, in2, in6, k1, k0, k2, k3, \
in0, in4, in2, in6); \
\
out0 = in0 + in6; \
out1 = in4 + in2; \
out2 = in4 - in2; \
out3 = in0 - in6; \
}
#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
v8i16 k0_m, k1_m, k2_m, k3_m; \
v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 \
}; \
\
k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \
k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \
k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \
k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \
\
VP9_IDCT8x8_1D_ODD(in1, in3, in5, in7, k0_m, k1_m, k2_m, k3_m, mask_m, \
res4_m, res5_m, res6_m, res7_m); \
\
VP9_IDCT8x8_1D_EVEN(in0, in2, in4, in6, k0_m, k1_m, k2_m, k3_m, \
res0_m, res1_m, res2_m, res3_m); \
\
BUTTERFLY_8(res0_m, res1_m, res2_m, res3_m, \
res4_m, res5_m, res6_m, res7_m, \
out0, out1, out2, out3, \
out4, out5, out6, out7); \
}
#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \
dst0, dst1, dst2, dst3) { \
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; \
\
tmp0 = __msa_dotp_s_w((in0), (in4)); \
tmp2 = __msa_dotp_s_w((in1), (in4)); \
tmp3 = __msa_dotp_s_w((in0), (in5)); \
tmp4 = __msa_dotp_s_w((in1), (in5)); \
tmp5 = __msa_dotp_s_w((in2), (in6)); \
tmp6 = __msa_dotp_s_w((in3), (in6)); \
tmp7 = __msa_dotp_s_w((in2), (in7)); \
tmp8 = __msa_dotp_s_w((in3), (in7)); \
\
BUTTERFLY_4(tmp0, tmp3, tmp7, tmp5, tmp1, tmp9, tmp7, tmp5); \
BUTTERFLY_4(tmp2, tmp4, tmp8, tmp6, tmp3, tmp0, tmp4, tmp2); \
\
SRARI_W_4VECS_SW(tmp1, tmp9, tmp7, tmp5, tmp1, tmp9, tmp7, tmp5, \
DCT_CONST_BITS); \
SRARI_W_4VECS_SW(tmp3, tmp0, tmp4, tmp2, tmp3, tmp0, tmp4, tmp2, \
DCT_CONST_BITS); \
\
PCKEV_H_4VECS_SH(tmp1, tmp3, tmp9, tmp0, tmp7, tmp4, tmp5, tmp2, \
dst0, dst1, dst2, dst3); \
}
#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \
v8i16 dst_m; \
v4i32 tp0_m, tp1_m; \
\
tp1_m = __msa_dotp_s_w((in0), (in2)); \
tp0_m = __msa_dotp_s_w((in1), (in2)); \
tp1_m = __msa_srari_w(tp1_m, DCT_CONST_BITS); \
tp0_m = __msa_srari_w(tp0_m, DCT_CONST_BITS); \
dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
\
dst_m; \
})
#define VP9_ADST8_ROW(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v8i16 const0_m, const1_m, const2_m, const3_m, const4_m; \
v8i16 temp0_m, temp1_m, temp2_m, temp3_m, s0_m, s1_m; \
v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, \
cospi_14_64, cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 \
}; \
v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \
-cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 \
}; \
\
const0_m = __msa_splati_h(coeff0_m, 0); \
const1_m = __msa_splati_h(coeff0_m, 7); \
const2_m = -const0_m; \
const0_m = __msa_ilvev_h(const1_m, const0_m); \
const1_m = __msa_ilvev_h(const2_m, const1_m); \
const2_m = __msa_splati_h(coeff0_m, 4); \
const3_m = __msa_splati_h(coeff0_m, 3); \
const4_m = -const2_m; \
const2_m = __msa_ilvev_h(const3_m, const2_m); \
const3_m = __msa_ilvev_h(const4_m, const3_m); \
\
ILV_H_LRLR_SH(in7, in0, in3, in4, \
temp0_m, temp1_m, temp2_m, temp3_m); \
\
DOT_ADD_SUB_SRARI_PCK(temp0_m, temp1_m, temp2_m, temp3_m, \
const0_m, const1_m, const2_m, const3_m, \
in7, in0, in4, in3); \
\
const0_m = __msa_splati_h(coeff0_m, 2); \
const1_m = __msa_splati_h(coeff0_m, 5); \
const2_m = -const0_m; \
const0_m = __msa_ilvev_h(const1_m, const0_m); \
const1_m = __msa_ilvev_h(const2_m, const1_m); \
const2_m = __msa_splati_h(coeff0_m, 6); \
const3_m = __msa_splati_h(coeff0_m, 1); \
const4_m = -const2_m; \
const2_m = __msa_ilvev_h(const3_m, const2_m); \
const3_m = __msa_ilvev_h(const4_m, const3_m); \
\
ILV_H_LRLR_SH(in5, in2, in1, in6, \
temp0_m, temp1_m, temp2_m, temp3_m); \
\
DOT_ADD_SUB_SRARI_PCK(temp0_m, temp1_m, temp2_m, temp3_m, \
const0_m, const1_m, const2_m, const3_m, \
in5, in2, in6, in1); \
\
BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
out7 = -s0_m; \
out0 = s1_m; \
\
SPLATI_H_4VECS_SH(coeff1_m, 0, 4, 1, 5, \
const0_m, const1_m, const2_m, const3_m); \
\
const3_m = __msa_ilvev_h(const0_m, const3_m); \
const2_m = __msa_ilvev_h(const2_m, const1_m); \
const0_m = __msa_ilvev_h(const1_m, const0_m); \
const1_m = const0_m; \
\
ILV_H_LRLR_SH(in3, in4, in1, in6, \
temp0_m, temp1_m, temp2_m, temp3_m); \
\
DOT_ADD_SUB_SRARI_PCK(temp0_m, temp1_m, temp2_m, temp3_m, \
const0_m, const2_m, const3_m, const1_m, \
out1, out6, s0_m, s1_m); \
\
const0_m = __msa_splati_h(coeff1_m, 2); \
const1_m = __msa_splati_h(coeff1_m, 3); \
const1_m = __msa_ilvev_h(const1_m, const0_m); \
\
ILV_H_LRLR_SH(in5, in2, s1_m, s0_m, \
temp0_m, temp1_m, temp2_m, temp3_m); \
\
out3 = DOT_SHIFT_RIGHT_PCK_H(temp0_m, temp1_m, const0_m); \
out4 = DOT_SHIFT_RIGHT_PCK_H(temp0_m, temp1_m, const1_m); \
out2 = DOT_SHIFT_RIGHT_PCK_H(temp2_m, temp3_m, const0_m); \
out5 = DOT_SHIFT_RIGHT_PCK_H(temp2_m, temp3_m, const1_m); \
\
out1 = -out1; \
out3 = -out3; \
out5 = -out5; \
}
#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v8i16 const0_m, const1_m, const2_m, const3_m, const4_m; \
v8i16 temp0_m, temp1_m, temp2_m, temp3_m, s0_m, s1_m; \
\
const0_m = __msa_fill_h(cospi_2_64); \
const1_m = __msa_fill_h(cospi_30_64); \
const2_m = -const0_m; \
const0_m = __msa_ilvev_h(const1_m, const0_m); \
const1_m = __msa_ilvev_h(const2_m, const1_m); \
const2_m = __msa_fill_h(cospi_18_64); \
const3_m = __msa_fill_h(cospi_14_64); \
const4_m = -const2_m; \
const2_m = __msa_ilvev_h(const3_m, const2_m); \
const3_m = __msa_ilvev_h(const4_m, const3_m); \
\
ILV_H_LRLR_SH(in7, in0, in3, in4, \
temp0_m, temp1_m, temp2_m, temp3_m); \
\
DOT_ADD_SUB_SRARI_PCK(temp0_m, temp1_m, temp2_m, temp3_m, \
const0_m, const1_m, const2_m, const3_m, \
in7, in0, in4, in3); \
\
const0_m = __msa_fill_h(cospi_10_64); \
const1_m = __msa_fill_h(cospi_22_64); \
const2_m = -const0_m; \
const0_m = __msa_ilvev_h(const1_m, const0_m); \
const1_m = __msa_ilvev_h(const2_m, const1_m); \
const2_m = __msa_fill_h(cospi_26_64); \
const3_m = __msa_fill_h(cospi_6_64); \
const4_m = -const2_m; \
const2_m = __msa_ilvev_h(const3_m, const2_m); \
const3_m = __msa_ilvev_h(const4_m, const3_m); \
\
ILV_H_LRLR_SH(in5, in2, in1, in6, \
temp0_m, temp1_m, temp2_m, temp3_m); \
\
DOT_ADD_SUB_SRARI_PCK(temp0_m, temp1_m, temp2_m, temp3_m, \
const0_m, const1_m, const2_m, const3_m, \
in5, in2, in6, in1); \
\
BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
out7 = -s0_m; \
out0 = s1_m; \
\
const1_m = __msa_fill_h(cospi_24_64); \
const0_m = __msa_fill_h(cospi_8_64); \
const3_m = -const1_m; \
const2_m = -const0_m; \
\
const3_m = __msa_ilvev_h(const0_m, const3_m); \
const2_m = __msa_ilvev_h(const2_m, const1_m); \
const0_m = __msa_ilvev_h(const1_m, const0_m); \
const1_m = const0_m; \
\
ILV_H_LRLR_SH(in3, in4, in1, in6, \
temp0_m, temp1_m, temp2_m, temp3_m); \
\
DOT_ADD_SUB_SRARI_PCK(temp0_m, temp1_m, temp2_m, temp3_m, \
const0_m, const2_m, const3_m, const1_m, \
out1, out6, s0_m, s1_m); \
\
const0_m = __msa_fill_h(cospi_16_64); \
const1_m = -const0_m; \
const1_m = __msa_ilvev_h(const1_m, const0_m); \
\
ILV_H_LRLR_SH(in5, in2, s1_m, s0_m, \
temp0_m, temp1_m, temp2_m, temp3_m); \
\
out3 = DOT_SHIFT_RIGHT_PCK_H(temp0_m, temp1_m, const0_m); \
out4 = DOT_SHIFT_RIGHT_PCK_H(temp0_m, temp1_m, const1_m); \
out2 = DOT_SHIFT_RIGHT_PCK_H(temp2_m, temp3_m, const0_m); \
out5 = DOT_SHIFT_RIGHT_PCK_H(temp2_m, temp3_m, const1_m); \
\
out1 = -out1; \
out3 = -out3; \
out5 = -out5; \
}
void vp9_idct8x8_64_add_msa(const int16_t *input, uint8_t *dest,
int32_t dest_stride) {
void vp9_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
int32_t dst_stride) {
v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
/* load vector elements of 8x8 block */
LOAD_8VECS_SH(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
/* rows transform */
TRANSPOSE8x8_H_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
/* 1D idct8x8 */
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
/* columns transform */
TRANSPOSE8x8_H_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
/* 1D idct8x8 */
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
/* final rounding (add 2^4, divide by 2^5) and shift */
SRARI_H_4VECS_SH(in0, in1, in2, in3, in0, in1, in2, in3, 5);
SRARI_H_4VECS_SH(in4, in5, in6, in7, in4, in5, in6, in7, 5);
SRARI_H4_SH(in0, in1, in2, in3, 5);
SRARI_H4_SH(in4, in5, in6, in7, 5);
/* add block and store 8x8 */
VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, in0, in1, in2, in3);
dest += (4 * dest_stride);
VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, in4, in5, in6, in7);
VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
dst += (4 * dst_stride);
VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
}
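The function keeps the standard two-pass structure: load, transpose, 1-D IDCT across rows, transpose again, 1-D IDCT across columns, final rounding, then add to the destination block. As a scalar model of the per-lane arithmetic behind SRARI_H4_SH(..., 5) (shift right arithmetic rounded by immediate; the helper name is invented):

  static int16_t round_shift5(int16_t x) {
    return (int16_t)((x + (1 << 4)) >> 5);  /* add 2^4, divide by 2^5 */
  }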
void vp9_idct8x8_12_add_msa(const int16_t *input, uint8_t *dest,
int32_t dest_stride) {
void vp9_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
int32_t dst_stride) {
v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
v8i16 s0, s1, s2, s3, s4, s5, s6, s7;
v8i16 k0, k1, k2, k3, m0, m1, m2, m3;
v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
v4i32 tmp0, tmp1, tmp2, tmp3;
v8i16 zero = { 0 };
/* load vector elements of 8x8 block */
LOAD_8VECS_SH(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE8X4_H(in0, in1, in2, in3, in0, in1, in2, in3);
LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
/* stage1 */
s0 = __msa_ilvl_h(in3, in0);
s1 = __msa_ilvl_h(in2, in1);
k0 = SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
k1 = SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
k2 = SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
k3 = SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
DOTP_S_W_4VECS_SW(s0, k0, s0, k1, s1, k2, s1, k3, tmp0, tmp1, tmp2, tmp3);
SRARI_W_4VECS_SW(tmp0, tmp1, tmp2, tmp3,
tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
PCKEV_H_2VECS_SH(zero, tmp0, zero, tmp1, s0, s1);
PCKEV_H_2VECS_SH(zero, tmp2, zero, tmp3, s2, s3);
ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
/* stage2 */
s0 = __msa_ilvr_h(in2, in0);
s1 = __msa_ilvr_h(in3, in1);
k0 = SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
k1 = SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
k2 = SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
k3 = SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
DOTP_S_W_4VECS_SW(s0, k0, s0, k1, s1, k2, s1, k3, tmp0, tmp1, tmp2, tmp3);
SRARI_W_4VECS_SW(tmp0, tmp1, tmp2, tmp3,
tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
PCKEV_H_2VECS_SH(zero, tmp0, zero, tmp1, s0, s1);
PCKEV_H_2VECS_SH(zero, tmp2, zero, tmp3, s2, s3);
ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
/* stage3 */
s0 = __msa_ilvr_h(s6, s5);
k1 = SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
tmp0 = __msa_dotp_s_w(s0, k1);
tmp1 = __msa_dotp_s_w(s0, k0);
tmp0 = __msa_srari_w(tmp0, DCT_CONST_BITS);
tmp1 = __msa_srari_w(tmp1, DCT_CONST_BITS);
PCKEV_H_2VECS_SH(zero, tmp0, zero, tmp1, s2, s3);
k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS);
PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
/* stage4 */
BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7,
in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE4X8_H(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
/* final rounding (add 2^4, divide by 2^5) and shift */
SRARI_H_4VECS_SH(in0, in1, in2, in3, in0, in1, in2, in3, 5);
SRARI_H_4VECS_SH(in4, in5, in6, in7, in4, in5, in6, in7, 5);
SRARI_H4_SH(in0, in1, in2, in3, 5);
SRARI_H4_SH(in4, in5, in6, in7, 5);
/* add block and store 8x8 */
VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, in0, in1, in2, in3);
dest += (4 * dest_stride);
VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, in4, in5, in6, in7);
VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
dst += (4 * dst_stride);
VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
}
void vp9_idct8x8_1_add_msa(const int16_t *input, uint8_t *dest,
int32_t dest_stride) {
void vp9_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst,
int32_t dst_stride) {
int16_t out;
int32_t const1;
v8i16 const2;
int32_t val;
v8i16 vec;
out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
const1 = ROUND_POWER_OF_TWO(out, 5);
const2 = __msa_fill_h(const1);
out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
val = ROUND_POWER_OF_TWO(out, 5);
vec = __msa_fill_h(val);
VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride,
const2, const2, const2, const2);
dest += (4 * dest_stride);
VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride,
const2, const2, const2, const2);
VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
dst += (4 * dst_stride);
VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
}
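For reference, a scalar sketch of what this DC-only path computes; clip_pixel, ROUND_POWER_OF_TWO, cospi_16_64 and DCT_CONST_BITS come in through the vp9_idct.h include chain, while the function name itself is made up:

  static void idct8x8_1_add_ref(const int16_t *input, uint8_t *dst,
                                int32_t dst_stride) {
    int i, j;
    int16_t out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, DCT_CONST_BITS);
    int16_t val;
    out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
    val = ROUND_POWER_OF_TWO(out, 5);
    for (i = 0; i < 8; ++i, dst += dst_stride)
      for (j = 0; j < 8; ++j)
        dst[j] = clip_pixel(dst[j] + val);  /* same add + clip as the store macro */
  }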
void vp9_iht8x8_64_add_msa(const int16_t *input, uint8_t *dest,
int32_t dest_stride, int32_t tx_type) {
void vp9_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst,
int32_t dst_stride, int32_t tx_type) {
v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
/* load vector elements of 8x8 block */
LOAD_8VECS_SH(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE8x8_H_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
switch (tx_type) {
case DCT_DCT:
/* DCT in horizontal */
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
/* DCT in vertical */
TRANSPOSE8x8_H_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
break;
@@ -540,21 +140,19 @@ void vp9_iht8x8_64_add_msa(const int16_t *input, uint8_t *dest,
/* DCT in horizontal */
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
/* ADST in vertical */
TRANSPOSE8x8_H_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
break;
case DCT_ADST:
/* ADST in horizontal */
VP9_ADST8_ROW(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
/* DCT in vertical */
TRANSPOSE8x8_H_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
break;
@@ -562,10 +160,9 @@ void vp9_iht8x8_64_add_msa(const int16_t *input, uint8_t *dest,
/* ADST in horizontal */
VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
/* ADST in vertical */
TRANSPOSE8x8_H_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
in0, in1, in2, in3, in4, in5, in6, in7);
break;
@@ -575,11 +172,11 @@ void vp9_iht8x8_64_add_msa(const int16_t *input, uint8_t *dest,
}
/* final rounding (add 2^4, divide by 2^5) and shift */
SRARI_H_4VECS_SH(in0, in1, in2, in3, in0, in1, in2, in3, 5);
SRARI_H_4VECS_SH(in4, in5, in6, in7, in4, in5, in6, in7, 5);
SRARI_H4_SH(in0, in1, in2, in3, 5);
SRARI_H4_SH(in4, in5, in6, in7, 5);
/* add block and store 8x8 */
VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, in0, in1, in2, in3);
dest += (4 * dest_stride);
VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, in4, in5, in6, in7);
VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
dst += (4 * dst_stride);
VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
}

vp9/common/mips/msa/vp9_idct_msa.h (new file)

@@ -0,0 +1,483 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_
#define VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_
#include <assert.h>
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
v8i16 k0_m = __msa_fill_h(cnst0); \
v4i32 s0_m, s1_m, s2_m, s3_m; \
\
s0_m = (v4i32)__msa_fill_h(cnst1); \
k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
\
ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
\
DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
}
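/* Net per-lane effect of VP9_DOTP_CONST_PAIR, written out in scalar form:
 *   out0 = (reg0 * cnst0 - reg1 * cnst1 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS
 *   out1 = (reg0 * cnst1 + reg1 * cnst0 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS */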
#define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \
dst0, dst1, dst2, dst3) { \
v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
\
DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \
tp0_m, tp2_m, tp3_m, tp4_m); \
DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \
tp5_m, tp6_m, tp7_m, tp8_m); \
BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \
dst0, dst1, dst2, dst3); \
}
#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \
v8i16 dst_m; \
v4i32 tp0_m, tp1_m; \
\
DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
\
dst_m; \
})
#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \
-cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 }; \
\
SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
cnst2_m = -cnst0_m; \
ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
cnst4_m = -cnst2_m; \
ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
\
ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
cnst1_m, cnst2_m, cnst3_m, in7, in0, \
in4, in3); \
\
SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
cnst2_m = -cnst0_m; \
ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
cnst4_m = -cnst2_m; \
ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
\
ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
\
VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
cnst1_m, cnst2_m, cnst3_m, in5, in2, \
in6, in1); \
BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
out7 = -s0_m; \
out0 = s1_m; \
\
SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, \
cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
\
ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
cnst1_m = cnst0_m; \
\
ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
cnst2_m, cnst3_m, cnst1_m, out1, out6, \
s0_m, s1_m); \
\
SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
\
ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
\
out1 = -out1; \
out3 = -out3; \
out5 = -out5; \
}
#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \
v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
v8i16 madd_s0_m, madd_s1_m; \
\
ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \
c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \
SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
}
#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
out0, out1, out2, out3) { \
v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
\
ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
m4_m, m5_m, tmp3_m, tmp2_m); \
SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
m4_m, m5_m, tmp3_m, tmp2_m); \
SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
}
#define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({ \
v8i16 out0_m, r0_m, r1_m; \
\
r0_m = __msa_fill_h(c0_h); \
r1_m = __msa_fill_h(c1_h); \
out0_m = __msa_ilvev_h(r1_m, r0_m); \
\
out0_m; \
})
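/* i.e. VP9_SET_COSPI_PAIR(a, b) returns the v8i16 { a, b, a, b, a, b, a, b },
 * ready to be dot-multiplied against an interleaved pair of input vectors. */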
#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) { \
uint8_t *dst_m = (uint8_t *) (dst); \
v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
v16i8 tmp0_m, tmp1_m; \
v16i8 zero_m = { 0 }; \
v8i16 res0_m, res1_m, res2_m, res3_m; \
\
LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, \
zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m); \
ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, \
res0_m, res1_m, res2_m, res3_m); \
CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
}
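/* Loads four 8-pixel rows from dst, zero-extends them to 16 bits, adds the
 * four residue vectors, clips the sums to [0, 255], packs back to bytes and
 * stores the resulting 8x4 block. */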
#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \
v8i16 c0_m, c1_m, c2_m, c3_m; \
v8i16 step0_m, step1_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\
c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
step0_m = __msa_ilvr_h(in2, in0); \
DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \
\
c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
step1_m = __msa_ilvr_h(in3, in1); \
DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \
SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
\
PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, \
(v8i16)tmp2_m, (v8i16)tmp3_m, \
out0, out1, out2, out3); \
}
#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \
v8i16 res0_m, res1_m, c0_m, c1_m; \
v8i16 k1_m, k2_m, k3_m, k4_m; \
v8i16 zero_m = { 0 }; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v4i32 int0_m, int1_m, int2_m, int3_m; \
v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, \
sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, \
-sinpi_4_9 }; \
\
SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \
int0_m = tmp2_m + tmp1_m; \
\
SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \
ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \
DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
int1_m = tmp0_m + tmp1_m; \
\
c0_m = __msa_splati_h(mask_m, 6); \
ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \
ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
int2_m = tmp0_m + tmp1_m; \
\
c0_m = __msa_splati_h(mask_m, 6); \
c0_m = __msa_ilvev_h(c0_m, k1_m); \
\
res0_m = __msa_ilvr_h((in1), (in3)); \
tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \
int3_m = tmp2_m + tmp0_m; \
\
res0_m = __msa_ilvr_h((in2), (in3)); \
c1_m = __msa_ilvev_h(k4_m, k3_m); \
\
tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \
res1_m = __msa_ilvr_h((in0), (in2)); \
c1_m = __msa_ilvev_h(k1_m, zero_m); \
\
tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \
int3_m += tmp2_m; \
int3_m += tmp3_m; \
\
SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
}
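/* 4-point inverse ADST built from the sinpi_{1,2,3,4}_9 constants; the four
 * rounded 32-bit accumulators are packed back down to 16-bit outputs. */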
#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({ \
v8i16 c0_m, c1_m; \
\
SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
c0_m = __msa_ilvev_h(c1_m, c0_m); \
\
c0_m; \
})
/* multiply and add macro */
#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
out0, out1, out2, out3) { \
v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\
ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \
cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \
cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \
}
/* idct 8x8 macro */
#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \
\
k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \
k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \
k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \
k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \
VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
SUB2(in1, in3, in7, in5, res0_m, res1_m); \
k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \
k1_m = __msa_splati_h(mask_m, 4); \
\
ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \
DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
tp4_m = in1 + in3; \
PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \
tp7_m = in7 + in5; \
k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \
in0, in4, in2, in6); \
BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \
out0, out1, out2, out3, out4, out5, out6, out7); \
}
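/* One 8-point IDCT pass: the odd inputs (in1, in3, in5, in7) go through the
 * rotations above into tp4_m..tp7_m, the even inputs (in0, in2, in4, in6)
 * through VP9_MADD plus a butterfly into tp0_m..tp3_m, and the closing
 * BUTTERFLY_8 merges the two halves into out0..out7. */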
#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \
cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \
cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \
-cospi_16_64, 0, 0, 0, 0 }; \
\
k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \
k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \
ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
r0_m, r1_m, r2_m, r3_m); \
k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \
k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \
ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
r4_m, r5_m, r6_m, r7_m); \
ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
m0_m, m1_m, m2_m, m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
m0_m, m1_m, m2_m, m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \
k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \
ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
r0_m, r1_m, r2_m, r3_m); \
k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \
k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \
ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
r4_m, r5_m, r6_m, r7_m); \
ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
m0_m, m1_m, m2_m, m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
m0_m, m1_m, m2_m, m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \
k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \
k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \
ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
r0_m, r1_m, r2_m, r3_m); \
k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \
DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \
r4_m, r5_m, r6_m, r7_m); \
ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \
m0_m, m1_m, m2_m, m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \
m0_m, m1_m, m2_m, m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \
k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \
ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
m0_m, m1_m, m2_m, m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \
m0_m, m1_m, m2_m, m3_m); \
SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
\
out1 = -in1; \
out3 = -in3; \
out5 = -in5; \
out7 = -in7; \
}
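/* 8-point inverse ADST; as in the scalar iadst8 reference, the odd-index
 * outputs are negated at the end. */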
#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \
r9, r10, r11, r12, r13, r14, r15, \
out0, out1, out2, out3, out4, out5, \
out6, out7, out8, out9, out10, out11, \
out12, out13, out14, out15) { \
v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
v8i16 h8_m, h9_m, h10_m, h11_m; \
v8i16 k0_m, k1_m, k2_m, k3_m; \
\
/* stage 1 */ \
k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \
g0_m, g1_m, g2_m, g3_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \
g4_m, g5_m, g6_m, g7_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \
g8_m, g9_m, g10_m, g11_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \
g12_m, g13_m, g14_m, g15_m); \
\
/* stage 2 */ \
k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \
h0_m, h1_m, h2_m, h3_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \
h4_m, h5_m, h6_m, h7_m); \
BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \
h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
\
/* stage 3 */ \
BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \
out4, out6, out5, out7); \
VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \
out12, out14, out13, out15); \
\
/* stage 4 */ \
k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
}
#endif /* VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_ */

vp9/common/vp9_rtcd_defs.pl

@@ -425,42 +425,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
specialize qw/vp9_idct8x8_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64";
specialize qw/vp9_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_256_add sse2 neon dspr2/;
specialize qw/vp9_idct16x16_256_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_10_add sse2 neon dspr2/;
specialize qw/vp9_idct16x16_10_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2/;
specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2 msa/;
#is this a typo?
$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/;
specialize qw/vp9_idct32x32_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/;
specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
# dct and add
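
As a sketch of what the added msa tokens buy downstream (the dispatch plumbing is generated at build time): rtcd turns each prototype into a selectable symbol in vp9_rtcd.h, so a plain call such as

  vp9_idct8x8_64_add(input, dest, stride);

resolves to vp9_idct8x8_64_add_msa() on MIPS targets built with MSA support, and to the C or other SIMD versions elsewhere.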

vp9/vp9_common.mk

@@ -132,6 +132,10 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_ds
# common (msa)
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct_msa.h
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h