Merge "mips msa vpx convolve optimization"

This commit is contained in:
Parag Salasakar 2015-08-04 04:30:22 +00:00 committed by Gerrit Code Review
commit 814e1346a6
6 changed files with 14 additions and 141 deletions

View File

@ -323,7 +323,7 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
int8_t *filter) { int8_t *filter) {
v16i8 src0, src1, src2, src3, mask; v16i8 src0, src1, src2, src3, mask;
v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
v8u16 vec2, vec3, const255, filt; v8u16 vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[16]); mask = LD_SB(&mc_filt_mask_arr[16]);
@ -331,14 +331,11 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
SRARI_H2_UH(vec2, vec3, FILTER_BITS); SRARI_H2_UH(vec2, vec3, FILTER_BITS);
MIN_UH2_UH(vec2, vec3, const255);
PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
@ -353,7 +350,7 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
v8u16 vec4, vec5, vec6, vec7, const255, filt; v8u16 vec4, vec5, vec6, vec7, filt;
mask = LD_SB(&mc_filt_mask_arr[16]); mask = LD_SB(&mc_filt_mask_arr[16]);
@ -361,8 +358,6 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
@ -370,7 +365,6 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
vec6, vec7); vec6, vec7);
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
res3); res3);
ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
@ -402,7 +396,7 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
int8_t *filter) { int8_t *filter) {
v16i8 src0, src1, src2, src3, mask; v16i8 src0, src1, src2, src3, mask;
v16u8 filt0, dst0, dst1, dst2, dst3; v16u8 filt0, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, const255, filt; v8u16 vec0, vec1, vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
@ -410,8 +404,6 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
@ -419,7 +411,6 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
dst, dst_stride); dst, dst_stride);
} }
@ -432,7 +423,7 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
int32_t height) { int32_t height) {
v16i8 src0, src1, src2, src3, mask; v16i8 src0, src1, src2, src3, mask;
v16u8 filt0, dst0, dst1, dst2, dst3; v16u8 filt0, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, const255, filt; v8u16 vec0, vec1, vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
@ -440,8 +431,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@ -450,7 +439,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
@ -463,7 +451,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
dst, dst_stride); dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
@ -478,7 +465,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
dst, dst_stride); dst, dst_stride);
@ -490,7 +476,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
dst, dst_stride); dst, dst_stride);
} }
@ -520,7 +505,7 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3; v16u8 filt0, dst0, dst1, dst2, dst3;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt; v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
mask = LD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
@ -528,8 +513,6 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src2, src4, src6); LD_SB4(src, src_stride, src0, src2, src4, src6);
LD_SB4(src + 8, src_stride, src1, src3, src5, src7); LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
src += (4 * src_stride); src += (4 * src_stride);
@ -545,8 +528,6 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(res0, res1, res2, res3, const255);
MIN_UH4_UH(res4, res5, res6, res7, const255);
PCKEV_AVG_ST_UB(res1, res0, dst0, dst); PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
dst += dst_stride; dst += dst_stride;
PCKEV_AVG_ST_UB(res3, res2, dst1, dst); PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
@ -572,8 +553,6 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
MIN_UH4_UH(res0, res1, res2, res3, const255);
MIN_UH4_UH(res4, res5, res6, res7, const255);
PCKEV_AVG_ST_UB(res1, res0, dst0, dst); PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
dst += dst_stride; dst += dst_stride;
PCKEV_AVG_ST_UB(res3, res2, dst1, dst); PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
@ -595,7 +574,7 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3; v16u8 filt0, dst0, dst1, dst2, dst3;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt; v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
mask = LD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
@ -603,8 +582,6 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
for (loop_cnt = (height >> 1); loop_cnt--;) { for (loop_cnt = (height >> 1); loop_cnt--;) {
src0 = LD_SB(src); src0 = LD_SB(src);
src2 = LD_SB(src + 16); src2 = LD_SB(src + 16);
@ -627,8 +604,6 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
res6, res7); res6, res7);
SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
MIN_UH4_UH(res0, res1, res2, res3, const255);
MIN_UH4_UH(res4, res5, res6, res7, const255);
LD_UB2(dst, 16, dst0, dst1); LD_UB2(dst, 16, dst0, dst1);
PCKEV_AVG_ST_UB(res1, res0, dst0, dst); PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
@ -650,7 +625,7 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, dst0, dst1, dst2, dst3; v16u8 filt0, dst0, dst1, dst2, dst3;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, const255, filt; v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
mask = LD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
@ -658,8 +633,6 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
const255 = (v8u16)__msa_ldi_h(255);
for (loop_cnt = height; loop_cnt--;) { for (loop_cnt = height; loop_cnt--;) {
LD_SB4(src, 16, src0, src2, src4, src6); LD_SB4(src, 16, src0, src2, src4, src6);
src7 = LD_SB(src + 56); src7 = LD_SB(src + 56);
@ -677,8 +650,6 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
LD_UB4(dst, 16, dst0, dst1, dst2, dst3); LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
MIN_UH4_UH(out0, out1, out2, out3, const255);
MIN_UH4_UH(out4, out5, out6, out7, const255);
PCKEV_AVG_ST_UB(out1, out0, dst0, dst); PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);

View File

@ -274,7 +274,6 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
@ -323,7 +322,6 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
tmp0, tmp1, tmp2, tmp3); tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
res2, res3); res2, res3);
AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
@ -391,7 +389,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
tmp3 = __msa_dotp_u_h(vec3, filt_vt); tmp3 = __msa_dotp_u_h(vec3, filt_vt);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
dst, dst_stride); dst, dst_stride);
} }
@ -436,7 +433,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
tmp1 = __msa_dotp_u_h(vec0, filt_vt); tmp1 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
@ -447,7 +443,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
tmp3 = __msa_dotp_u_h(vec0, filt_vt); tmp3 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
dst, dst_stride); dst, dst_stride);
@ -511,7 +506,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
dst += dst_stride; dst += dst_stride;
@ -520,7 +514,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
dst += dst_stride; dst += dst_stride;
@ -529,7 +522,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
dst += dst_stride; dst += dst_stride;
@ -538,7 +530,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
dst += dst_stride; dst += dst_stride;
} }

View File

@ -283,7 +283,6 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
out = __msa_aver_u_b(out, dst0); out = __msa_aver_u_b(out, dst0);
@ -323,7 +322,6 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3); tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
@ -365,7 +363,6 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3); tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
dst, dst_stride); dst, dst_stride);
} }
@ -402,7 +399,6 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3); tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
dst, dst_stride); dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
@ -410,7 +406,6 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3); tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
dst, dst_stride); dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
@ -460,7 +455,6 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
dst += dst_stride; dst += dst_stride;
@ -468,19 +462,16 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst); PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
dst += dst_stride; dst += dst_stride;
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
dst += dst_stride; dst += dst_stride;
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst); PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
dst += dst_stride; dst += dst_stride;
@ -519,48 +510,40 @@ static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride); PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride); PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16); PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride); PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride); PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride); PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
@ -605,48 +588,40 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
SAT_UH2_UH(tmp4, tmp5, 7);
PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16); PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
SAT_UH2_UH(tmp6, tmp7, 7);
PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride); PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32); PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride); PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
SAT_UH2_UH(tmp4, tmp5, 7);
PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48)); PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
SAT_UH2_UH(tmp6, tmp7, 7);
PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride); PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
dst += (2 * dst_stride); dst += (2 * dst_stride);

View File

@ -318,7 +318,7 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
int8_t *filter) { int8_t *filter) {
v16i8 src0, src1, src2, src3, mask; v16i8 src0, src1, src2, src3, mask;
v16u8 filt0, vec0, vec1, res0, res1; v16u8 filt0, vec0, vec1, res0, res1;
v8u16 vec2, vec3, filt, const255; v8u16 vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[16]); mask = LD_SB(&mc_filt_mask_arr[16]);
@ -326,13 +326,10 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
SRARI_H2_UH(vec2, vec3, FILTER_BITS); SRARI_H2_UH(vec2, vec3, FILTER_BITS);
MIN_UH2_UH(vec2, vec3, const255);
PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
} }
@ -343,7 +340,7 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
v16u8 vec0, vec1, vec2, vec3, filt0; v16u8 vec0, vec1, vec2, vec3, filt0;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16i8 res0, res1, res2, res3; v16i8 res0, res1, res2, res3;
v8u16 vec4, vec5, vec6, vec7, filt, const255; v8u16 vec4, vec5, vec6, vec7, filt;
mask = LD_SB(&mc_filt_mask_arr[16]); mask = LD_SB(&mc_filt_mask_arr[16]);
@ -351,15 +348,12 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
vec6, vec7); vec6, vec7);
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
res2, res3); res2, res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
@ -382,7 +376,7 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
int8_t *filter) { int8_t *filter) {
v16u8 filt0; v16u8 filt0;
v16i8 src0, src1, src2, src3, mask; v16i8 src0, src1, src2, src3, mask;
v8u16 vec0, vec1, vec2, vec3, const255, filt; v8u16 vec0, vec1, vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
@ -390,15 +384,12 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
ST8x4_UB(src0, src1, dst, dst_stride); ST8x4_UB(src0, src1, dst, dst_stride);
} }
@ -408,7 +399,7 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
int8_t *filter, int32_t height) { int8_t *filter, int32_t height) {
v16u8 filt0; v16u8 filt0;
v16i8 src0, src1, src2, src3, mask, out0, out1; v16i8 src0, src1, src2, src3, mask, out0, out1;
v8u16 vec0, vec1, vec2, vec3, filt, const255; v8u16 vec0, vec1, vec2, vec3, filt;
mask = LD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
@ -416,8 +407,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
@ -426,7 +415,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
@ -440,7 +428,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride); ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
@ -454,7 +441,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
@ -466,7 +452,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
vec2, vec3); vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
} }
@ -488,7 +473,7 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
mask = LD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
@ -498,8 +483,6 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
LD_SB4(src, src_stride, src0, src2, src4, src6); LD_SB4(src, src_stride, src0, src2, src4, src6);
LD_SB4(src + 8, src_stride, src1, src3, src5, src7); LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
src += (4 * src_stride); src += (4 * src_stride);
@ -514,8 +497,6 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
out6, out7); out6, out7);
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
MIN_UH4_UH(out0, out1, out2, out3, const255);
MIN_UH4_UH(out4, out5, out6, out7, const255);
PCKEV_ST_SB(out0, out1, dst); PCKEV_ST_SB(out0, out1, dst);
dst += dst_stride; dst += dst_stride;
PCKEV_ST_SB(out2, out3, dst); PCKEV_ST_SB(out2, out3, dst);
@ -540,8 +521,6 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
out6, out7); out6, out7);
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
MIN_UH4_UH(out0, out1, out2, out3, const255);
MIN_UH4_UH(out4, out5, out6, out7, const255);
PCKEV_ST_SB(out0, out1, dst); PCKEV_ST_SB(out0, out1, dst);
dst += dst_stride; dst += dst_stride;
PCKEV_ST_SB(out2, out3, dst); PCKEV_ST_SB(out2, out3, dst);
@ -559,7 +538,7 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
mask = LD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
@ -567,8 +546,6 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
for (loop_cnt = height >> 1; loop_cnt--;) { for (loop_cnt = height >> 1; loop_cnt--;) {
src0 = LD_SB(src); src0 = LD_SB(src);
src2 = LD_SB(src + 16); src2 = LD_SB(src + 16);
@ -591,8 +568,6 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
out6, out7); out6, out7);
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
MIN_UH4_UH(out0, out1, out2, out3, const255);
MIN_UH4_UH(out4, out5, out6, out7, const255);
PCKEV_ST_SB(out0, out1, dst); PCKEV_ST_SB(out0, out1, dst);
PCKEV_ST_SB(out2, out3, dst + 16); PCKEV_ST_SB(out2, out3, dst + 16);
dst += dst_stride; dst += dst_stride;
@ -608,7 +583,7 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
mask = LD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
@ -616,8 +591,6 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
filt = LD_UH(filter); filt = LD_UH(filter);
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
const255 = (v8u16) __msa_ldi_h(255);
for (loop_cnt = height; loop_cnt--;) { for (loop_cnt = height; loop_cnt--;) {
src0 = LD_SB(src); src0 = LD_SB(src);
src2 = LD_SB(src + 16); src2 = LD_SB(src + 16);
@ -637,8 +610,6 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
out6, out7); out6, out7);
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
MIN_UH4_UH(out0, out1, out2, out3, const255);
MIN_UH4_UH(out4, out5, out6, out7, const255);
PCKEV_ST_SB(out0, out1, dst); PCKEV_ST_SB(out0, out1, dst);
PCKEV_ST_SB(out2, out3, dst + 16); PCKEV_ST_SB(out2, out3, dst + 16);
PCKEV_ST_SB(out4, out5, dst + 32); PCKEV_ST_SB(out4, out5, dst + 32);

View File

@ -256,7 +256,6 @@ static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
} }
@ -298,7 +297,6 @@ static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
vec4, vec5, vec6, vec7); vec4, vec5, vec6, vec7);
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
res2, res3); res2, res3);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
@ -357,7 +355,6 @@ static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
tmp3 = __msa_dotp_u_h(vec3, filt_vt); tmp3 = __msa_dotp_u_h(vec3, filt_vt);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride); ST8x4_UB(out0, out1, dst, dst_stride);
} }
@ -402,7 +399,6 @@ static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
tmp2 = __msa_dotp_u_h(vec0, filt_vt); tmp2 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
SAT_UH2_UH(tmp1, tmp2, 7);
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
@ -415,7 +411,6 @@ static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
tmp4 = __msa_dotp_u_h(vec0, filt_vt); tmp4 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
SAT_UH2_UH(tmp3, tmp4, 7);
PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride); ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
@ -437,7 +432,6 @@ static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
tmp8 = __msa_dotp_u_h(vec0, filt_vt); tmp8 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride); ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
@ -492,7 +486,6 @@ static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
SAT_UH2_UH(tmp1, tmp2, 7);
PCKEV_ST_SB(tmp1, tmp2, dst); PCKEV_ST_SB(tmp1, tmp2, dst);
dst += dst_stride; dst += dst_stride;
@ -501,7 +494,6 @@ static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
SAT_UH2_UH(tmp1, tmp2, 7);
PCKEV_ST_SB(tmp1, tmp2, dst); PCKEV_ST_SB(tmp1, tmp2, dst);
dst += dst_stride; dst += dst_stride;
@ -510,7 +502,6 @@ static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
SAT_UH2_UH(tmp1, tmp2, 7);
PCKEV_ST_SB(tmp1, tmp2, dst); PCKEV_ST_SB(tmp1, tmp2, dst);
dst += dst_stride; dst += dst_stride;
@ -519,7 +510,6 @@ static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
SAT_UH2_UH(tmp1, tmp2, 7);
PCKEV_ST_SB(tmp1, tmp2, dst); PCKEV_ST_SB(tmp1, tmp2, dst);
dst += dst_stride; dst += dst_stride;
} }

View File

@ -316,7 +316,6 @@ static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
} }
@ -349,7 +348,6 @@ static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3); tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
@ -383,7 +381,6 @@ static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3); tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride); ST8x4_UB(out0, out1, dst, dst_stride);
} }
@ -416,7 +413,6 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3); tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride); ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
@ -424,7 +420,6 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
tmp2, tmp3); tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride); ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
@ -467,7 +462,6 @@ static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_ST_SB(tmp0, tmp1, dst); PCKEV_ST_SB(tmp0, tmp1, dst);
dst += dst_stride; dst += dst_stride;
@ -475,19 +469,16 @@ static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_ST_SB(tmp2, tmp3, dst); PCKEV_ST_SB(tmp2, tmp3, dst);
dst += dst_stride; dst += dst_stride;
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_ST_SB(tmp0, tmp1, dst); PCKEV_ST_SB(tmp0, tmp1, dst);
dst += dst_stride; dst += dst_stride;
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_ST_SB(tmp2, tmp3, dst); PCKEV_ST_SB(tmp2, tmp3, dst);
dst += dst_stride; dst += dst_stride;
@ -522,47 +513,39 @@ static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_ST_SB(tmp0, tmp1, dst); PCKEV_ST_SB(tmp0, tmp1, dst);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_ST_SB(tmp0, tmp1, dst + 16); PCKEV_ST_SB(tmp0, tmp1, dst + 16);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
@ -598,48 +581,40 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_ST_SB(tmp0, tmp1, dst); PCKEV_ST_SB(tmp0, tmp1, dst);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
SAT_UH2_UH(tmp4, tmp5, 7);
PCKEV_ST_SB(tmp4, tmp5, dst + 16); PCKEV_ST_SB(tmp4, tmp5, dst + 16);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
SAT_UH2_UH(tmp6, tmp7, 7);
PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_ST_SB(tmp0, tmp1, dst + 32); PCKEV_ST_SB(tmp0, tmp1, dst + 32);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
SAT_UH2_UH(tmp4, tmp5, 7);
PCKEV_ST_SB(tmp4, tmp5, dst + 48); PCKEV_ST_SB(tmp4, tmp5, dst + 48);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
SAT_UH2_UH(tmp6, tmp7, 7);
PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
dst += (2 * dst_stride); dst += (2 * dst_stride);