Merge "mips msa vpx convolve optimization"
This commit is contained in:
commit
814e1346a6
@ -323,7 +323,7 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
|
||||
int8_t *filter) {
|
||||
v16i8 src0, src1, src2, src3, mask;
|
||||
v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
|
||||
v8u16 vec2, vec3, const255, filt;
|
||||
v8u16 vec2, vec3, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[16]);
|
||||
|
||||
@ -331,14 +331,11 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
|
||||
|
||||
const255 = (v8u16)__msa_ldi_h(255);
|
||||
|
||||
LD_SB4(src, src_stride, src0, src1, src2, src3);
|
||||
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
|
||||
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
|
||||
SRARI_H2_UH(vec2, vec3, FILTER_BITS);
|
||||
MIN_UH2_UH(vec2, vec3, const255);
|
||||
PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
|
||||
ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
|
||||
AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
|
||||
@ -353,7 +350,7 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
|
||||
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
|
||||
v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
|
||||
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
|
||||
v8u16 vec4, vec5, vec6, vec7, const255, filt;
|
||||
v8u16 vec4, vec5, vec6, vec7, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[16]);
|
||||
|
||||
@ -361,8 +358,6 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
|
||||
|
||||
const255 = (v8u16)__msa_ldi_h(255);
|
||||
|
||||
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
|
||||
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
|
||||
@ -370,7 +365,6 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
|
||||
vec6, vec7);
|
||||
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
|
||||
MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
|
||||
PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
|
||||
res3);
|
||||
ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
|
||||
@ -402,7 +396,7 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
|
||||
int8_t *filter) {
|
||||
v16i8 src0, src1, src2, src3, mask;
|
||||
v16u8 filt0, dst0, dst1, dst2, dst3;
|
||||
v8u16 vec0, vec1, vec2, vec3, const255, filt;
|
||||
v8u16 vec0, vec1, vec2, vec3, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[0]);
|
||||
|
||||
@ -410,8 +404,6 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
|
||||
|
||||
const255 = (v8u16)__msa_ldi_h(255);
|
||||
|
||||
LD_SB4(src, src_stride, src0, src1, src2, src3);
|
||||
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
|
||||
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
|
||||
@ -419,7 +411,6 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
|
||||
vec2, vec3);
|
||||
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
|
||||
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
|
||||
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
|
||||
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
|
||||
dst, dst_stride);
|
||||
}
|
||||
@ -432,7 +423,7 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
|
||||
int32_t height) {
|
||||
v16i8 src0, src1, src2, src3, mask;
|
||||
v16u8 filt0, dst0, dst1, dst2, dst3;
|
||||
v8u16 vec0, vec1, vec2, vec3, const255, filt;
|
||||
v8u16 vec0, vec1, vec2, vec3, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[0]);
|
||||
|
||||
@ -440,8 +431,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
|
||||
|
||||
const255 = (v8u16)__msa_ldi_h(255);
|
||||
|
||||
LD_SB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
|
||||
@ -450,7 +439,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
|
||||
vec2, vec3);
|
||||
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
|
||||
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
|
||||
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
|
||||
LD_SB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
|
||||
@ -463,7 +451,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
|
||||
vec2, vec3);
|
||||
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
|
||||
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
|
||||
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
|
||||
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
|
||||
dst, dst_stride);
|
||||
dst += (4 * dst_stride);
|
||||
@ -478,7 +465,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
|
||||
vec2, vec3);
|
||||
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
|
||||
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
|
||||
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
|
||||
LD_SB4(src, src_stride, src0, src1, src2, src3);
|
||||
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
|
||||
dst, dst_stride);
|
||||
@ -490,7 +476,6 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
|
||||
vec2, vec3);
|
||||
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
|
||||
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
|
||||
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
|
||||
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
|
||||
dst, dst_stride);
|
||||
}
|
||||
@ -520,7 +505,7 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
|
||||
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
|
||||
v16u8 filt0, dst0, dst1, dst2, dst3;
|
||||
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
|
||||
v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
|
||||
v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[0]);
|
||||
|
||||
@ -528,8 +513,6 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
|
||||
|
||||
const255 = (v8u16)__msa_ldi_h(255);
|
||||
|
||||
LD_SB4(src, src_stride, src0, src2, src4, src6);
|
||||
LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
|
||||
src += (4 * src_stride);
|
||||
@ -545,8 +528,6 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
|
||||
SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
|
||||
SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
|
||||
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
|
||||
MIN_UH4_UH(res0, res1, res2, res3, const255);
|
||||
MIN_UH4_UH(res4, res5, res6, res7, const255);
|
||||
PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
|
||||
dst += dst_stride;
|
||||
PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
|
||||
@ -572,8 +553,6 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
|
||||
SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
|
||||
SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
|
||||
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
|
||||
MIN_UH4_UH(res0, res1, res2, res3, const255);
|
||||
MIN_UH4_UH(res4, res5, res6, res7, const255);
|
||||
PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
|
||||
dst += dst_stride;
|
||||
PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
|
||||
@ -595,7 +574,7 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
|
||||
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
|
||||
v16u8 filt0, dst0, dst1, dst2, dst3;
|
||||
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
|
||||
v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
|
||||
v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[0]);
|
||||
|
||||
@ -603,8 +582,6 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
|
||||
|
||||
const255 = (v8u16)__msa_ldi_h(255);
|
||||
|
||||
for (loop_cnt = (height >> 1); loop_cnt--;) {
|
||||
src0 = LD_SB(src);
|
||||
src2 = LD_SB(src + 16);
|
||||
@ -627,8 +604,6 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
|
||||
res6, res7);
|
||||
SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
|
||||
SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
|
||||
MIN_UH4_UH(res0, res1, res2, res3, const255);
|
||||
MIN_UH4_UH(res4, res5, res6, res7, const255);
|
||||
LD_UB2(dst, 16, dst0, dst1);
|
||||
PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
|
||||
PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
|
||||
@ -650,7 +625,7 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
|
||||
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
|
||||
v16u8 filt0, dst0, dst1, dst2, dst3;
|
||||
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
|
||||
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, const255, filt;
|
||||
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[0]);
|
||||
|
||||
@ -658,8 +633,6 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
|
||||
|
||||
const255 = (v8u16)__msa_ldi_h(255);
|
||||
|
||||
for (loop_cnt = height; loop_cnt--;) {
|
||||
LD_SB4(src, 16, src0, src2, src4, src6);
|
||||
src7 = LD_SB(src + 56);
|
||||
@ -677,8 +650,6 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
|
||||
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
|
||||
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
|
||||
LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
|
||||
MIN_UH4_UH(out0, out1, out2, out3, const255);
|
||||
MIN_UH4_UH(out4, out5, out6, out7, const255);
|
||||
PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
|
||||
PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
|
||||
PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
|
||||
|
@ -274,7 +274,6 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
|
||||
ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
|
||||
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
|
||||
AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
|
||||
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
|
||||
@ -323,7 +322,6 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
|
||||
tmp0, tmp1, tmp2, tmp3);
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
|
||||
res2, res3);
|
||||
AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
|
||||
@ -391,7 +389,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
|
||||
tmp3 = __msa_dotp_u_h(vec3, filt_vt);
|
||||
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
|
||||
dst, dst_stride);
|
||||
}
|
||||
@ -436,7 +433,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
|
||||
tmp1 = __msa_dotp_u_h(vec0, filt_vt);
|
||||
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
|
||||
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
|
||||
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
|
||||
@ -447,7 +443,6 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
|
||||
tmp3 = __msa_dotp_u_h(vec0, filt_vt);
|
||||
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
|
||||
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
|
||||
dst, dst_stride);
|
||||
@ -511,7 +506,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
|
||||
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
@ -520,7 +514,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
|
||||
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
@ -529,7 +522,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
|
||||
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
@ -538,7 +530,6 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
|
||||
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
|
||||
dst += dst_stride;
|
||||
}
|
||||
|
@ -283,7 +283,6 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
|
||||
ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
|
||||
DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
|
||||
out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
|
||||
out = __msa_aver_u_b(out, dst0);
|
||||
@ -323,7 +322,6 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
|
||||
DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
|
||||
tmp0, tmp1, tmp2, tmp3);
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
|
||||
AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
|
||||
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
|
||||
@ -365,7 +363,6 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
|
||||
tmp2, tmp3);
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
|
||||
dst, dst_stride);
|
||||
}
|
||||
@ -402,7 +399,6 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
|
||||
tmp2, tmp3);
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
|
||||
dst, dst_stride);
|
||||
dst += (4 * dst_stride);
|
||||
@ -410,7 +406,6 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
|
||||
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
|
||||
tmp2, tmp3);
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
|
||||
dst, dst_stride);
|
||||
dst += (4 * dst_stride);
|
||||
@ -460,7 +455,6 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
|
||||
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
@ -468,19 +462,16 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
|
||||
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
|
||||
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
@ -519,48 +510,40 @@ static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
|
||||
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
|
||||
|
||||
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
|
||||
|
||||
ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
|
||||
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
|
||||
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
|
||||
|
||||
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
|
||||
|
||||
ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
|
||||
ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
|
||||
|
||||
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
|
||||
|
||||
ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
|
||||
ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
|
||||
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
|
||||
|
||||
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
|
||||
dst += (4 * dst_stride);
|
||||
|
||||
@ -605,48 +588,40 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
|
||||
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
|
||||
|
||||
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
|
||||
|
||||
ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
|
||||
ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
|
||||
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
|
||||
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp4, tmp5, 7);
|
||||
PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
|
||||
|
||||
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
|
||||
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp6, tmp7, 7);
|
||||
PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
|
||||
|
||||
ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
|
||||
ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
|
||||
|
||||
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
|
||||
|
||||
ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
|
||||
ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
|
||||
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
|
||||
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp4, tmp5, 7);
|
||||
PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
|
||||
|
||||
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
|
||||
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp6, tmp7, 7);
|
||||
PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
|
||||
dst += (2 * dst_stride);
|
||||
|
||||
|
@ -318,7 +318,7 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
|
||||
int8_t *filter) {
|
||||
v16i8 src0, src1, src2, src3, mask;
|
||||
v16u8 filt0, vec0, vec1, res0, res1;
|
||||
v8u16 vec2, vec3, filt, const255;
|
||||
v8u16 vec2, vec3, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[16]);
|
||||
|
||||
@ -326,13 +326,10 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
|
||||
|
||||
const255 = (v8u16) __msa_ldi_h(255);
|
||||
|
||||
LD_SB4(src, src_stride, src0, src1, src2, src3);
|
||||
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
|
||||
SRARI_H2_UH(vec2, vec3, FILTER_BITS);
|
||||
MIN_UH2_UH(vec2, vec3, const255);
|
||||
PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
|
||||
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
|
||||
}
|
||||
@ -343,7 +340,7 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
|
||||
v16u8 vec0, vec1, vec2, vec3, filt0;
|
||||
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
|
||||
v16i8 res0, res1, res2, res3;
|
||||
v8u16 vec4, vec5, vec6, vec7, filt, const255;
|
||||
v8u16 vec4, vec5, vec6, vec7, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[16]);
|
||||
|
||||
@ -351,15 +348,12 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
|
||||
|
||||
const255 = (v8u16) __msa_ldi_h(255);
|
||||
|
||||
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
|
||||
VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
|
||||
vec6, vec7);
|
||||
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
|
||||
MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
|
||||
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
|
||||
res2, res3);
|
||||
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
|
||||
@ -382,7 +376,7 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
|
||||
int8_t *filter) {
|
||||
v16u8 filt0;
|
||||
v16i8 src0, src1, src2, src3, mask;
|
||||
v8u16 vec0, vec1, vec2, vec3, const255, filt;
|
||||
v8u16 vec0, vec1, vec2, vec3, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[0]);
|
||||
|
||||
@ -390,15 +384,12 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
|
||||
|
||||
const255 = (v8u16) __msa_ldi_h(255);
|
||||
|
||||
LD_SB4(src, src_stride, src0, src1, src2, src3);
|
||||
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
|
||||
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
|
||||
vec2, vec3);
|
||||
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
|
||||
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
|
||||
PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
|
||||
ST8x4_UB(src0, src1, dst, dst_stride);
|
||||
}
|
||||
@ -408,7 +399,7 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
|
||||
int8_t *filter, int32_t height) {
|
||||
v16u8 filt0;
|
||||
v16i8 src0, src1, src2, src3, mask, out0, out1;
|
||||
v8u16 vec0, vec1, vec2, vec3, filt, const255;
|
||||
v8u16 vec0, vec1, vec2, vec3, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[0]);
|
||||
|
||||
@ -416,8 +407,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
|
||||
|
||||
const255 = (v8u16) __msa_ldi_h(255);
|
||||
|
||||
LD_SB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
|
||||
@ -426,7 +415,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
|
||||
vec2, vec3);
|
||||
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
|
||||
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
|
||||
|
||||
LD_SB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
@ -440,7 +428,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
|
||||
vec2, vec3);
|
||||
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
|
||||
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
|
||||
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
|
||||
ST8x4_UB(out0, out1, dst, dst_stride);
|
||||
dst += (4 * dst_stride);
|
||||
@ -454,7 +441,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
|
||||
vec2, vec3);
|
||||
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
|
||||
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
|
||||
LD_SB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
|
||||
@ -466,7 +452,6 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
|
||||
vec2, vec3);
|
||||
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
|
||||
MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
|
||||
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
|
||||
ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
|
||||
}
|
||||
@ -488,7 +473,7 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
|
||||
uint32_t loop_cnt;
|
||||
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
|
||||
v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
|
||||
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
|
||||
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[0]);
|
||||
|
||||
@ -498,8 +483,6 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
|
||||
|
||||
const255 = (v8u16) __msa_ldi_h(255);
|
||||
|
||||
LD_SB4(src, src_stride, src0, src2, src4, src6);
|
||||
LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
|
||||
src += (4 * src_stride);
|
||||
@ -514,8 +497,6 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
|
||||
out6, out7);
|
||||
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
|
||||
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
|
||||
MIN_UH4_UH(out0, out1, out2, out3, const255);
|
||||
MIN_UH4_UH(out4, out5, out6, out7, const255);
|
||||
PCKEV_ST_SB(out0, out1, dst);
|
||||
dst += dst_stride;
|
||||
PCKEV_ST_SB(out2, out3, dst);
|
||||
@ -540,8 +521,6 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
|
||||
out6, out7);
|
||||
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
|
||||
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
|
||||
MIN_UH4_UH(out0, out1, out2, out3, const255);
|
||||
MIN_UH4_UH(out4, out5, out6, out7, const255);
|
||||
PCKEV_ST_SB(out0, out1, dst);
|
||||
dst += dst_stride;
|
||||
PCKEV_ST_SB(out2, out3, dst);
|
||||
@ -559,7 +538,7 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
|
||||
uint32_t loop_cnt;
|
||||
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
|
||||
v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
|
||||
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
|
||||
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[0]);
|
||||
|
||||
@ -567,8 +546,6 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
|
||||
|
||||
const255 = (v8u16) __msa_ldi_h(255);
|
||||
|
||||
for (loop_cnt = height >> 1; loop_cnt--;) {
|
||||
src0 = LD_SB(src);
|
||||
src2 = LD_SB(src + 16);
|
||||
@ -591,8 +568,6 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
|
||||
out6, out7);
|
||||
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
|
||||
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
|
||||
MIN_UH4_UH(out0, out1, out2, out3, const255);
|
||||
MIN_UH4_UH(out4, out5, out6, out7, const255);
|
||||
PCKEV_ST_SB(out0, out1, dst);
|
||||
PCKEV_ST_SB(out2, out3, dst + 16);
|
||||
dst += dst_stride;
|
||||
@ -608,7 +583,7 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
|
||||
uint32_t loop_cnt;
|
||||
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
|
||||
v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
|
||||
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
|
||||
v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
|
||||
|
||||
mask = LD_SB(&mc_filt_mask_arr[0]);
|
||||
|
||||
@ -616,8 +591,6 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
|
||||
filt = LD_UH(filter);
|
||||
filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
|
||||
|
||||
const255 = (v8u16) __msa_ldi_h(255);
|
||||
|
||||
for (loop_cnt = height; loop_cnt--;) {
|
||||
src0 = LD_SB(src);
|
||||
src2 = LD_SB(src + 16);
|
||||
@ -637,8 +610,6 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
|
||||
out6, out7);
|
||||
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
|
||||
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
|
||||
MIN_UH4_UH(out0, out1, out2, out3, const255);
|
||||
MIN_UH4_UH(out4, out5, out6, out7, const255);
|
||||
PCKEV_ST_SB(out0, out1, dst);
|
||||
PCKEV_ST_SB(out2, out3, dst + 16);
|
||||
PCKEV_ST_SB(out4, out5, dst + 32);
|
||||
|
@ -256,7 +256,6 @@ static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
|
||||
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
|
||||
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
|
||||
}
|
||||
@ -298,7 +297,6 @@ static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
|
||||
vec4, vec5, vec6, vec7);
|
||||
SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
|
||||
SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
|
||||
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
|
||||
res2, res3);
|
||||
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
|
||||
@ -357,7 +355,6 @@ static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
|
||||
tmp3 = __msa_dotp_u_h(vec3, filt_vt);
|
||||
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
|
||||
ST8x4_UB(out0, out1, dst, dst_stride);
|
||||
}
|
||||
@ -402,7 +399,6 @@ static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
|
||||
tmp2 = __msa_dotp_u_h(vec0, filt_vt);
|
||||
|
||||
SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp1, tmp2, 7);
|
||||
|
||||
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
|
||||
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
|
||||
@ -415,7 +411,6 @@ static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
|
||||
tmp4 = __msa_dotp_u_h(vec0, filt_vt);
|
||||
|
||||
SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp3, tmp4, 7);
|
||||
PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
|
||||
ST8x4_UB(out0, out1, dst, dst_stride);
|
||||
dst += (4 * dst_stride);
|
||||
@ -437,7 +432,6 @@ static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
|
||||
tmp8 = __msa_dotp_u_h(vec0, filt_vt);
|
||||
|
||||
SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
|
||||
PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
|
||||
ST8x4_UB(out0, out1, dst, dst_stride);
|
||||
dst += (4 * dst_stride);
|
||||
@ -492,7 +486,6 @@ static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
|
||||
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
|
||||
SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp1, tmp2, 7);
|
||||
PCKEV_ST_SB(tmp1, tmp2, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
@ -501,7 +494,6 @@ static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
|
||||
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
|
||||
SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp1, tmp2, 7);
|
||||
PCKEV_ST_SB(tmp1, tmp2, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
@ -510,7 +502,6 @@ static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
|
||||
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
|
||||
SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp1, tmp2, 7);
|
||||
PCKEV_ST_SB(tmp1, tmp2, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
@ -519,7 +510,6 @@ static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
|
||||
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
|
||||
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
|
||||
SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp1, tmp2, 7);
|
||||
PCKEV_ST_SB(tmp1, tmp2, dst);
|
||||
dst += dst_stride;
|
||||
}
|
||||
|
@ -316,7 +316,6 @@ static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
|
||||
ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
|
||||
DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
|
||||
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
|
||||
}
|
||||
@ -349,7 +348,6 @@ static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
|
||||
DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
|
||||
tmp0, tmp1, tmp2, tmp3);
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
|
||||
ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
|
||||
ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
|
||||
@ -383,7 +381,6 @@ static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
|
||||
tmp2, tmp3);
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
|
||||
ST8x4_UB(out0, out1, dst, dst_stride);
|
||||
}
|
||||
@ -416,7 +413,6 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
|
||||
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
|
||||
tmp2, tmp3);
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
|
||||
ST8x4_UB(out0, out1, dst, dst_stride);
|
||||
dst += (4 * dst_stride);
|
||||
@ -424,7 +420,6 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
|
||||
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
|
||||
tmp2, tmp3);
|
||||
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
|
||||
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
|
||||
ST8x4_UB(out0, out1, dst, dst_stride);
|
||||
dst += (4 * dst_stride);
|
||||
@ -467,7 +462,6 @@ static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
|
||||
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_ST_SB(tmp0, tmp1, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
@ -475,19 +469,16 @@ static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
|
||||
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
|
||||
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_ST_SB(tmp2, tmp3, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_ST_SB(tmp0, tmp1, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_ST_SB(tmp2, tmp3, dst);
|
||||
dst += dst_stride;
|
||||
|
||||
@ -522,47 +513,39 @@ static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
|
||||
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_ST_SB(tmp0, tmp1, dst);
|
||||
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
|
||||
|
||||
ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
|
||||
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
|
||||
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
|
||||
|
||||
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
|
||||
|
||||
ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
|
||||
ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_ST_SB(tmp0, tmp1, dst + 16);
|
||||
|
||||
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
|
||||
|
||||
ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
|
||||
ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
|
||||
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
|
||||
|
||||
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
|
||||
dst += (4 * dst_stride);
|
||||
|
||||
@ -598,48 +581,40 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
|
||||
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_ST_SB(tmp0, tmp1, dst);
|
||||
|
||||
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
|
||||
|
||||
ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
|
||||
ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
|
||||
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
|
||||
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp4, tmp5, 7);
|
||||
PCKEV_ST_SB(tmp4, tmp5, dst + 16);
|
||||
|
||||
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
|
||||
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp6, tmp7, 7);
|
||||
PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
|
||||
|
||||
ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
|
||||
ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
|
||||
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
|
||||
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp0, tmp1, 7);
|
||||
PCKEV_ST_SB(tmp0, tmp1, dst + 32);
|
||||
|
||||
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
|
||||
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp2, tmp3, 7);
|
||||
PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
|
||||
|
||||
ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
|
||||
ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
|
||||
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
|
||||
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp4, tmp5, 7);
|
||||
PCKEV_ST_SB(tmp4, tmp5, dst + 48);
|
||||
|
||||
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
|
||||
SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
|
||||
SAT_UH2_UH(tmp6, tmp7, 7);
|
||||
PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
|
||||
dst += (2 * dst_stride);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user