mips msa vp9 loopfilter 16 optimization

average improvement ~3x-4x

Change-Id: I8ef263da6ebcf8f20aabaefeccf25a84640ba048
This commit is contained in:
Parag Salasakar 2015-06-04 11:50:41 +05:30
parent c005792951
commit 914f8f9ee0
6 changed files with 1885 additions and 3 deletions

View File

@ -137,6 +137,20 @@ void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON_ASM
#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
// Adapter giving vp9_lpf_vertical_16_msa() the six-argument shape of the
// function pointers used in the test tuples. The MSA routine is called with
// five arguments only, so `count` is accepted and dropped.
void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh,
                             int count) {
  (void)count;  // Not forwarded: the callee takes no count argument.
  vp9_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
}
// Adapter giving the C reference vp9_lpf_vertical_16_c() the six-argument
// shape of the function pointers used in the test tuples. The reference
// routine is called with five arguments only, so `count` is accepted and
// dropped.
void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh,
                           int count) {
  (void)count;  // Not forwarded: the callee takes no count argument.
  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
}
#endif // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
public:
virtual ~Loop8Test6Param() {}
@ -676,4 +690,13 @@ INSTANTIATE_TEST_CASE_P(
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON
#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
// Registers the MSA loop-filter routines with the Loop8Test6Param suite, each
// paired with its C counterpart. The vertical 16 filter is passed through the
// wrapper_vertical_16_* adapters, which add the trailing `count` argument.
// NOTE(review): the last two tuple fields (8, then 1 or 2) are presumably the
// bit depth and the count value -- confirm against loop8_param_t.
INSTANTIATE_TEST_CASE_P(
MSA, Loop8Test6Param,
::testing::Values(
make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 1),
make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 2),
make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));
#endif // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
} // namespace

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,246 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
#define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
p1_out, p0_out, q0_out, q1_out) { \
v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
v8i16 q0_sub_p0_r, filt_r, cnst3h; \
\
p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
\
filt = __msa_subs_s_b(p1_m, q1_m); \
filt = filt & (v16i8)hev_in; \
q0_sub_p0 = q0_m - p0_m; \
filt_sign = __msa_clti_s_b(filt, 0); \
\
cnst3h = __msa_ldi_h(3); \
q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
filt_r += q0_sub_p0_r; \
filt_r = __msa_sat_s_h(filt_r, 7); \
\
/* combine left and right part */ \
filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \
\
filt = filt & (v16i8)mask_in; \
cnst4b = __msa_ldi_b(4); \
filt1 = __msa_adds_s_b(filt, cnst4b); \
filt1 >>= 3; \
\
cnst3b = __msa_ldi_b(3); \
filt2 = __msa_adds_s_b(filt, cnst3b); \
filt2 >>= 3; \
\
q0_m = __msa_subs_s_b(q0_m, filt1); \
q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
p0_m = __msa_adds_s_b(p0_m, filt2); \
p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
\
filt = __msa_srari_b(filt1, 1); \
hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
filt = filt & (v16i8)hev_in; \
\
q1_m = __msa_subs_s_b(q1_m, filt); \
q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
p1_m = __msa_adds_s_b(p1_m, filt); \
p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
}
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
p1_out, p0_out, q0_out, q1_out) { \
v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
\
p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
\
filt = __msa_subs_s_b(p1_m, q1_m); \
\
filt = filt & (v16i8)hev_in; \
\
q0_sub_p0 = q0_m - p0_m; \
filt_sign = __msa_clti_s_b(filt, 0); \
\
cnst3h = __msa_ldi_h(3); \
q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
filt_r += q0_sub_p0_r; \
filt_r = __msa_sat_s_h(filt_r, 7); \
\
q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
filt_l += q0_sub_p0_l; \
filt_l = __msa_sat_s_h(filt_l, 7); \
\
filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
filt = filt & (v16i8)mask_in; \
\
cnst4b = __msa_ldi_b(4); \
filt1 = __msa_adds_s_b(filt, cnst4b); \
filt1 >>= 3; \
\
cnst3b = __msa_ldi_b(3); \
filt2 = __msa_adds_s_b(filt, cnst3b); \
filt2 >>= 3; \
\
q0_m = __msa_subs_s_b(q0_m, filt1); \
q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
p0_m = __msa_adds_s_b(p0_m, filt2); \
p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
\
filt = __msa_srari_b(filt1, 1); \
hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
filt = filt & (v16i8)hev_in; \
\
q1_m = __msa_subs_s_b(q1_m, filt); \
q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
p1_m = __msa_adds_s_b(p1_m, filt); \
p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
}
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) { \
v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
v16u8 zero_in = { 0 }; \
\
tmp = __msa_ori_b(zero_in, 1); \
p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
\
p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
\
flat_out = (tmp < (v16u8)flat_out); \
flat_out = __msa_xori_b(flat_out, 0xff); \
flat_out = flat_out & (mask); \
}
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \
q5_in, q6_in, q7_in, flat_in, flat2_out) { \
v16u8 tmp, zero_in = { 0 }; \
v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
\
tmp = __msa_ori_b(zero_in, 1); \
p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
\
p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
\
flat2_out = (tmp < (v16u8)flat2_out); \
flat2_out = __msa_xori_b(flat2_out, 0xff); \
flat2_out = flat2_out & flat_in; \
}
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \
q0_in, q1_in, q2_in, q3_in, \
p2_filt8_out, p1_filt8_out, p0_filt8_out, \
q0_filt8_out, q1_filt8_out, q2_filt8_out) { \
v8u16 tmp0, tmp1, tmp2; \
\
tmp2 = p2_in + p1_in + p0_in; \
tmp0 = p3_in << 1; \
\
tmp0 = tmp0 + tmp2 + q0_in; \
tmp1 = tmp0 + p3_in + p2_in; \
p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
\
tmp1 = tmp0 + p1_in + q1_in; \
p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
\
tmp1 = q2_in + q1_in + q0_in; \
tmp2 = tmp2 + tmp1; \
tmp0 = tmp2 + (p0_in); \
tmp0 = tmp0 + (p3_in); \
p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \
\
tmp0 = q2_in + q3_in; \
tmp0 = p0_in + tmp1 + tmp0; \
tmp1 = q3_in + q3_in; \
tmp1 = tmp1 + tmp0; \
q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
\
tmp0 = tmp2 + q3_in; \
tmp1 = tmp0 + q0_in; \
q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
\
tmp1 = tmp0 - p2_in; \
tmp0 = q1_in + q3_in; \
tmp1 = tmp0 + tmp1; \
q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \
}
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
q0_in, q1_in, q2_in, q3_in, \
limit_in, b_limit_in, thresh_in, \
hev_out, mask_out, flat_out) { \
v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
\
/* absolute subtraction of pixel values */ \
p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
\
/* calculation of hev */ \
flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
hev_out = thresh_in < (v16u8)flat_out; \
\
/* calculation of mask */ \
p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
p1_asub_q1_m >>= 1; \
p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
\
mask_out = b_limit_in < p0_asub_q0_m; \
mask_out = __msa_max_u_b(flat_out, mask_out); \
p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
\
mask_out = limit_in < (v16u8)mask_out; \
mask_out = __msa_xori_b(mask_out, 0xff); \
}
#endif /* VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_ */

View File

@ -413,6 +413,33 @@
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
/* Description : Store as 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Halfword elements stidx, stidx+1, stidx+2 and stidx+3 of the
                 'in' vector are copied out and stored on the first, second,
                 third and fourth line respectively; consecutive lines are
                 'stride' bytes apart, starting at 'pdst'.
                 Each line is written with SH (defined elsewhere in this
                 header; presumably a 2-byte store).
                 The macro is a statement block and yields no value.
                 Every macro argument is parenthesized in the expansion so
                 that expression arguments (e.g. a stride of 'a + b') expand
                 with the intended precedence.
*/
#define ST2x4_UB(in, stidx, pdst, stride) { \
  uint16_t out0_m, out1_m, out2_m, out3_m; \
  uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
  \
  out0_m = __msa_copy_u_h((v8i16)(in), (stidx)); \
  out1_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 1)); \
  out2_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 2)); \
  out3_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 3)); \
  \
  SH(out0_m, pblk_2x4_m); \
  SH(out1_m, pblk_2x4_m + (stride)); \
  SH(out2_m, pblk_2x4_m + 2 * (stride)); \
  SH(out3_m, pblk_2x4_m + 3 * (stride)); \
}
/* Description : Store as 4x4 byte block to destination memory from input vector
Arguments : Inputs - in0, in1, pdst, stride
Return Type - unsigned byte
@ -518,6 +545,13 @@
}
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
/* Description : Four-vector form of SLDI_B2_0
   Arguments   : Inputs  - in0, in1, in2, in3, slide_val
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands to two SLDI_B2_0 invocations: the first handles
                 in0, in1 -> out0, out1 and the second in2, in3 -> out2, out3,
                 both with the same slide_val.
                 The pairs are processed in that order, which matters if an
                 output of the first pair is also an input of the second.
*/
#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
out0, out1, out2, out3, slide_val) { \
SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
/* Description : Immediate number of columns to slide
Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
Outputs - out0, out1
@ -754,6 +788,21 @@
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
                 Note that each pair is handed to __msa_ilvev_d with its
                 operands swapped, i.e. as (in1, in0) and (in3, in2).
                 out0 is assigned before in2 and in3 are read.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
@ -840,6 +889,16 @@
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
/* Description : Eight-pair form of ILVR_B4
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Expands to two ILVR_B4 invocations: in0..in7 produce
                 out0..out3 and in8..in15 produce out4..out7.
                 The first group is fully processed before the second.
*/
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3); \
ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
out4, out5, out6, out7); \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
/* Description : Interleave right half of halfword elements from vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
Outputs - out0, out1, out2, out3
@ -1301,6 +1360,78 @@
out7 = in0 - in7; \
}
/* Description : Transposes input 8x8 byte block
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
(input 8x8 byte block)
Outputs - out0, out1, out2, out3, out4, out5, out6, out7
(output 8x8 byte block)
Return Type - unsigned byte
Details :
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
\
ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Input vector N is first paired with input vector N + 8 by an
                 even double word interleave (ILVEV_D2_UB), so only the even
                 double word of each input contributes. The eight paired
                 vectors are then transposed with even/odd interleaves at
                 byte, halfword and word granularity.
                 out0..out7 are used as scratch storage from the very first
                 step, before all inputs have been read, so the outputs must
                 be variables distinct from the inputs.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3, out4, out5, out6, out7) { \
  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
  \
  /* pair input N with input N + 8; results land in out7 down to out0 */ \
  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
  \
  /* byte level: even bytes -> tmp0/tmp1/out5/out7, odd bytes -> tmp4..7 */ \
  tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
  tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
  tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
  tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
  out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
  tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
  out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
  tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
  \
  /* even bytes, even halfwords -> out0 / out4 */ \
  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
  out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
  out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
  \
  /* even bytes, odd halfwords -> out2 / out6 */ \
  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
  out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
  out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
  \
  /* odd bytes, even halfwords -> out1 / out5 */ \
  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
  out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
  out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
  \
  /* odd bytes, odd halfwords -> out3 / out7 */ \
  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
  out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
  out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
}
/* Description : Transposes 4x4 block with half word elements in vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1, out2, out3

View File

@ -216,11 +216,11 @@ specialize qw/vp9_dc_128_predictor_32x32/, "$sse2_x86inc";
# Loopfilter
#
add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2/;
specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
$vp9_lpf_vertical_16_neon_asm=vp9_lpf_vertical_16_neon;
add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2/;
specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
$vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon;
add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
@ -238,7 +238,7 @@ add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_
specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/;
add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/;
specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;
$vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon;
add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";

View File

@ -146,6 +146,8 @@ VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h