Add MIPS MSA versions of the vpx integer projection row/col functions

Average improvement: ~4x-5x.

Change-Id: I17c41383250282b39f5ecae0197ef1df7de20801
This commit is contained in:
Kaustubh Raste 2017-01-27 11:11:42 +05:30
parent c1553f859f
commit 407fad2356
4 changed files with 195 additions and 2 deletions

View File

@ -393,6 +393,20 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
// Registers IntProRowTest under the "MSA" prefix with three parameter tuples.
// Each tuple pairs a size (16, 32 or 64 - the heights vpx_int_pro_row_msa has
// dedicated vector paths for) with vpx_int_pro_row_msa and vpx_int_pro_row_c.
// NOTE(review): presumably the fixture treats the first function as the one
// under test and the second as the reference - confirm in IntProRowTest.
INSTANTIATE_TEST_CASE_P(
MSA, IntProRowTest,
::testing::Values(make_tuple(16, &vpx_int_pro_row_msa, &vpx_int_pro_row_c),
make_tuple(32, &vpx_int_pro_row_msa, &vpx_int_pro_row_c),
make_tuple(64, &vpx_int_pro_row_msa,
&vpx_int_pro_row_c)));
// Registers IntProColTest under the "MSA" prefix with three parameter tuples.
// Each tuple pairs a size (16, 32 or 64 - the widths vpx_int_pro_col_msa has
// dedicated vector paths for) with vpx_int_pro_col_msa and vpx_int_pro_col_c.
// NOTE(review): presumably the fixture treats the first function as the one
// under test and the second as the reference - confirm in IntProColTest.
INSTANTIATE_TEST_CASE_P(
MSA, IntProColTest,
::testing::Values(make_tuple(16, &vpx_int_pro_col_msa, &vpx_int_pro_col_c),
make_tuple(32, &vpx_int_pro_col_msa, &vpx_int_pro_col_c),
make_tuple(64, &vpx_int_pro_col_msa,
&vpx_int_pro_col_c)));
INSTANTIATE_TEST_CASE_P(MSA, SatdTest,
::testing::Values(make_tuple(16, &vpx_satd_msa),
make_tuple(64, &vpx_satd_msa),

View File

@ -389,3 +389,175 @@ int vpx_satd_msa(const int16_t *data, int length) {
return satd;
}
/* Integer projection onto the horizontal axis: adds up 'height' rows of 16
 * pixels column by column and writes every column total, divided by
 * (height / 2), to hbuf[0..15].
 *
 *   hbuf        receives the 16 normalised column totals
 *   ref         first pixel of the top row of the 16-pixel-wide region
 *   ref_stride  byte distance between the starts of consecutive rows
 *   height      number of rows to accumulate
 *
 * Heights 16, 32 and 64 use the MSA path, where the divisor is a power of two
 * and the division is carried out as a right shift by 3, 4 or 5. Any other
 * height uses the scalar path.
 * NOTE(review): the scalar path divides by (height >> 1), so a height below 2
 * would divide by zero - callers are assumed never to pass one; confirm.
 */
void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
                         const int ref_stride, const int height) {
  /* Shift that replaces the division on the vector path; 0 = scalar path. */
  int norm_shift = 0;

  switch (height) {
    case 16: norm_shift = 3; break;
    case 32: norm_shift = 4; break;
    case 64: norm_shift = 5; break;
    default: break;
  }

  if (norm_shift != 0) {
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v8i16 row0_r, row0_l, row1_r, row1_l, row2_r, row2_l, row3_r, row3_l;
    v8i16 row4_r, row4_l, row5_r, row5_l, row6_r, row6_l, row7_r, row7_l;
    /* Running column totals, kept as two vectors of eight 16-bit lanes. */
    v8i16 acc_r = { 0 };
    v8i16 acc_l = { 0 };
    /* Every pass below consumes eight rows. */
    int groups_left = height >> 3;

    while (groups_left--) {
      LD_UB8(ref, ref_stride, row0, row1, row2, row3, row4, row5, row6, row7);
      ref += 8 * ref_stride;

      /* Split each 16-byte row into two vectors of 16-bit lanes. */
      UNPCK_UB_SH(row0, row0_r, row0_l);
      UNPCK_UB_SH(row1, row1_r, row1_l);
      UNPCK_UB_SH(row2, row2_r, row2_l);
      UNPCK_UB_SH(row3, row3_r, row3_l);
      UNPCK_UB_SH(row4, row4_r, row4_l);
      UNPCK_UB_SH(row5, row5_r, row5_l);
      UNPCK_UB_SH(row6, row6_r, row6_l);
      UNPCK_UB_SH(row7, row7_r, row7_l);

      /* Each ADD4 folds two more rows into the running totals. */
      ADD4(acc_r, row0_r, acc_l, row0_l, acc_r, row1_r, acc_l, row1_l, acc_r,
           acc_l, acc_r, acc_l);
      ADD4(acc_r, row2_r, acc_l, row2_l, acc_r, row3_r, acc_l, row3_l, acc_r,
           acc_l, acc_r, acc_l);
      ADD4(acc_r, row4_r, acc_l, row4_l, acc_r, row5_r, acc_l, row5_l, acc_r,
           acc_l, acc_r, acc_l);
      ADD4(acc_r, row6_r, acc_l, row6_l, acc_r, row7_r, acc_l, row7_l, acc_r,
           acc_l, acc_r, acc_l);
    }

    /* Normalise and write out all 16 results. */
    SRA_2V(acc_r, acc_l, norm_shift);
    ST_SH2(acc_r, acc_l, hbuf, 8);
  } else {
    const int norm_factor = height >> 1;
    int row;
    int col;

    for (col = 0; col < 16; ++col) {
      hbuf[col] = 0;
    }

    for (row = 0; row < height; ++row) {
      for (col = 0; col < 16; ++col) {
        hbuf[col] += ref[col];
      }
      ref += ref_stride;
    }

    for (col = 0; col < 16; ++col) {
      hbuf[col] /= norm_factor;
    }
  }
}
/* Integer projection onto the vertical axis: returns the sum of the first
 * 'width' bytes starting at ref.
 *
 *   ref    first pixel of the row to sum
 *   width  number of pixels to add up
 *
 * Widths 16, 32 and 64 are summed with MSA in 16-byte blocks; any other width
 * is summed one byte at a time. The result is returned as int16_t.
 */
int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) {
  int16_t total;
  v16u8 blk0, blk1, blk2, blk3;
  v8u16 pair_sums;

  switch (width) {
    case 16:
      blk0 = LD_UB(ref);
      /* hadd_u.h with the same vector for both operands adds each pair of
         neighbouring bytes into one 16-bit lane. */
      pair_sums = __msa_hadd_u_h(blk0, blk0);
      total = HADD_UH_U32(pair_sums);
      break;

    case 32:
      LD_UB2(ref, 16, blk0, blk1);
      pair_sums = __msa_hadd_u_h(blk0, blk0);
      pair_sums += __msa_hadd_u_h(blk1, blk1);
      total = HADD_UH_U32(pair_sums);
      break;

    case 64:
      LD_UB4(ref, 16, blk0, blk1, blk2, blk3);
      pair_sums = __msa_hadd_u_h(blk0, blk0);
      pair_sums += __msa_hadd_u_h(blk1, blk1);
      pair_sums += __msa_hadd_u_h(blk2, blk2);
      pair_sums += __msa_hadd_u_h(blk3, blk3);
      total = HADD_UH_U32(pair_sums);
      break;

    default: {
      /* Scalar path for every other width. */
      int pos;
      total = 0;
      for (pos = 0; pos < width; ++pos) {
        total += ref[pos];
      }
      break;
    }
  }

  return total;
}

View File

@ -1049,6 +1049,7 @@
}
/* Element-type-specific forms of INSERT_D2: each forwards its arguments to
   INSERT_D2 with a vector type (v16u8, v16i8 or v8i16) prepended as the first
   argument. */
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
/* Description : Interleave even byte elements from vectors
Arguments : Inputs - in0, in1, in2, in3
@ -1559,6 +1560,12 @@
Details : Each element of vector 'in0' is right shifted by 'shift' and
the result is written in-place. 'shift' is a GP variable.
*/
/* Two-vector form: overwrites in0 and in1 with (in0 >> shift) and
   (in1 >> shift). Both vector arguments are assigned to, so they must be
   modifiable lvalues. */
#define SRA_2V(in0, in1, shift) \
{ \
in0 = in0 >> shift; \
in1 = in1 >> shift; \
}
#define SRA_4V(in0, in1, in2, in3, shift) \
{ \
in0 = in0 >> shift; \

View File

@ -895,10 +895,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_satd sse2 neon msa/;
add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
specialize qw/vpx_int_pro_row sse2 neon/;
specialize qw/vpx_int_pro_row sse2 neon msa/;
add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width";
specialize qw/vpx_int_pro_col sse2 neon/;
specialize qw/vpx_int_pro_col sse2 neon msa/;
add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
specialize qw/vpx_vector_var neon sse2/;