Add mips msa vpx Integer projection row/col functions

Average improvement: ~4x-5x

Change-Id: I17c41383250282b39f5ecae0197ef1df7de20801

parent c1553f859f
commit 407fad2356
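For context, vpx_int_pro_row and vpx_int_pro_col compute the integer projections used by the VP9 encoder's motion search: the row projection averages each of a block's 16 columns over `height` rows (normalized by height / 2), and the column projection sums one `width`-pixel row. A minimal scalar sketch of those semantics, mirroring the generic fallback branches further down in this patch (illustrative only, not the shipped C reference):

#include <stdint.h>

/* Sketch of the row projection: per-column sums over 'height' rows,
 * normalized by height / 2. The MSA paths below implement the same
 * normalization as shifts of 3, 4 and 5 for heights 16, 32 and 64. */
static void int_pro_row_sketch(int16_t hbuf[16], const uint8_t *ref,
                               int ref_stride, int height) {
  const int norm_factor = height >> 1;
  int i, cnt;
  for (cnt = 0; cnt < 16; ++cnt) hbuf[cnt] = 0;
  for (i = 0; i < height; ++i) {
    for (cnt = 0; cnt < 16; ++cnt) hbuf[cnt] += ref[cnt];
    ref += ref_stride;
  }
  for (cnt = 0; cnt < 16; ++cnt) hbuf[cnt] /= norm_factor;
}

/* Sketch of the column projection: a plain sum of one 'width'-pixel row. */
static int16_t int_pro_col_sketch(const uint8_t *ref, int width) {
  int16_t sum = 0;
  int idx;
  for (idx = 0; idx < width; ++idx) sum += ref[idx];
  return sum;
}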
test/avg_test.cc

@@ -393,6 +393,20 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
                       make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
 
+INSTANTIATE_TEST_CASE_P(
+    MSA, IntProRowTest,
+    ::testing::Values(make_tuple(16, &vpx_int_pro_row_msa, &vpx_int_pro_row_c),
+                      make_tuple(32, &vpx_int_pro_row_msa, &vpx_int_pro_row_c),
+                      make_tuple(64, &vpx_int_pro_row_msa,
+                                 &vpx_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    MSA, IntProColTest,
+    ::testing::Values(make_tuple(16, &vpx_int_pro_col_msa, &vpx_int_pro_col_c),
+                      make_tuple(32, &vpx_int_pro_col_msa, &vpx_int_pro_col_c),
+                      make_tuple(64, &vpx_int_pro_col_msa,
+                                 &vpx_int_pro_col_c)));
+
 INSTANTIATE_TEST_CASE_P(MSA, SatdTest,
                         ::testing::Values(make_tuple(16, &vpx_satd_msa),
                                           make_tuple(64, &vpx_satd_msa),
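Each new instantiation pairs the MSA function with its C counterpart at the three vectorized sizes (16, 32, 64), so the existing IntProRowTest/IntProColTest fixtures can check one against the other. The property being exercised, as a standalone sketch (the buffer setup and the checks here are hand-rolled, not the actual avg_test.cc code):

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "./vpx_dsp_rtcd.h" /* generated header declaring the _c and _msa variants */

/* Sketch: the MSA and C versions must agree on identical input. */
static void check_int_pro_match(const uint8_t *ref, int stride) {
  int16_t hbuf_c[16], hbuf_msa[16];
  vpx_int_pro_row_c(hbuf_c, ref, stride, 16);
  vpx_int_pro_row_msa(hbuf_msa, ref, stride, 16);
  assert(0 == memcmp(hbuf_c, hbuf_msa, sizeof(hbuf_c)));
  assert(vpx_int_pro_col_c(ref, 16) == vpx_int_pro_col_msa(ref, 16));
}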
vpx_dsp/mips/avg_msa.c

@@ -389,3 +389,175 @@ int vpx_satd_msa(const int16_t *data, int length) {
 
   return satd;
 }
+
+void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
+                         const int ref_stride, const int height) {
+  int i;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v8i16 hbuf_r = { 0 };
+  v8i16 hbuf_l = { 0 };
+  v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l;
+  v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l;
+
+  if (16 == height) {
+    for (i = 2; i--;) {
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+    }
+
+    SRA_2V(hbuf_r, hbuf_l, 3);
+    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+  } else if (32 == height) {
+    for (i = 2; i--;) {
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+    }
+
+    SRA_2V(hbuf_r, hbuf_l, 4);
+    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+  } else if (64 == height) {
+    for (i = 4; i--;) {
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+    }
+
+    SRA_2V(hbuf_r, hbuf_l, 5);
+    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+  } else {
+    const int norm_factor = height >> 1;
+    int cnt;
+
+    for (cnt = 0; cnt < 16; cnt++) {
+      hbuf[cnt] = 0;
+    }
+
+    for (i = 0; i < height; ++i) {
+      for (cnt = 0; cnt < 16; cnt++) {
+        hbuf[cnt] += ref[cnt];
+      }
+
+      ref += ref_stride;
+    }
+
+    for (cnt = 0; cnt < 16; cnt++) {
+      hbuf[cnt] /= norm_factor;
+    }
+  }
+}
+
+int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) {
+  int16_t sum;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8u16 ref0_h;
+
+  if (16 == width) {
+    ref0 = LD_UB(ref);
+    ref0_h = __msa_hadd_u_h(ref0, ref0);
+    sum = HADD_UH_U32(ref0_h);
+  } else if (32 == width) {
+    LD_UB2(ref, 16, ref0, ref1);
+    ref0_h = __msa_hadd_u_h(ref0, ref0);
+    ref0_h += __msa_hadd_u_h(ref1, ref1);
+    sum = HADD_UH_U32(ref0_h);
+  } else if (64 == width) {
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref0_h = __msa_hadd_u_h(ref0, ref0);
+    ref0_h += __msa_hadd_u_h(ref1, ref1);
+    ref0_h += __msa_hadd_u_h(ref2, ref2);
+    ref0_h += __msa_hadd_u_h(ref3, ref3);
+    sum = HADD_UH_U32(ref0_h);
+  } else {
+    int idx;
+
+    sum = 0;
+    for (idx = 0; idx < width; ++idx) {
+      sum += ref[idx];
+    }
+  }
+
+  return sum;
+}
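Two details of the MSA paths above are worth calling out. First, the 32- and 64-row cases unroll two 8-row load/accumulate groups per loop iteration, presumably to keep the hbuf_r/hbuf_l accumulators live across the whole block. Second, the shift amounts reproduce the generic fallback's normalization norm_factor = height >> 1 exactly: height 16 divides by 8 (shift 3), height 32 by 32 >> 1 = 16 = 2^4 (shift 4), and height 64 by 32 (shift 5). A 64-row column sum peaks at 64 * 255 = 16320, so the v8i16 accumulators cannot overflow before the shift.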
vpx_dsp/mips/macros_msa.h

@@ -1049,6 +1049,7 @@
 }
 #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
 
 /* Description : Interleave even byte elements from vectors
    Arguments   : Inputs  - in0, in1, in2, in3
@@ -1559,6 +1560,12 @@
    Details     : Each element of vector 'in0' is right shifted by 'shift' and
                  the result is written in-place. 'shift' is a GP variable.
 */
+#define SRA_2V(in0, in1, shift) \
+  {                             \
+    in0 = in0 >> shift;         \
+    in1 = in1 >> shift;         \
+  }
+
 #define SRA_4V(in0, in1, in2, in3, shift) \
   {                                       \
     in0 = in0 >> shift;                   \
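The new SRA_2V macro is what the row projection above relies on for its final divide-by-power-of-two. A minimal standalone illustration of its use (a sketch assuming a MIPS toolchain with MSA enabled; msa.h supplies the v8i16 vector type, and GCC vector extensions allow shifting a vector by a scalar):

#include <msa.h>

/* Same definition as added above: in-place arithmetic right shift of two
 * vectors by a general-purpose 'shift' amount. */
#define SRA_2V(in0, in1, shift) \
  {                             \
    in0 = in0 >> shift;         \
    in1 = in1 >> shift;         \
  }

/* Divide both halves of a 16-column sum pair by 8, as the 16-row path
 * does via SRA_2V(hbuf_r, hbuf_l, 3). */
static void normalize_by_8(v8i16 *r, v8i16 *l) {
  v8i16 lo = *r, hi = *l;
  SRA_2V(lo, hi, 3);
  *r = lo;
  *l = hi;
}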
vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -895,10 +895,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
   specialize qw/vpx_satd sse2 neon msa/;
 
   add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
-  specialize qw/vpx_int_pro_row sse2 neon/;
+  specialize qw/vpx_int_pro_row sse2 neon msa/;
 
   add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width";
-  specialize qw/vpx_int_pro_col sse2 neon/;
+  specialize qw/vpx_int_pro_col sse2 neon msa/;
 
   add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
   specialize qw/vpx_vector_var neon sse2/;
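With msa added to the specialize lines, the RTCD generator wires the new functions in for MSA-capable MIPS builds. Roughly what the generated vpx_dsp_rtcd.h resolves to in a static, single-ISA build (a sketch of the mechanism, not the literal generated file):

/* Sketch: for a static MSA build the generated header maps the generic
 * names onto the MSA specializations; otherwise the C versions remain. */
#if HAVE_MSA
#define vpx_int_pro_row vpx_int_pro_row_msa
#define vpx_int_pro_col vpx_int_pro_col_msa
#else
#define vpx_int_pro_row vpx_int_pro_row_c
#define vpx_int_pro_col vpx_int_pro_col_c
#endif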