mips msa vp8 block subtract optimization
Average improvement ~2x-3x.
Change-Id: I30abf4c92cddcc9e87b7a40d4106076e1ec701c2
This commit is contained in:
parent
e3ee8c292b
commit
0e3f494b21
@ -643,6 +643,23 @@
|
||||
}
|
||||
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Dot product of signed word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication results of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' are combined the same way into 'out1'.
                 Every argument is parenthesized in the expansion so that
                 an expression argument is cast as a whole rather than
                 only its first operand.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)      \
{                                                                    \
    (out0) = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0));  \
    (out1) = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1));  \
}
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
|
||||
|
||||
/* Description : Dot product & addition of byte vector elements
|
||||
Arguments : Inputs - mult0, mult1, cnst0, cnst1
|
||||
Outputs - out0, out1
|
||||
@ -693,6 +710,23 @@
|
||||
}
|
||||
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Dot product of signed word vector elements with themselves,
                 accumulated into double word vector elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as the running accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with itself
                 producing an intermediate result twice the size of it
                 i.e. signed double word.
                 The multiplication results of adjacent odd-even elements
                 are added together and accumulated into the 'out0' vector.
                 'mult1' is squared and accumulated into 'out1' the same way.
                 'out0' and 'out1' must hold valid accumulator values before
                 the macro is used, because they are inputs as well as outputs.
                 Every argument is parenthesized in the expansion so that
                 an expression argument is cast as a whole rather than
                 only its first operand.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)                       \
{                                                                        \
    (out0) = (RTYPE)__msa_dpadd_s_d((v2i64)(out0),                       \
                                    (v4i32)(mult0), (v4i32)(mult0));     \
    (out1) = (RTYPE)__msa_dpadd_s_d((v2i64)(out1),                       \
                                    (v4i32)(mult1), (v4i32)(mult1));     \
}
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
|
||||
|
||||
/* Description : Clips all signed halfword elements of input vector
|
||||
between 0 & 255
|
||||
Arguments : Input - in
|
||||
@ -805,6 +839,21 @@
|
||||
}
|
||||
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
|
||||
|
||||
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each even signed halfword element of 'in0' is subtracted
                 from the adjacent odd signed halfword element of 'in0'
                 (pairwise, i.e. odd - even) and the sign-extended word
                 result is written to 'out0'.
                 'in1' is processed the same way to produce 'out1'.
                 Note that despite the 'UH' in the macro name the elements
                 are treated as signed: the expansion uses the signed
                 horizontal subtract (__msa_hsub_s_w) on v8i16 operands.
                 Every argument is parenthesized in the expansion so that
                 an expression argument is cast as a whole rather than
                 only its first operand.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1)                      \
{                                                                  \
    (out0) = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0));    \
    (out1) = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1));    \
}
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Set element n input vector to GPR value
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
Output - out
|
||||
|
@ -295,15 +295,15 @@ specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;
|
||||
# Block subtraction
|
||||
#
|
||||
add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
|
||||
specialize qw/vp8_block_error mmx sse2/;
|
||||
specialize qw/vp8_block_error mmx sse2 msa/;
|
||||
$vp8_block_error_sse2=vp8_block_error_xmm;
|
||||
|
||||
add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
|
||||
specialize qw/vp8_mbblock_error mmx sse2/;
|
||||
specialize qw/vp8_mbblock_error mmx sse2 msa/;
|
||||
$vp8_mbblock_error_sse2=vp8_mbblock_error_xmm;
|
||||
|
||||
add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
|
||||
specialize qw/vp8_mbuverror mmx sse2/;
|
||||
specialize qw/vp8_mbuverror mmx sse2 msa/;
|
||||
$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
|
||||
|
||||
#
|
||||
|
174
vp8/encoder/mips/msa/encodeopt_msa.c
Normal file
174
vp8/encoder/mips/msa/encodeopt_msa.c
Normal file
@ -0,0 +1,174 @@
|
||||
/*
|
||||
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp8_rtcd.h"
|
||||
#include "vp8/common/mips/msa/vp8_macros_msa.h"
|
||||
#include "vp8/encoder/block.h"
|
||||
|
||||
int32_t vp8_block_error_msa(int16_t *coeff_ptr, int16_t *dq_coeff_ptr)
|
||||
{
|
||||
int32_t err = 0;
|
||||
uint32_t loop_cnt;
|
||||
v8i16 coeff, dq_coeff, coeff0, coeff1;
|
||||
v4i32 diff0, diff1;
|
||||
v2i64 err0 = { 0 };
|
||||
v2i64 err1 = { 0 };
|
||||
|
||||
for (loop_cnt = 2; loop_cnt--;)
|
||||
{
|
||||
coeff = LD_SH(coeff_ptr);
|
||||
dq_coeff = LD_SH(dq_coeff_ptr);
|
||||
ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1);
|
||||
HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
|
||||
DPADD_SD2_SD(diff0, diff1, err0, err1);
|
||||
coeff_ptr += 8;
|
||||
dq_coeff_ptr += 8;
|
||||
}
|
||||
|
||||
err0 += __msa_splati_d(err0, 1);
|
||||
err1 += __msa_splati_d(err1, 1);
|
||||
err = __msa_copy_s_d(err0, 0);
|
||||
err += __msa_copy_s_d(err1, 0);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Returns the accumulated squared difference between coeff and dqcoeff
 * over mb->block[0..15] / mb->e_mbd.block[0..15].
 *
 * Each loop pass handles two consecutive blocks, reading two v8i16 vectors
 * of coefficients and two of dequantized coefficients from each block.
 *
 * When 'dc' is exactly 1, word 0 of the selection mask is cleared, which
 * zeroes word 0 of the first difference vector of every block before it is
 * squared (presumably dropping the DC term from the error -- confirm
 * against the C reference).  Any other 'dc' value leaves all differences
 * in place.
 */
int32_t vp8_mbblock_error_msa(MACROBLOCK *mb, int32_t dc)
{
    int16_t *src, *dq;
    int32_t total = 0;
    uint32_t blk;
    v8i16 a0, a1, b0, b1;
    v8i16 a0_dq, a1_dq, b0_dq, b1_dq;
    v8i16 pairs_r, pairs_l;
    v4i32 delta_r, delta_l;
    v2i64 acc_r, acc_l;
    v16u8 zero = { 0 };
    v16u8 keep_mask = (v16u8)__msa_ldi_b(255);

    if (dc == 1)
    {
        /* Clear the mask bits covering word 0. */
        keep_mask = (v16u8)__msa_insve_w((v4i32)keep_mask, 0, (v4i32)zero);
    }

    for (blk = 0; blk < 16; blk += 2)
    {
        /* Load all four vector pairs for blocks 'blk' and 'blk + 1'. */
        src = mb->block[blk].coeff;
        dq = mb->e_mbd.block[blk].dqcoeff;
        a0 = LD_SH(src);
        a0_dq = LD_SH(dq);
        src += 8;
        dq += 8;
        a1 = LD_SH(src);
        a1_dq = LD_SH(dq);

        src = mb->block[blk + 1].coeff;
        dq = mb->e_mbd.block[blk + 1].dqcoeff;
        b0 = LD_SH(src);
        b0_dq = LD_SH(dq);
        src += 8;
        dq += 8;
        b1 = LD_SH(src);
        b1_dq = LD_SH(dq);

        /* First block: masked first half starts the accumulators ... */
        ILVRL_H2_SH(a0, a0_dq, pairs_r, pairs_l);
        HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
        delta_r = (v4i32)__msa_bmnz_v(zero, (v16u8)delta_r, keep_mask);
        DOTP_SW2_SD(delta_r, delta_l, delta_r, delta_l, acc_r, acc_l);

        /* ... and the second half is added on top. */
        ILVRL_H2_SH(a1, a1_dq, pairs_r, pairs_l);
        HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
        DPADD_SD2_SD(delta_r, delta_l, acc_r, acc_l);

        acc_r += __msa_splati_d(acc_r, 1);
        acc_l += __msa_splati_d(acc_l, 1);
        total += __msa_copy_s_d(acc_r, 0);
        total += __msa_copy_s_d(acc_l, 0);

        /* Second block: same sequence. */
        ILVRL_H2_SH(b0, b0_dq, pairs_r, pairs_l);
        HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
        delta_r = (v4i32)__msa_bmnz_v(zero, (v16u8)delta_r, keep_mask);
        DOTP_SW2_SD(delta_r, delta_l, delta_r, delta_l, acc_r, acc_l);

        ILVRL_H2_SH(b1, b1_dq, pairs_r, pairs_l);
        HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
        DPADD_SD2_SD(delta_r, delta_l, acc_r, acc_l);

        acc_r += __msa_splati_d(acc_r, 1);
        acc_l += __msa_splati_d(acc_l, 1);
        total += __msa_copy_s_d(acc_r, 0);
        total += __msa_copy_s_d(acc_l, 0);
    }

    return total;
}
|
||||
|
||||
/* Returns the accumulated squared difference between coeff and dqcoeff
 * over mb->block[16..23] / mb->e_mbd.block[16..23].
 *
 * Each loop pass handles two consecutive blocks, reading two v8i16 vectors
 * of coefficients and two of dequantized coefficients from each block.
 * All differences contribute; no element is masked out.
 */
int32_t vp8_mbuverror_msa(MACROBLOCK *mb)
{
    int16_t *src, *dq;
    int32_t total = 0;
    uint32_t blk;
    v8i16 a0, a1, b0, b1;
    v8i16 a0_dq, a1_dq, b0_dq, b1_dq;
    v8i16 pairs_r, pairs_l;
    v4i32 delta_r, delta_l;
    v2i64 acc_r, acc_l, hi_r, hi_l;

    for (blk = 16; blk < 24; blk += 2)
    {
        /* Load all four vector pairs for blocks 'blk' and 'blk + 1'. */
        src = mb->block[blk].coeff;
        dq = mb->e_mbd.block[blk].dqcoeff;
        a0 = LD_SH(src);
        a0_dq = LD_SH(dq);
        src += 8;
        dq += 8;
        a1 = LD_SH(src);
        a1_dq = LD_SH(dq);

        src = mb->block[blk + 1].coeff;
        dq = mb->e_mbd.block[blk + 1].dqcoeff;
        b0 = LD_SH(src);
        b0_dq = LD_SH(dq);
        src += 8;
        dq += 8;
        b1 = LD_SH(src);
        b1_dq = LD_SH(dq);

        /* First block: first half starts the accumulators, second half
         * is added on top. */
        ILVRL_H2_SH(a0, a0_dq, pairs_r, pairs_l);
        HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
        DOTP_SW2_SD(delta_r, delta_l, delta_r, delta_l, acc_r, acc_l);

        ILVRL_H2_SH(a1, a1_dq, pairs_r, pairs_l);
        HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
        DPADD_SD2_SD(delta_r, delta_l, acc_r, acc_l);

        /* Add double word element 1 of each accumulator onto element 0. */
        hi_r = __msa_splati_d(acc_r, 1);
        hi_l = __msa_splati_d(acc_l, 1);
        ADD2(acc_r, hi_r, acc_l, hi_l, acc_r, acc_l);
        total += __msa_copy_s_d(acc_r, 0);
        total += __msa_copy_s_d(acc_l, 0);

        /* Second block: same sequence. */
        ILVRL_H2_SH(b0, b0_dq, pairs_r, pairs_l);
        HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
        DOTP_SW2_SD(delta_r, delta_l, delta_r, delta_l, acc_r, acc_l);

        ILVRL_H2_SH(b1, b1_dq, pairs_r, pairs_l);
        HSUB_UH2_SW(pairs_r, pairs_l, delta_r, delta_l);
        DPADD_SD2_SD(delta_r, delta_l, acc_r, acc_l);

        hi_r = __msa_splati_d(acc_r, 1);
        hi_l = __msa_splati_d(acc_l, 1);
        ADD2(acc_r, hi_r, acc_l, hi_l, acc_r, acc_l);
        total += __msa_copy_s_d(acc_r, 0);
        total += __msa_copy_s_d(acc_l, 0);
    }

    return total;
}
|
@ -104,6 +104,7 @@ VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
|
||||
endif
|
||||
|
||||
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c
|
||||
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
|
||||
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
|
||||
|
||||
VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
|
||||
|
Loading…
x
Reference in New Issue
Block a user