avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uniw mc functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for the HEVC uniw mc functions (qpel as well as epel) in a new file, hevc_mc_uniw_msa.c.
It also adds the new generic macros needed by this patch to libavutil/mips/generic_macros_msa.h.

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Shivraj Patil 2015-06-02 14:08:09 +05:30 committed by Michael Niedermayer
parent d7a762553c
commit ce1761db19
5 changed files with 5040 additions and 0 deletions

View File

@ -23,6 +23,7 @@ OBJS-$(CONFIG_H264DSP) += mips/h264dsp_init_mips.o
OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_init_mips.o
# MSA object files for the HEVC decoder; the list is keyed on
# CONFIG_HEVC_DECODER and continues across the backslash-terminated lines.
MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
mips/hevc_mc_uni_msa.o \
mips/hevc_mc_uniw_msa.o \
mips/hevc_mc_bi_msa.o
MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o
LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o

File diff suppressed because it is too large Load Diff

View File

@ -97,6 +97,99 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
c->put_hevc_qpel_uni_w[1][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
c->put_hevc_qpel_uni_w[3][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
c->put_hevc_qpel_uni_w[4][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
c->put_hevc_qpel_uni_w[5][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
c->put_hevc_qpel_uni_w[6][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
c->put_hevc_qpel_uni_w[7][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
c->put_hevc_qpel_uni_w[8][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels48_8_msa;
c->put_hevc_qpel_uni_w[9][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels64_8_msa;
c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_qpel_h4_8_msa;
c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_qpel_h8_8_msa;
c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_qpel_h12_8_msa;
c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_qpel_h16_8_msa;
c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_qpel_h24_8_msa;
c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_qpel_h32_8_msa;
c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_uni_w_qpel_h48_8_msa;
c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_uni_w_qpel_h64_8_msa;
c->put_hevc_qpel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_qpel_v4_8_msa;
c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_qpel_v8_8_msa;
c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_qpel_v12_8_msa;
c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_qpel_v16_8_msa;
c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_qpel_v24_8_msa;
c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_qpel_v32_8_msa;
c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_uni_w_qpel_v48_8_msa;
c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_uni_w_qpel_v64_8_msa;
c->put_hevc_qpel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv4_8_msa;
c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_msa;
c->put_hevc_qpel_uni_w[4][1][1] =
ff_hevc_put_hevc_uni_w_qpel_hv12_8_msa;
c->put_hevc_qpel_uni_w[5][1][1] =
ff_hevc_put_hevc_uni_w_qpel_hv16_8_msa;
c->put_hevc_qpel_uni_w[6][1][1] =
ff_hevc_put_hevc_uni_w_qpel_hv24_8_msa;
c->put_hevc_qpel_uni_w[7][1][1] =
ff_hevc_put_hevc_uni_w_qpel_hv32_8_msa;
c->put_hevc_qpel_uni_w[8][1][1] =
ff_hevc_put_hevc_uni_w_qpel_hv48_8_msa;
c->put_hevc_qpel_uni_w[9][1][1] =
ff_hevc_put_hevc_uni_w_qpel_hv64_8_msa;
c->put_hevc_epel_uni_w[1][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
c->put_hevc_epel_uni_w[2][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels6_8_msa;
c->put_hevc_epel_uni_w[3][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
c->put_hevc_epel_uni_w[4][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
c->put_hevc_epel_uni_w[5][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
c->put_hevc_epel_uni_w[6][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
c->put_hevc_epel_uni_w[7][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_epel_h4_8_msa;
c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_uni_w_epel_h6_8_msa;
c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_epel_h8_8_msa;
c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_epel_h12_8_msa;
c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_epel_h16_8_msa;
c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_epel_h24_8_msa;
c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_epel_h32_8_msa;
c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_epel_v4_8_msa;
c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_uni_w_epel_v6_8_msa;
c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_epel_v8_8_msa;
c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_epel_v12_8_msa;
c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_epel_v16_8_msa;
c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_epel_v24_8_msa;
c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_epel_v32_8_msa;
c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_epel_hv4_8_msa;
c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_uni_w_epel_hv6_8_msa;
c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_epel_hv8_8_msa;
c->put_hevc_epel_uni_w[4][1][1] =
ff_hevc_put_hevc_uni_w_epel_hv12_8_msa;
c->put_hevc_epel_uni_w[5][1][1] =
ff_hevc_put_hevc_uni_w_epel_hv16_8_msa;
c->put_hevc_epel_uni_w[6][1][1] =
ff_hevc_put_hevc_uni_w_epel_hv24_8_msa;
c->put_hevc_epel_uni_w[7][1][1] =
ff_hevc_put_hevc_uni_w_epel_hv32_8_msa;
c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;

View File

@ -117,6 +117,90 @@ UNI_MC(qpel, hv, 64);
#undef UNI_MC
/* Declare the prototype of one 8-bit MSA "uni_w" motion-compensation
 * function.  The generated name is
 *     ff_hevc_put_hevc_uni_w_<PEL>_<DIR><WIDTH>_8_msa
 * where PEL is pel/qpel/epel, DIR is pixels/h/v/hv and WIDTH is the numeric
 * suffix that follows DIR in the name.
 *
 * DIR and WIDTH are joined by a single ## operator.  A doubled paste
 * operator (####) is not valid in standard C and only works where the
 * preprocessor happens to be lenient about it.
 *
 * The macro expands to a declaration without the trailing ';' so that each
 * use below reads like an ordinary statement.
 */
#define UNI_W_MC(PEL, DIR, WIDTH)                                          \
void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,     \
                                                         ptrdiff_t         \
                                                         dst_stride,       \
                                                         uint8_t *src,     \
                                                         ptrdiff_t         \
                                                         src_stride,       \
                                                         int height,       \
                                                         int denom,        \
                                                         int weight,       \
                                                         int offset,       \
                                                         intptr_t mx,      \
                                                         intptr_t my,      \
                                                         int width)

/* Full-sample copy variants. */
UNI_W_MC(pel, pixels, 4);
UNI_W_MC(pel, pixels, 6);
UNI_W_MC(pel, pixels, 8);
UNI_W_MC(pel, pixels, 12);
UNI_W_MC(pel, pixels, 16);
UNI_W_MC(pel, pixels, 24);
UNI_W_MC(pel, pixels, 32);
UNI_W_MC(pel, pixels, 48);
UNI_W_MC(pel, pixels, 64);

/* qpel: horizontal, vertical and combined (hv) variants. */
UNI_W_MC(qpel, h, 4);
UNI_W_MC(qpel, h, 8);
UNI_W_MC(qpel, h, 12);
UNI_W_MC(qpel, h, 16);
UNI_W_MC(qpel, h, 24);
UNI_W_MC(qpel, h, 32);
UNI_W_MC(qpel, h, 48);
UNI_W_MC(qpel, h, 64);

UNI_W_MC(qpel, v, 4);
UNI_W_MC(qpel, v, 8);
UNI_W_MC(qpel, v, 12);
UNI_W_MC(qpel, v, 16);
UNI_W_MC(qpel, v, 24);
UNI_W_MC(qpel, v, 32);
UNI_W_MC(qpel, v, 48);
UNI_W_MC(qpel, v, 64);

UNI_W_MC(qpel, hv, 4);
UNI_W_MC(qpel, hv, 8);
UNI_W_MC(qpel, hv, 12);
UNI_W_MC(qpel, hv, 16);
UNI_W_MC(qpel, hv, 24);
UNI_W_MC(qpel, hv, 32);
UNI_W_MC(qpel, hv, 48);
UNI_W_MC(qpel, hv, 64);

/* epel: horizontal, vertical and combined (hv) variants. */
UNI_W_MC(epel, h, 4);
UNI_W_MC(epel, h, 6);
UNI_W_MC(epel, h, 8);
UNI_W_MC(epel, h, 12);
UNI_W_MC(epel, h, 16);
UNI_W_MC(epel, h, 24);
UNI_W_MC(epel, h, 32);
UNI_W_MC(epel, h, 48);
UNI_W_MC(epel, h, 64);

UNI_W_MC(epel, v, 4);
UNI_W_MC(epel, v, 6);
UNI_W_MC(epel, v, 8);
UNI_W_MC(epel, v, 12);
UNI_W_MC(epel, v, 16);
UNI_W_MC(epel, v, 24);
UNI_W_MC(epel, v, 32);
UNI_W_MC(epel, v, 48);
UNI_W_MC(epel, v, 64);

UNI_W_MC(epel, hv, 4);
UNI_W_MC(epel, hv, 6);
UNI_W_MC(epel, hv, 8);
UNI_W_MC(epel, hv, 12);
UNI_W_MC(epel, hv, 16);
UNI_W_MC(epel, hv, 24);
UNI_W_MC(epel, hv, 32);
UNI_W_MC(epel, hv, 48);
UNI_W_MC(epel, hv, 64);

#undef UNI_W_MC
#define BI_MC(PEL, DIR, WIDTH) \
void ff_hevc_put_hevc_bi_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst, \
ptrdiff_t dst_stride, \

View File

@ -802,6 +802,34 @@
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE (the _SW variants use v4i32)
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0, producing results
                 twice the size of the input, i.e. signed words.
                 The products of adjacent odd-even elements are then added
                 together and stored in out0.
                 The same is done for mult1/cnst1 into out1.
                 Every input is parenthesized before it is cast, so an
                 expression argument is cast as a whole rather than only in
                 its first operand.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)       \
{                                                                     \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) (mult0), (v8i16) (cnst0));  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) (mult1), (v8i16) (cnst1));  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

/* Four-pair form: applies DOTP_SH2 to (mult0, mult1) and to (mult2, mult3). */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
/* Description : Dot product & addition of byte vector elements
Arguments : Inputs - mult0, mult1
cnst0, cnst1
@ -1017,6 +1045,7 @@
out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) \
@ -1088,6 +1117,7 @@
out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
@ -1555,6 +1585,31 @@
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits held in the corresponding element of
                 vector 'shift'.
                 The last discarded bit is added to the shifted value for
                 rounding, and the result is written back to 'in0' in place.
                 Here, 'shift' is a vector passed in.
                 Similar for the other inputs.
                 Every input is parenthesized before it is cast, so an
                 expression passed as 'shift' is cast as a whole.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                          \
{                                                                \
    in0 = (RTYPE) __msa_srar_w((v4i32) (in0), (v4i32) (shift));  \
    in1 = (RTYPE) __msa_srar_w((v4i32) (in1), (v4i32) (shift));  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

/* Four-input form: applies SRAR_W2 to (in0, in1) and to (in2, in3). */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift);               \
    SRAR_W2(RTYPE, in2, in3, shift);               \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
/* Description : Shift right arithmetic rounded (immediate)
Arguments : Inputs - in0, in1, in2, in3, shift
Outputs - in0, in1, in2, in3 (in place)
@ -1616,6 +1671,23 @@
MUL2(in4, in5, in6, in7, out2, out3); \
}
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : out0 receives in0 + in1 and out1 receives in2 + in3; each
                 element of one vector is added to the matching element of
                 the other.
                 Operands are parenthesized so that arguments containing
                 lower-precedence operators (e.g. 'a & b') are summed as a
                 whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}

/* Four-pair form: applies ADD2 to the first and to the last two pairs. */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}
/* Description : Zero extend unsigned byte elements to halfword elements
Arguments : Inputs - in (1 input unsigned byte vector)
Outputs - out0, out1 (unsigned 2 halfword vectors)