avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC loop filter and sao functions
This patch adds MSA (MIPS-SIMD-Arch) optimizations for HEVC loop filter and sao functions in the new file hevc_lpf_sao_msa.c. It also adds new generic macros (needed for this patch) in libavutil/mips/generic_macros_msa.h. In this patch, in comparison with the previous patch, duplicated C functions are removed. Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
05c57ba2f4
commit
271195f85b
@ -26,7 +26,8 @@ MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
|
||||
mips/hevc_mc_uniw_msa.o \
|
||||
mips/hevc_mc_bi_msa.o \
|
||||
mips/hevc_mc_biw_msa.o \
|
||||
mips/hevc_idct_msa.o
|
||||
mips/hevc_idct_msa.o \
|
||||
mips/hevc_lpf_sao_msa.o
|
||||
MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o
|
||||
LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
|
||||
LOONGSON3-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o
|
||||
|
2088
libavcodec/mips/hevc_lpf_sao_msa.c
Normal file
2088
libavcodec/mips/hevc_lpf_sao_msa.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -403,6 +403,32 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
|
||||
c->put_hevc_epel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_epel_hv24_8_msa;
|
||||
c->put_hevc_epel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_epel_hv32_8_msa;
|
||||
|
||||
c->sao_band_filter[0] =
|
||||
c->sao_band_filter[1] =
|
||||
c->sao_band_filter[2] =
|
||||
c->sao_band_filter[3] =
|
||||
c->sao_band_filter[4] = ff_hevc_sao_band_filter_0_8_msa;
|
||||
|
||||
c->sao_edge_filter[0] =
|
||||
c->sao_edge_filter[1] =
|
||||
c->sao_edge_filter[2] =
|
||||
c->sao_edge_filter[3] =
|
||||
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_msa;
|
||||
|
||||
c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_msa;
|
||||
c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_msa;
|
||||
|
||||
c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_msa;
|
||||
c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_msa;
|
||||
|
||||
c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_msa;
|
||||
c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_msa;
|
||||
|
||||
c->hevc_h_loop_filter_chroma_c =
|
||||
ff_hevc_loop_filter_chroma_h_8_msa;
|
||||
c->hevc_v_loop_filter_chroma_c =
|
||||
ff_hevc_loop_filter_chroma_v_8_msa;
|
||||
|
||||
c->idct[0] = ff_hevc_idct_4x4_msa;
|
||||
c->idct[1] = ff_hevc_idct_8x8_msa;
|
||||
c->idct[2] = ff_hevc_idct_16x16_msa;
|
||||
|
@ -431,6 +431,36 @@ BI_W_MC(epel, hv, 64);
|
||||
|
||||
#undef BI_W_MC
|
||||
|
||||
void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
|
||||
ptrdiff_t src_stride,
|
||||
int32_t beta, int32_t *tc,
|
||||
uint8_t *no_p, uint8_t *no_q);
|
||||
|
||||
void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
|
||||
ptrdiff_t src_stride,
|
||||
int32_t beta, int32_t *tc,
|
||||
uint8_t *no_p, uint8_t *no_q);
|
||||
|
||||
void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
|
||||
ptrdiff_t src_stride,
|
||||
int32_t *tc, uint8_t *no_p,
|
||||
uint8_t *no_q);
|
||||
|
||||
void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
|
||||
ptrdiff_t src_stride,
|
||||
int32_t *tc, uint8_t *no_p,
|
||||
uint8_t *no_q);
|
||||
|
||||
void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
|
||||
ptrdiff_t stride_dst, ptrdiff_t stride_src,
|
||||
int16_t *sao_offset_val, int sao_left_class,
|
||||
int width, int height);
|
||||
|
||||
void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
|
||||
ptrdiff_t stride_dst,
|
||||
int16_t *sao_offset_val,
|
||||
int eo, int width, int height);
|
||||
|
||||
void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit);
|
||||
void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit);
|
||||
void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit);
|
||||
|
@ -761,6 +761,8 @@
|
||||
out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \
|
||||
}
|
||||
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
|
||||
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
|
||||
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
|
||||
|
||||
#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
|
||||
out0, out1, out2, out3, slide_val) \
|
||||
@ -821,6 +823,23 @@
|
||||
VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
|
||||
}
|
||||
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
|
||||
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
|
||||
|
||||
/* Description : Shuffle halfword vector elements as per mask vector
|
||||
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
|
||||
Outputs - out0, out1
|
||||
Return Type - as per RTYPE
|
||||
Details : Selective halfword elements from in0 & in1 are copied to out0
|
||||
as per control vector mask0
|
||||
Selective halfword elements from in2 & in3 are copied to out1
|
||||
as per control vector mask1
|
||||
*/
|
||||
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
|
||||
{ \
|
||||
out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0); \
|
||||
out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2); \
|
||||
}
|
||||
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
|
||||
|
||||
/* Description : Shuffle byte vector elements as per mask vector
|
||||
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
|
||||
@ -1060,6 +1079,25 @@
|
||||
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
|
||||
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave even byte elements from vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
Outputs - out0, out1
|
||||
Return Type - as per RTYPE
|
||||
Details : Even byte elements of 'in0' and even byte
|
||||
elements of 'in1' are interleaved and copied to 'out0'
|
||||
Even byte elements of 'in2' and even byte
|
||||
elements of 'in3' are interleaved and copied to 'out1'
|
||||
*/
|
||||
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
|
||||
{ \
|
||||
out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \
|
||||
out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \
|
||||
}
|
||||
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
|
||||
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
|
||||
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
|
||||
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave even halfword elements from vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
Outputs - out0, out1
|
||||
@ -1107,6 +1145,8 @@
|
||||
out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
|
||||
}
|
||||
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
|
||||
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
|
||||
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave left half of byte elements from vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
@ -1212,9 +1252,22 @@
|
||||
ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
|
||||
ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
|
||||
}
|
||||
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
|
||||
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
|
||||
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
|
||||
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
|
||||
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
|
||||
|
||||
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
|
||||
in8, in9, in10, in11, in12, in13, in14, in15, \
|
||||
out0, out1, out2, out3, out4, out5, out6, out7) \
|
||||
{ \
|
||||
ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
|
||||
out0, out1, out2, out3); \
|
||||
ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
|
||||
out4, out5, out6, out7); \
|
||||
}
|
||||
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave right half of halfword elements from vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
|
||||
@ -1870,6 +1923,25 @@
|
||||
ADD2(in4, in5, in6, in7, out2, out3); \
|
||||
}
|
||||
|
||||
/* Description : Sign extend byte elements from input vector and return
|
||||
halfword results in pair of vectors
|
||||
Arguments : Inputs - in (1 input byte vector)
|
||||
Outputs - out0, out1 (sign extended 2 halfword vectors)
|
||||
Return Type - signed halfword
|
||||
Details : Sign bit of byte elements from input vector 'in' is
|
||||
extracted and interleaved right with same vector 'in' to
|
||||
generate 8 signed halfword elements in 'out0'
|
||||
Then interleaved left with same vector 'in' to
|
||||
generate 8 signed halfword elements in 'out1'
|
||||
*/
|
||||
#define UNPCK_SB_SH(in, out0, out1) \
|
||||
{ \
|
||||
v16i8 tmp_m; \
|
||||
\
|
||||
tmp_m = __msa_clti_s_b((v16i8) in, 0); \
|
||||
ILVRL_B2_SH(tmp_m, in, out0, out1); \
|
||||
}
|
||||
|
||||
/* Description : Zero extend unsigned byte elements to halfword elements
|
||||
Arguments : Inputs - in (1 input unsigned byte vector)
|
||||
Outputs - out0, out1 (unsigned 2 halfword vectors)
|
||||
@ -1903,6 +1975,18 @@
|
||||
ILVRL_H2_SW(tmp_m, in, out0, out1); \
|
||||
}
|
||||
|
||||
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : The values of 'in0' and 'in1' are exchanged through a
                 temporary that has the same type as 'in0'.
                 Unlike an xor based swap, the result stays correct when
                 both arguments name the same object (an xor swap would
                 zero it), and the operands do not have to support '^'.
*/
#define SWAP(in0, in1)                   \
{                                        \
    __typeof__(in0) swap_tmp_m = (in0);  \
                                         \
    (in0) = (in1);                       \
    (in1) = swap_tmp_m;                  \
}
|
||||
|
||||
/* Description : Butterfly of 4 input vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
Outputs - out0, out1, out2, out3
|
||||
@ -1959,9 +2043,34 @@
|
||||
out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
|
||||
out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
|
||||
}
|
||||
|
||||
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
|
||||
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
|
||||
|
||||
/* Description : Transposes input 8x8 byte block
|
||||
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
|
||||
(input 8x8 byte block)
|
||||
Outputs - out0, out1, out2, out3, out4, out5, out6, out7
|
||||
(output 8x8 byte block)
|
||||
Return Type - as per RTYPE
|
||||
Details :
|
||||
*/
|
||||
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
|
||||
out0, out1, out2, out3, out4, out5, out6, out7) \
|
||||
{ \
|
||||
v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
|
||||
v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
|
||||
\
|
||||
ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
|
||||
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
|
||||
ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
|
||||
ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
|
||||
ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
|
||||
ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
|
||||
SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
|
||||
SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
|
||||
}
|
||||
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
|
||||
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
|
||||
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
|
||||
in8, in9, in10, in11, in12, in13, in14, in15
|
||||
|
Loading…
x
Reference in New Issue
Block a user