avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for me_cmp functions
This patch adds MSA (MIPS-SIMD-Arch) optimizations for the me_cmp functions in a new file, me_cmp_msa.c.
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
2f3f98af2b
commit
709bb45c66
@ -991,4 +991,6 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
|
||||
ff_me_cmp_init_ppc(c, avctx);
|
||||
if (ARCH_X86)
|
||||
ff_me_cmp_init_x86(c, avctx);
|
||||
if (ARCH_MIPS)
|
||||
ff_me_cmp_init_mips(c, avctx);
|
||||
}
|
||||
|
@ -87,6 +87,7 @@ void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
|
||||
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
|
||||
void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
|
||||
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
|
||||
void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
|
||||
|
||||
void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type);
|
||||
|
||||
|
@ -31,6 +31,7 @@ OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_init_mips.o
|
||||
OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_init_mips.o
|
||||
OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_init_mips.o
|
||||
OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoencdsp_init_mips.o
|
||||
OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_init_mips.o
|
||||
MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
|
||||
mips/hevc_mc_uni_msa.o \
|
||||
mips/hevc_mc_uniw_msa.o \
|
||||
@ -51,5 +52,6 @@ MSA-OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_msa.o
|
||||
MSA-OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_msa.o
|
||||
MSA-OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_msa.o
|
||||
MSA-OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoencdsp_msa.o
|
||||
MSA-OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_msa.o
|
||||
LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
|
||||
LOONGSON3-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o
|
||||
|
56
libavcodec/mips/me_cmp_init_mips.c
Normal file
56
libavcodec/mips/me_cmp_init_mips.c
Normal file
@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "me_cmp_mips.h"
|
||||
|
||||
#if HAVE_MSA
|
||||
/*
 * Install the MSA implementations into the function tables of *c.
 * All assignments are compiled in only when BIT_DEPTH equals 8; avctx is
 * not consulted.
 */
static av_cold void me_cmp_msa(MECmpContext *c, AVCodecContext *avctx)
{
#if BIT_DEPTH == 8
    /* Plain SAD: slot 0 takes the 16-wide routine, slot 1 the 8-wide one */
    c->sad[0] = ff_pix_abs16_msa;
    c->sad[1] = ff_pix_abs8_msa;

    /* Sum of squared errors for 16-, 8- and 4-wide blocks */
    c->sse[0] = ff_sse16_msa;
    c->sse[1] = ff_sse8_msa;
    c->sse[2] = ff_sse4_msa;

    /* pix_abs[w][k]: w = 0 -> 16-wide, w = 1 -> 8-wide;
     * k = 0 plain, 1 = x2, 2 = y2, 3 = xy2 variant */
    c->pix_abs[0][0] = ff_pix_abs16_msa;
    c->pix_abs[0][1] = ff_pix_abs16_x2_msa;
    c->pix_abs[0][2] = ff_pix_abs16_y2_msa;
    c->pix_abs[0][3] = ff_pix_abs16_xy2_msa;
    c->pix_abs[1][0] = ff_pix_abs8_msa;
    c->pix_abs[1][1] = ff_pix_abs8_x2_msa;
    c->pix_abs[1][2] = ff_pix_abs8_y2_msa;
    c->pix_abs[1][3] = ff_pix_abs8_xy2_msa;

    /* Hadamard scores: slots 0/1 are the difference variants,
     * slots 4/5 the intra variants */
    c->hadamard8_diff[0] = ff_hadamard8_diff16_msa;
    c->hadamard8_diff[1] = ff_hadamard8_diff8x8_msa;
    c->hadamard8_diff[4] = ff_hadamard8_intra16_msa;
    c->hadamard8_diff[5] = ff_hadamard8_intra8x8_msa;
#endif
}
|
||||
#endif // #if HAVE_MSA
|
||||
|
||||
/*
 * MIPS hook called from ff_me_cmp_init(): when the build has MSA support
 * (HAVE_MSA), hand the context to me_cmp_msa(); otherwise do nothing.
 */
av_cold void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx)
{
#if HAVE_MSA
    me_cmp_msa(c, avctx);
#endif /* HAVE_MSA */
}
|
60
libavcodec/mips/me_cmp_mips.h
Normal file
60
libavcodec/mips/me_cmp_mips.h
Normal file
@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_MIPS_ME_CMP_MIPS_H
|
||||
#define AVCODEC_MIPS_ME_CMP_MIPS_H
|
||||
|
||||
#include "../mpegvideo.h"
|
||||
#include "libavcodec/bit_depth_template.c"
|
||||
|
||||
int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_hadamard8_diff16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_hadamard8_intra16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
|
||||
ptrdiff_t stride, int h);
|
||||
int ff_sse16_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
|
||||
ptrdiff_t stride, int i32Height);
|
||||
int ff_sse8_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
|
||||
ptrdiff_t stride, int i32Height);
|
||||
int ff_sse4_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
|
||||
ptrdiff_t stride, int i32Height);
|
||||
void ff_add_pixels8_msa(uint8_t *av_restrict pixels, int16_t *block,
|
||||
ptrdiff_t stride);
|
||||
|
||||
#endif // #ifndef AVCODEC_MIPS_ME_CMP_MIPS_H
|
686
libavcodec/mips/me_cmp_msa.c
Normal file
686
libavcodec/mips/me_cmp_msa.c
Normal file
@ -0,0 +1,686 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/mips/generic_macros_msa.h"
|
||||
#include "me_cmp_mips.h"
|
||||
|
||||
static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
|
||||
uint8_t *ref, int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
|
||||
ref += (4 * ref_stride);
|
||||
|
||||
PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
|
||||
src0, src1, ref0, ref1);
|
||||
sad += SAD_UB2_UH(src0, src1, ref0, ref1);
|
||||
}
|
||||
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
|
||||
uint8_t *ref, int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
v16u8 src0, src1, ref0, ref1;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
LD_UB2(src, src_stride, src0, src1);
|
||||
src += (2 * src_stride);
|
||||
LD_UB2(ref, ref_stride, ref0, ref1);
|
||||
ref += (2 * ref_stride);
|
||||
sad += SAD_UB2_UH(src0, src1, ref0, ref1);
|
||||
|
||||
LD_UB2(src, src_stride, src0, src1);
|
||||
src += (2 * src_stride);
|
||||
LD_UB2(ref, ref_stride, ref0, ref1);
|
||||
ref += (2 * ref_stride);
|
||||
sad += SAD_UB2_UH(src0, src1, ref0, ref1);
|
||||
}
|
||||
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
|
||||
int32_t src_stride,
|
||||
uint8_t *ref,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
v16u8 src0, src1, src2, src3, comp0, comp1;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 3); ht_cnt--;) {
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
|
||||
ref += (4 * ref_stride);
|
||||
|
||||
PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
|
||||
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
|
||||
SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
|
||||
SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
|
||||
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
|
||||
AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src0, src1, comp0, comp1);
|
||||
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
|
||||
ref += (4 * ref_stride);
|
||||
|
||||
PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
|
||||
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
|
||||
SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
|
||||
SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
|
||||
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
|
||||
AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src0, src1, comp0, comp1);
|
||||
}
|
||||
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
|
||||
int32_t src_stride,
|
||||
uint8_t *ref,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
v16u8 src0, src1, src2, src3, comp0, comp1;
|
||||
v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 3); ht_cnt--;) {
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
|
||||
LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
|
||||
ref += (4 * ref_stride);
|
||||
|
||||
AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src0, src1, comp0, comp1);
|
||||
AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src2, src3, comp0, comp1);
|
||||
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
|
||||
LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
|
||||
ref += (4 * ref_stride);
|
||||
|
||||
AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src0, src1, comp0, comp1);
|
||||
AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src2, src3, comp0, comp1);
|
||||
}
|
||||
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
|
||||
int32_t src_stride,
|
||||
uint8_t *ref,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
v16u8 src0, src1, src2, src3, comp0, comp1;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 3); ht_cnt--;) {
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
|
||||
ref += (4 * ref_stride);
|
||||
|
||||
PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
|
||||
PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
|
||||
PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
|
||||
AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src0, src1, comp0, comp1);
|
||||
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
|
||||
ref += (4 * ref_stride);
|
||||
|
||||
PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
|
||||
PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
|
||||
PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
|
||||
AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src0, src1, comp0, comp1);
|
||||
}
|
||||
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
|
||||
int32_t src_stride,
|
||||
uint8_t *ref,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
v16u8 src0, src1, src2, src3, comp0, comp1;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 3); ht_cnt--;) {
|
||||
LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
|
||||
ref += (5 * ref_stride);
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
|
||||
AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src0, src1, comp0, comp1);
|
||||
AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src2, src3, comp0, comp1);
|
||||
|
||||
ref4 = ref3;
|
||||
|
||||
LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
|
||||
ref += (3 * ref_stride);
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
|
||||
AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src0, src1, comp0, comp1);
|
||||
AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
|
||||
sad += SAD_UB2_UH(src2, src3, comp0, comp1);
|
||||
}
|
||||
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
|
||||
int32_t src_stride,
|
||||
uint8_t *ref,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
v16u8 src0, src1, src2, src3, temp0, temp1, diff;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4;
|
||||
v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
|
||||
v8u16 comp0, comp1, comp2, comp3;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
|
||||
ref += (4 * ref_stride);
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
|
||||
PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
|
||||
|
||||
VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
|
||||
comp0 = __msa_hadd_u_h(temp0, temp0);
|
||||
comp1 = __msa_hadd_u_h(temp1, temp1);
|
||||
comp0 += comp1;
|
||||
comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
|
||||
comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);
|
||||
|
||||
temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
|
||||
comp2 = __msa_hadd_u_h(temp0, temp0);
|
||||
comp1 += comp2;
|
||||
comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
|
||||
comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
|
||||
comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
|
||||
diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
|
||||
temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
|
||||
comp3 = __msa_hadd_u_h(temp1, temp1);
|
||||
comp2 += comp3;
|
||||
comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
|
||||
comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);
|
||||
|
||||
temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
|
||||
comp0 = __msa_hadd_u_h(temp0, temp0);
|
||||
comp3 += comp0;
|
||||
comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
|
||||
comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
|
||||
comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
|
||||
diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
}
|
||||
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
|
||||
int32_t src_stride,
|
||||
uint8_t *ref,
|
||||
int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
v16u8 src0, src1, src2, src3, comp, diff;
|
||||
v16u8 temp0, temp1, temp2, temp3;
|
||||
v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
|
||||
v8u16 comp0, comp1, comp2, comp3;
|
||||
v8u16 sad = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 3); ht_cnt--;) {
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
|
||||
LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
|
||||
ref += (5 * ref_stride);
|
||||
|
||||
ILVRL_B2_UB(ref14, ref04, temp0, temp1);
|
||||
comp0 = __msa_hadd_u_h(temp0, temp0);
|
||||
comp1 = __msa_hadd_u_h(temp1, temp1);
|
||||
ILVRL_B2_UB(ref10, ref00, temp2, temp3);
|
||||
comp2 = __msa_hadd_u_h(temp2, temp2);
|
||||
comp3 = __msa_hadd_u_h(temp3, temp3);
|
||||
comp0 += comp2;
|
||||
comp1 += comp3;
|
||||
SRARI_H2_UH(comp0, comp1, 2);
|
||||
comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
|
||||
diff = __msa_asub_u_b(src0, comp);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
|
||||
ILVRL_B2_UB(ref11, ref01, temp0, temp1);
|
||||
comp0 = __msa_hadd_u_h(temp0, temp0);
|
||||
comp1 = __msa_hadd_u_h(temp1, temp1);
|
||||
comp2 += comp0;
|
||||
comp3 += comp1;
|
||||
SRARI_H2_UH(comp2, comp3, 2);
|
||||
comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
|
||||
diff = __msa_asub_u_b(src1, comp);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
|
||||
ILVRL_B2_UB(ref12, ref02, temp2, temp3);
|
||||
comp2 = __msa_hadd_u_h(temp2, temp2);
|
||||
comp3 = __msa_hadd_u_h(temp3, temp3);
|
||||
comp0 += comp2;
|
||||
comp1 += comp3;
|
||||
SRARI_H2_UH(comp0, comp1, 2);
|
||||
comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
|
||||
diff = __msa_asub_u_b(src2, comp);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
|
||||
ILVRL_B2_UB(ref13, ref03, temp0, temp1);
|
||||
comp0 = __msa_hadd_u_h(temp0, temp0);
|
||||
comp1 = __msa_hadd_u_h(temp1, temp1);
|
||||
comp2 += comp0;
|
||||
comp3 += comp1;
|
||||
SRARI_H2_UH(comp2, comp3, 2);
|
||||
comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
|
||||
diff = __msa_asub_u_b(src3, comp);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
|
||||
LD_UB4(src, src_stride, src0, src1, src2, src3);
|
||||
src += (4 * src_stride);
|
||||
LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
|
||||
LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
|
||||
ref += (3 * ref_stride);
|
||||
|
||||
ILVRL_B2_UB(ref10, ref00, temp2, temp3);
|
||||
comp2 = __msa_hadd_u_h(temp2, temp2);
|
||||
comp3 = __msa_hadd_u_h(temp3, temp3);
|
||||
comp0 += comp2;
|
||||
comp1 += comp3;
|
||||
SRARI_H2_UH(comp0, comp1, 2);
|
||||
comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
|
||||
diff = __msa_asub_u_b(src0, comp);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
|
||||
ILVRL_B2_UB(ref11, ref01, temp0, temp1);
|
||||
comp0 = __msa_hadd_u_h(temp0, temp0);
|
||||
comp1 = __msa_hadd_u_h(temp1, temp1);
|
||||
comp2 += comp0;
|
||||
comp3 += comp1;
|
||||
SRARI_H2_UH(comp2, comp3, 2);
|
||||
comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
|
||||
diff = __msa_asub_u_b(src1, comp);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
|
||||
ILVRL_B2_UB(ref12, ref02, temp2, temp3);
|
||||
comp2 = __msa_hadd_u_h(temp2, temp2);
|
||||
comp3 = __msa_hadd_u_h(temp3, temp3);
|
||||
comp0 += comp2;
|
||||
comp1 += comp3;
|
||||
SRARI_H2_UH(comp0, comp1, 2);
|
||||
comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
|
||||
diff = __msa_asub_u_b(src2, comp);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
|
||||
ILVRL_B2_UB(ref13, ref03, temp0, temp1);
|
||||
comp0 = __msa_hadd_u_h(temp0, temp0);
|
||||
comp1 = __msa_hadd_u_h(temp1, temp1);
|
||||
comp2 += comp0;
|
||||
comp3 += comp1;
|
||||
SRARI_H2_UH(comp2, comp3, 2);
|
||||
comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
|
||||
diff = __msa_asub_u_b(src3, comp);
|
||||
sad += __msa_hadd_u_h(diff, diff);
|
||||
}
|
||||
|
||||
return (HADD_UH_U32(sad));
|
||||
}
|
||||
|
||||
/*
 * Accumulate the squared byte differences between vectors 'src' and 'ref'
 * into the word accumulator 'var': the bytes are interleaved, subtracted
 * pairwise into signed halfwords (HSUB) and each halfword vector is
 * dot-product-accumulated with itself (DPADD).
 * Expands to a brace block, so use it as a full statement.
 */
#define CALC_MSE_B(src, ref, var)                                       \
{                                                                       \
    v16u8 mse_ilv_r_m, mse_ilv_l_m;                                     \
    v8i16 mse_dif_r_m, mse_dif_l_m;                                     \
                                                                        \
    ILVRL_B2_UB(src, ref, mse_ilv_r_m, mse_ilv_l_m);                    \
    HSUB_UB2_SH(mse_ilv_r_m, mse_ilv_l_m, mse_dif_r_m, mse_dif_l_m);    \
    DPADD_SH2_SW(mse_dif_r_m, mse_dif_l_m, mse_dif_r_m, mse_dif_l_m,    \
                 var, var);                                             \
}
|
||||
|
||||
static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
|
||||
uint8_t *ref_ptr, int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
uint32_t sse;
|
||||
uint32_t src0, src1, src2, src3;
|
||||
uint32_t ref0, ref1, ref2, ref3;
|
||||
v16u8 src = { 0 };
|
||||
v16u8 ref = { 0 };
|
||||
v4i32 var = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
LW4(src_ptr, src_stride, src0, src1, src2, src3);
|
||||
src_ptr += (4 * src_stride);
|
||||
LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
|
||||
ref_ptr += (4 * ref_stride);
|
||||
|
||||
INSERT_W4_UB(src0, src1, src2, src3, src);
|
||||
INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
|
||||
CALC_MSE_B(src, ref, var);
|
||||
}
|
||||
|
||||
sse = HADD_SW_S32(var);
|
||||
|
||||
return sse;
|
||||
}
|
||||
|
||||
static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
|
||||
uint8_t *ref_ptr, int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
uint32_t sse;
|
||||
v16u8 src0, src1, src2, src3;
|
||||
v16u8 ref0, ref1, ref2, ref3;
|
||||
v4i32 var = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
|
||||
src_ptr += (4 * src_stride);
|
||||
LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
|
||||
ref_ptr += (4 * ref_stride);
|
||||
|
||||
PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
|
||||
src0, src1, ref0, ref1);
|
||||
CALC_MSE_B(src0, ref0, var);
|
||||
CALC_MSE_B(src1, ref1, var);
|
||||
}
|
||||
|
||||
sse = HADD_SW_S32(var);
|
||||
|
||||
return sse;
|
||||
}
|
||||
|
||||
static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
|
||||
uint8_t *ref_ptr, int32_t ref_stride,
|
||||
int32_t height)
|
||||
{
|
||||
int32_t ht_cnt;
|
||||
uint32_t sse;
|
||||
v16u8 src, ref;
|
||||
v4i32 var = { 0 };
|
||||
|
||||
for (ht_cnt = (height >> 2); ht_cnt--;) {
|
||||
src = LD_UB(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
ref = LD_UB(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
CALC_MSE_B(src, ref, var);
|
||||
|
||||
src = LD_UB(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
ref = LD_UB(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
CALC_MSE_B(src, ref, var);
|
||||
|
||||
src = LD_UB(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
ref = LD_UB(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
CALC_MSE_B(src, ref, var);
|
||||
|
||||
src = LD_UB(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
ref = LD_UB(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
CALC_MSE_B(src, ref, var);
|
||||
}
|
||||
|
||||
sse = HADD_SW_S32(var);
|
||||
|
||||
return sse;
|
||||
}
|
||||
|
||||
static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
|
||||
uint8_t *ref, int32_t ref_stride)
|
||||
{
|
||||
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
|
||||
v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
|
||||
v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
v8i16 sum = { 0 };
|
||||
v8i16 zero = { 0 };
|
||||
|
||||
LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
|
||||
ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
|
||||
src4, ref4, src5, ref5, src6, ref6, src7, ref7,
|
||||
diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
|
||||
HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
|
||||
HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
|
||||
TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
|
||||
diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
|
||||
BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
|
||||
temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
|
||||
BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
|
||||
diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
|
||||
BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
|
||||
temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
|
||||
TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
|
||||
temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
|
||||
BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
|
||||
diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
|
||||
BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
|
||||
temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
|
||||
ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
|
||||
diff0, diff1, diff2, diff3);
|
||||
sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
|
||||
sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
|
||||
sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
|
||||
sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
|
||||
sum += __msa_add_a_h((v8i16) diff0, zero);
|
||||
sum += __msa_add_a_h((v8i16) diff1, zero);
|
||||
sum += __msa_add_a_h((v8i16) diff2, zero);
|
||||
sum += __msa_add_a_h((v8i16) diff3, zero);
|
||||
|
||||
return (HADD_UH_U32(sum));
|
||||
}
|
||||
|
||||
static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
|
||||
uint8_t *ref, int32_t ref_stride)
|
||||
{
|
||||
int32_t sum_res = 0;
|
||||
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
|
||||
v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
v8i16 sum = { 0 };
|
||||
v16i8 zero = { 0 };
|
||||
|
||||
LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
|
||||
src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
|
||||
zero, src4, zero, src5, zero, src6, zero, src7,
|
||||
diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
|
||||
BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
|
||||
temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
|
||||
BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
|
||||
diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
|
||||
BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
|
||||
temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
|
||||
TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
|
||||
temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
|
||||
BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
|
||||
diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
|
||||
BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
|
||||
temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
|
||||
ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
|
||||
diff0, diff1, diff2, diff3);
|
||||
sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
|
||||
sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
|
||||
sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
|
||||
sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
|
||||
sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
|
||||
sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
|
||||
sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
|
||||
sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
|
||||
sum_res = (HADD_UH_U32(sum));
|
||||
sum_res -= abs(temp0[0] + temp4[0]);
|
||||
|
||||
return sum_res;
|
||||
}
|
||||
|
||||
/* 16-wide SAD entry point: both blocks use the same stride; the context
 * argument v is not consulted. */
int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                     ptrdiff_t stride, int height)
{
    const uint32_t total = sad_16width_msa(src, stride, ref, stride, height);

    return total;
}
|
||||
|
||||
/* 8-wide SAD entry point: both blocks use the same stride; the context
 * argument v is not consulted. */
int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                    ptrdiff_t stride, int height)
{
    const uint32_t total = sad_8width_msa(src, stride, ref, stride, height);

    return total;
}
|
||||
|
||||
/* 16-wide SAD of pix1 against the horizontally averaged pix2; v is unused
 * and both blocks share one stride. */
int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    const uint32_t total =
        sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);

    return total;
}
|
||||
|
||||
/* 16-wide SAD of pix1 against the vertically averaged pix2; v is unused
 * and both blocks share one stride. */
int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    const uint32_t total =
        sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);

    return total;
}
|
||||
|
||||
/* 16-wide SAD of pix1 against pix2 averaged in both directions; v is
 * unused and both blocks share one stride. */
int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    const uint32_t total =
        sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);

    return total;
}
|
||||
|
||||
/* 8-wide SAD of pix1 against the horizontally averaged pix2; v is unused
 * and both blocks share one stride. */
int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    const uint32_t total =
        sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);

    return total;
}
|
||||
|
||||
/* 8-wide SAD of pix1 against the vertically averaged pix2; v is unused
 * and both blocks share one stride. */
int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    const uint32_t total =
        sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);

    return total;
}
|
||||
|
||||
/* 8-wide SAD of pix1 against pix2 averaged in both directions; v is
 * unused and both blocks share one stride. */
int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    const uint32_t total =
        sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);

    return total;
}
|
||||
|
||||
/* 16-wide sum of squared errors; v is unused and both blocks share one
 * stride. */
int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                 ptrdiff_t stride, int height)
{
    const uint32_t total = sse_16width_msa(src, stride, ref, stride, height);

    return total;
}
|
||||
|
||||
/*
 * me_cmp entry point that forwards to sse_8width_msa().
 *
 * v      - encoder context; not used by this wrapper
 * src    - pixel buffer handed to the helper as its first (source) pointer
 * ref    - pixel buffer handed to the helper as its second (reference) pointer
 * stride - stride value, used for both buffers
 * height - forwarded unchanged as the helper's last argument
 *
 * Returns the helper's result converted to int.
 */
int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    const int result = sse_8width_msa(src, stride, ref, stride, height);

    return result;
}
|
||||
|
||||
/*
 * me_cmp entry point that forwards to sse_4width_msa().
 *
 * v      - encoder context; not used by this wrapper
 * src    - pixel buffer handed to the helper as its first (source) pointer
 * ref    - pixel buffer handed to the helper as its second (reference) pointer
 * stride - stride value, used for both buffers
 * height - forwarded unchanged as the helper's last argument
 *
 * Returns the helper's result converted to int.
 */
int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    const int result = sse_4width_msa(src, stride, ref, stride, height);

    return result;
}
|
||||
|
||||
/*
 * me_cmp entry point that forwards to hadamard_diff_8x8_msa().
 *
 * s      - encoder context; not used by this wrapper
 * dst    - pixel buffer handed to the helper as its SECOND pointer
 * src    - pixel buffer handed to the helper as its FIRST pointer
 * stride - stride value, used for both buffers
 * h      - not used; the helper is called without a height argument
 *
 * Note that the two pixel pointers are passed to the helper in the opposite
 * order from the one in which this function receives them.
 *
 * Returns the helper's result converted to int.
 */
int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int h)
{
    const int result = hadamard_diff_8x8_msa(src, stride, dst, stride);

    return result;
}
|
||||
|
||||
/*
 * me_cmp entry point that forwards to hadamard_intra_8x8_msa().
 *
 * s      - encoder context; not used by this wrapper
 * dst    - pixel buffer handed to the helper as its SECOND pointer
 * src    - pixel buffer handed to the helper as its FIRST pointer
 * stride - stride value, used for both buffers
 * h      - not used; the helper is called without a height argument
 *
 * NOTE(review): the pixel pointers reach the helper in swapped order (the
 * second argument of this function becomes the helper's first).  If the
 * helper only reads its first pointer, it is reading this function's second
 * pixel argument - confirm against hadamard_intra_8x8_msa() and the callers.
 *
 * Returns the helper's result converted to int.
 */
int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                              ptrdiff_t stride, int h)
{
    const int result = hadamard_intra_8x8_msa(src, stride, dst, stride);

    return result;
}
|
||||
|
||||
/* Hadamard Transform functions */

/*
 * WRAPPER8_16_SQ(name8, name16) defines the function name16(), which builds
 * its score by calling name8() on 8-column sub-blocks:
 *   - always on the block at (dst, src) and the one 8 bytes to its right;
 *   - when h == 16, additionally on the two blocks located 8 * stride bytes
 *     further on in both buffers.
 * Every call passes 8 as the height argument.  The returned score is the sum
 * of the individual name8() results.
 */
#define WRAPPER8_16_SQ(name8, name16)                                   \
int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,               \
           ptrdiff_t stride, int h)                                     \
{                                                                       \
    int total = name8(s, dst, src, stride, 8);                          \
                                                                        \
    total += name8(s, dst + 8, src + 8, stride, 8);                     \
    if (h == 16) {                                                      \
        uint8_t *dst_lower = dst + 8 * stride;                          \
        uint8_t *src_lower = src + 8 * stride;                          \
                                                                        \
        total += name8(s, dst_lower, src_lower, stride, 8);             \
        total += name8(s, dst_lower + 8, src_lower + 8, stride, 8);     \
    }                                                                   \
    return total;                                                       \
}
|
||||
|
||||
WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa);
|
||||
WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa);
|
@ -1295,6 +1295,29 @@
|
||||
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
|
||||
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
|
||||
|
||||
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1   (unsigned byte source vectors)
                           ref0, ref1 (unsigned byte reference vectors)
                 Return Type - unsigned halfword vector (v8u16)
   Details     : The per-byte absolute differences of 'in0' with 'ref0' and
                 of 'in1' with 'ref1' are computed into 'diff0_m' and
                 'diff1_m'. Within each difference vector the even-odd byte
                 pairs are added together to generate 8 halfword results,
                 and the results of both vectors are accumulated into the
                 value of the macro.
                 This is a GNU statement expression; each argument is
                 evaluated exactly once and is parenthesized before being
                 cast, so arbitrary expressions may be passed.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) (in0), (v16u8) (ref0));    \
    diff1_m = __msa_asub_u_b((v16u8) (in1), (v16u8) (ref1));    \
                                                                \
    sad_m += __msa_hadd_u_h(diff0_m, diff0_m);                  \
    sad_m += __msa_hadd_u_h(diff1_m, diff1_m);                  \
                                                                \
    sad_m;                                                      \
} )
|
||||
|
||||
/* Description : Insert specified word elements from input vectors to 1
|
||||
destination vector
|
||||
Arguments : Inputs - in0, in1, in2, in3 (4 input vectors)
|
||||
@ -2429,6 +2452,42 @@
|
||||
}
|
||||
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
|
||||
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
|
||||
|
||||
/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : The inputs are consumed in four groups,
                 (in0, in4, in8, in12), (in1, in5, in9, in13),
                 (in2, in6, in10, in14) and (in3, in7, in11, in15).
                 Each group is reduced to one vector with ILVEV_W2_SD
                 followed by an even-doubleword interleave. The four
                 resulting vectors are then combined with even/odd byte
                 interleaves followed by even/odd halfword interleaves to
                 form out0..out3.
                 'out1' and 'out3' double as scratch storage: they are
                 assigned before the later input groups are read, so they
                 must not name an input that is still to be read.
                 NOTE(review): ILVEV_W2_SD and ILVEV_B2_SD are defined
                 elsewhere; they are presumably even-word / even-byte
                 interleaves - confirm against their definitions.
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 t0_m, t1_m, t2_m, t3_m;                                          \
                                                                           \
    /* group 0 -> out1 (scratch use) */                                    \
    ILVEV_W2_SD(in0, in4, in8, in12, t0_m, t1_m);                          \
    out1 = (v16u8) __msa_ilvev_d(t1_m, t0_m);                              \
                                                                           \
    /* group 1 -> out3 (scratch use) */                                    \
    ILVEV_W2_SD(in1, in5, in9, in13, t0_m, t1_m);                          \
    out3 = (v16u8) __msa_ilvev_d(t1_m, t0_m);                              \
                                                                           \
    /* group 2 -> t2_m */                                                  \
    ILVEV_W2_SD(in2, in6, in10, in14, t0_m, t1_m);                         \
    t2_m = __msa_ilvev_d(t1_m, t0_m);                                      \
                                                                           \
    /* group 3 -> t3_m */                                                  \
    ILVEV_W2_SD(in3, in7, in11, in15, t0_m, t1_m);                         \
    t3_m = __msa_ilvev_d(t1_m, t0_m);                                      \
                                                                           \
    /* even bytes of the four group vectors -> out0, out2 */               \
    ILVEV_B2_SD(out1, out3, t2_m, t3_m, t0_m, t1_m);                       \
    out0 = (v16u8) __msa_ilvev_h((v8i16) t1_m, (v8i16) t0_m);              \
    out2 = (v16u8) __msa_ilvod_h((v8i16) t1_m, (v8i16) t0_m);              \
                                                                           \
    /* odd bytes of the four group vectors -> out1, out3 */                \
    t0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);              \
    t1_m = (v2i64) __msa_ilvod_b((v16i8) t3_m, (v16i8) t2_m);              \
    out1 = (v16u8) __msa_ilvev_h((v8i16) t1_m, (v8i16) t0_m);              \
    out3 = (v16u8) __msa_ilvod_h((v8i16) t1_m, (v8i16) t0_m);              \
}
|
||||
|
||||
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
|
||||
in8, in9, in10, in11, in12, in13, in14, in15
|
||||
|
Loading…
x
Reference in New Issue
Block a user