x86: dcadsp: implement int8x8_fmul_int32
For the callable function (as opposed to the inline one):

           C   SSE  SSE2  SSE4
    Win32: 47   42    29    26
    Win64: 30   33    25    23

The SSE version is neither compiled nor set for ARCH_X86_64, as the inlinable function takes over.

Signed-off-by: Janne Grunau <janne-libav@jannau.net>
This commit is contained in:
parent
2bd44cb705
commit
5b59a9fc61
@ -50,6 +50,9 @@
|
||||
#if ARCH_ARM
|
||||
# include "arm/dca.h"
|
||||
#endif
|
||||
#if ARCH_X86
|
||||
# include "x86/dca.h"
|
||||
#endif
|
||||
|
||||
//#define TRACE
|
||||
|
||||
|
@ -88,4 +88,5 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
|
||||
s->qmf_32_subbands = dca_qmf_32_subbands;
|
||||
s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
|
||||
if (ARCH_ARM) ff_dcadsp_init_arm(s);
|
||||
if (ARCH_X86) ff_dcadsp_init_x86(s);
|
||||
}
|
||||
|
@ -36,5 +36,6 @@ typedef struct DCADSPContext {
|
||||
|
||||
void ff_dcadsp_init(DCADSPContext *s);
|
||||
void ff_dcadsp_init_arm(DCADSPContext *s);
|
||||
void ff_dcadsp_init_x86(DCADSPContext *s);
|
||||
|
||||
#endif /* AVCODEC_DCADSP_H */
|
||||
|
@ -4,6 +4,7 @@ OBJS += x86/constants.o \
|
||||
OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
|
||||
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
|
||||
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
|
||||
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
|
||||
OBJS-$(CONFIG_DCT) += x86/dct_init.o
|
||||
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
|
||||
OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o \
|
||||
@ -54,6 +55,7 @@ YASM-OBJS += x86/deinterlace.o \
|
||||
|
||||
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
|
||||
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
|
||||
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
|
||||
YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
|
||||
YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
|
||||
x86/fpel.o \
|
||||
|
52
libavcodec/x86/dca.h
Normal file
52
libavcodec/x86/dca.h
Normal file
@ -0,0 +1,52 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/*
 * The inline version needs SSE2 inline assembly support from the compiler;
 * without it the DCADSPContext function pointer is used instead.
 */
#if ARCH_X86_64 && HAVE_SSE2_INLINE
# include "libavutil/x86/asm.h"
# include "libavutil/mem.h"

/* Self-referential define so that the presence of the inline version can be
 * detected with #ifdef (presumably by the including decoder -- confirm). */
# define int8x8_fmul_int32 int8x8_fmul_int32

/**
 * Convert 8 signed bytes to float and scale them.
 *
 * Computes dst[i] = src[i] * (scale * (1.0f / 16)) for i = 0..7.
 *
 * @param dsp   unused; kept so the inline version has the same call shape
 *              as the other int8x8_fmul_int32 entry point
 * @param dst   output, 8 floats; written with movaps, so it must be
 *              16-byte aligned
 * @param src   input, 8 signed bytes (read with a single 64-bit load)
 * @param scale integer scale factor, converted to float before use
 */
static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
                                     float *dst, const int8_t *src, int scale)
{
    /* 0x3D800000 is the IEEE-754 single-precision bit pattern of 1/16. */
    DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000;
    __asm__ volatile (
        "cvtsi2ss        %2, %%xmm0 \n\t" /* xmm0[0] = (float)scale        */
        "mulss           %3, %%xmm0 \n\t" /* xmm0[0] *= 1/16               */
        "movq          (%1), %%xmm1 \n\t" /* load the 8 source bytes       */
        /* Replicate every byte into all four bytes of its own dword, then
         * arithmetic-shift right by 24 to obtain the sign-extended value. */
        "punpcklbw   %%xmm1, %%xmm1 \n\t"
        "movaps      %%xmm1, %%xmm2 \n\t"
        "punpcklwd   %%xmm1, %%xmm1 \n\t" /* bytes 0..3 */
        "punpckhwd   %%xmm2, %%xmm2 \n\t" /* bytes 4..7 */
        "psrad          $24, %%xmm1 \n\t"
        "psrad          $24, %%xmm2 \n\t"
        "shufps  $0, %%xmm0, %%xmm0 \n\t" /* broadcast the scale factor    */
        "cvtdq2ps    %%xmm1, %%xmm1 \n\t"
        "cvtdq2ps    %%xmm2, %%xmm2 \n\t"
        "mulps       %%xmm0, %%xmm1 \n\t"
        "mulps       %%xmm0, %%xmm2 \n\t"
        "movaps      %%xmm1,  0(%0) \n\t"
        "movaps      %%xmm2, 16(%0) \n\t"
        :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16)
        /* dst is only passed as an address, so the stores above must be
         * announced through the "memory" clobber; otherwise the compiler
         * may assume *dst is unchanged across this statement. */
        : XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "memory"
    );
}

#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */
|
90
libavcodec/x86/dcadsp.asm
Normal file
90
libavcodec/x86/dcadsp.asm
Normal file
@ -0,0 +1,90 @@
|
||||
;******************************************************************************
|
||||
;* SSE-optimized functions for the DCA decoder
|
||||
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
|
||||
;*
|
||||
;* This file is part of Libav.
|
||||
;*
|
||||
;* Libav is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* Libav is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with Libav; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
pf_inv16: times 4 dd 0x3D800000 ; 1/16
|
||||
|
||||
SECTION_TEXT
|
||||
|
||||
; void ff_int8x8_fmul_int32_<opt>(float *dst, const int8_t *src, int scale)
;
; Instantiated below for sse (x86-32 only), sse2 and sse4.
; Converts the 8 signed bytes at src to float, multiplies each of them by
; scale * (1/16) and stores the 8 results at dst. The stores use mova, so
; dst has to be 16-byte aligned.
%macro INT8X8_FMUL_INT32 0
cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
    cvtsi2ss    m0, scalem
    mulss       m0, [pf_inv16]
    shufps      m0, m0, 0           ; broadcast scale/16 to all four lanes
%if cpuflag(sse2)
%if cpuflag(sse4)
    pmovsxbd    m1, [srcq+0]        ; sign-extend bytes 0..3 to dwords
    pmovsxbd    m2, [srcq+4]        ; sign-extend bytes 4..7 to dwords
%else
    ; Replicate each byte into all four bytes of its own dword, then shift
    ; right arithmetically by 24 to get the sign-extended value.
    movq        m1, [srcq]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1
    punpckhwd   m2, m2
    psrad       m1, 24
    psrad       m2, 24
%endif
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
%else
    ; Plain SSE has no integer operations on xmm registers: do the same
    ; widening in MMX registers, two values per register.
    movd       mm0, [srcq+0]
    movd       mm1, [srcq+4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
    movq       mm3, mm1
    punpcklwd  mm0, mm0             ; values 0,1
    punpcklwd  mm1, mm1             ; values 4,5
    punpckhwd  mm2, mm2             ; values 2,3
    punpckhwd  mm3, mm3             ; values 6,7
    psrad      mm0, 24
    psrad      mm1, 24
    psrad      mm2, 24
    psrad      mm3, 24
    cvtpi2ps    m1, mm0             ; only the low two floats are written
    cvtpi2ps    m2, mm1
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    ; m0 was already broadcast right after the mulss above, so no second
    ; shufps on m0 is needed here.
    emms
    shufps      m1, m3, q1010       ; m1 = values 0,1,2,3
    shufps      m2, m4, q1010       ; m2 = values 4,5,6,7
%endif
    mulps       m1, m0
    mulps       m2, m0
    mova [dstq+ 0], m1
    mova [dstq+16], m2
    REP_RET
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_XMM sse
|
||||
INT8X8_FMUL_INT32
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
INT8X8_FMUL_INT32
|
||||
|
||||
INIT_XMM sse4
|
||||
INT8X8_FMUL_INT32
|
47
libavcodec/x86/dcadsp_init.c
Normal file
47
libavcodec/x86/dcadsp_init.c
Normal file
@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/dcadsp.h"
|
||||
|
||||
void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale);
|
||||
void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale);
|
||||
void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale);
|
||||
|
||||
/**
 * Install the x86 assembly implementations into a DCADSPContext.
 *
 * The checks run in the order SSE, SSE2, SSE4 and each one that matches
 * overwrites s->int8x8_fmul_int32, so the last matching one is kept.
 * The SSE variant is only considered on 32-bit x86.
 *
 * @param s context whose int8x8_fmul_int32 pointer may be replaced
 */
av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
    const int flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_SSE(flags)) {
        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse;
    }
#endif

    if (EXTERNAL_SSE2(flags)) {
        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse2;
    }

    if (EXTERNAL_SSE4(flags)) {
        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse4;
    }
}
|
Loading…
Reference in New Issue
Block a user