From 209f50e16b5e66424d593ba4f9d4d8be5feff947 Mon Sep 17 00:00:00 2001 From: James Almer Date: Sun, 24 Jan 2016 23:52:19 -0300 Subject: [PATCH] avcodec/synth_filter: split off remaining code from dcadec files Signed-off-by: James Almer --- libavcodec/aarch64/Makefile | 3 +- libavcodec/aarch64/dcadsp_init.c | 21 --- libavcodec/aarch64/synth_filter_init.c | 47 +++++ libavcodec/arm/Makefile | 3 +- libavcodec/arm/dcadsp_init_arm.c | 22 --- libavcodec/arm/synth_filter_init_arm.c | 49 +++++ libavcodec/x86/Makefile | 6 +- libavcodec/x86/dcadsp.asm | 222 ---------------------- libavcodec/x86/dcadsp_init.c | 51 ----- libavcodec/x86/synth_filter.asm | 246 +++++++++++++++++++++++++ libavcodec/x86/synth_filter_init.c | 74 ++++++++ 11 files changed, 424 insertions(+), 320 deletions(-) create mode 100644 libavcodec/aarch64/synth_filter_init.c create mode 100644 libavcodec/arm/synth_filter_init_arm.c create mode 100644 libavcodec/x86/synth_filter.asm create mode 100644 libavcodec/x86/synth_filter_init.c diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 022ed847a3..99f590c650 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -1,4 +1,5 @@ -OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_init.o +OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_init.o \ + aarch64/synth_filter_init.o OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/dcadsp_init.c index 78642a5ed8..4440e4b95f 100644 --- a/libavcodec/aarch64/dcadsp_init.c +++ b/libavcodec/aarch64/dcadsp_init.c @@ -24,23 +24,10 @@ #include "libavutil/attributes.h" #include "libavutil/internal.h" #include "libavcodec/dcadsp.h" -#include "libavcodec/fft.h" - -#include "asm-offsets.h" - -#if HAVE_NEON || HAVE_VFP -AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF); -#endif void ff_dca_lfe_fir0_neon(float *out, 
const float *in, const float *coefs); void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs); -void ff_synth_filter_float_neon(FFTContext *imdct, - float *synth_buf_ptr, int *synth_buf_offset, - float synth_buf2[32], const float window[512], - float out[32], const float in[32], - float scale); - av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -50,11 +37,3 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s) s->lfe_fir[1] = ff_dca_lfe_fir1_neon; } } - -av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) - s->synth_filter_float = ff_synth_filter_float_neon; -} diff --git a/libavcodec/aarch64/synth_filter_init.c b/libavcodec/aarch64/synth_filter_init.c new file mode 100644 index 0000000000..767b01112a --- /dev/null +++ b/libavcodec/aarch64/synth_filter_init.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/aarch64/cpu.h" +#include "libavutil/attributes.h" +#include "libavutil/internal.h" +#include "libavcodec/fft.h" +#include "libavcodec/synth_filter.h" + +#include "asm-offsets.h" + +#if HAVE_NEON || HAVE_VFP +AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF); +#endif + +void ff_synth_filter_float_neon(FFTContext *imdct, + float *synth_buf_ptr, int *synth_buf_offset, + float synth_buf2[32], const float window[512], + float out[32], const float in[32], + float scale); + +av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + s->synth_filter_float = ff_synth_filter_float_neon; +} diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index cdd35b08ea..6a29a5fbb7 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -36,7 +36,8 @@ OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_arm.o # decoders/encoders OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ arm/sbrdsp_init_arm.o -OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o +OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \ + arm/synth_filter_init_arm.o OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c index 0f2e4c49c9..febb4445d2 100644 --- a/libavcodec/arm/dcadsp_init_arm.c +++ b/libavcodec/arm/dcadsp_init_arm.c @@ -37,18 +37,6 @@ void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act, const float window[512], float *samples_out, float raXin[32], float scale); -void ff_synth_filter_float_vfp(FFTContext *imdct, - float *synth_buf_ptr, int 
*synth_buf_offset, - float synth_buf2[32], const float window[512], - float out[32], const float in[32], - float scale); - -void ff_synth_filter_float_neon(FFTContext *imdct, - float *synth_buf_ptr, int *synth_buf_offset, - float synth_buf2[32], const float window[512], - float out[32], const float in[32], - float scale); - av_cold void ff_dcadsp_init_arm(DCADSPContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -63,13 +51,3 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s) s->lfe_fir[1] = ff_dca_lfe_fir1_neon; } } - -av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_vfp_vm(cpu_flags)) - s->synth_filter_float = ff_synth_filter_float_vfp; - if (have_neon(cpu_flags)) - s->synth_filter_float = ff_synth_filter_float_neon; -} diff --git a/libavcodec/arm/synth_filter_init_arm.c b/libavcodec/arm/synth_filter_init_arm.c new file mode 100644 index 0000000000..ea0ce148d4 --- /dev/null +++ b/libavcodec/arm/synth_filter_init_arm.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/arm/cpu.h" +#include "libavutil/attributes.h" +#include "libavutil/internal.h" +#include "libavcodec/fft.h" +#include "libavcodec/synth_filter.h" + +void ff_synth_filter_float_vfp(FFTContext *imdct, + float *synth_buf_ptr, int *synth_buf_offset, + float synth_buf2[32], const float window[512], + float out[32], const float in[32], + float scale); + +void ff_synth_filter_float_neon(FFTContext *imdct, + float *synth_buf_ptr, int *synth_buf_offset, + float synth_buf2[32], const float window[512], + float out[32], const float in[32], + float scale); + +av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_vfp_vm(cpu_flags)) + s->synth_filter_float = ff_synth_filter_float_vfp; + if (have_neon(cpu_flags)) + s->synth_filter_float = ff_synth_filter_float_neon; +} diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 0d09fe6663..bcb42332a0 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -44,7 +44,8 @@ OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o -OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o +OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o \ + x86/synth_filter_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o @@ -132,7 +133,8 @@ YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o YASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o YASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o 
-YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o +YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o \ + x86/synth_filter.o YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o \ x86/dwt_yasm.o YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm index 502b70a4cb..55e73bcc29 100644 --- a/libavcodec/x86/dcadsp.asm +++ b/libavcodec/x86/dcadsp.asm @@ -121,225 +121,3 @@ DCA_LFE_FIR 1 INIT_XMM fma3 DCA_LFE_FIR 0 %endif - -%macro SETZERO 1 -%if cpuflag(sse2) && notcpuflag(avx) - pxor %1, %1 -%else - xorps %1, %1, %1 -%endif -%endmacro - -%macro SHUF 3 -%if cpuflag(avx) - mova %3, [%2 - 16] - vperm2f128 %1, %3, %3, 1 - vshufps %1, %1, %1, q0123 -%elif cpuflag(sse2) - pshufd %1, [%2], q0123 -%else - mova %1, [%2] - shufps %1, %1, q0123 -%endif -%endmacro - -%macro INNER_LOOP 1 - ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i - ;~ a += window[i + j] * (-synth_buf[15 - i + j]) - ;~ b += window[i + j + 16] * (synth_buf[i + j]) - SHUF m5, ptr2 + j + (15 - 3) * 4, m6 - mova m6, [ptr1 + j] -%if ARCH_X86_64 - SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12 - mova m12, [ptr1 + j + mmsize] -%endif -%if cpuflag(fma3) - fmaddps m2, m6, [win + %1 + j + 16 * 4], m2 - fnmaddps m1, m5, [win + %1 + j], m1 -%if ARCH_X86_64 - fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8 - fnmaddps m7, m11, [win + %1 + j + mmsize], m7 -%endif -%else ; non-FMA - mulps m6, m6, [win + %1 + j + 16 * 4] - mulps m5, m5, [win + %1 + j] -%if ARCH_X86_64 - mulps m12, m12, [win + %1 + j + mmsize + 16 * 4] - mulps m11, m11, [win + %1 + j + mmsize] -%endif - addps m2, m2, m6 - subps m1, m1, m5 -%if ARCH_X86_64 - addps m8, m8, m12 - subps m7, m7, m11 -%endif -%endif ; cpuflag(fma3) - ;~ c += window[i + j + 32] * (synth_buf[16 + i + j]) - ;~ d += window[i + j + 48] * (synth_buf[31 - i + j]) - SHUF m6, ptr2 + j + (31 - 3) * 4, m5 - mova m5, [ptr1 + j + 16 * 4] -%if ARCH_X86_64 - SHUF m12, ptr2 + j + (31 - 
3) * 4 - mmsize, m11 - mova m11, [ptr1 + j + mmsize + 16 * 4] -%endif -%if cpuflag(fma3) - fmaddps m3, m5, [win + %1 + j + 32 * 4], m3 - fmaddps m4, m6, [win + %1 + j + 48 * 4], m4 -%if ARCH_X86_64 - fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9 - fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10 -%endif -%else ; non-FMA - mulps m5, m5, [win + %1 + j + 32 * 4] - mulps m6, m6, [win + %1 + j + 48 * 4] -%if ARCH_X86_64 - mulps m11, m11, [win + %1 + j + mmsize + 32 * 4] - mulps m12, m12, [win + %1 + j + mmsize + 48 * 4] -%endif - addps m3, m3, m5 - addps m4, m4, m6 -%if ARCH_X86_64 - addps m9, m9, m11 - addps m10, m10, m12 -%endif -%endif ; cpuflag(fma3) - sub j, 64 * 4 -%endmacro - -; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32], -; const float window[512], float out[32], -; intptr_t offset, float scale) -%macro SYNTH_FILTER 0 -cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \ - synth_buf, synth_buf2, window, out, off, scale -%define scale m0 -%if ARCH_X86_32 || WIN64 -%if cpuflag(sse2) && notcpuflag(avx) - movd scale, scalem - SPLATD m0 -%else - VBROADCASTSS m0, scalem -%endif -; Make sure offset is in a register and not on the stack -%define OFFQ r4q -%else - SPLATD xmm0 -%if cpuflag(avx) - vinsertf128 m0, m0, xmm0, 1 -%endif -%define OFFQ offq -%endif - ; prepare inner counter limit 1 - mov r5q, 480 - sub r5q, offmp - and r5q, -64 - shl r5q, 2 -%if ARCH_X86_32 || notcpuflag(avx) - mov OFFQ, r5q -%define i r5q - mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter -%else -%define i 0 -%define OFFQ r5q -%endif - -%define buf2 synth_buf2q -%if ARCH_X86_32 - mov buf2, synth_buf2mp -%endif -.mainloop: - ; m1 = a m2 = b m3 = c m4 = d - SETZERO m3 - SETZERO m4 - mova m1, [buf2 + i] - mova m2, [buf2 + i + 16 * 4] -%if ARCH_X86_32 -%define ptr1 r0q -%define ptr2 r1q -%define win r2q -%define j r3q - mov win, windowm - mov ptr1, synth_bufm -%if ARCH_X86_32 || notcpuflag(avx) - add win, i - add ptr1, i
-%endif -%else ; ARCH_X86_64 -%define ptr1 r6q -%define ptr2 r7q ; must be loaded -%define win r8q -%define j r9q - SETZERO m9 - SETZERO m10 - mova m7, [buf2 + i + mmsize] - mova m8, [buf2 + i + mmsize + 16 * 4] - lea win, [windowq + i] - lea ptr1, [synth_bufq + i] -%endif - mov ptr2, synth_bufmp - ; prepare the inner loop counter - mov j, OFFQ -%if ARCH_X86_32 || notcpuflag(avx) - sub ptr2, i -%endif -.loop1: - INNER_LOOP 0 - jge .loop1 - - mov j, 448 * 4 - sub j, OFFQ - jz .end - sub ptr1, j - sub ptr2, j - add win, OFFQ ; now at j-64, so define OFFSET - sub j, 64 * 4 -.loop2: - INNER_LOOP 64 * 4 - jge .loop2 - -.end: -%if ARCH_X86_32 - mov buf2, synth_buf2m ; needed for next iteration anyway - mov outq, outmp ; j, which will be set again during it -%endif - ;~ out[i] = a * scale; - ;~ out[i + 16] = b * scale; - mulps m1, m1, scale - mulps m2, m2, scale -%if ARCH_X86_64 - mulps m7, m7, scale - mulps m8, m8, scale -%endif - ;~ synth_buf2[i] = c; - ;~ synth_buf2[i + 16] = d; - mova [buf2 + i + 0 * 4], m3 - mova [buf2 + i + 16 * 4], m4 -%if ARCH_X86_64 - mova [buf2 + i + 0 * 4 + mmsize], m9 - mova [buf2 + i + 16 * 4 + mmsize], m10 -%endif - ;~ out[i] = a; - ;~ out[i + 16] = a; - mova [outq + i + 0 * 4], m1 - mova [outq + i + 16 * 4], m2 -%if ARCH_X86_64 - mova [outq + i + 0 * 4 + mmsize], m7 - mova [outq + i + 16 * 4 + mmsize], m8 -%endif -%if ARCH_X86_32 || notcpuflag(avx) - sub i, (ARCH_X86_64 + 1) * mmsize - jge .mainloop -%endif - RET -%endmacro - -%if ARCH_X86_32 -INIT_XMM sse -SYNTH_FILTER -%endif -INIT_XMM sse2 -SYNTH_FILTER -INIT_YMM avx -SYNTH_FILTER -INIT_YMM fma3 -SYNTH_FILTER diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c index 1321dda652..c27c045d1d 100644 --- a/libavcodec/x86/dcadsp_init.c +++ b/libavcodec/x86/dcadsp_init.c @@ -40,54 +40,3 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s) s->lfe_fir[0] = ff_dca_lfe_fir0_fma3; } } - - -#define SYNTH_FILTER_FUNC(opt) \ -void ff_synth_filter_inner_##opt(float *synth_buf_ptr, 
float synth_buf2[32], \ - const float window[512], \ - float out[32], intptr_t offset, float scale); \ -static void synth_filter_##opt(FFTContext *imdct, \ - float *synth_buf_ptr, int *synth_buf_offset, \ - float synth_buf2[32], const float window[512], \ - float out[32], const float in[32], float scale) \ -{ \ - float *synth_buf= synth_buf_ptr + *synth_buf_offset; \ - \ - imdct->imdct_half(imdct, synth_buf, in); \ - \ - ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \ - out, *synth_buf_offset, scale); \ - \ - *synth_buf_offset = (*synth_buf_offset - 32) & 511; \ -} \ - -#if HAVE_YASM -#if ARCH_X86_32 -SYNTH_FILTER_FUNC(sse) -#endif -SYNTH_FILTER_FUNC(sse2) -SYNTH_FILTER_FUNC(avx) -SYNTH_FILTER_FUNC(fma3) -#endif /* HAVE_YASM */ - -av_cold void ff_synth_filter_init_x86(SynthFilterContext *s) -{ -#if HAVE_YASM - int cpu_flags = av_get_cpu_flags(); - -#if ARCH_X86_32 - if (EXTERNAL_SSE(cpu_flags)) { - s->synth_filter_float = synth_filter_sse; - } -#endif - if (EXTERNAL_SSE2(cpu_flags)) { - s->synth_filter_float = synth_filter_sse2; - } - if (EXTERNAL_AVX_FAST(cpu_flags)) { - s->synth_filter_float = synth_filter_avx; - } - if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) { - s->synth_filter_float = synth_filter_fma3; - } -#endif /* HAVE_YASM */ -} diff --git a/libavcodec/x86/synth_filter.asm b/libavcodec/x86/synth_filter.asm new file mode 100644 index 0000000000..bc1a48f409 --- /dev/null +++ b/libavcodec/x86/synth_filter.asm @@ -0,0 +1,246 @@ +;****************************************************************************** +;* SSE-optimized functions for the DCA decoder +;* Copyright (C) 2012-2014 Christophe Gisquet +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%macro SETZERO 1 +%if cpuflag(sse2) && notcpuflag(avx) + pxor %1, %1 +%else + xorps %1, %1, %1 +%endif +%endmacro + +%macro SHUF 3 +%if cpuflag(avx) + mova %3, [%2 - 16] + vperm2f128 %1, %3, %3, 1 + vshufps %1, %1, %1, q0123 +%elif cpuflag(sse2) + pshufd %1, [%2], q0123 +%else + mova %1, [%2] + shufps %1, %1, q0123 +%endif +%endmacro + +%macro INNER_LOOP 1 + ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i + ;~ a += window[i + j] * (-synth_buf[15 - i + j]) + ;~ b += window[i + j + 16] * (synth_buf[i + j]) + SHUF m5, ptr2 + j + (15 - 3) * 4, m6 + mova m6, [ptr1 + j] +%if ARCH_X86_64 + SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12 + mova m12, [ptr1 + j + mmsize] +%endif +%if cpuflag(fma3) + fmaddps m2, m6, [win + %1 + j + 16 * 4], m2 + fnmaddps m1, m5, [win + %1 + j], m1 +%if ARCH_X86_64 + fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8 + fnmaddps m7, m11, [win + %1 + j + mmsize], m7 +%endif +%else ; non-FMA + mulps m6, m6, [win + %1 + j + 16 * 4] + mulps m5, m5, [win + %1 + j] +%if ARCH_X86_64 + mulps m12, m12, [win + %1 + j + mmsize + 16 * 4] + mulps m11, m11, [win + %1 + j + mmsize] +%endif + addps m2, m2, m6 + subps m1, m1, m5 +%if ARCH_X86_64 + addps m8, m8, m12 + subps m7, m7, m11 +%endif +%endif ; cpuflag(fma3) + ;~ c += window[i + j + 32] * (synth_buf[16 + i + j]) + ;~ d += window[i + j + 48] * (synth_buf[31 - i + j]) + SHUF 
m6, ptr2 + j + (31 - 3) * 4, m5 + mova m5, [ptr1 + j + 16 * 4] +%if ARCH_X86_64 + SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11 + mova m11, [ptr1 + j + mmsize + 16 * 4] +%endif +%if cpuflag(fma3) + fmaddps m3, m5, [win + %1 + j + 32 * 4], m3 + fmaddps m4, m6, [win + %1 + j + 48 * 4], m4 +%if ARCH_X86_64 + fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9 + fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10 +%endif +%else ; non-FMA + mulps m5, m5, [win + %1 + j + 32 * 4] + mulps m6, m6, [win + %1 + j + 48 * 4] +%if ARCH_X86_64 + mulps m11, m11, [win + %1 + j + mmsize + 32 * 4] + mulps m12, m12, [win + %1 + j + mmsize + 48 * 4] +%endif + addps m3, m3, m5 + addps m4, m4, m6 +%if ARCH_X86_64 + addps m9, m9, m11 + addps m10, m10, m12 +%endif +%endif ; cpuflag(fma3) + sub j, 64 * 4 +%endmacro + +; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32], +; const float window[512], float out[32], +; intptr_t offset, float scale) +%macro SYNTH_FILTER 0 +cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \ + synth_buf, synth_buf2, window, out, off, scale +%define scale m0 +%if ARCH_X86_32 || WIN64 +%if cpuflag(sse2) && notcpuflag(avx) + movd scale, scalem + SPLATD m0 +%else + VBROADCASTSS m0, scalem +%endif +; Make sure offset is in a register and not on the stack +%define OFFQ r4q +%else + SPLATD xmm0 +%if cpuflag(avx) + vinsertf128 m0, m0, xmm0, 1 +%endif +%define OFFQ offq +%endif + ; prepare inner counter limit 1 + mov r5q, 480 + sub r5q, offmp + and r5q, -64 + shl r5q, 2 +%if ARCH_X86_32 || notcpuflag(avx) + mov OFFQ, r5q +%define i r5q + mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter +%else +%define i 0 +%define OFFQ r5q +%endif + +%define buf2 synth_buf2q +%if ARCH_X86_32 + mov buf2, synth_buf2mp +%endif +.mainloop: + ; m1 = a m2 = b m3 = c m4 = d + SETZERO m3 + SETZERO m4 + mova m1, [buf2 + i] + mova m2, [buf2 + i + 16 * 4] +%if ARCH_X86_32 +%define ptr1 r0q +%define ptr2 r1q +%define win r2q +%define j r3q
+ mov win, windowm + mov ptr1, synth_bufm +%if ARCH_X86_32 || notcpuflag(avx) + add win, i + add ptr1, i +%endif +%else ; ARCH_X86_64 +%define ptr1 r6q +%define ptr2 r7q ; must be loaded +%define win r8q +%define j r9q + SETZERO m9 + SETZERO m10 + mova m7, [buf2 + i + mmsize] + mova m8, [buf2 + i + mmsize + 16 * 4] + lea win, [windowq + i] + lea ptr1, [synth_bufq + i] +%endif + mov ptr2, synth_bufmp + ; prepare the inner loop counter + mov j, OFFQ +%if ARCH_X86_32 || notcpuflag(avx) + sub ptr2, i +%endif +.loop1: + INNER_LOOP 0 + jge .loop1 + + mov j, 448 * 4 + sub j, OFFQ + jz .end + sub ptr1, j + sub ptr2, j + add win, OFFQ ; now at j-64, so define OFFSET + sub j, 64 * 4 +.loop2: + INNER_LOOP 64 * 4 + jge .loop2 + +.end: +%if ARCH_X86_32 + mov buf2, synth_buf2m ; needed for next iteration anyway + mov outq, outmp ; j, which will be set again during it +%endif + ;~ out[i] = a * scale; + ;~ out[i + 16] = b * scale; + mulps m1, m1, scale + mulps m2, m2, scale +%if ARCH_X86_64 + mulps m7, m7, scale + mulps m8, m8, scale +%endif + ;~ synth_buf2[i] = c; + ;~ synth_buf2[i + 16] = d; + mova [buf2 + i + 0 * 4], m3 + mova [buf2 + i + 16 * 4], m4 +%if ARCH_X86_64 + mova [buf2 + i + 0 * 4 + mmsize], m9 + mova [buf2 + i + 16 * 4 + mmsize], m10 +%endif + ;~ out[i] = a; + ;~ out[i + 16] = b; + mova [outq + i + 0 * 4], m1 + mova [outq + i + 16 * 4], m2 +%if ARCH_X86_64 + mova [outq + i + 0 * 4 + mmsize], m7 + mova [outq + i + 16 * 4 + mmsize], m8 +%endif +%if ARCH_X86_32 || notcpuflag(avx) + sub i, (ARCH_X86_64 + 1) * mmsize + jge .mainloop +%endif + RET +%endmacro + +%if ARCH_X86_32 +INIT_XMM sse +SYNTH_FILTER +%endif +INIT_XMM sse2 +SYNTH_FILTER +INIT_YMM avx +SYNTH_FILTER +INIT_YMM fma3 +SYNTH_FILTER diff --git a/libavcodec/x86/synth_filter_init.c b/libavcodec/x86/synth_filter_init.c new file mode 100644 index 0000000000..0649ea20a6 --- /dev/null +++ b/libavcodec/x86/synth_filter_init.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2012-2014 Christophe Gisquet + * + * This file is
part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/synth_filter.h" + +#define SYNTH_FILTER_FUNC(opt) \ +void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \ + const float window[512], \ + float out[32], intptr_t offset, float scale); \ +static void synth_filter_##opt(FFTContext *imdct, \ + float *synth_buf_ptr, int *synth_buf_offset, \ + float synth_buf2[32], const float window[512], \ + float out[32], const float in[32], float scale) \ +{ \ + float *synth_buf= synth_buf_ptr + *synth_buf_offset; \ + \ + imdct->imdct_half(imdct, synth_buf, in); \ + \ + ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \ + out, *synth_buf_offset, scale); \ + \ + *synth_buf_offset = (*synth_buf_offset - 32) & 511; \ +} \ + +#if HAVE_YASM +#if ARCH_X86_32 +SYNTH_FILTER_FUNC(sse) +#endif +SYNTH_FILTER_FUNC(sse2) +SYNTH_FILTER_FUNC(avx) +SYNTH_FILTER_FUNC(fma3) +#endif /* HAVE_YASM */ + +av_cold void ff_synth_filter_init_x86(SynthFilterContext *s) +{ +#if HAVE_YASM + int cpu_flags = av_get_cpu_flags(); + +#if ARCH_X86_32 + if (EXTERNAL_SSE(cpu_flags)) { + s->synth_filter_float = synth_filter_sse; + } +#endif + if (EXTERNAL_SSE2(cpu_flags)) { + 
s->synth_filter_float = synth_filter_sse2; + } + if (EXTERNAL_AVX_FAST(cpu_flags)) { + s->synth_filter_float = synth_filter_avx; + } + if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) { + s->synth_filter_float = synth_filter_fma3; + } +#endif /* HAVE_YASM */ +}