avfilter/avf_showcqt: cqt_calc optimization on x86
on x86_64:
        time    PSNR
plain   3.303   inf
SSE     1.649   107.087535
SSE3    1.632   107.087535
AVX     1.409   106.986771
FMA3    1.265   107.108437

on x86_32 (PSNR compared to x86_64 plain):
        time    PSNR
plain   7.225   103.951979
SSE     1.827   105.859282
SSE3    1.819   105.859282
AVX     1.533   105.997661
FMA3    1.384   105.885377

FMA4 test is not available

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>
This commit is contained in:
parent
49b0246635
commit
1e69ac9246
@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
|
||||
w *= sign * (1.0 / s->fft_len);
|
||||
s->coeffs[m].val[x - s->coeffs[m].start] = w;
|
||||
}
|
||||
|
||||
if (s->permute_coeffs)
|
||||
s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
|
||||
}
|
||||
|
||||
av_expr_free(expr);
|
||||
@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink)
|
||||
|
||||
s->cqt_align = 1;
|
||||
s->cqt_calc = cqt_calc;
|
||||
s->permute_coeffs = NULL;
|
||||
s->draw_sono = draw_sono;
|
||||
if (s->format == AV_PIX_FMT_RGB24) {
|
||||
s->draw_bar = draw_bar_rgb;
|
||||
@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink)
|
||||
s->update_sono = update_sono_yuv;
|
||||
}
|
||||
|
||||
if (ARCH_X86)
|
||||
ff_showcqt_init_x86(s);
|
||||
|
||||
if ((ret = init_cqt(s)) < 0)
|
||||
return ret;
|
||||
|
||||
|
@ -74,6 +74,7 @@ typedef struct {
|
||||
/* callback */
|
||||
void (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
|
||||
int len, int fft_len);
|
||||
void (*permute_coeffs)(float *v, int len);
|
||||
void (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h,
|
||||
const ColorFloat *c, int bar_h);
|
||||
void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
|
||||
@ -112,4 +113,6 @@ typedef struct {
|
||||
int axis;
|
||||
} ShowCQTContext;
|
||||
|
||||
void ff_showcqt_init_x86(ShowCQTContext *s);
|
||||
|
||||
#endif
|
||||
|
@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
|
||||
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
|
||||
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
|
||||
OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o
|
||||
OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt_init.o
|
||||
OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
|
||||
OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
|
||||
OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o
|
||||
@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
|
||||
ifdef CONFIG_GPL
|
||||
YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o
|
||||
endif
|
||||
YASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o
|
||||
YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o
|
||||
YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o
|
||||
YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o
|
||||
|
206
libavfilter/x86/avf_showcqt.asm
Normal file
206
libavfilter/x86/avf_showcqt.asm
Normal file
@ -0,0 +1,206 @@
|
||||
;*****************************************************************************
|
||||
;* x86-optimized functions for showcqt filter
|
||||
;*
|
||||
;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
; "pointer" reserves one pointer-sized slot: 8 bytes (resq) on x86_64,
; 4 bytes (resd) on x86_32.
%if ARCH_X86_64
|
||||
%define pointer resq
|
||||
%else
|
||||
%define pointer resd
|
||||
%endif
|
||||
|
||||
; Assembly-side field offsets of one Coeffs entry.
; NOTE(review): this has to mirror the C "Coeffs" struct, which is not visible
; here - presumably { float *val; int start, len; }; confirm against
; avf_showcqt.h whenever either side changes.
struc Coeffs
|
||||
; .val: address of the coefficient array (read with aligned loads by the kernels)
.val: pointer 1
|
||||
; .start: added to the running index x to form the index into src
.start: resd 1
|
||||
; .len: number of coefficients, used as the inner-loop bound
.len: resd 1
|
||||
; .sizeof: size of one entry, used as the byte stride between consecutive entries
.sizeof:
|
||||
endstruc
|
||||
|
||||
; EMULATE_HADDPS dst, src, tmp
; Horizontal pairwise add: dst = { dst0+dst1, dst2+dst3, src0+src1, src2+src3 }
; (per 128-bit lane). Uses haddps when SSE3 is available; otherwise the result
; is built from two shuffles and one add, clobbering tmp.
; tmp is not touched on the SSE3 path.
; Caveat for the emulated path: if dst and src are the same register, the
; second shufps reads the already shuffled value, so only the two low
; elements of the result are correct.
%macro EMULATE_HADDPS 3 ; dst, src, tmp
|
||||
%if cpuflag(sse3)
|
||||
haddps %1, %2
|
||||
%else
|
||||
; keep a copy of dst, it is needed for the odd elements
movaps %3, %1
|
||||
; dst = even elements: { dst0, dst2, src0, src2 }
shufps %1, %2, q2020
|
||||
; tmp = odd elements:  { dst1, dst3, src1, src3 }
shufps %3, %2, q3131
|
||||
; even + odd = pairwise sums
addps %1, %3
|
||||
%endif
|
||||
%endmacro ; EMULATE_HADDPS
|
||||
|
||||
; EMULATE_FMADDPS dst, src1, src2, src3, tmp
; dst = src1 * src2 + src3.
; With FMA3 or FMA4 this is a single fmaddps; otherwise it is a mulps into tmp
; followed by an addps, so tmp is clobbered (tmp is unused on the FMA path).
; CQT_CALC passes tmp == src1, i.e. src1 is overwritten on the non-FMA path.
; NOTE(review): the three-operand mulps/addps forms are presumably translated
; by the x86inc wrappers on non-AVX targets - confirm in x86inc.asm.
%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
|
||||
%if cpuflag(fma3) || cpuflag(fma4)
|
||||
fmaddps %1, %2, %3, %4
|
||||
%else
|
||||
mulps %5, %2, %3
|
||||
addps %1, %4, %5
|
||||
%endif
|
||||
%endmacro ; EMULATE_FMADDPS
|
||||
|
||||
; CQT_CALC: one vector step (mmsize/4 coefficients) of the accumulation.
; With i = x + coeffs.start it adds, for each coefficient c[j] held in m%8:
;   a += c[j] * src[i + j]             (forward read, aligned loads)
;   b += c[j] * src[fft_len - i - j]   (mirrored read, unaligned loads)
; where a and b are kept as separate real/imaginary accumulators.
; Reads x, src, coeffs, fft_len; clobbers i, m%5, m%6 and m%7.
%macro CQT_CALC 9
|
||||
; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
|
||||
; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
|
||||
; i = x + coeffs[%9].start
mov id, xd
|
||||
add id, [coeffsq + Coeffs.start + %9]
|
||||
; src elements are 8 bytes each (re, im); load 2*mmsize bytes starting at src[i].
; movaps is an aligned load, so src + 8*i has to be mmsize-aligned.
movaps m%5, [srcq + 8 * iq]
|
||||
movaps m%7, [srcq + 8 * iq + mmsize]
|
||||
; de-interleave: m%6 = imaginary parts, m%5 = real parts.
; With ymm registers shufps works per 128-bit lane, so the element order
; becomes 0 1 4 5 2 3 6 7; the coefficients are permuted to the same order
; by the C init code (permute_coeffs_01452367).
shufps m%6, m%5, m%7, q3131
|
||||
shufps m%5, m%5, m%7, q2020
|
||||
; sub/neg turn i into fft_len - i; they are interleaved with the
; multiply-adds and do not affect the vector registers.
sub id, fft_lend
|
||||
; a_im += im * coeff
EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
|
||||
neg id
|
||||
; a_re += re * coeff
EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
|
||||
; load the 2*mmsize bytes that END with element src[i] (i is now fft_len - i);
; these addresses are not mmsize-aligned in general, hence movups.
movups m%5, [srcq + 8 * iq - mmsize + 8]
|
||||
movups m%7, [srcq + 8 * iq - 2*mmsize + 8]
|
||||
%if mmsize == 32
|
||||
; ymm only: swap the two 128-bit halves as part of reversing the element order
vperm2f128 m%5, m%5, m%5, 1
|
||||
vperm2f128 m%7, m%7, m%7, 1
|
||||
%endif
|
||||
; de-interleave in reverse order, so lane j holds src[i - j] (within each
; 128-bit lane): m%6 = imaginary parts, m%5 = real parts
shufps m%6, m%5, m%7, q1313
|
||||
shufps m%5, m%5, m%7, q0202
|
||||
; b_im += im * coeff
EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
|
||||
; b_re += re * coeff
EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
|
||||
%endmacro ; CQT_CALC
|
||||
|
||||
; CQT_SEPARATE: combine the a/b accumulators and reduce them horizontally.
; On exit xmm%1 holds the four sums
;   { sum(a_re + b_re), sum(a_im - b_im), sum(a_im + b_im), sum(b_re - a_re) }
; All six registers are clobbered; only xmm%1 carries a result.
%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
|
||||
; tmp  = b_im + a_im
addps m%5, m%4, m%2
|
||||
; tmp2 = b_re - a_re
subps m%6, m%3, m%1
|
||||
; a_re = a_re + b_re
addps m%1, m%3
|
||||
; a_im = a_im - b_im
subps m%2, m%4
|
||||
; b_re (%3) and, in the last step, a_im (%2) are no longer needed and are
; reused as scratch registers for the horizontal adds
EMULATE_HADDPS m%5, m%6, m%3
|
||||
EMULATE_HADDPS m%1, m%2, m%3
|
||||
EMULATE_HADDPS m%1, m%5, m%2
|
||||
%if mmsize == 32
|
||||
; ymm only: the horizontal adds work per 128-bit lane, so fold the upper
; lane into the lower one
vextractf128 xmm%2, m%1, 1
|
||||
addps xmm%1, xmm%2
|
||||
%endif
|
||||
%endmacro ; CQT_SEPARATE
|
||||
|
||||
%macro DECLARE_CQT_CALC 0
; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
;
; For every coeffs[k] (k = 0 .. len-1) the kernel accumulates the coefficient
; weighted src values (see CQT_CALC), separates them (see CQT_SEPARATE) and
; stores two squared magnitudes into the 8-byte element dst[k].
;
; x86_64: two coeffs entries are processed per outer iteration. "len" is
;         decremented by 2 and the loop only ends when it reaches exactly
;         zero, so len has to be a positive even number; dst is written with
;         movaps and has to be 16-byte aligned.
; x86_32: one entry per outer iteration (only 8 vector registers are used);
;         fft_len is read from its stack slot (r4m).
; Coefficient arrays are read with movaps, mmsize/4 floats at a time.
%if ARCH_X86_64
cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
    align   16
    .loop_k:
        ; m0-m3 accumulate entry k, m8-m11 accumulate entry k+1; the integer
        ; set-up is interleaved with clearing the accumulators
        mov     xd, [coeffsq + Coeffs.len]
        xorps   m0, m0
        movaps  m1, m0
        movaps  m2, m0
        mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
        movaps  m3, m0
        movaps  m8, m0
        cmp     coeffs_lend, xd
        movaps  m9, m0
        movaps  m10, m0
        movaps  m11, m0
        ; coeffs_len = min(len of entry k, len of entry k+1), unsigned
        cmova   coeffs_lend, xd
        ; Load both coefficient pointers before the zero-length test below:
        ; .loop_b and .loop_a are reachable through that jz and dereference
        ; them, so they must never be left uninitialised or stale from the
        ; previous pair. mov does not modify the flags.
        mov     coeffs_valq, [coeffsq + Coeffs.val]
        mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
        xor     xd, xd
        test    coeffs_lend, coeffs_lend
        jz      .check_loop_b
        align   16
        ; common part: both entries still have coefficients left
        .loop_ab:
            movaps  m7, [coeffs_valq + 4 * xq]
            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
            movaps  m7, [coeffs_val2q + 4 * xq]
            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
            add     xd, mmsize/4
            cmp     xd, coeffs_lend
            jb      .loop_ab
        ; remainder of entry k+1, if it is the longer one
        .check_loop_b:
            cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
            jae     .check_loop_a
        align   16
        .loop_b:
            movaps  m7, [coeffs_val2q + 4 * xq]
            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
            add     xd, mmsize/4
            cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
            jb      .loop_b
        .loop_end:
            CQT_SEPARATE 0, 1, 2, 3, 4, 5
            CQT_SEPARATE 8, 9, 10, 11, 4, 5
            ; square the four sums of each entry, then add them pairwise:
            ; xmm0 = { two results for entry k, two results for entry k+1 }
            mulps   xmm0, xmm0
            mulps   xmm8, xmm8
            EMULATE_HADDPS xmm0, xmm8, xmm1
            movaps  [dstq], xmm0
            sub     lend, 2
            ; lea leaves the flags of the sub above intact for the jnz
            lea     dstq, [dstq + 16]
            lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
            jnz     .loop_k
            REP_RET
            align   16
        ; remainder of entry k, if it is the longer one
        .check_loop_a:
            cmp     xd, [coeffsq + Coeffs.len]
            jae     .loop_end
        align   16
        .loop_a:
            movaps  m7, [coeffs_valq + 4 * xq]
            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
            add     xd, mmsize/4
            cmp     xd, [coeffsq + Coeffs.len]
            jb      .loop_a
            jmp     .loop_end
%else
cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
%define fft_lend r4m
    align   16
    .loop_k:
        mov     xd, [coeffsq + Coeffs.len]
        xorps   m0, m0
        movaps  m1, m0
        movaps  m2, m0
        movaps  m3, m0
        ; an entry without coefficients stores the zero still held in m0
        test    xd, xd
        jz      .store
        mov     coeffs_valq, [coeffsq + Coeffs.val]
        xor     xd, xd
        align   16
        .loop_x:
            movaps  m7, [coeffs_valq + 4 * xq]
            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
            add     xd, mmsize/4
            cmp     xd, [coeffsq + Coeffs.len]
            jb      .loop_x
            CQT_SEPARATE 0, 1, 2, 3, 4, 5
            mulps   xmm0, xmm0
            ; dst == src here: only the two low elements of the result are
            ; meaningful, and only those are stored below
            EMULATE_HADDPS xmm0, xmm0, xmm1
        .store:
            movlps  [dstq], xmm0
            sub     lend, 1
            ; lea leaves the flags of the sub above intact for the jnz
            lea     dstq, [dstq + 8]
            lea     coeffsq, [coeffsq + Coeffs.sizeof]
            jnz     .loop_k
            REP_RET
%endif ; ARCH_X86_64
%endmacro ; DECLARE_CQT_CALC
|
||||
|
||||
; Instantiate the kernel once per instruction set. The names declared on the C
; side are ff_showcqt_cqt_calc_{sse,sse3,avx,fma3,fma4}.
; sse, sse3 and fma4 are built with 128-bit xmm registers (4 floats per step),
; avx and fma3 with 256-bit ymm registers (8 floats per step).
INIT_XMM sse
|
||||
DECLARE_CQT_CALC
|
||||
INIT_XMM sse3
|
||||
DECLARE_CQT_CALC
|
||||
INIT_YMM avx
|
||||
DECLARE_CQT_CALC
|
||||
INIT_YMM fma3
|
||||
DECLARE_CQT_CALC
|
||||
INIT_XMM fma4
|
||||
DECLARE_CQT_CALC
|
63
libavfilter/x86/avf_showcqt_init.c
Normal file
63
libavfilter/x86/avf_showcqt_init.c
Normal file
@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Muhammad Faiz <mfcc64@gmail.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavfilter/avf_showcqt.h"
|
||||
|
||||
/*
 * Prototype of one external cqt_calc kernel, ff_showcqt_cqt_calc_<type>.
 * The parameter list is the same as that of the cqt_calc callback in
 * ShowCQTContext, so the function can be assigned to it directly.
 */
#define DECLARE_CQT_CALC(type) \
|
||||
void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
|
||||
const Coeffs *coeffs, int len, int fft_len)
|
||||
|
||||
/* One prototype per instruction-set specific kernel. */
DECLARE_CQT_CALC(sse);
|
||||
DECLARE_CQT_CALC(sse3);
|
||||
DECLARE_CQT_CALC(avx);
|
||||
DECLARE_CQT_CALC(fma3);
|
||||
DECLARE_CQT_CALC(fma4);
|
||||
|
||||
#define permute_coeffs_0 NULL
|
||||
|
||||
/**
 * Reorder coefficients in place for the kernels that work on 8 floats at once.
 *
 * Inside every complete group of 8 floats, elements 2,3 are exchanged with
 * elements 4,5, turning the order 0 1 2 3 4 5 6 7 into 0 1 4 5 2 3 6 7.
 *
 * @param v   coefficient array, modified in place
 * @param len number of floats in v. It is expected to be a multiple of 8 (the
 *            selectors that install this callback also set cqt_align to 8);
 *            a trailing partial group is left untouched rather than being
 *            accessed beyond the end of the array.
 */
static void permute_coeffs_01452367(float *v, int len)
{
    int k;

    /* "len - k >= 8" only ever visits complete groups and cannot overflow */
    for (k = 0; len - k >= 8; k += 8) {
        float *group   = v + k;
        const float e2 = group[2];
        const float e3 = group[3];

        group[2] = group[4];
        group[3] = group[5];
        group[4] = e2;
        group[5] = e3;
    }
}
|
||||
|
||||
/**
 * Install the x86 SIMD version of the cqt_calc callback, if one is usable.
 *
 * Every candidate whose CPU extension is reported by av_get_cpu_flags()
 * overwrites cqt_calc, cqt_align and permute_coeffs in the context. The
 * candidates are tried one after another, so the last matching one wins.
 * If none matches, the context is left unchanged.
 *
 * @param s filter context whose callbacks are updated
 */
av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
{
    const int cpu_flags = av_get_cpu_flags();

/*
 * type:  suffix of the kernel, ff_showcqt_cqt_calc_<type>
 * TYPE:  suffix of the EXTERNAL_<TYPE>() availability check
 * align: value stored in cqt_align (4 for the xmm kernels, 8 for the ymm ones)
 * perm:  suffix of the permute_coeffs_<perm> callback (0 means none)
 */
#define SELECT_CQT_CALC(type, TYPE, align, perm)                    \
    do {                                                            \
        if (EXTERNAL_##TYPE(cpu_flags)) {                           \
            s->cqt_calc       = ff_showcqt_cqt_calc_##type;         \
            s->cqt_align      = align;                              \
            s->permute_coeffs = permute_coeffs_##perm;              \
        }                                                           \
    } while (0)

    /* keep this order: a later match replaces an earlier one */
    SELECT_CQT_CALC(sse,  SSE,       4, 0);
    SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0);
    SELECT_CQT_CALC(fma4, FMA4,      4, 0); // using xmm
    SELECT_CQT_CALC(avx,  AVX_FAST,  8, 01452367);
    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);

#undef SELECT_CQT_CALC
}
|
Loading…
x
Reference in New Issue
Block a user