x86/float_dsp: add ff_vector_{fmul_add, fmac_scalar}_fma3
~7% faster than AVX.

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
12ce58bebd
commit
7d7487e85c
@ -80,10 +80,17 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
|
|||||||
.loop:
|
.loop:
|
||||||
%assign a 0
|
%assign a 0
|
||||||
%rep 32/mmsize
|
%rep 32/mmsize
|
||||||
|
%if cpuflag(fma3)
|
||||||
|
mova m1, [dstq+lenq+(a+0)*mmsize]
|
||||||
|
mova m2, [dstq+lenq+(a+1)*mmsize]
|
||||||
|
fmaddps m1, m0, [srcq+lenq+(a+0)*mmsize], m1
|
||||||
|
fmaddps m2, m0, [srcq+lenq+(a+1)*mmsize], m2
|
||||||
|
%else
|
||||||
mulps m1, m0, [srcq+lenq+(a+0)*mmsize]
|
mulps m1, m0, [srcq+lenq+(a+0)*mmsize]
|
||||||
mulps m2, m0, [srcq+lenq+(a+1)*mmsize]
|
mulps m2, m0, [srcq+lenq+(a+1)*mmsize]
|
||||||
addps m1, m1, [dstq+lenq+(a+0)*mmsize]
|
addps m1, m1, [dstq+lenq+(a+0)*mmsize]
|
||||||
addps m2, m2, [dstq+lenq+(a+1)*mmsize]
|
addps m2, m2, [dstq+lenq+(a+1)*mmsize]
|
||||||
|
%endif
|
||||||
mova [dstq+lenq+(a+0)*mmsize], m1
|
mova [dstq+lenq+(a+0)*mmsize], m1
|
||||||
mova [dstq+lenq+(a+1)*mmsize], m2
|
mova [dstq+lenq+(a+1)*mmsize], m2
|
||||||
%assign a a+2
|
%assign a a+2
|
||||||
@ -99,6 +106,10 @@ VECTOR_FMAC_SCALAR
|
|||||||
INIT_YMM avx
|
INIT_YMM avx
|
||||||
VECTOR_FMAC_SCALAR
|
VECTOR_FMAC_SCALAR
|
||||||
%endif
|
%endif
|
||||||
|
%if HAVE_FMA3_EXTERNAL
|
||||||
|
INIT_YMM fma3
|
||||||
|
VECTOR_FMAC_SCALAR
|
||||||
|
%endif
|
||||||
|
|
||||||
;------------------------------------------------------------------------------
|
;------------------------------------------------------------------------------
|
||||||
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
|
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
|
||||||
@ -182,16 +193,23 @@ VECTOR_DMUL_SCALAR
|
|||||||
; const float *src2, int len)
|
; const float *src2, int len)
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
%macro VECTOR_FMUL_ADD 0
|
%macro VECTOR_FMUL_ADD 0
|
||||||
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
|
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
|
||||||
lea lenq, [lend*4 - 2*mmsize]
|
lea lenq, [lend*4 - 2*mmsize]
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
.loop:
|
.loop:
|
||||||
mova m0, [src0q + lenq]
|
mova m0, [src0q + lenq]
|
||||||
mova m1, [src0q + lenq + mmsize]
|
mova m1, [src0q + lenq + mmsize]
|
||||||
|
%if cpuflag(fma3)
|
||||||
|
mova m2, [src2q + lenq]
|
||||||
|
mova m3, [src2q + lenq + mmsize]
|
||||||
|
fmaddps m0, m0, [src1q + lenq], m2
|
||||||
|
fmaddps m1, m1, [src1q + lenq + mmsize], m3
|
||||||
|
%else
|
||||||
mulps m0, m0, [src1q + lenq]
|
mulps m0, m0, [src1q + lenq]
|
||||||
mulps m1, m1, [src1q + lenq + mmsize]
|
mulps m1, m1, [src1q + lenq + mmsize]
|
||||||
addps m0, m0, [src2q + lenq]
|
addps m0, m0, [src2q + lenq]
|
||||||
addps m1, m1, [src2q + lenq + mmsize]
|
addps m1, m1, [src2q + lenq + mmsize]
|
||||||
|
%endif
|
||||||
mova [dstq + lenq], m0
|
mova [dstq + lenq], m0
|
||||||
mova [dstq + lenq + mmsize], m1
|
mova [dstq + lenq + mmsize], m1
|
||||||
|
|
||||||
@ -206,6 +224,10 @@ VECTOR_FMUL_ADD
|
|||||||
INIT_YMM avx
|
INIT_YMM avx
|
||||||
VECTOR_FMUL_ADD
|
VECTOR_FMUL_ADD
|
||||||
%endif
|
%endif
|
||||||
|
%if HAVE_FMA3_EXTERNAL
|
||||||
|
INIT_YMM fma3
|
||||||
|
VECTOR_FMUL_ADD
|
||||||
|
%endif
|
||||||
|
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
|
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
|
||||||
|
@ -33,6 +33,8 @@ void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
|
|||||||
int len);
|
int len);
|
||||||
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
|
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
|
||||||
int len);
|
int len);
|
||||||
|
void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul,
|
||||||
|
int len);
|
||||||
|
|
||||||
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
|
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
|
||||||
int len);
|
int len);
|
||||||
@ -46,6 +48,8 @@ void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
|
|||||||
const float *src2, int len);
|
const float *src2, int len);
|
||||||
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
|
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
|
||||||
const float *src2, int len);
|
const float *src2, int len);
|
||||||
|
void ff_vector_fmul_add_fma3(float *dst, const float *src0, const float *src1,
|
||||||
|
const float *src2, int len);
|
||||||
|
|
||||||
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
|
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
|
||||||
const float *src1, int len);
|
const float *src1, int len);
|
||||||
@ -153,4 +157,8 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
|
|||||||
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
|
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
|
||||||
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
|
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
|
||||||
}
|
}
|
||||||
|
if (EXTERNAL_FMA3(cpu_flags)) {
|
||||||
|
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
|
||||||
|
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user