x86: use new schema for ASM macros
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
65212e3ed9
commit
2fd5e70869
@ -27,15 +27,15 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
|
|||||||
int has_vectors = av_get_cpu_flags();
|
int has_vectors = av_get_cpu_flags();
|
||||||
if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
|
if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
|
||||||
/* 3DNow! for K6-2/3 */
|
/* 3DNow! for K6-2/3 */
|
||||||
s->imdct_calc = ff_imdct_calc_3dn;
|
s->imdct_calc = ff_imdct_calc_3dnow;
|
||||||
s->imdct_half = ff_imdct_half_3dn;
|
s->imdct_half = ff_imdct_half_3dnow;
|
||||||
s->fft_calc = ff_fft_calc_3dn;
|
s->fft_calc = ff_fft_calc_3dnow;
|
||||||
}
|
}
|
||||||
if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) {
|
if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) {
|
||||||
/* 3DNowEx for K7 */
|
/* 3DNowEx for K7 */
|
||||||
s->imdct_calc = ff_imdct_calc_3dn2;
|
s->imdct_calc = ff_imdct_calc_3dnow2;
|
||||||
s->imdct_half = ff_imdct_half_3dn2;
|
s->imdct_half = ff_imdct_half_3dnow2;
|
||||||
s->fft_calc = ff_fft_calc_3dn2;
|
s->fft_calc = ff_fft_calc_3dnow2;
|
||||||
}
|
}
|
||||||
if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
|
if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
|
||||||
/* SSE for P3/P4/K8 */
|
/* SSE for P3/P4/K8 */
|
||||||
|
@ -24,13 +24,13 @@
|
|||||||
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
|
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
|
||||||
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
|
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
|
||||||
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
|
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
|
||||||
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
|
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
|
||||||
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
|
void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z);
|
||||||
|
|
||||||
void ff_imdct_calc_3dn(FFTContext *s, FFTSample *output, const FFTSample *input);
|
void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||||
void ff_imdct_half_3dn(FFTContext *s, FFTSample *output, const FFTSample *input);
|
void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||||
void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
|
void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||||
void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
|
void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||||
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
|
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||||
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
|
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||||
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
|
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||||
|
@ -30,30 +30,30 @@ DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 };
|
|||||||
"movq "#s","#d"\n"\
|
"movq "#s","#d"\n"\
|
||||||
"psrlq $32,"#d"\n"\
|
"psrlq $32,"#d"\n"\
|
||||||
"punpckldq "#s","#d"\n"
|
"punpckldq "#s","#d"\n"
|
||||||
#define ff_fft_calc_3dn2 ff_fft_calc_3dn
|
#define ff_fft_calc_3dnow2 ff_fft_calc_3dnow
|
||||||
#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
|
#define ff_fft_dispatch_3dnow2 ff_fft_dispatch_3dnow
|
||||||
#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
|
#define ff_fft_dispatch_interleave_3dnow2 ff_fft_dispatch_interleave_3dnow
|
||||||
#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
|
#define ff_imdct_calc_3dnow2 ff_imdct_calc_3dnow
|
||||||
#define ff_imdct_half_3dn2 ff_imdct_half_3dn
|
#define ff_imdct_half_3dnow2 ff_imdct_half_3dnow
|
||||||
#else
|
#else
|
||||||
#define PSWAPD(s,d) "pswapd "#s","#d"\n"
|
#define PSWAPD(s,d) "pswapd "#s","#d"\n"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
|
void ff_fft_dispatch_3dnow2(FFTComplex *z, int nbits);
|
||||||
void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits);
|
void ff_fft_dispatch_interleave_3dnow2(FFTComplex *z, int nbits);
|
||||||
|
|
||||||
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
|
void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z)
|
||||||
{
|
{
|
||||||
int n = 1<<s->nbits;
|
int n = 1<<s->nbits;
|
||||||
int i;
|
int i;
|
||||||
ff_fft_dispatch_interleave_3dn2(z, s->nbits);
|
ff_fft_dispatch_interleave_3dnow2(z, s->nbits);
|
||||||
__asm__ volatile("femms");
|
__asm__ volatile("femms");
|
||||||
if(n <= 8)
|
if(n <= 8)
|
||||||
for(i=0; i<n; i+=2)
|
for(i=0; i<n; i+=2)
|
||||||
FFSWAP(FFTSample, z[i].im, z[i+1].re);
|
FFSWAP(FFTSample, z[i].im, z[i+1].re);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input)
|
void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||||
{
|
{
|
||||||
x86_reg j, k;
|
x86_reg j, k;
|
||||||
long n = s->mdct_size;
|
long n = s->mdct_size;
|
||||||
@ -101,7 +101,7 @@ void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
ff_fft_dispatch_3dn2(z, s->nbits);
|
ff_fft_dispatch_3dnow2(z, s->nbits);
|
||||||
|
|
||||||
#define CMUL(j,mm0,mm1)\
|
#define CMUL(j,mm0,mm1)\
|
||||||
"movq (%2,"#j",2), %%mm6 \n"\
|
"movq (%2,"#j",2), %%mm6 \n"\
|
||||||
@ -144,13 +144,13 @@ void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
|
|||||||
__asm__ volatile("femms");
|
__asm__ volatile("femms");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input)
|
void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||||
{
|
{
|
||||||
x86_reg j, k;
|
x86_reg j, k;
|
||||||
long n = s->mdct_size;
|
long n = s->mdct_size;
|
||||||
long n4 = n >> 2;
|
long n4 = n >> 2;
|
||||||
|
|
||||||
ff_imdct_half_3dn2(s, output+n4, input);
|
ff_imdct_half_3dnow2(s, output+n4, input);
|
||||||
|
|
||||||
j = -n;
|
j = -n;
|
||||||
k = n-8;
|
k = n-8;
|
||||||
|
@ -297,7 +297,7 @@ IF%1 mova Z(1), m5
|
|||||||
%define Z2(x) [r0+mmsize*x]
|
%define Z2(x) [r0+mmsize*x]
|
||||||
%define ZH(x) [r0+mmsize*x+mmsize/2]
|
%define ZH(x) [r0+mmsize*x+mmsize/2]
|
||||||
|
|
||||||
INIT_YMM
|
INIT_YMM avx
|
||||||
|
|
||||||
%if HAVE_AVX
|
%if HAVE_AVX
|
||||||
align 16
|
align 16
|
||||||
@ -391,7 +391,7 @@ fft32_interleave_avx:
|
|||||||
|
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
INIT_XMM
|
INIT_XMM sse
|
||||||
%define movdqa movaps
|
%define movdqa movaps
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
@ -440,11 +440,9 @@ fft16_sse:
|
|||||||
ret
|
ret
|
||||||
|
|
||||||
|
|
||||||
INIT_MMX
|
%macro FFT48_3DN 0
|
||||||
|
|
||||||
%macro FFT48_3DN 1
|
|
||||||
align 16
|
align 16
|
||||||
fft4%1:
|
fft4_ %+ cpuname:
|
||||||
T2_3DN m0, m1, Z(0), Z(1)
|
T2_3DN m0, m1, Z(0), Z(1)
|
||||||
mova m2, Z(2)
|
mova m2, Z(2)
|
||||||
mova m3, Z(3)
|
mova m3, Z(3)
|
||||||
@ -458,7 +456,7 @@ fft4%1:
|
|||||||
ret
|
ret
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
fft8%1:
|
fft8_ %+ cpuname:
|
||||||
T2_3DN m0, m1, Z(0), Z(1)
|
T2_3DN m0, m1, Z(0), Z(1)
|
||||||
mova m2, Z(2)
|
mova m2, Z(2)
|
||||||
mova m3, Z(3)
|
mova m3, Z(3)
|
||||||
@ -496,7 +494,8 @@ fft8%1:
|
|||||||
ret
|
ret
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
FFT48_3DN _3dn2
|
INIT_MMX 3dnow2
|
||||||
|
FFT48_3DN
|
||||||
|
|
||||||
%macro pswapd 2
|
%macro pswapd 2
|
||||||
%ifidn %1, %2
|
%ifidn %1, %2
|
||||||
@ -509,7 +508,8 @@ FFT48_3DN _3dn2
|
|||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
FFT48_3DN _3dn
|
INIT_MMX 3dnow
|
||||||
|
FFT48_3DN
|
||||||
|
|
||||||
|
|
||||||
%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
|
%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
|
||||||
@ -533,7 +533,7 @@ DEFINE_ARGS z, w, n, o1, o3
|
|||||||
rep ret
|
rep ret
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
INIT_YMM
|
INIT_YMM avx
|
||||||
|
|
||||||
%if HAVE_AVX
|
%if HAVE_AVX
|
||||||
%macro INTERL_AVX 5
|
%macro INTERL_AVX 5
|
||||||
@ -551,7 +551,7 @@ DECL_PASS pass_avx, PASS_BIG 1
|
|||||||
DECL_PASS pass_interleave_avx, PASS_BIG 0
|
DECL_PASS pass_interleave_avx, PASS_BIG 0
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
INIT_XMM
|
INIT_XMM sse
|
||||||
|
|
||||||
%macro INTERL_SSE 5
|
%macro INTERL_SSE 5
|
||||||
mova %3, %2
|
mova %3, %2
|
||||||
@ -566,16 +566,16 @@ INIT_XMM
|
|||||||
DECL_PASS pass_sse, PASS_BIG 1
|
DECL_PASS pass_sse, PASS_BIG 1
|
||||||
DECL_PASS pass_interleave_sse, PASS_BIG 0
|
DECL_PASS pass_interleave_sse, PASS_BIG 0
|
||||||
|
|
||||||
INIT_MMX
|
INIT_MMX 3dnow
|
||||||
%define mulps pfmul
|
%define mulps pfmul
|
||||||
%define addps pfadd
|
%define addps pfadd
|
||||||
%define subps pfsub
|
%define subps pfsub
|
||||||
%define unpcklps punpckldq
|
%define unpcklps punpckldq
|
||||||
%define unpckhps punpckhdq
|
%define unpckhps punpckhdq
|
||||||
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
|
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
|
||||||
DECL_PASS pass_interleave_3dn, PASS_BIG 0
|
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
|
||||||
%define pass_3dn2 pass_3dn
|
%define pass_3dnow2 pass_3dnow
|
||||||
%define pass_interleave_3dn2 pass_interleave_3dn
|
%define pass_interleave_3dnow2 pass_interleave_3dnow
|
||||||
|
|
||||||
%ifdef PIC
|
%ifdef PIC
|
||||||
%define SECTION_REL - $$
|
%define SECTION_REL - $$
|
||||||
@ -593,67 +593,70 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0
|
|||||||
call r2
|
call r2
|
||||||
%endmacro ; FFT_DISPATCH
|
%endmacro ; FFT_DISPATCH
|
||||||
|
|
||||||
%macro DECL_FFT 2-3 ; nbits, cpu, suffix
|
%macro DECL_FFT 1-2 ; nbits, suffix (CPU name is taken from cpuname set by INIT_MMX/INIT_XMM/INIT_YMM)
|
||||||
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
|
%xdefine cpusuffix _ %+ cpuname
|
||||||
|
%xdefine fullsuffix %2_ %+ cpuname
|
||||||
|
%xdefine list_of_fft fft4 %+ cpusuffix SECTION_REL, fft8 %+ cpusuffix SECTION_REL
|
||||||
%if %1>=5
|
%if %1>=5
|
||||||
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
|
%xdefine list_of_fft list_of_fft, fft16 %+ cpusuffix SECTION_REL
|
||||||
%endif
|
%endif
|
||||||
%if %1>=6
|
%if %1>=6
|
||||||
%xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
|
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
%assign n 1<<%1
|
%assign n 1<<%1
|
||||||
%rep 17-%1
|
%rep 17-%1
|
||||||
%assign n2 n/2
|
%assign n2 n/2
|
||||||
%assign n4 n/4
|
%assign n4 n/4
|
||||||
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
|
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
fft %+ n %+ %3%2:
|
fft %+ n %+ fullsuffix:
|
||||||
call fft %+ n2 %+ %2
|
call fft %+ n2 %+ cpusuffix
|
||||||
add r0, n*4 - (n&(-2<<%1))
|
add r0, n*4 - (n&(-2<<%1))
|
||||||
call fft %+ n4 %+ %2
|
call fft %+ n4 %+ cpusuffix
|
||||||
add r0, n*2 - (n2&(-2<<%1))
|
add r0, n*2 - (n2&(-2<<%1))
|
||||||
call fft %+ n4 %+ %2
|
call fft %+ n4 %+ cpusuffix
|
||||||
sub r0, n*6 + (n2&(-2<<%1))
|
sub r0, n*6 + (n2&(-2<<%1))
|
||||||
lea r1, [cos_ %+ n]
|
lea r1, [cos_ %+ n]
|
||||||
mov r2d, n4/2
|
mov r2d, n4/2
|
||||||
jmp pass%3%2
|
jmp pass %+ fullsuffix
|
||||||
|
|
||||||
%assign n n*2
|
%assign n n*2
|
||||||
%endrep
|
%endrep
|
||||||
%undef n
|
%undef n
|
||||||
|
|
||||||
align 8
|
align 8
|
||||||
dispatch_tab%3%2: pointer list_of_fft
|
dispatch_tab %+ fullsuffix: pointer list_of_fft
|
||||||
|
|
||||||
section .text
|
section .text
|
||||||
|
|
||||||
; On x86_32, this function does the register saving and restoring for all of fft.
|
; On x86_32, this function does the register saving and restoring for all of fft.
|
||||||
; The others pass args in registers and don't spill anything.
|
; The others pass args in registers and don't spill anything.
|
||||||
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
|
cglobal fft_dispatch%2, 2,5,8, z, nbits
|
||||||
FFT_DISPATCH %3%2, nbits
|
FFT_DISPATCH fullsuffix, nbits
|
||||||
%ifidn %2, _avx
|
%if mmsize == 32
|
||||||
vzeroupper
|
vzeroupper
|
||||||
%endif
|
%endif
|
||||||
RET
|
RET
|
||||||
%endmacro ; DECL_FFT
|
%endmacro ; DECL_FFT
|
||||||
|
|
||||||
%if HAVE_AVX
|
%if HAVE_AVX
|
||||||
INIT_YMM
|
INIT_YMM avx
|
||||||
DECL_FFT 6, _avx
|
DECL_FFT 6
|
||||||
DECL_FFT 6, _avx, _interleave
|
DECL_FFT 6, _interleave
|
||||||
%endif
|
%endif
|
||||||
INIT_XMM
|
INIT_XMM sse
|
||||||
DECL_FFT 5, _sse
|
DECL_FFT 5
|
||||||
DECL_FFT 5, _sse, _interleave
|
DECL_FFT 5, _interleave
|
||||||
INIT_MMX
|
INIT_MMX 3dnow
|
||||||
DECL_FFT 4, _3dn
|
DECL_FFT 4
|
||||||
DECL_FFT 4, _3dn, _interleave
|
DECL_FFT 4, _interleave
|
||||||
DECL_FFT 4, _3dn2
|
INIT_MMX 3dnow2
|
||||||
DECL_FFT 4, _3dn2, _interleave
|
DECL_FFT 4
|
||||||
|
DECL_FFT 4, _interleave
|
||||||
|
|
||||||
INIT_XMM
|
INIT_XMM sse
|
||||||
%undef mulps
|
%undef mulps
|
||||||
%undef addps
|
%undef addps
|
||||||
%undef subps
|
%undef subps
|
||||||
@ -749,8 +752,8 @@ INIT_XMM
|
|||||||
jl .post
|
jl .post
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro DECL_IMDCT 2
|
%macro DECL_IMDCT 1
|
||||||
cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
|
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
|
||||||
%if ARCH_X86_64
|
%if ARCH_X86_64
|
||||||
%define rrevtab r7
|
%define rrevtab r7
|
||||||
%define rtcos r8
|
%define rtcos r8
|
||||||
@ -822,7 +825,7 @@ cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample
|
|||||||
mov r0, r1
|
mov r0, r1
|
||||||
mov r1d, [r5+FFTContext.nbits]
|
mov r1d, [r5+FFTContext.nbits]
|
||||||
|
|
||||||
FFT_DISPATCH %1, r1
|
FFT_DISPATCH _ %+ cpuname, r1
|
||||||
|
|
||||||
mov r0d, [r5+FFTContext.mdctsize]
|
mov r0d, [r5+FFTContext.mdctsize]
|
||||||
add r6, r0
|
add r6, r0
|
||||||
@ -836,20 +839,20 @@ cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample
|
|||||||
neg r0
|
neg r0
|
||||||
mov r1, -mmsize
|
mov r1, -mmsize
|
||||||
sub r1, r0
|
sub r1, r0
|
||||||
%2 r0, r1, r6, rtcos, rtsin
|
%1 r0, r1, r6, rtcos, rtsin
|
||||||
%if ARCH_X86_64 == 0
|
%if ARCH_X86_64 == 0
|
||||||
add esp, 12
|
add esp, 12
|
||||||
%endif
|
%endif
|
||||||
%ifidn avx_enabled, 1
|
%if mmsize == 32
|
||||||
vzeroupper
|
vzeroupper
|
||||||
%endif
|
%endif
|
||||||
RET
|
RET
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
DECL_IMDCT _sse, POSROTATESHUF
|
DECL_IMDCT POSROTATESHUF
|
||||||
|
|
||||||
INIT_YMM
|
INIT_YMM avx
|
||||||
|
|
||||||
%if HAVE_AVX
|
%if HAVE_AVX
|
||||||
DECL_IMDCT _avx, POSROTATESHUF_AVX
|
DECL_IMDCT POSROTATESHUF_AVX
|
||||||
%endif
|
%endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user