PPC: add _interleave versions of fft{4,8,16}_altivec

This removes the need for a post-swizzle with the small FFTs.

Originally committed as revision 24025 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Måns Rullgård 2010-07-03 18:36:10 +00:00
parent f054aaf731
commit a075902f3d
2 changed files with 52 additions and 23 deletions

View File

@ -38,19 +38,6 @@
extern void *ff_fft_dispatch_altivec[2][15];
#if HAVE_GNU_AS
// Convert from simd order to C order.
//
// Walks the buffer two vectors at a time and replaces each pair (a, b) with
// (vec_mergeh(a, b), vec_mergel(a, b)), i.e. the elements of the two vectors
// interleaved. Only n >> 1 vectors are visited; the caller passes
// 1 << nbits for n.
// NOTE(review): presumably each pair holds the real parts followed by the
// imaginary parts of the same group of complex values -- confirm against the
// assembly that produces the data.
static void swizzle(vec_f *z, int n)
{
    const int nvec = n >> 1;
    int k = 0;

    while (k < nvec) {
        vec_f *pair = z + k;
        const vec_f re = pair[0];
        const vec_f im = pair[1];

        pair[0] = vec_mergeh(re, im);
        pair[1] = vec_mergel(re, im);
        k += 2;
    }
}
static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_swizzle)
{
register vec_f v14 __asm__("v14") = {0,0,0,0};
@ -84,8 +71,6 @@ static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_s
: "lr","ctr","r0","r4","r5","r6","r7","r8","r9","r10","r11",
"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13"
);
if (do_swizzle && s->nbits <= 4)
swizzle((vec_f*)z, 1<<s->nbits);
}
static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)

View File

@ -143,28 +143,53 @@
vaddfp \d0,\s0,\s1
.endm
fft4_altivec:
// zip d0,d1,s0,s1: interleave the words of s0 and s1.
// d0 receives the merge of the high words (vmrghw) and d1 the merge of the
// low words (vmrglw).
// d0 is written before s0/s1 are read for the second merge, so d0 must not
// be the same register as s0 or s1.
.macro zip d0,d1,s0,s1
vmrghw \d0,\s0,\s1
vmrglw \d1,\s0,\s1
.endm
// Emit the 4-point FFT routine. The macro argument is pasted into the label,
// so a blank argument defines fft4_altivec and a non-blank one defines the
// correspondingly suffixed routine.
// With a non-blank argument the two result vectors are zipped together
// before being stored; otherwise they are stored as produced by FFT4.
// NOTE(review): r3 looks like the data pointer and r9 the offset of the
// second vector; both are set up outside this macro -- confirm.
.macro def_fft4 interleave
fft4\interleave\()_altivec:
// Load the two input vectors.
lvx v0, 0,r3
lvx v1,r9,r3
// FFT4 is defined elsewhere; the values stored below come from v2 and v3.
FFT4 v0,v1,v2,v3
.ifnb \interleave
// Interleaved variant: merge v2/v3 into v0/v1 and store those.
zip v0,v1,v2,v3
stvx v0, 0,r3
stvx v1,r9,r3
.else
// Plain variant: store v2/v3 unchanged.
stvx v2, 0,r3
stvx v3,r9,r3
.endif
blr
.endm
fft8_altivec:
// Emit the 8-point FFT routine. The macro argument is pasted into the label,
// so a blank argument defines fft8_altivec and a non-blank one defines the
// correspondingly suffixed routine.
// With a non-blank argument the result vectors are zipped pairwise before
// being stored; otherwise v0-v3 are stored as left by FFT8.
// NOTE(review): r3 looks like the data pointer and r9 the offset of the
// second vector in each 32-byte half; both are set up outside this
// macro -- confirm.
.macro def_fft8 interleave
fft8\interleave\()_altivec:
// r4 addresses the second 32 bytes of the buffer.
addi r4,r3,32
// Load the four input vectors.
lvx v0, 0,r3
lvx v1,r9,r3
lvx v2, 0,r4
lvx v3,r9,r4
// FFT8 is defined elsewhere; the values stored below come from v0-v3.
FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
// Interleaved variant: merge v0/v1 into v4/v5 and v2/v3 into v6/v7,
// then store the merged vectors.
zip v4,v5,v0,v1
zip v6,v7,v2,v3
stvx v4, 0,r3
stvx v5,r9,r3
stvx v6, 0,r4
stvx v7,r9,r4
.else
// Plain variant: store v0-v3 unchanged.
stvx v0, 0,r3
stvx v1,r9,r3
stvx v2, 0,r4
stvx v3,r9,r4
.endif
blr
.endm
fft16_altivec:
.macro def_fft16 interleave
fft16\interleave\()_altivec:
addi r5,r3,64
addi r6,r3,96
addi r4,r3,32
@ -190,17 +215,33 @@ fft16_altivec:
BF v11,v13,v9,v11
BF v0,v4,v0,v10
BF v3,v7,v3,v12
BF v1,v5,v1,v11
BF v2,v6,v2,v13
.ifnb \interleave
zip v8, v9,v0,v1
zip v10,v11,v2,v3
zip v12,v13,v4,v5
zip v14,v15,v6,v7
stvx v8, 0,r3
stvx v9,r9,r3
stvx v10, 0,r4
stvx v11,r9,r4
stvx v12, 0,r5
stvx v13,r9,r5
stvx v14, 0,r6
stvx v15,r9,r6
.else
stvx v0, 0,r3
stvx v4, 0,r5
stvx v3,r9,r4
stvx v7,r9,r6
BF v1,v5,v1,v11
BF v2,v6,v2,v13
stvx v1,r9,r3
stvx v5,r9,r5
stvx v2, 0,r4
stvx v6, 0,r6
.endif
blr
.endm
// void pass(float *z, float *wre, int n)
.macro PASS interleave, suffix
@ -297,6 +338,9 @@ fft\n\suffix\()_altivec:
.macro DECL_FFTS interleave, suffix
.text
def_fft4 \suffix
def_fft8 \suffix
def_fft16 \suffix
PASS \interleave, \suffix
DECL_FFT \suffix, 5, 32, 16, 8
DECL_FFT \suffix, 6, 64, 32, 16
@ -314,9 +358,9 @@ fft\n\suffix\()_altivec:
.rodata
.global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec
EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec:
PTR fft4_altivec
PTR fft8_altivec
PTR fft16_altivec
PTR fft4\suffix\()_altivec
PTR fft8\suffix\()_altivec
PTR fft16\suffix\()_altivec
PTR fft32\suffix\()_altivec
PTR fft64\suffix\()_altivec
PTR fft128\suffix\()_altivec