Move vorbis_inverse_coupling from dsputil to vorbisdspcontext.

Conveniently (together with Justin's earlier patches), this makes our vorbis decoder entirely independent of dsputil.
2013-01-19 22:21:10 -08:00 · 2013-01-19 22:21:10 -08:00 · fef906c77c
commit fef906c77c
parent aeaf268e52
17 changed files with 358 additions and 167 deletions
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@ -388,7 +388,7 @@ OBJS-$(CONFIG_VCR1_DECODER)            += vcr1.o
 OBJS-$(CONFIG_VMDAUDIO_DECODER)        += vmdav.o
 OBJS-$(CONFIG_VMDVIDEO_DECODER)        += vmdav.o
 OBJS-$(CONFIG_VMNC_DECODER)            += vmnc.o
-OBJS-$(CONFIG_VORBIS_DECODER)          += vorbisdec.o vorbis.o \
+OBJS-$(CONFIG_VORBIS_DECODER)          += vorbisdec.o vorbisdsp.o vorbis.o \
                                          vorbis_data.o xiph.o
 OBJS-$(CONFIG_VORBIS_ENCODER)          += vorbisenc.o vorbis.o \
                                          vorbis_data.o
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@ -15,6 +15,7 @@ OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o

 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
+OBJS-$(CONFIG_VORBIS_DECODER)          += arm/vorbisdsp_init_arm.o
 OBJS-$(CONFIG_VP3DSP)                  += arm/vp3dsp_init_arm.o
 OBJS-$(CONFIG_VP5_DECODER)             += arm/vp56dsp_init_arm.o
 OBJS-$(CONFIG_VP6_DECODER)             += arm/vp56dsp_init_arm.o
@ -86,6 +87,8 @@ NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
                                          arm/rv40dsp_neon.o            \
                                          arm/h264cmc_neon.o            \

+NEON-OBJS-$(CONFIG_VORBIS_DECODER)     += arm/vorbisdsp_neon.o
+
 NEON-OBJS-$(CONFIG_VP3DSP)             += arm/vp3dsp_neon.o

 NEON-OBJS-$(CONFIG_VP5_DECODER)        += arm/vp56dsp_neon.o            \
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@ -154,8 +154,6 @@ void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
 void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
                               int32_t max, unsigned int len);

-void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
-
 int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
 int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3, int len, int mul);
@ -307,9 +305,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
    c->vector_clipf               = ff_vector_clipf_neon;
    c->vector_clip_int32          = ff_vector_clip_int32_neon;

-    if (CONFIG_VORBIS_DECODER)
-        c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
-
    c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;

--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@ -19,7 +19,6 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

-#include "config.h"
 #include "libavutil/arm/asm.S"

 function ff_clear_block_neon, export=1
@ -532,69 +531,6 @@ function ff_add_pixels_clamped_neon, export=1
        bx              lr
 endfunc

-#if CONFIG_VORBIS_DECODER
-function ff_vorbis_inverse_coupling_neon, export=1
-        vmov.i32        q10, #1<<31
-        subs            r2,  r2,  #4
-        mov             r3,  r0
-        mov             r12, r1
-        beq             3f
-
-        vld1.32         {d24-d25},[r1,:128]!
-        vld1.32         {d22-d23},[r0,:128]!
-        vcle.s32        q8,  q12, #0
-        vand            q9,  q11, q10
-        veor            q12, q12, q9
-        vand            q2,  q12, q8
-        vbic            q3,  q12, q8
-        vadd.f32        q12, q11, q2
-        vsub.f32        q11, q11, q3
-1:      vld1.32         {d2-d3},  [r1,:128]!
-        vld1.32         {d0-d1},  [r0,:128]!
-        vcle.s32        q8,  q1,  #0
-        vand            q9,  q0,  q10
-        veor            q1,  q1,  q9
-        vst1.32         {d24-d25},[r3, :128]!
-        vst1.32         {d22-d23},[r12,:128]!
-        vand            q2,  q1,  q8
-        vbic            q3,  q1,  q8
-        vadd.f32        q1,  q0,  q2
-        vsub.f32        q0,  q0,  q3
-        subs            r2,  r2,  #8
-        ble             2f
-        vld1.32         {d24-d25},[r1,:128]!
-        vld1.32         {d22-d23},[r0,:128]!
-        vcle.s32        q8,  q12, #0
-        vand            q9,  q11, q10
-        veor            q12, q12, q9
-        vst1.32         {d2-d3},  [r3, :128]!
-        vst1.32         {d0-d1},  [r12,:128]!
-        vand            q2,  q12, q8
-        vbic            q3,  q12, q8
-        vadd.f32        q12, q11, q2
-        vsub.f32        q11, q11, q3
-        b               1b
-
-2:      vst1.32         {d2-d3},  [r3, :128]!
-        vst1.32         {d0-d1},  [r12,:128]!
-        it              lt
-        bxlt            lr
-
-3:      vld1.32         {d2-d3},  [r1,:128]
-        vld1.32         {d0-d1},  [r0,:128]
-        vcle.s32        q8,  q1,  #0
-        vand            q9,  q0,  q10
-        veor            q1,  q1,  q9
-        vand            q2,  q1,  q8
-        vbic            q3,  q1,  q8
-        vadd.f32        q1,  q0,  q2
-        vsub.f32        q0,  q0,  q3
-        vst1.32         {d2-d3},  [r0,:128]!
-        vst1.32         {d0-d1},  [r1,:128]!
-        bx              lr
-endfunc
-#endif
-
 function ff_butterflies_float_neon, export=1
 1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
--- a/libavcodec/arm/vorbisdsp_init_arm.c
+++ b/libavcodec/arm/vorbisdsp_init_arm.c
@ -0,0 +1,36 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/vorbisdsp.h"
+
+void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
+
+void ff_vorbisdsp_init_arm(VorbisDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
+    }
+}
--- a/libavcodec/arm/vorbisdsp_neon.S
+++ b/libavcodec/arm/vorbisdsp_neon.S
@ -0,0 +1,83 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_vorbis_inverse_coupling_neon, export=1
+        vmov.i32        q10, #1<<31
+        subs            r2,  r2,  #4
+        mov             r3,  r0
+        mov             r12, r1
+        beq             3f
+
+        vld1.32         {d24-d25},[r1,:128]!
+        vld1.32         {d22-d23},[r0,:128]!
+        vcle.s32        q8,  q12, #0
+        vand            q9,  q11, q10
+        veor            q12, q12, q9
+        vand            q2,  q12, q8
+        vbic            q3,  q12, q8
+        vadd.f32        q12, q11, q2
+        vsub.f32        q11, q11, q3
+1:      vld1.32         {d2-d3},  [r1,:128]!
+        vld1.32         {d0-d1},  [r0,:128]!
+        vcle.s32        q8,  q1,  #0
+        vand            q9,  q0,  q10
+        veor            q1,  q1,  q9
+        vst1.32         {d24-d25},[r3, :128]!
+        vst1.32         {d22-d23},[r12,:128]!
+        vand            q2,  q1,  q8
+        vbic            q3,  q1,  q8
+        vadd.f32        q1,  q0,  q2
+        vsub.f32        q0,  q0,  q3
+        subs            r2,  r2,  #8
+        ble             2f
+        vld1.32         {d24-d25},[r1,:128]!
+        vld1.32         {d22-d23},[r0,:128]!
+        vcle.s32        q8,  q12, #0
+        vand            q9,  q11, q10
+        veor            q12, q12, q9
+        vst1.32         {d2-d3},  [r3, :128]!
+        vst1.32         {d0-d1},  [r12,:128]!
+        vand            q2,  q12, q8
+        vbic            q3,  q12, q8
+        vadd.f32        q12, q11, q2
+        vsub.f32        q11, q11, q3
+        b               1b
+
+2:      vst1.32         {d2-d3},  [r3, :128]!
+        vst1.32         {d0-d1},  [r12,:128]!
+        it              lt
+        bxlt            lr
+
+3:      vld1.32         {d2-d3},  [r1,:128]
+        vld1.32         {d0-d1},  [r0,:128]
+        vcle.s32        q8,  q1,  #0
+        vand            q9,  q0,  q10
+        veor            q1,  q1,  q9
+        vand            q2,  q1,  q8
+        vbic            q3,  q1,  q8
+        vadd.f32        q1,  q0,  q2
+        vsub.f32        q0,  q0,  q3
+        vst1.32         {d2-d3},  [r0,:128]!
+        vst1.32         {d0-d1},  [r1,:128]!
+        bx              lr
+endfunc
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@ -36,7 +36,6 @@
 #include "mathops.h"
 #include "mpegvideo.h"
 #include "config.h"
-#include "vorbis.h"

 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
 uint32_t ff_squareTbl[512] = {0, };
@ -2817,9 +2816,6 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

-#if CONFIG_VORBIS_DECODER
-    c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
-#endif
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_clipf = vector_clipf_c;
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@ -346,8 +346,6 @@ typedef struct DSPContext {

    void (*h261_loop_filter)(uint8_t *src, int stride);

-    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
-    void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
    /* assume len is a multiple of 16, and arrays are 32-byte aligned */
    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@ -1,6 +1,7 @@
 OBJS                                   += ppc/dsputil_ppc.o             \
                                          ppc/videodsp_ppc.o            \

+OBJS-$(CONFIG_VORBIS_DECODER)          += ppc/vorbisdsp_altivec.o
 OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o

 FFT-OBJS-$(HAVE_GNU_AS)                += ppc/fft_altivec_s.o
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@ -1283,29 +1283,6 @@ static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, ui
    return score;
 }

-static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
-                                            int blocksize)
-{
-    int i;
-    vector float m, a;
-    vector bool int t0, t1;
-    const vector unsigned int v_31 = //XXX
-        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
-    for (i = 0; i < blocksize; i += 4) {
-        m = vec_ld(0, mag+i);
-        a = vec_ld(0, ang+i);
-        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
-        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
-        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
-        t0 = (vector bool int)vec_and(a, t1);
-        t1 = (vector bool int)vec_andc(a, t1);
-        a = vec_sub(m, (vector float)t1);
-        m = vec_add(m, (vector float)t0);
-        vec_stl(a, 0, ang+i);
-        vec_stl(m, 0, mag+i);
-    }
-}
-
 /* next one assumes that ((line_size % 8) == 0) */
 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
@ -1403,6 +1380,4 @@ void ff_dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
-    if (CONFIG_VORBIS_DECODER)
-        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
 }
--- a/libavcodec/ppc/vorbisdsp_altivec.c
+++ b/libavcodec/ppc/vorbisdsp_altivec.c
@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/vorbisdsp.h"
+
+#if HAVE_ALTIVEC
+static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
+                                            int blocksize)
+{
+    int i;
+    vector float m, a;
+    vector bool int t0, t1;
+    const vector unsigned int v_31 = //XXX
+        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
+    for (i = 0; i < blocksize; i += 4) {
+        m = vec_ld(0, mag+i);
+        a = vec_ld(0, ang+i);
+        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
+        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
+        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
+        t0 = (vector bool int)vec_and(a, t1);
+        t1 = (vector bool int)vec_andc(a, t1);
+        a = vec_sub(m, (vector float)t1);
+        m = vec_add(m, (vector float)t0);
+        vec_stl(a, 0, ang+i);
+        vec_stl(m, 0, mag+i);
+    }
+}
+#endif /* HAVE_ALTIVEC */
+
+void ff_vorbisdsp_init_ppc(VorbisDSPContext* c)
+{
+#if HAVE_ALTIVEC
+    if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
+      c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
+    }
+#endif /* HAVE_ALTIVEC */
+}
--- a/libavcodec/vorbisdec.c
+++ b/libavcodec/vorbisdec.c
@ -29,12 +29,12 @@
 #include "libavutil/float_dsp.h"
 #include "avcodec.h"
 #include "get_bits.h"
-#include "dsputil.h"
 #include "fft.h"
 #include "fmtconvert.h"
 #include "internal.h"

 #include "vorbis.h"
+#include "vorbisdsp.h"
 #include "xiph.h"

 #define V_NB_BITS 8
@ -125,7 +125,7 @@ typedef struct vorbis_context_s {
    AVCodecContext *avccontext;
    AVFrame frame;
    GetBitContext gb;
-    DSPContext dsp;
+    VorbisDSPContext dsp;
    AVFloatDSPContext fdsp;
    FmtConvertContext fmt_conv;

@ -981,7 +981,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext)
    int hdr_type, ret;

    vc->avccontext = avccontext;
-    ff_dsputil_init(&vc->dsp, avccontext);
+    ff_vorbisdsp_init(&vc->dsp);
    avpriv_float_dsp_init(&vc->fdsp, avccontext->flags & CODEC_FLAG_BITEXACT);
    ff_fmt_convert_init(&vc->fmt_conv, avccontext);

--- a/libavcodec/vorbisdsp.c
+++ b/libavcodec/vorbisdsp.c
@ -0,0 +1,33 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "vorbisdsp.h"
+#include "vorbis.h"
+
+void ff_vorbisdsp_init(VorbisDSPContext *dsp)
+{
+    dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
+
+    if (ARCH_X86)
+        ff_vorbisdsp_init_x86(dsp);
+    if (ARCH_PPC)
+        ff_vorbisdsp_init_ppc(dsp);
+    if (ARCH_ARM)
+        ff_vorbisdsp_init_arm(dsp);
+}
--- a/libavcodec/vorbisdsp.h
+++ b/libavcodec/vorbisdsp.h
@ -0,0 +1,34 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VORBISDSP_H
+#define AVCODEC_VORBISDSP_H
+
+typedef struct VorbisDSPContext {
+    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
+    void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
+} VorbisDSPContext;
+
+void ff_vorbisdsp_init(VorbisDSPContext *dsp);
+
+/* for internal use only */
+void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp);
+void ff_vorbisdsp_init_arm(VorbisDSPContext *dsp);
+void ff_vorbisdsp_init_ppc(VorbisDSPContext *dsp);
+
+#endif /* AVCODEC_VORBISDSP_H */
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@ -20,6 +20,7 @@ OBJS-$(CONFIG_RV40_DECODER)            += x86/rv34dsp_init.o            \
 OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp.o
 OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
 OBJS-$(CONFIG_VIDEODSP)                += x86/videodsp_init.o
+OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_VP5_DECODER)             += x86/vp56dsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp56dsp_init.o
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@ -1829,65 +1829,6 @@ void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
    avg_pixels8_mmxext(dst, src, stride, 8);
 }

-static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
-{
-    int i;
-    __asm__ volatile ("pxor %%mm7, %%mm7":);
-    for (i = 0; i < blocksize; i += 2) {
-        __asm__ volatile (
-            "movq       %0, %%mm0   \n\t"
-            "movq       %1, %%mm1   \n\t"
-            "movq    %%mm0, %%mm2   \n\t"
-            "movq    %%mm1, %%mm3   \n\t"
-            "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
-            "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
-            "pslld     $31, %%mm2   \n\t" // keep only the sign bit
-            "pxor    %%mm2, %%mm1   \n\t"
-            "movq    %%mm3, %%mm4   \n\t"
-            "pand    %%mm1, %%mm3   \n\t"
-            "pandn   %%mm1, %%mm4   \n\t"
-            "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
-            "pfsub   %%mm4, %%mm0   \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
-            "movq    %%mm3, %1      \n\t"
-            "movq    %%mm0, %0      \n\t"
-            : "+m"(mag[i]), "+m"(ang[i])
-            :: "memory"
-        );
-    }
-    __asm__ volatile ("femms");
-}
-
-static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
-{
-    int i;
-
-    __asm__ volatile (
-        "movaps  %0, %%xmm5 \n\t"
-        :: "m"(ff_pdw_80000000[0])
-    );
-    for (i = 0; i < blocksize; i += 4) {
-        __asm__ volatile (
-            "movaps      %0, %%xmm0 \n\t"
-            "movaps      %1, %%xmm1 \n\t"
-            "xorps   %%xmm2, %%xmm2 \n\t"
-            "xorps   %%xmm3, %%xmm3 \n\t"
-            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
-            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
-            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
-            "xorps   %%xmm2, %%xmm1 \n\t"
-            "movaps  %%xmm3, %%xmm4 \n\t"
-            "andps   %%xmm1, %%xmm3 \n\t"
-            "andnps  %%xmm1, %%xmm4 \n\t"
-            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
-            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
-            "movaps  %%xmm3, %1     \n\t"
-            "movaps  %%xmm0, %0     \n\t"
-            : "+m"(mag[i]), "+m"(ang[i])
-            :: "memory"
-        );
-    }
-}
-
 static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
 {
@ -2238,8 +2179,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }
-
-    c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
 #endif /* HAVE_INLINE_ASM */

 #if HAVE_YASM
@ -2263,8 +2202,6 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
        }
    }

-    c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
-
    c->vector_clipf = vector_clipf_sse;
 #endif /* HAVE_INLINE_ASM */

--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/vorbisdsp.h"
+#include "dsputil_mmx.h" // for ff_pdw_80000000
+
+#if HAVE_INLINE_ASM
+#if ARCH_X86_32
+static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
+{
+    int i;
+    __asm__ volatile ("pxor %%mm7, %%mm7":);
+    for (i = 0; i < blocksize; i += 2) {
+        __asm__ volatile (
+            "movq       %0, %%mm0   \n\t"
+            "movq       %1, %%mm1   \n\t"
+            "movq    %%mm0, %%mm2   \n\t"
+            "movq    %%mm1, %%mm3   \n\t"
+            "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
+            "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
+            "pslld     $31, %%mm2   \n\t" // keep only the sign bit
+            "pxor    %%mm2, %%mm1   \n\t"
+            "movq    %%mm3, %%mm4   \n\t"
+            "pand    %%mm1, %%mm3   \n\t"
+            "pandn   %%mm1, %%mm4   \n\t"
+            "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
+            "pfsub   %%mm4, %%mm0   \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
+            "movq    %%mm3, %1      \n\t"
+            "movq    %%mm0, %0      \n\t"
+            : "+m"(mag[i]), "+m"(ang[i])
+            :: "memory"
+        );
+    }
+    __asm__ volatile ("femms");
+}
+#endif
+
+static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
+{
+    int i;
+
+    __asm__ volatile (
+        "movaps  %0, %%xmm5 \n\t"
+        :: "m"(ff_pdw_80000000[0])
+    );
+    for (i = 0; i < blocksize; i += 4) {
+        __asm__ volatile (
+            "movaps      %0, %%xmm0 \n\t"
+            "movaps      %1, %%xmm1 \n\t"
+            "xorps   %%xmm2, %%xmm2 \n\t"
+            "xorps   %%xmm3, %%xmm3 \n\t"
+            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
+            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
+            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
+            "xorps   %%xmm2, %%xmm1 \n\t"
+            "movaps  %%xmm3, %%xmm4 \n\t"
+            "andps   %%xmm1, %%xmm3 \n\t"
+            "andnps  %%xmm1, %%xmm4 \n\t"
+            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
+            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
+            "movaps  %%xmm3, %1     \n\t"
+            "movaps  %%xmm0, %0     \n\t"
+            : "+m"(mag[i]), "+m"(ang[i])
+            :: "memory"
+        );
+    }
+}
+#endif
+
+void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
+{
+#if HAVE_INLINE_ASM
+    int mm_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+    if (mm_flags & AV_CPU_FLAG_3DNOW)
+        dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+#endif /* ARCH_X86_32 */
+    if (mm_flags & AV_CPU_FLAG_SSE)
+        dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
+#endif /* HAVE_INLINE_ASM */
+}