x86/swr: convert resample_{common, linear}_double_sse2 to yasm

Signed-off-by: James Almer <jamrial@gmail.com>

312531 -> 311528 decicycles

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2014-06-30 13:06:00 -03:00 · 2014-06-30 13:06:00 -03:00 · dd2c9034b1
commit dd2c9034b1
parent fb318def5d
4 changed files with 74 additions and 169 deletions
--- a/libswresample/resample_template.c
+++ b/libswresample/resample_template.c
@ -25,23 +25,15 @@
 * @author Michael Niedermayer <michaelni@gmx.at>
 */

-#if    defined(TEMPLATE_RESAMPLE_DBL)     \
-    || defined(TEMPLATE_RESAMPLE_DBL_SSE2)
+#if defined(TEMPLATE_RESAMPLE_DBL)

+#    define RENAME(N) N ## _double
 #    define FILTER_SHIFT 0
 #    define DELEM  double
 #    define FELEM  double
 #    define FELEM2 double
 #    define OUT(d, v) d = v

-#    if defined(TEMPLATE_RESAMPLE_DBL)
-#        define RENAME(N) N ## _double
-#    elif defined(TEMPLATE_RESAMPLE_DBL_SSE2)
-#        define COMMON_CORE COMMON_CORE_DBL_SSE2
-#        define LINEAR_CORE LINEAR_CORE_DBL_SSE2
-#        define RENAME(N) N ## _double_sse2
-#    endif
-
 #elif    defined(TEMPLATE_RESAMPLE_FLT)

 #    define RENAME(N) N ## _float
@ -104,16 +96,12 @@ int RENAME(swri_resample_common)(ResampleContext *c,
    for (dst_index = 0; dst_index < n; dst_index++) {
        FELEM *filter = ((FELEM *) c->filter_bank) + c->filter_alloc * index;

-#ifdef COMMON_CORE
-        COMMON_CORE
-#else
        FELEM2 val=0;
        int i;
        for (i = 0; i < c->filter_length; i++) {
            val += src[sample_index + i] * (FELEM2)filter[i];
        }
        OUT(dst[dst_index], val);
-#endif

        frac  += c->dst_incr_mod;
        index += c->dst_incr_div;
@ -150,15 +138,11 @@ int RENAME(swri_resample_linear)(ResampleContext *c,
        FELEM *filter = ((FELEM *) c->filter_bank) + c->filter_alloc * index;
        FELEM2 val=0, v2 = 0;

-#ifdef LINEAR_CORE
-        LINEAR_CORE
-#else
        int i;
        for (i = 0; i < c->filter_length; i++) {
            val += src[sample_index + i] * (FELEM2)filter[i];
            v2  += src[sample_index + i] * (FELEM2)filter[i + c->filter_alloc];
        }
-#endif
 #ifdef FELEML
        val += (v2 - val) * (FELEML) frac / c->src_incr;
 #else
@ -188,8 +172,6 @@ int RENAME(swri_resample_linear)(ResampleContext *c,
    return sample_index;
 }

-#undef COMMON_CORE
-#undef LINEAR_CORE
 #undef RENAME
 #undef FILTER_SHIFT
 #undef DELEM
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@ -50,11 +50,12 @@ endstruc
 SECTION_RODATA

 pf_1:      dd 1.0
+pdbl_1:    dq 1.0
 pd_0x4000: dd 0x4000

 SECTION .text

-%macro RESAMPLE_FNS 3 ; format [float or int16], bps, log2_bps
+%macro RESAMPLE_FNS 3-5 ; format [float, double or int16], bps, log2_bps, float op suffix [s or d] (float/double only), 1.0 constant (float/double only)
 ; int resample_common_$format(ResampleContext *ctx, $format *dst,
 ;                             const $format *src, int size, int update_ctx)
 %if ARCH_X86_64 ; unix64 and win64
@ -165,21 +166,21 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
    lea                      filterq, [min_filter_count_x4q+filterq*%2]
    mov         min_filter_count_x4q, min_filter_length_x4q
 %endif
-%ifidn %1, float
-    xorps                         m0, m0, m0
-%else ; int16
+%ifidn %1, int16
    movd                          m0, [pd_0x4000]
+%else ; float/double
+    xorps                         m0, m0, m0
 %endif

    align 16
 .inner_loop:
    movu                          m1, [srcq+min_filter_count_x4q*1]
-%ifidn %1, float
-    mulps                         m1, m1, [filterq+min_filter_count_x4q*1]
-    addps                         m0, m0, m1
-%else ; int16
+%ifidn %1, int16
    pmaddwd                       m1, [filterq+min_filter_count_x4q*1]
    paddd                         m0, m1
+%else ; float/double
+    mulp%4                        m1, m1, [filterq+min_filter_count_x4q*1]
+    addp%4                        m0, m0, m1
 %endif
    add         min_filter_count_x4q, mmsize
    js .inner_loop
@ -189,16 +190,7 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
    addps                        xm0, xm1
 %endif

-    ; horizontal sum & store
-%ifidn %1, float
-    movhlps                      xm1, xm0
-    addps                        xm0, xm1
-    shufps                       xm1, xm0, xm0, q0001
-    add                        fracd, dst_incr_modd
-    addps                        xm0, xm1
-    add                       indexd, dst_incr_divd
-    movss                     [dstq], xm0
-%else ; int16
+%ifidn %1, int16
 %if mmsize == 16
    pshufd                        m1, m0, q0032
    paddd                         m0, m1
@ -212,6 +204,17 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
    packssdw                      m0, m0
    add                       indexd, dst_incr_divd
    movd                      [dstq], m0
+%else ; float/double
+    ; horizontal sum & store
+    movhlps                      xm1, xm0
+%ifidn %1, float
+    addps                        xm0, xm1
+    shufps                       xm1, xm0, xm0, q0001
+%endif
+    add                        fracd, dst_incr_modd
+    addp%4                       xm0, xm1
+    add                       indexd, dst_incr_divd
+    movs%4                    [dstq], xm0
 %endif
    cmp                        fracd, src_incrd
    jl .skip
@ -307,12 +310,12 @@ cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index,
    mov                   ctx_stackq, ctxq
    mov            phase_mask_stackd, phase_maskd
    mov           min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
-%ifidn %1, float
-    cvtsi2ss                     xm0, src_incrd
-    movss                        xm4, [pf_1]
-    divss                        xm4, xm0
-%else ; int16
+%ifidn %1, int16
    movd                          m4, [pd_0x4000]
+%else ; float/double
+    cvtsi2s%4                    xm0, src_incrd
+    movs%4                       xm4, [%5]
+    divs%4                       xm4, xm0
 %endif
    mov                dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
    shl           min_filter_len_x4d, %3
@ -360,12 +363,12 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
    mov                           r3, dword [ctxq+ResampleContext.src_incr]
    PUSH                              dword [ctxq+ResampleContext.phase_mask]
    PUSH                              r3d
-%ifidn %1, float
-    cvtsi2ss                     xm0, r3d
-    movss                        xm4, [pf_1]
-    divss                        xm4, xm0
-%else ; int16
+%ifidn %1, int16
    movd                          m4, [pd_0x4000]
+%else ; float/double
+    cvtsi2s%4                    xm0, r3d
+    movs%4                       xm4, [%5]
+    divs%4                       xm4, xm0
 %endif
    mov        min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
    mov                       indexd, [ctxq+ResampleContext.index]
@ -409,27 +412,27 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
    mov                     filter2q, filter1q
    add                     filter2q, filter_alloc_x4q
 %endif
-%ifidn %1, float
-    xorps                         m0, m0, m0
-    xorps                         m2, m2, m2
-%else ; int16
+%ifidn %1, int16
    mova                          m0, m4
    mova                          m2, m4
+%else ; float/double
+    xorps                         m0, m0, m0
+    xorps                         m2, m2, m2
 %endif

    align 16
 .inner_loop:
    movu                          m1, [srcq+min_filter_count_x4q*1]
-%ifidn %1, float
-    mulps                         m3, m1, [filter2q+min_filter_count_x4q*1]
-    mulps                         m1, m1, [filter1q+min_filter_count_x4q*1]
-    addps                         m2, m2, m3
-    addps                         m0, m0, m1
-%else ; int16
+%ifidn %1, int16
    pmaddwd                       m3, m1, [filter2q+min_filter_count_x4q*1]
    pmaddwd                       m1, [filter1q+min_filter_count_x4q*1]
    paddd                         m2, m3
    paddd                         m0, m1
+%else ; float/double
+    mulp%4                        m3, m1, [filter2q+min_filter_count_x4q*1]
+    mulp%4                        m1, m1, [filter1q+min_filter_count_x4q*1]
+    addp%4                        m2, m2, m3
+    addp%4                        m0, m0, m1
 %endif
    add         min_filter_count_x4q, mmsize
    js .inner_loop
@ -441,24 +444,7 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
    addps                        xm2, xm3
 %endif

-%ifidn %1, float
-    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
-    cvtsi2ss                     xm1, fracd
-    subps                        xm2, xm0
-    mulps                        xm1, xm4
-    shufps                       xm1, xm1, q0000
-    mulps                        xm2, xm1
-    addps                        xm0, xm2
-
-    ; horizontal sum & store
-    movhlps                      xm1, xm0
-    addps                        xm0, xm1
-    shufps                       xm1, xm0, xm0, q0001
-    add                        fracd, dst_incr_modd
-    addps                        xm0, xm1
-    add                       indexd, dst_incr_divd
-    movss                     [dstq], xm0
-%else ; int16
+%ifidn %1, int16
 %if mmsize == 16
    pshufd                        m3, m2, q0032
    pshufd                        m1, m0, q0032
@ -491,6 +477,25 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
    ; - 32bit: eax=r0[filter1], edx=r2[filter2]
    ; - win64: eax=r6[filter1], edx=r1[todo]
    ; - unix64: eax=r6[filter1], edx=r2[todo]
+%else ; float/double
+    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
+    cvtsi2s%4                    xm1, fracd
+    subp%4                       xm2, xm0
+    mulp%4                       xm1, xm4
+    shufp%4                      xm1, xm1, q0000
+    mulp%4                       xm2, xm1
+    addp%4                       xm0, xm2
+
+    ; horizontal sum & store
+    movhlps                      xm1, xm0
+%ifidn %1, float
+    addps                        xm0, xm1
+    shufps                       xm1, xm0, xm0, q0001
+%endif
+    add                        fracd, dst_incr_modd
+    addp%4                       xm0, xm1
+    add                       indexd, dst_incr_divd
+    movs%4                    [dstq], xm0
 %endif
    cmp                        fracd, src_incrd
    jl .skip
@ -553,11 +558,11 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
 %endmacro

 INIT_XMM sse
-RESAMPLE_FNS float, 4, 2
+RESAMPLE_FNS float, 4, 2, s, pf_1

 %if HAVE_AVX_EXTERNAL
 INIT_YMM avx
-RESAMPLE_FNS float, 4, 2
+RESAMPLE_FNS float, 4, 2, s, pf_1
 %endif

 %if ARCH_X86_32
@ -567,3 +572,4 @@ RESAMPLE_FNS int16, 2, 1

 INIT_XMM sse2
 RESAMPLE_FNS int16, 2, 1
+RESAMPLE_FNS double, 8, 3, d, pdbl_1
--- a/libswresample/x86/resample_mmx.h
+++ b/libswresample/x86/resample_mmx.h
@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/x86/asm.h"
-#include "libavutil/cpu.h"
-#include "libswresample/swresample_internal.h"
-
-#define COMMON_CORE_DBL_SSE2 \
-    x86_reg len= -8*c->filter_length;\
-__asm__ volatile(\
-    "xorpd     %%xmm0, %%xmm0     \n\t"\
-    "1:                           \n\t"\
-    "movupd  (%1, %0), %%xmm1     \n\t"\
-    "mulpd   (%2, %0), %%xmm1     \n\t"\
-    "addpd     %%xmm1, %%xmm0     \n\t"\
-    "add       $16, %0            \n\t"\
-    " js 1b                       \n\t"\
-    "movhlps   %%xmm0, %%xmm1     \n\t"\
-    "addpd     %%xmm1, %%xmm0     \n\t"\
-    "movsd     %%xmm0, (%3)       \n\t"\
-    : "+r" (len)\
-    : "r" (((uint8_t*)(src+sample_index))-len),\
-      "r" (((uint8_t*)filter)-len),\
-      "r" (dst+dst_index)\
-    XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
-);
-
-#define LINEAR_CORE_DBL_SSE2 \
-    x86_reg len= -8*c->filter_length;\
-__asm__ volatile(\
-    "xorpd      %%xmm0, %%xmm0    \n\t"\
-    "xorpd      %%xmm2, %%xmm2    \n\t"\
-    "1:                           \n\t"\
-    "movupd   (%3, %0), %%xmm1    \n\t"\
-    "movapd     %%xmm1, %%xmm3    \n\t"\
-    "mulpd    (%4, %0), %%xmm1    \n\t"\
-    "mulpd    (%5, %0), %%xmm3    \n\t"\
-    "addpd      %%xmm1, %%xmm0    \n\t"\
-    "addpd      %%xmm3, %%xmm2    \n\t"\
-    "add           $16, %0        \n\t"\
-    " js 1b                       \n\t"\
-    "movhlps    %%xmm0, %%xmm1    \n\t"\
-    "movhlps    %%xmm2, %%xmm3    \n\t"\
-    "addpd      %%xmm1, %%xmm0    \n\t"\
-    "addpd      %%xmm3, %%xmm2    \n\t"\
-    "movsd      %%xmm0, %1        \n\t"\
-    "movsd      %%xmm2, %2        \n\t"\
-    : "+r" (len),\
-      "=m" (val),\
-      "=m" (v2)\
-    : "r" (((uint8_t*)(src+sample_index))-len),\
-      "r" (((uint8_t*)filter)-len),\
-      "r" (((uint8_t*)(filter+c->filter_alloc))-len)\
-    XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
-);
--- a/libswresample/x86/resample_x86_dsp.c
+++ b/libswresample/x86/resample_x86_dsp.c
@ -27,21 +27,6 @@

 #include "libswresample/resample.h"

-int swri_resample_common_double_sse2(ResampleContext *c,  double *dst, const  double *src, int n, int update_ctx);
-int swri_resample_linear_double_sse2(ResampleContext *c,  double *dst, const  double *src, int n, int update_ctx);
-
-#if HAVE_SSE2_INLINE
-#define DO_RESAMPLE_ONE 0
-
-#include "resample_mmx.h"
-
-#define TEMPLATE_RESAMPLE_DBL_SSE2
-#include "libswresample/resample_template.c"
-#undef TEMPLATE_RESAMPLE_DBL_SSE2
-#endif
-
-#undef DO_RESAMPLE_ONE
-
 int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
                                    const uint8_t *src, int sz, int upd);
 int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
@ -62,6 +47,11 @@ int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
 int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
                                 const uint8_t *src, int sz, int upd);

+int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
+                                   const uint8_t *src, int sz, int upd);
+int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
+                                   const uint8_t *src, int sz, int upd);
+
 void swresample_dsp_x86_init(ResampleContext *c)
 {
    int av_unused mm_flags = av_get_cpu_flags();
@ -78,10 +68,9 @@ void swresample_dsp_x86_init(ResampleContext *c)
    if (HAVE_SSE2_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE2) {
        c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_sse2;
        c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_sse2;
-    }
-    if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) {
-        c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2;
-        c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2;
+
+        c->dsp.resample_common[FNIDX(DBLP)] = ff_resample_common_double_sse2;
+        c->dsp.resample_linear[FNIDX(DBLP)] = ff_resample_linear_double_sse2;
    }
    if (HAVE_AVX_EXTERNAL && mm_flags & AV_CPU_FLAG_AVX) {
        c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;