yadif: x86 assembly for 9 to 14-bit samples

These smaller samples do not need to be unpacked to double words, which
allows the code to process more pixels per iteration (still 2 in MMX,
but 6 in SSE2).  It also avoids emulating the missing double word
instructions on older instruction sets.

As with the previous code for 16-bit samples, this has been tested on
an Athlon64 and a Core2Quad.

Athlon64:
1809275 decicycles in C,    32718 runs, 50 skips
 911675 decicycles in mmx,  32727 runs, 41 skips, 2.0x faster
 495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster

Core2Quad:
 921363 decicycles in C,     32756 runs, 12 skips
 486537 decicycles in mmx,   32764 runs,  4 skips, 1.9x faster
 293296 decicycles in sse2,  32759 runs,  9 skips, 3.1x faster
 284910 decicycles in ssse3, 32759 runs,  9 skips, 3.2x faster

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
James Darnley 2013-03-16 21:42:24 +01:00 committed by Michael Niedermayer
parent 17e7b49501
commit 0a5814c9ba
3 changed files with 305 additions and 2 deletions

View File

@ -5,4 +5,4 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o
YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o

View File

@ -49,6 +49,16 @@ void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
/* Line filters for samples stored in 16-bit words but using fewer bits.
 * ff_yadif_init_x86() picks these for bit depths 9 through 14; one variant
 * exists per instruction set (MMXEXT, SSE2, SSSE3). */
void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
av_cold void ff_yadif_init_x86(YADIFContext *yadif)
{
int cpu_flags = av_get_cpu_flags();
@ -56,7 +66,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
: yadif->csp->comp[0].depth_minus1 + 1;
#if HAVE_YASM
if (bit_depth > 8) {
if (bit_depth >= 15) {
#if ARCH_X86_32
if (EXTERNAL_MMXEXT(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
@ -67,6 +77,15 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
} else if ( bit_depth >= 9 && bit_depth <= 14) {
#if ARCH_X86_32
if (EXTERNAL_MMXEXT(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE2(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
} else {
#if ARCH_X86_32
if (EXTERNAL_MMXEXT(cpu_flags))

View File

@ -0,0 +1,284 @@
;*****************************************************************************
;* x86-optimized functions for yadif filter
;*
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Eight words of 1. CHECK uses it to mask the low bit of each word and FILTER
; subtracts it from the initial score.
pw_1: times 8 dw 1
SECTION .text
; PABS dst, tmp
; Replace every signed word in %1 with its absolute value.
; With SSSE3 this is a single pabsw and %2 is not touched. Otherwise %2 is
; clobbered: it receives a sign mask and the value is negated where needed.
%macro PABS 2
%if cpuflag(ssse3)
pabsw %1, %1
%else
pxor %2, %2
pcmpgtw %2, %1 ; %2 = all ones in lanes where 0 > %1
pxor %1, %2
psubw %1, %2 ; (x ^ mask) - mask: negates only the masked lanes
%endif
%endmacro
; PMAXUW dst, src
; Per-word unsigned maximum of %1 and %2, result in %1; %2 is not modified.
; Without the SSE4 pmaxuw instruction the maximum is built from saturating
; arithmetic: the subtraction clamps at 0, so adding %2 back yields %1 where
; %1 was larger and %2 otherwise.
; None of the instantiations at the end of this file selects sse4, so they all
; take the emulated path.
%macro PMAXUW 2
%if cpuflag(sse4)
pmaxuw %1, %2
%else
psubusw %1, %2
paddusw %1, %2
%endif
%endmacro
; CHECK a, b
; Evaluate one interpolation candidate built from two rows of the current
; field. %1 and %2 are horizontal offsets in samples (scaled by 2 to bytes);
; t1 and t0 are the row offsets set up by the YADIF macro from the mrefs and
; prefs arguments respectively.
; Out: m5 = rounded-down average of the two loads, shifted down by one word
;      m2 = per-word score: the absolute difference of the two loads summed
;           over lanes i, i+1 and i+2
; Clobbers m3 and m4.
; The shifts bring in zeros at the top, so the two highest lanes of m2 (and
; the highest lane of m5) do not hold complete results.
%macro CHECK 2
movu m2, [curq+t1+%1*2]
movu m3, [curq+t0+%2*2]
mova m4, m2
mova m5, m2
pxor m4, m3
pavgw m5, m3 ; average rounded up
pand m4, [pw_1] ; 1 where the sum of the two inputs is odd
psubusw m5, m4 ; remove the rounding -> average rounded down
%if mmsize == 16
psrldq m5, 2
%else
psrlq m5, 16
%endif
mova m4, m2
psubusw m2, m3
psubusw m3, m4
PMAXUW m2, m3 ; m2 = absolute difference of the two loads
mova m3, m2
mova m4, m2
%if mmsize == 16
psrldq m3, 2
psrldq m4, 4
%else
psrlq m3, 16
psrlq m4, 32
%endif
paddw m2, m3
paddw m2, m4 ; sum over three neighbouring lanes
%endmacro
; CHECK1
; Accept the candidate produced by CHECK wherever its score beats the current
; best.
; In:  m0 = best score so far, m1 = best prediction so far
;      m2 = candidate score,   m5 = candidate prediction
; Out: m0 = per-word signed minimum of old and candidate score
;      m1 = candidate prediction where m0 > m2 held, old prediction elsewhere
;      m6 = mask of the lanes where the candidate won (read by CHECK2)
; Clobbers m3 and m5.
%macro CHECK1 0
mova m3, m0
pcmpgtw m3, m2 ; mask: old score > candidate score
pminsw m0, m2
mova m6, m3
pand m5, m3
pandn m3, m1
por m3, m5
mova m1, m3
%endmacro
; %macro CHECK2 0
; paddw m6, [pw_1]
; psllw m6, 14
; paddsw m2, m6
; mova m3, m0
; pcmpgtw m3, m2
; pminsw m0, m2
; pand m5, m3
; pandn m3, m1
; por m3, m5
; mova m1, m3
; %endmacro
; This version of CHECK2 is required for 14-bit samples. The left-shift trick
; in the old code is not large enough to correctly select pixels or scores.
; CHECK2
; Like CHECK1, but a lane may only be updated if the preceding CHECK1 also
; updated it (mask in m6).
; In:  m0 = best score, m1 = best prediction
;      m2 = candidate score, m5 = candidate prediction
;      m6 = mask left by CHECK1
; Out: m0 and m1 take the candidate's score and prediction in lanes where m6
;      was set and m0 > m2; other lanes keep their previous values.
; Clobbers m2, m3, m5 and m6 (m6 no longer holds a mask afterwards).
%macro CHECK2 0
mova m3, m0 ; keep the old score
pcmpgtw m0, m2
pand m0, m6 ; m0 = combined mask: better score AND CHECK1 passed
mova m6, m0
pand m5, m6
pand m2, m0
pandn m6, m1
pandn m0, m3
por m6, m5
por m0, m2
mova m1, m6
%endmacro
; LOAD n, mem
; Unaligned load of one full register from %2 into m<%1>.
%macro LOAD 2
movu m%1, %2
%endmacro
; FILTER suffix, ptrA, ptrB
; Emit the main loop of the line filter.
;   %1 = suffix that makes the .loop/.end labels unique per expansion
;   %2, %3 = the two pointer registers whose rows are averaged; YADIF passes
;            prevq/curq or curq/nextq depending on parity
; Expects t1/t0 to hold the row offsets (mrefs/prefs), the width in r4m, the
; mode in r8m and at least 64 bytes of scratch space at rsp.
;
; Stack slots:  [rsp+ 0] = cur row at t1        [rsp+16] = (%2 + %3) >> 1
;               [rsp+32] = cur row at t0        [rsp+48] = difference bound
;
; Each iteration stores one full register to dst but advances all pointers by
; only mmsize-4 bytes: the top two words of a result are incomplete (see
; CHECK) and are rewritten by the following iteration.
; Clobbers m0-m7.
%macro FILTER 3
.loop%1:
; --- load the neighbourhood and compute the temporal average ---------------
LOAD 0, [curq+t1]
LOAD 1, [curq+t0]
LOAD 2, [%2]
LOAD 3, [%3]
mova m4, m3
paddw m3, m2
psraw m3, 1
mova [rsp+ 0], m0
mova [rsp+16], m3
mova [rsp+32], m1
; --- difference bound: max of three halved absolute differences -------------
psubw m2, m4
PABS m2, m4 ; |%2 - %3|
LOAD 3, [prevq+t1]
LOAD 4, [prevq+t0]
psubw m3, m0
psubw m4, m1
PABS m3, m5
PABS m4, m5
paddw m3, m4 ; sum of |prev - cur| over both rows
psrlw m2, 1
psrlw m3, 1
pmaxsw m2, m3
LOAD 3, [nextq+t1]
LOAD 4, [nextq+t0]
psubw m3, m0
psubw m4, m1
PABS m3, m5
PABS m4, m5
paddw m3, m4 ; sum of |next - cur| over both rows
psrlw m3, 1
pmaxsw m2, m3
mova [rsp+48], m2
; --- initial prediction (m1) and score (m0) ----------------------------------
paddw m1, m0 ; m1 = sum of the two cur rows
paddw m0, m0
psubw m0, m1 ; m0 = row(t1) - row(t0)
psrlw m1, 1 ; m1 = average of the two cur rows
PABS m0, m2
movu m2, [curq+t1-1*2]
movu m3, [curq+t0-1*2]
mova m4, m2
psubusw m2, m3
psubusw m3, m4
PMAXUW m2, m3 ; absolute row difference, one sample to the left
mova m3, m2
%if mmsize == 16
psrldq m3, 4
%else
psrlq m3, 32
%endif
paddw m0, m2 ; + difference at x-1
paddw m0, m3 ; + difference at x+1
psubw m0, [pw_1]
; --- try the diagonal candidates ---------------------------------------------
CHECK -2, 0
CHECK1
CHECK -3, 1
CHECK2
CHECK 0, -2
CHECK1
CHECK 1, -3
CHECK2
; --- optionally widen the bound using rows two lines away (mode < 2) -------
mova m6, [rsp+48]
cmp DWORD r8m, 2
jge .end%1
LOAD 2, [%2+t1*2]
LOAD 4, [%3+t1*2]
LOAD 3, [%2+t0*2]
LOAD 5, [%3+t0*2]
paddw m2, m4
paddw m3, m5
psrlw m2, 1 ; average two rows away on the t1 side
psrlw m3, 1 ; average two rows away on the t0 side
mova m4, [rsp+ 0]
mova m5, [rsp+16]
mova m7, [rsp+32]
psubw m2, m4
psubw m3, m7
mova m0, m5
psubw m5, m4
psubw m0, m7
mova m4, m2
pminsw m2, m3
pmaxsw m3, m4
pmaxsw m2, m5
pminsw m3, m5
pmaxsw m2, m0 ; m2 = max(m0, m5, min of the two outer differences)
pminsw m3, m0 ; m3 = min(m0, m5, max of the two outer differences)
pxor m4, m4
pmaxsw m6, m3
psubw m4, m2
pmaxsw m6, m4 ; bound = max(bound, m3, -m2)
.end%1:
; --- clamp the prediction to temporal average +/- bound and store ----------
mova m2, [rsp+16]
mova m3, m2
psubw m2, m6
paddw m3, m6
pmaxsw m1, m2
pminsw m1, m3
movu [dstq], m1
add dstq, mmsize-4
add prevq, mmsize-4
add curq, mmsize-4
add nextq, mmsize-4
sub DWORD r4m, mmsize/2-2 ; samples completed this iteration
jg .loop%1
%endmacro
; YADIF
; Emit one yadif_filter_line_10bit function for the instruction set chosen by
; the preceding INIT_* line. The argument order matches the C prototypes:
; dst, prev, cur, next, w, prefs, mrefs, parity, mode.
; The first four arguments are loaded into registers by cglobal; w, parity and
; mode are read through their r<N>m / <name>m accessors, and prefs/mrefs are
; loaded into the temporaries t0/t1 below. 80 bytes of stack are requested
; for the scratch slots used by FILTER.
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
prefs, mrefs, parity, mode
%else
cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
prefs, mrefs, parity, mode
%endif
; Nothing to do for a non-positive width.
cmp DWORD wm, 0
jle .ret
%if ARCH_X86_32
; t0 = prefs, t1 = mrefs
mov r4, r5mp
mov r5, r6mp
DECLARE_REG_TMP 4,5
%else
; t0 = prefs, t1 = mrefs, sign-extended from int so they can be used in
; 64-bit address calculations
movsxd r5, DWORD r5m
movsxd r6, DWORD r6m
DECLARE_REG_TMP 5,6
%endif
; parity selects which pair of pointers supplies the temporal average.
cmp DWORD paritym, 0
je .parity0
FILTER 1, prevq, curq
jmp .ret
.parity0:
FILTER 0, curq, nextq
.ret:
RET
%endmacro
; Instantiate the function once per instruction set. The SSSE3 build differs
; from the SSE2 one only in PABS, which can use pabsw.
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
; The MMX build (8-byte registers) is assembled for 32-bit x86 only.
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif