yadif: x86 assembly for 9 to 14-bit samples
These smaller samples do not need to be unpacked to double words, which allows the code to process more pixels per iteration (still 2 in MMX, but 6 in SSE2). It also avoids emulating the missing double-word instructions on older instruction sets. As with the previous code for 16-bit samples, this has been tested on an Athlon64 and a Core2Quad.
Athlon64:
1809275 decicycles in C, 32718 runs, 50 skips
911675 decicycles in mmx, 32727 runs, 41 skips, 2.0x faster
495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster
Core2Quad:
921363 decicycles in C, 32756 runs, 12 skips
486537 decicycles in mmx, 32764 runs, 4 skips, 1.9x faster
293296 decicycles in sse2, 32759 runs, 9 skips, 3.1x faster
284910 decicycles in ssse3, 32759 runs, 9 skips, 3.2x faster
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
17e7b49501
commit
0a5814c9ba
@ -5,4 +5,4 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
|
||||
|
||||
YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
|
||||
YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
|
||||
YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o
|
||||
YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
|
||||
|
@ -49,6 +49,16 @@ void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
|
||||
void *next, int w, int prefs,
|
||||
int mrefs, int parity, int mode);
|
||||
|
||||
void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
|
||||
void *next, int w, int prefs,
|
||||
int mrefs, int parity, int mode);
|
||||
void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
|
||||
void *next, int w, int prefs,
|
||||
int mrefs, int parity, int mode);
|
||||
void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
|
||||
void *next, int w, int prefs,
|
||||
int mrefs, int parity, int mode);
|
||||
|
||||
av_cold void ff_yadif_init_x86(YADIFContext *yadif)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
@ -56,7 +66,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
|
||||
: yadif->csp->comp[0].depth_minus1 + 1;
|
||||
|
||||
#if HAVE_YASM
|
||||
if (bit_depth > 8) {
|
||||
if (bit_depth >= 15) {
|
||||
#if ARCH_X86_32
|
||||
if (EXTERNAL_MMXEXT(cpu_flags))
|
||||
yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
|
||||
@ -67,6 +77,15 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
|
||||
yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
|
||||
if (EXTERNAL_SSE4(cpu_flags))
|
||||
yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
|
||||
} else if ( bit_depth >= 9 && bit_depth <= 14) {
|
||||
#if ARCH_X86_32
|
||||
if (EXTERNAL_MMXEXT(cpu_flags))
|
||||
yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
|
||||
#endif /* ARCH_X86_32 */
|
||||
if (EXTERNAL_SSE2(cpu_flags))
|
||||
yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
|
||||
if (EXTERNAL_SSSE3(cpu_flags))
|
||||
yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
|
||||
} else {
|
||||
#if ARCH_X86_32
|
||||
if (EXTERNAL_MMXEXT(cpu_flags))
|
||||
|
284
libavfilter/x86/yadif-10.asm
Normal file
284
libavfilter/x86/yadif-10.asm
Normal file
@ -0,0 +1,284 @@
|
||||
;*****************************************************************************
|
||||
;* x86-optimized functions for yadif filter
|
||||
;*
|
||||
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
|
||||
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
|
||||
;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License along
|
||||
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
|
||||
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA

; Eight 16-bit words, each equal to 1 (16 bytes of constant data).
; Read as a memory operand by the pand/paddw/psubw instructions that
; reference [pw_1].
pw_1: dw 1, 1, 1, 1, 1, 1, 1, 1

SECTION .text
|
||||
|
||||
; PABS dst, tmp
;   Replaces every signed 16-bit lane of dst with its absolute value.
;   With SSSE3 this is a single pabsw and tmp is left untouched.
;   Without SSSE3, tmp is clobbered: it ends up holding the sign mask of
;   the original dst (all ones in lanes that were negative).
%macro PABS 2
%if cpuflag(ssse3)
    pabsw     %1, %1
%else
    pxor      %2, %2            ; tmp = 0
    pcmpgtw   %2, %1            ; tmp = (0 > dst) ? 0xFFFF : 0, per lane
    pxor      %1, %2            ; bitwise-invert the negative lanes
    psubw     %1, %2            ; subtracting -1 completes the negation
%endif
%endmacro
|
||||
|
||||
; PMAXUW dst, src
;   Per-lane unsigned 16-bit maximum: dst = max(dst, src).
;   Uses the native pmaxuw when the sse4 cpuflag is set. Otherwise it is
;   emulated with unsigned saturating arithmetic:
;       dst = sat(dst - src) + src
;   which yields dst when dst >= src and src otherwise.
;   src is never modified.
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw    %1, %2
%else
    psubusw   %1, %2            ; dst = max(dst - src, 0)
    paddusw   %1, %2            ; dst = that difference + src
%endif
%endmacro
|
||||
|
||||
; CHECK off_a, off_b
;   Evaluates one candidate pair of samples for the edge-directed
;   prediction. off_a / off_b are sample offsets (scaled by 2 bytes here)
;   applied to the rows at curq+t1 and curq+t0 respectively.
;
;   Out:  m5 = floor((A + B) / 2), moved down by one 16-bit lane
;         m2 = D[i] + D[i+1] + D[i+2] per lane, with D = |A - B| (unsigned)
;   Clobbers: m3, m4
;   where A = [curq+t1+off_a*2] and B = [curq+t0+off_b*2].
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]        ; A
    movu      m3, [curq+t0+%2*2]        ; B

    ; Truncating average: pavgw rounds up, so take away the bit that was
    ; rounded in, i.e. the low bit of (A xor B).
    mova      m5, m2
    pavgw     m5, m3
    mova      m4, m2
    pxor      m4, m3
    pand      m4, [pw_1]
    psubusw   m5, m4
    ; Shift the whole register right by one word: every lane takes the
    ; value of its higher neighbour, the top lane becomes zero.
%if mmsize == 16
    psrldq    m5, 2
%else
    psrlq     m5, 16
%endif

    ; D = |A - B|: both one-sided saturating differences are formed and the
    ; larger is kept (the other one is zero).
    mova      m4, m2
    psubusw   m2, m3
    psubusw   m3, m4
    PMAXUW    m2, m3

    ; Three-tap horizontal sum of D over lanes i, i+1 and i+2.
    mova      m3, m2
    mova      m4, m2
%if mmsize == 16
    psrldq    m3, 2
    psrldq    m4, 4
%else
    psrlq     m3, 16
    psrlq     m4, 32
%endif
    paddw     m2, m3
    paddw     m2, m4
%endmacro
|
||||
|
||||
; CHECK1
;   Merges the candidate produced by CHECK into the running best choice.
;   In:   m0 = best score so far, m1 = best prediction so far,
;         m2 = candidate score,   m5 = candidate prediction
;   Out:  m0 = min(m0, m2)                      (signed words)
;         m1 = candidate where m0 > m2, unchanged elsewhere
;         m6 = the per-lane "candidate won" mask, consumed by CHECK2
;   Clobbers: m3 (left equal to the new m1), m5 (masked by the win mask)
%macro CHECK1 0
    mova      m3, m0
    pcmpgtw   m3, m2            ; mask = old score > candidate score
    mova      m6, m3            ; remember the outcome for CHECK2
    pminsw    m0, m2            ; keep the lower score
    pand      m5, m3            ; candidate prediction in winning lanes
    pandn     m3, m1            ; previous prediction in the other lanes
    por       m3, m5
    mova      m1, m3
%endmacro
|
||||
|
||||
; %macro CHECK2 0
|
||||
; paddw m6, [pw_1]
|
||||
; psllw m6, 14
|
||||
; paddsw m2, m6
|
||||
; mova m3, m0
|
||||
; pcmpgtw m3, m2
|
||||
; pminsw m0, m2
|
||||
; pand m5, m3
|
||||
; pandn m3, m1
|
||||
; por m3, m5
|
||||
; mova m1, m3
|
||||
; %endmacro
|
||||
|
||||
; This version of CHECK2 is required for 14-bit samples. The left-shift trick
|
||||
; in the old code is not large enough to correctly select pixels or scores.
|
||||
|
||||
; CHECK2
;   Like CHECK1, but a lane may only be updated when the preceding CHECK1
;   also updated it: the comparison mask is ANDed with the mask CHECK1
;   left in m6. The selection is done purely with masks (pand/pandn/por).
;   In:   m0 = best score, m1 = best prediction, m6 = mask from CHECK1,
;         m2 = candidate score, m5 = candidate prediction
;   Out:  m0 / m1 = candidate values in lanes where (m0 > m2) and m6,
;                   unchanged elsewhere
;   Clobbers: m2, m5 (both masked), m3 (old score), m6 (left equal to m1)
%macro CHECK2 0
    mova      m3, m0            ; keep the old score for the blend
    pcmpgtw   m0, m2            ; old score > candidate score ?
    pand      m0, m6            ; ... and CHECK1 accepted this lane
    mova      m6, m0            ; second copy of the mask for the prediction

    ; score:  m0 = mask ? candidate : old
    pand      m2, m0
    pandn     m0, m3
    por       m0, m2

    ; prediction:  m1 = mask ? candidate : old
    pand      m5, m6
    pandn     m6, m1
    por       m6, m5
    mova      m1, m6
%endmacro
|
||||
|
||||
; LOAD n, mem
;   Unaligned full-register load of mem into vector register m<n>.
;   The first argument is the bare register number, not the register name.
%macro LOAD 2
    movu      m%1, %2
%endmacro
|
||||
|
||||
; FILTER suffix, src_a, src_b
;   Main loop of the line filter. It is expanded inside the cglobal
;   function emitted by YADIF and relies on that function's named
;   arguments (dstq, prevq, curq, nextq, wm, modem), on its stack area
;   ([rsp+0] .. [rsp+48] are used as four spill slots) and on t0/t1 holding
;   the two line offsets (YADIF loads them from the prefs and mrefs
;   arguments respectively).
;
;   %1  suffix that makes the .loop/.end labels unique per expansion
;   %2  first pointer of the temporal pair  (prevq or curq)
;   %3  second pointer of the temporal pair (curq or nextq)
;
;   Samples are 16-bit words. Each iteration stores mmsize bytes but only
;   advances by mmsize-4 bytes, i.e. mmsize/2-2 samples; the last two
;   lanes of every store are rewritten by the following iteration.
;
;   Names used in the comments below:
;     c = cur[t1]   e = cur[t0]   d = (%2 + %3) >> 1
;
;   Compared with the earlier revision, the loop no longer clears m7 on
;   entry: m7 is only ever used after being loaded from [rsp+32].
%macro FILTER 3
.loop%1:
    LOAD          0, [curq+t1]          ; m0 = c
    LOAD          1, [curq+t0]          ; m1 = e
    LOAD          2, [%2]
    LOAD          3, [%3]
    mova         m4, m3
    paddw        m3, m2
    psraw        m3, 1                  ; m3 = d
    mova   [rsp+ 0], m0                 ; spill c
    mova   [rsp+16], m3                 ; spill d
    mova   [rsp+32], m1                 ; spill e
    psubw        m2, m4
    PABS         m2, m4                 ; m2 = |%2 - %3| (halved below)

    ; (|prev[t1] - c| + |prev[t0] - e|) >> 1
    LOAD          3, [prevq+t1]
    LOAD          4, [prevq+t0]
    psubw        m3, m0
    psubw        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddw        m3, m4
    psrlw        m2, 1
    psrlw        m3, 1
    pmaxsw       m2, m3

    ; (|next[t1] - c| + |next[t0] - e|) >> 1
    LOAD          3, [nextq+t1]
    LOAD          4, [nextq+t0]
    psubw        m3, m0
    psubw        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddw        m3, m4
    psrlw        m3, 1
    pmaxsw       m2, m3
    mova   [rsp+48], m2                 ; spill diff = max of the three terms

    ; Initial spatial prediction and score.
    paddw        m1, m0                 ; c + e
    paddw        m0, m0                 ; 2c
    psubw        m0, m1                 ; c - e
    psrlw        m1, 1                  ; m1 = (c + e) >> 1
    PABS         m0, m2                 ; m0 = |c - e|

    ; Add |c - e| of the left and right neighbours, then subtract 1.
    movu         m2, [curq+t1-1*2]
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3                 ; lane i = |c[i-1] - e[i-1]|
%if mmsize == 16
    mova         m3, m2
    psrldq       m3, 4                  ; lane i = |c[i+1] - e[i+1]|
%else
    mova         m3, m2
    psrlq        m3, 32
%endif
    paddw        m0, m2
    paddw        m0, m3
    psubw        m0, [pw_1]

    ; Try the diagonal candidates. Each CHECK2 only takes effect in lanes
    ; where the CHECK1 just before it did.
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova         m6, [rsp+48]           ; m6 = diff
    cmp  DWORD modem, 2
    jge .end%1                          ; mode >= 2: skip the extra check

    ; b = (%2[2*t1] + %3[2*t1]) >> 1,  f = (%2[2*t0] + %3[2*t0]) >> 1
    LOAD          2, [%2+t1*2]
    LOAD          4, [%3+t1*2]
    LOAD          3, [%2+t0*2]
    LOAD          5, [%3+t0*2]
    paddw        m2, m4
    paddw        m3, m5
    psrlw        m2, 1
    psrlw        m3, 1
    mova         m4, [rsp+ 0]           ; c
    mova         m5, [rsp+16]           ; d
    mova         m7, [rsp+32]           ; e
    psubw        m2, m4                 ; b - c
    psubw        m3, m7                 ; f - e
    mova         m0, m5
    psubw        m5, m4                 ; d - c
    psubw        m0, m7                 ; d - e
    mova         m4, m2
    pminsw       m2, m3                 ; min(b - c, f - e)
    pmaxsw       m3, m4                 ; max(b - c, f - e)
    pmaxsw       m2, m5
    pminsw       m3, m5
    pmaxsw       m2, m0                 ; m2 = max(d - e, d - c, min(b - c, f - e))
    pminsw       m3, m0                 ; m3 = min(d - e, d - c, max(b - c, f - e))
    pxor         m4, m4
    pmaxsw       m6, m3
    psubw        m4, m2
    pmaxsw       m6, m4                 ; diff = max(diff, m3, -m2)

.end%1:
    ; Clamp the prediction to [d - diff, d + diff] and store it.
    mova         m2, [rsp+16]
    mova         m3, m2
    psubw        m2, m6
    paddw        m3, m6
    pmaxsw       m1, m2
    pminsw       m1, m3

    movu     [dstq], m1
    add        dstq, mmsize-4
    add       prevq, mmsize-4
    add        curq, mmsize-4
    add       nextq, mmsize-4
    sub    DWORD wm, mmsize/2-2         ; samples left
    jg .loop%1
%endmacro
|
||||
|
||||
; YADIF
;   Emits one yadif_filter_line_10bit function for the instruction set
;   chosen by the preceding INIT_MMX / INIT_XMM.
;
;   Arguments, in cglobal order:
;       dst, prev, cur, next, w, prefs, mrefs, parity, mode
;   The first four are loaded into registers by cglobal; the remaining
;   ones are reached through their <name>m / r<N>m forms. 80 bytes of
;   stack are requested for the spill slots used by FILTER.
;
;   Returns immediately when w <= 0. Otherwise t0 is loaded with prefs and
;   t1 with mrefs, and parity selects which temporal pair FILTER uses:
;       parity != 0 -> (prev, cur)
;       parity == 0 -> (cur, next)
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%else
cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%endif
    cmp    DWORD wm, 0
    jle .ret                            ; nothing to do for w <= 0

    ; t0 = prefs, t1 = mrefs
%if ARCH_X86_32
    DECLARE_REG_TMP 4,5
    mov          t0, r5mp
    mov          t1, r6mp
%else
    DECLARE_REG_TMP 5,6
    movsxd       t0, DWORD r5m          ; sign-extend the 32-bit int arguments
    movsxd       t1, DWORD r6m
%endif

    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq
    jmp .ret

.parity0:
    FILTER 0, curq, nextq

.ret:
    RET
%endmacro
|
||||
|
||||
; Instantiate the filter once per supported instruction set:
; SSSE3 and SSE2 on 128-bit XMM registers, plus an MMXEXT build on 64-bit
; MMX registers that is only assembled for 32-bit x86.
INIT_XMM ssse3
YADIF

INIT_XMM sse2
YADIF

%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif
|
Loading…
x
Reference in New Issue
Block a user