yadif sse2/ssse3 optimizations

Originally committed as revision 25874 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Baptiste Coudurier 2010-12-04 05:23:44 +00:00
parent 23b8342af4
commit 1ef64490e1
3 changed files with 292 additions and 210 deletions

View File

@ -304,7 +304,11 @@ static av_cold int init(AVFilterContext *ctx, const char *args, void *opaque)
if (args) sscanf(args, "%d:%d", &yadif->mode, &yadif->parity);
yadif->filter_line = filter_line_c;
if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX)
if (HAVE_SSSE3 && cpu_flags & AV_CPU_FLAG_SSSE3)
yadif->filter_line = ff_yadif_filter_line_ssse3;
else if (HAVE_SSE && cpu_flags & AV_CPU_FLAG_SSE2)
yadif->filter_line = ff_yadif_filter_line_sse2;
else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX)
yadif->filter_line = ff_yadif_filter_line_mmx;
av_log(ctx, AV_LOG_INFO, "mode:%d parity:%d\n", yadif->mode, yadif->parity);

View File

@ -22,214 +22,24 @@
#include "libavutil/x86_cpu.h"
#include "libavfilter/yadif.h"
/*
 * LOAD4(mem, dst): load 4 bytes from the memory operand `mem` into the MMX
 * register `dst` (movd) and widen them to 16-bit words by interleaving with
 * %%mm7. The FILTER asm clears %%mm7 with pxor before its first LOAD4.
 */
#define LOAD4(mem,dst) \
"movd "mem", "#dst" \n\t"\
"punpcklbw %%mm7, "#dst" \n\t"
/*
 * Build the SSSE3 variant: yadif_template.c is included with both
 * COMPILE_TEMPLATE_SSE and COMPILE_TEMPLATE_SSSE3 defined, and RENAME
 * appends "_ssse3" to the generated function name.
 * Only COMPILE_TEMPLATE_SSSE3 is undefined afterwards; COMPILE_TEMPLATE_SSE
 * remains defined past the end of this block.
 */
#if HAVE_SSSE3
#define COMPILE_TEMPLATE_SSE 1
#define COMPILE_TEMPLATE_SSSE3 1
#undef RENAME
#define RENAME(a) a ## _ssse3
#include "yadif_template.c"
#undef COMPILE_TEMPLATE_SSSE3
#endif
/*
 * PABS(tmp, dst): per-word absolute value of `dst`.
 * tmp is zeroed, then set to -dst; dst becomes max(dst, -dst) (signed).
 * `tmp` is clobbered.
 */
#define PABS(tmp,dst) \
"pxor "#tmp", "#tmp" \n\t"\
"psubw "#dst", "#tmp" \n\t"\
"pmaxsw "#tmp", "#dst" \n\t"
/*
 * Build the SSE2 variant: yadif_template.c is included with RENAME appending
 * "_sse2" to the generated function name.
 *
 * COMPILE_TEMPLATE_SSE is (re)defined here rather than inherited from an
 * earlier instantiation: the template picks its %%xmm configuration with
 * #ifdef COMPILE_TEMPLATE_SSE, so without a definition in this block a build
 * with HAVE_SSE set but no prior definition would silently generate the
 * "_sse2" function from the MMX macro set.
 */
#if HAVE_SSE
#undef COMPILE_TEMPLATE_SSE
#define COMPILE_TEMPLATE_SSE 1
#undef RENAME
#define RENAME(a) a ## _sse2
#include "yadif_template.c"
#undef COMPILE_TEMPLATE_SSE
#endif
/*
 * CHECK(pj, mj): evaluate one diagonal direction of the spatial predictor.
 * Loads 8 bytes from the line above at byte offset `pj` and from the line
 * below at byte offset `mj` (both relative to cur).
 * Results:
 *   %%mm5 = candidate prediction, the per-byte average of the two loads
 *           rounded down (pavgb rounds up; subtracting (a^b)&1 undoes that),
 *           shifted right by one byte and widened to words;
 *   %%mm2 = score, the sum of three neighbouring absolute differences
 *           (byte lanes at shift 0, 1 and 2), widened to words.
 * Clobbers %%mm3 and %%mm4; expects %%mm7 to be zero.
 */
#define CHECK(pj,mj) \
"movq "#pj"(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1+j] */\
"movq "#mj"(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1-j] */\
"movq %%mm2, %%mm4 \n\t"\
"movq %%mm2, %%mm5 \n\t"\
"pxor %%mm3, %%mm4 \n\t"\
"pavgb %%mm3, %%mm5 \n\t"\
"pand "MANGLE(pb_1)", %%mm4 \n\t"\
"psubusb %%mm4, %%mm5 \n\t"\
"psrlq $8, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
"movq %%mm2, %%mm4 \n\t"\
"psubusb %%mm3, %%mm2 \n\t"\
"psubusb %%mm4, %%mm3 \n\t"\
"pmaxub %%mm3, %%mm2 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm2, %%mm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
"psrlq $8, %%mm3 \n\t" /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
"psrlq $16, %%mm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm4, %%mm2 \n\t" /* score */
/*
 * CHECK1: accept the candidate produced by CHECK where it scores better.
 * Per word lane, where score (%%mm2) < spatial_score (%%mm0):
 *   spatial_score (%%mm0) takes the lower score and
 *   spatial_pred  (%%mm1) takes the candidate from %%mm5.
 * The comparison mask is kept in %%mm6 for the following CHECK2.
 * Clobbers %%mm3 and %%mm5.
 */
#define CHECK1 \
"movq %%mm0, %%mm3 \n\t"\
"pcmpgtw %%mm2, %%mm3 \n\t" /* if(score < spatial_score) */\
"pminsw %%mm2, %%mm0 \n\t" /* spatial_score= score; */\
"movq %%mm3, %%mm6 \n\t"\
"pand %%mm3, %%mm5 \n\t"\
"pandn %%mm1, %%mm3 \n\t"\
"por %%mm5, %%mm3 \n\t"\
"movq %%mm3, %%mm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
/*
 * CHECK2: like CHECK1, but only effective in lanes where the preceding
 * CHECK1 accepted its candidate.
 * %%mm6 holds the CHECK1 mask (-1 where accepted, 0 otherwise); adding 1 and
 * shifting left by 14 turns it into 0 for accepted lanes and 0x4000 for
 * rejected ones. That penalty is added (saturating) to the score in %%mm2
 * before the usual compare/select against %%mm0 / %%mm1.
 * Clobbers %%mm2, %%mm3, %%mm5 and %%mm6.
 */
#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
hurts both quality and speed, but matches the C version. */\
"paddw "MANGLE(pw_1)", %%mm6 \n\t"\
"psllw $14, %%mm6 \n\t"\
"paddsw %%mm6, %%mm2 \n\t"\
"movq %%mm0, %%mm3 \n\t"\
"pcmpgtw %%mm2, %%mm3 \n\t"\
"pminsw %%mm2, %%mm0 \n\t"\
"pand %%mm3, %%mm5 \n\t"\
"pandn %%mm1, %%mm3 \n\t"\
"por %%mm5, %%mm3 \n\t"\
"movq %%mm3, %%mm1 \n\t"
/* 64-bit constants referenced from the inline asm through MANGLE(). */
/* Four 16-bit words, each equal to 1. */
DECLARE_ASM_CONST(16, uint64_t, pw_1) = 0x0001000100010001ULL;
/* Eight bytes, each equal to 1. */
DECLARE_ASM_CONST(16, uint64_t, pb_1) = 0x0101010101010101ULL;
/**
 * Filter one line with the yadif algorithm using MMX-register inline asm.
 *
 * Each loop iteration produces 4 output bytes: it builds a temporal
 * prediction d and a tolerance diff from prev/cur/next, refines a spatial
 * prediction along several diagonals (CHECK/CHECK1/CHECK2), and stores
 * clip(spatial_pred, d - diff, d + diff).
 *
 * @param dst    output line; 4 bytes are stored per iteration
 * @param prev   pointer into the "prev" picture at the current line
 *               (presumably the previous frame/field; verify against caller)
 * @param cur    pointer into the "cur" picture at the current line
 * @param next   pointer into the "next" picture at the current line
 * @param w      loop bound in bytes; the loop advances by 4, so up to 3
 *               bytes past w may be written when w is not a multiple of 4
 * @param refs   byte offset between lines; -refs / +refs (and twice those)
 *               address the neighbouring lines
 * @param parity non-zero: prev2/next2 are prev/cur; zero: they are cur/next
 * @param mode   when mode >= 2 the extra b/f spatial interlacing check that
 *               may enlarge diff is skipped
 *
 * The 8-byte loads at offsets -3..+1 on the lines above and below read
 * bytes x-3 .. x+8 relative to the current position.
 *
 * NOTE(review): pshufw, pavgb, pmaxub, pminsw and pmaxsw belong to the
 * MMX extensions introduced with SSE, not to baseline MMX - confirm the CPU
 * flag used to select this function.
 * NOTE(review): no emms is issued here - presumably the caller does it.
 */
void ff_yadif_filter_line_mmx(uint8_t *dst,
uint8_t *prev, uint8_t *cur, uint8_t *next,
int w, int refs, int parity, int mode)
{
/* Spill slots used by the asm: tmp0 = c, tmp1 = d, tmp2 = e, tmp3 = diff. */
uint64_t tmp0, tmp1, tmp2, tmp3;
int x;
/*
 * FILTER expands to the whole per-line loop. It is a macro because the
 * operand names prev2/next2 are pasted into the asm string and differ
 * between the two parity cases below.
 * NOTE(review): the asm declares no register clobbers, and the separate
 * store statement relies on mm1 still holding the result of the first asm.
 * Its "=m"(*dst) operand names a single byte although movd stores 4.
 */
#define FILTER\
for(x=0; x<w; x+=4){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
LOAD4("(%[cur],%[mrefs])", %%mm0) /* c = cur[x-refs] */\
LOAD4("(%[cur],%[prefs])", %%mm1) /* e = cur[x+refs] */\
LOAD4("(%["prev2"])", %%mm2) /* prev2[x] */\
LOAD4("(%["next2"])", %%mm3) /* next2[x] */\
"movq %%mm3, %%mm4 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"psraw $1, %%mm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
"movq %%mm0, %[tmp0] \n\t" /* c */\
"movq %%mm3, %[tmp1] \n\t" /* d */\
"movq %%mm1, %[tmp2] \n\t" /* e */\
"psubw %%mm4, %%mm2 \n\t"\
PABS( %%mm4, %%mm2) /* temporal_diff0 */\
LOAD4("(%[prev],%[mrefs])", %%mm3) /* prev[x-refs] */\
LOAD4("(%[prev],%[prefs])", %%mm4) /* prev[x+refs] */\
"psubw %%mm0, %%mm3 \n\t"\
"psubw %%mm1, %%mm4 \n\t"\
PABS( %%mm5, %%mm3)\
PABS( %%mm5, %%mm4)\
"paddw %%mm4, %%mm3 \n\t" /* temporal_diff1 */\
"psrlw $1, %%mm2 \n\t"\
"psrlw $1, %%mm3 \n\t"\
"pmaxsw %%mm3, %%mm2 \n\t"\
LOAD4("(%[next],%[mrefs])", %%mm3) /* next[x-refs] */\
LOAD4("(%[next],%[prefs])", %%mm4) /* next[x+refs] */\
"psubw %%mm0, %%mm3 \n\t"\
"psubw %%mm1, %%mm4 \n\t"\
PABS( %%mm5, %%mm3)\
PABS( %%mm5, %%mm4)\
"paddw %%mm4, %%mm3 \n\t" /* temporal_diff2 */\
"psrlw $1, %%mm3 \n\t"\
"pmaxsw %%mm3, %%mm2 \n\t"\
"movq %%mm2, %[tmp3] \n\t" /* diff */\
\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm0, %%mm0 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psrlw $1, %%mm1 \n\t" /* spatial_pred */\
PABS( %%mm2, %%mm0) /* ABS(c-e) */\
\
"movq -1(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1] */\
"movq -1(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1] */\
"movq %%mm2, %%mm4 \n\t"\
"psubusb %%mm3, %%mm2 \n\t"\
"psubusb %%mm4, %%mm3 \n\t"\
"pmaxub %%mm3, %%mm2 \n\t"\
"pshufw $9,%%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
"punpcklbw %%mm7, %%mm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psubw "MANGLE(pw_1)", %%mm0 \n\t" /* spatial_score */\
\
CHECK(-2,0)\
CHECK1\
CHECK(-3,1)\
CHECK2\
CHECK(0,-2)\
CHECK1\
CHECK(1,-3)\
CHECK2\
\
/* if(p->mode<2) ... */\
"movq %[tmp3], %%mm6 \n\t" /* diff */\
"cmpl $2, %[mode] \n\t"\
"jge 1f \n\t"\
LOAD4("(%["prev2"],%[mrefs],2)", %%mm2) /* prev2[x-2*refs] */\
LOAD4("(%["next2"],%[mrefs],2)", %%mm4) /* next2[x-2*refs] */\
LOAD4("(%["prev2"],%[prefs],2)", %%mm3) /* prev2[x+2*refs] */\
LOAD4("(%["next2"],%[prefs],2)", %%mm5) /* next2[x+2*refs] */\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm3 \n\t"\
"psrlw $1, %%mm2 \n\t" /* b */\
"psrlw $1, %%mm3 \n\t" /* f */\
"movq %[tmp0], %%mm4 \n\t" /* c */\
"movq %[tmp1], %%mm5 \n\t" /* d */\
"movq %[tmp2], %%mm7 \n\t" /* e */\
"psubw %%mm4, %%mm2 \n\t" /* b-c */\
"psubw %%mm7, %%mm3 \n\t" /* f-e */\
"movq %%mm5, %%mm0 \n\t"\
"psubw %%mm4, %%mm5 \n\t" /* d-c */\
"psubw %%mm7, %%mm0 \n\t" /* d-e */\
"movq %%mm2, %%mm4 \n\t"\
"pminsw %%mm3, %%mm2 \n\t"\
"pmaxsw %%mm4, %%mm3 \n\t"\
"pmaxsw %%mm5, %%mm2 \n\t"\
"pminsw %%mm5, %%mm3 \n\t"\
"pmaxsw %%mm0, %%mm2 \n\t" /* max */\
"pminsw %%mm0, %%mm3 \n\t" /* min */\
"pxor %%mm4, %%mm4 \n\t"\
"pmaxsw %%mm3, %%mm6 \n\t"\
"psubw %%mm2, %%mm4 \n\t" /* -max */\
"pmaxsw %%mm4, %%mm6 \n\t" /* diff= MAX3(diff, min, -max); */\
"1: \n\t"\
\
"movq %[tmp1], %%mm2 \n\t" /* d */\
"movq %%mm2, %%mm3 \n\t"\
"psubw %%mm6, %%mm2 \n\t" /* d-diff */\
"paddw %%mm6, %%mm3 \n\t" /* d+diff */\
"pmaxsw %%mm2, %%mm1 \n\t"\
"pminsw %%mm3, %%mm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
"packuswb %%mm1, %%mm1 \n\t"\
\
:[tmp0]"=m"(tmp0),\
[tmp1]"=m"(tmp1),\
[tmp2]"=m"(tmp2),\
[tmp3]"=m"(tmp3)\
:[prev] "r"(prev),\
[cur] "r"(cur),\
[next] "r"(next),\
[prefs]"r"((x86_reg)refs),\
[mrefs]"r"((x86_reg)-refs),\
[mode] "g"(mode)\
);\
__asm__ volatile("movd %%mm1, %0" :"=m"(*dst));\
dst += 4;\
prev+= 4;\
cur += 4;\
next+= 4;\
}
/* Instantiate the loop once per parity with the matching prev2/next2 pair. */
if (parity) {
/* parity != 0: temporal neighbours are prev and cur. */
#define prev2 "prev"
#define next2 "cur"
FILTER
#undef prev2
#undef next2
} else {
/* parity == 0: temporal neighbours are cur and next. */
#define prev2 "cur"
#define next2 "next"
FILTER
#undef prev2
#undef next2
}
}
/* Drop the asm helper macros so their names do not leak past this point. */
#undef LOAD4
#undef PABS
#undef CHECK
#undef CHECK1
#undef CHECK2
#undef FILTER
/*
 * Build the MMX variant: yadif_template.c is included with RENAME appending
 * "_mmx" to the generated function name. No COMPILE_TEMPLATE_* macro is
 * defined in this block.
 */
#if HAVE_MMX
#undef RENAME
#define RENAME(a) a ## _mmx
#include "yadif_template.c"
#endif

View File

@ -0,0 +1,268 @@
/*
* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/*
 * Register-width configuration for the template.
 *
 *   MM     register name prefix used in the asm strings
 *   MOV    move that transfers STEP bytes (one output group)
 *   MOVQ   full-register move (movdqa in the SSE case, i.e. aligned)
 *   MOVQU  full-register load used for the cur[] neighbourhood reads
 *          (movdqu in the SSE case, i.e. unaligned)
 *   STEP   number of bytes produced per loop iteration
 *   LOAD   load STEP bytes and widen them to words against register 7
 *   PSRL1  shift a whole register right by one byte
 *   PSRL2  shift a whole register right by two bytes
 *   PSHUF  copy a register shifted right by two bytes; note that despite
 *          the parameter names the FIRST argument (`src`) is the register
 *          written and the SECOND (`dst`) is the one read
 *
 * With COMPILE_TEMPLATE_SSE defined the %%xmm registers are used and 8 bytes
 * are handled per iteration; otherwise %%mm registers and 4 bytes.
 */
#ifdef COMPILE_TEMPLATE_SSE
#define MM "%%xmm"
#define MOV "movq"
#define MOVQ "movdqa"
#define MOVQU "movdqu"
#define STEP 8
#define LOAD(mem,dst) \
MOV" "mem", "dst" \n\t"\
"punpcklbw "MM"7, "dst" \n\t"
#define PSRL1(reg) "psrldq $1, "reg" \n\t"
#define PSRL2(reg) "psrldq $2, "reg" \n\t"
#define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
"psrldq $2, "src" \n\t"
#else
#define MM "%%mm"
#define MOV "movd"
#define MOVQ "movq"
#define MOVQU "movq"
#define STEP 4
#define LOAD(mem,dst) \
MOV" "mem", "dst" \n\t"\
"punpcklbw "MM"7, "dst" \n\t"
#define PSRL1(reg) "psrlq $8, "reg" \n\t"
#define PSRL2(reg) "psrlq $16, "reg" \n\t"
#define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
#endif
/*
 * PABS(tmp, dst): per-word absolute value of `dst`.
 * With COMPILE_TEMPLATE_SSSE3 a single pabsw is emitted and `tmp` is unused;
 * otherwise tmp is set to -dst and dst becomes max(dst, -dst), clobbering
 * `tmp`.
 */
#ifdef COMPILE_TEMPLATE_SSSE3
#define PABS(tmp,dst) \
"pabsw "dst", "dst" \n\t"
#else
#define PABS(tmp,dst) \
"pxor "tmp", "tmp" \n\t"\
"psubw "dst", "tmp" \n\t"\
"pmaxsw "tmp", "dst" \n\t"
#endif
/*
 * CHECK(pj, mj): evaluate one diagonal direction of the spatial predictor.
 * Loads one full register from the line above at byte offset `pj` and from
 * the line below at byte offset `mj` (both relative to cur).
 * Results:
 *   register 5 = candidate prediction, the per-byte average of the two
 *                loads rounded down (pavgb rounds up; subtracting (a^b)&1
 *                undoes that), shifted right one byte and widened to words;
 *   register 2 = score, the sum of three neighbouring absolute differences
 *                (byte lanes at shift 0, 1 and 2), widened to words.
 * Clobbers registers 3 and 4; expects register 7 to be zero.
 *
 * NOTE(review): in the %%xmm configuration ff_pb_1 is used as a 128-bit
 * memory operand of pand, which requires a 16-byte wide, 16-byte aligned
 * constant - confirm its definition.
 */
#define CHECK(pj,mj) \
MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
MOVQ" "MM"2, "MM"4 \n\t"\
MOVQ" "MM"2, "MM"5 \n\t"\
"pxor "MM"3, "MM"4 \n\t"\
"pavgb "MM"3, "MM"5 \n\t"\
"pand "MANGLE(ff_pb_1)", "MM"4 \n\t"\
"psubusb "MM"4, "MM"5 \n\t"\
PSRL1(MM"5") \
"punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
MOVQ" "MM"2, "MM"4 \n\t"\
"psubusb "MM"3, "MM"2 \n\t"\
"psubusb "MM"4, "MM"3 \n\t"\
"pmaxub "MM"3, "MM"2 \n\t"\
MOVQ" "MM"2, "MM"3 \n\t"\
MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
"punpcklbw "MM"7, "MM"2 \n\t"\
"punpcklbw "MM"7, "MM"3 \n\t"\
"punpcklbw "MM"7, "MM"4 \n\t"\
"paddw "MM"3, "MM"2 \n\t"\
"paddw "MM"4, "MM"2 \n\t" /* score */
/*
 * CHECK1: accept the candidate produced by CHECK where it scores better.
 * Per word lane, where score (register 2) < spatial_score (register 0):
 *   spatial_score (register 0) takes the lower score and
 *   spatial_pred  (register 1) takes the candidate from register 5.
 * The comparison mask is kept in register 6 for the following CHECK2.
 * Clobbers registers 3 and 5.
 */
#define CHECK1 \
MOVQ" "MM"0, "MM"3 \n\t"\
"pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
"pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
MOVQ" "MM"3, "MM"6 \n\t"\
"pand "MM"3, "MM"5 \n\t"\
"pandn "MM"1, "MM"3 \n\t"\
"por "MM"5, "MM"3 \n\t"\
MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
/*
 * CHECK2: like CHECK1, but only effective in lanes where the preceding
 * CHECK1 accepted its candidate.
 * Register 6 holds the CHECK1 mask (-1 where accepted, 0 otherwise); adding
 * 1 and shifting left by 14 turns it into 0 for accepted lanes and 0x4000
 * for rejected ones. That penalty is added (saturating) to the score in
 * register 2 before the usual compare/select against registers 0 and 1.
 * Clobbers registers 2, 3, 5 and 6.
 *
 * NOTE(review): in the %%xmm configuration ff_pw_1 is a 128-bit memory
 * operand of paddw and must be 16 bytes wide and 16-byte aligned - confirm.
 */
#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
hurts both quality and speed, but matches the C version. */\
"paddw "MANGLE(ff_pw_1)", "MM"6 \n\t"\
"psllw $14, "MM"6 \n\t"\
"paddsw "MM"6, "MM"2 \n\t"\
MOVQ" "MM"0, "MM"3 \n\t"\
"pcmpgtw "MM"2, "MM"3 \n\t"\
"pminsw "MM"2, "MM"0 \n\t"\
"pand "MM"3, "MM"5 \n\t"\
"pandn "MM"1, "MM"3 \n\t"\
"por "MM"5, "MM"3 \n\t"\
MOVQ" "MM"3, "MM"1 \n\t"
/**
 * Filter one line with the yadif algorithm using inline asm; the actual
 * function name is produced by the RENAME macro of the including file.
 *
 * Each loop iteration produces STEP output bytes: it builds a temporal
 * prediction d and a tolerance diff from prev/cur/next, refines a spatial
 * prediction along several diagonals (CHECK/CHECK1/CHECK2), and stores
 * clip(spatial_pred, d - diff, d + diff).
 *
 * @param dst    output line; STEP bytes are stored per iteration
 * @param prev   pointer into the "prev" picture at the current line
 *               (presumably the previous frame/field; verify against caller)
 * @param cur    pointer into the "cur" picture at the current line
 * @param next   pointer into the "next" picture at the current line
 * @param w      loop bound in bytes; the loop advances by STEP, so up to
 *               STEP-1 bytes past w may be written when w is not a multiple
 *               of STEP
 * @param refs   byte offset between lines; -refs / +refs (and twice those)
 *               address the neighbouring lines
 * @param parity non-zero: prev2/next2 are prev/cur; zero: they are cur/next
 * @param mode   when mode >= 2 the extra b/f spatial interlacing check that
 *               may enlarge diff is skipped
 *
 * The MOVQU loads at offsets -3..+1 on the lines above and below read a
 * full register each, so input bytes before x and past x+STEP are touched.
 */
void RENAME(ff_yadif_filter_line)(uint8_t *dst,
uint8_t *prev, uint8_t *cur, uint8_t *next,
int w, int refs, int parity, int mode)
{
/*
 * 16-byte aligned spill slots used by the asm through MOVQ:
 * tmp0 = c, tmp1 = d, tmp2 = e, tmp3 = diff.
 */
DECLARE_ALIGNED(16, uint8_t, tmp0[16]);
DECLARE_ALIGNED(16, uint8_t, tmp1[16]);
DECLARE_ALIGNED(16, uint8_t, tmp2[16]);
DECLARE_ALIGNED(16, uint8_t, tmp3[16]);
int x;
/*
 * FILTER expands to the whole per-line loop. It is a macro because the
 * operand names prev2/next2 are pasted into the asm string and differ
 * between the two parity cases below.
 * NOTE(review): the asm declares no register clobbers although it uses
 * registers 0-7, and the separate store statement relies on register 1 still
 * holding the result of the first asm. Its "=m"(*dst) operand names a single
 * byte although MOV stores STEP bytes.
 */
#define FILTER\
for(x=0; x<w; x+=STEP){\
__asm__ volatile(\
"pxor "MM"7, "MM"7 \n\t"\
LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
LOAD("(%["next2"])", MM"3") /* next2[x] */\
MOVQ" "MM"3, "MM"4 \n\t"\
"paddw "MM"2, "MM"3 \n\t"\
"psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
MOVQ" "MM"0, %[tmp0] \n\t" /* c */\
MOVQ" "MM"3, %[tmp1] \n\t" /* d */\
MOVQ" "MM"1, %[tmp2] \n\t" /* e */\
"psubw "MM"4, "MM"2 \n\t"\
PABS( MM"4", MM"2") /* temporal_diff0 */\
LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
"psubw "MM"0, "MM"3 \n\t"\
"psubw "MM"1, "MM"4 \n\t"\
PABS( MM"5", MM"3")\
PABS( MM"5", MM"4")\
"paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
"psrlw $1, "MM"2 \n\t"\
"psrlw $1, "MM"3 \n\t"\
"pmaxsw "MM"3, "MM"2 \n\t"\
LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
"psubw "MM"0, "MM"3 \n\t"\
"psubw "MM"1, "MM"4 \n\t"\
PABS( MM"5", MM"3")\
PABS( MM"5", MM"4")\
"paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
"psrlw $1, "MM"3 \n\t"\
"pmaxsw "MM"3, "MM"2 \n\t"\
MOVQ" "MM"2, %[tmp3] \n\t" /* diff */\
\
"paddw "MM"0, "MM"1 \n\t"\
"paddw "MM"0, "MM"0 \n\t"\
"psubw "MM"1, "MM"0 \n\t"\
"psrlw $1, "MM"1 \n\t" /* spatial_pred */\
PABS( MM"2", MM"0") /* ABS(c-e) */\
\
MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
MOVQ" "MM"2, "MM"4 \n\t"\
"psubusb "MM"3, "MM"2 \n\t"\
"psubusb "MM"4, "MM"3 \n\t"\
"pmaxub "MM"3, "MM"2 \n\t"\
PSHUF(MM"3", MM"2") \
"punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
"punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
"paddw "MM"2, "MM"0 \n\t"\
"paddw "MM"3, "MM"0 \n\t"\
"psubw "MANGLE(ff_pw_1)", "MM"0 \n\t" /* spatial_score */\
\
CHECK(-2,0)\
CHECK1\
CHECK(-3,1)\
CHECK2\
CHECK(0,-2)\
CHECK1\
CHECK(1,-3)\
CHECK2\
\
/* if(p->mode<2) ... */\
MOVQ" %[tmp3], "MM"6 \n\t" /* diff */\
"cmpl $2, %[mode] \n\t"\
"jge 1f \n\t"\
LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
"paddw "MM"4, "MM"2 \n\t"\
"paddw "MM"5, "MM"3 \n\t"\
"psrlw $1, "MM"2 \n\t" /* b */\
"psrlw $1, "MM"3 \n\t" /* f */\
MOVQ" %[tmp0], "MM"4 \n\t" /* c */\
MOVQ" %[tmp1], "MM"5 \n\t" /* d */\
MOVQ" %[tmp2], "MM"7 \n\t" /* e */\
"psubw "MM"4, "MM"2 \n\t" /* b-c */\
"psubw "MM"7, "MM"3 \n\t" /* f-e */\
MOVQ" "MM"5, "MM"0 \n\t"\
"psubw "MM"4, "MM"5 \n\t" /* d-c */\
"psubw "MM"7, "MM"0 \n\t" /* d-e */\
MOVQ" "MM"2, "MM"4 \n\t"\
"pminsw "MM"3, "MM"2 \n\t"\
"pmaxsw "MM"4, "MM"3 \n\t"\
"pmaxsw "MM"5, "MM"2 \n\t"\
"pminsw "MM"5, "MM"3 \n\t"\
"pmaxsw "MM"0, "MM"2 \n\t" /* max */\
"pminsw "MM"0, "MM"3 \n\t" /* min */\
"pxor "MM"4, "MM"4 \n\t"\
"pmaxsw "MM"3, "MM"6 \n\t"\
"psubw "MM"2, "MM"4 \n\t" /* -max */\
"pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
"1: \n\t"\
\
MOVQ" %[tmp1], "MM"2 \n\t" /* d */\
MOVQ" "MM"2, "MM"3 \n\t"\
"psubw "MM"6, "MM"2 \n\t" /* d-diff */\
"paddw "MM"6, "MM"3 \n\t" /* d+diff */\
"pmaxsw "MM"2, "MM"1 \n\t"\
"pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
"packuswb "MM"1, "MM"1 \n\t"\
\
:[tmp0]"=m"(tmp0),\
[tmp1]"=m"(tmp1),\
[tmp2]"=m"(tmp2),\
[tmp3]"=m"(tmp3)\
:[prev] "r"(prev),\
[cur] "r"(cur),\
[next] "r"(next),\
[prefs]"r"((x86_reg)refs),\
[mrefs]"r"((x86_reg)-refs),\
[mode] "g"(mode)\
);\
__asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\
dst += STEP;\
prev+= STEP;\
cur += STEP;\
next+= STEP;\
}
/* Instantiate the loop once per parity with the matching prev2/next2 pair. */
if (parity) {
/* parity != 0: temporal neighbours are prev and cur. */
#define prev2 "prev"
#define next2 "cur"
FILTER
#undef prev2
#undef next2
} else {
/* parity == 0: temporal neighbours are cur and next. */
#define prev2 "cur"
#define next2 "next"
FILTER
#undef prev2
#undef next2
}
}
/*
 * Undefine every macro defined by this template so that it can be included
 * again with a different configuration. RENAME and the COMPILE_TEMPLATE_*
 * switches are not touched here.
 */
#undef STEP
#undef MM
#undef MOV
#undef MOVQ
#undef MOVQU
#undef PSHUF
#undef PSRL1
#undef PSRL2
#undef LOAD
#undef PABS
#undef CHECK
#undef CHECK1
#undef CHECK2
#undef FILTER