Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC

Originally committed as revision 23857 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Jason Garrett-Glaser 2010-06-28 18:56:24 +00:00
parent 2ad4a3bc78
commit a173aa8940
2 changed files with 314 additions and 110 deletions

View File

@ -23,6 +23,8 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/vp8dsp.h"
#if HAVE_YASM
/*
* MC functions
*/
@ -65,96 +67,149 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
#define TAP_W16(OPT, TAPTYPE) \
static void ff_put_vp8_epel16_ ## TAPTYPE ## _ ## OPT(uint8_t *dst, \
int dststride, \
uint8_t *src, \
int srcstride, \
int height, \
int mx, int my) \
extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, int dststride,
uint8_t *src, int srcstride,
int height, int mx, int my);
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
uint8_t *dst, int dststride, uint8_t *src, \
int srcstride, int height, int mx, int my) \
{ \
ff_put_vp8_epel8_ ## TAPTYPE ## _ ## OPT(dst, dststride, \
src, srcstride, \
height, mx, my); \
ff_put_vp8_epel8_ ## TAPTYPE ## _ ## OPT(dst + 8, dststride, \
src + 8, srcstride, \
height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst, dststride, src, srcstride, height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
#define TAP_W8(OPT, TAPTYPE) \
static void ff_put_vp8_epel8_ ## TAPTYPE ## _ ## OPT(uint8_t *dst, \
int dststride, \
uint8_t *src, \
int srcstride, \
int height, \
int mx, int my) \
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
uint8_t *dst, int dststride, uint8_t *src, \
int srcstride, int height, int mx, int my) \
{ \
ff_put_vp8_epel4_ ## TAPTYPE ## _ ## OPT(dst, dststride, \
src, srcstride, \
height, mx, my); \
ff_put_vp8_epel4_ ## TAPTYPE ## _ ## OPT(dst + 4, dststride, \
src + 4, srcstride, \
height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
dst, dststride, src, srcstride, height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}
#if HAVE_YASM
TAP_W8 (mmxext, h4)
TAP_W8 (mmxext, h6)
TAP_W16(mmxext, h6)
TAP_W8 (mmxext, v4)
TAP_W8 (mmxext, v6)
TAP_W16(mmxext, v6)
TAP_W8 (mmxext, epel, h4)
TAP_W8 (mmxext, epel, h6)
TAP_W16(mmxext, epel, h6)
TAP_W8 (mmxext, epel, v4)
TAP_W8 (mmxext, epel, v6)
TAP_W16(mmxext, epel, v6)
TAP_W8 (mmxext, bilinear, h)
TAP_W16(mmxext, bilinear, h)
TAP_W8 (mmxext, bilinear, v)
TAP_W16(mmxext, bilinear, v)
TAP_W16(sse2, h6)
TAP_W16(sse2, v6)
TAP_W16(sse2, epel, h6)
TAP_W16(sse2, epel, v6)
TAP_W16(sse2, bilinear, h)
TAP_W16(sse2, bilinear, v)
TAP_W16(ssse3, h6)
TAP_W16(ssse3, v6)
#endif
TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT \
(uint8_t *dst, int dststride, \
uint8_t *src, int srcstride, \
int height, int mx, int my) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
uint8_t *dst, int dststride, uint8_t *src, \
int srcstride, int height, int mx, int my) \
{ \
DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
src -= srcstride * (TAPNUMY / 2 - 1); \
ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT(tmp, SIZE, \
src, srcstride, \
height + TAPNUMY - 1, \
mx, my); \
ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT(dst, dststride, \
tmpptr, SIZE, \
height, mx, my); \
ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
dst, dststride, tmpptr, SIZE, height, mx, my); \
}
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) \
HVTAP(mmxext, 8, x, y, 8, 16)
#if HAVE_YASM
HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)
HVTAP(mmxext, 8, 6, 6, 16, 16)
#endif
#define HVTAPSSE2(x, y, w) \
HVTAP(sse2, 16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)
#if HAVE_YASM
HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)
#endif
/*
 * HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT)
 *
 * Defines the static function ff_put_vp8_bilinear<SIZE>_hv_<OPT>, a two-pass
 * (horizontal, then vertical) bilinear filter built from the matching
 * ff_put_vp8_bilinear<SIZE>_h_<OPT> and ff_put_vp8_bilinear<SIZE>_v_<OPT>
 * functions.
 *
 *   OPT       - instruction-set suffix pasted into the function names
 *   ALIGN     - alignment passed to DECLARE_ALIGNED for the scratch buffer
 *   SIZE      - block width; also used as the stride of the scratch buffer
 *   MAXHEIGHT - sizes the scratch buffer: SIZE * (MAXHEIGHT + 2) bytes
 *
 * The horizontal pass writes height + 1 rows into tmp; the vertical pass then
 * produces height rows in dst from tmp.
 * NOTE(review): the buffer holds one row more than the height + 1 requested;
 * presumably this is slack for kernels that emit rows in pairs -- confirm
 * against the asm.
 */
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
uint8_t *dst, int dststride, uint8_t *src, \
int srcstride, int height, int mx, int my) \
{ \
DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
tmp, SIZE, src, srcstride, height + 1, mx, my); \
ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \
}
HVBILIN(mmxext, 8, 4, 8)
HVBILIN(mmxext, 8, 8, 16)
HVBILIN(mmxext, 8, 16, 16)
HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 8, 16)
HVBILIN(ssse3, 8, 16, 16)
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
#endif
/*
 * VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs the six-tap-only functions for block width SIZE into row IDX of
 * c->put_vp8_epel_pixels_tab: [0][2] = h6, [2][0] = v6, [2][2] = h6v6.
 * Expects a variable named `c` in the expanding scope. The last assignment
 * has no trailing semicolon; the caller supplies it.
 */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
/*
 * VP8_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs the full set of subpel functions for block width SIZE into row IDX
 * of c->put_vp8_epel_pixels_tab: the entries involving a four-tap filter
 * ([0][1], [1][0], [1][1], [1][2], [2][1]) are set here, and the six-tap-only
 * entries are added by expanding VP8_LUMA_MC_FUNC. Entry [IDX][0][0] is not
 * touched. Expects a variable named `c` in the expanding scope.
 */
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
/*
 * VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT)
 *
 * Fills row IDX of c->put_vp8_bilinear_pixels_tab with the bilinear functions
 * for block width SIZE. The same function serves both non-zero values (1 and
 * 2) of each index:
 *   second index 0, third non-zero      -> horizontal-only (_h_)
 *   second index non-zero, third 0      -> vertical-only   (_v_)
 *   both non-zero                       -> two-pass        (_hv_)
 * Entry [IDX][0][0] is not touched. Expects a variable named `c` in the
 * expanding scope; the last assignment has no trailing semicolon.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
{
@ -168,53 +223,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (mm_flags & FF_MM_MMXEXT) {
c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_mmxext;
c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_mmxext;
c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_mmxext;
c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_mmxext;
c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_mmxext;
c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_mmxext;
c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_mmxext;
c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_mmxext;
c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_mmxext;
c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_mmxext;
c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_mmxext;
c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_mmxext;
c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_mmxext;
c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_mmxext;
c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_mmxext;
c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_mmxext;
c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_mmxext;
c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_mmxext;
c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_mmxext;
/* First table index selects the block width: 0 -> 16, 1 -> 8, 2 -> 4.
 * The 4-wide functions must go into row 2; putting them in row 1 would
 * overwrite the 8-wide entries and leave row 2 without asm versions. */
VP8_LUMA_MC_FUNC(0, 16, mmxext);
VP8_MC_FUNC(1, 8, mmxext);
VP8_MC_FUNC(2, 4, mmxext);
VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
}
if (mm_flags & FF_MM_SSE2) {
c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_sse2;
c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_sse2;
c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_sse2;
c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_sse2;
c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_sse2;
c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_sse2;
c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_sse2;
c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_sse2;
c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_sse2;
c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_sse2;
c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_sse2;
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
}
if (mm_flags & FF_MM_SSSE3) {
c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_ssse3;
c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_ssse3;
c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_ssse3;
c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_ssse3;
c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_ssse3;
c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_ssse3;
c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_ssse3;
c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_ssse3;
c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_ssse3;
c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_ssse3;
c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_ssse3;
VP8_LUMA_MC_FUNC(0, 16, ssse3);
VP8_MC_FUNC(1, 8, ssse3);
VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
}
if (mm_flags & FF_MM_SSE4) {

View File

@ -98,13 +98,31 @@ sixtap_filter_v_m: times 8 dw 2
times 8 dw -11
times 8 dw 2
; Bilinear weights as 16-bit words. Entry k (k = 1..7) is eight copies of k and
; occupies 16 bytes, so weight k lives at byte offset (k-1)*16.
bilinear_filter_vw_m: times 8 dw 1
times 8 dw 2
times 8 dw 3
times 8 dw 4
times 8 dw 5
times 8 dw 6
times 8 dw 7
; Bilinear weights as interleaved byte pairs for pmaddubsw. Entry k (k = 1..7)
; is eight copies of the pair (8-k, k) and occupies 16 bytes, at byte offset
; (k-1)*16.
bilinear_filter_vb_m: times 8 db 7, 1
times 8 db 6, 2
times 8 db 5, 3
times 8 db 4, 4
times 8 db 3, 5
times 8 db 2, 6
times 8 db 1, 7
%ifdef PIC
%define fourtap_filter_hw r11
%define sixtap_filter_hw r11
%define fourtap_filter_hb r11
%define sixtap_filter_hb r11
%define fourtap_filter_v r11
%define sixtap_filter_v r11
%define fourtap_filter_hw r11
%define sixtap_filter_hw r11
%define fourtap_filter_hb r11
%define sixtap_filter_hb r11
%define fourtap_filter_v r11
%define sixtap_filter_v r11
%define bilinear_filter_vw r11
%define bilinear_filter_vb r11
%else
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw sixtap_filter_hw_m
@ -112,14 +130,16 @@ sixtap_filter_v_m: times 8 dw 2
%define sixtap_filter_hb sixtap_filter_hb_m
%define fourtap_filter_v fourtap_filter_v_m
%define sixtap_filter_v sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%endif
filter_v4_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
filter_v4_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
filter_v6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_v6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_v6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
cextern pw_4
cextern pw_64
@ -361,8 +381,8 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
shl r5d, 4
mova m2, [pw_64]
mova m3, [filter_v4_shuf1]
mova m4, [filter_v4_shuf2]
mova m3, [filter_h4_shuf]
mova m4, [filter_h6_shuf2]
%ifdef PIC
lea r11, [fourtap_filter_hb_m]
%endif
@ -391,8 +411,8 @@ cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
lea r5d, [r5*3]
mova m3, [filter_v6_shuf1]
mova m4, [filter_v6_shuf2]
mova m3, [filter_h6_shuf1]
mova m4, [filter_h6_shuf2]
%ifdef PIC
lea r11, [sixtap_filter_hb_m]
%endif
@ -406,7 +426,7 @@ cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
mova m2, m0
pshufb m0, m3
pshufb m1, m4
pshufb m2, [filter_v6_shuf3]
pshufb m2, [filter_h6_shuf3]
pmaddubsw m0, m5
pmaddubsw m1, m6
pmaddubsw m2, m7
@ -634,6 +654,162 @@ cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
jg .nextrow
REP_RET
%macro FILTER_BILINEAR 3
; FILTER_BILINEAR opt, width, xmm_regs
;   %1 = instruction-set suffix, %2 = block width used in the function name,
;   %3 = third cglobal argument.
; Emits put_vp8_bilinear<width>_v_<opt> and put_vp8_bilinear<width>_h_<opt>.
; Register use follows the C prototype order:
;   r0 = dst, r1 = dststride, r2 = src, r3 = srcstride, r4 = height,
;   r5 = mx, r6 = my.
; Both functions produce two output rows per loop iteration.

; Vertical filter: out = (a*(8-my) + b*my + 4) >> 3, a/b = pixel and the one below.
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
; r5d = (8-my)*16 and r6d = my*16: byte offsets (plus 16) into the word table
mov r5d, 8*16
shl r6d, 4
sub r5d, r6d
%ifdef PIC
; under PIC, bilinear_filter_vw is an alias for r11
lea r11, [bilinear_filter_vw_m]
%endif
pxor m6, m6
; m4 = weight (8-my), m5 = weight my, each replicated as words
mova m4, [bilinear_filter_vw+r5d-16]
mova m5, [bilinear_filter_vw+r6d-16]
.nextrow
; load three consecutive source rows and widen bytes to words
movh m0, [r2+r3*0]
movh m1, [r2+r3*1]
movh m3, [r2+r3*2]
punpcklbw m0, m6
punpcklbw m1, m6
punpcklbw m3, m6
; the middle row is the lower tap of output row 0 and the upper tap of row 1
mova m2, m1
pmullw m0, m4
pmullw m1, m5
pmullw m2, m4
pmullw m3, m5
paddsw m0, m1
paddsw m2, m3
; (sum >> 2) followed by pavgw with zero ((x + 1) >> 1) equals (sum + 4) >> 3
psraw m0, 2
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
%ifidn %1, mmxext
; MMX path: pack and store each row separately
packuswb m0, m0
packuswb m2, m2
movh [r0+r1*0], m0
movh [r0+r1*1], m2
%else
; XMM path: row 0 in the low half, row 1 in the high half of m0
packuswb m0, m2
movh [r0+r1*0], m0
movhps [r0+r1*1], m0
%endif
; advance both pointers by two rows; loop while rows remain
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
sub r4, 2
jg .nextrow
REP_RET
; Horizontal filter: out = (a*(8-mx) + b*mx + 4) >> 3, a/b = pixel and its right neighbour.
cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
; r6d = (8-mx)*16 and r5d = mx*16: byte offsets (plus 16) into the word table
mov r6d, 8*16
shl r5d, 4
sub r6d, r5d
%ifdef PIC
; under PIC, bilinear_filter_vw is an alias for r11
lea r11, [bilinear_filter_vw_m]
%endif
pxor m6, m6
; m4 = weight (8-mx), m5 = weight mx, each replicated as words
mova m4, [bilinear_filter_vw+r6d-16]
mova m5, [bilinear_filter_vw+r5d-16]
.nextrow
; for each of two rows, load the pixels at x and at x+1
movh m0, [r2+r3*0+0]
movh m1, [r2+r3*0+1]
movh m2, [r2+r3*1+0]
movh m3, [r2+r3*1+1]
punpcklbw m0, m6
punpcklbw m1, m6
punpcklbw m2, m6
punpcklbw m3, m6
pmullw m0, m4
pmullw m1, m5
pmullw m2, m4
pmullw m3, m5
paddsw m0, m1
paddsw m2, m3
; (sum >> 2) followed by pavgw with zero ((x + 1) >> 1) equals (sum + 4) >> 3
psraw m0, 2
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
%ifidn %1, mmxext
; MMX path: pack and store each row separately
packuswb m0, m0
packuswb m2, m2
movh [r0+r1*0], m0
movh [r0+r1*1], m2
%else
; XMM path: row 0 in the low half, row 1 in the high half of m0
packuswb m0, m2
movh [r0+r1*0], m0
movhps [r0+r1*1], m0
%endif
; advance both pointers by two rows; loop while rows remain
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
sub r4, 2
jg .nextrow
REP_RET
%endmacro
INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR sse2, 8, 7
; SSSE3 vertical bilinear filter, two output rows per iteration.
; Register use follows the C prototype order: r0 = dst, r1 = dststride,
; r2 = src, r3 = srcstride, r4 = height, r6 = my.
; out = (a*(8-my) + b*my + 4) >> 3, a/b = pixel and the one below.
cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
; r6d = my*16: byte offset (plus 16) of the (8-my, my) entry in the byte table
shl r6d, 4
%ifdef PIC
; under PIC, bilinear_filter_vb is an alias for r11
lea r11, [bilinear_filter_vb_m]
%endif
pxor m4, m4
; m3 = byte pairs (8-my, my)
mova m3, [bilinear_filter_vb+r6d-16]
.nextrow
movh m0, [r2+r3*0]
movh m1, [r2+r3*1]
movh m2, [r2+r3*2]
; interleave vertically adjacent rows so pmaddubsw forms a*(8-my) + b*my per pixel
punpcklbw m0, m1
punpcklbw m1, m2
pmaddubsw m0, m3
pmaddubsw m1, m3
; (sum >> 2) followed by pavgw with zero ((x + 1) >> 1) equals (sum + 4) >> 3
psraw m0, 2
psraw m1, 2
pavgw m0, m4
pavgw m1, m4
; row 0 in the low half, row 1 in the high half of m0
packuswb m0, m1
movh [r0+r1*0], m0
movhps [r0+r1*1], m0
; advance both pointers by two rows; loop while rows remain
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
sub r4, 2
jg .nextrow
REP_RET
; SSSE3 horizontal bilinear filter, two output rows per iteration.
; Register use follows the C prototype order: r0 = dst, r1 = dststride,
; r2 = src, r3 = srcstride, r4 = height, r5 = mx.
; out = (a*(8-mx) + b*mx + 4) >> 3, a/b = pixel and its right neighbour.
cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
; r5d = mx*16: byte offset (plus 16) of the (8-mx, mx) entry in the byte table
shl r5d, 4
%ifdef PIC
; under PIC, bilinear_filter_vb is an alias for r11
lea r11, [bilinear_filter_vb_m]
%endif
pxor m4, m4
; m2 = shuffle mask pairing each byte with its right neighbour (0,1, 1,2, ... 7,8)
mova m2, [filter_h2_shuf]
; m3 = byte pairs (8-mx, mx)
mova m3, [bilinear_filter_vb+r5d-16]
.nextrow
; unaligned full-register loads; the shuffle only consumes bytes 0..8 of each row
movu m0, [r2+r3*0]
movu m1, [r2+r3*1]
pshufb m0, m2
pshufb m1, m2
pmaddubsw m0, m3
pmaddubsw m1, m3
; (sum >> 2) followed by pavgw with zero ((x + 1) >> 1) equals (sum + 4) >> 3
psraw m0, 2
psraw m1, 2
pavgw m0, m4
pavgw m1, m4
; row 0 in the low half, row 1 in the high half of m0
packuswb m0, m1
movh [r0+r1*0], m0
movhps [r0+r1*1], m0
; advance both pointers by two rows; loop while rows remain
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
sub r4, 2
jg .nextrow
REP_RET
;-----------------------------------------------------------------------------
; IDCT functions:
;