Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
Originally committed as revision 23857 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
2ad4a3bc78
commit
a173aa8940
@ -23,6 +23,8 @@
|
||||
#include "libavutil/x86_cpu.h"
|
||||
#include "libavcodec/vp8dsp.h"
|
||||
|
||||
#if HAVE_YASM
|
||||
|
||||
/*
|
||||
* MC functions
|
||||
*/
|
||||
@ -65,96 +67,149 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, int dststride,
|
||||
uint8_t *src, int srcstride,
|
||||
int height, int mx, int my);
|
||||
|
||||
#define TAP_W16(OPT, TAPTYPE) \
|
||||
static void ff_put_vp8_epel16_ ## TAPTYPE ## _ ## OPT(uint8_t *dst, \
|
||||
int dststride, \
|
||||
uint8_t *src, \
|
||||
int srcstride, \
|
||||
int height, \
|
||||
int mx, int my) \
|
||||
extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, int dststride,
|
||||
uint8_t *src, int srcstride,
|
||||
int height, int mx, int my);
|
||||
extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, int dststride,
|
||||
uint8_t *src, int srcstride,
|
||||
int height, int mx, int my);
|
||||
|
||||
extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, int dststride,
|
||||
uint8_t *src, int srcstride,
|
||||
int height, int mx, int my);
|
||||
extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, int dststride,
|
||||
uint8_t *src, int srcstride,
|
||||
int height, int mx, int my);
|
||||
extern void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, int dststride,
|
||||
uint8_t *src, int srcstride,
|
||||
int height, int mx, int my);
|
||||
extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, int dststride,
|
||||
uint8_t *src, int srcstride,
|
||||
int height, int mx, int my);
|
||||
|
||||
/*
 * TAP_W16(OPT, FILTERTYPE, TAPTYPE)
 *
 * Defines static ff_put_vp8_<FILTERTYPE>16_<TAPTYPE>_<OPT>() with the common
 * MC prototype (dst, dststride, src, srcstride, height, mx, my).
 * The 16-pixel-wide function is built from the 8-pixel-wide function of the
 * same FILTERTYPE/TAPTYPE/OPT, called once for the left half and once for
 * the right half (dst + 8 / src + 8).
 *
 * Only the FILTERTYPE-parameterised calls are issued; hard-coded "epel8"
 * calls would name non-existent functions when FILTERTYPE is "bilinear"
 * and would filter every block twice when it is "epel".
 */
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst,     dststride, src,     srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
|
||||
#define TAP_W8(OPT, TAPTYPE) \
|
||||
static void ff_put_vp8_epel8_ ## TAPTYPE ## _ ## OPT(uint8_t *dst, \
|
||||
int dststride, \
|
||||
uint8_t *src, \
|
||||
int srcstride, \
|
||||
int height, \
|
||||
int mx, int my) \
|
||||
/*
 * TAP_W8(OPT, FILTERTYPE, TAPTYPE)
 *
 * Defines static ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT>() with the common
 * MC prototype (dst, dststride, src, srcstride, height, mx, my).
 * The 8-pixel-wide function is built from the 4-pixel-wide function of the
 * same FILTERTYPE/TAPTYPE/OPT, called once for the left half and once for
 * the right half (dst + 4 / src + 4).
 *
 * Only the FILTERTYPE-parameterised calls are issued; hard-coded "epel4"
 * calls would name non-existent functions when FILTERTYPE is "bilinear"
 * and would filter every block twice when it is "epel".
 */
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst,     dststride, src,     srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}
|
||||
|
||||
#if HAVE_YASM
|
||||
TAP_W8 (mmxext, h4)
|
||||
TAP_W8 (mmxext, h6)
|
||||
TAP_W16(mmxext, h6)
|
||||
TAP_W8 (mmxext, v4)
|
||||
TAP_W8 (mmxext, v6)
|
||||
TAP_W16(mmxext, v6)
|
||||
TAP_W8 (mmxext, epel, h4)
|
||||
TAP_W8 (mmxext, epel, h6)
|
||||
TAP_W16(mmxext, epel, h6)
|
||||
TAP_W8 (mmxext, epel, v4)
|
||||
TAP_W8 (mmxext, epel, v6)
|
||||
TAP_W16(mmxext, epel, v6)
|
||||
TAP_W8 (mmxext, bilinear, h)
|
||||
TAP_W16(mmxext, bilinear, h)
|
||||
TAP_W8 (mmxext, bilinear, v)
|
||||
TAP_W16(mmxext, bilinear, v)
|
||||
|
||||
TAP_W16(sse2, h6)
|
||||
TAP_W16(sse2, v6)
|
||||
TAP_W16(sse2, epel, h6)
|
||||
TAP_W16(sse2, epel, v6)
|
||||
TAP_W16(sse2, bilinear, h)
|
||||
TAP_W16(sse2, bilinear, v)
|
||||
|
||||
TAP_W16(ssse3, h6)
|
||||
TAP_W16(ssse3, v6)
|
||||
#endif
|
||||
TAP_W16(ssse3, epel, h6)
|
||||
TAP_W16(ssse3, epel, v6)
|
||||
TAP_W16(ssse3, bilinear, h)
|
||||
TAP_W16(ssse3, bilinear, v)
|
||||
|
||||
/*
 * HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT)
 *
 * Defines static ff_put_vp8_epel<SIZE>_h<TAPNUMX>v<TAPNUMY>_<OPT>(), a
 * two-pass (horizontal, then vertical) filter built from the single-direction
 * functions of the same SIZE and OPT:
 *
 *  1. src is moved up by (TAPNUMY / 2 - 1) rows and the horizontal filter
 *     writes height + TAPNUMY - 1 rows into tmp, an ALIGN-aligned buffer of
 *     SIZE * (MAXHEIGHT + TAPNUMY - 1) bytes whose row stride is SIZE.
 *  2. The vertical filter reads tmp starting (TAPNUMY / 2 - 1) rows in
 *     (tmpptr) and writes height rows to dst.
 *
 * The signature is emitted once and each pass is issued exactly once.
 * height is not checked against MAXHEIGHT here.
 */
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
    uint8_t *dst,  int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
    uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
    src -= srcstride * (TAPNUMY / 2 - 1); \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
        tmp, SIZE,      src,    srcstride, height + TAPNUMY - 1, mx, my); \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
        dst, dststride, tmpptr, SIZE,      height,               mx, my); \
}
|
||||
|
||||
/*
 * HVTAPMMX(x, y): instantiate the mmxext HVTAP function for an x-tap
 * horizontal / y-tap vertical filter at two block sizes, both with an
 * 8-byte-aligned temporary: SIZE 4 (MAXHEIGHT 8) and SIZE 8 (MAXHEIGHT 16).
 */
#define HVTAPMMX(x, y) \
|
||||
HVTAP(mmxext, 8, x, y, 4, 8) \
|
||||
HVTAP(mmxext, 8, x, y, 8, 16)
|
||||
|
||||
#if HAVE_YASM
|
||||
HVTAPMMX(4, 4)
|
||||
HVTAPMMX(4, 6)
|
||||
HVTAPMMX(6, 4)
|
||||
HVTAPMMX(6, 6)
|
||||
HVTAP(mmxext, 8, 6, 6, 16, 16)
|
||||
#endif
|
||||
|
||||
/*
 * HVTAPSSE2(x, y, w): instantiate the HVTAP function of width w for an
 * x-tap horizontal / y-tap vertical filter twice, once for sse2 and once
 * for ssse3, both with a 16-byte-aligned temporary and MAXHEIGHT 16.
 */
#define HVTAPSSE2(x, y, w) \
|
||||
HVTAP(sse2, 16, x, y, w, 16) \
|
||||
HVTAP(ssse3, 16, x, y, w, 16)
|
||||
|
||||
#if HAVE_YASM
|
||||
HVTAPSSE2(4, 4, 8)
|
||||
HVTAPSSE2(4, 6, 8)
|
||||
HVTAPSSE2(6, 4, 8)
|
||||
HVTAPSSE2(6, 6, 8)
|
||||
HVTAPSSE2(6, 6, 16)
|
||||
#endif
|
||||
|
||||
/*
 * HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT)
 *
 * Defines static ff_put_vp8_bilinear<SIZE>_hv_<OPT>(): a horizontal bilinear
 * pass into an ALIGN-aligned scratch buffer followed by a vertical bilinear
 * pass from that buffer into dst.
 *
 * The scratch buffer holds SIZE * (MAXHEIGHT + 2) bytes and is addressed with
 * a row stride of SIZE. The horizontal pass is asked for height + 1 rows;
 * the vertical pass produces height rows.
 * NOTE(review): the second spare row presumably absorbs the row-pair
 * granularity of the asm loops - confirm against the asm.
 */
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
    uint8_t *dst, \
    int dststride, \
    uint8_t *src, \
    int srcstride, \
    int height, \
    int mx, \
    int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, hrows)[SIZE * (MAXHEIGHT + 2)]; \
    \
    ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT(hrows, SIZE, \
                                              src, srcstride, \
                                              height + 1, mx, my); \
    ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT(dst, dststride, \
                                              hrows, SIZE, \
                                              height, mx, my); \
}
|
||||
|
||||
HVBILIN(mmxext, 8, 4, 8)
|
||||
HVBILIN(mmxext, 8, 8, 16)
|
||||
HVBILIN(mmxext, 8, 16, 16)
|
||||
HVBILIN(sse2, 8, 8, 16)
|
||||
HVBILIN(sse2, 8, 16, 16)
|
||||
HVBILIN(ssse3, 8, 8, 16)
|
||||
HVBILIN(ssse3, 8, 16, 16)
|
||||
|
||||
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
||||
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
|
||||
#endif
|
||||
|
||||
/*
 * VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs the three six-tap entries of row IDX of
 * c->put_vp8_epel_pixels_tab: [0][2] = h6, [2][0] = v6, [2][2] = h6v6,
 * using the ff_put_vp8_epel<SIZE>_*_<OPT> functions.
 * Expects a pointer named c to be in scope at the point of use.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct under an unbraced if/else; invoke it with a trailing ';'.
 */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT; \
    } while (0)
|
||||
|
||||
/*
 * VP8_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs every non-copy entry of row IDX of c->put_vp8_epel_pixels_tab:
 * the four-tap and mixed entries ([0][1] = h4, [1][0] = v4, [1][1] = h4v4,
 * [1][2] = h6v4, [2][1] = h4v6) and then the six-tap entries through
 * VP8_LUMA_MC_FUNC.
 * Expects a pointer named c to be in scope at the point of use.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct under an unbraced if/else; invoke it with a trailing ';'.
 */
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT); \
    } while (0)
|
||||
|
||||
/*
 * VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs every non-copy entry of row IDX of
 * c->put_vp8_bilinear_pixels_tab from the ff_put_vp8_bilinear<SIZE>_*_<OPT>
 * functions. Only three functions exist per size, so slots 1 and 2 of each
 * dimension share them:
 *   [0][1], [0][2]                 -> h
 *   [1][0], [2][0]                 -> v
 *   [1][1], [1][2], [2][1], [2][2] -> hv
 * Expects a pointer named c to be in scope at the point of use.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct under an unbraced if/else; invoke it with a trailing ';'.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    } while (0)
|
||||
|
||||
|
||||
av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
||||
{
|
||||
@ -168,53 +223,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
||||
/* note that 4-tap width=16 functions are missing because w=16
|
||||
* is only used for luma, and luma is always a copy or sixtap. */
|
||||
/*
 * MMXEXT provides all three block widths. The first table index selects
 * the width: 0 = 16 pixels, 1 = 8 pixels, 2 = 4 pixels. The 4-pixel
 * functions must therefore go into row 2; writing them to row 1 would
 * replace the 8-pixel entries installed on the line before and leave
 * row 2 without MMXEXT versions.
 */
if (mm_flags & FF_MM_MMXEXT) {
    VP8_LUMA_MC_FUNC(0, 16, mmxext);
    VP8_MC_FUNC(1, 8, mmxext);
    VP8_MC_FUNC(2, 4, mmxext);
    VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
    VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
    VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
}
|
||||
|
||||
/*
 * SSE2 overrides the 16-pixel (row 0) and 8-pixel (row 1) entries only;
 * no 4-pixel SSE2 functions are installed here. The macros write exactly
 * the table slots that would otherwise be assigned one by one.
 */
if (mm_flags & FF_MM_SSE2) {
    VP8_LUMA_MC_FUNC(0, 16, sse2);
    VP8_MC_FUNC(1, 8, sse2);
    VP8_BILINEAR_MC_FUNC(0, 16, sse2);
    VP8_BILINEAR_MC_FUNC(1, 8, sse2);
}
|
||||
|
||||
/*
 * SSSE3 overrides the 16-pixel (row 0) and 8-pixel (row 1) entries only;
 * no 4-pixel SSSE3 functions are installed here. The macros write exactly
 * the table slots that would otherwise be assigned one by one.
 */
if (mm_flags & FF_MM_SSSE3) {
    VP8_LUMA_MC_FUNC(0, 16, ssse3);
    VP8_MC_FUNC(1, 8, ssse3);
    VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
    VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
}
|
||||
|
||||
if (mm_flags & FF_MM_SSE4) {
|
||||
|
@ -98,13 +98,31 @@ sixtap_filter_v_m: times 8 dw 2
|
||||
times 8 dw -11
|
||||
times 8 dw 2
|
||||
|
||||
; Word-sized bilinear weights: seven 16-byte rows, row k (0-based) holding
; eight copies of the word k+1. The bilinear functions address it as
; [table + n*16 - 16], which selects the row filled with the value n.
bilinear_filter_vw_m: times 8 dw 1
|
||||
times 8 dw 2
|
||||
times 8 dw 3
|
||||
times 8 dw 4
|
||||
times 8 dw 5
|
||||
times 8 dw 6
|
||||
times 8 dw 7
|
||||
|
||||
; Byte-pair bilinear weights: seven 16-byte rows, row k (0-based) holding
; eight copies of the byte pair (7-k, k+1), i.e. (8-n, n) for n = k+1.
; Loaded as the pmaddubsw multiplier by the ssse3 bilinear functions.
bilinear_filter_vb_m: times 8 db 7, 1
|
||||
times 8 db 6, 2
|
||||
times 8 db 5, 3
|
||||
times 8 db 4, 4
|
||||
times 8 db 3, 5
|
||||
times 8 db 2, 6
|
||||
times 8 db 1, 7
|
||||
|
||||
%ifdef PIC
|
||||
%define fourtap_filter_hw r11
|
||||
%define sixtap_filter_hw r11
|
||||
%define fourtap_filter_hb r11
|
||||
%define sixtap_filter_hb r11
|
||||
%define fourtap_filter_v r11
|
||||
%define sixtap_filter_v r11
|
||||
%define fourtap_filter_hw r11
|
||||
%define sixtap_filter_hw r11
|
||||
%define fourtap_filter_hb r11
|
||||
%define sixtap_filter_hb r11
|
||||
%define fourtap_filter_v r11
|
||||
%define sixtap_filter_v r11
|
||||
%define bilinear_filter_vw r11
|
||||
%define bilinear_filter_vb r11
|
||||
%else
|
||||
%define fourtap_filter_hw fourtap_filter_hw_m
|
||||
%define sixtap_filter_hw sixtap_filter_hw_m
|
||||
@ -112,14 +130,16 @@ sixtap_filter_v_m: times 8 dw 2
|
||||
%define sixtap_filter_hb sixtap_filter_hb_m
|
||||
%define fourtap_filter_v fourtap_filter_v_m
|
||||
%define sixtap_filter_v sixtap_filter_v_m
|
||||
%define bilinear_filter_vw bilinear_filter_vw_m
|
||||
%define bilinear_filter_vb bilinear_filter_vb_m
|
||||
%endif
|
||||
|
||||
filter_v4_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
|
||||
filter_v4_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
|
||||
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
|
||||
filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
|
||||
|
||||
filter_v6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
|
||||
filter_v6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
|
||||
filter_v6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
|
||||
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
|
||||
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
|
||||
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
|
||||
|
||||
cextern pw_4
|
||||
cextern pw_64
|
||||
@ -361,8 +381,8 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
|
||||
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
|
||||
shl r5d, 4
|
||||
mova m2, [pw_64]
|
||||
mova m3, [filter_v4_shuf1]
|
||||
mova m4, [filter_v4_shuf2]
|
||||
mova m3, [filter_h4_shuf]
|
||||
mova m4, [filter_h6_shuf2]
|
||||
%ifdef PIC
|
||||
lea r11, [fourtap_filter_hb_m]
|
||||
%endif
|
||||
@ -391,8 +411,8 @@ cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
|
||||
|
||||
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
|
||||
lea r5d, [r5*3]
|
||||
mova m3, [filter_v6_shuf1]
|
||||
mova m4, [filter_v6_shuf2]
|
||||
mova m3, [filter_h6_shuf1]
|
||||
mova m4, [filter_h6_shuf2]
|
||||
%ifdef PIC
|
||||
lea r11, [sixtap_filter_hb_m]
|
||||
%endif
|
||||
@ -406,7 +426,7 @@ cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
|
||||
mova m2, m0
|
||||
pshufb m0, m3
|
||||
pshufb m1, m4
|
||||
pshufb m2, [filter_v6_shuf3]
|
||||
pshufb m2, [filter_h6_shuf3]
|
||||
pmaddubsw m0, m5
|
||||
pmaddubsw m1, m6
|
||||
pmaddubsw m2, m7
|
||||
@ -634,6 +654,162 @@ cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
|
||||
jg .nextrow
|
||||
REP_RET
|
||||
|
||||
;-----------------------------------------------------------------------------
; FILTER_BILINEAR cpu, width, xmmregs
;   %1 = function-name suffix (mmxext / sse2)
;   %2 = block width in pixels, used in the function name
;   %3 = XMM register count handed to cglobal
;
; Emits put_vp8_bilinear<width>_v_<cpu> and put_vp8_bilinear<width>_h_<cpu>.
; Register use matches the C prototype order:
;   r0 = dst, r1 = dststride, r2 = src, r3 = srcstride, r4 = height,
;   r5 = mx,  r6 = my
; Each loop iteration produces two output rows; height is decremented by 2.
;
; Per pixel the result is ((a*(8-f) + b*f) >> 2, then pavgw against zero),
; which equals (a*(8-f) + b*f + 4) >> 3, where f is my (v) or mx (h) and
; a/b are the two neighbouring source pixels in the filtered direction.
;
; The weight tables are indexed with the full-width r5/r6 registers: the
; preceding 32-bit mov/shl/sub already zero-extend them on x86-64, and under
; PIC the base register r11 is 64-bit, so a 32-bit index register could not
; be encoded in the same effective address.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16                  ; r5 = (8 - my) * 16 (mx is unused here)
    shl      r6d, 4                     ; r6 = my * 16
    sub      r5d, r6d
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                    ; zero: unpack source and pavgw rounding
    mova      m4, [bilinear_filter_vw+r5-16] ; words of (8 - my)
    mova      m5, [bilinear_filter_vw+r6-16] ; words of my
.nextrow:
    movh      m0, [r2+r3*0]             ; row n
    movh      m1, [r2+r3*1]             ; row n+1
    movh      m3, [r2+r3*2]             ; row n+2
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                    ; row n+1 is used by both output rows
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                    ; low half = row n, high half = row n+1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16                  ; r6 = (8 - mx) * 16 (my is unused here)
    shl      r5d, 4                     ; r5 = mx * 16
    sub      r6d, r5d
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                    ; zero: unpack source and pavgw rounding
    mova      m4, [bilinear_filter_vw+r6-16] ; words of (8 - mx)
    mova      m5, [bilinear_filter_vw+r5-16] ; words of mx
.nextrow:
    movh      m0, [r2+r3*0+0]           ; row n,   pixel x
    movh      m1, [r2+r3*0+1]           ; row n,   pixel x+1
    movh      m2, [r2+r3*1+0]           ; row n+1, pixel x
    movh      m3, [r2+r3*1+1]           ; row n+1, pixel x+1
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                    ; low half = row n, high half = row n+1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
|
||||
|
||||
;-----------------------------------------------------------------------------
; put_vp8_bilinear8_v_ssse3: vertical bilinear filter, 8 pixels wide.
; Register use matches the C prototype order:
;   r0 = dst, r1 = dststride, r2 = src, r3 = srcstride, r4 = height,
;   r5 = mx (unused), r6 = my
; Two output rows per iteration. Bytes of two vertically adjacent rows are
; interleaved and multiplied by the (8-my, my) byte pairs with pmaddubsw;
; psraw 2 followed by pavgw against zero gives (sum + 4) >> 3.
;
; The weight table is indexed with the full-width r6: "shl r6d, 4" already
; zero-extends it on x86-64, and under PIC the base register r11 is 64-bit,
; so a 32-bit index register could not be encoded in the same address.
;-----------------------------------------------------------------------------
cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
    shl      r6d, 4                     ; r6 = my * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                    ; zero operand for pavgw rounding
    mova      m3, [bilinear_filter_vb+r6-16] ; byte pairs (8 - my, my)
.nextrow:
    movh      m0, [r2+r3*0]             ; row n
    movh      m1, [r2+r3*1]             ; row n+1
    movh      m2, [r2+r3*2]             ; row n+2
    punpcklbw m0, m1                    ; pairs (row n,   row n+1)
    punpcklbw m1, m2                    ; pairs (row n+1, row n+2)
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
    packuswb  m0, m1                    ; low half = row n, high half = row n+1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
|
||||
|
||||
;-----------------------------------------------------------------------------
; put_vp8_bilinear8_h_ssse3: horizontal bilinear filter, 8 pixels wide.
; Register use matches the C prototype order:
;   r0 = dst, r1 = dststride, r2 = src, r3 = srcstride, r4 = height,
;   r5 = mx, r6 = my (unused)
; Two output rows per iteration. Each source row is loaded unaligned as a
; full register and rearranged by filter_h2_shuf into (x, x+1) byte pairs,
; which pmaddubsw multiplies by the (8-mx, mx) byte pairs; psraw 2 followed
; by pavgw against zero gives (sum + 4) >> 3.
;
; The weight table is indexed with the full-width r5: "shl r5d, 4" already
; zero-extends it on x86-64, and under PIC the base register r11 is 64-bit,
; so a 32-bit index register could not be encoded in the same address.
;-----------------------------------------------------------------------------
cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
    shl      r5d, 4                     ; r5 = mx * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                    ; zero operand for pavgw rounding
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16] ; byte pairs (8 - mx, mx)
.nextrow:
    movu      m0, [r2+r3*0]             ; row n
    movu      m1, [r2+r3*1]             ; row n+1
    pshufb    m0, m2                    ; pairs (x, x+1) for x = 0..7
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
    packuswb  m0, m1                    ; low half = row n, high half = row n+1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; IDCT functions:
|
||||
;
|
||||
|
Loading…
Reference in New Issue
Block a user