x86: hpeldsp: implement SSE2 versions
These functions are mostly used by codecs older than H.264, e.g. MPEG-2.

Benchmarks for the put16 versions (cycles: mmx / mmxext / sse2):
    x2: 1888 / 1185 / 552
    y2: 1778 / 1092 / 510
avg16 xy2: 3509 (mmxext) -> 2169 (sse2)

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
9eaa8c22bc
commit
f0aca50e0b
@ -35,21 +35,39 @@ SECTION_TEXT
|
||||
|
||||
; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||
%macro PUT_PIXELS8_X2 0
|
||||
%if cpuflag(sse2)
|
||||
cglobal put_pixels16_x2, 4,5,4
|
||||
%else
|
||||
cglobal put_pixels8_x2, 4,5
|
||||
%endif
|
||||
lea r4, [r2*2]
|
||||
.loop:
|
||||
mova m0, [r1]
|
||||
mova m1, [r1+r2]
|
||||
PAVGB m0, [r1+1]
|
||||
PAVGB m1, [r1+r2+1]
|
||||
movu m0, [r1+1]
|
||||
movu m1, [r1+r2+1]
|
||||
%if cpuflag(sse2)
|
||||
movu m2, [r1]
|
||||
movu m3, [r1+r2]
|
||||
pavgb m0, m2
|
||||
pavgb m1, m3
|
||||
%else
|
||||
PAVGB m0, [r1]
|
||||
PAVGB m1, [r1+r2]
|
||||
%endif
|
||||
mova [r0], m0
|
||||
mova [r0+r2], m1
|
||||
add r1, r4
|
||||
add r0, r4
|
||||
mova m0, [r1]
|
||||
mova m1, [r1+r2]
|
||||
PAVGB m0, [r1+1]
|
||||
PAVGB m1, [r1+r2+1]
|
||||
movu m0, [r1+1]
|
||||
movu m1, [r1+r2+1]
|
||||
%if cpuflag(sse2)
|
||||
movu m2, [r1]
|
||||
movu m3, [r1+r2]
|
||||
pavgb m0, m2
|
||||
pavgb m1, m3
|
||||
%else
|
||||
PAVGB m0, [r1]
|
||||
PAVGB m1, [r1+r2]
|
||||
%endif
|
||||
add r1, r4
|
||||
mova [r0], m0
|
||||
mova [r0+r2], m1
|
||||
@ -107,6 +125,9 @@ INIT_MMX mmxext
|
||||
PUT_PIXELS_16
|
||||
INIT_MMX 3dnow
|
||||
PUT_PIXELS_16
|
||||
; The 8_X2 macro can easily be used here
|
||||
INIT_XMM sse2
|
||||
PUT_PIXELS8_X2
|
||||
|
||||
|
||||
; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||
@ -199,20 +220,24 @@ PUT_NO_RND_PIXELS8_X2_EXACT
|
||||
|
||||
; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||
%macro PUT_PIXELS8_Y2 0
|
||||
%if cpuflag(sse2)
|
||||
cglobal put_pixels16_y2, 4,5,3
|
||||
%else
|
||||
cglobal put_pixels8_y2, 4,5
|
||||
%endif
|
||||
lea r4, [r2*2]
|
||||
mova m0, [r1]
|
||||
movu m0, [r1]
|
||||
sub r0, r2
|
||||
.loop:
|
||||
mova m1, [r1+r2]
|
||||
mova m2, [r1+r4]
|
||||
movu m1, [r1+r2]
|
||||
movu m2, [r1+r4]
|
||||
add r1, r4
|
||||
PAVGB m0, m1
|
||||
PAVGB m1, m2
|
||||
mova [r0+r2], m0
|
||||
mova [r0+r4], m1
|
||||
mova m1, [r1+r2]
|
||||
mova m0, [r1+r4]
|
||||
movu m1, [r1+r2]
|
||||
movu m0, [r1+r4]
|
||||
add r0, r4
|
||||
add r1, r4
|
||||
PAVGB m2, m1
|
||||
@ -229,6 +254,9 @@ INIT_MMX mmxext
|
||||
PUT_PIXELS8_Y2
|
||||
INIT_MMX 3dnow
|
||||
PUT_PIXELS8_Y2
|
||||
; actually, put_pixels16_y2_sse2
|
||||
INIT_XMM sse2
|
||||
PUT_PIXELS8_Y2
|
||||
|
||||
|
||||
; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||
@ -352,34 +380,50 @@ AVG_PIXELS8
|
||||
%endmacro
|
||||
|
||||
%macro AVG_PIXELS8_X2 0
|
||||
%if cpuflag(sse2)
|
||||
cglobal avg_pixels16_x2, 4,5,4
|
||||
%else
|
||||
cglobal avg_pixels8_x2, 4,5
|
||||
%endif
|
||||
lea r4, [r2*2]
|
||||
%if notcpuflag(mmxext)
|
||||
pcmpeqd m5, m5
|
||||
paddb m5, m5
|
||||
%endif
|
||||
.loop:
|
||||
mova m0, [r1]
|
||||
mova m2, [r1+r2]
|
||||
movu m0, [r1]
|
||||
movu m2, [r1+r2]
|
||||
%if notcpuflag(mmxext)
|
||||
PAVGB_MMX [r1+1], m0, m3, m5
|
||||
PAVGB_MMX [r1+r2+1], m2, m4, m5
|
||||
PAVGB_MMX [r0], m0, m3, m5
|
||||
PAVGB_MMX [r0+r2], m2, m4, m5
|
||||
%else
|
||||
%if cpuflag(sse2)
|
||||
movu m1, [r1+1]
|
||||
movu m3, [r1+r2+1]
|
||||
pavgb m0, m1
|
||||
pavgb m2, m3
|
||||
%else
|
||||
PAVGB m0, [r1+1]
|
||||
PAVGB m2, [r1+r2+1]
|
||||
%endif
|
||||
PAVGB m0, [r0]
|
||||
PAVGB m2, [r0+r2]
|
||||
%endif
|
||||
add r1, r4
|
||||
mova [r0], m0
|
||||
mova [r0+r2], m2
|
||||
mova m0, [r1]
|
||||
mova m2, [r1+r2]
|
||||
movu m0, [r1]
|
||||
movu m2, [r1+r2]
|
||||
%if notcpuflag(mmxext)
|
||||
PAVGB_MMX [r1+1], m0, m3, m5
|
||||
PAVGB_MMX [r1+r2+1], m2, m4, m5
|
||||
%elif cpuflag(sse2)
|
||||
movu m1, [r1+1]
|
||||
movu m3, [r1+r2+1]
|
||||
pavgb m0, m1
|
||||
pavgb m2, m3
|
||||
%else
|
||||
PAVGB m0, [r1+1]
|
||||
PAVGB m2, [r1+r2+1]
|
||||
@ -389,6 +433,9 @@ cglobal avg_pixels8_x2, 4,5
|
||||
%if notcpuflag(mmxext)
|
||||
PAVGB_MMX [r0], m0, m3, m5
|
||||
PAVGB_MMX [r0+r2], m2, m4, m5
|
||||
%elif cpuflag(sse2)
|
||||
pavgb m0, [r0]
|
||||
pavgb m2, [r0+r2]
|
||||
%else
|
||||
PAVGB m0, [r0]
|
||||
PAVGB m2, [r0+r2]
|
||||
@ -407,36 +454,39 @@ INIT_MMX mmxext
|
||||
AVG_PIXELS8_X2
|
||||
INIT_MMX 3dnow
|
||||
AVG_PIXELS8_X2
|
||||
; actually avg_pixels16_x2
|
||||
INIT_XMM sse2
|
||||
AVG_PIXELS8_X2
|
||||
|
||||
|
||||
; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||
%macro AVG_PIXELS8_Y2 0
|
||||
%if cpuflag(sse2)
|
||||
cglobal avg_pixels16_y2, 4,5,3
|
||||
%else
|
||||
cglobal avg_pixels8_y2, 4,5
|
||||
%endif
|
||||
lea r4, [r2*2]
|
||||
mova m0, [r1]
|
||||
movu m0, [r1]
|
||||
sub r0, r2
|
||||
.loop:
|
||||
mova m1, [r1+r2]
|
||||
mova m2, [r1+r4]
|
||||
movu m1, [r1+r2]
|
||||
movu m2, [r1+r4]
|
||||
add r1, r4
|
||||
PAVGB m0, m1
|
||||
PAVGB m1, m2
|
||||
mova m3, [r0+r2]
|
||||
mova m4, [r0+r4]
|
||||
PAVGB m0, m3
|
||||
PAVGB m1, m4
|
||||
PAVGB m0, [r0+r2]
|
||||
PAVGB m1, [r0+r4]
|
||||
mova [r0+r2], m0
|
||||
mova [r0+r4], m1
|
||||
mova m1, [r1+r2]
|
||||
mova m0, [r1+r4]
|
||||
movu m1, [r1+r2]
|
||||
movu m0, [r1+r4]
|
||||
PAVGB m2, m1
|
||||
PAVGB m1, m0
|
||||
add r0, r4
|
||||
add r1, r4
|
||||
mova m3, [r0+r2]
|
||||
mova m4, [r0+r4]
|
||||
PAVGB m2, m3
|
||||
PAVGB m1, m4
|
||||
PAVGB m2, [r0+r2]
|
||||
PAVGB m1, [r0+r4]
|
||||
mova [r0+r2], m2
|
||||
mova [r0+r4], m1
|
||||
add r0, r4
|
||||
@ -449,6 +499,9 @@ INIT_MMX mmxext
|
||||
AVG_PIXELS8_Y2
|
||||
INIT_MMX 3dnow
|
||||
AVG_PIXELS8_Y2
|
||||
; actually avg_pixels16_y2
|
||||
INIT_XMM sse2
|
||||
AVG_PIXELS8_Y2
|
||||
|
||||
|
||||
; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
||||
@ -571,3 +624,5 @@ INIT_MMX mmxext
|
||||
AVG_PIXELS_XY2
|
||||
INIT_MMX 3dnow
|
||||
AVG_PIXELS_XY2
|
||||
INIT_XMM sse2
|
||||
AVG_PIXELS_XY2
|
||||
|
@ -40,6 +40,16 @@ void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
|
||||
@ -284,7 +294,12 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
|
||||
// these functions are slower than mmx on AMD, but faster on Intel
|
||||
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
|
||||
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
|
||||
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
|
||||
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
|
||||
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
|
||||
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
|
||||
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
|
||||
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;
|
||||
}
|
||||
#endif /* HAVE_SSE2_EXTERNAL */
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user