x86: hpeldsp: avg_pixels_xy2 for mmx2&3dnow
This is a port of the inline assembly of the mmx version to use the pavg(us|)b instruction.

      8    16
mmx  1498 4355
mmx2 1242 3509

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
17ac998055
commit
c081ca851c
@ -29,6 +29,7 @@
|
||||
|
||||
SECTION_RODATA
|
||||
cextern pb_1
|
||||
cextern pw_2
|
||||
|
||||
SECTION_TEXT
|
||||
|
||||
@ -494,3 +495,79 @@ INIT_MMX mmxext
|
||||
AVG_APPROX_PIXELS8_XY2
|
||||
INIT_MMX 3dnow
|
||||
AVG_APPROX_PIXELS8_XY2
|
||||
|
||||
|
||||
; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
;
; AVG_PIXELS_XY2 emits one function per instantiation: avg_pixels16_xy2 when
; the selected CPU flags include sse2, avg_pixels8_xy2 otherwise. Both
; instantiations below use INIT_MMX, so only the avg_pixels8_xy2 form is
; generated from this spot.
;
; Register use, as read from the code below and the prototype above:
;   r0  = block     (read, averaged into, and written back)
;   r1  = pixels    (source; rows are read at [r1] and [r1+1])
;   r2  = line_size (added to r1 / r4 to step one row)
;   r3d = h         (decremented by 2 per loop iteration)
;   r4  = running byte offset applied to both r0 and r1
;   m7  = zero, m6 = constant loaded from pw_2
;
; For each output row the code sums four source bytes (x and x+1 on two
; consecutive rows) as 16-bit words, adds m6, shifts right by 2, packs back to
; bytes and then combines the result with the existing destination via PAVGB.
; The horizontal pair sums of the most recent source row are kept in registers
; (m4/m5 or m0/m1, alternating) so each source row is loaded only once.
;
; NOTE(review): the loop exits only when r3d reaches exactly zero after
; subtracting 2, so h is assumed to be a positive even number -- confirm
; against callers.
|
||||
%macro AVG_PIXELS_XY2 0
|
||||
%if cpuflag(sse2)
|
||||
cglobal avg_pixels16_xy2, 4,5,8
|
||||
%else
|
||||
cglobal avg_pixels8_xy2, 4,5
|
||||
%endif
|
||||
pxor m7, m7 ; m7 = 0, the zero operand for the byte->word unpacks
|
||||
mova m6, [pw_2] ; pw_2 is declared cextern earlier; presumably packed words of 2 (rounding bias for the >>2) -- confirm at its definition
|
||||
movu m0, [r1] ; first source row, bytes at x
|
||||
|
||||
movu m4, [r1+1] ; first source row, bytes at x+1
|
||||
|
||||
mova m1, m0 ; keep copies for the high halves
|
||||
|
||||
mova m5, m4
|
||||
|
||||
punpcklbw m0, m7 ; widen low-half bytes to words (interleave with zero)
|
||||
|
||||
punpcklbw m4, m7
|
||||
|
||||
punpckhbw m1, m7 ; widen high-half bytes to words
|
||||
|
||||
punpckhbw m5, m7
|
||||
|
||||
paddusw m4, m0 ; m4 = low-half  (x + x+1) sums of the first row
|
||||
|
||||
paddusw m5, m1 ; m5 = high-half (x + x+1) sums of the first row
|
||||
|
||||
xor r4, r4 ; row offset starts at 0
|
||||
|
||||
add r1, r2 ; r1 now points one row below, so [r1+r4] is the row under output row [r0+r4]
|
||||
|
||||
.loop:
|
||||
|
||||
; --- first output row of the pair: previous sums in m4/m5, new row into m0/m1 ---
movu m0, [r1+r4] ; next source row, bytes at x
|
||||
|
||||
movu m2, [r1+r4+1] ; next source row, bytes at x+1
|
||||
|
||||
mova m1, m0
|
||||
|
||||
mova m3, m2
|
||||
|
||||
punpcklbw m0, m7
|
||||
|
||||
punpcklbw m2, m7
|
||||
|
||||
punpckhbw m1, m7
|
||||
|
||||
punpckhbw m3, m7
|
||||
|
||||
paddusw m0, m2 ; m0 = low-half pair sums of the new row (kept for the second half)
|
||||
|
||||
paddusw m1, m3 ; m1 = high-half pair sums of the new row
|
||||
|
||||
paddusw m4, m6 ; add the pw_2 constant to the previous row's sums
|
||||
|
||||
paddusw m5, m6
|
||||
|
||||
paddusw m4, m0 ; add the new row's sums: four-sample total plus constant
|
||||
|
||||
paddusw m5, m1
|
||||
|
||||
psrlw m4, 2 ; divide by 4
|
||||
|
||||
psrlw m5, 2
|
||||
|
||||
mova m3, [r0+r4] ; current destination row
|
||||
|
||||
packuswb m4, m5 ; words back to bytes (low half from m4, high half from m5)
|
||||
|
||||
PAVGB m4, m3 ; PAVGB is a macro defined outside this view (pavgb or pavgusb per the commit message); combines the result with the destination bytes
|
||||
|
||||
mova [r0+r4], m4 ; store the output row
|
||||
|
||||
add r4, r2 ; advance one row
|
||||
|
||||
|
||||
; --- second output row of the pair: roles swapped, previous sums in m0/m1, new row into m4/m5 ---
movu m2, [r1+r4] ; next source row, bytes at x
|
||||
|
||||
movu m4, [r1+r4+1] ; next source row, bytes at x+1
|
||||
|
||||
mova m3, m2
|
||||
|
||||
mova m5, m4
|
||||
|
||||
punpcklbw m2, m7
|
||||
|
||||
punpcklbw m4, m7
|
||||
|
||||
punpckhbw m3, m7
|
||||
|
||||
punpckhbw m5, m7
|
||||
|
||||
paddusw m4, m2 ; m4 = low-half pair sums of the new row (carried into the next iteration)
|
||||
|
||||
paddusw m5, m3 ; m5 = high-half pair sums of the new row
|
||||
|
||||
paddusw m0, m6 ; add the pw_2 constant to the previous row's sums
|
||||
|
||||
paddusw m1, m6
|
||||
|
||||
paddusw m0, m4 ; add the new row's sums
|
||||
|
||||
paddusw m1, m5
|
||||
|
||||
psrlw m0, 2 ; divide by 4
|
||||
|
||||
psrlw m1, 2
|
||||
|
||||
mova m3, [r0+r4] ; current destination row
|
||||
|
||||
packuswb m0, m1 ; words back to bytes
|
||||
|
||||
PAVGB m0, m3 ; combine with the destination bytes (see note on PAVGB above)
|
||||
|
||||
mova [r0+r4], m0 ; store the output row
|
||||
|
||||
add r4, r2 ; advance one row
|
||||
|
||||
sub r3d, 2 ; two rows were produced this iteration
|
||||
|
||||
jnz .loop ; repeat until the row counter reaches zero
|
||||
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
; Instantiate the macro once per CPU flavour; neither sets sse2, so each emits avg_pixels8_xy2.
INIT_MMX mmxext
|
||||
AVG_PIXELS_XY2
|
||||
INIT_MMX 3dnow
|
||||
AVG_PIXELS_XY2
|
||||
|
@ -74,6 +74,10 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int h);
|
||||
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
|
||||
@ -156,6 +160,7 @@ CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
|
||||
CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \
|
||||
CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \
|
||||
CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \
|
||||
CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \
|
||||
CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
|
||||
|
||||
HPELDSP_AVG_PIXELS16(_3dnow)
|
||||
@ -209,6 +214,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
|
||||
c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
|
||||
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
|
||||
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
|
||||
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
|
||||
|
||||
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
|
||||
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
|
||||
@ -216,6 +222,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
|
||||
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
|
||||
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
|
||||
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
|
||||
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
|
||||
|
||||
if (!(flags & CODEC_FLAG_BITEXACT)) {
|
||||
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
|
||||
@ -243,6 +250,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
|
||||
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
|
||||
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
|
||||
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
|
||||
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
|
||||
|
||||
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
|
||||
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
|
||||
@ -250,6 +258,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
|
||||
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
|
||||
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
|
||||
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
|
||||
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
|
||||
|
||||
if (!(flags & CODEC_FLAG_BITEXACT)){
|
||||
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
|
||||
|
Loading…
x
Reference in New Issue
Block a user