dirac: Fix mmx/sse haar wavelet compose

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Michael Niedermayer 2011-11-01 21:41:01 +01:00
parent 0dc22e92f4
commit 754539a409
2 changed files with 37 additions and 36 deletions

View File

@ -30,6 +30,8 @@ void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b
void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \ void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
\ \
static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
{ \ { \
@ -83,6 +85,28 @@ static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
\ \
ff_vertical_compose_haar##ext(b0, b1, width_align); \ ff_vertical_compose_haar##ext(b0, b1, width_align); \
} \ } \
/* Horizontal Haar compose wrapper (variant without the final rounding shift).
 * Invokes the external ff_horizontal_compose_haar0i##ext routine on the full
 * row, then finishes the trailing (w/2) % align coefficient pairs in C.
 * NOTE(review): presumably the external routine only interleaves the leading
 * multiple-of-align part of b[] but fills all of tmp[] - confirm in the asm. */\
static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
{\
    const int half = w >> 1;\
    /* first index past the last whole group of `align` coefficients */\
    int i = half - (half & (align - 1));\
\
    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
\
    while (i < half) {\
        /* even output sample comes straight from tmp[] */\
        b[2*i    ] = tmp[i];\
        /* odd output sample is rebuilt from the second-half input b[i+half] */\
        b[2*i + 1] = COMPOSE_HAARiH0(b[i + half], tmp[i]);\
        i++;\
    }\
}\
/* Horizontal Haar compose wrapper (variant with a rounding shift by one).
 * Invokes the external ff_horizontal_compose_haar1i##ext routine on the full
 * row, then finishes the trailing (w/2) % align coefficient pairs in C.
 * NOTE(review): presumably the external routine only interleaves the leading
 * multiple-of-align part of b[] but fills all of tmp[] - confirm in the asm. */\
static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
{\
    const int half = w >> 1;\
    /* first index past the last whole group of `align` coefficients */\
    int i = half - (half & (align - 1));\
\
    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
\
    while (i < half) {\
        /* both outputs are rounded: add 1, then arithmetic shift right by 1 */\
        b[2*i    ] = (tmp[i] + 1) >> 1;\
        b[2*i + 1] = (COMPOSE_HAARiH0(b[i + half], tmp[i]) + 1) >> 1;\
        i++;\
    }\
}\
\ \
#if HAVE_YASM #if HAVE_YASM
@ -95,11 +119,6 @@ COMPOSE_VERTICAL(_sse2, 8)
void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w); void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
void ff_horizontal_compose_haar0i_mmx(IDWTELEM *b, IDWTELEM *tmp, int w);
void ff_horizontal_compose_haar1i_mmx(IDWTELEM *b, IDWTELEM *tmp, int w);
void ff_horizontal_compose_haar0i_sse2(IDWTELEM *b, IDWTELEM *tmp, int w);
void ff_horizontal_compose_haar1i_sse2(IDWTELEM *b, IDWTELEM *tmp, int w);
void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x) void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x)
{ {
for (; x < w2; x++) { for (; x < w2; x++) {
@ -108,22 +127,6 @@ void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x
} }
} }
/**
 * Scalar tail of the horizontal Haar compose (variant without rounding shift).
 *
 * For every index from x up to (but not including) w2, writes the interleaved
 * pair b[2*x], b[2*x+1] from tmp[x] and the second-half input b[x+w2].
 *
 * @param b   row buffer; read at b[x+w2], written at b[2*x] and b[2*x+1]
 * @param tmp buffer read at tmp[x] (not written here)
 * @param w2  exclusive upper bound of the index range
 * @param x   first index to process
 */
void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x)
{
    while (x < w2) {
        /* even output sample is a plain copy of tmp[x] */
        b[2*x    ] = tmp[x];
        /* odd output sample is rebuilt from b[x+w2] and tmp[x] */
        b[2*x + 1] = COMPOSE_HAARiH0(b[x + w2], tmp[x]);
        x++;
    }
}
/**
 * Scalar tail of the horizontal Haar compose (variant with rounding shift).
 *
 * Same traversal as the unshifted variant, but each output value has 1 added
 * and is then shifted right by one bit before being stored.
 *
 * @param b   row buffer; read at b[x+w2], written at b[2*x] and b[2*x+1]
 * @param tmp buffer read at tmp[x] (not written here)
 * @param w2  exclusive upper bound of the index range
 * @param x   first index to process
 */
void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x)
{
    while (x < w2) {
        b[2*x    ] = (tmp[x] + 1) >> 1;
        b[2*x + 1] = (COMPOSE_HAARiH0(b[x + w2], tmp[x]) + 1) >> 1;
        x++;
    }
}
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type) void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
{ {
#if HAVE_YASM #if HAVE_YASM
@ -148,11 +151,11 @@ void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
break; break;
case DWT_DIRAC_HAAR0: case DWT_DIRAC_HAAR0:
d->vertical_compose = vertical_compose_haar_mmx; d->vertical_compose = vertical_compose_haar_mmx;
d->horizontal_compose = ff_horizontal_compose_haar0i_mmx; d->horizontal_compose = horizontal_compose_haar0i_mmx;
break; break;
case DWT_DIRAC_HAAR1: case DWT_DIRAC_HAAR1:
d->vertical_compose = vertical_compose_haar_mmx; d->vertical_compose = vertical_compose_haar_mmx;
d->horizontal_compose = ff_horizontal_compose_haar1i_mmx; d->horizontal_compose = horizontal_compose_haar1i_mmx;
break; break;
} }
#endif #endif
@ -175,11 +178,11 @@ void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
break; break;
case DWT_DIRAC_HAAR0: case DWT_DIRAC_HAAR0:
d->vertical_compose = vertical_compose_haar_sse2; d->vertical_compose = vertical_compose_haar_sse2;
//MMXDISABLED d->horizontal_compose = ff_horizontal_compose_haar0i_sse2; d->horizontal_compose = horizontal_compose_haar0i_sse2;
break; break;
case DWT_DIRAC_HAAR1: case DWT_DIRAC_HAAR1:
d->vertical_compose = vertical_compose_haar_sse2; d->vertical_compose = vertical_compose_haar_sse2;
d->horizontal_compose = ff_horizontal_compose_haar1i_sse2; d->horizontal_compose = horizontal_compose_haar1i_sse2;
break; break;
} }

View File

@ -22,8 +22,6 @@
%include "x86inc.asm" %include "x86inc.asm"
cextern horizontal_compose_dd97i_end_c cextern horizontal_compose_dd97i_end_c
cextern horizontal_compose_haar0i_end_c
cextern horizontal_compose_haar1i_end_c
SECTION_RODATA SECTION_RODATA
pw_1: times 8 dw 1 pw_1: times 8 dw 1
@ -188,7 +186,7 @@ cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width) ; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2 cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
mov w2d, wd mov w2d, wd
xor xd, xd xor xq, xq
shr w2d, 1 shr w2d, 1
lea b_w2q, [bq+wq] lea b_w2q, [bq+wq]
mova m3, [pw_1] mova m3, [pw_1]
@ -199,13 +197,13 @@ cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
psraw m1, 1 psraw m1, 1
psubw m0, m1 psubw m0, m1
mova [tmpq + 2*xq], m0 mova [tmpq + 2*xq], m0
add xd, mmsize/2 add xq, mmsize/2
cmp xd, w2d cmp xq, w2q
jl .lowpass_loop jl .lowpass_loop
xor xd, xd xor xq, xq
and w2d, ~(mmsize/2 - 1) and w2q, ~(mmsize/2 - 1)
cmp w2d, mmsize/2 cmp w2q, mmsize/2
jl .end jl .end
.highpass_loop: .highpass_loop:
@ -226,11 +224,11 @@ cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
mova [bq+4*xq], m0 mova [bq+4*xq], m0
mova [bq+4*xq+mmsize], m2 mova [bq+4*xq+mmsize], m2
add xd, mmsize/2 add xq, mmsize/2
cmp xd, w2d cmp xq, w2q
jl .highpass_loop jl .highpass_loop
.end: .end:
END_HORIZONTAL horizontal_compose_haar%2i_end_c REP_RET
%endmacro %endmacro