Merge "Added vp8_fast_quantize_b_sse2"

This commit is contained in:
Scott LaVarnway 2010-10-11 09:34:48 -07:00 committed by Code Review
commit 6b1b28a83c
3 changed files with 142 additions and 160 deletions

View File

@ -284,156 +284,3 @@ sym(vp8_fast_quantize_b_impl_mmx):
UNSHADOW_ARGS
pop rbp
ret
;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
;                                 short *qcoeff_ptr,short *dequant_ptr,
;                                 short *scan_mask, short *round_ptr,
;                                 short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes 16 signed 16-bit coefficients (two groups of 8 lanes).
; Per lane:
;   sz      = z >> 15                      (0 or -1)
;   x       = (z ^ sz) - sz                (abs)
;   x       = (zbin > x) ? 0 : x           (signed compare)
;   y       = ((x + round) * quant) >> 16  (unsigned high multiply)
;   qcoeff  = (y ^ sz) - sz                (sign restored)
;   dqcoeff = low 16 bits of qcoeff * dequant
;
; Return (rax): let s = low 16 bits of the sum over all lanes of
; (qcoeff != 0) * scan_mask[lane]. The result is 0 when s == 0, otherwise
; (index of the highest set bit of s) + 1.
; NOTE(review): presumably every scan_mask entry is a distinct single bit so
; that s is a bitmask of non-zero positions -- confirm against the caller.
;
; Every pointer is accessed with movdqa at offsets 0 and 16, so each must be
; 16-byte aligned and cover at least 32 bytes.
;
; Only xmm0-xmm5 are written. xmm6/xmm7 are callee-saved under the Microsoft
; x64 calling convention, so they are deliberately left untouched instead of
; being used as scratch.
global sym(vp8_fast_quantize_b_impl_sse)
sym(vp8_fast_quantize_b_impl_sse):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    push        rsi
    push        rdi
    ; end prolog

    ; ---- lanes 0..7 ----
    mov         rsi, arg(0)             ;coeff_ptr
    movdqa      xmm0, [rsi]

    mov         rax, arg(1)             ;zbin_ptr
    movdqa      xmm1, [rax]

    movdqa      xmm3, xmm0
    psraw       xmm0, 15                ;sz
    pxor        xmm3, xmm0
    psubw       xmm3, xmm0              ;x = abs(z)

    movdqa      xmm2, xmm3
    pcmpgtw     xmm1, xmm2              ;0xffff where zbin > x
    pandn       xmm1, xmm2              ;keep x only where !(zbin > x)
    movdqa      xmm3, xmm1

    mov         rdx, arg(6)             ;quant_ptr
    movdqa      xmm1, [rdx]

    mov         rcx, arg(5)             ;round_ptr
    movdqa      xmm2, [rcx]

    paddw       xmm3, xmm2              ;x + round
    pmulhuw     xmm3, xmm1              ;y = high 16 bits of (x + round) * quant

    pxor        xmm3, xmm0
    psubw       xmm3, xmm0              ;gain the sign back

    mov         rdi, arg(2)             ;qcoeff_ptr
    movdqa      xmm0, xmm3              ;keep qcoeff[0..7] for the eob pass
    movdqa      [rdi], xmm3

    mov         rax, arg(3)             ;dequant_ptr
    movdqa      xmm2, [rax]
    pmullw      xmm3, xmm2

    mov         rax, arg(7)             ;dqcoeff_ptr
    movdqa      [rax], xmm3

    ; ---- lanes 8..15 ----
    ; xmm1-xmm3 are dead here, so xmm2/xmm3 are reused as scratch. xmm0 must
    ; stay intact until the eob pass.
    movdqa      xmm4, [rsi+16]

    mov         rax, arg(1)             ;zbin_ptr
    movdqa      xmm5, [rax+16]

    movdqa      xmm3, xmm4
    psraw       xmm4, 15                ;sz
    pxor        xmm3, xmm4
    psubw       xmm3, xmm4              ;x = abs(z)

    movdqa      xmm2, xmm3
    pcmpgtw     xmm5, xmm2              ;0xffff where zbin > x
    pandn       xmm5, xmm2              ;keep x only where !(zbin > x)
    movdqa      xmm3, xmm5

    movdqa      xmm5, [rdx+16]          ;quant (rdx still holds quant_ptr)
    movdqa      xmm2, [rcx+16]          ;round (rcx still holds round_ptr)

    paddw       xmm3, xmm2              ;x + round
    pmulhuw     xmm3, xmm5              ;y = high 16 bits of (x + round) * quant

    pxor        xmm3, xmm4
    psubw       xmm3, xmm4              ;gain the sign back

    mov         rdi, arg(2)             ;qcoeff_ptr
    movdqa      xmm1, xmm3              ;keep qcoeff[8..15] for the eob pass
    movdqa      [rdi+16], xmm3

    mov         rax, arg(3)             ;dequant_ptr
    movdqa      xmm2, [rax+16]
    pmullw      xmm3, xmm2

    mov         rax, arg(7)             ;dqcoeff_ptr
    movdqa      [rax+16], xmm3

    ; ---- end-of-block computation ----
    mov         rdi, arg(4)             ;scan_mask
    pxor        xmm4, xmm4              ;all zero
    movdqa      xmm2, [rdi]
    movdqa      xmm3, [rdi+16]

    pcmpeqw     xmm0, xmm4              ;0xffff where qcoeff == 0
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm5              ;all ones
    pxor        xmm0, xmm5              ;0xffff where qcoeff != 0
    pxor        xmm1, xmm5
    psrlw       xmm0, 15                ;1 where qcoeff != 0
    psrlw       xmm1, 15

    pmaddwd     xmm0, xmm2              ;pairwise sums of flag * scan_mask
    pmaddwd     xmm1, xmm3

    ; horizontal add of the eight dword partial sums
    movq        xmm2, xmm0
    movq        xmm3, xmm1
    psrldq      xmm0, 8
    psrldq      xmm1, 8
    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm0, xmm2
    movq        xmm1, xmm0
    psrldq      xmm0, 4
    paddd       xmm1, xmm0

    movq        rcx, xmm1
    and         rcx, 0xffff             ;s = low 16 bits of the sum

    xor         rdx, rdx
    sub         rdx, rcx                ;rdx = -s (negative iff s != 0)

    bsr         rax, rcx                ;rax is undefined when s == 0 ...
    inc         rax

    sar         rdx, 31                 ;... so build mask: all ones iff s != 0
    and         rax, rdx                ;return 0 when s == 0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

View File

@ -252,3 +252,137 @@ rq_zigzag_1c:
UNSHADOW_ARGS
pop rbp
ret
;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
; short *qcoeff_ptr,short *dequant_ptr,
; short *scan_mask, short *round_ptr,
; short *quant_ptr, short *dqcoeff_ptr);
;
; NOTE(review): the prototype above is spelled "..._sse2" but the label
; exported below is spelled "..._ssse2" -- confirm which spelling callers use.
;
; Quantizes 16 signed 16-bit coefficients; the two groups of 8 lanes are
; processed in interleaved fashion. Per lane:
;   sz      = z >> 15                     (0 or -1)
;   x       = (z ^ sz) - sz               (abs)
;   y       = ((x + round) * quant) >> 16 (signed high multiply, pmulhw)
;   qcoeff  = (y ^ sz) - sz               (sign restored)
;   dqcoeff = low 16 bits of dequant * qcoeff
;
; Return (rax): let s = low 16 bits of the sum over all lanes of
; (qcoeff != 0) * scan_mask[lane]. The result is 0 when s == 0, otherwise
; (index of the highest set bit of s) + 1.
; NOTE(review): presumably every scan_mask entry is a distinct single bit so
; that s is a bitmask of non-zero positions -- confirm against the caller.
;
; Every pointer is accessed with movdqa at offsets 0 and 16, so each must be
; 16-byte aligned and cover at least 32 bytes.
global sym(vp8_fast_quantize_b_impl_ssse2)
sym(vp8_fast_quantize_b_impl_ssse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
push rsi
push rdi
; rbx is saved here and restored in the epilog but is not otherwise used.
push rbx
; end prolog
; NOTE(review): ALIGN_STACK is defined elsewhere; presumably it aligns rsp to
; 16 bytes and pushes the previous rsp, which the "pop rsp" below restores.
ALIGN_STACK 16, rax
; Offsets of the two 16-byte spill slots used to preserve xmm6/xmm7.
%define save_xmm6 0
%define save_xmm7 16
%define vp8_fastquantizeb_stack_size save_xmm7 + 16
sub rsp, vp8_fastquantizeb_stack_size
; xmm6/xmm7 are used as scratch below; save the caller's values first.
movdqa XMMWORD PTR[rsp + save_xmm6], xmm6
movdqa XMMWORD PTR[rsp + save_xmm7], xmm7
mov rdx, arg(0) ;coeff_ptr
mov rcx, arg(2) ;dequant_ptr
mov rax, arg(3) ;scan_mask
mov rdi, arg(4) ;round_ptr
mov rsi, arg(5) ;quant_ptr
movdqa xmm0, XMMWORD PTR[rdx] ;z, lanes 0..7
movdqa xmm4, XMMWORD PTR[rdx + 16] ;z, lanes 8..15
movdqa xmm6, XMMWORD PTR[rdi] ;round lo
movdqa xmm7, XMMWORD PTR[rdi + 16] ;round hi
movdqa xmm1, xmm0
movdqa xmm5, xmm4
psraw xmm0, 15 ;sign of z (aka sz)
psraw xmm4, 15 ;sign of z (aka sz)
pxor xmm1, xmm0
pxor xmm5, xmm4
psubw xmm1, xmm0 ;x = abs(z)
psubw xmm5, xmm4 ;x = abs(z)
paddw xmm1, xmm6 ;x + round
paddw xmm5, xmm7 ;x + round
pmulhw xmm1, XMMWORD PTR[rsi] ;y = high 16 bits of (x + round) * quant
pmulhw xmm5, XMMWORD PTR[rsi + 16] ;y = high 16 bits of (x + round) * quant
; rdi/rsi are done with round/quant; reuse them for the output pointers.
mov rdi, arg(1) ;qcoeff_ptr
mov rsi, arg(6) ;dqcoeff_ptr
movdqa xmm6, XMMWORD PTR[rcx] ;dequant lo
movdqa xmm7, XMMWORD PTR[rcx + 16] ;dequant hi
; restore the sign: (y ^ sz) - sz
pxor xmm1, xmm0
pxor xmm5, xmm4
psubw xmm1, xmm0
psubw xmm5, xmm4
movdqa XMMWORD PTR[rdi], xmm1 ;store qcoeff
movdqa XMMWORD PTR[rdi + 16], xmm5 ;store qcoeff
pmullw xmm6, xmm1 ;dqcoeff = dequant * qcoeff (low 16 bits)
pmullw xmm7, xmm5 ;dqcoeff = dequant * qcoeff (low 16 bits)
; ---- end-of-block computation on the qcoeff values in xmm1/xmm5 ----
movdqa xmm2, XMMWORD PTR[rax] ;scan_mask lo
movdqa xmm3, XMMWORD PTR[rax+16]; scan_mask hi
pxor xmm4, xmm4 ;clear all bits
pcmpeqw xmm1, xmm4 ;0xffff where qcoeff == 0
pcmpeqw xmm5, xmm4 ;0xffff where qcoeff == 0
pcmpeqw xmm4, xmm4 ;set all bits
pxor xmm1, xmm4 ;0xffff where qcoeff != 0
pxor xmm5, xmm4 ;0xffff where qcoeff != 0
psrlw xmm1, 15 ;1 where qcoeff != 0
psrlw xmm5, 15 ;1 where qcoeff != 0
pmaddwd xmm1, xmm2 ;pairwise sums of flag * scan_mask
pmaddwd xmm5, xmm3 ;pairwise sums of flag * scan_mask
; horizontal add of the eight dword partial sums
movq xmm2, xmm1
movq xmm3, xmm5
psrldq xmm1, 8
psrldq xmm5, 8
paddd xmm1, xmm5
paddd xmm2, xmm3
paddd xmm1, xmm2
movq xmm5, xmm1
psrldq xmm1, 4
paddd xmm5, xmm1
movq rcx, xmm5
and rcx, 0xffff ;s = low 16 bits of the sum
xor rdx, rdx
sub rdx, rcx ;rdx = -s (negative iff s != 0)
bsr rax, rcx ;rax is undefined when s == 0; masked off below
inc rax
sar rdx, 31 ;all ones iff s != 0
and rax, rdx ;return 0 when s == 0
movdqa XMMWORD PTR[rsi], xmm6 ;store dqcoeff
movdqa XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff
; restore the caller's xmm6/xmm7 and release the spill area
movdqa xmm6, XMMWORD PTR[rsp + save_xmm6]
movdqa xmm7, XMMWORD PTR[rsp + save_xmm7]
add rsp, vp8_fastquantizeb_stack_size
; reload the rsp value that was pushed by ALIGN_STACK (see note above)
pop rsp
; begin epilog
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret

View File

@ -88,24 +88,22 @@ void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *scan_mask, short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
short *coeff_ptr = &b->coeff[0];
short *zbin_ptr = &b->zbin[0][0];
short *round_ptr = &b->round[0][0];
short *quant_ptr = &b->quant[0][0];
short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
short *dequant_ptr = &d->dequant[0][0];
d->eob = vp8_fast_quantize_b_impl_sse(
d->eob = vp8_fast_quantize_b_impl_ssse2(
coeff_ptr,
zbin_ptr,
qcoeff_ptr,
dequant_ptr,
scan_mask,
@ -116,6 +114,7 @@ void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
);
}
int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr,short *dequant_ptr,
const int *default_zig_zag, short *round_ptr,
@ -285,8 +284,10 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
/* cpi->rtcd.encodemb.sub* not implemented for wmt */
/*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse;
cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
}
#endif