Merge "Added vp8_fast_quantize_b_sse2"
This commit is contained in:
commit
6b1b28a83c
@ -284,156 +284,3 @@ sym(vp8_fast_quantize_b_impl_mmx):
|
|||||||
UNSHADOW_ARGS
|
UNSHADOW_ARGS
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|
||||||
;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
|
|
||||||
; short *qcoeff_ptr,short *dequant_ptr,
|
|
||||||
; short *scan_mask, short *round_ptr,
|
|
||||||
; short *quant_ptr, short *dqcoeff_ptr);
|
|
||||||
; Quantizes one block of 16 coefficients, handled as two groups of 8 words.
; For each coefficient z: x = abs(z); x is forced to 0 when zbin > x; then
; y = high 16 bits of the unsigned product (x + round) * quant, the sign of z
; is reapplied, y is stored to qcoeff_ptr and the low 16 bits of y * dequant
; are stored to dqcoeff_ptr.
; Returns in rax: 0 when every quantized value is zero, otherwise 1 + the index
; of the highest set bit in the low 16 bits of the sum of the scan_mask entries
; whose quantized coefficient is non-zero.
; Every memory access uses movdqa, so all pointers must be 16-byte aligned.
; NOTE(review): xmm6/xmm7 are overwritten below without being saved; they are
; callee-saved under the Microsoft x64 ABI - confirm whether that matters for
; the targets this routine is built for.
global sym(vp8_fast_quantize_b_impl_sse)
|
|
||||||
sym(vp8_fast_quantize_b_impl_sse):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
; SHADOW_ARGS_TO_STACK / arg() are macros defined elsewhere; 8 matches the
; number of parameters in the signature above.
SHADOW_ARGS_TO_STACK 8
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
|
|
||||||
; ---- coefficients 0..7 ----
mov rsi, arg(0) ;coeff_ptr
|
|
||||||
movdqa xmm0, [rsi]
|
|
||||||
|
|
||||||
mov rax, arg(1) ;zbin_ptr
|
|
||||||
movdqa xmm1, [rax]
|
|
||||||
|
|
||||||
movdqa xmm3, xmm0
|
|
||||||
; xmm0 = sign mask of z (all ones for negative words, zero otherwise)
psraw xmm0, 15
|
|
||||||
|
|
||||||
pxor xmm3, xmm0
|
|
||||||
psubw xmm3, xmm0 ; abs
|
|
||||||
|
|
||||||
movdqa xmm2, xmm3
|
|
||||||
; xmm1 = 0xFFFF where zbin > x (signed compare)
pcmpgtw xmm1, xmm2
|
|
||||||
|
|
||||||
; xmm1 = x where the compare was false, 0 where zbin > x
pandn xmm1, xmm2
|
|
||||||
movdqa xmm3, xmm1
|
|
||||||
|
|
||||||
mov rdx, arg(6) ; quant_ptr
|
|
||||||
movdqa xmm1, [rdx]
|
|
||||||
|
|
||||||
mov rcx, arg(5) ; round_ptr
|
|
||||||
movdqa xmm2, [rcx]
|
|
||||||
|
|
||||||
paddw xmm3, xmm2
|
|
||||||
; keep the upper 16 bits of the unsigned 32-bit product
pmulhuw xmm3, xmm1
|
|
||||||
|
|
||||||
pxor xmm3, xmm0
|
|
||||||
psubw xmm3, xmm0 ;gain the sign back
|
|
||||||
|
|
||||||
mov rdi, arg(2) ;qcoeff_ptr
|
|
||||||
; keep a copy of the quantized low half in xmm0 for the non-zero test below
movdqa xmm0, xmm3
|
|
||||||
|
|
||||||
movdqa [rdi], xmm3
|
|
||||||
|
|
||||||
mov rax, arg(3) ;dequant_ptr
|
|
||||||
movdqa xmm2, [rax]
|
|
||||||
|
|
||||||
; low 16 bits of quantized value * dequant
pmullw xmm3, xmm2
|
|
||||||
mov rax, arg(7) ;dqcoeff_ptr
|
|
||||||
|
|
||||||
movdqa [rax], xmm3
|
|
||||||
|
|
||||||
; next 8
|
|
||||||
; same sequence for coefficients 8..15, using xmm4..xmm7
movdqa xmm4, [rsi+16]
|
|
||||||
|
|
||||||
mov rax, arg(1) ;zbin_ptr
|
|
||||||
movdqa xmm5, [rax+16]
|
|
||||||
|
|
||||||
movdqa xmm7, xmm4
|
|
||||||
psraw xmm4, 15
|
|
||||||
|
|
||||||
pxor xmm7, xmm4
|
|
||||||
psubw xmm7, xmm4 ; abs
|
|
||||||
|
|
||||||
movdqa xmm6, xmm7
|
|
||||||
pcmpgtw xmm5, xmm6
|
|
||||||
|
|
||||||
pandn xmm5, xmm6
|
|
||||||
movdqa xmm7, xmm5
|
|
||||||
|
|
||||||
; rdx (quant_ptr) and rcx (round_ptr) are still live from the first half
movdqa xmm5, [rdx+16]
|
|
||||||
movdqa xmm6, [rcx+16]
|
|
||||||
|
|
||||||
|
|
||||||
paddw xmm7, xmm6
|
|
||||||
pmulhuw xmm7, xmm5
|
|
||||||
|
|
||||||
pxor xmm7, xmm4
|
|
||||||
psubw xmm7, xmm4;gain the sign back
|
|
||||||
|
|
||||||
mov rdi, arg(2) ;qcoeff_ptr
|
|
||||||
|
|
||||||
; keep a copy of the quantized high half in xmm1 for the non-zero test below
movdqa xmm1, xmm7
|
|
||||||
movdqa [rdi+16], xmm7
|
|
||||||
|
|
||||||
mov rax, arg(3) ;dequant_ptr
|
|
||||||
movdqa xmm6, [rax+16]
|
|
||||||
|
|
||||||
pmullw xmm7, xmm6
|
|
||||||
mov rax, arg(7) ;dqcoeff_ptr
|
|
||||||
|
|
||||||
movdqa [rax+16], xmm7
|
|
||||||
; ---- derive the return value from the non-zero quantized words ----
mov rdi, arg(4) ;scan_mask
|
|
||||||
|
|
||||||
pxor xmm7, xmm7
|
|
||||||
movdqa xmm2, [rdi]
|
|
||||||
|
|
||||||
movdqa xmm3, [rdi+16];
|
|
||||||
; 0xFFFF where the quantized word is zero
pcmpeqw xmm0, xmm7
|
|
||||||
|
|
||||||
pcmpeqw xmm1, xmm7
|
|
||||||
; xmm6 = all ones
pcmpeqw xmm6, xmm6
|
|
||||||
|
|
||||||
; invert: 0xFFFF where the quantized word is non-zero
pxor xmm0, xmm6
|
|
||||||
pxor xmm1, xmm6
|
|
||||||
|
|
||||||
; reduce each flag to 0 or 1
psrlw xmm0, 15
|
|
||||||
psrlw xmm1, 15
|
|
||||||
|
|
||||||
; flag * scan_mask word, adjacent pairs summed into dwords
pmaddwd xmm0, xmm2
|
|
||||||
pmaddwd xmm1, xmm3
|
|
||||||
|
|
||||||
; horizontal add of the eight dwords; the total lands in the low dword of xmm1
movq xmm2, xmm0
|
|
||||||
movq xmm3, xmm1
|
|
||||||
|
|
||||||
psrldq xmm0, 8
|
|
||||||
psrldq xmm1, 8
|
|
||||||
|
|
||||||
paddd xmm0, xmm1
|
|
||||||
paddd xmm2, xmm3
|
|
||||||
|
|
||||||
paddd xmm0, xmm2
|
|
||||||
movq xmm1, xmm0
|
|
||||||
|
|
||||||
psrldq xmm0, 4
|
|
||||||
paddd xmm1, xmm0
|
|
||||||
|
|
||||||
movq rcx, xmm1
|
|
||||||
and rcx, 0xffff
|
|
||||||
|
|
||||||
; rdx = -rcx; after "sar rdx, 31" it is all ones when rcx != 0, zero otherwise
xor rdx, rdx
|
|
||||||
sub rdx, rcx
|
|
||||||
|
|
||||||
; bsr leaves rax undefined when rcx == 0; the final "and" forces 0 in that case
bsr rax, rcx
|
|
||||||
inc rax
|
|
||||||
|
|
||||||
sar rdx, 31
|
|
||||||
and rax, rdx
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
@ -252,3 +252,137 @@ rq_zigzag_1c:
|
|||||||
UNSHADOW_ARGS
|
UNSHADOW_ARGS
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|
||||||
|
;int vp8_fast_quantize_b_impl_ssse2(short *coeff_ptr,
|
||||||
|
; short *qcoeff_ptr,short *dequant_ptr,
|
||||||
|
; short *scan_mask, short *round_ptr,
|
||||||
|
; short *quant_ptr, short *dqcoeff_ptr);
|
||||||
|
; Quantizes one block of 16 coefficients in two 8-word halves that are
; processed side by side (low half in xmm0/xmm1/xmm6, high half in
; xmm4/xmm5/xmm7).
; For each coefficient z: sz = z >> 15; x = abs(z);
; y = high 16 bits of (x + round) * quant (pmulhw); y = (y ^ sz) - sz;
; qcoeff = y; dqcoeff = low 16 bits of y * dequant.
; Returns in rax: 0 when every quantized value is zero, otherwise 1 + the index
; of the highest set bit in the low 16 bits of the sum of the scan_mask entries
; whose quantized coefficient is non-zero.
; NOTE(review): presumably each scan_mask entry is a single bit marking the
; coefficient's scan position, which would make the result an end-of-block
; position - confirm against the table passed by the caller.
; Every memory access uses movdqa, so all pointers must be 16-byte aligned.
global sym(vp8_fast_quantize_b_impl_ssse2)
|
||||||
|
sym(vp8_fast_quantize_b_impl_ssse2):
|
||||||
|
push rbp
|
||||||
|
mov rbp, rsp
|
||||||
|
; SHADOW_ARGS_TO_STACK / arg() are macros defined elsewhere; 7 matches the
; number of parameters in the signature above.
SHADOW_ARGS_TO_STACK 7
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
|
; rbx is saved and restored although the body below never references it
push rbx
|
||||||
|
; end prolog
|
||||||
|
|
||||||
|
; ALIGN_STACK is defined elsewhere; presumably it aligns rsp to 16 bytes and
; saves the previous rsp for the "pop rsp" near the end - confirm against the
; macro definition.
ALIGN_STACK 16, rax
|
||||||
|
|
||||||
|
; offsets of the two 16-byte spill slots within the local stack area
%define save_xmm6 0
|
||||||
|
%define save_xmm7 16
|
||||||
|
|
||||||
|
%define vp8_fastquantizeb_stack_size save_xmm7 + 16
|
||||||
|
|
||||||
|
sub rsp, vp8_fastquantizeb_stack_size
|
||||||
|
|
||||||
|
; xmm6/xmm7 are used as scratch below, so spill them here and reload them
; before returning (they are callee-saved under the Microsoft x64 ABI)
movdqa XMMWORD PTR[rsp + save_xmm6], xmm6
|
||||||
|
movdqa XMMWORD PTR[rsp + save_xmm7], xmm7
|
||||||
|
|
||||||
|
mov rdx, arg(0) ;coeff_ptr
|
||||||
|
mov rcx, arg(2) ;dequant_ptr
|
||||||
|
mov rax, arg(3) ;scan_mask
|
||||||
|
mov rdi, arg(4) ;round_ptr
|
||||||
|
mov rsi, arg(5) ;quant_ptr
|
||||||
|
|
||||||
|
; z: coefficients 0..7 in xmm0, 8..15 in xmm4
movdqa xmm0, XMMWORD PTR[rdx]
|
||||||
|
movdqa xmm4, XMMWORD PTR[rdx + 16]
|
||||||
|
|
||||||
|
movdqa xmm6, XMMWORD PTR[rdi] ;round lo
|
||||||
|
movdqa xmm7, XMMWORD PTR[rdi + 16] ;round hi
|
||||||
|
|
||||||
|
movdqa xmm1, xmm0
|
||||||
|
movdqa xmm5, xmm4
|
||||||
|
|
||||||
|
psraw xmm0, 15 ;sign of z (aka sz)
|
||||||
|
psraw xmm4, 15 ;sign of z (aka sz)
|
||||||
|
|
||||||
|
pxor xmm1, xmm0
|
||||||
|
pxor xmm5, xmm4
|
||||||
|
psubw xmm1, xmm0 ;x = abs(z)
|
||||||
|
psubw xmm5, xmm4 ;x = abs(z)
|
||||||
|
|
||||||
|
; x += round
paddw xmm1, xmm6
|
||||||
|
paddw xmm5, xmm7
|
||||||
|
|
||||||
|
; y = upper 16 bits of the signed 32-bit product (x + round) * quant
; NOTE(review): pmulhw is a signed multiply; this assumes x + round and quant
; both stay below 0x8000 - confirm.
pmulhw xmm1, XMMWORD PTR[rsi]
|
||||||
|
pmulhw xmm5, XMMWORD PTR[rsi + 16]
|
||||||
|
|
||||||
|
; rdi/rsi are reused for the two output pointers from here on
mov rdi, arg(1) ;qcoeff_ptr
|
||||||
|
mov rsi, arg(6) ;dqcoeff_ptr
|
||||||
|
|
||||||
|
; xmm6/xmm7 now hold dequant lo/hi
movdqa xmm6, XMMWORD PTR[rcx]
|
||||||
|
movdqa xmm7, XMMWORD PTR[rcx + 16]
|
||||||
|
|
||||||
|
; reapply the sign of z: y = (y ^ sz) - sz
pxor xmm1, xmm0
|
||||||
|
pxor xmm5, xmm4
|
||||||
|
psubw xmm1, xmm0
|
||||||
|
psubw xmm5, xmm4
|
||||||
|
|
||||||
|
; store qcoeff
movdqa XMMWORD PTR[rdi], xmm1
|
||||||
|
movdqa XMMWORD PTR[rdi + 16], xmm5
|
||||||
|
|
||||||
|
; dqcoeff = low 16 bits of qcoeff * dequant (stored near the end)
pmullw xmm6, xmm1
|
||||||
|
pmullw xmm7, xmm5
|
||||||
|
|
||||||
|
movdqa xmm2, XMMWORD PTR[rax]
|
||||||
|
movdqa xmm3, XMMWORD PTR[rax+16];
|
||||||
|
|
||||||
|
; build a per-word flag: 1 where the quantized value is non-zero, else 0
pxor xmm4, xmm4 ;clear all bits
|
||||||
|
pcmpeqw xmm1, xmm4
|
||||||
|
pcmpeqw xmm5, xmm4
|
||||||
|
|
||||||
|
pcmpeqw xmm4, xmm4 ;set all bits
|
||||||
|
pxor xmm1, xmm4
|
||||||
|
pxor xmm5, xmm4
|
||||||
|
|
||||||
|
psrlw xmm1, 15
|
||||||
|
psrlw xmm5, 15
|
||||||
|
|
||||||
|
; flag * scan_mask word, adjacent pairs summed into dwords
pmaddwd xmm1, xmm2
|
||||||
|
pmaddwd xmm5, xmm3
|
||||||
|
|
||||||
|
; horizontal add of the eight dwords; the total lands in the low dword of xmm5
movq xmm2, xmm1
|
||||||
|
movq xmm3, xmm5
|
||||||
|
|
||||||
|
psrldq xmm1, 8
|
||||||
|
psrldq xmm5, 8
|
||||||
|
|
||||||
|
paddd xmm1, xmm5
|
||||||
|
paddd xmm2, xmm3
|
||||||
|
|
||||||
|
paddd xmm1, xmm2
|
||||||
|
movq xmm5, xmm1
|
||||||
|
|
||||||
|
psrldq xmm1, 4
|
||||||
|
paddd xmm5, xmm1
|
||||||
|
|
||||||
|
; only the low 16 bits of the sum are kept
movq rcx, xmm5
|
||||||
|
and rcx, 0xffff
|
||||||
|
|
||||||
|
; rdx = -rcx; after "sar rdx, 31" it is all ones when rcx != 0, zero otherwise
xor rdx, rdx
|
||||||
|
sub rdx, rcx
|
||||||
|
|
||||||
|
; bsr leaves rax undefined when rcx == 0; the final "and" forces 0 in that case
bsr rax, rcx
|
||||||
|
inc rax
|
||||||
|
|
||||||
|
sar rdx, 31
|
||||||
|
and rax, rdx
|
||||||
|
|
||||||
|
movdqa XMMWORD PTR[rsi], xmm6 ;store dqcoeff
|
||||||
|
movdqa XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff
|
||||||
|
|
||||||
|
; reload the spilled xmm6/xmm7
movdqa xmm6, XMMWORD PTR[rsp + save_xmm6]
|
||||||
|
movdqa xmm7, XMMWORD PTR[rsp + save_xmm7]
|
||||||
|
|
||||||
|
add rsp, vp8_fastquantizeb_stack_size
|
||||||
|
; presumably pops the pre-alignment rsp pushed by ALIGN_STACK - confirm
pop rsp
|
||||||
|
|
||||||
|
; begin epilog
|
||||||
|
pop rbx
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
UNSHADOW_ARGS
|
||||||
|
pop rbp
|
||||||
|
ret
|
||||||
|
@ -88,24 +88,22 @@ void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
|
|||||||
vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
|
vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
|
||||||
}
|
}
|
||||||
|
|
||||||
int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
|
/* Prototype of the assembly quantizer. The name must be the symbol the .asm
 * file exports and that vp8_fast_quantize_b_sse2() calls
 * (vp8_fast_quantize_b_impl_ssse2); otherwise the call is implicitly declared
 * and its arguments go unchecked. Returns the value stored in d->eob. */
int vp8_fast_quantize_b_impl_ssse2(short *coeff_ptr,
|
||||||
short *qcoeff_ptr, short *dequant_ptr,
|
short *qcoeff_ptr, short *dequant_ptr,
|
||||||
short *scan_mask, short *round_ptr,
|
short *scan_mask, short *round_ptr,
|
||||||
short *quant_ptr, short *dqcoeff_ptr);
|
short *quant_ptr, short *dqcoeff_ptr);
|
||||||
void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
|
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
|
||||||
{
|
{
|
||||||
short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
|
short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
|
||||||
short *coeff_ptr = &b->coeff[0];
|
short *coeff_ptr = &b->coeff[0];
|
||||||
short *zbin_ptr = &b->zbin[0][0];
|
|
||||||
short *round_ptr = &b->round[0][0];
|
short *round_ptr = &b->round[0][0];
|
||||||
short *quant_ptr = &b->quant[0][0];
|
short *quant_ptr = &b->quant[0][0];
|
||||||
short *qcoeff_ptr = d->qcoeff;
|
short *qcoeff_ptr = d->qcoeff;
|
||||||
short *dqcoeff_ptr = d->dqcoeff;
|
short *dqcoeff_ptr = d->dqcoeff;
|
||||||
short *dequant_ptr = &d->dequant[0][0];
|
short *dequant_ptr = &d->dequant[0][0];
|
||||||
|
|
||||||
d->eob = vp8_fast_quantize_b_impl_sse(
|
d->eob = vp8_fast_quantize_b_impl_ssse2(
|
||||||
coeff_ptr,
|
coeff_ptr,
|
||||||
zbin_ptr,
|
|
||||||
qcoeff_ptr,
|
qcoeff_ptr,
|
||||||
dequant_ptr,
|
dequant_ptr,
|
||||||
scan_mask,
|
scan_mask,
|
||||||
@ -116,6 +114,7 @@ void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
|
int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
|
||||||
short *qcoeff_ptr,short *dequant_ptr,
|
short *qcoeff_ptr,short *dequant_ptr,
|
||||||
const int *default_zig_zag, short *round_ptr,
|
const int *default_zig_zag, short *round_ptr,
|
||||||
@ -285,8 +284,10 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
|||||||
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
|
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
|
||||||
/* cpi->rtcd.encodemb.sub* not implemented for wmt */
|
/* cpi->rtcd.encodemb.sub* not implemented for wmt */
|
||||||
|
|
||||||
/*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse;
|
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
|
||||||
cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
|
|
||||||
|
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user