09202d8071
Change-Id: Ieebea089095d9073b3a94932791099f614ce120c
440 lines
12 KiB
NASM
440 lines
12 KiB
NASM
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
|
|
|
|
|
|
%include "vpx_ports/x86_abi_support.asm"
|
|
|
|
;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
;                                 short *qcoeff_ptr, short *dequant_ptr,
;                                 short *scan_mask, short *round_ptr,
;                                 short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes 16 signed 16-bit coefficients, four words per MMX register.
; For every lane i:
;     a          = |coeff[i]|
;     x          = (zbin[i] > a) ? 0 : a              (signed word compare)
;     q          = ((x + round[i]) * quant[i]) >> 16  (16-bit add, unsigned
;                                                      high multiply)
;     qcoeff[i]  = q with the sign of coeff[i] restored
;     dqcoeff[i] = low 16 bits of qcoeff[i] * dequant[i]
;
; Return value (rax):
;     m = low 16 bits of the sum of scan_mask[i] over all i with
;         qcoeff[i] != 0
;     0 when m == 0, otherwise 1 + (index of the highest set bit of m).
;
; Register use:
;     rsi = coeff_ptr (later scan_mask)   rdi = qcoeff_ptr
;     rcx = round_ptr                     rdx = quant_ptr
;     rax = scratch pointer (zbin_ptr / dequant_ptr / dqcoeff_ptr), result
;     mm0-mm2, mm5-mm7 = scratch
;
; NOTE: no emms is executed here, so the MMX state is still live on return.
global sym(vp8_fast_quantize_b_impl_mmx)
sym(vp8_fast_quantize_b_impl_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    push        rsi
    push        rdi
    ; end prolog

    ; These four pointers stay fixed for the whole quantization pass.
    mov         rsi, arg(0)                 ;coeff_ptr
    mov         rdi, arg(2)                 ;qcoeff_ptr
    mov         rcx, arg(5)                 ;round_ptr
    mov         rdx, arg(6)                 ;quant_ptr

    ; Quantize / dequantize 4 coefficients (8 bytes) per repetition.
%assign QMMX_OFS 0
%rep 4
    mov         rax, arg(1)                 ;zbin_ptr
    movq        mm2, [rsi + QMMX_OFS]       ; coeff
    movq        mm0, [rax + QMMX_OFS]       ; zbin

    movq        mm1, mm2
    psraw       mm1, 15                     ; mm1 = sign mask (0 or -1 per lane)

    pxor        mm2, mm1
    psubw       mm2, mm1                    ; mm2 = abs(coeff)

    pcmpgtw     mm0, mm2                    ; mm0 = (zbin > abs) ? 0xffff : 0
    pandn       mm0, mm2                    ; mm0 = abs, zeroed where zbin > abs

    paddw       mm0, [rcx + QMMX_OFS]       ; + round
    pmulhuw     mm0, [rdx + QMMX_OFS]       ; * quant, keep the high 16 bits

    pxor        mm0, mm1
    psubw       mm0, mm1                    ; gain the sign back

    movq        [rdi + QMMX_OFS], mm0       ; qcoeff

    mov         rax, arg(3)                 ;dequant_ptr
    pmullw      mm0, [rax + QMMX_OFS]       ; low 16 bits of qcoeff * dequant

    mov         rax, arg(7)                 ;dqcoeff_ptr
    movq        [rax + QMMX_OFS], mm0       ; dqcoeff
%assign QMMX_OFS QMMX_OFS + 8
%endrep

    ; Accumulate scan_mask[i] for every non-zero qcoeff[i]. The quantized
    ; values are read back from the qcoeff buffer that was just written.
    mov         rsi, arg(4)                 ;scan_mask

    pxor        mm7, mm7                    ; all zeros
    pcmpeqw     mm6, mm6                    ; all ones
    pxor        mm5, mm5                    ; two running dword sums

%assign QMMX_OFS 0
%rep 4
    movq        mm0, [rdi + QMMX_OFS]       ; qcoeff
    pcmpeqw     mm0, mm7                    ; 0xffff where qcoeff == 0
    pxor        mm0, mm6                    ; 0xffff where qcoeff != 0
    psrlw       mm0, 15                     ; 1 where qcoeff != 0
    pmaddwd     mm0, [rsi + QMMX_OFS]       ; select and pair-sum the masks
    paddd       mm5, mm0
%assign QMMX_OFS QMMX_OFS + 8
%endrep
%undef QMMX_OFS

    movq        mm0, mm5
    psrlq       mm5, 32
    paddd       mm0, mm5                    ; low dword = total of all lanes

    ; eob adjustment begins here
    movd        rcx, mm0
    and         rcx, 0xffff                 ; only 16 mask bits are meaningful

    xor         rdx, rdx
    sub         rdx, rcx                    ; rdx = -rcx

    bsr         rax, rcx                    ; index of the highest set bit;
                                            ; undefined when rcx == 0
    inc         rax

    sar         rdx, 31                     ; rdx = (rcx != 0) ? -1 : 0
    and         rax, rdx                    ; force 0 when no bit was set

    ; The branch-free sequence above replaces the older mixed assembly/C
    ; version, which is kept here as reference:
    ;   movd        rcx, mm0
    ;   bsr         rax, rcx
    ;
    ;   mov         eob, rax
    ;   mov         eee, rcx
    ;
    ;if(eee==0)
    ;{
    ;    eob=-1;
    ;}
    ;else if(eee<0)
    ;{
    ;    eob=15;
    ;}
    ;d->eob = eob+1;

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
|
|
;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
;                                 short *qcoeff_ptr, short *dequant_ptr,
;                                 short *scan_mask, short *round_ptr,
;                                 short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes 16 signed 16-bit coefficients, eight words per XMM register.
; For every lane i:
;     a          = |coeff[i]|
;     x          = (zbin[i] > a) ? 0 : a              (signed word compare)
;     q          = ((x + round[i]) * quant[i]) >> 16  (16-bit add, unsigned
;                                                      high multiply)
;     qcoeff[i]  = q with the sign of coeff[i] restored
;     dqcoeff[i] = low 16 bits of qcoeff[i] * dequant[i]
;
; Return value (rax):
;     m = low 16 bits of the sum of scan_mask[i] over all i with
;         qcoeff[i] != 0
;     0 when m == 0, otherwise 1 + (index of the highest set bit of m).
;
; All eight buffers are accessed with aligned 128-bit loads/stores, so each
; pointer must be 16-byte aligned.
;
; Register use:
;     rsi = coeff_ptr      rdi = qcoeff_ptr (later scan_mask)
;     rcx = round_ptr      rdx = quant_ptr
;     rax = scratch pointer (zbin_ptr / dequant_ptr / dqcoeff_ptr), result
;     xmm0 = qcoeff[0..7]  xmm3 = qcoeff[8..15]
;     xmm1, xmm2, xmm4, xmm5 = scratch
; Only xmm0-xmm5 are touched: xmm6 and above are callee-saved under the
; Microsoft x64 calling convention and are deliberately left alone.
global sym(vp8_fast_quantize_b_impl_sse)
sym(vp8_fast_quantize_b_impl_sse):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    push        rsi
    push        rdi
    ; end prolog

    ; These four pointers stay fixed for the whole quantization pass.
    mov         rsi, arg(0)                 ;coeff_ptr
    mov         rdi, arg(2)                 ;qcoeff_ptr
    mov         rcx, arg(5)                 ;round_ptr
    mov         rdx, arg(6)                 ;quant_ptr

    ; ---- coefficients 0..7 : result kept in xmm0 ----
    mov         rax, arg(1)                 ;zbin_ptr
    movdqa      xmm2, [rsi]                 ; coeff
    movdqa      xmm0, [rax]                 ; zbin

    movdqa      xmm1, xmm2
    psraw       xmm1, 15                    ; xmm1 = sign mask (0 or -1 per lane)

    pxor        xmm2, xmm1
    psubw       xmm2, xmm1                  ; xmm2 = abs(coeff)

    pcmpgtw     xmm0, xmm2                  ; xmm0 = (zbin > abs) ? 0xffff : 0
    pandn       xmm0, xmm2                  ; xmm0 = abs, zeroed where zbin > abs

    paddw       xmm0, [rcx]                 ; + round
    pmulhuw     xmm0, [rdx]                 ; * quant, keep the high 16 bits

    pxor        xmm0, xmm1
    psubw       xmm0, xmm1                  ; gain the sign back

    movdqa      [rdi], xmm0                 ; qcoeff[0..7]

    mov         rax, arg(3)                 ;dequant_ptr
    movdqa      xmm2, [rax]
    pmullw      xmm2, xmm0                  ; low 16 bits of qcoeff * dequant

    mov         rax, arg(7)                 ;dqcoeff_ptr
    movdqa      [rax], xmm2                 ; dqcoeff[0..7]

    ; ---- coefficients 8..15 : result kept in xmm3 ----
    mov         rax, arg(1)                 ;zbin_ptr
    movdqa      xmm4, [rsi + 16]            ; coeff
    movdqa      xmm3, [rax + 16]            ; zbin

    movdqa      xmm5, xmm4
    psraw       xmm5, 15                    ; xmm5 = sign mask

    pxor        xmm4, xmm5
    psubw       xmm4, xmm5                  ; xmm4 = abs(coeff)

    pcmpgtw     xmm3, xmm4                  ; xmm3 = (zbin > abs) ? 0xffff : 0
    pandn       xmm3, xmm4                  ; xmm3 = abs, zeroed where zbin > abs

    paddw       xmm3, [rcx + 16]            ; + round
    pmulhuw     xmm3, [rdx + 16]            ; * quant, keep the high 16 bits

    pxor        xmm3, xmm5
    psubw       xmm3, xmm5                  ; gain the sign back

    movdqa      [rdi + 16], xmm3            ; qcoeff[8..15]

    mov         rax, arg(3)                 ;dequant_ptr
    movdqa      xmm4, [rax + 16]
    pmullw      xmm4, xmm3                  ; low 16 bits of qcoeff * dequant

    mov         rax, arg(7)                 ;dqcoeff_ptr
    movdqa      [rax + 16], xmm4            ; dqcoeff[8..15]

    ; ---- sum scan_mask[i] over every non-zero qcoeff[i] ----
    mov         rdi, arg(4)                 ;scan_mask

    pxor        xmm1, xmm1                  ; all zeros
    pcmpeqw     xmm2, xmm2                  ; all ones

    pcmpeqw     xmm0, xmm1                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm3, xmm1

    pxor        xmm0, xmm2                  ; 0xffff where qcoeff != 0
    pxor        xmm3, xmm2

    psrlw       xmm0, 15                    ; 1 where qcoeff != 0
    psrlw       xmm3, 15

    pmaddwd     xmm0, [rdi]                 ; select and pair-sum the masks
    pmaddwd     xmm3, [rdi + 16]

    paddd       xmm0, xmm3                  ; four dword partial sums

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddd       xmm0, xmm1                  ; fold the upper half onto the lower

    movdqa      xmm1, xmm0
    psrldq      xmm1, 4
    paddd       xmm0, xmm1                  ; low dword = total of all lanes

    ; eob adjustment
    movd        rcx, xmm0
    and         rcx, 0xffff                 ; only 16 mask bits are meaningful

    xor         rdx, rdx
    sub         rdx, rcx                    ; rdx = -rcx

    bsr         rax, rcx                    ; index of the highest set bit;
                                            ; undefined when rcx == 0
    inc         rax

    sar         rdx, 31                     ; rdx = (rcx != 0) ? -1 : 0
    and         rax, rdx                    ; force 0 when no bit was set

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|