Merge "Remove unused quantize optimizations." into experimental
This commit is contained in:
commit
9b94f647cd
@ -22,10 +22,6 @@
|
||||
/* Expands to the declaration of a function named `sym` that takes a
 * MACROBLOCK pointer and returns void. */
#define prototype_quantize_mb(sym) \
|
||||
void (sym)(MACROBLOCK *x)
|
||||
|
||||
#if ARCH_X86 || ARCH_X86_64
|
||||
#include "x86/vp9_quantize_x86.h"
|
||||
#endif
|
||||
|
||||
void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
|
||||
int y_blocks);
|
||||
void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
|
||||
|
@ -1,286 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;-----------------------------------------------------------------------
; int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr,  short *zbin_ptr,
;                                  short *qcoeff_ptr, short *dequant_ptr,
;                                  short *scan_mask,  short *round_ptr,
;                                  short *quant_ptr,  short *dqcoeff_ptr);
;
; Quantizes 16 signed 16-bit coefficients, four per 64-bit MMX register:
;   x          = abs(coeff[i]), replaced by 0 where zbin[i] > x
;   qcoeff[i]  = sign(coeff[i]) * (((x + round[i]) * quant[i]) >> 16)
;   dqcoeff[i] = low 16 bits of qcoeff[i] * dequant[i]
;
; Return value (rax): the scan_mask[i] words of all non-zero qcoeff[i] are
; summed; the result is 1 + the index of the highest set bit in the low
; 16 bits of that sum, or 0 when the sum is zero.
;
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK 8.
; rsi and rdi are saved and restored; rax, rcx, rdx and mm0-mm7 are written.
; NOTE(review): no emms is executed before returning -- presumably the
; caller clears the MMX state; confirm.
;-----------------------------------------------------------------------

; Quantize the four coefficients at byte offset %1.
; Expects rsi = coeff_ptr, rdx = quant_ptr, rcx = round_ptr (set up by the
; first group below).  Overwrites rax, rdi, mm1 and mm4-mm7.
%macro QUANTIZE_4_COEFFS_MMX 1
    movq        mm4, [rsi+%1]           ; z

    mov         rax, arg(1)             ; zbin_ptr
    movq        mm5, [rax+%1]

    movq        mm7, mm4
    psraw       mm4, 15                 ; sz = z >> 15 (0 or -1 per word)

    pxor        mm7, mm4
    psubw       mm7, mm4                ; x = abs(z) = (z ^ sz) - sz

    movq        mm6, mm7
    pcmpgtw     mm5, mm6                ; 0xffff where zbin > x

    pandn       mm5, mm6                ; keep x only where zbin <= x
    movq        mm7, mm5

    movq        mm5, [rdx+%1]           ; quant
    movq        mm6, [rcx+%1]           ; round

    paddw       mm7, mm6                ; x + round
    pmulhuw     mm7, mm5                ; y = ((x + round) * quant) >> 16

    pxor        mm7, mm4
    psubw       mm7, mm4                ; (y ^ sz) - sz restores the sign

    mov         rdi, arg(2)             ; qcoeff_ptr

    movq        mm1, mm7                ; this copy is never read
    movq        [rdi+%1], mm7           ; store qcoeff

    mov         rax, arg(3)             ; dequant_ptr
    movq        mm6, [rax+%1]

    pmullw      mm7, mm6                ; qcoeff * dequant
    mov         rax, arg(7)             ; dqcoeff_ptr

    movq        [rax+%1], mm7           ; store dqcoeff
%endmacro

global sym(vp9_fast_quantize_b_impl_mmx) PRIVATE
sym(vp9_fast_quantize_b_impl_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    push        rsi
    push        rdi
    ; ---- end of prologue ----

    ; ---- coefficients 0..3 (bytes 0..7), using mm0-mm3 ----
    mov         rsi, arg(0)             ; coeff_ptr
    movq        mm0, [rsi]              ; z

    mov         rax, arg(1)             ; zbin_ptr
    movq        mm1, [rax]

    movq        mm3, mm0
    psraw       mm0, 15                 ; sz

    pxor        mm3, mm0
    psubw       mm3, mm0                ; x = abs(z)

    movq        mm2, mm3
    pcmpgtw     mm1, mm2                ; 0xffff where zbin > x

    pandn       mm1, mm2                ; keep x only where zbin <= x
    movq        mm3, mm1

    mov         rdx, arg(6)             ; quant_ptr (stays in rdx)
    movq        mm1, [rdx]

    mov         rcx, arg(5)             ; round_ptr (stays in rcx)
    movq        mm2, [rcx]

    paddw       mm3, mm2                ; x + round
    pmulhuw     mm3, mm1                ; y = ((x + round) * quant) >> 16

    pxor        mm3, mm0
    psubw       mm3, mm0                ; restore the sign

    mov         rdi, arg(2)             ; qcoeff_ptr
    movq        mm0, mm3                ; this copy is never read

    movq        [rdi], mm3              ; store qcoeff

    mov         rax, arg(3)             ; dequant_ptr
    movq        mm2, [rax]

    pmullw      mm3, mm2                ; qcoeff * dequant
    mov         rax, arg(7)             ; dqcoeff_ptr

    movq        [rax], mm3              ; store dqcoeff

    ; ---- coefficients 4..15: three more groups of four (8 bytes each) ----
    QUANTIZE_4_COEFFS_MMX 8
    QUANTIZE_4_COEFFS_MMX 16
    QUANTIZE_4_COEFFS_MMX 24

    ; ---- build the scan mask sum from the stored qcoeff values ----
    mov         rdi, arg(4)             ; scan_mask
    mov         rsi, arg(2)             ; qcoeff_ptr

    pxor        mm5, mm5                ; overwritten below before any read
    pxor        mm7, mm7                ; mm7 = 0

    movq        mm0, [rsi]              ; qcoeff 0..3
    movq        mm1, [rsi+8]            ; qcoeff 4..7

    movq        mm2, [rdi]              ; scan_mask 0..3
    movq        mm3, [rdi+8];           ; scan_mask 4..7

    pcmpeqw     mm0, mm7                ; 0xffff where qcoeff == 0
    pcmpeqw     mm1, mm7

    pcmpeqw     mm6, mm6                ; mm6 = all ones
    pxor        mm0, mm6                ; invert: 0xffff where qcoeff != 0

    pxor        mm1, mm6
    psrlw       mm0, 15                 ; 1 where qcoeff != 0, else 0

    psrlw       mm1, 15
    pmaddwd     mm0, mm2                ; pairwise sums of selected scan_mask

    pmaddwd     mm1, mm3
    movq        mm5, mm0

    paddd       mm5, mm1                ; running sum in mm5

    movq        mm0, [rsi+16]           ; qcoeff 8..11
    movq        mm1, [rsi+24]           ; qcoeff 12..15

    movq        mm2, [rdi+16]           ; scan_mask 8..11
    movq        mm3, [rdi+24];          ; scan_mask 12..15

    pcmpeqw     mm0, mm7
    pcmpeqw     mm1, mm7

    pcmpeqw     mm6, mm6                ; all ones again
    pxor        mm0, mm6

    pxor        mm1, mm6
    psrlw       mm0, 15

    psrlw       mm1, 15
    pmaddwd     mm0, mm2

    pmaddwd     mm1, mm3
    paddd       mm5, mm0

    paddd       mm5, mm1
    movq        mm0, mm5

    psrlq       mm5, 32                 ; fold the high dword onto the low
    paddd       mm0, mm5

    ; ---- turn the mask sum into the return value ----
    movq        rcx, mm0
    and         rcx, 0xffff             ; only the low 16 bits are used

    xor         rdx, rdx
    sub         rdx, rcx                ; rdx = -rcx

    bsr         rax, rcx                ; index of the highest set bit
    inc         rax

    sar         rdx, 31                 ; all ones if rcx != 0, else 0
    and         rax, rdx                ; force the result to 0 for rcx == 0

    ; The sequence above replaces older mixed asm/C code, kept as reference:
    ;     movq rcx, mm0
    ;     bsr  rax, rcx
    ;     mov  eob, rax
    ;     mov  eee, rcx
    ;     if (eee == 0)     eob = -1;
    ;     else if (eee < 0) eob = 15;
    ;     d->eob = eob + 1;

    ; ---- epilogue ----
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
@ -1,379 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------
; void vp9_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
;
; Quantizes the 16 signed 16-bit coefficients reached through
; [b + vp9_block_coeff]:
;   1. SIMD: x = abs(z); x - (zbin + zbin_extra) and the candidate
;      y = (x + round) + (((x + round) * quant) >> 16) are written to
;      stack scratch.
;   2. Scalar, in zig-zag order: a coefficient is kept when
;      x - (zbin + zbin_extra) >= *zbin_boost_ptr and
;      y >> quant_shift[rc] is non-zero.  zbin_boost_ptr advances on every
;      step and is reset to the start whenever a coefficient is kept.
;   3. SIMD: signs are restored, qcoeff and dqcoeff = qcoeff * dequant are
;      stored through d, and [d + vp9_blockd_eob] receives the largest
;      inv_zig_zag entry among the non-zero qcoeff (0 if there is none).
;
; b arrives in arg(0) / rcx (Win64) / rdi, d in arg(1) / rdx (Win64) / rsi.
;-----------------------------------------------------------------------
global sym(vp9_regular_quantize_b_sse2) PRIVATE
sym(vp9_regular_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    SAVE_XMM 7
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %endif
%endif

    ALIGN_STACK 16, rax
    ; Scratch layout relative to rsp (byte sizes in the trailing comments).
    ; Assuming ALIGN_STACK leaves rsp 16-byte aligned, subtracting 104 puts
    ; the three 32-byte slots at 16-byte aligned addresses for movdqa.
    %define zrun_zbin_boost   0  ;  8
    %define abs_minus_zbin    8  ; 32
    %define temp_qcoeff       40 ; 32
    %define qcoeff            72 ; 32
    %define stack_size        104
    sub         rsp, stack_size
    ; ---- end of prologue ----

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ; b is already in rdi and d in rsi
  %endif
%endif

    mov         rdx, [rdi + vp9_block_coeff]        ; coeff_ptr
    mov         rcx, [rdi + vp9_block_zbin]         ; zbin_ptr
    movd        xmm7, [rdi + vp9_block_zbin_extra]  ; zbin_oq_value

    ; z: coefficients 0..7 in xmm0, 8..15 in xmm4
    movdqa      xmm0, [rdx]
    movdqa      xmm4, [rdx + 16]
    mov         rdx, [rdi + vp9_block_round]        ; round_ptr

    ; copy the low word of zbin_oq_value into all eight word lanes
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15 (0 or -1 per word); kept in xmm0/xmm4 until the end
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z) = (z ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      xmm2, [rcx]                         ; zbin 0..7
    movdqa      xmm3, [rcx + 16]                    ; zbin 8..15
    mov         rcx, [rdi + vp9_block_quant]        ; quant_ptr

    ; zbin + zbin_oq_value
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; x - (zbin + zbin_oq_value), saved for the scalar pass
    psubw       xmm1, xmm2
    psubw       xmm5, xmm3
    movdqa      [rsp + abs_minus_zbin], xmm1
    movdqa      [rsp + abs_minus_zbin + 16], xmm5

    ; add it back so xmm1/xmm5 hold x again
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    movdqa      xmm2, [rdx]                         ; round 0..7
    movdqa      xmm6, [rdx + 16]                    ; round 8..15

    movdqa      xmm3, [rcx]                         ; quant 0..7
    movdqa      xmm7, [rcx + 16]                    ; quant 8..15

    ; x += round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm6

    ; y = (x * quant) >> 16
    pmulhw      xmm3, xmm1
    pmulhw      xmm7, xmm5

    ; y += x
    paddw       xmm1, xmm3
    paddw       xmm5, xmm7

    movdqa      [rsp + temp_qcoeff], xmm1
    movdqa      [rsp + temp_qcoeff + 16], xmm5

    ; clear the qcoeff scratch; xmm6 stays zero for the EOB compare below
    pxor        xmm6, xmm6
    movdqa      [rsp + qcoeff], xmm6
    movdqa      [rsp + qcoeff + 16], xmm6

    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
    mov         rax, [rdi + vp9_block_quant_shift]     ; quant_shift_ptr
    mov         [rsp + zrun_zbin_boost], rdx           ; saved for the resets

; One scalar step for coefficient index %1 (rc).
; Uses rdx = zbin_boost_ptr and rax = quant_shift_ptr; overwrites ecx, edi.
%macro ZIGZAG_LOOP 1
    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2] ; x - zbin

    sub         cx, WORD PTR[rdx]           ; subtract the current boost
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++ (flags kept)
    jl          .rq_zigzag_loop_%1          ; below threshold: leave 0

    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]    ; y

    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
    ; NOTE(review): x86 leaves the flags unchanged when the count in cl
    ; is 0, so the je below would then test the flags of the sub above --
    ; confirm quant_shift is never 0.
    sar         edi, cl                     ; y >>= quant_shift_ptr[rc]
    je          .rq_zigzag_loop_%1          ; shifted y is 0: leave 0
    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ; qcoeff[rc] = y
    mov         rdx, [rsp + zrun_zbin_boost]        ; restart the boost run
.rq_zigzag_loop_%1:
%endmacro
    ; visited in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
    ZIGZAG_LOOP  0
    ZIGZAG_LOOP  1
    ZIGZAG_LOOP  4
    ZIGZAG_LOOP  8
    ZIGZAG_LOOP  5
    ZIGZAG_LOOP  2
    ZIGZAG_LOOP  3
    ZIGZAG_LOOP  6
    ZIGZAG_LOOP  9
    ZIGZAG_LOOP 12
    ZIGZAG_LOOP 13
    ZIGZAG_LOOP 10
    ZIGZAG_LOOP  7
    ZIGZAG_LOOP 11
    ZIGZAG_LOOP 14
    ZIGZAG_LOOP 15

    movdqa      xmm2, [rsp + qcoeff]
    movdqa      xmm3, [rsp + qcoeff + 16]

    mov         rcx, [rsi + vp9_blockd_dequant]     ; dequant_ptr
    mov         rdi, [rsi + vp9_blockd_dqcoeff]     ; dqcoeff_ptr

    ; restore the sign: (y ^ sz) - sz
    pxor        xmm2, xmm0
    pxor        xmm3, xmm4
    psubw       xmm2, xmm0
    psubw       xmm3, xmm4

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp9_blockd_qcoeff]      ; qcoeff_ptr

    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

    movdqa      [rcx], xmm2                 ; store qcoeff
    movdqa      [rcx + 16], xmm3
    movdqa      [rdi], xmm0                 ; store dqcoeff
    movdqa      [rdi + 16], xmm1

    ; EOB: largest inv_zig_zag entry among the non-zero qcoeff
    pcmpeqw     xmm2, xmm6                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm3, xmm6
    pcmpeqw     xmm6, xmm6                  ; all ones
    pxor        xmm2, xmm6                  ; invert: 0xffff where != 0
    pxor        xmm3, xmm6
    pand        xmm2, [GLOBAL(inv_zig_zag)]
    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
    ; horizontal max of the 16 words
    pmaxsw      xmm2, xmm3
    pshufd      xmm3, xmm2, 0x0e            ; high qword down to the low qword
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 0x0e            ; words 2,3 down to words 0,1
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 0x01            ; word 1 down to word 0
    pmaxsw      xmm2, xmm3
    movd        eax, xmm2
    and         eax, 0xff
    mov         [rsi + vp9_blockd_eob], eax

    ; ---- epilogue ----
    add         rsp, stack_size
    pop         rsp                         ; pairs with ALIGN_STACK above
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif
    RESTORE_GOT
    RESTORE_XMM
    pop         rbp
    ret
|
||||
|
||||
;-----------------------------------------------------------------------
; void vp9_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
;
; Quantizes 16 signed 16-bit coefficients without any zero-bin test:
;   x       = abs(z) + round
;   y       = (x * quant_fast) >> 16
;   qcoeff  = sign(z) * y
;   dqcoeff = qcoeff * dequant          (low 16 bits)
; [d + vp9_blockd_eob] receives the largest inv_zig_zag entry among the
; non-zero qcoeff (0 if there is none).
;
; b arrives in arg(0) / rcx (Win64) / rdi, d in arg(1) / rdx (Win64) / rsi.
; Only xmm0-xmm5 are used.
;-----------------------------------------------------------------------
global sym(vp9_fast_quantize_b_sse2) PRIVATE
sym(vp9_fast_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %else
    ; rdi/rsi carry the arguments here, so they are not saved
  %endif
%endif

    ; ---- end of prologue ----

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ; b is already in rdi and d in rsi
  %endif
%endif

    mov         rax, [rdi + vp9_block_coeff]
    mov         rcx, [rdi + vp9_block_round]
    mov         rdx, [rdi + vp9_block_quant_fast]

    ; z: coefficients 0..7 in xmm0, 8..15 in xmm4
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; work on copies so xmm0/xmm4 can hold the sign masks
    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15 (0 or -1 per word)
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z) = (z ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; x += round
    paddw       xmm1, [rcx]
    paddw       xmm5, [rcx + 16]

    ; output pointers from d; rax/rcx/rdi are free to reuse now
    mov         rax, [rsi + vp9_blockd_qcoeff]
    mov         rcx, [rsi + vp9_blockd_dequant]
    mov         rdi, [rsi + vp9_blockd_dqcoeff]

    ; y = (x * quant_fast) >> 16
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

    ; restore the sign: (y ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; store qcoeff
    movdqa      [rax], xmm1
    movdqa      [rax + 16], xmm5

    ; dqcoeff = qcoeff * dequant
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm5
    pmullw      xmm2, [rcx]
    pmullw      xmm3, [rcx + 16]

    movdqa      [rdi], xmm2
    movdqa      [rdi + 16], xmm3

    ; EOB: largest inv_zig_zag entry among the non-zero qcoeff
    pxor        xmm4, xmm4                  ; zero
    pcmpeqw     xmm1, xmm4                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                  ; all ones
    pxor        xmm1, xmm4                  ; invert: 0xffff where != 0
    pxor        xmm5, xmm4

    pand        xmm1, [GLOBAL(inv_zig_zag)]
    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]

    pmaxsw      xmm1, xmm5                  ; 16 candidates -> 8

    pshufd      xmm5, xmm1, 0x0e            ; high qword down to the low qword
    pmaxsw      xmm1, xmm5                  ; 8 -> 4

    pshuflw     xmm5, xmm1, 0x0e            ; words 2,3 down to words 0,1
    pmaxsw      xmm1, xmm5                  ; 4 -> 2

    pshuflw     xmm5, xmm1, 0x01            ; word 1 down to word 0
    pmaxsw      xmm1, xmm5                  ; 2 -> 1

    movd        eax, xmm1
    and         eax, 0xff
    mov         [rsi + vp9_blockd_eob], eax

    ; ---- epilogue ----
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret
|
||||
|
||||
SECTION_RODATA
align 16
; For each coefficient index rc (row-major, 4 per row): its 1-based position
; in the zig-zag visiting order used by the ZIGZAG_LOOP sequence above.
; ANDed with the non-zero mask and reduced with pmaxsw to obtain the EOB.
inv_zig_zag:
    dw  1,  2,  6,  7
    dw  3,  5,  8, 13
    dw  4,  9, 12, 14
    dw 10, 11, 15, 16
|
@ -1,253 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------
; void vp9_regular_quantize_b_sse4(BLOCK *b, BLOCKD *d)
;
; Same three stages as the SSE2 regular quantizer (SIMD setup, scalar
; zig-zag pass with the zero-run boost, SIMD sign restore / dequant / EOB),
; but the per-coefficient values are read with pextrw/pextrb instead of
; from stack scratch.
;
; 32-bit builds: full frame, and the qcoeff accumulator is 32 bytes of
;                aligned stack.
; 64-bit builds: no frame; qcoeff is accumulated in xmm4 (coefficients
;                0..7) and xmm8 (8..15).  Win64 additionally saves rdi/rsi
;                and goes through SAVE_XMM 8, u.
;
; b arrives in arg(0) / rcx (Win64) / rdi, d in arg(1) / rdx (Win64) / rsi.
;-----------------------------------------------------------------------
global sym(vp9_regular_quantize_b_sse4) PRIVATE
sym(vp9_regular_quantize_b_sse4):

%if ABI_IS_32BIT
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rdi
    push        rsi

    ALIGN_STACK 16, rax
    %define qcoeff      0 ; 32
    %define stack_size  32
    sub         rsp, stack_size
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 8, u
    push        rdi
    push        rsi
  %endif
%endif
    ; ---- end of prologue ----

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ; b is already in rdi and d in rsi
  %endif
%endif

    mov         rax, [rdi + vp9_block_coeff]
    mov         rcx, [rdi + vp9_block_zbin]
    mov         rdx, [rdi + vp9_block_round]
    movd        xmm7, [rdi + vp9_block_zbin_extra]

    ; z: coefficients 0..7 in xmm0, 8..15 in xmm1
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax + 16]

    ; copy the low word of zbin_oq_value into all eight word lanes
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7

    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    ; sz = z >> 15 (0 or -1 per word); kept in xmm0/xmm1 until the end
    psraw       xmm0, 15
    psraw       xmm1, 15

    ; x = abs(z) = (z ^ sz) - sz
    pxor        xmm2, xmm0
    pxor        xmm3, xmm1
    psubw       xmm2, xmm0
    psubw       xmm3, xmm1

    ; zbin + zbin_oq_value
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7

    ; xmm6/xmm7 = x - (zbin + zbin_oq_value), read by the scalar pass
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3
    psubw       xmm6, xmm4
    psubw       xmm7, xmm5

    ; round
    movdqa      xmm4, [rdx]
    movdqa      xmm5, [rdx + 16]

    mov         rax, [rdi + vp9_block_quant_shift]
    mov         rcx, [rdi + vp9_block_quant]
    mov         rdx, [rdi + vp9_block_zrun_zbin_boost]

    ; x += round
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; quant
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; y = (x * quant) >> 16
    pmulhw      xmm4, xmm2
    pmulhw      xmm5, xmm3

    ; y += x  (xmm2/xmm3 now hold the candidate values)
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; zero the qcoeff accumulator
    pxor        xmm4, xmm4
%if ABI_IS_32BIT
    movdqa      [rsp + qcoeff], xmm4
    movdqa      [rsp + qcoeff + 16], xmm4
%else
    pxor        xmm8, xmm8
%endif

    ; all 16 quant_shift bytes in xmm5
    movdqa      xmm5, [rax]

    ; keep the start of zrun_zbin_boost in rax for the resets
    mov         rax, rdx

; One scalar step of the zig-zag pass.
;   %1 = coefficient index rc (quant_shift byte index and label suffix)
;   %2 = word lane of rc inside its 8-coefficient register
;   %3 = register holding the candidate y values
;   %4 = register holding x - zbin
;   %5 = qcoeff accumulator register (used by 64-bit builds only)
; Uses rdx = zbin_boost_ptr and rax = its start; overwrites ecx, edi.
%macro ZIGZAG_LOOP 5
    pextrw      ecx, %4, %2                 ; x - zbin

    sub         cx, WORD PTR[rdx]           ; subtract the current boost
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++ (flags kept)
    jl          .rq_zigzag_loop_%1          ; below threshold: leave 0

    pextrw      edi, %3, %2                 ; y

    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
    ; NOTE(review): x86 leaves the flags unchanged when the count in cl
    ; is 0, so the je below would then test the flags of the sub above --
    ; confirm quant_shift is never 0.
    sar         edi, cl                     ; y >>= quant_shift[rc]
    je          .rq_zigzag_loop_%1          ; shifted y is 0: leave 0
%if ABI_IS_32BIT
    mov         WORD PTR[rsp + qcoeff + %1 *2], di
%else
    pinsrw      %5, edi, %2                 ; qcoeff[rc] = y
%endif
    mov         rdx, rax                    ; restart the boost run
.rq_zigzag_loop_%1:
%endmacro
    ; visited in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
    ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
    ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
    ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
    ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
    ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
    ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
    ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
    ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
    ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
    ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
    ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
    ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
    ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
    ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
    ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
    ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8

    mov         rcx, [rsi + vp9_blockd_dequant]
    mov         rdi, [rsi + vp9_blockd_dqcoeff]

%if ABI_IS_32BIT
    movdqa      xmm4, [rsp + qcoeff]
    movdqa      xmm5, [rsp + qcoeff + 16]
%else
    ; From here to the %undef, "xmm5" assembles as xmm8, so the shared code
    ; below operates on the 64-bit accumulator for coefficients 8..15.
    %define xmm5 xmm8
%endif

    ; restore the sign: (y ^ sz) - sz
    pxor        xmm4, xmm0
    pxor        xmm5, xmm1
    psubw       xmm4, xmm0
    psubw       xmm5, xmm1

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp9_blockd_qcoeff]

    pmullw      xmm0, xmm4
    pmullw      xmm1, xmm5

    ; store qcoeff
    movdqa      [rcx], xmm4
    movdqa      [rcx + 16], xmm5

    ; store dqcoeff
    movdqa      [rdi], xmm0
    movdqa      [rdi + 16], xmm1

    ; EOB = 1 + scan position of the last non-zero qcoeff, or 0 if none
    pxor        xmm6, xmm6
    pcmpeqw     xmm4, xmm6                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm6

    packsswb    xmm4, xmm5                  ; one byte per coefficient
    pshufb      xmm4, [GLOBAL(zig_zag1d)]   ; reorder into scan order
    pmovmskb    edx, xmm4                   ; bit i set: scan position i is 0
    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax                      ; invert the low 16 bits
    bsr         eax, edx                    ; highest non-zero scan position
    sub         edi, edx                    ; negative iff any bit is set
    sar         edi, 31                     ; -1 if any bit is set, else 0
    add         eax, 1
    and         eax, edi                    ; 0 when nothing was non-zero

    mov         [rsi + vp9_blockd_eob], eax

    ; ---- epilogue ----
%if ABI_IS_32BIT
    add         rsp, stack_size
    pop         rsp                         ; pairs with ALIGN_STACK above

    pop         rsi
    pop         rdi
    RESTORE_GOT
    pop         rbp
%else
  %undef xmm5
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
    RESTORE_XMM
  %endif
%endif

    ret
|
||||
|
||||
SECTION_RODATA
align 16
; pshufb control: byte i selects the coefficient index visited at scan
; position i (same order as the ZIGZAG_LOOP sequence above; see
; vp9_default_zig_zag1d in vp9/common/vp9_entropy.c).
zig_zag1d:
    db  0,  1,  4,  8
    db  5,  2,  3,  6
    db  9, 12, 13, 10
    db  7, 11, 14, 15
|
@ -1,137 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------
; void vp9_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
;
; Quantizes 16 signed 16-bit coefficients without any zero-bin test:
;   x       = abs(z) + round            (pabsw)
;   y       = (x * quant_fast) >> 16
;   qcoeff  = sign(z) * y
;   dqcoeff = qcoeff * dequant          (low 16 bits)
; [d + vp9_blockd_eob] = 1 + scan position of the last non-zero qcoeff,
; or 0 when every qcoeff is zero.
;
; b arrives in arg(0) / rcx (Win64) / rdi, d in arg(1) / rdx (Win64) / rsi.
; Only xmm0-xmm5 are used.
;-----------------------------------------------------------------------
global sym(vp9_fast_quantize_b_ssse3) PRIVATE
sym(vp9_fast_quantize_b_ssse3):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %endif
%endif
    ; ---- end of prologue ----

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ; b is already in rdi and d in rsi
  %endif
%endif

    mov         rax, [rdi + vp9_block_coeff]
    mov         rcx, [rdi + vp9_block_round]
    mov         rdx, [rdi + vp9_block_quant_fast]

    ; z: coefficients 0..7 in xmm0, 8..15 in xmm4
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; round
    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15 (0 or -1 per word)
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z)
    pabsw       xmm1, xmm1
    pabsw       xmm5, xmm5

    ; x += round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    ; y = (x * quant_fast) >> 16
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

    ; output pointers from d; rax/rdi/rcx are free to reuse now
    mov         rax, [rsi + vp9_blockd_qcoeff]
    mov         rdi, [rsi + vp9_blockd_dequant]
    mov         rcx, [rsi + vp9_blockd_dqcoeff]

    ; restore the sign: (y ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; store qcoeff
    movdqa      [rax], xmm1
    movdqa      [rax + 16], xmm5

    ; dequant
    movdqa      xmm2, [rdi]
    movdqa      xmm3, [rdi + 16]

    pxor        xmm4, xmm4                  ; zero, for the compares below
    pmullw      xmm2, xmm1                  ; dqcoeff = qcoeff * dequant
    pmullw      xmm3, xmm5

    pcmpeqw     xmm1, xmm4                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm4
    packsswb    xmm1, xmm5                  ; one byte per coefficient
    pshufb      xmm1, [GLOBAL(zz_shuf)]     ; reorder into scan order

    pmovmskb    edx, xmm1                   ; bit i set: scan position i is 0

    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax                      ; invert: bit set = non-zero
    bsr         eax, edx                    ; highest non-zero scan position

    movdqa      [rcx], xmm2                 ; store dqcoeff
    movdqa      [rcx + 16], xmm3

    sub         edi, edx                    ; negative iff any bit is set
    sar         edi, 31                     ; -1 if any bit is set, else 0
    add         eax, 1
    and         eax, edi                    ; eob = 0 when all were zero

    mov         [rsi + vp9_blockd_eob], eax

    ; ---- epilogue ----
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret
|
||||
|
||||
SECTION_RODATA
align 16
; pshufb control: byte i selects the coefficient index placed at scan
; position i when the packed zero mask is reordered for the EOB search.
zz_shuf:
    db  0,  1,  4,  8
    db  5,  2,  3,  6
    db  9, 12, 13, 10
    db  7, 11, 14, 15
|
@ -1,48 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license and patent
|
||||
* grant that can be found in the LICENSE file in the root of the source
|
||||
* tree. All contributing project authors may be found in the AUTHORS
|
||||
* file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_
#define VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_

/*
 * x86 mappings for the block quantizer.
 *
 * Note: this platform is commonly built for runtime CPU detection.  If you
 * change any of the function mappings in this file, make the same change in
 * the function pointer initialization code.
 */

#if HAVE_MMX
/* No MMX mapping is declared. */
#endif  /* HAVE_MMX */

#if HAVE_SSE2
extern prototype_quantize_block(vp9_regular_quantize_b_sse2);

#if !CONFIG_RUNTIME_CPU_DETECT
/* Static dispatch: bind the quantizer to the SSE2 version. */
#undef vp9_quantize_quantb
#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
#endif  /* !CONFIG_RUNTIME_CPU_DETECT */
#endif  /* HAVE_SSE2 */

#if HAVE_SSE4_1
extern prototype_quantize_block(vp9_regular_quantize_b_sse4);

#if !CONFIG_RUNTIME_CPU_DETECT
/* Evaluated after the SSE2 section, so this binding replaces it when both
 * HAVE_SSE2 and HAVE_SSE4_1 are set. */
#undef vp9_quantize_quantb
#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
#endif  /* !CONFIG_RUNTIME_CPU_DETECT */
#endif  /* HAVE_SSE4_1 */

#endif  /* VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_ */
|
14
vp9/vp9cx.mk
14
vp9/vp9cx.mk
@ -17,15 +17,6 @@ VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no)
|
||||
|
||||
VP9_CX_SRCS-yes += vp9_cx_iface.c
|
||||
|
||||
# encoder
|
||||
#INCLUDES += algo/vpx_common/vpx_mem/include
|
||||
#INCLUDES += common
|
||||
#INCLUDES += common
|
||||
#INCLUDES += common
|
||||
#INCLUDES += algo/vpx_ref/cpu_id/include
|
||||
#INCLUDES += common
|
||||
#INCLUDES += encoder
|
||||
|
||||
VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_dct.c
|
||||
@ -81,7 +72,6 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
|
||||
|
||||
|
||||
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h
|
||||
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_x86.h
|
||||
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c
|
||||
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
|
||||
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
|
||||
@ -94,17 +84,13 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
|
||||
#VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
|
||||
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
|
||||
#VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
|
||||
#VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm
|
||||
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_mmx.asm
|
||||
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
|
||||
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user