vpx/vp8/encoder/x86/quantize_sse2.asm
Fritz Koenig e0cf330cde vp8 fast quantizer sse2 optimizations for eob.
Changed the end of block computation to use pmaxsw.  Removed
additional pushing and popping of registers that was not needed.

Change-Id: I08cb9b424513cd8a2c7ad8cea53b4e2adc66ef98
2010-12-09 15:00:30 -08:00

357 lines
10 KiB
NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------------
; int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
;                                      short *qcoeff_ptr, short *dequant_ptr,
;                                      const int *default_zig_zag,
;                                      short *round_ptr, short *quant_ptr,
;                                      short *dqcoeff_ptr,
;                                      unsigned short zbin_oq_value,
;                                      short *zbin_boost_ptr);
;
; Quantizes 16 coefficients in three phases:
;   1. SIMD, all 16 lanes at once:
;        x          = abs(z)
;        abs_minus  = x - (zbin + zbin_oq_value)          -> stack scratch
;        candidate  = sign(z) * (((x + round) * quant) >> 16) -> stack scratch
;      and qcoeff[0..15] is cleared.
;   2. Scalar, walking i = 0..15 through default_zig_zag[i] = rc:
;      candidate[rc] is copied to qcoeff[rc] only when
;      abs_minus[rc] >= *zbin_boost cursor and candidate[rc] != 0. Each such
;      store records eob = i and rewinds the cursor to zbin_boost_ptr; the
;      cursor advances by one entry on every position regardless.
;   3. SIMD: dqcoeff = low 16 bits of (qcoeff * dequant).
;
; Returns eob + 1 in rax; eob starts at -1, so 0 means nothing was stored.
;
; Every 16-byte access is a movdqa, so coeff_ptr, zbin_ptr, round_ptr,
; quant_ptr, qcoeff_ptr, dequant_ptr and dqcoeff_ptr must be 16-byte aligned.
; xmm6/xmm7 are spilled to the local frame and restored before returning.
;-----------------------------------------------------------------------------

; One zig-zag position of phase 2.
;   %1 : 0 -> handle position rax (i) without touching rax
;        1 -> handle position rax + 1 and advance rax to it
;   %2 : label to continue at when nothing is stored for this position
; Expects rax = i, rbx = default_zig_zag, rsi = zbin_boost cursor.
; Uses rcx (rc), rdx and rdi as scratch.
%macro RQ_ZIGZAG_STEP 2
%if %1
    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]     ; rc = default_zig_zag[i + 1]
%else
    movsxd      rcx, DWORD PTR[rbx + rax*4]         ; rc = default_zig_zag[i]
%endif
    movsx       edi, WORD PTR [rsi]                 ; zbin = *cursor
    lea         rsi, [rsi + 2]                      ; cursor++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
%if %1
    lea         rax, [rax + 1]                      ; i++
%endif

    sub         edx, edi                            ; abs_minus[rc] - zbin
    jl          %2                                  ; below the boosted zero bin

    mov         rdi, arg(2)                         ; qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    test        edx, edx
    je          %2                                  ; candidate quantized to zero

    mov         WORD PTR[rdi + rcx * 2], dx         ; qcoeff_ptr[rc] = candidate[rc]

    mov         rsi, arg(9)                         ; rewind cursor to zbin_boost_ptr
    mov         [rsp + eob], rax                    ; eob = i
%endmacro

global sym(vp8_regular_quantize_b_impl_sse2)
sym(vp8_regular_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 10
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; NOTE(review): ALIGN_STACK comes from x86_abi_support.asm; it is paired
    ; with the "pop rsp" in the epilogue -- confirm against the macro.
    ALIGN_STACK 16, rax

    ; Local frame layout (byte offsets from rsp). The *_lo/*_hi pairs are
    ; contiguous, so [base_lo + rc*2] reaches all 16 entries.
%define abs_minus_zbin_lo 0
%define abs_minus_zbin_hi 16
%define temp_qcoeff_lo 32
%define temp_qcoeff_hi 48
%define save_xmm6 64
%define save_xmm7 80
%define eob 96
%define vp8_regularquantizeb_stack_size eob + 16

    sub         rsp, vp8_regularquantizeb_stack_size

    movdqa      OWORD PTR[rsp + save_xmm6], xmm6
    movdqa      OWORD PTR[rsp + save_xmm7], xmm7

    ;---- phase 1: vector pass over all 16 coefficients -------------------
    mov         rdx, arg(0)                         ; coeff_ptr
    mov         eax, arg(8)                         ; zbin_oq_value
    mov         rcx, arg(1)                         ; zbin_ptr

    movd        xmm7, eax

    movdqa      xmm0, OWORD PTR[rdx]                ; z, entries 0..7
    movdqa      xmm4, OWORD PTR[rdx + 16]           ; z, entries 8..15

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                            ; sz = z >> 15 (0 or -1 per word)
    psraw       xmm4, 15

    pxor        xmm1, xmm0                          ; z ^ sz
    pxor        xmm5, xmm4

    movdqa      xmm2, OWORD PTR[rcx]                ; zbin, entries 0..7
    movdqa      xmm3, OWORD PTR[rcx + 16]           ; zbin, entries 8..15

    pshuflw     xmm7, xmm7, 0                       ; word 0 -> low four words
    psubw       xmm1, xmm0                          ; x = (z ^ sz) - sz = abs(z)

    punpcklwd   xmm7, xmm7                          ; zbin_oq_value in all eight words
    psubw       xmm5, xmm4                          ; x = abs(z)

    paddw       xmm2, xmm7                          ; zbin + zbin_oq_value
    paddw       xmm3, xmm7

    psubw       xmm1, xmm2                          ; x - (zbin + zbin_oq_value)
    psubw       xmm5, xmm3

    mov         rdi, arg(5)                         ; round_ptr
    mov         rsi, arg(6)                         ; quant_ptr

    movdqa      OWORD PTR[rsp + abs_minus_zbin_lo], xmm1
    movdqa      OWORD PTR[rsp + abs_minus_zbin_hi], xmm5

    paddw       xmm1, xmm2                          ; undo the subtraction: back to x
    paddw       xmm5, xmm3

    movdqa      xmm2, OWORD PTR[rdi]                ; round, entries 0..7
    movdqa      xmm3, OWORD PTR[rsi]                ; quant, entries 0..7

    movdqa      xmm6, OWORD PTR[rdi + 16]           ; round, entries 8..15
    movdqa      xmm7, OWORD PTR[rsi + 16]           ; quant, entries 8..15

    paddw       xmm1, xmm2                          ; x + round
    paddw       xmm5, xmm6

    pmulhw      xmm1, xmm3                          ; ((x + round) * quant) >> 16
    pmulhw      xmm5, xmm7

    mov         rsi, arg(2)                         ; qcoeff_ptr
    pxor        xmm6, xmm6                          ; zero, for clearing qcoeff

    pxor        xmm1, xmm0                          ; put the sign of z back:
    pxor        xmm5, xmm4                          ;   (y ^ sz) - sz

    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      OWORD PTR[rsp + temp_qcoeff_lo], xmm1
    movdqa      OWORD PTR[rsp + temp_qcoeff_hi], xmm5

    movdqa      OWORD PTR[rsi], xmm6                ; qcoeff[0..7]  = 0
    movdqa      OWORD PTR[rsi + 16], xmm6           ; qcoeff[8..15] = 0

    ;---- phase 2: scalar zig-zag walk, four positions per iteration ------
    xor         rax, rax                            ; i = 0
    mov         rcx, -1

    mov         [rsp + eob], rcx                    ; eob = -1
    mov         rsi, arg(9)                         ; cursor = zbin_boost_ptr
    mov         rbx, arg(4)                         ; default_zig_zag

.zigzag_loop:
    RQ_ZIGZAG_STEP 0, .zigzag_1

.zigzag_1:
    RQ_ZIGZAG_STEP 1, .zigzag_2

.zigzag_2:
    RQ_ZIGZAG_STEP 1, .zigzag_3

.zigzag_3:
    RQ_ZIGZAG_STEP 1, .zigzag_next

.zigzag_next:
    lea         rax, [rax + 1]                      ; step to the next group of four
    cmp         rax, 16
    jl          .zigzag_loop

    ;---- phase 3: dequantize ---------------------------------------------
    mov         rdi, arg(2)                         ; qcoeff_ptr
    mov         rcx, arg(3)                         ; dequant_ptr
    mov         rsi, arg(7)                         ; dqcoeff_ptr

    movdqa      xmm2, OWORD PTR[rdi]
    movdqa      xmm3, OWORD PTR[rdi + 16]

    movdqa      xmm0, OWORD PTR[rcx]
    movdqa      xmm1, OWORD PTR[rcx + 16]

    pmullw      xmm0, xmm2                          ; low 16 bits of qcoeff * dequant
    pmullw      xmm1, xmm3

    movdqa      OWORD PTR[rsi], xmm0                ; dqcoeff[0..7]
    movdqa      OWORD PTR[rsi + 16], xmm1           ; dqcoeff[8..15]

    ;---- return value and frame teardown ---------------------------------
    mov         rax, [rsp + eob]

    movdqa      xmm6, OWORD PTR[rsp + save_xmm6]
    movdqa      xmm7, OWORD PTR[rsp + save_xmm7]

    add         rax, 1                              ; return eob + 1

    add         rsp, vp8_regularquantizeb_stack_size
    pop         rsp                                 ; counterpart of ALIGN_STACK

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
;                                   short *qcoeff_ptr, short *dequant_ptr,
;                                   short *inv_scan_order, short *round_ptr,
;                                   short *quant_ptr, short *dqcoeff_ptr);
;
; Branch-free quantizer for 16 coefficients:
;   qcoeff  = sign(z) * (((abs(z) + round) * quant) >> 16)
;   dqcoeff = low 16 bits of (qcoeff * dequant)
;
; Returns, in rax, the low 8 bits of the largest inv_scan_order[k] (signed
; 16-bit compare) over the entries k with qcoeff[k] != 0; 0 if all are zero.
; NOTE(review): presumably inv_scan_order[k] is the 1-based scan position of
; entry k, which would make the result the end-of-block count -- confirm
; against the caller's table.
;
; Every 16-byte access is an aligned one (movdqa, or an SSE memory operand),
; so all seven pointers must be 16-byte aligned. Only xmm0-xmm5 are used.
;-----------------------------------------------------------------------------
global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(0)                         ; coeff_ptr
    mov         rcx, arg(2)                         ; dequant_ptr
    mov         rdi, arg(4)                         ; round_ptr
    mov         rsi, arg(5)                         ; quant_ptr

    ;---- load z and round ------------------------------------------------
    movdqa      xmm0, XMMWORD PTR[rdx]              ; z, entries 0..7
    movdqa      xmm4, XMMWORD PTR[rdx + 16]         ; z, entries 8..15

    movdqa      xmm2, XMMWORD PTR[rdi]              ; round, entries 0..7
    movdqa      xmm3, XMMWORD PTR[rdi + 16]         ; round, entries 8..15

    ;---- x = abs(z), remembering the sign mask ---------------------------
    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                            ; sz = z >> 15 (0 or -1 per word)
    psraw       xmm4, 15

    pxor        xmm1, xmm0                          ; z ^ sz
    pxor        xmm5, xmm4

    psubw       xmm1, xmm0                          ; x = (z ^ sz) - sz
    psubw       xmm5, xmm4

    ;---- y = ((x + round) * quant) >> 16 ---------------------------------
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    pmulhw      xmm1, XMMWORD PTR[rsi]              ; keep the high word of each product
    pmulhw      xmm5, XMMWORD PTR[rsi + 16]

    mov         rdi, arg(1)                         ; qcoeff_ptr
    mov         rsi, arg(6)                         ; dqcoeff_ptr

    movdqa      xmm2, XMMWORD PTR[rcx]              ; dequant, entries 0..7
    movdqa      xmm3, XMMWORD PTR[rcx + 16]         ; dequant, entries 8..15

    ;---- qcoeff = (y ^ sz) - sz, i.e. y with the sign of z ---------------
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      XMMWORD PTR[rdi], xmm1              ; qcoeff[0..7]
    movdqa      XMMWORD PTR[rdi + 16], xmm5         ; qcoeff[8..15]

    pmullw      xmm2, xmm1                          ; dqcoeff = qcoeff * dequant (low word)
    pmullw      xmm3, xmm5

    mov         rdi, arg(3)                         ; inv_scan_order

    ;---- return value: max inv_scan_order over non-zero qcoeff -----------
    ; 16 candidates: build an all-ones mask for every non-zero qcoeff word
    pxor        xmm4, xmm4                          ; zeros
    pcmpeqw     xmm1, xmm4                          ; 0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                          ; all ones
    pxor        xmm1, xmm4                          ; invert: 0xffff where qcoeff != 0
    pxor        xmm5, xmm4

    pand        xmm1, XMMWORD PTR[rdi]              ; keep inv_scan_order of non-zero entries
    pand        xmm5, XMMWORD PTR[rdi+16]

    pmaxsw      xmm1, xmm5                          ; 16 -> 8 candidates

    pshufd      xmm5, xmm1, 00001110b               ; high qword down to low qword
    pmaxsw      xmm1, xmm5                          ; 8 -> 4 candidates (words 0..3)

    pshuflw     xmm5, xmm1, 00001110b               ; words 2,3 down to words 0,1
    pmaxsw      xmm1, xmm5                          ; 4 -> 2 candidates (words 0..1)

    pshuflw     xmm5, xmm1, 00000001b               ; word 1 down to word 0
    pmaxsw      xmm1, xmm5                          ; maximum now in word 0

    movd        rax, xmm1
    and         rax, 0xff                           ; return only the low byte

    movdqa      XMMWORD PTR[rsi], xmm2              ; dqcoeff[0..7]
    movdqa      XMMWORD PTR[rsi + 16], xmm3         ; dqcoeff[8..15]

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret