d860f685b8
Moved vp8_fast_quantize_b_sse from quantize_mmx.asm into quantize_sse2.asm and renamed. Updated the assembly code to match the C version. Change-Id: I1766d9e1ca60e173f65badc0ca0c160c2b51b200
389 lines
11 KiB
NASM
389 lines
11 KiB
NASM
;
|
|
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
|
;
|
|
; Use of this source code is governed by a BSD-style license and patent
|
|
; grant that can be found in the LICENSE file in the root of the source
|
|
; tree. All contributing project authors may be found in the AUTHORS
|
|
; file in the root of the source tree.
|
|
;
|
|
|
|
|
|
%include "vpx_ports/x86_abi_support.asm"
|
|
|
|
|
|
;-----------------------------------------------------------------------
; int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
;                                      short *qcoeff_ptr, short *dequant_ptr,
;                                      const int *default_zig_zag,
;                                      short *round_ptr, short *quant_ptr,
;                                      short *dqcoeff_ptr,
;                                      unsigned short zbin_oq_value,
;                                      short *zbin_boost_ptr);
;
; Quantizes one block of 16 signed 16-bit coefficients.
;
;   1. Vector pass over all 16 lanes:
;        x           = abs(coeff)
;        abs_minus   = x - (zbin + zbin_oq_value)        -> stack scratch
;        temp_qcoeff = sign(coeff) * (((x + round) * quant) >> 16)
;                                                        -> stack scratch
;      and qcoeff_ptr[0..15] is cleared.
;   2. Scalar pass, 16 positions in default_zig_zag order: a coefficient is
;      written to qcoeff_ptr[rc] only when abs_minus[rc] >= *zbin_boost and
;      temp_qcoeff[rc] != 0.  Each accepted coefficient rewinds the boost
;      cursor to zbin_boost_ptr and records its scan position as eob.
;   3. dqcoeff_ptr[i] = qcoeff_ptr[i] * dequant_ptr[i] (low 16 bits).
;
; Returns: last accepted scan position + 1 (0 when nothing was accepted).
;
; Every buffer touched with movdqa (coeff, zbin, round, quant, qcoeff,
; dequant, dqcoeff) must be 16-byte aligned.  xmm6/xmm7 are saved in the
; local frame and restored before returning.
;-----------------------------------------------------------------------

; One step of the zig-zag scan used by vp8_regular_quantize_b_impl_sse2.
;   Parameter 1: 0 -> examine scan position rax, leave rax unchanged
;                1 -> examine scan position rax + 1, then advance rax by one
; In:    rax = scan position, rbx = zig-zag table, rsi = zbin boost cursor
; Out:   rsi = cursor + 2, or zbin_boost_ptr again if a coefficient was kept
; Clobb: rcx, rdx, rdi, flags
%macro RQ_ZIGZAG_STEP 1
%if %1
        movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]     ; rc = zig_zag[i + 1]
%else
        movsxd      rcx, DWORD PTR[rbx + rax*4]         ; rc = zig_zag[i]
%endif
        movsx       edi, WORD PTR [rsi]                 ; zbin = *zbin_boost_ptr
        lea         rsi, [rsi + 2]                      ; zbin_boost_ptr++

        movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
%if %1
        lea         rax, [rax + 1]                      ; i++ (lea keeps flags)
%endif

        sub         edx, edi                            ; x - zbin
        jl          %%skip                              ; below the dead zone

        mov         rdi, arg(2)                         ; qcoeff_ptr

        movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

        test        edx, edx
        jz          %%skip                              ; quantized to zero

        mov         WORD PTR[rdi + rcx * 2], dx         ; qcoeff_ptr[rc] = temp_qcoeff[rc]

        mov         rsi, arg(9)                         ; rewind zbin_boost_ptr
        mov         [rsp + eob], rax                    ; eob = i
%%skip:
%endmacro

global sym(vp8_regular_quantize_b_impl_sse2)
sym(vp8_regular_quantize_b_impl_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 10
        push        rsi
        push        rdi
        push        rbx
        ; end prolog

        ALIGN_STACK 16, rax

        ; Local frame, byte offsets from the aligned rsp.
        %define abs_minus_zbin_lo 0         ; lanes 0-7  of x - (zbin + zbin_oq)
        %define abs_minus_zbin_hi 16        ; lanes 8-15 of x - (zbin + zbin_oq)
        %define temp_qcoeff_lo    32        ; lanes 0-7  of the candidate qcoeff
        %define temp_qcoeff_hi    48        ; lanes 8-15 of the candidate qcoeff
        %define save_xmm6         64
        %define save_xmm7         80
        %define eob               96

        %define vp8_regularquantizeb_stack_size eob + 16

        sub         rsp, vp8_regularquantizeb_stack_size

        movdqa      OWORD PTR[rsp + save_xmm6], xmm6
        movdqa      OWORD PTR[rsp + save_xmm7], xmm7

        ; ---- vector pass -------------------------------------------------
        mov         rdx, arg(0)                         ; coeff_ptr
        mov         eax, arg(8)                         ; zbin_oq_value

        mov         rcx, arg(1)                         ; zbin_ptr
        movd        xmm7, eax

        movdqa      xmm0, OWORD PTR[rdx]                ; z, lanes 0-7
        movdqa      xmm4, OWORD PTR[rdx + 16]           ; z, lanes 8-15

        movdqa      xmm1, xmm0
        movdqa      xmm5, xmm4

        psraw       xmm0, 15                            ; sz = z >> 15 (0 or -1)
        psraw       xmm4, 15

        pxor        xmm1, xmm0
        pxor        xmm5, xmm4

        movdqa      xmm2, OWORD PTR[rcx]                ; zbin, lanes 0-7
        movdqa      xmm3, OWORD PTR[rcx + 16]           ; zbin, lanes 8-15

        pshuflw     xmm7, xmm7, 0                       ; low 4 words = zbin_oq_value
        psubw       xmm1, xmm0                          ; x = (z ^ sz) - sz = abs(z)

        punpcklwd   xmm7, xmm7                          ; all 8 words = zbin_oq_value
        psubw       xmm5, xmm4                          ; x = abs(z)

        paddw       xmm2, xmm7                          ; zbin + zbin_oq_value
        paddw       xmm3, xmm7

        psubw       xmm1, xmm2                          ; x - (zbin + zbin_oq_value)
        psubw       xmm5, xmm3

        mov         rdi, arg(5)                         ; round_ptr
        mov         rsi, arg(6)                         ; quant_ptr

        movdqa      OWORD PTR[rsp + abs_minus_zbin_lo], xmm1
        movdqa      OWORD PTR[rsp + abs_minus_zbin_hi], xmm5

        paddw       xmm1, xmm2                          ; back to x
        paddw       xmm5, xmm3

        movdqa      xmm2, OWORD PTR[rdi]                ; round, lanes 0-7
        movdqa      xmm3, OWORD PTR[rsi]                ; quant, lanes 0-7

        movdqa      xmm6, OWORD PTR[rdi + 16]           ; round, lanes 8-15
        movdqa      xmm7, OWORD PTR[rsi + 16]           ; quant, lanes 8-15

        paddw       xmm1, xmm2                          ; x + round
        paddw       xmm5, xmm6

        pmulhw      xmm1, xmm3                          ; y = ((x + round) * quant) >> 16
        pmulhw      xmm5, xmm7

        mov         rsi, arg(2)                         ; qcoeff_ptr
        pxor        xmm6, xmm6                          ; zero for clearing qcoeff

        pxor        xmm1, xmm0                          ; put the sign back:
        pxor        xmm5, xmm4

        psubw       xmm1, xmm0                          ;   (y ^ sz) - sz
        psubw       xmm5, xmm4

        movdqa      OWORD PTR[rsp + temp_qcoeff_lo], xmm1
        movdqa      OWORD PTR[rsp + temp_qcoeff_hi], xmm5

        movdqa      OWORD PTR[rsi], xmm6                ; qcoeff[0..7]  = 0
        movdqa      OWORD PTR[rsi + 16], xmm6           ; qcoeff[8..15] = 0

        ; ---- scalar zig-zag pass -----------------------------------------
        xor         rax, rax                            ; i = 0
        mov         rcx, -1

        mov         [rsp + eob], rcx                    ; eob = -1 (nothing kept yet)
        mov         rsi, arg(9)                         ; zbin_boost_ptr

        mov         rbx, arg(4)                         ; default_zig_zag

        ; Four scan positions per trip; the first step reads position i,
        ; each following step reads i + 1 and bumps i.
.zigzag_loop:
        RQ_ZIGZAG_STEP 0
        RQ_ZIGZAG_STEP 1
        RQ_ZIGZAG_STEP 1
        RQ_ZIGZAG_STEP 1

        lea         rax, [rax + 1]                      ; step to the next group of four

        cmp         rax, 16                             ; 16 coefficients per block
        jl          .zigzag_loop

        ; ---- dequantize --------------------------------------------------
        mov         rdi, arg(2)                         ; qcoeff_ptr
        mov         rcx, arg(3)                         ; dequant_ptr
        mov         rsi, arg(7)                         ; dqcoeff_ptr

        movdqa      xmm2, OWORD PTR[rdi]
        movdqa      xmm3, OWORD PTR[rdi + 16]

        movdqa      xmm0, OWORD PTR[rcx]
        movdqa      xmm1, OWORD PTR[rcx + 16]

        pmullw      xmm0, xmm2                          ; dequant * qcoeff (low word)
        pmullw      xmm1, xmm3

        movdqa      OWORD PTR[rsi], xmm0                ; store dqcoeff
        movdqa      OWORD PTR[rsi + 16], xmm1           ; store dqcoeff

        mov         rax, [rsp + eob]

        movdqa      xmm6, OWORD PTR[rsp + save_xmm6]
        movdqa      xmm7, OWORD PTR[rsp + save_xmm7]

        add         rax, 1                              ; return eob + 1

        add         rsp, vp8_regularquantizeb_stack_size
        pop         rsp                                 ; undo ALIGN_STACK

        ; begin epilog
        pop         rbx
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
|
|
|
|
|
|
;-----------------------------------------------------------------------
; int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
;                                   short *qcoeff_ptr, short *dequant_ptr,
;                                   short *scan_mask, short *round_ptr,
;                                   short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes one block of 16 signed 16-bit coefficients with no dead zone:
;
;     x          = abs(coeff)
;     y          = ((x + round) * quant) >> 16
;     qcoeff[i]  = sign(coeff) * y
;     dqcoeff[i] = qcoeff[i] * dequant[i]          (low 16 bits)
;
; Return value: every lane with qcoeff != 0 contributes its scan_mask word
; to a sum; the low 16 bits of that sum are treated as a bitmask and the
; function returns (index of its highest set bit) + 1, or 0 when the mask
; is empty.
; NOTE(review): this presumably relies on scan_mask[i] holding a distinct
; single bit per lane (1 << scan position) so that the result is the
; end-of-block position -- confirm against the caller's table.
;
; Every buffer touched with movdqa/pmulhw memory operands (coeff, round,
; quant, dequant, scan_mask, qcoeff, dqcoeff) must be 16-byte aligned.
; xmm6/xmm7 are saved in the local frame and restored before returning.
;
; The routine is exported under the documented name
; vp8_fast_quantize_b_impl_sse2.  The historical misspelling
; vp8_fast_quantize_b_impl_ssse2 is kept as an alias for the same entry
; point so that existing references keep linking.
;-----------------------------------------------------------------------
global sym(vp8_fast_quantize_b_impl_sse2)
global sym(vp8_fast_quantize_b_impl_ssse2)
sym(vp8_fast_quantize_b_impl_sse2):
sym(vp8_fast_quantize_b_impl_ssse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 7
        push        rsi
        push        rdi
        push        rbx
        ; end prolog

        ALIGN_STACK 16, rax

        ; Local frame, byte offsets from the aligned rsp.
        %define save_xmm6 0
        %define save_xmm7 16

        %define vp8_fastquantizeb_stack_size save_xmm7 + 16

        sub         rsp, vp8_fastquantizeb_stack_size

        movdqa      XMMWORD PTR[rsp + save_xmm6], xmm6
        movdqa      XMMWORD PTR[rsp + save_xmm7], xmm7

        mov         rdx, arg(0)                         ; coeff_ptr
        mov         rcx, arg(2)                         ; dequant_ptr
        mov         rax, arg(3)                         ; scan_mask
        mov         rdi, arg(4)                         ; round_ptr
        mov         rsi, arg(5)                         ; quant_ptr

        movdqa      xmm0, XMMWORD PTR[rdx]              ; z, lanes 0-7
        movdqa      xmm4, XMMWORD PTR[rdx + 16]         ; z, lanes 8-15

        movdqa      xmm6, XMMWORD PTR[rdi]              ; round lo
        movdqa      xmm7, XMMWORD PTR[rdi + 16]         ; round hi

        movdqa      xmm1, xmm0
        movdqa      xmm5, xmm4

        psraw       xmm0, 15                            ; sz = z >> 15 (0 or -1)
        psraw       xmm4, 15

        pxor        xmm1, xmm0
        pxor        xmm5, xmm4
        psubw       xmm1, xmm0                          ; x = (z ^ sz) - sz = abs(z)
        psubw       xmm5, xmm4

        paddw       xmm1, xmm6                          ; x + round
        paddw       xmm5, xmm7

        pmulhw      xmm1, XMMWORD PTR[rsi]              ; y = ((x + round) * quant) >> 16
        pmulhw      xmm5, XMMWORD PTR[rsi + 16]

        mov         rdi, arg(1)                         ; qcoeff_ptr
        mov         rsi, arg(6)                         ; dqcoeff_ptr

        movdqa      xmm6, XMMWORD PTR[rcx]              ; dequant lo
        movdqa      xmm7, XMMWORD PTR[rcx + 16]         ; dequant hi

        pxor        xmm1, xmm0                          ; put the sign back:
        pxor        xmm5, xmm4
        psubw       xmm1, xmm0                          ;   (y ^ sz) - sz
        psubw       xmm5, xmm4

        movdqa      XMMWORD PTR[rdi], xmm1              ; store qcoeff
        movdqa      XMMWORD PTR[rdi + 16], xmm5

        pmullw      xmm6, xmm1                          ; dqcoeff = dequant * qcoeff
        pmullw      xmm7, xmm5

        movdqa      xmm2, XMMWORD PTR[rax]              ; scan_mask lo
        movdqa      xmm3, XMMWORD PTR[rax+16]           ; scan_mask hi

        ; Turn every nonzero qcoeff lane into the word value 1.
        pxor        xmm4, xmm4                          ; clear all bits
        pcmpeqw     xmm1, xmm4                          ; 0xffff where qcoeff == 0
        pcmpeqw     xmm5, xmm4

        pcmpeqw     xmm4, xmm4                          ; set all bits
        pxor        xmm1, xmm4                          ; 0xffff where qcoeff != 0
        pxor        xmm5, xmm4

        psrlw       xmm1, 15                            ; 1 where qcoeff != 0
        psrlw       xmm5, 15

        ; Pick the scan_mask word of every nonzero lane; pmaddwd already
        ; adds neighbouring pairs into dwords.
        pmaddwd     xmm1, xmm2
        pmaddwd     xmm5, xmm3

        ; Horizontal add of the eight dwords into the low dword of xmm5.
        movq        xmm2, xmm1                          ; low halves (upper half zeroed)
        movq        xmm3, xmm5

        psrldq      xmm1, 8                             ; high halves
        psrldq      xmm5, 8

        paddd       xmm1, xmm5
        paddd       xmm2, xmm3

        paddd       xmm1, xmm2
        movq        xmm5, xmm1

        psrldq      xmm1, 4
        paddd       xmm5, xmm1

        ; Only the low 16 bits are used, so a 32-bit move is sufficient; it
        ; zero-extends into rcx on x86-64 and is also encodable when the ABI
        ; header maps rcx onto ecx (presumably the 32-bit build -- confirm
        ; against x86_abi_support.asm).
        movd        ecx, xmm5
        and         rcx, 0xffff                         ; mask of nonzero positions

        xor         rdx, rdx
        sub         rdx, rcx                            ; rdx = -mask (negative iff mask != 0)

        bsr         rax, rcx                            ; highest set bit (undefined if mask == 0)
        inc         rax

        sar         rdx, 31                             ; all ones if mask != 0, else 0
        and         rax, rdx                            ; force 0 for an empty mask

        movdqa      XMMWORD PTR[rsi], xmm6              ; store dqcoeff
        movdqa      XMMWORD PTR[rsi + 16], xmm7         ; store dqcoeff

        movdqa      xmm6, XMMWORD PTR[rsp + save_xmm6]
        movdqa      xmm7, XMMWORD PTR[rsp + save_xmm7]

        add         rsp, vp8_fastquantizeb_stack_size
        pop         rsp                                 ; undo ALIGN_STACK

        ; begin epilog
        pop         rbx
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
|