vpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
2010-05-18 11:58:33 -04:00

118 lines
3.7 KiB
NASM

;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
; Make the quantizer entry point visible to the linker.
EXPORT |vp8_fast_quantize_b_neon_func|
; Assemble the following code as ARM (A32) instructions.
ARM
; Declare that this code requires, and preserves, 8-byte stack alignment.
REQUIRE8
PRESERVE8
; Read-only code section, aligned to 2^2 = 4 bytes.
AREA ||.text||, CODE, READONLY, ALIGN=2
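; r0 short *coeff_ptr
; r1 short *zbin_ptr
; r2 short *qcoeff_ptr
; r3 short *dqcoeff_ptr
; stack short *dequant_ptr   [sp]
; stack short *scan_mask     [sp, #4]  (loaded as the rvsplus1_scan_order table)
; stack short *round_ptr     [sp, #8]
; stack short *quant_ptr     [sp, #12]
; return int eob (returned by value in r0, not through a pointer)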
;-----------------------------------------------------------------------
; vp8_fast_quantize_b_neon_func
;
; Quantizes one block of 16 signed 16-bit coefficients, writes the
; quantized and de-quantized results, and returns the end-of-block value.
;
; In:    r0        = coeff_ptr    (16 x s16, read)
;        r1        = zbin_ptr     (16 x s16, read)
;        r2        = qcoeff_ptr   (16 x s16, written)
;        r3        = dqcoeff_ptr  (16 x s16, written)
;        [sp]      = dequant_ptr  (16 x s16, read)
;        [sp, #4]  = scan table   (16 x u16, read; "rvsplus1_scan_order")
;        [sp, #8]  = round_ptr    (16 x s16, read)
;        [sp, #12] = quant_ptr    (16 x s16, read)
; Out:   r0 = largest scan-table entry whose quantized coefficient is
;             non-zero, or 0 when every coefficient quantizes to zero.
;             NOTE(review): presumably the table holds scan position + 1,
;             making this the eob index - confirm against the caller.
; Clobb: r0, r1, r12, q0-q3, q8-q15, flags.
;
; Only caller-saved NEON registers are used. q4-q7 (d8-d15) are
; callee-saved under the AAPCS and must not be modified without being
; saved, so they are deliberately left untouched.
;-----------------------------------------------------------------------
|vp8_fast_quantize_b_neon_func| PROC
    vld1.16         {q0, q1}, [r0]          ;load z
    vld1.16         {q10, q11}, [r1]        ;load zbin

    vabs.s16        q12, q0                 ;calculate x = abs(z)
    vabs.s16        q13, q1

    vcge.s16        q10, q12, q10           ;mask = (x >= zbin), all 1 / all 0 per lane
    vcge.s16        q11, q13, q11

    ;if x<zbin for every lane (q10 & q11 are all 0), go to zero_output
    vorr.s16        q14, q10, q11           ;OR-reduce the 256 mask bits ...
    vorr.s16        d28, d28, d29           ;... down to 64 bits ...
    vmov            r0, r1, d28
    orr             r0, r0, r1              ;... and then to 32 bits
    cmp             r0, #0
    beq             zero_output

    ldr             r0, [sp, #8]            ;load round_ptr
    ldr             r12, [sp, #12]          ;load quant_ptr

    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
    vshr.s16        q2, q0, #15             ; sz
    vshr.s16        q3, q1, #15

    vld1.s16        {q14, q15}, [r0]        ;load round_ptr [0-15]
    vld1.s16        {q8, q9}, [r12]         ;load quant_ptr [0-15]

    vadd.s16        q12, q14                ;x + Round
    vadd.s16        q13, q15

    ldr             r0, [sp, #4]            ;load rvsplus1_scan_order ptr

    vqdmulh.s16     q12, q8                 ;y = ((Round + abs(z)) * Quant) >> 16
    vqdmulh.s16     q13, q9

    vld1.16         {q0, q1}, [r0]          ;load rvsplus1_scan_order (z is no longer needed)

    vceq.s16        q8, q8                  ;set q8 to all 1

    vshr.s16        q12, #1                 ;right shift 1 after vqdmulh (undo its doubling)
    vshr.s16        q13, #1

    ;modify data to have its original sign
    veor.s16        q12, q2                 ; y^sz
    veor.s16        q13, q3

    ldr             r12, [sp]               ;load dequant_ptr

    vsub.s16        q12, q2                 ; x1 = (y^sz) - sz = (y^sz) - (-1) (two's complement)
    vsub.s16        q13, q3

    vand.s16        q12, q10                ;mask off x1 elements whose x < zbin
    vand.s16        q13, q11

    vld1.s16        {q2, q3}, [r12]         ;load dequant_ptr[i] (sz is dead, reuse q2/q3)

    vtst.16         q14, q12, q8            ;now find eob
    vtst.16         q15, q13, q8            ;lanes with non-zero x1 become all 1 in q14, q15

    vst1.s16        {q12, q13}, [r2]        ;store: qcoeff = x1

    vand            q0, q0, q14             ;get all valid number from rvsplus1_scan_order array
    vand            q1, q1, q15

    vmax.u16        q0, q0, q1              ;find maximum value in q0, q1
    vmax.u16        d0, d0, d1
    vmovl.u16       q0, d0                  ;widen the 4 remaining candidates to 32 bits

    vmul.s16        q2, q12                 ;x * Dequant
    vmul.s16        q3, q13

    vmax.u32        d0, d0, d1
    vpmax.u32       d0, d0, d0              ;final pairwise max leaves the result in d0[0]

    vst1.s16        {q2, q3}, [r3]          ;store dqcoeff = x * Dequant

    vmov.32         r0, d0[0]               ;return value
    bx              lr

zero_output
    ;q10/q11 hold the (x >= zbin) mask, which is all 0 on this path
    vst1.s16        {q10, q11}, [r2]        ; qcoeff = 0
    vst1.s16        {q10, q11}, [r3]        ; dqcoeff = 0
    mov             r0, #0                  ;no non-zero coefficient

    bx              lr

    ENDP

    END