vpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
Yaowu Xu 361717d2be remove one set of 16x16 variance functions
Calls to this set of functions are replaced by var16x16.

Change-Id: I5ff1effc6c1358ea06cda1517b88ec28ef551b0d
2011-06-09 11:23:05 -07:00

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
    EXPORT  |vp8_mse16x16_neon|
    EXPORT  |vp8_get4x4sse_cs_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;============================
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;note: in this function, sum is never used, so that part of the vp8_variance()
;calculation has been dropped. (A rough C-equivalent sketch follows.)
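;
; A rough C-equivalent sketch of what this routine computes (added for
; reference only; parameter names follow the register notes above and the
; unsigned return type is an assumption):
;
;   unsigned int vp8_mse16x16_neon(unsigned char *src_ptr, int source_stride,
;                                  unsigned char *ref_ptr, int recon_stride,
;                                  unsigned int *sse)
;   {
;       unsigned int sum_sq = 0;
;       int r, c;
;       for (r = 0; r < 16; r++) {
;           for (c = 0; c < 16; c++) {
;               int d = src_ptr[c] - ref_ptr[c];
;               sum_sq += d * d;
;           }
;           src_ptr += source_stride;
;           ref_ptr += recon_stride;
;       }
;       *sse = sum_sq;
;       return sum_sq;          /* the assembly returns the same value in r0 */
;   }
;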
|vp8_mse16x16_neon| PROC
    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse accumulators
    vmov.i8         q8, #0
    vmov.i8         q9, #0
    vmov.i8         q10, #0

    mov             r12, #8                     ;loop counter: 8 iterations, 2 rows each = 16 rows

mse16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;widen to 16-bit differences
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmlal.s16       q7, d22, d22                ;accumulate squared differences
    vmlal.s16       q8, d23, d23

    subs            r12, r12, #1

    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vmlal.s16       q7, d26, d26
    vmlal.s16       q8, d27, d27
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             mse16x16_neon_loop

    vadd.u32        q7, q7, q8                  ;reduce the four accumulators
    vadd.u32        q9, q9, q10

    ldr             r12, [sp]                   ;load *sse from stack

    vadd.u32        q10, q7, q9
    vpaddl.u32      q1, q10
    vadd.u64        d0, d2, d3

    vst1.32         {d0[0]}, [r12]              ;store sse through the pointer
    vmov.32         r0, d0[0]                   ;and return the same value in r0

    bx              lr

    ENDP
;=============================
; r0 unsigned char *src_ptr,
; r1 int source_stride,
; r2 unsigned char *ref_ptr,
; r3 int recon_stride
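;
; A rough C-equivalent sketch (added for reference only; parameter names follow
; the register notes above). Only a 4x4 block is summed, and the result is
; returned in r0 with no sse pointer:
;
;   unsigned int vp8_get4x4sse_cs_neon(unsigned char *src_ptr, int source_stride,
;                                      unsigned char *ref_ptr, int recon_stride)
;   {
;       unsigned int sum_sq = 0;
;       int r, c;
;       for (r = 0; r < 4; r++) {
;           for (c = 0; c < 4; c++) {
;               int d = src_ptr[c] - ref_ptr[c];
;               sum_sq += d * d;
;           }
;           src_ptr += source_stride;
;           ref_ptr += recon_stride;
;       }
;       return sum_sq;
;   }
;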
|vp8_get4x4sse_cs_neon| PROC
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;widen to 16-bit differences
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmull.s16       q7, d22, d22                ;square only the low 4 differences of each row (4x4 block)
    vmull.s16       q8, d24, d24
    vmull.s16       q9, d26, d26
    vmull.s16       q10, d28, d28

    vadd.u32        q7, q7, q8                  ;reduce to a single sum
    vadd.u32        q9, q9, q10
    vadd.u32        q9, q7, q9

    vpaddl.u32      q1, q9
    vadd.u64        d0, d2, d3

    vmov.32         r0, d0[0]                   ;return the 4x4 sse in r0
    bx              lr

    ENDP

    END