vpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
2010-05-18 11:58:33 -04:00

173 lines
4.4 KiB
NASM

;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_mse16x16_neon|
EXPORT |vp8_get16x16pred_error_neon|
EXPORT |vp8_get4x4sse_cs_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;============================
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;note: in this function, sum is never used. So, we can remove this part of calculation
;from vp8_variance().
|vp8_mse16x16_neon| PROC
vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
vmov.i8 q8, #0
vmov.i8 q9, #0
vmov.i8 q10, #0
mov r12, #8
mse16x16_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
vsubl.u8 q11, d0, d4
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vmlal.s16 q7, d22, d22
vmlal.s16 q8, d23, d23
subs r12, r12, #1
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vmlal.s16 q7, d26, d26
vmlal.s16 q8, d27, d27
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne mse16x16_neon_loop
vadd.u32 q7, q7, q8
vadd.u32 q9, q9, q10
ldr r12, [sp] ;load *sse from stack
vadd.u32 q10, q7, q9
vpaddl.u32 q1, q10
vadd.u64 d0, d2, d3
vst1.32 {d0[0]}, [r12]
vmov.32 r0, d0[0]
bx lr
ENDP
;============================
; r0 unsigned char *src_ptr
; r1 int src_stride
; r2 unsigned char *ref_ptr
; r3 int ref_stride
|vp8_get16x16pred_error_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - pred_error
vmov.i8 q10, #0
mov r12, #8
get16x16pred_error_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
vsubl.u8 q11, d0, d4
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vpadal.s16 q8, q11
vmlal.s16 q9, d22, d22
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne get16x16pred_error_neon_loop
vadd.u32 q10, q9, q10
vpaddl.s32 q0, q8
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vshr.s32 d10, d10, #8
vsub.s32 d0, d1, d10
vmov.32 r0, d0[0]
bx lr
ENDP
;=============================
; r0 unsigned char *src_ptr,
; r1 int source_stride,
; r2 unsigned char *ref_ptr,
; r3 int recon_stride
|vp8_get4x4sse_cs_neon| PROC
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d1}, [r0], r1
vld1.8 {d5}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3
vld1.8 {d3}, [r0], r1
vld1.8 {d7}, [r2], r3
vsubl.u8 q11, d0, d4
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vmull.s16 q7, d22, d22
vmull.s16 q8, d24, d24
vmull.s16 q9, d26, d26
vmull.s16 q10, d28, d28
vadd.u32 q7, q7, q8
vadd.u32 q9, q9, q10
vadd.u32 q9, q7, q9
vpaddl.u32 q1, q9
vadd.u64 d0, d2, d3
vmov.32 r0, d0[0]
bx lr
ENDP
END