; vpx/vp8/common/arm/neon/variance_neon.asm

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp8_variance16x16_neon|
    EXPORT  |vp8_variance16x8_neon|
    EXPORT  |vp8_variance8x16_neon|
    EXPORT  |vp8_variance8x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
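;
;The register map above corresponds to the C signature spelled out for the
;other block sizes below; shown here as a reference sketch (the exported
;symbol is vp8_variance16x16_neon):
;unsigned int vp8_variance16x16_neon(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *sse)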
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8                     ;two rows per pass, eight passes

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7
    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
    ;the results into the elements of the destination vector. The explanation
    ;in the ARM guide is wrong.
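    ;For example, the first vpadal.s16 below computes, for i = 0..3,
    ;q8.s32[i] += q11.s16[2*i] + q11.s16[2*i+1], folding eight 16-bit diffs
    ;into four 32-bit sum lanes per instruction.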
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop
    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
    ;vmov.32        r1, d1[0]
    ;mul            r0, r0, r0
    ;str            r1, [r12]
    ;sub            r0, r1, r0, lsr #8
    ; while sum is signed, sum * sum is always positive and must be treated as
    ; unsigned to avoid propagating the sign bit.
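    ; For example, for 16x16 |sum| <= 256 * 255 = 65280, so sum * sum can
    ; reach 4261478400, which fits in 32 bits only as an unsigned value;
    ; the u32 shift below then divides by 256 (the pixel count) without
    ; smearing a sign bit.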
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #8                ;sum*sum/256 (16x16 has 256 pixels)
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
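;
; A scalar C sketch of what the NEON code above computes (illustration only,
; not part of the build; sse_acc and the 16x16 loop bounds are local to this
; sketch):
;
;   int sum = 0;
;   unsigned int sse_acc = 0;
;   for (int i = 0; i < 16; i++) {
;       for (int j = 0; j < 16; j++) {
;           int diff = src_ptr[j] - ref_ptr[j];    /* widened to int */
;           sum += diff;
;           sse_acc += (unsigned int)(diff * diff);
;       }
;       src_ptr += source_stride;
;       ref_ptr += recon_stride;
;   }
;   *sse = sse_acc;
;   /* variance = sse - sum*sum/N, with N = 256, so >> 8 */
;   return sse_acc - (((unsigned int)sum * (unsigned int)sum) >> 8);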
;================================
;unsigned int vp8_variance16x8_c(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *sse)
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4                     ;two rows per pass, four passes

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #7                ;sum*sum/128 (16x8 has 128 pixels)
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
;=================================
;unsigned int vp8_variance8x16_c(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *sse)
|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8                     ;two rows per pass, eight passes

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #7                ;sum*sum/128 (8x16 has 128 pixels)
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
;==================================
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
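;
;Prototype in the convention of the sections above (reference sketch for the
;exported symbol vp8_variance8x8_neon):
;unsigned int vp8_variance8x8_neon(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *sse)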
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2                     ;four rows per pass, two passes

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #6                ;sum*sum/64 (8x8 has 64 pixels)
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

    END
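;
; Example call from C (hypothetical buffers and strides, for illustration):
;
;   unsigned int sse;
;   unsigned int var = vp8_variance16x16_neon(src, src_stride,
;                                             ref, ref_stride, &sse);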