vpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_mse16x16_neon|
    EXPORT  |vp8_get4x4sse_cs_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;============================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
;note: in this function, sum is never used. So, we can remove this part of calculation
;from vp9_variance().

|vp8_mse16x16_neon| PROC
    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
    vmov.i8         q8, #0
    vmov.i8         q9, #0
    vmov.i8         q10, #0

    mov             r12, #8

mse16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmlal.s16       q7, d22, d22
    vmlal.s16       q8, d23, d23

    subs            r12, r12, #1

    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vmlal.s16       q7, d26, d26
    vmlal.s16       q8, d27, d27
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             mse16x16_neon_loop

    vadd.u32        q7, q7, q8
    vadd.u32        q9, q9, q10

    ldr             r12, [sp]               ;load *sse from stack

    vadd.u32        q10, q7, q9
    vpaddl.u32      q1, q10
    vadd.u64        d0, d2, d3

    vst1.32         {d0[0]}, [r12]
    vmov.32         r0, d0[0]

    bx              lr

    ENDP


;=============================
; r0    unsigned char *src_ptr,
; r1    int  source_stride,
; r2    unsigned char *ref_ptr,
; r3    int  recon_stride
|vp8_get4x4sse_cs_neon| PROC
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmull.s16       q7, d22, d22
    vmull.s16       q8, d24, d24
    vmull.s16       q9, d26, d26
    vmull.s16       q10, d28, d28

    vadd.u32        q7, q7, q8
    vadd.u32        q9, q9, q10
    vadd.u32        q9, q7, q9

    vpaddl.u32      q1, q9
    vadd.u64        d0, d2, d3

    vmov.32         r0, d0[0]
    bx              lr

    ENDP

    END
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`
Use WebM in copyright notice for consistency Changes 'The VP8 project' to 'The WebM project', for consistency with other webmproject.org repositories. Fixes issue #97. Change-Id: I37c13ed5fbdb9d334ceef71c6350e9febed9bbba 2010-09-09 14:16:39 +02:00			`; Copyright (c) 2010 The WebM project authors. All Rights Reserved.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`; Use of this source code is governed by a BSD-style license`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`; that can be found in the LICENSE file in the root of the source`
			`; tree. An additional intellectual property rights grant can be found`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`; in the file PATENTS. All contributing project authors may`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`; be found in the AUTHORS file in the root of the source tree.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`


			`EXPORT \|vp8_mse16x16_neon\|`
			`EXPORT \|vp8_get4x4sse_cs_neon\|`

			`ARM`
			`REQUIRE8`
			`PRESERVE8`

			`AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2`
			`;============================`
			`; r0 unsigned char *src_ptr`
			`; r1 int source_stride`
			`; r2 unsigned char *ref_ptr`
			`; r3 int recon_stride`
			`; stack unsigned int *sse`
			`;note: in this function, sum is never used. So, we can remove this part of calculation`
Remove vp8 in local symbols. For non-static functions, change the prefix to vp9_. For static functions, remove the prefix. Also fix some comments, remove unused code or unused function prototypes. Change-Id: I1f8be05362f66060fe421c3d4c9a906fdf835de5 2012-10-31 22:40:53 +01:00			`;from vp9_variance().`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`\|vp8_mse16x16_neon\| PROC`
			`vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse`
			`vmov.i8 q8, #0`
			`vmov.i8 q9, #0`
			`vmov.i8 q10, #0`

			`mov r12, #8`

			`mse16x16_neon_loop`
			`vld1.8 {q0}, [r0], r1 ;Load up source and reference`
			`vld1.8 {q2}, [r2], r3`
			`vld1.8 {q1}, [r0], r1`
			`vld1.8 {q3}, [r2], r3`

			`vsubl.u8 q11, d0, d4`
			`vsubl.u8 q12, d1, d5`
			`vsubl.u8 q13, d2, d6`
			`vsubl.u8 q14, d3, d7`

			`vmlal.s16 q7, d22, d22`
			`vmlal.s16 q8, d23, d23`

			`subs r12, r12, #1`

			`vmlal.s16 q9, d24, d24`
			`vmlal.s16 q10, d25, d25`
			`vmlal.s16 q7, d26, d26`
			`vmlal.s16 q8, d27, d27`
			`vmlal.s16 q9, d28, d28`
			`vmlal.s16 q10, d29, d29`

			`bne mse16x16_neon_loop`

			`vadd.u32 q7, q7, q8`
			`vadd.u32 q9, q9, q10`

			`ldr r12, [sp] ;load *sse from stack`

			`vadd.u32 q10, q7, q9`
			`vpaddl.u32 q1, q10`
			`vadd.u64 d0, d2, d3`

			`vst1.32 {d0[0]}, [r12]`
			`vmov.32 r0, d0[0]`

			`bx lr`

			`ENDP`


			`;=============================`
			`; r0 unsigned char *src_ptr,`
			`; r1 int source_stride,`
			`; r2 unsigned char *ref_ptr,`
			`; r3 int recon_stride`
			`\|vp8_get4x4sse_cs_neon\| PROC`
			`vld1.8 {d0}, [r0], r1 ;Load up source and reference`
			`vld1.8 {d4}, [r2], r3`
			`vld1.8 {d1}, [r0], r1`
			`vld1.8 {d5}, [r2], r3`
			`vld1.8 {d2}, [r0], r1`
			`vld1.8 {d6}, [r2], r3`
			`vld1.8 {d3}, [r0], r1`
			`vld1.8 {d7}, [r2], r3`

			`vsubl.u8 q11, d0, d4`
			`vsubl.u8 q12, d1, d5`
			`vsubl.u8 q13, d2, d6`
			`vsubl.u8 q14, d3, d7`

			`vmull.s16 q7, d22, d22`
			`vmull.s16 q8, d24, d24`
			`vmull.s16 q9, d26, d26`
			`vmull.s16 q10, d28, d28`

			`vadd.u32 q7, q7, q8`
			`vadd.u32 q9, q9, q10`
			`vadd.u32 q9, q7, q9`

			`vpaddl.u32 q1, q9`
			`vadd.u64 d0, d2, d3`

			`vmov.32 r0, d0[0]`
			`bx lr`

			`ENDP`

			`END`