vpx/vp8/encoder/arm/neon/fastfdct8x4_neon.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_fast_fdct8x4_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
;NOTE:
;The input *src_diff. src_diff is calculated as:
;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
;In which *src_ptr and *pred_ptr both are unsigned char.
;Therefore, *src_diff should be in the range of [-255, 255].
;CAUTION:
;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.

|vp8_fast_fdct8x4_neon| PROC
    vld1.16         {q1}, [r0], r2              ;load input
    ldr             r12, _ffdct8_coeff_
    vld1.16         {q2}, [r0], r2
    vld1.16         {q3}, [r0], r2
    vld1.16         {d0}, [r12]
    vld1.16         {q4}, [r0], r2

    ;First for-loop
    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
    vtrn.32         d2, d6
    vtrn.32         d3, d7
    vtrn.32         d4, d8
    vtrn.32         d5, d9
    vtrn.16         d2, d4
    vtrn.16         d3, d5
    vtrn.16         d6, d8
    vtrn.16         d7, d9

    vadd.s16        d10, d2, d8             ;ip[0]+ip[3]
    vadd.s16        d11, d4, d6             ;ip[1]+ip[2]
    vsub.s16        d12, d4, d6             ;ip[1]-ip[2]
    vsub.s16        d13, d2, d8             ;ip[0]-ip[3]
    vadd.s16        d22, d3, d9
    vadd.s16        d23, d5, d7
    vsub.s16        d24, d5, d7
    vsub.s16        d25, d3, d9

    vshl.i16        q5, q5, #1              ; a1, b1
    vshl.i16        q6, q6, #1              ; c1, d1
    vshl.i16        q1, q11, #1
    vshl.i16        q2, q12, #1

    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
    vadd.s16        d24, d2, d3
    vsub.s16        d25, d2, d3

    vqdmulh.s16     q8, q7, d0[1]
    vqdmulh.s16     q13, q12, d0[1]
    vqdmulh.s16     q10, q6, d0[0]
    vqdmulh.s16     q15, q2, d0[0]
    vqdmulh.s16     q9, q6, d0[2]
    vqdmulh.s16     q14, q2, d0[2]

    vshr.s16        q8, q8, #1
    vshr.s16        q13, q13, #1
    vshr.s16        q10, q10, #1
    vshr.s16        q15, q15, #1
    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
    vadd.s16        q15, q2, q15            ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d14, d16            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d3, d24, d26            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d6, d15, d17            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d7, d25, d27            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d4, d18, d21            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
    vadd.s16        d5, d28, d31            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
    vsub.s16        d8, d19, d20            ;op[3] = temp1 - temp2
    vsub.s16        d9, d29, d30            ;op[3] = temp1 - temp2

    ;Second for-loop
    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
    vtrn.32         d2, d6
    vtrn.32         d3, d7
    vtrn.32         d4, d8
    vtrn.32         d5, d9
    vtrn.16         d2, d4
    vtrn.16         d3, d5
    vtrn.16         d6, d8
    vtrn.16         d7, d9

    vadd.s16        d10, d2, d8             ;a1 = ip[0]+ip[12]
    vadd.s16        d11, d4, d6             ;b1 = ip[4]+ip[8]
    vsub.s16        d12, d4, d6             ;c1 = ip[4]-ip[8]
    vsub.s16        d13, d2, d8             ;d1 = ip[0]-ip[12]
    vadd.s16        d2, d3, d9
    vadd.s16        d4, d5, d7
    vsub.s16        d24, d5, d7
    vsub.s16        d25, d3, d9

    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
    vadd.s16        d22, d2, d4
    vsub.s16        d23, d2, d4

    vqdmulh.s16     q8, q7, d0[1]
    vqdmulh.s16     q13, q11, d0[1]
    vqdmulh.s16     q10, q6, d0[0]
    vqdmulh.s16     q15, q12, d0[0]
    vqdmulh.s16     q9, q6, d0[2]
    vqdmulh.s16     q14, q12, d0[2]

    vshr.s16        q8, q8, #1
    vshr.s16        q13, q13, #1
    vshr.s16        q10, q10, #1
    vshr.s16        q15, q15, #1
    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
    vadd.s16        q15, q12, q15           ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d14, d16            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d6, d22, d26            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d4, d15, d17            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d8, d23, d27            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d3, d18, d21            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
    vadd.s16        d7, d28, d31            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
    vsub.s16        d5, d19, d20            ;d2 = temp1 - temp2
    vsub.s16        d9, d29, d30            ;d2 = temp1 - temp2

    vclt.s16        q5, q1, #0
    vclt.s16        q6, q2, #0
    vclt.s16        q7, q3, #0
    vclt.s16        q8, q4, #0

    vsub.s16        q1, q1, q5
    vsub.s16        q2, q2, q6
    vsub.s16        q3, q3, q7
    vsub.s16        q4, q4, q8

    vshr.s16        q1, q1, #1
    vshr.s16        q2, q2, #1
    vshr.s16        q3, q3, #1
    vshr.s16        q4, q4, #1

    vst1.16         {q1, q2}, [r1]!
    vst1.16         {q3, q4}, [r1]

    bx              lr

    ENDP

;-----------------

_ffdct8_coeff_
    DCD     ffdct8_coeff
ffdct8_coeff
; 60547 =  0xEC83
; 46341 =  0xB505
; 25080 =  0x61F8
    DCD     0xB505EC83, 0x000061F8

    END
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`
Use WebM in copyright notice for consistency Changes 'The VP8 project' to 'The WebM project', for consistency with other webmproject.org repositories. Fixes issue #97. Change-Id: I37c13ed5fbdb9d334ceef71c6350e9febed9bbba 2010-09-09 14:16:39 +02:00			`; Copyright (c) 2010 The WebM project authors. All Rights Reserved.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`; Use of this source code is governed by a BSD-style license`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`; that can be found in the LICENSE file in the root of the source`
			`; tree. An additional intellectual property rights grant can be found`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`; in the file PATENTS. All contributing project authors may`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`; be found in the AUTHORS file in the root of the source tree.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`


			`EXPORT \|vp8_fast_fdct8x4_neon\|`

			`ARM`
			`REQUIRE8`
			`PRESERVE8`

			`AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2`
			`;void vp8_fast_fdct4x4_c(short input, short output, int pitch);`
			`;NOTE:`
			`;The input *src_diff. src_diff is calculated as:`
			`;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)`
			`;In which src_ptr and pred_ptr both are unsigned char.`
			`;Therefore, *src_diff should be in the range of [-255, 255].`
			`;CAUTION:`
			`;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].`
			`;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes`
			`;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.`

			`\|vp8_fast_fdct8x4_neon\| PROC`
			`vld1.16 {q1}, [r0], r2 ;load input`
			`ldr r12, _ffdct8_coeff_`
			`vld1.16 {q2}, [r0], r2`
			`vld1.16 {q3}, [r0], r2`
			`vld1.16 {d0}, [r12]`
			`vld1.16 {q4}, [r0], r2`

			`;First for-loop`
			`;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]`
			`;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]`
			`vtrn.32 d2, d6`
			`vtrn.32 d3, d7`
			`vtrn.32 d4, d8`
			`vtrn.32 d5, d9`
			`vtrn.16 d2, d4`
			`vtrn.16 d3, d5`
			`vtrn.16 d6, d8`
			`vtrn.16 d7, d9`

			`vadd.s16 d10, d2, d8 ;ip[0]+ip[3]`
			`vadd.s16 d11, d4, d6 ;ip[1]+ip[2]`
			`vsub.s16 d12, d4, d6 ;ip[1]-ip[2]`
			`vsub.s16 d13, d2, d8 ;ip[0]-ip[3]`
			`vadd.s16 d22, d3, d9`
			`vadd.s16 d23, d5, d7`
			`vsub.s16 d24, d5, d7`
			`vsub.s16 d25, d3, d9`

			`vshl.i16 q5, q5, #1 ; a1, b1`
			`vshl.i16 q6, q6, #1 ; c1, d1`
			`vshl.i16 q1, q11, #1`
			`vshl.i16 q2, q12, #1`

			`vadd.s16 d14, d10, d11 ;temp1 = a1 + b1`
			`vsub.s16 d15, d10, d11 ;temp2 = a1 - b1`
			`vadd.s16 d24, d2, d3`
			`vsub.s16 d25, d2, d3`

			`vqdmulh.s16 q8, q7, d0[1]`
			`vqdmulh.s16 q13, q12, d0[1]`
			`vqdmulh.s16 q10, q6, d0[0]`
			`vqdmulh.s16 q15, q2, d0[0]`
			`vqdmulh.s16 q9, q6, d0[2]`
			`vqdmulh.s16 q14, q2, d0[2]`

			`vshr.s16 q8, q8, #1`
			`vshr.s16 q13, q13, #1`
			`vshr.s16 q10, q10, #1`
			`vshr.s16 q15, q15, #1`
			`vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16`
			`vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16`
			`vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1`
			`vadd.s16 q15, q2, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1`

			`vadd.s16 d2, d14, d16 ;op[0] = ((temp1 * x_c2 )>>16) + temp1`
			`vadd.s16 d3, d24, d26 ;op[0] = ((temp1 * x_c2 )>>16) + temp1`
			`vadd.s16 d6, d15, d17 ;op[2] = ((temp2 * x_c2 )>>16) + temp2`
			`vadd.s16 d7, d25, d27 ;op[2] = ((temp2 * x_c2 )>>16) + temp2`
			`vadd.s16 d4, d18, d21 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection`
			`vadd.s16 d5, d28, d31 ;op[1] = temp1 + temp2 -- q is not necessary, just for protection`
			`vsub.s16 d8, d19, d20 ;op[3] = temp1 - temp2`
			`vsub.s16 d9, d29, d30 ;op[3] = temp1 - temp2`

			`;Second for-loop`
			`;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]`
			`;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]`
			`vtrn.32 d2, d6`
			`vtrn.32 d3, d7`
			`vtrn.32 d4, d8`
			`vtrn.32 d5, d9`
			`vtrn.16 d2, d4`
			`vtrn.16 d3, d5`
			`vtrn.16 d6, d8`
			`vtrn.16 d7, d9`

			`vadd.s16 d10, d2, d8 ;a1 = ip[0]+ip[12]`
			`vadd.s16 d11, d4, d6 ;b1 = ip[4]+ip[8]`
			`vsub.s16 d12, d4, d6 ;c1 = ip[4]-ip[8]`
			`vsub.s16 d13, d2, d8 ;d1 = ip[0]-ip[12]`
			`vadd.s16 d2, d3, d9`
			`vadd.s16 d4, d5, d7`
			`vsub.s16 d24, d5, d7`
			`vsub.s16 d25, d3, d9`

			`vadd.s16 d14, d10, d11 ;temp1 = a1 + b1`
			`vsub.s16 d15, d10, d11 ;temp2 = a1 - b1`
			`vadd.s16 d22, d2, d4`
			`vsub.s16 d23, d2, d4`

			`vqdmulh.s16 q8, q7, d0[1]`
			`vqdmulh.s16 q13, q11, d0[1]`
			`vqdmulh.s16 q10, q6, d0[0]`
			`vqdmulh.s16 q15, q12, d0[0]`
			`vqdmulh.s16 q9, q6, d0[2]`
			`vqdmulh.s16 q14, q12, d0[2]`

			`vshr.s16 q8, q8, #1`
			`vshr.s16 q13, q13, #1`
			`vshr.s16 q10, q10, #1`
			`vshr.s16 q15, q15, #1`
			`vshr.s16 q9, q9, #1 ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16`
			`vshr.s16 q14, q14, #1 ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16`
			`vadd.s16 q10, q6, q10 ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1`
			`vadd.s16 q15, q12, q15 ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1`

			`vadd.s16 d2, d14, d16 ;a2 = ((temp1 * x_c2 )>>16) + temp1`
			`vadd.s16 d6, d22, d26 ;a2 = ((temp1 * x_c2 )>>16) + temp1`
			`vadd.s16 d4, d15, d17 ;c2 = ((temp2 * x_c2 )>>16) + temp2`
			`vadd.s16 d8, d23, d27 ;c2 = ((temp2 * x_c2 )>>16) + temp2`
			`vadd.s16 d3, d18, d21 ;b2 = temp1 + temp2 -- q is not necessary, just for protection`
			`vadd.s16 d7, d28, d31 ;b2 = temp1 + temp2 -- q is not necessary, just for protection`
			`vsub.s16 d5, d19, d20 ;d2 = temp1 - temp2`
			`vsub.s16 d9, d29, d30 ;d2 = temp1 - temp2`

			`vclt.s16 q5, q1, #0`
			`vclt.s16 q6, q2, #0`
			`vclt.s16 q7, q3, #0`
			`vclt.s16 q8, q4, #0`

			`vsub.s16 q1, q1, q5`
			`vsub.s16 q2, q2, q6`
			`vsub.s16 q3, q3, q7`
			`vsub.s16 q4, q4, q8`

			`vshr.s16 q1, q1, #1`
			`vshr.s16 q2, q2, #1`
			`vshr.s16 q3, q3, #1`
			`vshr.s16 q4, q4, #1`

			`vst1.16 {q1, q2}, [r1]!`
			`vst1.16 {q3, q4}, [r1]`

			`bx lr`

			`ENDP`

			`;-----------------`
Adds "armvX-none-rvct" targets Adds following targets to configure script to support RVCT compilation without operating system support (for Profiler or bare metal images). - armv5te-none-rvct - armv6-none-rvct - armv7-none-rvct To strip OS specific parts from the code "os_support"-config was added to script and CONFIG_OS_SUPPORT flag is used in the code to exclude OS specific parts such as OS specific includes and function calls for timers and threads etc. This was done to enable RVCT compilation for profiling purposes or running the image on bare metal target with Lauterbach. Removed separate AREA directives for READONLY data in armv6 and neon assembly files to fix the RVCT compilation. Otherwise "ldr <reg>, =label" syntax would have been needed to prevent linker errors. This syntax is not supported by older gnu assemblers. Change-Id: I14f4c68529e8c27397502fbc3010a54e505ddb43 2011-01-24 10:21:40 +01:00
Initial WebM release 2010-05-18 17:58:33 +02:00			`_ffdct8_coeff_`
			`DCD ffdct8_coeff`
			`ffdct8_coeff`
			`; 60547 = 0xEC83`
			`; 46341 = 0xB505`
			`; 25080 = 0x61F8`
			`DCD 0xB505EC83, 0x000061F8`

			`END`