vpx/vpx_dsp/arm/idct4x4_add_neon.asm

;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vpx_idct4x4_16_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA     Block, CODE, READONLY ; name this block of code
;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0  int16_t input
; r1  uint8_t *dest
; r2  int dest_stride)

|vpx_idct4x4_16_add_neon| PROC

    ; The 2D transform is done with two passes which are actually pretty
    ; similar. We first transform the rows. This is done by transposing
    ; the inputs, doing an SIMD column transform (the columns are the
    ; transposed rows) and then transpose the results (so that it goes back
    ; in normal/row positions). Then, we transform the columns by doing
    ; another SIMD column transform.
    ; So, two passes of a transpose followed by a column transform.

    ; load the inputs into q8-q9, d16-d19
    vld1.s16        {q8,q9}, [r0]!

    ; generate scalar constants
    ; cospi_8_64 = 15137
    movw            r0, #0x3b21
    ; cospi_16_64 = 11585
    movw            r3, #0x2d41
    ; cospi_24_64 = 6270
    movw            r12, #0x187e

    ; transpose the input data
    ; 00 01 02 03   d16
    ; 10 11 12 13   d17
    ; 20 21 22 23   d18
    ; 30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19

    ; generate constant vectors
    vdup.16         d20, r0         ; replicate cospi_8_64
    vdup.16         d21, r3         ; replicate cospi_16_64

    ; 00 10 02 12   d16
    ; 01 11 03 13   d17
    ; 20 30 22 32   d18
    ; 21 31 23 33   d19
    vtrn.32         q8, q9
    ; 00 10 20 30   d16
    ; 01 11 21 31   d17
    ; 02 12 22 32   d18
    ; 03 13 23 33   d19

    vdup.16         d22, r12        ; replicate cospi_24_64

    ; do the transform on transposed rows

    ; stage 1
    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
    vsub.s16  d24, d16, d18         ; (input[0] - input[2])

    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
    vmull.s16 q13, d23, d21
    vmull.s16 q14, d24, d21

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
    vmlsl.s16 q15, d19, d20
    vmlal.s16 q1,  d19, d22

    ; dct_const_round_shift
    vqrshrn.s32 d26, q13, #14
    vqrshrn.s32 d27, q14, #14
    vqrshrn.s32 d29, q15, #14
    vqrshrn.s32 d28, q1,  #14

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16 q8,  q13, q14
    vsub.s16 q9,  q13, q14
    vswp     d18, d19

    ; transpose the results
    ; 00 01 02 03   d16
    ; 10 11 12 13   d17
    ; 20 21 22 23   d18
    ; 30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19
    ; 00 10 02 12   d16
    ; 01 11 03 13   d17
    ; 20 30 22 32   d18
    ; 21 31 23 33   d19
    vtrn.32         q8, q9
    ; 00 10 20 30   d16
    ; 01 11 21 31   d17
    ; 02 12 22 32   d18
    ; 03 13 23 33   d19

    ; do the transform on columns

    ; stage 1
    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
    vsub.s16  d24, d16, d18         ; (input[0] - input[2])

    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
    vmull.s16 q13, d23, d21
    vmull.s16 q14, d24, d21

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
    vmlsl.s16 q15, d19, d20
    vmlal.s16 q1,  d19, d22

    ; dct_const_round_shift
    vqrshrn.s32 d26, q13, #14
    vqrshrn.s32 d27, q14, #14
    vqrshrn.s32 d29, q15, #14
    vqrshrn.s32 d28, q1,  #14

    ; stage 2
    ; output[0] = step[0] + step[3];
    ; output[1] = step[1] + step[2];
    ; output[3] = step[0] - step[3];
    ; output[2] = step[1] - step[2];
    vadd.s16 q8,  q13, q14
    vsub.s16 q9,  q13, q14

    ; The results are in two registers, one of them being swapped. This will
    ; be taken care of by loading the 'dest' value in a swapped fashion and
    ; also storing them in the same swapped fashion.
    ; temp_out[0, 1] = d16, d17 = q8
    ; temp_out[2, 3] = d19, d18 = q9 swapped

    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
    vrshr.s16 q8, q8, #4
    vrshr.s16 q9, q9, #4

    vld1.32 {d26[0]}, [r1], r2
    vld1.32 {d26[1]}, [r1], r2
    vld1.32 {d27[1]}, [r1], r2
    vld1.32 {d27[0]}, [r1]  ; no post-increment

    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
    vaddw.u8 q8, q8, d26
    vaddw.u8 q9, q9, d27

    ; clip_pixel
    vqmovun.s16 d26, q8
    vqmovun.s16 d27, q9

    ; do the stores in reverse order with negative post-increment, by changing
    ; the sign of the stride
    rsb r2, r2, #0
    vst1.32 {d27[0]}, [r1], r2
    vst1.32 {d27[1]}, [r1], r2
    vst1.32 {d26[1]}, [r1], r2
    vst1.32 {d26[0]}, [r1]  ; no post-increment
    bx              lr
    ENDP  ; |vpx_idct4x4_16_add_neon|

    END
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00			`;`
			`; Copyright (c) 2013 The WebM project authors. All Rights Reserved.`
			`;`
			`; Use of this source code is governed by a BSD-style license`
			`; that can be found in the LICENSE file in the root of the source`
			`; tree. An additional intellectual property rights grant can be found`
			`; in the file PATENTS. All contributing project authors may`
			`; be found in the AUTHORS file in the root of the source tree.`
			`;`

Replace vp9_ prefix with vpx_ prefix in vpx_dsp function names This commit clears the function naming convention in vpx_dsp. It replaces vp9_ prefix of global functions with vpx_ prefix. It also removes the vp9_ prefix from static functions. Change-Id: I6394359a63b71a51dda01342eec6a3cc08dfeedf 2015-08-03 23:51:10 +02:00			`EXPORT \|vpx_idct4x4_16_add_neon\|`
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00			`ARM`
			`REQUIRE8`
			`PRESERVE8`

			`AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2`

			`AREA Block, CODE, READONLY ; name this block of code`
Replace vp9_ prefix with vpx_ prefix in vpx_dsp function names This commit clears the function naming convention in vpx_dsp. It replaces vp9_ prefix of global functions with vpx_ prefix. It also removes the vp9_ prefix from static functions. Change-Id: I6394359a63b71a51dda01342eec6a3cc08dfeedf 2015-08-03 23:51:10 +02:00			`;void vpx_idct4x4_16_add_neon(int16_t input, uint8_t dest, int dest_stride)`
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00			`;`
			`; r0 int16_t input`
			`; r1 uint8_t *dest`
			`; r2 int dest_stride)`

Replace vp9_ prefix with vpx_ prefix in vpx_dsp function names This commit clears the function naming convention in vpx_dsp. It replaces vp9_ prefix of global functions with vpx_ prefix. It also removes the vp9_ prefix from static functions. Change-Id: I6394359a63b71a51dda01342eec6a3cc08dfeedf 2015-08-03 23:51:10 +02:00			`\|vpx_idct4x4_16_add_neon\| PROC`
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00
			`; The 2D transform is done with two passes which are actually pretty`
			`; similar. We first transform the rows. This is done by transposing`
			`; the inputs, doing an SIMD column transform (the columns are the`
			`; transposed rows) and then transpose the results (so that it goes back`
			`; in normal/row positions). Then, we transform the columns by doing`
			`; another SIMD column transform.`
			`; So, two passes of a transpose followed by a column transform.`

			`; load the inputs into q8-q9, d16-d19`
Add neon optimize vp9_short_idct10_8x8_add. vp9_short_idct10_8x8_add is used to handle the block that only have valid data at top left 4x4 block. All the other datas are 0. So we could cut several unnecessary calculations in order to save instructions. Change-Id: I34fda95e29082b789aded97c2df193991c2d9195 2013-08-17 01:36:07 +02:00			`vld1.s16 {q8,q9}, [r0]!`
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00
			`; generate scalar constants`
vpx_dsp/idct*_neon.asm: simplify immediate loads mov supports 0-65535 Change-Id: I019de0d784836d7bd60e6b36f2cdeefb541cb3fd 2016-10-05 20:50:06 +02:00			`; cospi_8_64 = 15137`
			`movw r0, #0x3b21`
			`; cospi_16_64 = 11585`
			`movw r3, #0x2d41`
			`; cospi_24_64 = 6270`
			`movw r12, #0x187e`
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00
			`; transpose the input data`
			`; 00 01 02 03 d16`
			`; 10 11 12 13 d17`
			`; 20 21 22 23 d18`
			`; 30 31 32 33 d19`
			`vtrn.16 d16, d17`
			`vtrn.16 d18, d19`
Optimise idct4x4: rearrange the instructions a bit to improve instruction scheduling. Change-Id: I5ea881a6e419f9e8ed4b3b619406403b4de24134 2013-08-22 20:02:22 +02:00
			`; generate constant vectors`
			`vdup.16 d20, r0 ; replicate cospi_8_64`
			`vdup.16 d21, r3 ; replicate cospi_16_64`

Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00			`; 00 10 02 12 d16`
			`; 01 11 03 13 d17`
			`; 20 30 22 32 d18`
			`; 21 31 23 33 d19`
			`vtrn.32 q8, q9`
			`; 00 10 20 30 d16`
			`; 01 11 21 31 d17`
			`; 02 12 22 32 d18`
			`; 03 13 23 33 d19`

Optimise idct4x4: rearrange the instructions a bit to improve instruction scheduling. Change-Id: I5ea881a6e419f9e8ed4b3b619406403b4de24134 2013-08-22 20:02:22 +02:00			`vdup.16 d22, r12 ; replicate cospi_24_64`

Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00			`; do the transform on transposed rows`

			`; stage 1`
Optimise idct4x4: rearrange the instructions a bit to improve instruction scheduling. Change-Id: I5ea881a6e419f9e8ed4b3b619406403b4de24134 2013-08-22 20:02:22 +02:00			`vadd.s16 d23, d16, d18 ; (input[0] + input[2])`
			`vsub.s16 d24, d16, d18 ; (input[0] - input[2])`

			`vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64`
			`vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64`

Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00			`; (input[0] + input[2]) * cospi_16_64;`
			`; (input[0] - input[2]) * cospi_16_64;`
			`vmull.s16 q13, d23, d21`
			`vmull.s16 q14, d24, d21`

			`; input[1] * cospi_24_64 - input[3] * cospi_8_64;`
			`; input[1] * cospi_8_64 + input[3] * cospi_24_64;`
Reduce instructions of idct4x4. Change-Id: Ia26a2526804e7e2f656b0051618a615fca8fc79d 2013-08-16 19:54:56 +02:00			`vmlsl.s16 q15, d19, d20`
			`vmlal.s16 q1, d19, d22`
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00
			`; dct_const_round_shift`
			`vqrshrn.s32 d26, q13, #14`
			`vqrshrn.s32 d27, q14, #14`
Reduce instructions of idct4x4. Change-Id: Ia26a2526804e7e2f656b0051618a615fca8fc79d 2013-08-16 19:54:56 +02:00			`vqrshrn.s32 d29, q15, #14`
			`vqrshrn.s32 d28, q1, #14`
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00
			`; stage 2`
			`; output[0] = step[0] + step[3];`
			`; output[1] = step[1] + step[2];`
			`; output[3] = step[0] - step[3];`
			`; output[2] = step[1] - step[2];`
			`vadd.s16 q8, q13, q14`
			`vsub.s16 q9, q13, q14`
			`vswp d18, d19`

			`; transpose the results`
			`; 00 01 02 03 d16`
			`; 10 11 12 13 d17`
			`; 20 21 22 23 d18`
			`; 30 31 32 33 d19`
			`vtrn.16 d16, d17`
			`vtrn.16 d18, d19`
			`; 00 10 02 12 d16`
			`; 01 11 03 13 d17`
			`; 20 30 22 32 d18`
			`; 21 31 23 33 d19`
			`vtrn.32 q8, q9`
			`; 00 10 20 30 d16`
			`; 01 11 21 31 d17`
			`; 02 12 22 32 d18`
			`; 03 13 23 33 d19`

			`; do the transform on columns`

			`; stage 1`
Optimise idct4x4: rearrange the instructions a bit to improve instruction scheduling. Change-Id: I5ea881a6e419f9e8ed4b3b619406403b4de24134 2013-08-22 20:02:22 +02:00			`vadd.s16 d23, d16, d18 ; (input[0] + input[2])`
			`vsub.s16 d24, d16, d18 ; (input[0] - input[2])`

			`vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64`
			`vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64`

Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00			`; (input[0] + input[2]) * cospi_16_64;`
			`; (input[0] - input[2]) * cospi_16_64;`
			`vmull.s16 q13, d23, d21`
			`vmull.s16 q14, d24, d21`

			`; input[1] * cospi_24_64 - input[3] * cospi_8_64;`
			`; input[1] * cospi_8_64 + input[3] * cospi_24_64;`
Reduce instructions of idct4x4. Change-Id: Ia26a2526804e7e2f656b0051618a615fca8fc79d 2013-08-16 19:54:56 +02:00			`vmlsl.s16 q15, d19, d20`
			`vmlal.s16 q1, d19, d22`
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00
			`; dct_const_round_shift`
			`vqrshrn.s32 d26, q13, #14`
			`vqrshrn.s32 d27, q14, #14`
Reduce instructions of idct4x4. Change-Id: Ia26a2526804e7e2f656b0051618a615fca8fc79d 2013-08-16 19:54:56 +02:00			`vqrshrn.s32 d29, q15, #14`
			`vqrshrn.s32 d28, q1, #14`
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00
			`; stage 2`
			`; output[0] = step[0] + step[3];`
			`; output[1] = step[1] + step[2];`
			`; output[3] = step[0] - step[3];`
			`; output[2] = step[1] - step[2];`
			`vadd.s16 q8, q13, q14`
			`vsub.s16 q9, q13, q14`

			`; The results are in two registers, one of them being swapped. This will`
			`; be taken care of by loading the 'dest' value in a swapped fashion and`
			`; also storing them in the same swapped fashion.`
			`; temp_out[0, 1] = d16, d17 = q8`
			`; temp_out[2, 3] = d19, d18 = q9 swapped`

			`; ROUND_POWER_OF_TWO(temp_out[j], 4)`
			`vrshr.s16 q8, q8, #4`
			`vrshr.s16 q9, q9, #4`

			`vld1.32 {d26[0]}, [r1], r2`
			`vld1.32 {d26[1]}, [r1], r2`
			`vld1.32 {d27[1]}, [r1], r2`
			`vld1.32 {d27[0]}, [r1] ; no post-increment`

			`; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]`
			`vaddw.u8 q8, q8, d26`
			`vaddw.u8 q9, q9, d27`

			`; clip_pixel`
			`vqmovun.s16 d26, q8`
			`vqmovun.s16 d27, q9`

			`; do the stores in reverse order with negative post-increment, by changing`
			`; the sign of the stride`
			`rsb r2, r2, #0`
			`vst1.32 {d27[0]}, [r1], r2`
			`vst1.32 {d27[1]}, [r1], r2`
			`vst1.32 {d26[1]}, [r1], r2`
			`vst1.32 {d26[0]}, [r1] ; no post-increment`
			`bx lr`
Replace vp9_ prefix with vpx_ prefix in vpx_dsp function names This commit clears the function naming convention in vpx_dsp. It replaces vp9_ prefix of global functions with vpx_ prefix. It also removes the vp9_ prefix from static functions. Change-Id: I6394359a63b71a51dda01342eec6a3cc08dfeedf 2015-08-03 23:51:10 +02:00			`ENDP ; \|vpx_idct4x4_16_add_neon\|`
Neon version of vp9_short_idct4x4_add. Change-Id: Idec4cae0cb9b3a29835fd2750d354c1393d47aa4 2013-08-05 02:37:05 +02:00
			`END`