Update NEON wide idcts

Port 93c32a55, which used SSE2 instructions to do two
idct/dequant/recons at a time, to NEON. Initial working
commit. More work needs to be put into rearranging and
interleaving the data to take advantage of quadword
operations, which is when we'll hopefully see a much
better boost.

Change-Id: I86d59d96f15e0d0f9710253e2c098ac2ff2865d1
This commit is contained in:
Johann 2010-09-07 14:21:27 -04:00
parent edcbb1c199
commit 14ba764219
7 changed files with 577 additions and 223 deletions

View File

@ -1,136 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequant_dc_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred,
;                                  unsigned char *dest, int pitch, int stride,
;                                  int Dc);
;
; Multiplies the 16 coefficients at input by dq, replaces element 0 with
; Dc, runs two 4x4 IDCT butterfly passes, rounds by 3 bits, adds the 4x4
; predictor read from pred (row step = pitch), saturates to unsigned
; 8 bit and writes four 4-byte rows to dest (row step = stride).
; The 32 bytes at input are zeroed.
;
; In (at entry):
;   r0    short *input
;   r1    short *dq
;   r2    unsigned char *pred
;   r3    unsigned char *dest
;   sp    int pitch
;   sp+4  int stride
;   sp+8  int Dc
;
; q4-q7 (d8-d15) are callee-saved under the AAPCS and are used as
; scratch below, so they are pushed on entry and popped before return.
; That push moves sp down by 64 bytes, hence the +64 on every stack
; argument offset inside the body.
;-----------------------------------------------------------------------
|vp8_dequant_dc_idct_add_neon| PROC
    vpush           {d8-d15}                ; preserve callee-saved q4-q7

    vld1.16         {q3, q4}, [r0]          ; 16 input coefficients
    vld1.16         {q5, q6}, [r1]          ; 16 dequant factors

    ldr             r1, [sp, #72]           ; Dc     (entry sp+8, +64 saved)
    ldr             r12, _CONSTANTS_        ; address of the idct constants

    vmul.i16        q1, q3, q5              ; input * dq
    vmul.i16        q2, q4, q6

    vmov.16         d2[0], r1               ; element 0 <- Dc

    ldr             r1, [sp, #64]           ; pitch  (entry sp+0, +64 saved)

    vld1.32         {d14[0]}, [r2], r1      ; predictor, 4 bytes per row
    vld1.32         {d14[1]}, [r2], r1
    vld1.32         {d15[0]}, [r2], r1
    vld1.32         {d15[1]}, [r2]

    ldr             r1, [sp, #68]           ; stride (entry sp+4, +64 saved)

    ; ---- first butterfly pass (same math as short_idct4x4llm_neon) ----
    vld1.16         {d0}, [r12]             ; d0[0]=cospi8sqrt2minus1, d0[2]=sinpi8sqrt2

    vswp            d3, d4                  ; q2(vp[4] vp[12])

    vqdmulh.s16     q3, q2, d0[2]
    vqdmulh.s16     q4, q2, d0[0]

    vqadd.s16       d12, d2, d3             ; a1
    vqsub.s16       d13, d2, d3             ; b1

    vshr.s16        q3, q3, #1              ; vqdmulh doubled the product
    vshr.s16        q4, q4, #1

    vqadd.s16       q3, q3, q2              ; add the multiplicand back
    vqadd.s16       q4, q4, q2

    vqsub.s16       d10, d6, d9             ; c1
    vqadd.s16       d11, d7, d8             ; d1

    vqadd.s16       d2, d12, d11
    vqadd.s16       d3, d13, d10
    vqsub.s16       d4, d13, d10
    vqsub.s16       d5, d12, d11

    vtrn.32         d2, d4                  ; 4x4 transpose
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    ; ---- second butterfly pass, interleaved with memset(input, 0, 32) ----
    vmov.i16        q14, #0

    vswp            d3, d4
    vqdmulh.s16     q3, q2, d0[2]
    vqdmulh.s16     q4, q2, d0[0]

    vqadd.s16       d12, d2, d3             ; a1
    vqsub.s16       d13, d2, d3             ; b1

    vmov            q15, q14

    vshr.s16        q3, q3, #1
    vshr.s16        q4, q4, #1

    vqadd.s16       q3, q3, q2
    vqadd.s16       q4, q4, q2

    vqsub.s16       d10, d6, d9             ; c1
    vqadd.s16       d11, d7, d8             ; d1

    vqadd.s16       d2, d12, d11
    vqadd.s16       d3, d13, d10
    vqsub.s16       d4, d13, d10
    vqsub.s16       d5, d12, d11

    vst1.16         {q14, q15}, [r0]        ; zero the 32 input bytes

    vrshr.s16       d2, d2, #3              ; rounding shift by 3
    vrshr.s16       d3, d3, #3
    vrshr.s16       d4, d4, #3
    vrshr.s16       d5, d5, #3

    vtrn.32         d2, d4                  ; 4x4 transpose
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    ; ---- add predictor, saturate to u8, store ----
    vaddw.u8        q1, q1, d14
    vaddw.u8        q2, q2, d15

    vqmovun.s16     d0, q1
    vqmovun.s16     d1, q2

    vst1.32         {d0[0]}, [r3], r1
    vst1.32         {d0[1]}, [r3], r1
    vst1.32         {d1[0]}, [r3], r1
    vst1.32         {d1[1]}, [r3]

    vpop            {d8-d15}                ; restore callee-saved q4-q7
    bx              lr

    ENDP ; |vp8_dequant_dc_idct_add_neon|
; Constant Pool
_CONSTANTS_ DCD cospi8sqrt2minus1
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2 DCD 0x8a8c8a8c
END

View File

@ -12,6 +12,21 @@
#include "idct.h"
#include "dequantize.h"
/* place these declarations here because we don't want to maintain them
* outside of this scope
*/
void idct_dequant_dc_full_2x_neon
(short *input, short *dq, unsigned char *pre, unsigned char *dst,
int stride, short *dc);
void idct_dequant_dc_0_2x_neon
(short *dc, unsigned char *pre, unsigned char *dst, int stride);
void idct_dequant_full_2x_neon
(short *q, short *dq, unsigned char *pre, unsigned char *dst,
int pitch, int stride);
void idct_dequant_0_2x_neon
(short *q, short dq, unsigned char *pre, int pitch,
unsigned char *dst, int stride);
void vp8_dequant_dc_idct_add_y_block_neon
(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride, char *eobs, short *dc)
@ -20,25 +35,15 @@ void vp8_dequant_dc_idct_add_y_block_neon
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
vp8_dequant_dc_idct_add_neon (q, dq, pre, dst, 16, stride, dc[0]);
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc);
else
vp8_dc_only_idct_add_neon (dc[0], pre, dst, 16, stride);
idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
if (eobs[1] > 1)
vp8_dequant_dc_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2);
else
vp8_dc_only_idct_add_neon (dc[1], pre+4, dst+4, 16, stride);
if (eobs[2] > 1)
vp8_dequant_dc_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
else
vp8_dc_only_idct_add_neon (dc[2], pre+8, dst+8, 16, stride);
if (eobs[3] > 1)
vp8_dequant_dc_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
else
vp8_dc_only_idct_add_neon (dc[3], pre+12, dst+12, 16, stride);
idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride);
q += 64;
dc += 4;
@ -56,37 +61,15 @@ void vp8_dequant_idct_add_y_block_neon
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_neon (q, dq, pre, dst, 16, stride);
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride);
else
{
vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dst, 16, stride);
((int *)q)[0] = 0;
}
idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride);
if (eobs[1] > 1)
vp8_dequant_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride);
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride);
else
{
vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dst+4, 16, stride);
((int *)(q+16))[0] = 0;
}
if (eobs[2] > 1)
vp8_dequant_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride);
else
{
vp8_dc_only_idct_add_neon (q[32]*dq[0], pre+8, dst+8, 16, stride);
((int *)(q+32))[0] = 0;
}
if (eobs[3] > 1)
vp8_dequant_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride);
else
{
vp8_dc_only_idct_add_neon (q[48]*dq[0], pre+12, dst+12, 16, stride);
((int *)(q+48))[0] = 0;
}
idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride);
q += 64;
pre += 64;
@ -101,51 +84,34 @@ void vp8_dequant_idct_add_uv_block_neon
{
int i;
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_neon (q, dq, pre, dstu, 8, stride);
else
{
vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstu, 8, stride);
((int *)q)[0] = 0;
}
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
else
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
if (eobs[1] > 1)
vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstu+4, 8, stride);
else
{
vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstu+4, 8, stride);
((int *)(q+16))[0] = 0;
}
q += 32;
pre += 32;
dstu += 4*stride;
q += 32;
pre += 32;
dstu += 4*stride;
eobs += 2;
}
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
else
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_neon (q, dq, pre, dstv, 8, stride);
else
{
vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstv, 8, stride);
((int *)q)[0] = 0;
}
q += 32;
pre += 32;
if (eobs[1] > 1)
vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstv+4, 8, stride);
else
{
vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstv+4, 8, stride);
((int *)(q+16))[0] = 0;
}
if (((short *)eobs)[2] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
else
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
q += 32;
pre += 32;
dstv += 4*stride;
eobs += 2;
}
q += 32;
pre += 32;
dstv += 4*stride;
if (((short *)eobs)[3] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
else
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
}

View File

@ -0,0 +1,79 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |idct_dequant_0_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
;                            int pitch, unsigned char *dst, int stride);
;
; DC-only reconstruction of two horizontally adjacent 4x4 blocks.
; For each block the value ((q_dc * dq) + 4) >> 3 is added to every
; predictor pixel, the sum is saturated to unsigned 8 bit and stored.
; The two DC coefficients are read from q[0] ("lo") and q[16] ("hi",
; byte offset 32) and both are set to zero afterwards.
;
; In:
;   r0    short *q
;   r1    short dq
;   r2    unsigned char *pre   (lo block; hi block is at pre + 4)
;   r3    int pitch            (row step for pre)
;   sp    unsigned char *dst   (lo block; hi block is at dst + 4)
;   sp+4  int stride           (row step for dst)
;
; Uses only caller-saved registers: r0, r2, r3, r12 and q0-q3, q8, q9.
; The hi block lives in q8/q9 rather than q4/q5 because d8-d15 are
; callee-saved under the AAPCS and this routine does not save them.
;-----------------------------------------------------------------------
|idct_dequant_0_2x_neon| PROC
    add             r12, r2, #4             ; r12 = predictor of hi block

    vld1.32         {d2[0]}, [r2], r3       ; lo predictor, 4 bytes per row
    vld1.32         {d2[1]}, [r2], r3
    vld1.32         {d4[0]}, [r2], r3
    vld1.32         {d4[1]}, [r2]

    vld1.32         {d16[0]}, [r12], r3     ; hi predictor, 4 bytes per row
    vld1.32         {d16[1]}, [r12], r3
    vld1.32         {d18[0]}, [r12], r3
    vld1.32         {d18[1]}, [r12]

    ldrsh           r12, [r0]               ; lo q, sign-extended
    ldrsh           r2, [r0, #32]           ; hi q, sign-extended

    mov             r3, #0                  ; clear both DC coefficients
    strh            r3, [r0]
    strh            r3, [r0, #32]

    mul             r0, r12, r1             ; lo: (q * dq + 4) >> 3
    add             r0, r0, #4
    asr             r0, r0, #3
    vdup.16         q0, r0                  ; low 16 bits to all 8 lanes

    mul             r0, r2, r1              ; hi: (q * dq + 4) >> 3
    add             r0, r0, #4
    asr             r0, r0, #3
    vdup.16         q3, r0

    vaddw.u8        q1, q0, d2              ; lo: dc + predictor (widened)
    vaddw.u8        q2, q0, d4
    vaddw.u8        q8, q3, d16             ; hi
    vaddw.u8        q9, q3, d18

    ldr             r2, [sp]                ; dst
    ldr             r3, [sp, #4]            ; stride

    vqmovun.s16     d2, q1                  ; lo: saturate to u8
    vqmovun.s16     d4, q2
    vqmovun.s16     d16, q8                 ; hi
    vqmovun.s16     d18, q9

    add             r0, r2, #4              ; r0 = dst of hi block

    vst1.32         {d2[0]}, [r2], r3       ; lo
    vst1.32         {d2[1]}, [r2], r3
    vst1.32         {d4[0]}, [r2], r3
    vst1.32         {d4[1]}, [r2]

    vst1.32         {d16[0]}, [r0], r3      ; hi
    vst1.32         {d16[1]}, [r0], r3
    vst1.32         {d18[0]}, [r0], r3
    vst1.32         {d18[1]}, [r0]

    bx              lr

    ENDP ; |idct_dequant_0_2x_neon|
END

View File

@ -0,0 +1,69 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |idct_dequant_dc_0_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
;                               unsigned char *dst, int stride);
;
; DC-only reconstruction of two horizontally adjacent 4x4 blocks.
; For each block the value (dc + 4) >> 3 is added to every predictor
; pixel, the sum is saturated to unsigned 8 bit and stored.
; dc[0] belongs to the "lo" block and dc[1] to the "hi" block; both are
; fetched with a single 32-bit load. The predictor row step is fixed at
; 16 bytes.
;
; In:
;   r0    short *dc
;   r1    unsigned char *pre   (lo block; hi block is at pre + 4)
;   r2    unsigned char *dst   (lo block; hi block is at dst + 4)
;   r3    int stride           (row step for dst)
;
; Uses only caller-saved registers: r0-r2, r12 and q0-q3, q8, q9.
; The hi block lives in q8/q9 rather than q4/q5 because d8-d15 are
; callee-saved under the AAPCS and this routine does not save them.
;-----------------------------------------------------------------------
|idct_dequant_dc_0_2x_neon| PROC
    ldr             r0, [r0]                ; both dc values: lo in [15:0], hi in [31:16]
    mov             r12, #16                ; predictor pitch

    vld1.32         {d2[0]}, [r1], r12      ; lo predictor, 4 bytes per row
    vld1.32         {d2[1]}, [r1], r12
    vld1.32         {d4[0]}, [r1], r12
    vld1.32         {d4[1]}, [r1]

    sub             r1, r1, #44             ; r1 advanced 3*16; back to pre + 4

    vld1.32         {d16[0]}, [r1], r12     ; hi predictor, 4 bytes per row
    vld1.32         {d16[1]}, [r1], r12
    vld1.32         {d18[0]}, [r1], r12
    vld1.32         {d18[1]}, [r1]

    sxth            r1, r0                  ; lo *dc
    add             r1, r1, #4
    asr             r1, r1, #3              ; (dc + 4) >> 3
    vdup.16         q0, r1

    sxth            r0, r0, ror #16         ; hi *dc
    add             r0, r0, #4
    asr             r0, r0, #3              ; (dc + 4) >> 3
    vdup.16         q3, r0

    vaddw.u8        q1, q0, d2              ; lo: dc + predictor (widened)
    vaddw.u8        q2, q0, d4
    vaddw.u8        q8, q3, d16             ; hi
    vaddw.u8        q9, q3, d18

    vqmovun.s16     d2, q1                  ; lo: saturate to u8
    vqmovun.s16     d4, q2
    vqmovun.s16     d16, q8                 ; hi
    vqmovun.s16     d18, q9

    add             r0, r2, #4              ; r0 = dst of hi block

    vst1.32         {d2[0]}, [r2], r3       ; lo
    vst1.32         {d2[1]}, [r2], r3
    vst1.32         {d4[0]}, [r2], r3
    vst1.32         {d4[1]}, [r2]

    vst1.32         {d16[0]}, [r0], r3      ; hi
    vst1.32         {d16[1]}, [r0], r3
    vst1.32         {d18[0]}, [r0], r3
    vst1.32         {d18[1]}, [r0]

    bx              lr

    ENDP ;|idct_dequant_dc_0_2x_neon|
END

View File

@ -0,0 +1,190 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |idct_dequant_dc_full_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
;                                  unsigned char *dst, int stride, short *dc);
;
; Full dequant + 4x4 IDCT + reconstruction of two horizontally adjacent
; blocks. The "lo" block uses q[0..15], dc[0], pre and dst; the "hi"
; block uses q[16..31], dc[1], pre + 4 and dst + 4. Both blocks share
; the same dq table. After the multiply by dq, element 0 of each block
; is overwritten with its dc value. The predictor row step is fixed at
; 16 bytes. All 64 bytes at q are zeroed.
;
; In (at entry):
;   r0    short *q
;   r1    short *dq
;   r2    unsigned char *pre
;   r3    unsigned char *dst
;   sp    int stride           (row step for dst)
;   sp+4  short *dc
;
; q4-q7 (d8-d15) are callee-saved under the AAPCS and are used as
; scratch below, so they are pushed on entry and popped before return.
; That push moves sp down by 64 bytes, hence the +64 on every stack
; argument offset inside the body.
;-----------------------------------------------------------------------
|idct_dequant_dc_full_2x_neon| PROC
    vpush           {d8-d15}                ; preserve callee-saved q4-q7

    vld1.16         {q3, q4}, [r0]          ; lo input
    vld1.16         {q5, q6}, [r1]          ; use the same dq for both
    mov             r1, #16                 ; pitch
    add             r0, r0, #32
    vld1.16         {q10, q11}, [r0]        ; hi input

    add             r12, r2, #4             ; r12 = predictor of hi block
    vld1.32         {d14[0]}, [r2], r1      ; lo pred
    vld1.32         {d14[1]}, [r2], r1
    vld1.32         {d15[0]}, [r2], r1
    vld1.32         {d15[1]}, [r2]
    vld1.32         {d28[0]}, [r12], r1     ; hi pred
    vld1.32         {d28[1]}, [r12], r1
    vld1.32         {d29[0]}, [r12], r1
    ldr             r1, [sp, #68]           ; dc     (entry sp+4, +64 saved)
    vld1.32         {d29[1]}, [r12]

    ldr             r2, _CONSTANTS_         ; address of the idct constants

    ldrh            r12, [r1], #2           ; lo *dc
    ldrh            r1, [r1]                ; hi *dc

    vmul.i16        q1, q3, q5              ; lo input * dq
    vmul.i16        q2, q4, q6
    vmul.i16        q8, q10, q5             ; hi input * dq
    vmul.i16        q9, q11, q6

    vmov.16         d2[0], r12              ; lo element 0 <- dc[0]
    vmov.16         d16[0], r1              ; hi element 0 <- dc[1]

    ldr             r1, [sp, #64]           ; stride (entry sp+0, +64 saved)

    ; ---- first butterfly pass ----
    vld1.16         {d0}, [r2]              ; d0[0]=cospi8sqrt2minus1, d0[2]=sinpi8sqrt2

    vswp            d3, d4                  ; lo q2(vp[4] vp[12])
    vswp            d17, d18                ; hi q2(vp[4] vp[12])

    vqdmulh.s16     q3, q2, d0[2]           ; lo * constants
    vqdmulh.s16     q4, q2, d0[0]
    vqdmulh.s16     q10, q9, d0[2]          ; hi * constants
    vqdmulh.s16     q11, q9, d0[0]

    vqadd.s16       d12, d2, d3             ; lo a1
    vqsub.s16       d13, d2, d3             ; lo b1
    vqadd.s16       d26, d16, d17           ; hi a1
    vqsub.s16       d27, d16, d17           ; hi b1

    vshr.s16        q3, q3, #1              ; lo: vqdmulh doubled the product
    vshr.s16        q4, q4, #1
    vshr.s16        q10, q10, #1            ; hi
    vshr.s16        q11, q11, #1

    vqadd.s16       q3, q3, q2              ; lo: add the multiplicand back
    vqadd.s16       q4, q4, q2
    vqadd.s16       q10, q10, q9            ; hi
    vqadd.s16       q11, q11, q9

    vqsub.s16       d10, d6, d9             ; lo c1
    vqadd.s16       d11, d7, d8             ; lo d1
    vqsub.s16       d24, d20, d23           ; hi c1
    vqadd.s16       d25, d21, d22           ; hi d1

    vqadd.s16       d2, d12, d11            ; lo
    vqadd.s16       d3, d13, d10
    vqsub.s16       d4, d13, d10
    vqsub.s16       d5, d12, d11
    vqadd.s16       d16, d26, d25           ; hi
    vqadd.s16       d17, d27, d24
    vqsub.s16       d18, d27, d24
    vqsub.s16       d19, d26, d25

    vtrn.32         d2, d4                  ; lo: 4x4 transpose
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5
    vtrn.32         d16, d18                ; hi: 4x4 transpose
    vtrn.32         d17, d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19

    ; ---- second butterfly pass ----
    vswp            d3, d4                  ; lo
    vqdmulh.s16     q3, q2, d0[2]
    vqdmulh.s16     q4, q2, d0[0]
    vswp            d17, d18                ; hi
    vqdmulh.s16     q10, q9, d0[2]
    vqdmulh.s16     q11, q9, d0[0]

    vqadd.s16       d12, d2, d3             ; lo a1
    vqsub.s16       d13, d2, d3             ; lo b1
    vqadd.s16       d26, d16, d17           ; hi a1
    vqsub.s16       d27, d16, d17           ; hi b1

    vshr.s16        q3, q3, #1              ; lo
    vshr.s16        q4, q4, #1
    vshr.s16        q10, q10, #1            ; hi
    vshr.s16        q11, q11, #1

    vqadd.s16       q3, q3, q2              ; lo
    vqadd.s16       q4, q4, q2
    vqadd.s16       q10, q10, q9            ; hi
    vqadd.s16       q11, q11, q9

    vqsub.s16       d10, d6, d9             ; lo c1
    vqadd.s16       d11, d7, d8             ; lo d1
    vqsub.s16       d24, d20, d23           ; hi c1
    vqadd.s16       d25, d21, d22           ; hi d1

    vqadd.s16       d2, d12, d11            ; lo
    vqadd.s16       d3, d13, d10
    vqsub.s16       d4, d13, d10
    vqsub.s16       d5, d12, d11
    vqadd.s16       d16, d26, d25           ; hi
    vqadd.s16       d17, d27, d24
    vqsub.s16       d18, d27, d24
    vqsub.s16       d19, d26, d25

    vrshr.s16       q1, q1, #3              ; lo: rounding shift by 3
    vrshr.s16       q2, q2, #3
    vrshr.s16       q8, q8, #3              ; hi
    vrshr.s16       q9, q9, #3

    vtrn.32         d2, d4                  ; lo: 4x4 transpose
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5
    vtrn.32         d16, d18                ; hi: 4x4 transpose
    vtrn.32         d17, d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19

    ; ---- add predictor, clear input, saturate to u8, store ----
    vaddw.u8        q1, q1, d14             ; lo
    vaddw.u8        q2, q2, d15
    vaddw.u8        q8, q8, d28             ; hi
    vaddw.u8        q9, q9, d29

    vmov.i16        q14, #0                 ; hi pred consumed; reuse q14 as zero
    vmov            q15, q14
    vst1.16         {q14, q15}, [r0]        ; write over high input
    sub             r0, r0, #32
    vst1.16         {q14, q15}, [r0]        ; write over low input

    vqmovun.s16     d0, q1                  ; lo
    vqmovun.s16     d1, q2
    vqmovun.s16     d2, q8                  ; hi
    vqmovun.s16     d3, q9

    add             r2, r3, #4              ; r2 = dst of hi block

    vst1.32         {d0[0]}, [r3], r1       ; lo
    vst1.32         {d0[1]}, [r3], r1
    vst1.32         {d1[0]}, [r3], r1
    vst1.32         {d1[1]}, [r3]

    vst1.32         {d2[0]}, [r2], r1       ; hi
    vst1.32         {d2[1]}, [r2], r1
    vst1.32         {d3[0]}, [r2], r1
    vst1.32         {d3[1]}, [r2]

    vpop            {d8-d15}                ; restore callee-saved q4-q7
    bx              lr

    ENDP ; |idct_dequant_dc_full_2x_neon|
; Constant Pool
_CONSTANTS_ DCD cospi8sqrt2minus1
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2 DCD 0x8a8c8a8c
END

View File

@ -0,0 +1,183 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |idct_dequant_full_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
;                               unsigned char *dst, int pitch, int stride);
;
; Full dequant + 4x4 IDCT + reconstruction of two horizontally adjacent
; blocks. The "lo" block uses q[0..15], pre and dst; the "hi" block
; uses q[16..31], pre + 4 and dst + 4. Both blocks share the same dq
; table. All 64 bytes at q are zeroed.
;
; In (at entry):
;   r0    short *q
;   r1    short *dq
;   r2    unsigned char *pre
;   r3    unsigned char *dst
;   sp    int pitch            (row step for pre)
;   sp+4  int stride           (row step for dst)
;
; q4-q7 (d8-d15) are callee-saved under the AAPCS and are used as
; scratch below, so they are pushed on entry and popped before return.
; That push moves sp down by 64 bytes, hence the +64 on every stack
; argument offset inside the body.
;-----------------------------------------------------------------------
|idct_dequant_full_2x_neon| PROC
    vpush           {d8-d15}                ; preserve callee-saved q4-q7

    vld1.16         {q3, q4}, [r0]          ; lo input
    vld1.16         {q5, q6}, [r1]          ; use the same dq for both
    ldr             r1, [sp, #64]           ; pitch  (entry sp+0, +64 saved)
    add             r0, r0, #32
    vld1.16         {q10, q11}, [r0]        ; hi input

    add             r12, r2, #4             ; r12 = predictor of hi block
    vld1.32         {d14[0]}, [r2], r1      ; lo pred
    vld1.32         {d14[1]}, [r2], r1
    vld1.32         {d15[0]}, [r2], r1
    vld1.32         {d15[1]}, [r2]
    vld1.32         {d28[0]}, [r12], r1     ; hi pred
    vld1.32         {d28[1]}, [r12], r1
    vld1.32         {d29[0]}, [r12], r1
    vld1.32         {d29[1]}, [r12]

    ldr             r2, _CONSTANTS_         ; address of the idct constants

    vmul.i16        q1, q3, q5              ; lo input * dq
    vmul.i16        q2, q4, q6
    vmul.i16        q8, q10, q5             ; hi input * dq
    vmul.i16        q9, q11, q6

    ldr             r1, [sp, #68]           ; stride (entry sp+4, +64 saved)

    ; ---- first butterfly pass ----
    vld1.16         {d0}, [r2]              ; d0[0]=cospi8sqrt2minus1, d0[2]=sinpi8sqrt2

    vswp            d3, d4                  ; lo q2(vp[4] vp[12])
    vswp            d17, d18                ; hi q2(vp[4] vp[12])

    vqdmulh.s16     q3, q2, d0[2]           ; lo * constants
    vqdmulh.s16     q4, q2, d0[0]
    vqdmulh.s16     q10, q9, d0[2]          ; hi * constants
    vqdmulh.s16     q11, q9, d0[0]

    vqadd.s16       d12, d2, d3             ; lo a1
    vqsub.s16       d13, d2, d3             ; lo b1
    vqadd.s16       d26, d16, d17           ; hi a1
    vqsub.s16       d27, d16, d17           ; hi b1

    vshr.s16        q3, q3, #1              ; lo: vqdmulh doubled the product
    vshr.s16        q4, q4, #1
    vshr.s16        q10, q10, #1            ; hi
    vshr.s16        q11, q11, #1

    vqadd.s16       q3, q3, q2              ; lo: add the multiplicand back
    vqadd.s16       q4, q4, q2
    vqadd.s16       q10, q10, q9            ; hi
    vqadd.s16       q11, q11, q9

    vqsub.s16       d10, d6, d9             ; lo c1
    vqadd.s16       d11, d7, d8             ; lo d1
    vqsub.s16       d24, d20, d23           ; hi c1
    vqadd.s16       d25, d21, d22           ; hi d1

    vqadd.s16       d2, d12, d11            ; lo
    vqadd.s16       d3, d13, d10
    vqsub.s16       d4, d13, d10
    vqsub.s16       d5, d12, d11
    vqadd.s16       d16, d26, d25           ; hi
    vqadd.s16       d17, d27, d24
    vqsub.s16       d18, d27, d24
    vqsub.s16       d19, d26, d25

    vtrn.32         d2, d4                  ; lo: 4x4 transpose
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5
    vtrn.32         d16, d18                ; hi: 4x4 transpose
    vtrn.32         d17, d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19

    ; ---- second butterfly pass ----
    vswp            d3, d4                  ; lo
    vqdmulh.s16     q3, q2, d0[2]
    vqdmulh.s16     q4, q2, d0[0]
    vswp            d17, d18                ; hi
    vqdmulh.s16     q10, q9, d0[2]
    vqdmulh.s16     q11, q9, d0[0]

    vqadd.s16       d12, d2, d3             ; lo a1
    vqsub.s16       d13, d2, d3             ; lo b1
    vqadd.s16       d26, d16, d17           ; hi a1
    vqsub.s16       d27, d16, d17           ; hi b1

    vshr.s16        q3, q3, #1              ; lo
    vshr.s16        q4, q4, #1
    vshr.s16        q10, q10, #1            ; hi
    vshr.s16        q11, q11, #1

    vqadd.s16       q3, q3, q2              ; lo
    vqadd.s16       q4, q4, q2
    vqadd.s16       q10, q10, q9            ; hi
    vqadd.s16       q11, q11, q9

    vqsub.s16       d10, d6, d9             ; lo c1
    vqadd.s16       d11, d7, d8             ; lo d1
    vqsub.s16       d24, d20, d23           ; hi c1
    vqadd.s16       d25, d21, d22           ; hi d1

    vqadd.s16       d2, d12, d11            ; lo
    vqadd.s16       d3, d13, d10
    vqsub.s16       d4, d13, d10
    vqsub.s16       d5, d12, d11
    vqadd.s16       d16, d26, d25           ; hi
    vqadd.s16       d17, d27, d24
    vqsub.s16       d18, d27, d24
    vqsub.s16       d19, d26, d25

    vrshr.s16       q1, q1, #3              ; lo: rounding shift by 3
    vrshr.s16       q2, q2, #3
    vrshr.s16       q8, q8, #3              ; hi
    vrshr.s16       q9, q9, #3

    vtrn.32         d2, d4                  ; lo: 4x4 transpose
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5
    vtrn.32         d16, d18                ; hi: 4x4 transpose
    vtrn.32         d17, d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19

    ; ---- add predictor, clear input, saturate to u8, store ----
    vaddw.u8        q1, q1, d14             ; lo
    vaddw.u8        q2, q2, d15
    vaddw.u8        q8, q8, d28             ; hi
    vaddw.u8        q9, q9, d29

    vmov.i16        q14, #0                 ; hi pred consumed; reuse q14 as zero
    vmov            q15, q14
    vst1.16         {q14, q15}, [r0]        ; write over high input
    sub             r0, r0, #32
    vst1.16         {q14, q15}, [r0]        ; write over low input

    vqmovun.s16     d0, q1                  ; lo
    vqmovun.s16     d1, q2
    vqmovun.s16     d2, q8                  ; hi
    vqmovun.s16     d3, q9

    add             r2, r3, #4              ; r2 = dst of hi block

    vst1.32         {d0[0]}, [r3], r1       ; lo
    vst1.32         {d0[1]}, [r3], r1
    vst1.32         {d1[0]}, [r3], r1
    vst1.32         {d1[1]}, [r3]

    vst1.32         {d2[0]}, [r2], r1       ; hi
    vst1.32         {d2[1]}, [r2], r1
    vst1.32         {d3[0]}, [r2], r1
    vst1.32         {d3[1]}, [r2]

    vpop            {d8-d15}                ; restore callee-saved q4-q7
    bx              lr

    ENDP ; |idct_dequant_full_2x_neon|
; Constant Pool
_CONSTANTS_ DCD cospi8sqrt2minus1
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2 DCD 0x8a8c8a8c
END

View File

@ -25,7 +25,10 @@ VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c
#File list for neon
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c