ARM: NEON scalarproduct_int16 and scalarproduct_and_madd_int16
Patch by Kostya, minor fixes by me. Originally committed as revision 21958 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
0e5f33f242
commit
9b3c455c50
@ -641,6 +641,7 @@ NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
|
||||
|
||||
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \
|
||||
arm/dsputil_neon.o \
|
||||
arm/int_neon.o \
|
||||
arm/simple_idct_neon.o \
|
||||
$(NEON-OBJS-yes)
|
||||
|
||||
|
@ -227,6 +227,11 @@ void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
|
||||
|
||||
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
|
||||
|
||||
/*
 * NEON routines implemented in libavcodec/arm/int_neon.S.
 *
 * ff_scalarproduct_int16_neon: sums v1[i] * v2[i], each 32-bit product
 * being shifted right by 'shift' before accumulation.  The assembly loads
 * v2 with a 128-bit alignment hint and consumes 16 elements per iteration.
 *
 * ff_scalarproduct_and_madd_int16_neon: sums v1[i] * v2[i] and also updates
 * v1[i] += mul * v3[i] in place.  The assembly accesses v1 with a 128-bit
 * alignment hint and consumes 16 elements per iteration.
 */
int32_t ff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len,
|
||||
int shift);
|
||||
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, int16_t *v2,
|
||||
int16_t *v3, int len, int mul);
|
||||
|
||||
void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
|
||||
{
|
||||
if (!avctx->lowres) {
|
||||
@ -406,4 +411,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
|
||||
|
||||
if (CONFIG_VORBIS_DECODER)
|
||||
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
|
||||
|
||||
c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
|
||||
}
|
||||
|
118
libavcodec/arm/int_neon.S
Normal file
118
libavcodec/arm/int_neon.S
Normal file
@ -0,0 +1,118 @@
|
||||
/*
|
||||
* ARM NEON optimised integer operations
|
||||
* Copyright (c) 2009 Kostya Shishkov
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
preserve8
|
||||
.fpu neon
|
||||
.text
|
||||
|
||||
@ int32_t ff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len,
@                                     int shift)
@
@ Returns the sum over i of (v1[i] * v2[i]) >> shift, truncated to 32 bits.
@
@   r0 = v1     no alignment hint is used for these loads
@   r1 = v2     loaded with a :128 hint, so it must be 16-byte aligned
@   r2 = len    number of int16 elements; the loops consume 16 per pass and
@               test the counter only after the first pass, so len must be
@               a positive multiple of 16
@   r3 = shift  right shift applied to every 32-bit product
@
@ Register use:
@   q0-q3    four independent 4 x int32 partial sums
@   q8-q11   input vectors (q8 is recycled for the fourth product)
@   q12-q14  widened products
@   q15      per-lane shift count (-shift) in the shifted path
@ Only caller-saved NEON registers are touched; d8-d15 are left alone.
function ff_scalarproduct_int16_neon, export=1
        vmov.i16        q0,  #0
        vmov.i16        q1,  #0
        vmov.i16        q2,  #0
        vmov.i16        q3,  #0
        negs            r3,  r3                 @ r3 = -shift, Z set if shift == 0
        beq             2f                      @ no shift: use the multiply-accumulate loop

        @ VSHL by a negative per-lane count is a right shift.  The count is
        @ kept in q15, which the loop never writes, so it stays valid for
        @ every iteration.
        vdup.s32        q15, r3
1:      vld1.16         {d16-d17}, [r0]!
        vld1.16         {d20-d21}, [r1,:128]!
        vmull.s16       q12, d16, d20
        vld1.16         {d18-d19}, [r0]!
        vmull.s16       q13, d17, d21
        vld1.16         {d22-d23}, [r1,:128]!
        vmull.s16       q14, d18, d22
        vmull.s16       q8,  d19, d23           @ q8 inputs already consumed above
        vshl.s32        q12, q12, q15
        vshl.s32        q13, q13, q15
        vadd.s32        q0,  q0,  q12
        vshl.s32        q14, q14, q15
        vadd.s32        q1,  q1,  q13
        vshl.s32        q8,  q8,  q15
        vadd.s32        q2,  q2,  q14
        vadd.s32        q3,  q3,  q8
        subs            r2,  r2,  #16
        bne             1b
        b               3f

        @ shift == 0: widen, multiply and accumulate in one instruction.
2:      vld1.16         {d16-d17}, [r0]!
        vld1.16         {d20-d21}, [r1,:128]!
        vmlal.s16       q0,  d16, d20
        vld1.16         {d18-d19}, [r0]!
        vmlal.s16       q1,  d17, d21
        vld1.16         {d22-d23}, [r1,:128]!
        vmlal.s16       q2,  d18, d22
        vmlal.s16       q3,  d19, d23
        subs            r2,  r2,  #16
        bne             2b

        @ Horizontal reduction of the 16 partial sums in q0-q3 down to one
        @ value.  Temporaries live in d16-d19 (caller-saved).
3:      vpadd.s32       d16, d0,  d1
        vpadd.s32       d17, d2,  d3
        vpadd.s32       d18, d4,  d5
        vpadd.s32       d19, d6,  d7
        vpadd.s32       d0,  d16, d17
        vpadd.s32       d1,  d18, d19
        vpadd.s32       d2,  d0,  d1
        vpaddl.s32      d3,  d2
        vmov.32         r0,  d3[0]              @ low 32 bits of the total
        bx              lr
        .endfunc
|
||||
|
||||
@ int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, int16_t *v2,
@                                              int16_t *v3, int len, int mul)
@
@ Returns the sum over i of v1[i] * v2[i] (using the values of v1 before
@ the update), truncated to 32 bits, and updates v1 in place:
@     v1[i] += mul * v3[i]        (16-bit wrapping arithmetic)
@
@   r0   = v1   read and written with a :128 hint, so it must be 16-byte
@               aligned
@   r1   = v2   no alignment hint
@   r2   = v3   no alignment hint
@   r3   = len  number of int16 elements; 16 are consumed per pass and the
@               counter is tested only after the first pass, so len must be
@               a positive multiple of 16
@   [sp] = mul  only the first halfword at [sp] is read and broadcast
@               NOTE(review): that is the low half of the int on
@               little-endian targets only -- confirm if big-endian matters
@
@ Only caller-saved NEON registers are touched; d8-d15 are left alone.
function ff_scalarproduct_and_madd_int16_neon, export=1
        vld1.16         {d28[],d29[]}, [sp]     @ q14 = mul in every 16-bit lane
        vmov.i16        q0,  #0
        vmov.i16        q1,  #0
        vmov.i16        q2,  #0
        vmov.i16        q3,  #0
        mov             r12, r0                 @ separate write pointer into v1

1:      vld1.16         {d16-d17}, [r0,:128]!
        vld1.16         {d18-d19}, [r1]!
        vld1.16         {d20-d21}, [r2]!
        vld1.16         {d22-d23}, [r0,:128]!
        vld1.16         {d24-d25}, [r1]!
        vld1.16         {d26-d27}, [r2]!
        vmul.s16        q10, q10, q14           @ mul * v3[0..7]
        vmul.s16        q13, q13, q14           @ mul * v3[8..15]
        vmlal.s16       q0,  d16, d18           @ dot product uses the old v1
        vmlal.s16       q1,  d17, d19
        vadd.s16        q10, q8,  q10           @ new v1[0..7]
        vadd.s16        q13, q11, q13           @ new v1[8..15]
        vmlal.s16       q2,  d22, d24
        vmlal.s16       q3,  d23, d25
        vst1.16         {q10}, [r12,:128]!
        subs            r3,  r3,  #16
        vst1.16         {q13}, [r12,:128]!
        bne             1b

        @ Horizontal reduction of the 16 partial sums in q0-q3 down to one
        @ value.  Temporaries live in d16-d19 (caller-saved).
        vpadd.s32       d16, d0,  d1
        vpadd.s32       d17, d2,  d3
        vpadd.s32       d18, d4,  d5
        vpadd.s32       d19, d6,  d7
        vpadd.s32       d0,  d16, d17
        vpadd.s32       d1,  d18, d19
        vpadd.s32       d2,  d0,  d1
        vpaddl.s32      d3,  d2
        vmov.32         r0,  d3[0]              @ low 32 bits of the total
        bx              lr
        .endfunc
|
Loading…
x
Reference in New Issue
Block a user