Optimized WebRtcIsacfix_NormLatticeFilterMa() function for iSAC fix for ARM Neon
architecture with intrinsics and assembly code. The total iSAC codec speed improved by about 3~5%. Notes: (1) The Neon version after this optimization is not bit-exact with the generic C version. The output quality, however, is not worse, as verified by test vector output and as expected in theory (a 32-bit x 32-bit multiplication in Neon is more accurate than the approximation used by the generic C code). (2) On Android, an iSAC Neon library will be built. Along with some new function structures, it partly prepares for introducing run-time detection of the Neon architecture soon. Review URL: http://webrtc-codereview.appspot.com/268016 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1192 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
02afbeaca5
commit
f0a964dc0a
10
Android.mk
10
Android.mk
@ -103,6 +103,7 @@ include $(BUILD_SHARED_LIBRARY)
|
||||
LOCAL_PATH := $(call my-dir)
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk
|
||||
|
||||
LOCAL_ARM_MODE := arm
|
||||
LOCAL_MODULE := libwebrtc
|
||||
@ -137,6 +138,15 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
|
||||
libwebrtc_jpeg \
|
||||
libwebrtc_vpx
|
||||
|
||||
# Add Neon libraries.
|
||||
ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS)))
|
||||
LOCAL_WHOLE_STATIC_LIBRARIES += \
|
||||
libwebrtc_isacfix_neon
|
||||
else ifeq ($(ARCH_ARM_HAVE_NEON),true)
|
||||
LOCAL_WHOLE_STATIC_LIBRARIES += \
|
||||
libwebrtc_isacfix_neon
|
||||
endif
|
||||
|
||||
LOCAL_SHARED_LIBRARIES := \
|
||||
libcutils \
|
||||
libdl \
|
||||
|
@ -6,6 +6,9 @@
|
||||
# in the file PATENTS. All contributing project authors may
|
||||
# be found in the AUTHORS file in the root of the source tree.
|
||||
|
||||
#############################
|
||||
# Build the non-neon library.
|
||||
|
||||
LOCAL_PATH := $(call my-dir)
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
@ -68,8 +71,41 @@ include external/stlport/libstlport.mk
|
||||
endif
|
||||
include $(BUILD_STATIC_LIBRARY)
|
||||
|
||||
#########################
|
||||
# Build the neon library.
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
|
||||
LOCAL_ARM_MODE := arm
|
||||
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
|
||||
LOCAL_MODULE := libwebrtc_isacfix_neon
|
||||
LOCAL_MODULE_TAGS := optional
|
||||
LOCAL_SRC_FILES := \
|
||||
filters_neon.c \
|
||||
lattice_neon.S #.S extension is for including a header file in assembly.
|
||||
# TODO(kma): Check with C compiler team and on line community for any status
|
||||
# in the file name (.s vs .S), for a better solution.
|
||||
|
||||
# Flags passed to both C and C++ files.
|
||||
LOCAL_CFLAGS := \
|
||||
$(MY_WEBRTC_COMMON_DEFS) \
|
||||
-mfpu=neon \
|
||||
-flax-vector-conversions
|
||||
|
||||
LOCAL_C_INCLUDES := \
|
||||
$(LOCAL_PATH)/../interface \
|
||||
$(LOCAL_PATH)/../../../../../.. \
|
||||
$(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include
|
||||
|
||||
|
||||
ifndef NDK_ROOT
|
||||
include external/stlport/libstlport.mk
|
||||
endif
|
||||
include $(BUILD_STATIC_LIBRARY)
|
||||
|
||||
###########################
|
||||
# isac test app
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
|
||||
LOCAL_MODULE_TAGS := tests
|
||||
|
@ -122,7 +122,6 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
|
||||
WebRtc_Word16 lo_hi,
|
||||
WebRtc_Word16 *lat_outQ9);
|
||||
|
||||
|
||||
void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
|
||||
WebRtc_Word16 *stateGQ0,
|
||||
WebRtc_Word32 *lat_inQ25,
|
||||
@ -131,10 +130,54 @@ void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
|
||||
WebRtc_Word16 lo_hi,
|
||||
WebRtc_Word16 *lat_outQ0);
|
||||
|
||||
int WebRtcIsacfix_AutocorrFix(WebRtc_Word32* __restrict r,
|
||||
int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r,
|
||||
const WebRtc_Word16* __restrict x,
|
||||
WebRtc_Word16 N,
|
||||
WebRtc_Word16 order,
|
||||
WebRtc_Word16* __restrict scale);
|
||||
|
||||
void WebRtcIsacfix_FilterMaLoopC(int16_t input0,
|
||||
int16_t input1,
|
||||
int32_t input2,
|
||||
int32_t* ptr0,
|
||||
int32_t* ptr1,
|
||||
int32_t* ptr2);
|
||||
|
||||
// Functions for ARM-Neon platforms, in place of the above two generic C ones.
|
||||
#if (defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
|
||||
int WebRtcIsacfix_AutocorrNeon(WebRtc_Word32* __restrict r,
|
||||
const WebRtc_Word16* __restrict x,
|
||||
WebRtc_Word16 N,
|
||||
WebRtc_Word16 order,
|
||||
WebRtc_Word16* __restrict scale);
|
||||
|
||||
void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
|
||||
int16_t input1,
|
||||
int32_t input2,
|
||||
int32_t* ptr0,
|
||||
int32_t* ptr1,
|
||||
int32_t* ptr2);
|
||||
#endif
|
||||
|
||||
/**** Function pointers associated with
|
||||
**** WebRtcIsacfix_AutocorrC() / WebRtcIsacfix_AutocorrNeon()
|
||||
**** and WebRtcIsacfix_FilterMaLoopC() / WebRtcIsacfix_FilterMaLoopNeon().
|
||||
****/
|
||||
|
||||
typedef int (*AutocorrFix)(WebRtc_Word32* __restrict__ r,
|
||||
const WebRtc_Word16* __restrict__ x,
|
||||
WebRtc_Word16 N,
|
||||
WebRtc_Word16 order,
|
||||
WebRtc_Word16* __restrict__ scale);
|
||||
extern AutocorrFix WebRtcIsacfix_AutocorrFix;
|
||||
|
||||
typedef void (*FilterMaLoopFix)(int16_t input0,
|
||||
int16_t input1,
|
||||
int32_t input2,
|
||||
int32_t* ptr0,
|
||||
int32_t* ptr1,
|
||||
int32_t* ptr2);
|
||||
extern FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix;
|
||||
|
||||
|
||||
#endif /* WEBRTC_MODULES_AUDIO_CODING_CODECS_ISAC_FIX_SOURCE_CODEC_H_ */
|
||||
|
@ -11,7 +11,7 @@
|
||||
/*
|
||||
* filters.c
|
||||
*
|
||||
* This file contains function WebRtcIsacfix_AutocorrFix,
|
||||
* This file contains function WebRtcIsacfix_AutocorrC,
|
||||
* AllpassFilterForDec32, and WebRtcIsacfix_DecimateAllpass32
|
||||
*
|
||||
*/
|
||||
@ -22,16 +22,13 @@
|
||||
#include "lpc_masking_model.h"
|
||||
#include "codec.h"
|
||||
|
||||
#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
|
||||
// Autocorrelation function in fixed point.
|
||||
// NOTE! Different from SPLIB-version in how it scales the signal.
|
||||
int WebRtcIsacfix_AutocorrFix(
|
||||
WebRtc_Word32* __restrict r,
|
||||
int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r,
|
||||
const WebRtc_Word16* __restrict x,
|
||||
WebRtc_Word16 N,
|
||||
WebRtc_Word16 order,
|
||||
WebRtc_Word16* __restrict scale) {
|
||||
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int16_t scaling = 0;
|
||||
@ -67,7 +64,6 @@ int WebRtcIsacfix_AutocorrFix(
|
||||
|
||||
return(order + 1);
|
||||
}
|
||||
#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
|
||||
|
||||
static const WebRtc_Word32 kApUpperQ15[ALLPASSSECTIONS] = { 1137, 12537 };
|
||||
static const WebRtc_Word32 kApLowerQ15[ALLPASSSECTIONS] = { 5059, 24379 };
|
||||
|
@ -11,7 +11,7 @@
|
||||
/*
|
||||
* filters_neon.c
|
||||
*
|
||||
* This file contains function WebRtcIsacfix_AutocorrFix, optimized for
|
||||
* This file contains function WebRtcIsacfix_AutocorrNeon, optimized for
|
||||
* ARM Neon platform.
|
||||
*
|
||||
*/
|
||||
@ -23,7 +23,7 @@
|
||||
|
||||
// Autocorrelation function in fixed point.
|
||||
// NOTE! Different from SPLIB-version in how it scales the signal.
|
||||
int WebRtcIsacfix_AutocorrFix(
|
||||
int WebRtcIsacfix_AutocorrNeon(
|
||||
WebRtc_Word32* __restrict r,
|
||||
const WebRtc_Word16* __restrict x,
|
||||
WebRtc_Word16 N,
|
||||
|
@ -246,11 +246,18 @@ WebRtc_Word16 WebRtcIsacfix_EncoderInit(ISACFIX_MainStruct *ISAC_main_inst,
|
||||
WebRtcIsacfix_InitPostFilterbank(&ISAC_inst->ISACenc_obj.interpolatorstr_obj);
|
||||
#endif
|
||||
|
||||
// Initialize function pointers.
|
||||
WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrC;
|
||||
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopC;
|
||||
|
||||
#ifdef WEBRTC_ARCH_ARM_NEON
|
||||
WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrNeon;
|
||||
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon;
|
||||
#endif
|
||||
|
||||
return statusInit;
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
* WebRtcIsacfix_Encode(...)
|
||||
*
|
||||
|
@ -18,6 +18,64 @@
|
||||
#include "codec.h"
|
||||
#include "settings.h"
|
||||
|
||||
#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32) \
|
||||
((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32))))
|
||||
/* This macro is FORBIDDEN to use elsewhere than in a function in this file and
|
||||
its corresponding neon version. It might give unpredictable results, since a
|
||||
general WebRtc_Word32*WebRtc_Word32 multiplication results in a 64 bit value.
|
||||
The result is then shifted just 16 steps to the right, giving need for 48
|
||||
bits, i.e. in the general case, it will NOT fit in a WebRtc_Word32. In the
|
||||
cases used in here, the WebRtc_Word32 will be enough, since (for a good
|
||||
reason) the involved multiplicands aren't big enough to overflow a
|
||||
WebRtc_Word32 after shifting right 16 bits. I have compared the result of a
|
||||
multiplication between t32 and tmp32, done in two ways:
|
||||
1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0);
|
||||
2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
|
||||
By running 25 files, I haven't found any bigger diff than 64 - this was in the
|
||||
case when method 1) gave 650235648 and 2) gave 650235712.
|
||||
*/
|
||||
|
||||
/* Inner loop used for function WebRtcIsacfix_NormLatticeFilterMa().
|
||||
It does:
|
||||
for 0 <= n < HALF_SUBFRAMELEN - 1:
|
||||
*ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
|
||||
*ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
|
||||
*/
|
||||
void WebRtcIsacfix_FilterMaLoopC(int16_t input0, // Filter coefficient
|
||||
int16_t input1, // Filter coefficient
|
||||
int32_t input2, // Inverse coeff. (1/input1)
|
||||
int32_t* ptr0, // Sample buffer
|
||||
int32_t* ptr1, // Sample buffer
|
||||
int32_t* ptr2) { // Sample buffer
|
||||
int n = 0;
|
||||
|
||||
// Separate the 32-bit variable input2 into two 16-bit integers (high 16 and
|
||||
// low 16 bits), for using LATTICE_MUL_32_32_RSFT16 in the loop.
|
||||
int16_t t16a = (int16_t)(input2 >> 16);
|
||||
int16_t t16b = (int16_t)input2;
|
||||
if (t16b < 0) t16a++;
|
||||
|
||||
// The loop filtering the samples *ptr0, *ptr1, *ptr2 with filter coefficients
|
||||
// input0, input1, and input2.
|
||||
for(n = 0; n < HALF_SUBFRAMELEN - 1; n++, ptr0++, ptr1++, ptr2++) {
|
||||
int32_t tmp32a = 0;
|
||||
int32_t tmp32b = 0;
|
||||
|
||||
// Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
|
||||
tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); // Q15 * Q15 >> 15 = Q15
|
||||
tmp32b = *ptr2 + tmp32a; // Q15 + Q15 = Q15
|
||||
*ptr2 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
|
||||
|
||||
// Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
|
||||
tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); // Q15*Q15>>15 = Q15
|
||||
tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); // Q15*Q15>>15 = Q15
|
||||
*ptr1 = tmp32a + tmp32b; // Q15 + Q15 = Q15
|
||||
}
|
||||
}
|
||||
|
||||
// Declare a function pointer.
|
||||
FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix;
|
||||
|
||||
/* filter the signal using normalized lattice filter */
|
||||
/* MA filter */
|
||||
void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
|
||||
@ -47,30 +105,6 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
|
||||
WebRtc_Word16 t16a;
|
||||
WebRtc_Word16 t16b;
|
||||
|
||||
#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32) \
|
||||
((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32))))
|
||||
/* This macro is FORBIDDEN to use elsewhere than in two places in this file
|
||||
since it might give unpredictable results, since a general WebRtc_Word32*WebRtc_Word32
|
||||
multiplication results in a 64 bit value. The result is then shifted just
|
||||
16 steps to the right, giving need for 48 bits, i.e. in the generel case,
|
||||
it will NOT fit in a WebRtc_Word32. In the cases used in here, the WebRtc_Word32 will be
|
||||
enough, since (FOR SOME REASON!!!) the involved multiplicands aren't big
|
||||
enough to overflow a WebRtc_Word32 after shifting right 16 bits. I have compared
|
||||
the result of a multiplication between t32 and tmp32, done in two ways:
|
||||
|
||||
1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0);
|
||||
|
||||
2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
|
||||
|
||||
By running 25 files, I haven't found any bigger diff than 64 - this was in the
|
||||
case when method 1) gave 650235648 and 2) gave 650235712.
|
||||
|
||||
It might be good to investigate this further, in order to PROVE why it seems to
|
||||
work without any problems. This might be done, by using the properties of
|
||||
all reflection coefficients etc.
|
||||
|
||||
*/
|
||||
|
||||
for (u=0;u<SUBFRAMES;u++)
|
||||
{
|
||||
/* set the Direct Form coefficients */
|
||||
@ -133,24 +167,11 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
|
||||
/* save the states */
|
||||
for(k=0;k<orderCoef;k++)
|
||||
{
|
||||
for(n=0;n<HALF_SUBFRAMELEN-1;n++)
|
||||
{
|
||||
// Calculate f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]);
|
||||
tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], gQ15[k][n]);//Q15*Q15>>15 = Q15
|
||||
tmp32b= fQ15vec[n+1] + tmp32; //Q15+Q15=Q15
|
||||
tmp32 = inv_cthQ16[k]; //Q16
|
||||
t16a = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32, 16);
|
||||
t16b = (WebRtc_Word16) (tmp32-WEBRTC_SPL_LSHIFT_W32(((WebRtc_Word32)t16a), 16));
|
||||
if (t16b<0) t16a++;
|
||||
tmp32 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
|
||||
fQ15vec[n+1] = tmp32; // Q15
|
||||
|
||||
// Calculate g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1];
|
||||
tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], gQ15[k][n]); //Q15*Q15>>15 = Q15
|
||||
tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], fQ15vec[n+1]); //Q15*Q15>>15 = Q15
|
||||
tmp32 = tmp32 + tmp32b;//Q15+Q15 = Q15
|
||||
gQ15[k+1][n+1] = tmp32; // Q15
|
||||
}
|
||||
// for 0 <= n < HALF_SUBFRAMELEN - 1:
|
||||
// f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]);
|
||||
// g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1];
|
||||
WebRtcIsacfix_FilterMaLoopFix(sthQ15[k], cthQ15[k], inv_cthQ16[k],
|
||||
&gQ15[k][0], &gQ15[k+1][1], &fQ15vec[1]);
|
||||
}
|
||||
|
||||
fQ15vec[0] = fQtmp;
|
||||
|
155
src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S
Normal file
155
src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S
Normal file
@ -0,0 +1,155 @@
|
||||
@
|
||||
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
@
|
||||
@ Use of this source code is governed by a BSD-style license
|
||||
@ that can be found in the LICENSE file in the root of the source
|
||||
@ tree. An additional intellectual property rights grant can be found
|
||||
@ in the file PATENTS. All contributing project authors may
|
||||
@ be found in the AUTHORS file in the root of the source tree.
|
||||
@
|
||||
|
||||
@ lattice_neon.S
|
||||
@
|
||||
@ Contains a function for the core loop in the normalized lattice MA
|
||||
@ filter routine for iSAC codec, optimized for ARM Neon platform.
|
||||
@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
|
||||
@ int16_t input1,
|
||||
@ int32_t input2,
|
||||
@ int32_t* ptr0,
|
||||
@ int32_t* ptr1,
|
||||
@ int32_t* __restrict ptr2);
|
||||
@ It calculates
|
||||
@ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
|
||||
@ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
|
||||
@ in Q15 domain.
|
||||
@
|
||||
@ Reference code in lattice.c.
|
||||
@ Output is not bit-exact with the reference C code, due to the replacement
|
||||
@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
|
||||
@ instructions, smulwb, and smull. Speech quality was not degraded by
|
||||
@ testing speech and tone vectors.
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
#include "settings.h"
|
||||
|
||||
.global WebRtcIsacfix_FilterMaLoopNeon
|
||||
|
||||
.align 2
|
||||
|
||||
WebRtcIsacfix_FilterMaLoopNeon:
@ Register/stack arguments on entry:
@   r0 = input0, r1 = input1, r2 = input2, r3 = ptr0,
@   [sp] = ptr1, [sp, #4] = ptr2 (read below at +20/+24, after the push).
@ Processes HALF_SUBFRAMELEN - 1 samples: four per iteration in the main
@ loop, then an optional pair, then an optional single sample.
  .fnstart

  .save {r4-r8}
  push {r4-r8}                @ 5 registers = 20 bytes

  vdup.32 d28, r0             @ Initialize Neon register with input0
  vdup.32 d29, r1             @ Initialize Neon register with input1
  vdup.32 d30, r2             @ Initialize Neon register with input2
  ldr r4, [sp, #20]           @ ptr1
  ldr r12, [sp, #24]          @ ptr2

  @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2
  @ Leftover samples after the loop, in r6:
  @    r6 = (HALF_SUBFRAMELEN - 1) - (((HALF_SUBFRAMELEN - 1) >> 2) << 2)
  mov r6, #HALF_SUBFRAMELEN
  sub r6, #1
  lsr r5, r6, #2
  sub r6, r5, lsl #2

  @ Skip the unrolled loop entirely when there are fewer than four samples;
  @ the loop below tests its counter only at the bottom.
  cmp r5, #0
  beq LEFTOVER

  @ First r5 iterations in a loop.

LOOP:
  vld1.32 {d0, d1}, [r3]!     @ *ptr0

  vmull.s32 q10, d0, d28      @ tmp32a = input0 * (*ptr0)
  vmull.s32 q11, d1, d28      @ tmp32a = input0 * (*ptr0)
  vmull.s32 q12, d0, d29      @ input1 * (*ptr0)
  vmull.s32 q13, d1, d29      @ input1 * (*ptr0)

  vrshrn.i64 d4, q10, #15
  vrshrn.i64 d5, q11, #15

  vld1.32 {d2, d3}, [r12]     @ *ptr2
  vadd.i32 q3, q2, q1         @ tmp32b = *ptr2 + tmp32a

  vrshrn.i64 d0, q12, #15

  vmull.s32 q10, d6, d30      @ input2 * tmp32b
  vmull.s32 q11, d7, d30      @ input2 * tmp32b

  vrshrn.i64 d16, q10, #16
  vrshrn.i64 d17, q11, #16

  vmull.s32 q10, d16, d28     @ input0 * (*ptr2)
  vmull.s32 q11, d17, d28     @ input0 * (*ptr2)

  vrshrn.i64 d1, q13, #15
  vrshrn.i64 d18, q10, #15
  vrshrn.i64 d19, q11, #15

  vst1.32 {d16, d17}, [r12]!  @ *ptr2

  vadd.i32 q9, q0, q9
  subs r5, #1
  vst1.32 {d18, d19}, [r4]!   @ *ptr1

  bgt LOOP

  @ Check how many samples still need to be processed. r6 is in 0..3:
  @ bit 1 selects the two-sample step, bit 0 the final single sample.
  @ tst leaves r6 intact, so a leftover of exactly one sample is handled too.
LEFTOVER:
  tst r6, #2
  beq LAST_SAMPLE

  @ Process two more samples:
  vld1.32 d0, [r3]!           @ *ptr0

  vmull.s32 q11, d0, d28      @ tmp32a = input0 * (*ptr0)
  vmull.s32 q13, d0, d29      @ input1 * (*ptr0)

  vld1.32 d18, [r12]          @ *ptr2
  vrshrn.i64 d4, q11, #15

  vadd.i32 d7, d4, d18        @ tmp32b = *ptr2 + tmp32a
  vmull.s32 q11, d7, d30      @ input2 * tmp32b
  vrshrn.i64 d16, q11, #16

  vmull.s32 q11, d16, d28     @ input0 * (*ptr2)
  vst1.32 d16, [r12]!         @ *ptr2

  vrshrn.i64 d0, q13, #15
  vrshrn.i64 d19, q11, #15
  vadd.i32 d19, d0, d19

  vst1.32 d19, [r4]!          @ *ptr1

  @ If there's still one more sample, process it here.
LAST_SAMPLE:
  tst r6, #1
  beq END

  @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));

  ldr r7, [r3]                @ *ptr0
  ldr r8, [r12]               @ *ptr2

  smulwb r5, r7, r0           @ tmp32a = *ptr0 * input0 >> 16
  add r8, r8, r5, lsl #1      @ tmp32b = *ptr2 + (tmp32a << 1)
  smull r5, r6, r8, r2        @ tmp32b * input2, in 64 bits
  lsl r6, #16
  add r6, r5, lsr #16         @ Only take the middle 32 bits
  str r6, [r12]               @ Output (*ptr2, as 32 bits)

  @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);

  smulwb r5, r7, r1           @ tmp32a = *ptr0 * input1 >> 16
  smulwb r6, r6, r0           @ tmp32b = *ptr2 * input0 >> 16
  lsl r5, r5, #1
  add r5, r6, lsl #1
  str r5, [r4]                @ Output (*ptr1)

END:
  pop {r4-r8}
  bx lr

  .fnend
|
Loading…
Reference in New Issue
Block a user