Optimized an AR function in iSAC fix for ARMv7 (not Neon) platforms.

Bit exact. Speed doubled.
Review URL: http://webrtc-codereview.appspot.com/327001

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1392 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org 2012-01-11 18:01:39 +00:00
parent 04c18cb37a
commit badf2b8044
4 changed files with 162 additions and 22 deletions

View File

@ -45,6 +45,14 @@ LOCAL_SRC_FILES := \
spectrum_ar_model_tables.c \ spectrum_ar_model_tables.c \
transform.c transform.c
ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
LOCAL_SRC_FILES += \
lattice_armv7.S
else
LOCAL_SRC_FILES += \
lattice_c.c
endif
# Flags passed to both C and C++ files. # Flags passed to both C and C++ files.
LOCAL_CFLAGS := \ LOCAL_CFLAGS := \
$(MY_WEBRTC_COMMON_DEFS) $(MY_WEBRTC_COMMON_DEFS)
@ -88,7 +96,7 @@ LOCAL_CFLAGS := \
LOCAL_C_INCLUDES := \ LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/../interface \ $(LOCAL_PATH)/../interface \
$(LOCAL_PATH)/../../../../../.. \ $(LOCAL_PATH)/../../../../../.. \
$(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include $(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include
ifndef NDK_ROOT ifndef NDK_ROOT

View File

@ -35,6 +35,16 @@
case when method 1) gave 650235648 and 2) gave 650235712. case when method 1) gave 650235648 and 2) gave 650235712.
*/ */
/* Filters ar_g_Q0[] and ar_f_Q0[] in place through an AR lattice filter with
coefficients cth_Q15[] and sth_Q15[]; order_coef is the filter order.
Two implementations exist, a generic C one and an ARMv7 assembly one, and the
build compiles exactly one of them.
*/
void WebRtcIsacfix_FilterArLoop(int16_t* ar_g_Q0,
int16_t* ar_f_Q0,
int16_t* cth_Q15,
int16_t* sth_Q15,
int16_t order_coef);
/* Inner loop used for function WebRtcIsacfix_NormLatticeFilterMa(). /* Inner loop used for function WebRtcIsacfix_NormLatticeFilterMa().
It does: It does:
for 0 <= n < HALF_SUBFRAMELEN - 1: for 0 <= n < HALF_SUBFRAMELEN - 1:
@ -107,14 +117,14 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
for (u=0;u<SUBFRAMES;u++) for (u=0;u<SUBFRAMES;u++)
{ {
int32_t temp1 = WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN);
/* set the Direct Form coefficients */ /* set the Direct Form coefficients */
temp2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(u, orderCoef); temp2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(u, orderCoef);
temp3 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(2, u)+lo_hi; temp3 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(2, u)+lo_hi;
/* compute lattice filter coefficients */ /* compute lattice filter coefficients */
for (ii=0; ii<orderCoef; ii++) { memcpy(sthQ15, &filt_coefQ15[temp2], orderCoef * sizeof(WebRtc_Word16));
sthQ15[ii] = filt_coefQ15[temp2+ii];
}
WebRtcSpl_SqrtOfOneMinusXSquared(sthQ15, orderCoef, cthQ15); WebRtcSpl_SqrtOfOneMinusXSquared(sthQ15, orderCoef, cthQ15);
@ -136,8 +146,8 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
/* initial conditions */ /* initial conditions */
for (i=0;i<HALF_SUBFRAMELEN;i++) for (i=0;i<HALF_SUBFRAMELEN;i++)
{ {
fQ15vec[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)lat_inQ0[i + WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN)], 15); //Q15 fQ15vec[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)lat_inQ0[i + temp1], 15); //Q15
gQ15[0][i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)lat_inQ0[i + WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN)], 15); //Q15 gQ15[0][i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)lat_inQ0[i + temp1], 15); //Q15
} }
@ -182,7 +192,7 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(gain16, fQ15vec[n]); //Q(1+gain_sh)*Q15>>16 = Q(gain_sh) tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(gain16, fQ15vec[n]); //Q(1+gain_sh)*Q15>>16 = Q(gain_sh)
sh = 9-gain_sh; //number of needed shifts to reach Q9 sh = 9-gain_sh; //number of needed shifts to reach Q9
t16a = (WebRtc_Word16) WEBRTC_SPL_SHIFT_W32(tmp32, sh); t16a = (WebRtc_Word16) WEBRTC_SPL_SHIFT_W32(tmp32, sh);
lat_outQ9[n + WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN)] = t16a; lat_outQ9[n + temp1] = t16a;
} }
/* save the states */ /* save the states */
@ -230,6 +240,8 @@ void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
for (u=0;u<SUBFRAMES;u++) for (u=0;u<SUBFRAMES;u++)
{ {
int32_t temp1 = WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN);
//set the denominator and numerator of the Direct Form //set the denominator and numerator of the Direct Form
temp2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(u, orderCoef); temp2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(u, orderCoef);
temp3 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(2, u) + lo_hi; temp3 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(2, u) + lo_hi;
@ -262,7 +274,7 @@ void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
for (i=0;i<HALF_SUBFRAMELEN;i++) for (i=0;i<HALF_SUBFRAMELEN;i++)
{ {
tmp32 = WEBRTC_SPL_LSHIFT_W32(lat_inQ25[i + WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN)], 1); //Q25->Q26 tmp32 = WEBRTC_SPL_LSHIFT_W32(lat_inQ25[i + temp1], 1); //Q25->Q26
tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(inv_gain16, tmp32); //lat_in[]*inv_gain in (Q(18-sh)*Q26)>>16 = Q(28-sh) tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(inv_gain16, tmp32); //lat_in[]*inv_gain in (Q(18-sh)*Q26)>>16 = Q(28-sh)
tmp32 = WEBRTC_SPL_SHIFT_W32(tmp32, -(28-sh)); // lat_in[]*inv_gain in Q0 tmp32 = WEBRTC_SPL_SHIFT_W32(tmp32, -(28-sh)); // lat_in[]*inv_gain in Q0
@ -280,23 +292,12 @@ void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
} }
ARgQ0vec[0] = ARfQ0vec[0]; ARgQ0vec[0] = ARfQ0vec[0];
for(n=0;n<HALF_SUBFRAMELEN-1;n++) // Filter ARgQ0vec[] and ARfQ0vec[] through coefficients cthQ15[] and sthQ15[].
{ WebRtcIsacfix_FilterArLoop(ARgQ0vec, ARfQ0vec, cthQ15, sthQ15, orderCoef);
tmpAR = ARfQ0vec[n+1];
for(k=orderCoef-1;k>=0;k--)
{
tmp32 = WEBRTC_SPL_RSHIFT_W32(((WEBRTC_SPL_MUL_16_16(cthQ15[k], tmpAR)) - (WEBRTC_SPL_MUL_16_16(sthQ15[k], ARgQ0vec[k])) + 16384), 15);
tmp32_2 = WEBRTC_SPL_RSHIFT_W32(((WEBRTC_SPL_MUL_16_16(sthQ15[k], tmpAR)) + (WEBRTC_SPL_MUL_16_16(cthQ15[k], ARgQ0vec[k])) + 16384), 15);
tmpAR = (WebRtc_Word16)WebRtcSpl_SatW32ToW16(tmp32); // Q0
ARgQ0vec[k+1] = (WebRtc_Word16)WebRtcSpl_SatW32ToW16(tmp32_2); // Q0
}
ARfQ0vec[n+1] = tmpAR;
ARgQ0vec[0] = tmpAR;
}
for(n=0;n<HALF_SUBFRAMELEN;n++) for(n=0;n<HALF_SUBFRAMELEN;n++)
{ {
lat_outQ0[n + WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN)] = ARfQ0vec[n]; lat_outQ0[n + temp1] = ARfQ0vec[n];
} }

View File

@ -0,0 +1,82 @@
@
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Contains a function for the core loop in the normalized lattice AR
@ filter routine for iSAC codec, optimized for ARMv7 platforms.
@
@ Output is bit-exact with the reference C code in lattice_c.c
@
@ Register usage:
@
@ r0: &ar_g_Q0
@ r1: &ar_f_Q0
@ r2: &cth_Q15
@ r3: &sth_Q15
@ r4: outer loop counter
@ r5: tmpAR
@ r9: inner loop counter
@ r12: constant #16384
@ r6, r7, r8, r10, r11: scratch
#include "settings.h"
.arch armv7-a
.global WebRtcIsacfix_FilterArLoop
.align 2
@ void WebRtcIsacfix_FilterArLoop(int16_t* ar_g_Q0, int16_t* ar_f_Q0,
@                                 int16_t* cth_Q15, int16_t* sth_Q15,
@                                 int16_t order_coef);
@
@ Runs ar_f_Q0[1 .. HALF_SUBFRAMELEN-1] through the lattice stages in place.
@ For every sample the stages are visited from k = order_coef-1 down to 0:
@   tmpAR        = sat16((cth[k]*tmpAR - sth[k]*ar_g[k] + 16384) >> 15)
@   ar_g_Q0[k+1] = sat16((sth[k]*tmpAR + cth[k]*ar_g[k] + 16384) >> 15)
@ (both right-hand sides use the tmpAR value from before the update), and
@ the final tmpAR is stored to both ar_g_Q0[0] and ar_f_Q0[n+1].
@
@ NOTE(review): the inner loop is tested at the bottom, so its body runs once
@ even if order_coef <= 0, whereas a C for-loop would run zero times. Callers
@ presumably always pass order_coef >= 1 -- confirm.
WebRtcIsacfix_FilterArLoop:
.fnstart
.save {r4-r11}
push {r4-r11} @ 8 registers = 32 bytes, see the [sp, #32] load below
add r1, #2 @ &ar_f_Q0[1]
mov r12, #16384 @ Rounding term (1 << 14) added before the ">> 15"
mov r4, #HALF_SUBFRAMELEN
sub r4, #1 @ Outer loop counter = HALF_SUBFRAMELEN - 1
HALF_SUBFRAME_LOOP: @ for(n = 0; n < HALF_SUBFRAMELEN - 1; n++)
@ order_coef is read from the stack, just above the 32 bytes pushed on entry.
ldr r9, [sp, #32] @ Restore the inner loop counter to order_coef
ldrh r5, [r1] @ tmpAR = ar_f_Q0[n+1]
@ r0, r2 and r3 point at element 0 here; advance each by order_coef
@ halfwords so the inner loop can walk them backwards.
add r0, r9, asl #1 @ Restore r0 to &ar_g_Q0[order_coef]
add r2, r9, asl #1 @ Restore r2 to &cth_Q15[order_coef]
add r3, r9, asl #1 @ Restore r3 to &sth_Q15[order_coef]
ORDER_COEF_LOOP: @ for(k = order_coef - 1 ; k >= 0; k--)
@ Pre-indexed loads with write-back step r3 and r2 down by one halfword;
@ r0 is only stepped by the post-indexed store further down.
ldrh r7, [r3, #-2]! @ sth_Q15[k]
ldrh r6, [r2, #-2]! @ cth_Q15[k]
ldrh r8, [r0, #-2] @ ar_g_Q0[k]
@ The *bb multiplies use the bottom halfword of each operand as a signed
@ 16-bit value, so the zero-extension done by ldrh does not matter.
smlabb r11, r7, r5, r12 @ sth_Q15[k] * tmpAR + 16384
smlabb r10, r6, r5, r12 @ cth_Q15[k] * tmpAR + 16384
smulbb r7, r7, r8 @ sth_Q15[k] * ar_g_Q0[k]
smlabb r11, r6, r8, r11 @ cth_Q15[k]*ar_g_Q0[k]+(sth_Q15[k]*tmpAR+16384)
sub r10, r10, r7 @ cth_Q15[k]*tmpAR+16384-(sth_Q15[k]*ar_g_Q0[k])
ssat r11, #16, r11, asr #15 @ New ar_g_Q0[k+1]: >> 15, saturated to 16 bits
ssat r5, #16, r10, asr #15 @ New tmpAR: >> 15, saturated to 16 bits
strh r11, [r0], #-2 @ Output: ar_g_Q0[k+1]
subs r9, #1
bgt ORDER_COEF_LOOP
@ After order_coef passes, r0, r2 and r3 are back at element 0.
strh r5, [r0] @ Output: ar_g_Q0[0] = tmpAR;
strh r5, [r1], #2 @ Output: ar_f_Q0[n+1] = tmpAR;
subs r4, #1
bne HALF_SUBFRAME_LOOP
pop {r4-r11}
bx lr
.fnend

View File

@ -0,0 +1,49 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* Contains the core loop function for the lattice filter AR routine
* for iSAC codec.
*
*/
#include "settings.h"
#include "signal_processing_library.h"
#include "typedefs.h"
/* Core loop of the lattice AR filter: filters ar_f_Q0[] through order_coef
 * lattice stages, keeping the per-stage state in ar_g_Q0[].
 *
 * For every sample index 1 .. HALF_SUBFRAMELEN - 1 the sample is passed
 * through the stages from k = order_coef - 1 down to 0:
 *   f'         = sat16((cth[k] * f - sth[k] * g[k] + 16384) >> 15)
 *   g[k + 1]   = sat16((sth[k] * f + cth[k] * g[k] + 16384) >> 15)
 * where sat16 is WebRtcSpl_SatW32ToW16(). The value leaving stage 0 is
 * written back to ar_f_Q0[] and also becomes ar_g_Q0[0].
 *
 * ar_g_Q0:    state; elements 0 .. order_coef - 1 are read and elements
 *             0 .. order_coef are written.
 * ar_f_Q0:    samples; elements 1 .. HALF_SUBFRAMELEN - 1 are filtered in
 *             place, element 0 is not touched.
 * cth_Q15:    order_coef coefficients, read only.
 * sth_Q15:    order_coef coefficients, read only.
 * order_coef: number of lattice stages.
 */
void WebRtcIsacfix_FilterArLoop(int16_t* ar_g_Q0,
                                int16_t* ar_f_Q0,
                                int16_t* cth_Q15,
                                int16_t* sth_Q15,
                                int16_t order_coef) {
  int sample;

  for (sample = 1; sample < HALF_SUBFRAMELEN; sample++) {
    int16_t f_val = ar_f_Q0[sample];
    int stage = order_coef;

    /* Walk the stages from the last one down to stage 0. */
    while (stage-- > 0) {
      const int16_t cth = cth_Q15[stage];
      const int16_t sth = sth_Q15[stage];
      const int16_t g_val = ar_g_Q0[stage];

      /* Both sums use the sample value from before this stage's update;
       * 16384 rounds the result of the shift by 15. */
      int32_t f_acc = WEBRTC_SPL_MUL_16_16(cth, f_val)
          - WEBRTC_SPL_MUL_16_16(sth, g_val) + 16384;
      int32_t g_acc = WEBRTC_SPL_MUL_16_16(sth, f_val)
          + WEBRTC_SPL_MUL_16_16(cth, g_val) + 16384;

      f_val = (WebRtc_Word16)WebRtcSpl_SatW32ToW16(
          WEBRTC_SPL_RSHIFT_W32(f_acc, 15));
      ar_g_Q0[stage + 1] = (WebRtc_Word16)WebRtcSpl_SatW32ToW16(
          WEBRTC_SPL_RSHIFT_W32(g_acc, 15));
    }

    ar_f_Q0[sample] = f_val;
    ar_g_Q0[0] = f_val;
  }
}