Optimized an AR function in iSAC fix for ARMv7 (not Neon) platforms.

Bit exact. Speed doubled. Review URL: http://webrtc-codereview.appspot.com/327001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1392 4adac7df-926f-26a2-2b94-8c16560cd09d
2012-01-11 18:01:39 +00:00 · 2012-01-11 18:01:39 +00:00 · badf2b8044
commit badf2b8044
parent 04c18cb37a
4 changed files with 162 additions and 22 deletions
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk
@ -45,6 +45,14 @@ LOCAL_SRC_FILES := \
    spectrum_ar_model_tables.c \
    transform.c

+ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
+LOCAL_SRC_FILES += \
+    lattice_armv7.S
+else
+LOCAL_SRC_FILES += \
+    lattice_c.c
+endif
+
 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
    $(MY_WEBRTC_COMMON_DEFS)
@ -88,7 +96,7 @@ LOCAL_CFLAGS := \
 LOCAL_C_INCLUDES := \
    $(LOCAL_PATH)/../interface \
    $(LOCAL_PATH)/../../../../../.. \
-    $(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include 
+    $(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include


 ifndef NDK_ROOT
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c
@ -35,6 +35,16 @@
   case when  method 1) gave 650235648 and 2) gave 650235712.
 */

+/* Function prototype: filtering ar_g_Q0[] and ar_f_Q0[] through an AR filter
+   with coefficients cth_Q15[] and sth_Q15[].
+   Implemented for both generic and ARMv7 platforms (lattice_c.c and
+   lattice_armv7.S); the build picks exactly one of the two.

+   Both sample buffers are updated in place:
+     ar_g_Q0:    lattice state; elements 0 .. order_coef are accessed.
+     ar_f_Q0:    samples; elements 1 .. HALF_SUBFRAMELEN - 1 are filtered,
+                 element 0 is left as is.
+     cth_Q15,
+     sth_Q15:    filter coefficients; elements 0 .. order_coef - 1 are read.
+     order_coef: number of lattice stages applied to every sample.
+ */
+void WebRtcIsacfix_FilterArLoop(int16_t* ar_g_Q0,
+                                int16_t* ar_f_Q0,
+                                int16_t* cth_Q15,
+                                int16_t* sth_Q15,
+                                int16_t order_coef);
+
 /* Inner loop used for function WebRtcIsacfix_NormLatticeFilterMa().
   It does:
   for 0 <= n < HALF_SUBFRAMELEN - 1:
@ -107,14 +117,14 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,

  for (u=0;u<SUBFRAMES;u++)
  {
+    int32_t temp1 = WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN);
+
    /* set the Direct Form coefficients */
    temp2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(u, orderCoef);
    temp3 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(2, u)+lo_hi;

    /* compute lattice filter coefficients */
-    for (ii=0; ii<orderCoef; ii++) {
-      sthQ15[ii] = filt_coefQ15[temp2+ii];
-    }
+    memcpy(sthQ15, &filt_coefQ15[temp2], orderCoef * sizeof(WebRtc_Word16));

    WebRtcSpl_SqrtOfOneMinusXSquared(sthQ15, orderCoef, cthQ15);

@ -136,8 +146,8 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
    /* initial conditions */
    for (i=0;i<HALF_SUBFRAMELEN;i++)
    {
-      fQ15vec[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)lat_inQ0[i + WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN)], 15); //Q15
-      gQ15[0][i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)lat_inQ0[i + WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN)], 15); //Q15
+      fQ15vec[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)lat_inQ0[i + temp1], 15); //Q15
+      gQ15[0][i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)lat_inQ0[i + temp1], 15); //Q15
    }


@ -182,7 +192,7 @@ void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
      tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(gain16, fQ15vec[n]); //Q(1+gain_sh)*Q15>>16 = Q(gain_sh)
      sh = 9-gain_sh; //number of needed shifts to reach Q9
      t16a = (WebRtc_Word16) WEBRTC_SPL_SHIFT_W32(tmp32, sh);
-      lat_outQ9[n + WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN)] = t16a;
+      lat_outQ9[n + temp1] = t16a;
    }

    /* save the states */
@ -230,6 +240,8 @@ void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,

  for (u=0;u<SUBFRAMES;u++)
  {
+    int32_t temp1 = WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN);
+
    //set the denominator and numerator of the Direct Form
    temp2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(u, orderCoef);
    temp3 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(2, u) + lo_hi;
@ -262,7 +274,7 @@ void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
    for (i=0;i<HALF_SUBFRAMELEN;i++)
    {

-      tmp32 = WEBRTC_SPL_LSHIFT_W32(lat_inQ25[i + WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN)], 1); //Q25->Q26
+      tmp32 = WEBRTC_SPL_LSHIFT_W32(lat_inQ25[i + temp1], 1); //Q25->Q26
      tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(inv_gain16, tmp32); //lat_in[]*inv_gain in (Q(18-sh)*Q26)>>16 = Q(28-sh)
      tmp32 = WEBRTC_SPL_SHIFT_W32(tmp32, -(28-sh)); // lat_in[]*inv_gain in Q0

@ -280,23 +292,12 @@ void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
    }
    ARgQ0vec[0] = ARfQ0vec[0];

-    for(n=0;n<HALF_SUBFRAMELEN-1;n++)
-    {
-      tmpAR = ARfQ0vec[n+1];
-      for(k=orderCoef-1;k>=0;k--)
-      {
-        tmp32 = WEBRTC_SPL_RSHIFT_W32(((WEBRTC_SPL_MUL_16_16(cthQ15[k], tmpAR)) - (WEBRTC_SPL_MUL_16_16(sthQ15[k], ARgQ0vec[k])) + 16384), 15);
-        tmp32_2 = WEBRTC_SPL_RSHIFT_W32(((WEBRTC_SPL_MUL_16_16(sthQ15[k], tmpAR)) + (WEBRTC_SPL_MUL_16_16(cthQ15[k], ARgQ0vec[k])) + 16384), 15);
-        tmpAR   = (WebRtc_Word16)WebRtcSpl_SatW32ToW16(tmp32); // Q0
-        ARgQ0vec[k+1] = (WebRtc_Word16)WebRtcSpl_SatW32ToW16(tmp32_2); // Q0
-      }
-      ARfQ0vec[n+1] = tmpAR;
-      ARgQ0vec[0] = tmpAR;
-    }
+    // Filter ARgQ0vec[] and ARfQ0vec[] through coefficients cthQ15[] and sthQ15[].
+    WebRtcIsacfix_FilterArLoop(ARgQ0vec, ARfQ0vec, cthQ15, sthQ15, orderCoef);

    for(n=0;n<HALF_SUBFRAMELEN;n++)
    {
-      lat_outQ0[n + WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN)] = ARfQ0vec[n];
+      lat_outQ0[n + temp1] = ARfQ0vec[n];
    }


--- a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_armv7.S
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_armv7.S
@ -0,0 +1,82 @@
+@
+@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ Contains a function for the core loop in the normalized lattice AR
+@ filter routine for iSAC codec, optimized for ARMv7 platforms.
+@
+@ Output is bit-exact with the reference C code in lattice_c.c
+@
+@ Register usage:
+@
+@ r0:  &ar_g_Q0
+@ r1:  &ar_f_Q0
+@ r2:  &cth_Q15
+@ r3:  &sth_Q15
+@ r4:  outer loop counter
+@ r5:  tmpAR
+@ r9:  inner loop counter
+@ r12: constant #16384
+@ r6, r7, r8, r10, r11: scratch
+
+#include "settings.h"
+
+.arch armv7-a
+.global WebRtcIsacfix_FilterArLoop
+.align  2
+
+@ WebRtcIsacfix_FilterArLoop(ar_g_Q0, ar_f_Q0, cth_Q15, sth_Q15, order_coef)
+@
+@ For each of the HALF_SUBFRAMELEN - 1 samples ar_f_Q0[1..], runs the sample
+@ through the lattice stages k = order_coef - 1 down to 0, updating
+@ ar_g_Q0[k+1] at every stage, then stores the result to both ar_f_Q0[n+1]
+@ and ar_g_Q0[0]. All buffers hold 16-bit values and are updated in place.
+@
+@ NOTE(review): the inner loop tests its counter at the bottom, so its body
+@ always executes at least once; this presumes order_coef >= 1 - confirm
+@ against the callers.
+WebRtcIsacfix_FilterArLoop:
+.fnstart
+
+.save {r4-r11}
+  push    {r4-r11}               @ 8 registers = 32 bytes pushed on the stack
+
+  add     r1, #2                 @ &ar_f_Q0[1]
+  mov     r12, #16384            @ Constant added to each sum before the >> 15
+  mov     r4, #HALF_SUBFRAMELEN
+  sub     r4, #1                 @ Outer loop counter = HALF_SUBFRAMELEN - 1
+
+HALF_SUBFRAME_LOOP:  @ for(n = 0; n < HALF_SUBFRAMELEN - 1; n++)
+
+  @ order_coef is read from the stack, 32 bytes above sp because of the push.
+  @ r0, r2 and r3 are back at their array bases at this point (each one was
+  @ stepped down by 2 bytes per inner iteration), so adding 2 * order_coef
+  @ moves them one element past the last one used.
+  ldr     r9, [sp, #32]          @ Restore the inner loop counter to order_coef
+  ldrh    r5, [r1]               @ tmpAR = ar_f_Q0[n+1]
+  add     r0, r9, asl #1         @ Restore r0 to &ar_g_Q0[order_coef]
+  add     r2, r9, asl #1         @ Restore r2 to &cth_Q15[order_coef]
+  add     r3, r9, asl #1         @ Restore r3 to &sth_Q15[order_coef]
+
+ORDER_COEF_LOOP:  @ for(k = order_coef - 1 ; k >= 0; k--)
+
+  @ Pre-decrement with writeback: r3 and r2 step down to element k.
+  ldrh    r7, [r3, #-2]!         @ sth_Q15[k]
+  ldrh    r6, [r2, #-2]!         @ cth_Q15[k]
+
+  ldrh    r8, [r0, #-2]          @ ar_g_Q0[k]
+  smlabb  r11, r7, r5, r12       @ sth_Q15[k] * tmpAR + 16384
+  smlabb  r10, r6, r5, r12       @ cth_Q15[k] * tmpAR + 16384
+  smulbb  r7, r7, r8             @ sth_Q15[k] * ar_g_Q0[k]
+  smlabb  r11, r6, r8, r11       @ cth_Q15[k]*ar_g_Q0[k]+(sth_Q15[k]*tmpAR+16384)
+
+  sub     r10, r10, r7           @ cth_Q15[k]*tmpAR+16384-(sth_Q15[k]*ar_g_Q0[k])
+  ssat    r11, #16, r11, asr #15 @ New ar_g_Q0[k+1]: (r11 >> 15) saturated to 16 bits
+  ssat    r5, #16, r10, asr #15  @ New tmpAR: (r10 >> 15) saturated to 16 bits
+  strh    r11, [r0], #-2         @ Output: ar_g_Q0[k+1]
+
+  subs    r9, #1                 @ One stage done
+  bgt     ORDER_COEF_LOOP        @ Repeat while the stage counter is > 0
+
+  strh    r5, [r0]               @ Output: ar_g_Q0[0] = tmpAR;
+  strh    r5, [r1], #2           @ Output: ar_f_Q0[n+1] = tmpAR;
+
+  subs    r4, #1                 @ One sample done
+  bne     HALF_SUBFRAME_LOOP     @ Repeat until the sample counter reaches 0
+
+  pop     {r4-r11}
+  bx      lr
+
+.fnend
+
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_c.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_c.c
@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Contains the core loop function for the lattice filter AR routine
+ * for iSAC codec.
+ *
+ */
+
+#include "settings.h"
+#include "signal_processing_library.h"
+#include "typedefs.h"
+
+/* Core loop of the normalized lattice AR filter, operating in place.
+ *
+ * Every sample ar_f_Q0[1] .. ar_f_Q0[HALF_SUBFRAMELEN - 1] is passed through
+ * order_coef lattice stages, from stage order_coef - 1 down to stage 0.
+ * Stage k combines the running sample with the state ar_g_Q0[k] using
+ * cth_Q15[k] and sth_Q15[k], and stores the updated state in ar_g_Q0[k + 1].
+ * The filtered sample is written back to ar_f_Q0[] and also to ar_g_Q0[0].
+ * ar_f_Q0[0] is neither read nor written.
+ */
+void WebRtcIsacfix_FilterArLoop(int16_t* ar_g_Q0,     // Lattice state, in/out
+                                int16_t* ar_f_Q0,     // Samples, in/out
+                                int16_t* cth_Q15,     // Filter coefficients
+                                int16_t* sth_Q15,     // Filter coefficients
+                                int16_t order_coef) { // Number of stages
+  int sample;
+
+  for (sample = 1; sample < HALF_SUBFRAMELEN; sample++) {
+    int16_t forward = ar_f_Q0[sample];
+    int stage;
+
+    /* stage counts order_coef .. 1; the coefficient/state index is stage - 1. */
+    for (stage = order_coef; stage > 0; stage--) {
+      const int16_t cos_k = cth_Q15[stage - 1];
+      const int16_t sin_k = sth_Q15[stage - 1];
+      const int16_t backward = ar_g_Q0[stage - 1];
+
+      /* 16384 is added ahead of the shift by 15 so the result is rounded. */
+      const int32_t forward_sum = WEBRTC_SPL_MUL_16_16(cos_k, forward)
+                                  - WEBRTC_SPL_MUL_16_16(sin_k, backward)
+                                  + 16384;
+      const int32_t backward_sum = WEBRTC_SPL_MUL_16_16(sin_k, forward)
+                                   + WEBRTC_SPL_MUL_16_16(cos_k, backward)
+                                   + 16384;
+
+      forward = WebRtcSpl_SatW32ToW16(WEBRTC_SPL_RSHIFT_W32(forward_sum, 15));
+      ar_g_Q0[stage] =
+          WebRtcSpl_SatW32ToW16(WEBRTC_SPL_RSHIFT_W32(backward_sum, 15));
+    }
+
+    ar_f_Q0[sample] = forward;
+    ar_g_Q0[0] = forward;
+  }
+}