In spl, introduced function WebRtcSpl_SatW32ToW16() and changed file resample_by_2.c, both for optimization on ARMv7.

Review URL: http://webrtc-codereview.appspot.com/140010 git-svn-id: http://webrtc.googlecode.com/svn/trunk@649 4adac7df-926f-26a2-2b94-8c16560cd09d
2011-09-26 16:35:25 +00:00
parent e185e9f68a
commit 961885a8bb
7 changed files with 179 additions and 165 deletions
--- a/src/common_audio/signal_processing_library/main/interface/signal_processing_library.h
+++ b/src/common_audio/signal_processing_library/main/interface/signal_processing_library.h
@@ -1659,6 +1659,17 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band,
 //      - out_data      : Super-wideband speech signal, 0-16 kHz
 //

+// WebRtc_Word16 WebRtcSpl_SatW32ToW16(...)
+//
+// This function saturates a 32-bit word into a 16-bit word.
+// 
+// Input:
+//      - value32   : The value of a 32-bit word.
+//
+// Output:
+//      - out16     : the saturated 16-bit word.
+//
+
 // WebRtc_Word16 WebRtcSpl_get_version(...)
 //
 // This function gives the version string of the Signal Processing Library.
--- a/src/common_audio/signal_processing_library/main/interface/spl_inl.h
+++ b/src/common_audio/signal_processing_library/main/interface/spl_inl.h
@@ -19,16 +19,20 @@
 #include "spl_inl_armv7.h"
 #else

+static __inline WebRtc_Word16 WebRtcSpl_SatW32ToW16(WebRtc_Word32 value32) {
+  WebRtc_Word16 out16 = (WebRtc_Word16) value32;  // kept as-is when in range
+
+  if (value32 > 32767)
+    out16 = 32767;  // clamp to the largest signed 16-bit value
+  else if (value32 < -32768)
+    out16 = -32768;  // clamp to the smallest signed 16-bit value
+
+  return out16;
+}
+
 static __inline WebRtc_Word16 WebRtcSpl_AddSatW16(WebRtc_Word16 a,
                                                  WebRtc_Word16 b) {
-  WebRtc_Word32 s_sum = (WebRtc_Word32) a + (WebRtc_Word32) b;
-
-  if (s_sum > WEBRTC_SPL_WORD16_MAX)
-    s_sum = WEBRTC_SPL_WORD16_MAX;
-  else if (s_sum < WEBRTC_SPL_WORD16_MIN)
-    s_sum = WEBRTC_SPL_WORD16_MIN;
-
-  return (WebRtc_Word16)s_sum;
+  return WebRtcSpl_SatW32ToW16((WebRtc_Word32) a + (WebRtc_Word32) b);
 }

 static __inline WebRtc_Word32 WebRtcSpl_AddSatW32(WebRtc_Word32 l_var1,
@@ -54,24 +58,7 @@ static __inline WebRtc_Word32 WebRtcSpl_AddSatW32(WebRtc_Word32 l_var1,

 static __inline WebRtc_Word16 WebRtcSpl_SubSatW16(WebRtc_Word16 var1,
                                                  WebRtc_Word16 var2) {
-  WebRtc_Word32 l_diff;
-  WebRtc_Word16 s_diff;
-
-  // perform subtraction
-  l_diff = (WebRtc_Word32)var1 - (WebRtc_Word32)var2;
-
-  // default setting
-  s_diff = (WebRtc_Word16) l_diff;
-
-  // check for overflow
-  if (l_diff > (WebRtc_Word32)32767)
-  s_diff = (WebRtc_Word16)32767;
-
-  // check for underflow
-  if (l_diff < (WebRtc_Word32)-32768)
-  s_diff = (WebRtc_Word16)-32768;
-
-  return s_diff;
+  return WebRtcSpl_SatW32ToW16((WebRtc_Word32) var1 - (WebRtc_Word32) var2);
 }

 static __inline WebRtc_Word32 WebRtcSpl_SubSatW32(WebRtc_Word32 l_var1,
--- a/src/common_audio/signal_processing_library/main/interface/spl_inl_armv7.h
+++ b/src/common_audio/signal_processing_library/main/interface/spl_inl_armv7.h
@@ -119,4 +119,11 @@ static __inline int WebRtcSpl_NormW16(WebRtc_Word16 a) {
  return tmp - 17;
 }

+static __inline WebRtc_Word16 WebRtcSpl_SatW32ToW16(WebRtc_Word32 value32) {
+  WebRtc_Word16 out16;
+  // ssat with #16 saturates value32 to the signed 16-bit range.
+  __asm__("ssat %r0, #16, %r1" : "=r"(out16) : "r"(value32));
+
+  return out16;
+}
 #endif  // WEBRTC_SPL_SPL_INL_ARMV7_H_
--- a/src/common_audio/signal_processing_library/main/source/downsample_fast.c
+++ b/src/common_audio/signal_processing_library/main/source/downsample_fast.c
@@ -52,7 +52,7 @@ int WebRtcSpl_DownsampleFast(WebRtc_Word16 *in_ptr, WebRtc_Word16 in_length,

        // If output is higher than 32768, saturate it. Same with negative side

-        *downsampled_ptr++ = (WebRtc_Word16)WEBRTC_SPL_SAT(32767, o, -32768);
+        *downsampled_ptr++ = WebRtcSpl_SatW32ToW16(o);
    }

    return 0;
--- a/src/common_audio/signal_processing_library/main/source/resample_by_2.c
+++ b/src/common_audio/signal_processing_library/main/source/resample_by_2.c
@@ -17,154 +17,165 @@

 #include "signal_processing_library.h"

+#ifdef WEBRTC_ARCH_ARM_V7A
+
+// allpass filter coefficients (MUL_ACCUM_2 entries are pre-shifted by 15).
+static const WebRtc_UWord32 kResampleAllpass1[3] = {3284, 24441, 49528 << 15};
+static const WebRtc_UWord32 kResampleAllpass2[3] =
+  {12199, 37471 << 15, 60255 << 15};
+
+// Multiply the 32-bit diff by the low 16 bits of tbl_value (smlawb treats
+// them as signed) and accumulate: state + ((diff * tbl_value) >> 16).
+// NOTE(review): tbl_value presumably must fit in a signed 16-bit range.
+static __inline WebRtc_Word32 MUL_ACCUM_1(WebRtc_Word32 tbl_value,
+                                          WebRtc_Word32 diff,
+                                          WebRtc_Word32 state) {
+  WebRtc_Word32 result;
+  __asm__("smlawb %r0, %r1, %r2, %r3": "=r"(result): "r"(diff),
+                                       "r"(tbl_value), "r"(state));
+  return result;
+}
+
+// Multiply two 32-bit values and accumulate to another input value.
+// Return: state + (((diff << 1) * tbl_value) >> 32)
+//
+// This variant exists for coefficients whose range rules out the smlawb
+// instruction used in MUL_ACCUM_1; smmla still saves some cycles. Callers
+// pass tbl_value pre-shifted left by 15 (see the coefficient tables).
+// NOTE(review): diff << 1 on a negative diff relies on compiler behavior.
+static __inline WebRtc_Word32 MUL_ACCUM_2(WebRtc_Word32 tbl_value,
+                                          WebRtc_Word32 diff,
+                                          WebRtc_Word32 state) {
+  WebRtc_Word32 result;
+  __asm__("smmla %r0, %r1, %r2, %r3": "=r"(result): "r"(diff << 1),
+                                      "r"(tbl_value), "r"(state));
+  return result;
+}
+
+#else
+
 // allpass filter coefficients.
 static const WebRtc_UWord16 kResampleAllpass1[3] = {3284, 24441, 49528};
 static const WebRtc_UWord16 kResampleAllpass2[3] = {12199, 37471, 60255};

+// Multiply a 32-bit value with a 16-bit value and accumulate to another input:
+#define MUL_ACCUM_1(a, b, c) WEBRTC_SPL_SCALEDIFF32(a, b, c)
+#define MUL_ACCUM_2(a, b, c) WEBRTC_SPL_SCALEDIFF32(a, b, c)
+
+#endif  // WEBRTC_ARCH_ARM_V7A
+
+
 // decimator
 void WebRtcSpl_DownsampleBy2(const WebRtc_Word16* in, const WebRtc_Word16 len,
-                             WebRtc_Word16* out, WebRtc_Word32* filtState)
-{
-    WebRtc_Word32 tmp1, tmp2, diff, in32, out32;
-    WebRtc_Word16 i;
+                             WebRtc_Word16* out, WebRtc_Word32* filtState) {
+  WebRtc_Word32 tmp1, tmp2, diff, in32, out32;
+  WebRtc_Word16 i;

-    register WebRtc_Word32 state0 = filtState[0];
-    register WebRtc_Word32 state1 = filtState[1];
-    register WebRtc_Word32 state2 = filtState[2];
-    register WebRtc_Word32 state3 = filtState[3];
-    register WebRtc_Word32 state4 = filtState[4];
-    register WebRtc_Word32 state5 = filtState[5];
-    register WebRtc_Word32 state6 = filtState[6];
-    register WebRtc_Word32 state7 = filtState[7];
+  register WebRtc_Word32 state0 = filtState[0];
+  register WebRtc_Word32 state1 = filtState[1];
+  register WebRtc_Word32 state2 = filtState[2];
+  register WebRtc_Word32 state3 = filtState[3];
+  register WebRtc_Word32 state4 = filtState[4];
+  register WebRtc_Word32 state5 = filtState[5];
+  register WebRtc_Word32 state6 = filtState[6];
+  register WebRtc_Word32 state7 = filtState[7];

-    for (i = (len >> 1); i > 0; i--)
-    {
-        // lower allpass filter
-        in32 = (WebRtc_Word32)(*in++) << 10;
-        diff = in32 - state1;
-        tmp1 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass2[0], diff, state0);
-        state0 = in32;
-        diff = tmp1 - state2;
-        tmp2 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass2[1], diff, state1);
-        state1 = tmp1;
-        diff = tmp2 - state3;
-        state3 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass2[2], diff, state2);
-        state2 = tmp2;
+  for (i = (len >> 1); i > 0; i--) {
+    // lower allpass filter
+    in32 = (WebRtc_Word32)(*in++) << 10;
+    diff = in32 - state1;
+    tmp1 = MUL_ACCUM_1(kResampleAllpass2[0], diff, state0);
+    state0 = in32;
+    diff = tmp1 - state2;
+    tmp2 = MUL_ACCUM_2(kResampleAllpass2[1], diff, state1);
+    state1 = tmp1;
+    diff = tmp2 - state3;
+    state3 = MUL_ACCUM_2(kResampleAllpass2[2], diff, state2);
+    state2 = tmp2;

-        // upper allpass filter
-        in32 = (WebRtc_Word32)(*in++) << 10;
-        diff = in32 - state5;
-        tmp1 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass1[0], diff, state4);
-        state4 = in32;
-        diff = tmp1 - state6;
-        tmp2 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass1[1], diff, state5);
-        state5 = tmp1;
-        diff = tmp2 - state7;
-        state7 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass1[2], diff, state6);
-        state6 = tmp2;
+    // upper allpass filter
+    in32 = (WebRtc_Word32)(*in++) << 10;
+    diff = in32 - state5;
+    tmp1 = MUL_ACCUM_1(kResampleAllpass1[0], diff, state4);
+    state4 = in32;
+    diff = tmp1 - state6;
+    tmp2 = MUL_ACCUM_1(kResampleAllpass1[1], diff, state5);
+    state5 = tmp1;
+    diff = tmp2 - state7;
+    state7 = MUL_ACCUM_2(kResampleAllpass1[2], diff, state6);
+    state6 = tmp2;

-        // add two allpass outputs, divide by two and round
-        out32 = (state3 + state7 + 1024) >> 11;
+    // add two allpass outputs, divide by two and round
+    out32 = (state3 + state7 + 1024) >> 11;

-        // limit amplitude to prevent wrap-around, and write to output array
-#ifdef WEBRTC_ARCH_ARM_V7A
-        __asm__("ssat %r0, #16, %r1" : "=r"(*out) : "r"(out32));
-        out++;
-#else
-        if (out32 > 32767)
-            *out++ = 32767;
-        else if (out32 < -32768)
-            *out++ = -32768;
-        else
-            *out++ = (WebRtc_Word16)out32;
-#endif
-    }
+    // limit amplitude to prevent wrap-around, and write to output array
+    *out++ = WebRtcSpl_SatW32ToW16(out32);
+  }

-    filtState[0] = state0;
-    filtState[1] = state1;
-    filtState[2] = state2;
-    filtState[3] = state3;
-    filtState[4] = state4;
-    filtState[5] = state5;
-    filtState[6] = state6;
-    filtState[7] = state7;
+  filtState[0] = state0;
+  filtState[1] = state1;
+  filtState[2] = state2;
+  filtState[3] = state3;
+  filtState[4] = state4;
+  filtState[5] = state5;
+  filtState[6] = state6;
+  filtState[7] = state7;
 }

-void WebRtcSpl_UpsampleBy2(const WebRtc_Word16* in, WebRtc_Word16 len, WebRtc_Word16* out,
-                           WebRtc_Word32* filtState)
-{
-    WebRtc_Word32 tmp1, tmp2, diff, in32, out32;
-    WebRtc_Word16 i;

-    register WebRtc_Word32 state0 = filtState[0];
-    register WebRtc_Word32 state1 = filtState[1];
-    register WebRtc_Word32 state2 = filtState[2];
-    register WebRtc_Word32 state3 = filtState[3];
-    register WebRtc_Word32 state4 = filtState[4];
-    register WebRtc_Word32 state5 = filtState[5];
-    register WebRtc_Word32 state6 = filtState[6];
-    register WebRtc_Word32 state7 = filtState[7];
+void WebRtcSpl_UpsampleBy2(const WebRtc_Word16* in, WebRtc_Word16 len,
+                           WebRtc_Word16* out, WebRtc_Word32* filtState) {
+  WebRtc_Word32 tmp1, tmp2, diff, in32, out32;
+  WebRtc_Word16 i;

-    for (i = len; i > 0; i--)
-    {
-        // lower allpass filter
-        in32 = (WebRtc_Word32)(*in++) << 10;
-        diff = in32 - state1;
-        tmp1 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass1[0], diff, state0);
-        state0 = in32;
-        diff = tmp1 - state2;
-        tmp2 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass1[1], diff, state1);
-        state1 = tmp1;
-        diff = tmp2 - state3;
-        state3 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass1[2], diff, state2);
-        state2 = tmp2;
+  register WebRtc_Word32 state0 = filtState[0];
+  register WebRtc_Word32 state1 = filtState[1];
+  register WebRtc_Word32 state2 = filtState[2];
+  register WebRtc_Word32 state3 = filtState[3];
+  register WebRtc_Word32 state4 = filtState[4];
+  register WebRtc_Word32 state5 = filtState[5];
+  register WebRtc_Word32 state6 = filtState[6];
+  register WebRtc_Word32 state7 = filtState[7];

-        // round; limit amplitude to prevent wrap-around; write to output array
-        out32 = (state3 + 512) >> 10;
-#ifdef WEBRTC_ARCH_ARM_V7A
-        __asm__("ssat %r0, #16, %r1":"=r"(*out): "r"(out32));
-        out++;
-#else
-        if (out32 > 32767)
-            *out++ = 32767;
-        else if (out32 < -32768)
-            *out++ = -32768;
-        else
-            *out++ = (WebRtc_Word16)out32;
-#endif
+  for (i = len; i > 0; i--) {
+    // lower allpass filter
+    in32 = (WebRtc_Word32)(*in++) << 10;
+    diff = in32 - state1;
+    tmp1 = MUL_ACCUM_1(kResampleAllpass1[0], diff, state0);
+    state0 = in32;
+    diff = tmp1 - state2;
+    tmp2 = MUL_ACCUM_1(kResampleAllpass1[1], diff, state1);
+    state1 = tmp1;
+    diff = tmp2 - state3;
+    state3 = MUL_ACCUM_2(kResampleAllpass1[2], diff, state2);
+    state2 = tmp2;

-        // upper allpass filter
-        diff = in32 - state5;
-        tmp1 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass2[0], diff, state4);
-        state4 = in32;
-        diff = tmp1 - state6;
-        tmp2 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass2[1], diff, state5);
-        state5 = tmp1;
-        diff = tmp2 - state7;
-        state7 = WEBRTC_SPL_SCALEDIFF32(kResampleAllpass2[2], diff, state6);
-        state6 = tmp2;
+    // round; limit amplitude to prevent wrap-around; write to output array
+    out32 = (state3 + 512) >> 10;
+    *out++ = WebRtcSpl_SatW32ToW16(out32);

-        // round; limit amplitude to prevent wrap-around; write to output array
-        out32 = (state7 + 512) >> 10;
-#ifdef WEBRTC_ARCH_ARM_V7A
-        __asm__("ssat %r0, #16, %r1":"=r"(*out): "r"(out32));
-        out++;
-#else
-        if (out32 > 32767)
-            *out++ = 32767;
-        else if (out32 < -32768)
-            *out++ = -32768;
-        else
-            *out++ = (WebRtc_Word16)out32;
-#endif
-    }
-    
-    filtState[0] = state0;
-    filtState[1] = state1;
-    filtState[2] = state2;
-    filtState[3] = state3;
-    filtState[4] = state4;
-    filtState[5] = state5;
-    filtState[6] = state6;
-    filtState[7] = state7;
+    // upper allpass filter
+    diff = in32 - state5;
+    tmp1 = MUL_ACCUM_1(kResampleAllpass2[0], diff, state4);
+    state4 = in32;
+    diff = tmp1 - state6;
+    tmp2 = MUL_ACCUM_2(kResampleAllpass2[1], diff, state5);
+    state5 = tmp1;
+    diff = tmp2 - state7;
+    state7 = MUL_ACCUM_2(kResampleAllpass2[2], diff, state6);
+    state6 = tmp2;
+
+    // round; limit amplitude to prevent wrap-around; write to output array
+    out32 = (state7 + 512) >> 10;
+    *out++ = WebRtcSpl_SatW32ToW16(out32);
+  }
+
+  filtState[0] = state0;
+  filtState[1] = state1;
+  filtState[2] = state2;
+  filtState[3] = state3;
+  filtState[4] = state4;
+  filtState[5] = state5;
+  filtState[6] = state6;
+  filtState[7] = state7;
 }
--- a/src/common_audio/signal_processing_library/main/source/splitting_filter.c
+++ b/src/common_audio/signal_processing_library/main/source/splitting_filter.c
@@ -147,13 +147,11 @@ void WebRtcSpl_AnalysisQMF(const WebRtc_Word16* in_data, WebRtc_Word16* low_band
    {
        tmp = filter1[i] + filter2[i] + 1024;
        tmp = WEBRTC_SPL_RSHIFT_W32(tmp, 11);
-        low_band[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
-                tmp, WEBRTC_SPL_WORD16_MIN);
+        low_band[i] = WebRtcSpl_SatW32ToW16(tmp);

        tmp = filter1[i] - filter2[i] + 1024;
        tmp = WEBRTC_SPL_RSHIFT_W32(tmp, 11);
-        high_band[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
-                tmp, WEBRTC_SPL_WORD16_MIN);
+        high_band[i] = WebRtcSpl_SatW32ToW16(tmp);
    }
 }

@@ -191,10 +189,10 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band, const WebRtc_Word16*
    for (i = 0, k = 0; i < kBandFrameLength; i++)
    {
        tmp = WEBRTC_SPL_RSHIFT_W32(filter2[i] + 512, 10);
-        out_data[k++] = (WebRtc_Word16)WEBRTC_SPL_SAT(32767, tmp, -32768);
+        out_data[k++] = WebRtcSpl_SatW32ToW16(tmp);

        tmp = WEBRTC_SPL_RSHIFT_W32(filter1[i] + 512, 10);
-        out_data[k++] = (WebRtc_Word16)WEBRTC_SPL_SAT(32767, tmp, -32768);
+        out_data[k++] = WebRtcSpl_SatW32ToW16(tmp);
    }

 }
--- a/src/common_audio/signal_processing_library/main/source/vector_scaling_operations.c
+++ b/src/common_audio/signal_processing_library/main/source/vector_scaling_operations.c
@@ -125,7 +125,7 @@ void WebRtcSpl_ScaleVectorWithSat(G_CONST WebRtc_Word16 *in_vector, WebRtc_Word1
    for (i = 0; i < in_vector_length; i++)
    {
        tmpW32 = WEBRTC_SPL_MUL_16_16_RSFT(*inptr++, gain, right_shifts);
-        ( *outptr++) = (WebRtc_Word16)WEBRTC_SPL_SAT(32767, tmpW32, -32768);
+        (*outptr++) = WebRtcSpl_SatW32ToW16(tmpW32);
    }
 }