MIPS optimizations for the functions WebRtcSpl_SqrtFloor, WebRtcSpl_CrossCorrelation, WebRtcSpl_ScaleAndAddVectorsWithRound and the inline functions from signal_processing spl_inl.h file.

R=andrew@webrtc.org Review URL: https://webrtc-codereview.appspot.com/1791004 Patch from Ljubomir Papuga <lpapuga@mips.com>. git-svn-id: http://webrtc.googlecode.com/svn/trunk@4779 4adac7df-926f-26a2-2b94-8c16560cd09d
2013-09-18 17:40:46 +00:00 · 2013-09-18 17:40:46 +00:00 · 8bf755d5c5
commit 8bf755d5c5
parent 5f1051631a
9 changed files with 698 additions and 4 deletions
--- a/webrtc/common_audio/common_audio.gyp
+++ b/webrtc/common_audio/common_audio.gyp
@ -116,17 +116,28 @@
        }],
        ['target_arch=="mipsel"', {
          'sources': [
+            'signal_processing/include/spl_inl_mips.h',
            'signal_processing/complex_bit_reverse_mips.c',
            'signal_processing/complex_fft_mips.c',
+            'signal_processing/cross_correlation_mips.c',
            'signal_processing/downsample_fast_mips.c',
            'signal_processing/filter_ar_fast_q12_mips.c',
            'signal_processing/min_max_operations_mips.c',
            'signal_processing/resample_by_2_mips.c',
+            'signal_processing/spl_sqrt_floor_mips.c',
          ],
          'sources!': [
            'signal_processing/complex_bit_reverse.c',
            'signal_processing/complex_fft.c',
            'signal_processing/filter_ar_fast_q12.c',
+            'signal_processing/spl_sqrt_floor.c',
+          ],
+          'conditions': [
+            ['mips_dsp_rev>0', {
+              'sources': [
+                'signal_processing/vector_scaling_operations_mips.c',
+              ],
+            }],
          ],
        }],
      ],  # conditions
--- a/webrtc/common_audio/signal_processing/cross_correlation_mips.c
+++ b/webrtc/common_audio/signal_processing/cross_correlation_mips.c
@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+void WebRtcSpl_CrossCorrelation_mips(int32_t* cross_correlation,
+                                     const int16_t* seq1,
+                                     const int16_t* seq2,
+                                     int16_t dim_seq,
+                                     int16_t dim_cross_correlation,
+                                     int16_t right_shifts,
+                                     int16_t step_seq2) {
+
+  int32_t t0 = 0, t1 = 0, t2 = 0, t3 = 0, sum = 0;
+  int16_t *pseq2 = NULL;
+  int16_t *pseq1 = NULL;
+  int16_t *pseq1_0 = (int16_t*)&seq1[0];
+  int16_t *pseq2_0 = (int16_t*)&seq2[0];
+  int k = 0;
+
+  __asm __volatile (
+    ".set        push                                           \n\t"
+    ".set        noreorder                                      \n\t"
+    "sll         %[step_seq2], %[step_seq2],   1                \n\t"
+    "andi        %[t0],        %[dim_seq],     1                \n\t"
+    "bgtz        %[t0],        3f                               \n\t"
+    " nop                                                       \n\t"
+   "1:                                                          \n\t"
+    "move        %[pseq1],     %[pseq1_0]                       \n\t"
+    "move        %[pseq2],     %[pseq2_0]                       \n\t"
+    "sra         %[k],         %[dim_seq],     1                \n\t"
+    "addiu       %[dim_cc],    %[dim_cc],      -1               \n\t"
+    "xor         %[sum],       %[sum],         %[sum]           \n\t"
+   "2:                                                          \n\t"
+    "lh          %[t0],        0(%[pseq1])                      \n\t"
+    "lh          %[t1],        0(%[pseq2])                      \n\t"
+    "lh          %[t2],        2(%[pseq1])                      \n\t"
+    "lh          %[t3],        2(%[pseq2])                      \n\t"
+    "mul         %[t0],        %[t0],          %[t1]            \n\t"
+    "addiu       %[k],         %[k],           -1               \n\t"
+    "mul         %[t2],        %[t2],          %[t3]            \n\t"
+    "addiu       %[pseq1],     %[pseq1],       4                \n\t"
+    "addiu       %[pseq2],     %[pseq2],       4                \n\t"
+    "srav        %[t0],        %[t0],          %[right_shifts]  \n\t"
+    "addu        %[sum],       %[sum],         %[t0]            \n\t"
+    "srav        %[t2],        %[t2],          %[right_shifts]  \n\t"
+    "bgtz        %[k],         2b                               \n\t"
+    " addu       %[sum],       %[sum],         %[t2]            \n\t"
+    "addu        %[pseq2_0],   %[pseq2_0],     %[step_seq2]     \n\t"
+    "sw          %[sum],       0(%[cc])                         \n\t"
+    "bgtz        %[dim_cc],    1b                               \n\t"
+    " addiu      %[cc],        %[cc],          4                \n\t"
+    "b           6f                                             \n\t"
+    " nop                                                       \n\t"
+   "3:                                                          \n\t"
+    "move        %[pseq1],     %[pseq1_0]                       \n\t"
+    "move        %[pseq2],     %[pseq2_0]                       \n\t"
+    "sra         %[k],         %[dim_seq],     1                \n\t"
+    "addiu       %[dim_cc],    %[dim_cc],      -1               \n\t"
+    "beqz        %[k],         5f                               \n\t"
+    " xor        %[sum],       %[sum],         %[sum]           \n\t"
+   "4:                                                          \n\t"
+    "lh          %[t0],        0(%[pseq1])                      \n\t"
+    "lh          %[t1],        0(%[pseq2])                      \n\t"
+    "lh          %[t2],        2(%[pseq1])                      \n\t"
+    "lh          %[t3],        2(%[pseq2])                      \n\t"
+    "mul         %[t0],        %[t0],          %[t1]            \n\t"
+    "addiu       %[k],         %[k],           -1               \n\t"
+    "mul         %[t2],        %[t2],          %[t3]            \n\t"
+    "addiu       %[pseq1],     %[pseq1],       4                \n\t"
+    "addiu       %[pseq2],     %[pseq2],       4                \n\t"
+    "srav        %[t0],        %[t0],          %[right_shifts]  \n\t"
+    "addu        %[sum],       %[sum],         %[t0]            \n\t"
+    "srav        %[t2],        %[t2],          %[right_shifts]  \n\t"
+    "bgtz        %[k],         4b                               \n\t"
+    " addu       %[sum],       %[sum],         %[t2]            \n\t"
+   "5:                                                          \n\t"
+    "lh          %[t0],        0(%[pseq1])                      \n\t"
+    "lh          %[t1],        0(%[pseq2])                      \n\t"
+    "mul         %[t0],        %[t0],          %[t1]            \n\t"
+    "srav        %[t0],        %[t0],          %[right_shifts]  \n\t"
+    "addu        %[sum],       %[sum],         %[t0]            \n\t"
+    "addu        %[pseq2_0],   %[pseq2_0],     %[step_seq2]     \n\t"
+    "sw          %[sum],       0(%[cc])                         \n\t"
+    "bgtz        %[dim_cc],    3b                               \n\t"
+    " addiu      %[cc],        %[cc],          4                \n\t"
+   "6:                                                          \n\t"
+    ".set        pop                                            \n\t"
+    : [step_seq2] "+r" (step_seq2), [t0] "=&r" (t0), [t1] "=&r" (t1),
+      [t2] "=&r" (t2), [t3] "=&r" (t3), [pseq1] "=&r" (pseq1),
+      [pseq2] "=&r" (pseq2), [pseq1_0] "+r" (pseq1_0), [pseq2_0] "+r" (pseq2_0),
+      [k] "=&r" (k), [dim_cc] "+r" (dim_cross_correlation), [sum] "=&r" (sum),
+      [cc] "+r" (cross_correlation)
+    : [dim_seq] "r" (dim_seq), [right_shifts] "r" (right_shifts)
+    : "hi", "lo", "memory"
+  );
+}
--- a/webrtc/common_audio/signal_processing/include/signal_processing_library.h
+++ b/webrtc/common_audio/signal_processing/include/signal_processing_library.h
@ -73,6 +73,8 @@

 #ifndef WEBRTC_ARCH_ARM_V7
 // For ARMv7 platforms, these are inline functions in spl_inl_armv7.h
+#ifndef MIPS32_LE
+// For MIPS platforms, these are inline functions in spl_inl_mips.h
 #define WEBRTC_SPL_MUL_16_16(a, b) \
    ((int32_t) (((int16_t)(a)) * ((int16_t)(b))))
 #define WEBRTC_SPL_MUL_16_32_RSFT16(a, b) \
@ -87,6 +89,7 @@
    (WEBRTC_SPL_MUL_16_32_RSFT16(( \
    (int16_t)((a32 & 0x0000FFFF) >> 1)), b32) >> 15)))
 #endif
+#endif

 #define WEBRTC_SPL_MUL_16_32_RSFT11(a, b) \
    ((WEBRTC_SPL_MUL_16_16(a, (b) >> 16) << 5) \
@ -456,6 +459,15 @@ int WebRtcSpl_ScaleAndAddVectorsWithRoundNeon(const int16_t* in_vector1,
                                              int16_t* out_vector,
                                              int length);
 #endif
+#if defined(MIPS_DSP_R1_LE)
+int WebRtcSpl_ScaleAndAddVectorsWithRound_mips(const int16_t* in_vector1,
+                                               int16_t in_vector1_scale,
+                                               const int16_t* in_vector2,
+                                               int16_t in_vector2_scale,
+                                               int right_shifts,
+                                               int16_t* out_vector,
+                                               int length);
+#endif
 // End: Vector scaling operations.

 // iLBC specific functions. Implementations in ilbc_specific_functions.c.
@ -627,6 +639,15 @@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
                                    int16_t right_shifts,
                                    int16_t step_seq2);
 #endif
+#if defined(MIPS32_LE)
+void WebRtcSpl_CrossCorrelation_mips(int32_t* cross_correlation,
+                                     const int16_t* seq1,
+                                     const int16_t* seq2,
+                                     int16_t dim_seq,
+                                     int16_t dim_cross_correlation,
+                                     int16_t right_shifts,
+                                     int16_t step_seq2);
+#endif

 // Creates (the first half of) a Hanning window. Size must be at least 1 and
 // at most 512.
--- a/webrtc/common_audio/signal_processing/include/spl_inl.h
+++ b/webrtc/common_audio/signal_processing/include/spl_inl.h
@ -19,6 +19,11 @@
 #include "webrtc/common_audio/signal_processing/include/spl_inl_armv7.h"
 #else

+#if defined(MIPS32_LE)
+#include "webrtc/common_audio/signal_processing/include/spl_inl_mips.h"
+#endif
+
+#if !defined(MIPS_DSP_R1_LE)
 static __inline int16_t WebRtcSpl_SatW32ToW16(int32_t value32) {
  int16_t out16 = (int16_t) value32;

@ -37,7 +42,9 @@ static __inline int16_t WebRtcSpl_AddSatW16(int16_t a, int16_t b) {
 static __inline int16_t WebRtcSpl_SubSatW16(int16_t var1, int16_t var2) {
  return WebRtcSpl_SatW32ToW16((int32_t) var1 - (int32_t) var2);
 }
+#endif  // #if !defined(MIPS_DSP_R1_LE)

+#if !defined(MIPS32_LE)
 static __inline int16_t WebRtcSpl_GetSizeInBits(uint32_t n) {
  int bits;

@ -121,11 +128,13 @@ static __inline int WebRtcSpl_NormW16(int16_t a) {
 static __inline int32_t WebRtc_MulAccumW16(int16_t a, int16_t b, int32_t c) {
  return (a * b + c);
 }
+#endif  // #if !defined(MIPS32_LE)

 #endif  // WEBRTC_ARCH_ARM_V7

 // The following functions have no optimized versions.
 // TODO(kma): Consider saturating add/sub instructions in X86 platform.
+#if !defined(MIPS_DSP_R1_LE)
 static __inline int32_t WebRtcSpl_AddSatW32(int32_t l_var1, int32_t l_var2) {
  int32_t l_sum;

@ -163,5 +172,6 @@ static __inline int32_t WebRtcSpl_SubSatW32(int32_t l_var1, int32_t l_var2) {

  return l_diff;
 }
+#endif  // #if !defined(MIPS_DSP_R1_LE)

 #endif  // WEBRTC_SPL_SPL_INL_H_
--- a/webrtc/common_audio/signal_processing/include/spl_inl_mips.h
+++ b/webrtc/common_audio/signal_processing/include/spl_inl_mips.h
@ -0,0 +1,281 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+// This header file includes the inline functions in
+// the fix point signal processing library.
+
+#ifndef WEBRTC_SPL_SPL_INL_MIPS_H_
+#define WEBRTC_SPL_SPL_INL_MIPS_H_
+
+static __inline int32_t WEBRTC_SPL_MUL_16_16(int32_t a,
+                                             int32_t b) {
+  int32_t value32 = 0;
+  int32_t a1 = 0, b1 = 0;
+
+  __asm __volatile(
+#if defined(MIPS32_R2_LE)
+    "seh    %[a1],          %[a]                \n\t"
+    "seh    %[b1],          %[b]                \n\t"
+#else
+    "sll    %[a1],          %[a],         16    \n\t"
+    "sll    %[b1],          %[b],         16    \n\t"
+    "sra    %[a1],          %[a1],        16    \n\t"
+    "sra    %[b1],          %[b1],        16    \n\t"
+#endif
+    "mul    %[value32],     %[a1],  %[b1]       \n\t"
+    : [value32] "=r" (value32), [a1] "=&r" (a1), [b1] "=&r" (b1)
+    : [a] "r" (a), [b] "r" (b)
+    : "hi", "lo"
+  );
+  return value32;
+}
+
+static __inline int32_t WEBRTC_SPL_MUL_16_32_RSFT16(int16_t a,
+                                                    int32_t b) {
+  int32_t value32 = 0, b1 = 0, b2 = 0;
+  int32_t a1 = 0;
+
+  __asm __volatile(
+#if defined(MIPS32_R2_LE)
+    "seh    %[a1],          %[a]                        \n\t"
+#else
+    "sll    %[a1],          %[a],           16          \n\t"
+    "sra    %[a1],          %[a1],          16          \n\t"
+#endif
+    "andi   %[b2],          %[b],           0xFFFF      \n\t"
+    "sra    %[b1],          %[b],           16          \n\t"
+    "sra    %[b2],          %[b2],          1           \n\t"
+    "mul    %[value32],     %[a1],          %[b1]       \n\t"
+    "mul    %[b2],          %[a1],          %[b2]       \n\t"
+    "addiu  %[b2],          %[b2],          0x4000      \n\t"
+    "sra    %[b2],          %[b2],          15          \n\t"
+    "addu   %[value32],     %[value32],     %[b2]       \n\t"
+    : [value32] "=&r" (value32), [b1] "=&r" (b1), [b2] "=&r" (b2),
+      [a1] "=&r" (a1)
+    : [a] "r" (a), [b] "r" (b)
+    : "hi", "lo"
+  );
+  return value32;
+}
+
+static __inline int32_t WEBRTC_SPL_MUL_32_32_RSFT32BI(int32_t a,
+                                                      int32_t b) {
+  int32_t tmp = 0;
+
+  if ((32767 < a) || (a < 0))
+    tmp = WEBRTC_SPL_MUL_16_32_RSFT16(((int16_t)(a >> 16)), b);
+  tmp += WEBRTC_SPL_MUL_16_32_RSFT16(((int16_t)((a & 0x0000FFFF) >> 1)),
+                                     b) >> 15;
+
+  return tmp;
+}
+
+static __inline int32_t WEBRTC_SPL_MUL_32_32_RSFT32(int16_t a,
+                                                    int16_t b,
+                                                    int32_t c) {
+  int32_t tmp1 = 0, tmp2 = 0, tmp3 = 0, tmp4 = 0;
+
+  __asm __volatile(
+    "sra                %[tmp1],   %[c],      16      \n\t"
+    "andi               %[tmp2],   %[c],      0xFFFF  \n\t"
+#if defined(MIPS32_R2_LE)
+    "seh                %[a],      %[a]               \n\t"
+    "seh                %[b],      %[b]               \n\t"
+#else
+    "sll                %[a],      %[a],      16      \n\t"
+    "sra                %[a],      %[a],      16      \n\t"
+    "sll                %[b],      %[b],      16      \n\t"
+    "sra                %[b],      %[b],      16      \n\t"
+#endif
+    "sra                %[tmp2],   %[tmp2],   1       \n\t"
+    "mul                %[tmp3],   %[a],      %[tmp2] \n\t"
+    "mul                %[tmp4],   %[b],      %[tmp2] \n\t"
+    "mul                %[tmp2],   %[a],      %[tmp1] \n\t"
+    "mul                %[tmp1],   %[b],      %[tmp1] \n\t"
+#if defined(MIPS_DSP_R1_LE)
+    "shra_r.w           %[tmp3],   %[tmp3],   15      \n\t"
+    "shra_r.w           %[tmp4],   %[tmp4],   15      \n\t"
+#else
+    "addiu              %[tmp3],   %[tmp3],   0x4000  \n\t"
+    "sra                %[tmp3],   %[tmp3],   15      \n\t"
+    "addiu              %[tmp4],   %[tmp4],   0x4000  \n\t"
+    "sra                %[tmp4],   %[tmp4],   15      \n\t"
+#endif
+    "addu               %[tmp3],   %[tmp3],   %[tmp2] \n\t"
+    "addu               %[tmp4],   %[tmp4],   %[tmp1] \n\t"
+    "sra                %[tmp4],   %[tmp4],   16      \n\t"
+    "addu               %[tmp1],   %[tmp3],   %[tmp4] \n\t"
+    : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+      [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
+      [a] "+r" (a), [b] "+r" (b)
+    : [c] "r" (c)
+    : "hi", "lo"
+  );
+  return tmp1;
+}
+
+#if defined(MIPS_DSP_R1_LE)
+static __inline int16_t WebRtcSpl_SatW32ToW16(int32_t value32) {
+  __asm __volatile(
+    "shll_s.w   %[value32], %[value32], 16      \n\t"
+    "sra        %[value32], %[value32], 16      \n\t"
+    : [value32] "+r" (value32)
+    :
+  );
+  int16_t out16 = (int16_t)value32;
+  return out16;
+}
+
+static __inline int16_t WebRtcSpl_AddSatW16(int16_t a, int16_t b) {
+  int32_t value32 = 0;
+
+  __asm __volatile(
+    "addq_s.ph      %[value32],     %[a],   %[b]    \n\t"
+    : [value32] "=r" (value32)
+    : [a] "r" (a), [b] "r" (b)
+  );
+  return (int16_t)value32;
+}
+
+static __inline int32_t WebRtcSpl_AddSatW32(int32_t l_var1, int32_t l_var2) {
+  int32_t l_sum;
+
+  __asm __volatile(
+    "addq_s.w   %[l_sum],       %[l_var1],      %[l_var2]    \n\t"
+    : [l_sum] "=r" (l_sum)
+    : [l_var1] "r" (l_var1), [l_var2] "r" (l_var2)
+  );
+
+  return l_sum;
+}
+
+static __inline int16_t WebRtcSpl_SubSatW16(int16_t var1, int16_t var2) {
+  int32_t value32;
+
+  __asm __volatile(
+    "subq_s.ph  %[value32], %[var1],    %[var2]     \n\t"
+    : [value32] "=r" (value32)
+    : [var1] "r" (var1), [var2] "r" (var2)
+  );
+
+  return (int16_t)value32;
+}
+
+static __inline int32_t WebRtcSpl_SubSatW32(int32_t l_var1, int32_t l_var2) {
+  int32_t l_diff;
+
+  __asm __volatile(
+    "subq_s.w   %[l_diff],      %[l_var1],      %[l_var2]    \n\t"
+    : [l_diff] "=r" (l_diff)
+    : [l_var1] "r" (l_var1), [l_var2] "r" (l_var2)
+  );
+
+  return l_diff;
+}
+#endif
+
+static __inline int16_t WebRtcSpl_GetSizeInBits(uint32_t n) {
+  int bits = 0;
+  int i32 = 32;
+
+  __asm __volatile(
+    "clz    %[bits],    %[n]                    \n\t"
+    "subu   %[bits],    %[i32],     %[bits]     \n\t"
+    : [bits] "=&r" (bits)
+    : [n] "r" (n), [i32] "r" (i32)
+  );
+
+  return bits;
+}
+
+static __inline int WebRtcSpl_NormW32(int32_t a) {
+  int zeros = 0;
+
+  __asm __volatile(
+    ".set       push                                \n\t"
+    ".set       noreorder                           \n\t"
+    "bnez       %[a],       1f                      \n\t"
+    " sra       %[zeros],   %[a],       31          \n\t"
+    "b          2f                                  \n\t"
+    " move      %[zeros],   $zero                   \n\t"
+   "1:                                              \n\t"
+    "xor        %[zeros],   %[a],       %[zeros]    \n\t"
+    "clz        %[zeros],   %[zeros]                \n\t"
+    "addiu      %[zeros],   %[zeros],   -1          \n\t"
+   "2:                                              \n\t"
+    ".set       pop                                 \n\t"
+    : [zeros]"=&r"(zeros)
+    : [a] "r" (a)
+  );
+
+  return zeros;
+}
+
+static __inline int WebRtcSpl_NormU32(uint32_t a) {
+  int zeros = 0;
+
+  __asm __volatile(
+    "clz    %[zeros],   %[a]    \n\t"
+    : [zeros] "=r" (zeros)
+    : [a] "r" (a)
+  );
+
+  return (zeros & 0x1f);
+}
+
+static __inline int WebRtcSpl_NormW16(int16_t a) {
+  int zeros = 0;
+  int a0 = a << 16;
+
+  __asm __volatile(
+    ".set       push                                \n\t"
+    ".set       noreorder                           \n\t"
+    "bnez       %[a0],      1f                      \n\t"
+    " sra       %[zeros],   %[a0],      31          \n\t"
+    "b          2f                                  \n\t"
+    " move      %[zeros],   $zero                   \n\t"
+   "1:                                              \n\t"
+    "xor        %[zeros],   %[a0],      %[zeros]    \n\t"
+    "clz        %[zeros],   %[zeros]                \n\t"
+    "addiu      %[zeros],   %[zeros],   -1          \n\t"
+   "2:                                              \n\t"
+    ".set       pop                                 \n\t"
+    : [zeros]"=&r"(zeros)
+    : [a0] "r" (a0)
+  );
+
+  return zeros;
+}
+
+static __inline int32_t WebRtc_MulAccumW16(int16_t a,
+                                           int16_t b,
+                                           int32_t c) {
+  int32_t res = 0, c1 = 0;
+  __asm __volatile(
+#if defined(MIPS32_R2_LE)
+    "seh    %[a],       %[a]            \n\t"
+    "seh    %[b],       %[b]            \n\t"
+#else
+    "sll    %[a],       %[a],   16      \n\t"
+    "sll    %[b],       %[b],   16      \n\t"
+    "sra    %[a],       %[a],   16      \n\t"
+    "sra    %[b],       %[b],   16      \n\t"
+#endif
+    "mul    %[res],     %[a],   %[b]    \n\t"
+    "addu   %[c1],      %[c],   %[res]  \n\t"
+    : [c1] "=r" (c1), [res] "=&r" (res)
+    : [a] "r" (a), [b] "r" (b), [c] "r" (c)
+    : "hi", "lo"
+  );
+  return (c1);
+}
+
+#endif  // WEBRTC_SPL_SPL_INL_MIPS_H_
--- a/webrtc/common_audio/signal_processing/signal_processing_unittest.cc
+++ b/webrtc/common_audio/signal_processing/signal_processing_unittest.cc
@ -529,12 +529,14 @@ TEST_F(SplTest, CrossCorrelationTest) {
  // are not bit-exact.
  const int32_t kExpected[kCrossCorrelationDimension] =
      {-266947903, -15579555, -171282001};
+  const int32_t* expected = kExpected;
+#if !defined(MIPS32_LE)
  const int32_t kExpectedNeon[kCrossCorrelationDimension] =
      {-266947901, -15579553, -171281999};
-  const int32_t* expected = kExpected;
  if (WebRtcSpl_CrossCorrelation != WebRtcSpl_CrossCorrelationC) {
    expected = kExpectedNeon;
  }
+#endif
  for (int i = 0; i < kCrossCorrelationDimension; ++i) {
    EXPECT_EQ(expected[i], vector32[i]);
  }
--- a/webrtc/common_audio/signal_processing/spl_init.c
+++ b/webrtc/common_audio/signal_processing/spl_init.c
@ -82,18 +82,20 @@ static void InitPointersToMIPS() {
  WebRtcSpl_MaxValueW32 = WebRtcSpl_MaxValueW32_mips;
  WebRtcSpl_MinValueW16 = WebRtcSpl_MinValueW16_mips;
  WebRtcSpl_MinValueW32 = WebRtcSpl_MinValueW32_mips;
-  WebRtcSpl_CrossCorrelation = WebRtcSpl_CrossCorrelationC;
+  WebRtcSpl_CrossCorrelation = WebRtcSpl_CrossCorrelation_mips;
  WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFast_mips;
-  WebRtcSpl_ScaleAndAddVectorsWithRound =
-      WebRtcSpl_ScaleAndAddVectorsWithRoundC;
  WebRtcSpl_CreateRealFFT = WebRtcSpl_CreateRealFFTC;
  WebRtcSpl_FreeRealFFT = WebRtcSpl_FreeRealFFTC;
  WebRtcSpl_RealForwardFFT = WebRtcSpl_RealForwardFFTC;
  WebRtcSpl_RealInverseFFT = WebRtcSpl_RealInverseFFTC;
 #if defined(MIPS_DSP_R1_LE)
  WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32_mips;
+  WebRtcSpl_ScaleAndAddVectorsWithRound =
+      WebRtcSpl_ScaleAndAddVectorsWithRound_mips;
 #else
  WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32C;
+  WebRtcSpl_ScaleAndAddVectorsWithRound =
+      WebRtcSpl_ScaleAndAddVectorsWithRoundC;
 #endif
 }
 #endif
--- a/webrtc/common_audio/signal_processing/spl_sqrt_floor_mips.c
+++ b/webrtc/common_audio/signal_processing/spl_sqrt_floor_mips.c
@ -0,0 +1,207 @@
+/*
+ * Written by Wilco Dijkstra, 1996. The following email exchange establishes the
+ * license.
+ *
+ * From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
+ * Date: Fri, Jun 24, 2011 at 3:20 AM
+ * Subject: Re: sqrt routine
+ * To: Kevin Ma <kma@google.com>
+ * Hi Kevin,
+ * Thanks for asking. Those routines are public domain (originally posted to
+ * comp.sys.arm a long time ago), so you can use them freely for any purpose.
+ * Cheers,
+ * Wilco
+ *
+ * ----- Original Message -----
+ * From: "Kevin Ma" <kma@google.com>
+ * To: <Wilco.Dijkstra@ntlworld.com>
+ * Sent: Thursday, June 23, 2011 11:44 PM
+ * Subject: Fwd: sqrt routine
+ * Hi Wilco,
+ * I saw your sqrt routine from several web sites, including
+ * http://www.finesse.demon.co.uk/steven/sqrt.html.
+ * Just wonder if there's any copyright information with your Successive
+ * approximation routines, or if I can freely use it for any purpose.
+ * Thanks.
+ * Kevin
+ */
+
+// Minor modifications in code style for WebRTC, 2012.
+// Code optimizations for MIPS, 2013.
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+/*
+ * Algorithm:
+ * Successive approximation of the equation (root + delta) ^ 2 = N
+ * until delta < 1. If delta < 1 we have the integer part of SQRT (N).
+ * Use delta = 2^i for i = 15 .. 0.
+ *
+ * Output precision is 16 bits. Note for large input values (close to
+ * 0x7FFFFFFF), bit 15 (the highest bit of the low 16-bit half word)
+ * contains the MSB information (a non-sign value). Do with caution
+ * if you need to cast the output to int16_t type.
+ *
+ * If the input value is negative, it returns 0.
+ */
+
+
+int32_t WebRtcSpl_SqrtFloor(int32_t value)
+{
+  int32_t root = 0, tmp1, tmp2, tmp3, tmp4;
+
+  __asm __volatile(
+    ".set   push                                       \n\t"
+    ".set   noreorder                                  \n\t"
+
+    "lui    %[tmp1],      0x4000                       \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "sub    %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "lui    %[tmp1],      0x1                          \n\t"
+    "or     %[tmp4],      %[root],      %[tmp1]        \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x4000         \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      14                           \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x8000         \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x2000         \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      13                           \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x4000         \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x1000         \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      12                           \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x2000         \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x800          \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      11                           \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x1000         \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x400          \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      10                           \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x800          \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x200          \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      9                            \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],       0x400         \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x100          \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      8                            \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x200          \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x80           \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      7                            \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x100          \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x40           \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      6                            \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x80           \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x20           \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      5                            \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x40           \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x10           \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      4                            \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x20           \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x8            \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      3                            \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x10           \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x4            \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      2                            \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x8            \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x2            \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "sll    %[tmp1],      1                            \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "subu   %[tmp3],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x4            \n\t"
+    "movz   %[value],     %[tmp3],      %[tmp2]        \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    "addiu  %[tmp1],      $0,           0x1            \n\t"
+    "addu   %[tmp1],      %[tmp1],      %[root]        \n\t"
+    "slt    %[tmp2],      %[value],     %[tmp1]        \n\t"
+    "ori    %[tmp4],      %[root],      0x2            \n\t"
+    "movz   %[root],      %[tmp4],      %[tmp2]        \n\t"
+
+    ".set   pop                                        \n\t"
+
+    : [root] "+r" (root), [value] "+r" (value),
+      [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+      [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4)
+    :
+  );
+
+  return root >> 1;
+}
+
--- a/webrtc/common_audio/signal_processing/vector_scaling_operations_mips.c
+++ b/webrtc/common_audio/signal_processing/vector_scaling_operations_mips.c
@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file contains implementations of the functions
+ * WebRtcSpl_ScaleAndAddVectorsWithRound_mips()
+ */
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+
+int WebRtcSpl_ScaleAndAddVectorsWithRound_mips(const int16_t* in_vector1,
+                                               int16_t in_vector1_scale,
+                                               const int16_t* in_vector2,
+                                               int16_t in_vector2_scale,
+                                               int right_shifts,
+                                               int16_t* out_vector,
+                                               int length) {
+  int16_t r0 = 0, r1 = 0;
+  int16_t *in1 = (int16_t*)in_vector1;
+  int16_t *in2 = (int16_t*)in_vector2;
+  int16_t *out = out_vector;
+  int i = 0, value32 = 0;
+
+  if (in_vector1 == NULL || in_vector2 == NULL || out_vector == NULL ||
+      length <= 0 || right_shifts < 0) {
+    return -1;
+  }
+  for (i = 0; i < length; i++) {
+    __asm __volatile (
+      "lh         %[r0],          0(%[in1])                               \n\t"
+      "lh         %[r1],          0(%[in2])                               \n\t"
+      "mult       %[r0],          %[in_vector1_scale]                     \n\t"
+      "madd       %[r1],          %[in_vector2_scale]                     \n\t"
+      "extrv_r.w  %[value32],     $ac0,               %[right_shifts]     \n\t"
+      "addiu      %[in1],         %[in1],             2                   \n\t"
+      "addiu      %[in2],         %[in2],             2                   \n\t"
+      "sh         %[value32],     0(%[out])                               \n\t"
+      "addiu      %[out],         %[out],             2                   \n\t"
+      : [value32] "=&r" (value32), [out] "+r" (out), [in1] "+r" (in1),
+        [in2] "+r" (in2), [r0] "=&r" (r0), [r1] "=&r" (r1)
+      : [in_vector1_scale] "r" (in_vector1_scale),
+        [in_vector2_scale] "r" (in_vector2_scale),
+        [right_shifts] "r" (right_shifts)
+      : "hi", "lo", "memory"
+    );
+  }
+  return 0;
+}