MIPS optimizations for the functions WebRtcSpl_SqrtFloor, WebRtcSpl_CrossCorrelation, WebRtcSpl_ScaleAndAddVectorsWithRound and the inline functions from signal_processing spl_inl.h file.

R=andrew@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/1791004

Patch from Ljubomir Papuga <lpapuga@mips.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@4779 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
andrew@webrtc.org 2013-09-18 17:40:46 +00:00
parent 5f1051631a
commit 8bf755d5c5
9 changed files with 698 additions and 4 deletions

View File

@ -116,17 +116,28 @@
}],
['target_arch=="mipsel"', {
'sources': [
'signal_processing/include/spl_inl_mips.h',
'signal_processing/complex_bit_reverse_mips.c',
'signal_processing/complex_fft_mips.c',
'signal_processing/cross_correlation_mips.c',
'signal_processing/downsample_fast_mips.c',
'signal_processing/filter_ar_fast_q12_mips.c',
'signal_processing/min_max_operations_mips.c',
'signal_processing/resample_by_2_mips.c',
'signal_processing/spl_sqrt_floor_mips.c',
],
'sources!': [
'signal_processing/complex_bit_reverse.c',
'signal_processing/complex_fft.c',
'signal_processing/filter_ar_fast_q12.c',
'signal_processing/spl_sqrt_floor.c',
],
'conditions': [
['mips_dsp_rev>0', {
'sources': [
'signal_processing/vector_scaling_operations_mips.c',
],
}],
],
}],
], # conditions

View File

@ -0,0 +1,104 @@
/*
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
void WebRtcSpl_CrossCorrelation_mips(int32_t* cross_correlation,
const int16_t* seq1,
const int16_t* seq2,
int16_t dim_seq,
int16_t dim_cross_correlation,
int16_t right_shifts,
int16_t step_seq2) {
int32_t t0 = 0, t1 = 0, t2 = 0, t3 = 0, sum = 0;
int16_t *pseq2 = NULL;
int16_t *pseq1 = NULL;
int16_t *pseq1_0 = (int16_t*)&seq1[0];
int16_t *pseq2_0 = (int16_t*)&seq2[0];
int k = 0;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"sll %[step_seq2], %[step_seq2], 1 \n\t"
"andi %[t0], %[dim_seq], 1 \n\t"
"bgtz %[t0], 3f \n\t"
" nop \n\t"
"1: \n\t"
"move %[pseq1], %[pseq1_0] \n\t"
"move %[pseq2], %[pseq2_0] \n\t"
"sra %[k], %[dim_seq], 1 \n\t"
"addiu %[dim_cc], %[dim_cc], -1 \n\t"
"xor %[sum], %[sum], %[sum] \n\t"
"2: \n\t"
"lh %[t0], 0(%[pseq1]) \n\t"
"lh %[t1], 0(%[pseq2]) \n\t"
"lh %[t2], 2(%[pseq1]) \n\t"
"lh %[t3], 2(%[pseq2]) \n\t"
"mul %[t0], %[t0], %[t1] \n\t"
"addiu %[k], %[k], -1 \n\t"
"mul %[t2], %[t2], %[t3] \n\t"
"addiu %[pseq1], %[pseq1], 4 \n\t"
"addiu %[pseq2], %[pseq2], 4 \n\t"
"srav %[t0], %[t0], %[right_shifts] \n\t"
"addu %[sum], %[sum], %[t0] \n\t"
"srav %[t2], %[t2], %[right_shifts] \n\t"
"bgtz %[k], 2b \n\t"
" addu %[sum], %[sum], %[t2] \n\t"
"addu %[pseq2_0], %[pseq2_0], %[step_seq2] \n\t"
"sw %[sum], 0(%[cc]) \n\t"
"bgtz %[dim_cc], 1b \n\t"
" addiu %[cc], %[cc], 4 \n\t"
"b 6f \n\t"
" nop \n\t"
"3: \n\t"
"move %[pseq1], %[pseq1_0] \n\t"
"move %[pseq2], %[pseq2_0] \n\t"
"sra %[k], %[dim_seq], 1 \n\t"
"addiu %[dim_cc], %[dim_cc], -1 \n\t"
"beqz %[k], 5f \n\t"
" xor %[sum], %[sum], %[sum] \n\t"
"4: \n\t"
"lh %[t0], 0(%[pseq1]) \n\t"
"lh %[t1], 0(%[pseq2]) \n\t"
"lh %[t2], 2(%[pseq1]) \n\t"
"lh %[t3], 2(%[pseq2]) \n\t"
"mul %[t0], %[t0], %[t1] \n\t"
"addiu %[k], %[k], -1 \n\t"
"mul %[t2], %[t2], %[t3] \n\t"
"addiu %[pseq1], %[pseq1], 4 \n\t"
"addiu %[pseq2], %[pseq2], 4 \n\t"
"srav %[t0], %[t0], %[right_shifts] \n\t"
"addu %[sum], %[sum], %[t0] \n\t"
"srav %[t2], %[t2], %[right_shifts] \n\t"
"bgtz %[k], 4b \n\t"
" addu %[sum], %[sum], %[t2] \n\t"
"5: \n\t"
"lh %[t0], 0(%[pseq1]) \n\t"
"lh %[t1], 0(%[pseq2]) \n\t"
"mul %[t0], %[t0], %[t1] \n\t"
"srav %[t0], %[t0], %[right_shifts] \n\t"
"addu %[sum], %[sum], %[t0] \n\t"
"addu %[pseq2_0], %[pseq2_0], %[step_seq2] \n\t"
"sw %[sum], 0(%[cc]) \n\t"
"bgtz %[dim_cc], 3b \n\t"
" addiu %[cc], %[cc], 4 \n\t"
"6: \n\t"
".set pop \n\t"
: [step_seq2] "+r" (step_seq2), [t0] "=&r" (t0), [t1] "=&r" (t1),
[t2] "=&r" (t2), [t3] "=&r" (t3), [pseq1] "=&r" (pseq1),
[pseq2] "=&r" (pseq2), [pseq1_0] "+r" (pseq1_0), [pseq2_0] "+r" (pseq2_0),
[k] "=&r" (k), [dim_cc] "+r" (dim_cross_correlation), [sum] "=&r" (sum),
[cc] "+r" (cross_correlation)
: [dim_seq] "r" (dim_seq), [right_shifts] "r" (right_shifts)
: "hi", "lo", "memory"
);
}

View File

@ -73,6 +73,8 @@
#ifndef WEBRTC_ARCH_ARM_V7
// For ARMv7 platforms, these are inline functions in spl_inl_armv7.h
#ifndef MIPS32_LE
// For MIPS platforms, these are inline functions in spl_inl_mips.h
#define WEBRTC_SPL_MUL_16_16(a, b) \
((int32_t) (((int16_t)(a)) * ((int16_t)(b))))
#define WEBRTC_SPL_MUL_16_32_RSFT16(a, b) \
@ -87,6 +89,7 @@
(WEBRTC_SPL_MUL_16_32_RSFT16(( \
(int16_t)((a32 & 0x0000FFFF) >> 1)), b32) >> 15)))
#endif
#endif
#define WEBRTC_SPL_MUL_16_32_RSFT11(a, b) \
((WEBRTC_SPL_MUL_16_16(a, (b) >> 16) << 5) \
@ -456,6 +459,15 @@ int WebRtcSpl_ScaleAndAddVectorsWithRoundNeon(const int16_t* in_vector1,
int16_t* out_vector,
int length);
#endif
#if defined(MIPS_DSP_R1_LE)
int WebRtcSpl_ScaleAndAddVectorsWithRound_mips(const int16_t* in_vector1,
int16_t in_vector1_scale,
const int16_t* in_vector2,
int16_t in_vector2_scale,
int right_shifts,
int16_t* out_vector,
int length);
#endif
// End: Vector scaling operations.
// iLBC specific functions. Implementations in ilbc_specific_functions.c.
@ -627,6 +639,15 @@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
int16_t right_shifts,
int16_t step_seq2);
#endif
#if defined(MIPS32_LE)
void WebRtcSpl_CrossCorrelation_mips(int32_t* cross_correlation,
const int16_t* seq1,
const int16_t* seq2,
int16_t dim_seq,
int16_t dim_cross_correlation,
int16_t right_shifts,
int16_t step_seq2);
#endif
// Creates (the first half of) a Hanning window. Size must be at least 1 and
// at most 512.

View File

@ -19,6 +19,11 @@
#include "webrtc/common_audio/signal_processing/include/spl_inl_armv7.h"
#else
#if defined(MIPS32_LE)
#include "webrtc/common_audio/signal_processing/include/spl_inl_mips.h"
#endif
#if !defined(MIPS_DSP_R1_LE)
static __inline int16_t WebRtcSpl_SatW32ToW16(int32_t value32) {
int16_t out16 = (int16_t) value32;
@ -37,7 +42,9 @@ static __inline int16_t WebRtcSpl_AddSatW16(int16_t a, int16_t b) {
static __inline int16_t WebRtcSpl_SubSatW16(int16_t var1, int16_t var2) {
return WebRtcSpl_SatW32ToW16((int32_t) var1 - (int32_t) var2);
}
#endif // #if !defined(MIPS_DSP_R1_LE)
#if !defined(MIPS32_LE)
static __inline int16_t WebRtcSpl_GetSizeInBits(uint32_t n) {
int bits;
@ -121,11 +128,13 @@ static __inline int WebRtcSpl_NormW16(int16_t a) {
static __inline int32_t WebRtc_MulAccumW16(int16_t a, int16_t b, int32_t c) {
return (a * b + c);
}
#endif // #if !defined(MIPS32_LE)
#endif // WEBRTC_ARCH_ARM_V7
// The following functions have no optimized versions.
// TODO(kma): Consider saturating add/sub instructions in X86 platform.
#if !defined(MIPS_DSP_R1_LE)
static __inline int32_t WebRtcSpl_AddSatW32(int32_t l_var1, int32_t l_var2) {
int32_t l_sum;
@ -163,5 +172,6 @@ static __inline int32_t WebRtcSpl_SubSatW32(int32_t l_var1, int32_t l_var2) {
return l_diff;
}
#endif // #if !defined(MIPS_DSP_R1_LE)
#endif // WEBRTC_SPL_SPL_INL_H_

View File

@ -0,0 +1,281 @@
/*
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// This header file contains the MIPS-optimized inline functions of
// the fixed-point signal processing library.
#ifndef WEBRTC_SPL_SPL_INL_MIPS_H_
#define WEBRTC_SPL_SPL_INL_MIPS_H_
// Multiplies the low 16 bits of |a| and |b|, each interpreted as a signed
// 16-bit value, and returns the 32-bit product.
static __inline int32_t WEBRTC_SPL_MUL_16_16(int32_t a,
int32_t b) {
int32_t value32 = 0;
int32_t a1 = 0, b1 = 0;
__asm __volatile(
#if defined(MIPS32_R2_LE)
// Sign-extend the low halfword of each operand into a1/b1.
"seh %[a1], %[a] \n\t"
"seh %[b1], %[b] \n\t"
#else
// Same sign extension done with a shift-left/shift-right pair.
"sll %[a1], %[a], 16 \n\t"
"sll %[b1], %[b], 16 \n\t"
"sra %[a1], %[a1], 16 \n\t"
"sra %[b1], %[b1], 16 \n\t"
#endif
"mul %[value32], %[a1], %[b1] \n\t"
: [value32] "=r" (value32), [a1] "=&r" (a1), [b1] "=&r" (b1)
: [a] "r" (a), [b] "r" (b)
: "hi", "lo"
);
return value32;
}
// Multiplies the 16-bit value |a| by the 32-bit value |b| in two halves.
// As computed by the instructions below, the result is
//   a * (b >> 16) + ((a * ((b & 0xFFFF) >> 1) + 0x4000) >> 15)
static __inline int32_t WEBRTC_SPL_MUL_16_32_RSFT16(int16_t a,
int32_t b) {
int32_t value32 = 0, b1 = 0, b2 = 0;
int32_t a1 = 0;
__asm __volatile(
#if defined(MIPS32_R2_LE)
// a1 = sign-extended low halfword of a.
"seh %[a1], %[a] \n\t"
#else
"sll %[a1], %[a], 16 \n\t"
"sra %[a1], %[a1], 16 \n\t"
#endif
// b1 = high half of b (signed); b2 = low half of b (unsigned) halved.
"andi %[b2], %[b], 0xFFFF \n\t"
"sra %[b1], %[b], 16 \n\t"
"sra %[b2], %[b2], 1 \n\t"
"mul %[value32], %[a1], %[b1] \n\t"
"mul %[b2], %[a1], %[b2] \n\t"
// Round the low-half product before shifting it down by 15.
"addiu %[b2], %[b2], 0x4000 \n\t"
"sra %[b2], %[b2], 15 \n\t"
"addu %[value32], %[value32], %[b2] \n\t"
: [value32] "=&r" (value32), [b1] "=&r" (b1), [b2] "=&r" (b2),
[a1] "=&r" (a1)
: [a] "r" (a), [b] "r" (b)
: "hi", "lo"
);
return value32;
}
// Multiplies two 32-bit values by splitting |a| into its upper halfword and
// its halved lower halfword, feeding both to WEBRTC_SPL_MUL_16_32_RSFT16.
// The upper-halfword term is only evaluated when |a| lies outside
// [0, 32767]; inside that range (a >> 16) is zero and the term is left at 0.
static __inline int32_t WEBRTC_SPL_MUL_32_32_RSFT32BI(int32_t a,
                                                      int32_t b) {
  const int16_t low_half = (int16_t)((a & 0x0000FFFF) >> 1);
  int32_t high_term = 0;
  int32_t low_term;

  if (!((a >= 0) && (a <= 32767))) {
    high_term = WEBRTC_SPL_MUL_16_32_RSFT16((int16_t)(a >> 16), b);
  }
  low_term = WEBRTC_SPL_MUL_16_32_RSFT16(low_half, b) >> 15;

  return high_term + low_term;
}
// Multiplies the 32-bit value |c| by a second 32-bit value supplied as two
// 16-bit pieces |a| and |b|. With hi = c >> 16 and lo = (c & 0xFFFF) >> 1,
// and r15(x) meaning x shifted right by 15 with rounding, the instructions
// below compute
//   (a * hi + r15(a * lo)) + ((b * hi + r15(b * lo)) >> 16)
static __inline int32_t WEBRTC_SPL_MUL_32_32_RSFT32(int16_t a,
int16_t b,
int32_t c) {
int32_t tmp1 = 0, tmp2 = 0, tmp3 = 0, tmp4 = 0;
__asm __volatile(
// tmp1 = high half of c (signed), tmp2 = low half of c (unsigned).
"sra %[tmp1], %[c], 16 \n\t"
"andi %[tmp2], %[c], 0xFFFF \n\t"
#if defined(MIPS32_R2_LE)
// Sign-extend a and b in place (both are read-write operands).
"seh %[a], %[a] \n\t"
"seh %[b], %[b] \n\t"
#else
"sll %[a], %[a], 16 \n\t"
"sra %[a], %[a], 16 \n\t"
"sll %[b], %[b], 16 \n\t"
"sra %[b], %[b], 16 \n\t"
#endif
"sra %[tmp2], %[tmp2], 1 \n\t"
// Four partial products: a*lo, b*lo, a*hi, b*hi.
"mul %[tmp3], %[a], %[tmp2] \n\t"
"mul %[tmp4], %[b], %[tmp2] \n\t"
"mul %[tmp2], %[a], %[tmp1] \n\t"
"mul %[tmp1], %[b], %[tmp1] \n\t"
#if defined(MIPS_DSP_R1_LE)
// NOTE(review): shra_r.w is presumably the DSP rounding shift matching the
// addiu/sra pair in the #else branch - confirm against the DSP ASE manual.
"shra_r.w %[tmp3], %[tmp3], 15 \n\t"
"shra_r.w %[tmp4], %[tmp4], 15 \n\t"
#else
"addiu %[tmp3], %[tmp3], 0x4000 \n\t"
"sra %[tmp3], %[tmp3], 15 \n\t"
"addiu %[tmp4], %[tmp4], 0x4000 \n\t"
"sra %[tmp4], %[tmp4], 15 \n\t"
#endif
// Combine the partial products; the b-term is scaled down by another 16.
"addu %[tmp3], %[tmp3], %[tmp2] \n\t"
"addu %[tmp4], %[tmp4], %[tmp1] \n\t"
"sra %[tmp4], %[tmp4], 16 \n\t"
"addu %[tmp1], %[tmp3], %[tmp4] \n\t"
: [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
[tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
[a] "+r" (a), [b] "+r" (b)
: [c] "r" (c)
: "hi", "lo"
);
return tmp1;
}
#if defined(MIPS_DSP_R1_LE)
// Converts a 32-bit value to 16 bits using the DSP ASE: a saturating left
// shift by 16 followed by an arithmetic right shift by 16, then a cast.
// NOTE(review): shll_s.w presumably also sets the DSPControl overflow flag,
// which is not listed as a clobber - confirm whether any caller reads it.
static __inline int16_t WebRtcSpl_SatW32ToW16(int32_t value32) {
__asm __volatile(
"shll_s.w %[value32], %[value32], 16 \n\t"
"sra %[value32], %[value32], 16 \n\t"
: [value32] "+r" (value32)
:
);
int16_t out16 = (int16_t)value32;
return out16;
}
// Adds two 16-bit values with the DSP ASE addq_s.ph instruction and returns
// the low 16 bits of the result register.
// NOTE(review): addq_s.ph presumably saturates each halfword lane
// independently, so the upper lane does not affect the returned value -
// confirm against the DSP ASE manual.
static __inline int16_t WebRtcSpl_AddSatW16(int16_t a, int16_t b) {
int32_t value32 = 0;
__asm __volatile(
"addq_s.ph %[value32], %[a], %[b] \n\t"
: [value32] "=r" (value32)
: [a] "r" (a), [b] "r" (b)
);
return (int16_t)value32;
}
// Adds two 32-bit values with the DSP ASE addq_s.w instruction and returns
// the result register unchanged.
// NOTE(review): addq_s.w is presumably the saturating word add - confirm
// against the DSP ASE manual.
static __inline int32_t WebRtcSpl_AddSatW32(int32_t l_var1, int32_t l_var2) {
int32_t l_sum;
__asm __volatile(
"addq_s.w %[l_sum], %[l_var1], %[l_var2] \n\t"
: [l_sum] "=r" (l_sum)
: [l_var1] "r" (l_var1), [l_var2] "r" (l_var2)
);
return l_sum;
}
// Computes var1 - var2 with the DSP ASE subq_s.ph instruction and returns
// the low 16 bits of the result register.
// NOTE(review): subq_s.ph presumably saturates each halfword lane
// independently - confirm against the DSP ASE manual.
static __inline int16_t WebRtcSpl_SubSatW16(int16_t var1, int16_t var2) {
int32_t value32;
__asm __volatile(
"subq_s.ph %[value32], %[var1], %[var2] \n\t"
: [value32] "=r" (value32)
: [var1] "r" (var1), [var2] "r" (var2)
);
return (int16_t)value32;
}
// Computes l_var1 - l_var2 with the DSP ASE subq_s.w instruction and
// returns the result register unchanged.
// NOTE(review): subq_s.w is presumably the saturating word subtract -
// confirm against the DSP ASE manual.
static __inline int32_t WebRtcSpl_SubSatW32(int32_t l_var1, int32_t l_var2) {
int32_t l_diff;
__asm __volatile(
"subq_s.w %[l_diff], %[l_var1], %[l_var2] \n\t"
: [l_diff] "=r" (l_diff)
: [l_var1] "r" (l_var1), [l_var2] "r" (l_var2)
);
return l_diff;
}
#endif
// Returns 32 minus the number of leading zero bits of |n|, i.e. the number
// of bits needed to represent |n|.
static __inline int16_t WebRtcSpl_GetSizeInBits(uint32_t n) {
int bits = 0;
int i32 = 32;
__asm __volatile(
// bits = clz(n), then bits = 32 - bits.
"clz %[bits], %[n] \n\t"
"subu %[bits], %[i32], %[bits] \n\t"
: [bits] "=&r" (bits)
: [n] "r" (n), [i32] "r" (i32)
);
return bits;
}
// Returns 0 when |a| is 0. Otherwise returns clz(a ^ (a >> 31)) - 1, i.e.
// the leading-zero count of |a| (bit-inverted first when negative) minus
// the sign bit.
static __inline int WebRtcSpl_NormW32(int32_t a) {
int zeros = 0;
__asm __volatile(
".set push \n\t"
".set noreorder \n\t"
"bnez %[a], 1f \n\t"
// Delay slot: zeros = sign mask of a (all ones if negative, else zero).
" sra %[zeros], %[a], 31 \n\t"
// a == 0: result is 0 (set in the delay slot of the branch).
"b 2f \n\t"
" move %[zeros], $zero \n\t"
"1: \n\t"
// XOR with the sign mask inverts negative inputs, leaves others unchanged.
"xor %[zeros], %[a], %[zeros] \n\t"
"clz %[zeros], %[zeros] \n\t"
"addiu %[zeros], %[zeros], -1 \n\t"
"2: \n\t"
".set pop \n\t"
: [zeros]"=&r"(zeros)
: [a] "r" (a)
);
return zeros;
}
// Returns the number of leading zero bits of |a|, masked to 5 bits.
// The mask maps the clz result 32 (produced for a == 0) to 0.
static __inline int WebRtcSpl_NormU32(uint32_t a) {
int zeros = 0;
__asm __volatile(
"clz %[zeros], %[a] \n\t"
: [zeros] "=r" (zeros)
: [a] "r" (a)
);
return (zeros & 0x1f);
}
static __inline int WebRtcSpl_NormW16(int16_t a) {
int zeros = 0;
int a0 = a << 16;
__asm __volatile(
".set push \n\t"
".set noreorder \n\t"
"bnez %[a0], 1f \n\t"
" sra %[zeros], %[a0], 31 \n\t"
"b 2f \n\t"
" move %[zeros], $zero \n\t"
"1: \n\t"
"xor %[zeros], %[a0], %[zeros] \n\t"
"clz %[zeros], %[zeros] \n\t"
"addiu %[zeros], %[zeros], -1 \n\t"
"2: \n\t"
".set pop \n\t"
: [zeros]"=&r"(zeros)
: [a0] "r" (a0)
);
return zeros;
}
static __inline int32_t WebRtc_MulAccumW16(int16_t a,
int16_t b,
int32_t c) {
int32_t res = 0, c1 = 0;
__asm __volatile(
#if defined(MIPS32_R2_LE)
"seh %[a], %[a] \n\t"
"seh %[b], %[b] \n\t"
#else
"sll %[a], %[a], 16 \n\t"
"sll %[b], %[b], 16 \n\t"
"sra %[a], %[a], 16 \n\t"
"sra %[b], %[b], 16 \n\t"
#endif
"mul %[res], %[a], %[b] \n\t"
"addu %[c1], %[c], %[res] \n\t"
: [c1] "=r" (c1), [res] "=&r" (res)
: [a] "r" (a), [b] "r" (b), [c] "r" (c)
: "hi", "lo"
);
return (c1);
}
#endif // WEBRTC_SPL_SPL_INL_MIPS_H_

View File

@ -529,12 +529,14 @@ TEST_F(SplTest, CrossCorrelationTest) {
// are not bit-exact.
const int32_t kExpected[kCrossCorrelationDimension] =
{-266947903, -15579555, -171282001};
const int32_t* expected = kExpected;
#if !defined(MIPS32_LE)
const int32_t kExpectedNeon[kCrossCorrelationDimension] =
{-266947901, -15579553, -171281999};
const int32_t* expected = kExpected;
if (WebRtcSpl_CrossCorrelation != WebRtcSpl_CrossCorrelationC) {
expected = kExpectedNeon;
}
#endif
for (int i = 0; i < kCrossCorrelationDimension; ++i) {
EXPECT_EQ(expected[i], vector32[i]);
}

View File

@ -82,18 +82,20 @@ static void InitPointersToMIPS() {
WebRtcSpl_MaxValueW32 = WebRtcSpl_MaxValueW32_mips;
WebRtcSpl_MinValueW16 = WebRtcSpl_MinValueW16_mips;
WebRtcSpl_MinValueW32 = WebRtcSpl_MinValueW32_mips;
WebRtcSpl_CrossCorrelation = WebRtcSpl_CrossCorrelationC;
WebRtcSpl_CrossCorrelation = WebRtcSpl_CrossCorrelation_mips;
WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFast_mips;
WebRtcSpl_ScaleAndAddVectorsWithRound =
WebRtcSpl_ScaleAndAddVectorsWithRoundC;
WebRtcSpl_CreateRealFFT = WebRtcSpl_CreateRealFFTC;
WebRtcSpl_FreeRealFFT = WebRtcSpl_FreeRealFFTC;
WebRtcSpl_RealForwardFFT = WebRtcSpl_RealForwardFFTC;
WebRtcSpl_RealInverseFFT = WebRtcSpl_RealInverseFFTC;
#if defined(MIPS_DSP_R1_LE)
WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32_mips;
WebRtcSpl_ScaleAndAddVectorsWithRound =
WebRtcSpl_ScaleAndAddVectorsWithRound_mips;
#else
WebRtcSpl_MaxAbsValueW32 = WebRtcSpl_MaxAbsValueW32C;
WebRtcSpl_ScaleAndAddVectorsWithRound =
WebRtcSpl_ScaleAndAddVectorsWithRoundC;
#endif
}
#endif

View File

@ -0,0 +1,207 @@
/*
* Written by Wilco Dijkstra, 1996. The following email exchange establishes the
* license.
*
* From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
* Date: Fri, Jun 24, 2011 at 3:20 AM
* Subject: Re: sqrt routine
* To: Kevin Ma <kma@google.com>
* Hi Kevin,
* Thanks for asking. Those routines are public domain (originally posted to
* comp.sys.arm a long time ago), so you can use them freely for any purpose.
* Cheers,
* Wilco
*
* ----- Original Message -----
* From: "Kevin Ma" <kma@google.com>
* To: <Wilco.Dijkstra@ntlworld.com>
* Sent: Thursday, June 23, 2011 11:44 PM
* Subject: Fwd: sqrt routine
* Hi Wilco,
* I saw your sqrt routine from several web sites, including
* http://www.finesse.demon.co.uk/steven/sqrt.html.
* Just wonder if there's any copyright information with your Successive
* approximation routines, or if I can freely use it for any purpose.
* Thanks.
* Kevin
*/
// Minor modifications in code style for WebRTC, 2012.
// Code optimizations for MIPS, 2013.
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
/*
* Algorithm:
* Successive approximation of the equation (root + delta) ^ 2 = N
* until delta < 1. If delta < 1 we have the integer part of SQRT (N).
* Use delta = 2^i for i = 15 .. 0.
*
* Output precision is 16 bits. Note for large input values (close to
* 0x7FFFFFFF), bit 15 (the highest bit of the low 16-bit half word)
* contains the MSB information (a non-sign value). Do with caution
* if you need to cast the output to int16_t type.
*
* If the input value is negative, it returns 0.
*/
int32_t WebRtcSpl_SqrtFloor(int32_t value)
{
int32_t root = 0, tmp1, tmp2, tmp3, tmp4;
__asm __volatile(
".set push \n\t"
".set noreorder \n\t"
"lui %[tmp1], 0x4000 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"sub %[tmp3], %[value], %[tmp1] \n\t"
"lui %[tmp1], 0x1 \n\t"
"or %[tmp4], %[root], %[tmp1] \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x4000 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 14 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x8000 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x2000 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 13 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x4000 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x1000 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 12 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x2000 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x800 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 11 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x1000 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x400 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 10 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x800 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x200 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 9 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x400 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x100 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 8 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x200 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x80 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 7 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x100 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x40 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 6 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x80 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x20 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 5 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x40 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x10 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 4 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x20 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x8 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 3 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x10 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x4 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 2 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x8 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x2 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"sll %[tmp1], 1 \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"subu %[tmp3], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x4 \n\t"
"movz %[value], %[tmp3], %[tmp2] \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
"addiu %[tmp1], $0, 0x1 \n\t"
"addu %[tmp1], %[tmp1], %[root] \n\t"
"slt %[tmp2], %[value], %[tmp1] \n\t"
"ori %[tmp4], %[root], 0x2 \n\t"
"movz %[root], %[tmp4], %[tmp2] \n\t"
".set pop \n\t"
: [root] "+r" (root), [value] "+r" (value),
[tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
[tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4)
:
);
return root >> 1;
}

View File

@ -0,0 +1,56 @@
/*
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
 * This file contains the MIPS DSP implementation of the function
 * WebRtcSpl_ScaleAndAddVectorsWithRound_mips().
 */
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
/*
 * Scales two 16-bit vectors, adds them, shifts the sum right and stores the
 * low 16 bits of each result in out_vector. For every index i in
 * [0, length):
 *   acc = in_vector1[i] * in_vector1_scale + in_vector2[i] * in_vector2_scale
 *   out_vector[i] = low halfword of (acc shifted right by right_shifts)
 * The shift is performed by the DSP ASE extrv_r.w instruction on
 * accumulator $ac0.
 *
 * Returns -1 if any pointer is NULL, length <= 0 or right_shifts < 0;
 * returns 0 otherwise.
 *
 * NOTE(review): extrv_r.w presumably rounds the shifted value and uses only
 * the low 5 bits of right_shifts - confirm against the DSP ASE manual and
 * the range callers pass in.
 */
int WebRtcSpl_ScaleAndAddVectorsWithRound_mips(const int16_t* in_vector1,
int16_t in_vector1_scale,
const int16_t* in_vector2,
int16_t in_vector2_scale,
int right_shifts,
int16_t* out_vector,
int length) {
int16_t r0 = 0, r1 = 0;
/* Working pointers; the asm advances them by one element per iteration. */
int16_t *in1 = (int16_t*)in_vector1;
int16_t *in2 = (int16_t*)in_vector2;
int16_t *out = out_vector;
int i = 0, value32 = 0;
/* Reject invalid arguments before touching any memory. */
if (in_vector1 == NULL || in_vector2 == NULL || out_vector == NULL ||
length <= 0 || right_shifts < 0) {
return -1;
}
for (i = 0; i < length; i++) {
__asm __volatile (
/* Load one sample from each input vector. */
"lh %[r0], 0(%[in1]) \n\t"
"lh %[r1], 0(%[in2]) \n\t"
/* hi/lo = r0 * scale1, then hi/lo += r1 * scale2. */
"mult %[r0], %[in_vector1_scale] \n\t"
"madd %[r1], %[in_vector2_scale] \n\t"
/* Extract the accumulator shifted right by right_shifts. */
"extrv_r.w %[value32], $ac0, %[right_shifts] \n\t"
"addiu %[in1], %[in1], 2 \n\t"
"addiu %[in2], %[in2], 2 \n\t"
/* Store the low halfword and advance the output pointer. */
"sh %[value32], 0(%[out]) \n\t"
"addiu %[out], %[out], 2 \n\t"
: [value32] "=&r" (value32), [out] "+r" (out), [in1] "+r" (in1),
[in2] "+r" (in2), [r0] "=&r" (r0), [r1] "=&r" (r1)
: [in_vector1_scale] "r" (in_vector1_scale),
[in_vector2_scale] "r" (in_vector2_scale),
[right_shifts] "r" (right_shifts)
: "hi", "lo", "memory"
);
}
return 0;
}