MIPS optimizations for ISAC (patch #1)
Implemented functions: - WebRtcIsacfix_AutocorrMIPS - WebRtcIsacfix_FilterArLoop - WebRtcIsacfix_FilterMaLoopMIPS - WebRtcIsacfix_AllpassFilter2FixDec16MIPS (only MIPS DSP) - WebRtcIsacfix_PitchFilterCore (only MIPS DSPR2) Gain achieved: from approx. 15% (MIPS32) up to approx. 40% (MIPS DSPR2) R=andrew@webrtc.org, tina.legrand@webrtc.org Review URL: https://webrtc-codereview.appspot.com/17559005 Patch from Ljubomir Papuga <lpapuga@mips.com>. git-svn-id: http://webrtc.googlecode.com/svn/trunk@6387 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
0d7ab0a634
commit
919914d71b
@ -179,6 +179,21 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
|
||||
int32_t* ptr2);
|
||||
#endif
|
||||
|
||||
#if defined(MIPS32_LE)
|
||||
// MIPS implementation of the fixed-point autocorrelation.
//   r     - output, receives the correlation values for lags 0..order
//   x     - input samples
//   N     - number of input samples
//   order - highest lag to compute
//   scale - output, receives the right shift that was applied to the results
// Returns order + 1.
int WebRtcIsacfix_AutocorrMIPS(int32_t* __restrict r,
|
||||
const int16_t* __restrict x,
|
||||
int16_t N,
|
||||
int16_t order,
|
||||
int16_t* __restrict scale);
|
||||
|
||||
// MIPS implementation of the inner loop of the MA lattice filter.
//   input0, input1 - filter coefficients
//   input2         - inverse coefficient
//   ptr0           - sample buffer that is read
//   ptr1           - sample buffer that is written
//   ptr2           - sample buffer that is updated in place
void WebRtcIsacfix_FilterMaLoopMIPS(int16_t input0,
|
||||
int16_t input1,
|
||||
int32_t input2,
|
||||
int32_t* ptr0,
|
||||
int32_t* ptr1,
|
||||
int32_t* ptr2);
|
||||
#endif
|
||||
|
||||
/* Function pointers associated with the above functions. */
|
||||
|
||||
typedef int (*AutocorrFix)(int32_t* __restrict r,
|
||||
|
@ -58,6 +58,17 @@ void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
|
||||
int32_t *filter_state_ch2);
|
||||
#endif
|
||||
|
||||
#if defined(MIPS_DSP_R1_LE)
|
||||
// MIPS DSP implementation of the two-channel all-pass filter.
//   data_ch1, data_ch2                 - samples, filtered in place
//   factor_ch1, factor_ch2             - two filter factors per channel
//   length                             - number of samples per channel
//   filter_state_ch1, filter_state_ch2 - two state words per channel, read at
//                                        entry and written back at exit
void WebRtcIsacfix_AllpassFilter2FixDec16MIPS(
|
||||
int16_t *data_ch1,
|
||||
int16_t *data_ch2,
|
||||
const int16_t *factor_ch1,
|
||||
const int16_t *factor_ch2,
|
||||
const int length,
|
||||
int32_t *filter_state_ch1,
|
||||
int32_t *filter_state_ch2);
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus) || defined(c_plusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/filterbank_internal.h"
|
||||
|
||||
// WebRtcIsacfix_AllpassFilter2FixDec16 function optimized for MIPSDSP platform
|
||||
// Bit-exact with WebRtcIsacfix_AllpassFilter2FixDec16C from filterbanks.c
|
||||
void WebRtcIsacfix_AllpassFilter2FixDec16MIPS(
|
||||
int16_t *data_ch1, // Input and output in channel 1, in Q0
|
||||
int16_t *data_ch2, // Input and output in channel 2, in Q0
|
||||
const int16_t *factor_ch1, // Scaling factor for channel 1, in Q15
|
||||
const int16_t *factor_ch2, // Scaling factor for channel 2, in Q15
|
||||
const int length, // Length of the data buffers
|
||||
int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
|
||||
int32_t *filter_state_ch2) { // Filter state for channel 2, in Q16
|
||||
|
||||
int32_t st0_ch1, st1_ch1; // channel1 state variables
|
||||
int32_t st0_ch2, st1_ch2; // channel2 state variables
|
||||
int32_t f_ch10, f_ch11, f_ch20, f_ch21; // factor variables
|
||||
int32_t r0, r1, r2, r3, r4, r5; // temporary ragister variables
|
||||
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
// Load all the state and factor variables
|
||||
"lh %[f_ch10], 0(%[factor_ch1]) \n\t"
|
||||
"lh %[f_ch20], 0(%[factor_ch2]) \n\t"
|
||||
"lh %[f_ch11], 2(%[factor_ch1]) \n\t"
|
||||
"lh %[f_ch21], 2(%[factor_ch2]) \n\t"
|
||||
"lw %[st0_ch1], 0(%[filter_state_ch1]) \n\t"
|
||||
"lw %[st1_ch1], 4(%[filter_state_ch1]) \n\t"
|
||||
"lw %[st0_ch2], 0(%[filter_state_ch2]) \n\t"
|
||||
"lw %[st1_ch2], 4(%[filter_state_ch2]) \n\t"
|
||||
// Allpass filtering loop
|
||||
"1: \n\t"
|
||||
"lh %[r0], 0(%[data_ch1]) \n\t"
|
||||
"lh %[r1], 0(%[data_ch2]) \n\t"
|
||||
"addiu %[length], %[length], -1 \n\t"
|
||||
"mul %[r2], %[r0], %[f_ch10] \n\t"
|
||||
"mul %[r3], %[r1], %[f_ch20] \n\t"
|
||||
"sll %[r0], %[r0], 16 \n\t"
|
||||
"sll %[r1], %[r1], 16 \n\t"
|
||||
"sll %[r2], %[r2], 1 \n\t"
|
||||
"addq_s.w %[r2], %[r2], %[st0_ch1] \n\t"
|
||||
"sll %[r3], %[r3], 1 \n\t"
|
||||
"addq_s.w %[r3], %[r3], %[st0_ch2] \n\t"
|
||||
"sra %[r2], %[r2], 16 \n\t"
|
||||
"mul %[st0_ch1], %[f_ch10], %[r2] \n\t"
|
||||
"sra %[r3], %[r3], 16 \n\t"
|
||||
"mul %[st0_ch2], %[f_ch20], %[r3] \n\t"
|
||||
"mul %[r4], %[r2], %[f_ch11] \n\t"
|
||||
"mul %[r5], %[r3], %[f_ch21] \n\t"
|
||||
"sll %[st0_ch1], %[st0_ch1], 1 \n\t"
|
||||
"subq_s.w %[st0_ch1], %[r0], %[st0_ch1] \n\t"
|
||||
"sll %[st0_ch2], %[st0_ch2], 1 \n\t"
|
||||
"subq_s.w %[st0_ch2], %[r1], %[st0_ch2] \n\t"
|
||||
"sll %[r4], %[r4], 1 \n\t"
|
||||
"addq_s.w %[r4], %[r4], %[st1_ch1] \n\t"
|
||||
"sll %[r5], %[r5], 1 \n\t"
|
||||
"addq_s.w %[r5], %[r5], %[st1_ch2] \n\t"
|
||||
"sra %[r4], %[r4], 16 \n\t"
|
||||
"mul %[r0], %[r4], %[f_ch11] \n\t"
|
||||
"sra %[r5], %[r5], 16 \n\t"
|
||||
"mul %[r1], %[r5], %[f_ch21] \n\t"
|
||||
"sh %[r4], 0(%[data_ch1]) \n\t"
|
||||
"sh %[r5], 0(%[data_ch2]) \n\t"
|
||||
"addiu %[data_ch1], %[data_ch1], 2 \n\t"
|
||||
"sll %[r2], %[r2], 16 \n\t"
|
||||
"sll %[r0], %[r0], 1 \n\t"
|
||||
"subq_s.w %[st1_ch1], %[r2], %[r0] \n\t"
|
||||
"sll %[r3], %[r3], 16 \n\t"
|
||||
"sll %[r1], %[r1], 1 \n\t"
|
||||
"subq_s.w %[st1_ch2], %[r3], %[r1] \n\t"
|
||||
"bgtz %[length], 1b \n\t"
|
||||
" addiu %[data_ch2], %[data_ch2], 2 \n\t"
|
||||
// Store channel states
|
||||
"sw %[st0_ch1], 0(%[filter_state_ch1]) \n\t"
|
||||
"sw %[st1_ch1], 4(%[filter_state_ch1]) \n\t"
|
||||
"sw %[st0_ch2], 0(%[filter_state_ch2]) \n\t"
|
||||
"sw %[st1_ch2], 4(%[filter_state_ch2]) \n\t"
|
||||
".set pop \n\t"
|
||||
: [f_ch10] "=&r" (f_ch10), [f_ch20] "=&r" (f_ch20),
|
||||
[f_ch11] "=&r" (f_ch11), [f_ch21] "=&r" (f_ch21),
|
||||
[st0_ch1] "=&r" (st0_ch1), [st1_ch1] "=&r" (st1_ch1),
|
||||
[st0_ch2] "=&r" (st0_ch2), [st1_ch2] "=&r" (st1_ch2),
|
||||
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2),
|
||||
[r3] "=&r" (r3), [r4] "=&r" (r4), [r5] "=&r" (r5)
|
||||
: [factor_ch1] "r" (factor_ch1), [factor_ch2] "r" (factor_ch2),
|
||||
[filter_state_ch1] "r" (filter_state_ch1),
|
||||
[filter_state_ch2] "r" (filter_state_ch2),
|
||||
[data_ch1] "r" (data_ch1), [data_ch2] "r" (data_ch2),
|
||||
[length] "r" (length)
|
||||
: "memory", "hi", "lo"
|
||||
);
|
||||
}
|
@ -0,0 +1,365 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
|
||||
|
||||
// MIPS optimized implementation of the Autocorrelation function in fixed point.
|
||||
// NOTE! Different from SPLIB-version in how it scales the signal.
|
||||
// Computes the autocorrelation of x for lags 0..order and stores the values,
// all right-shifted by one common amount, in r[0..order].
//   r     - output; r[0] and r[1..order] are written below
//   x     - input samples (only read; the local alias `in` drops the const so
//           that it can be bound as a read-write asm operand)
//   N     - number of input samples
//   order - highest lag that is computed
//   scale - output; receives the shift that was applied to every r[i]
// Returns order + 1.
// NOTE(review): the lag loops start from count = N - i and terminate on a
// "not equal to zero" test, so they appear to rely on N - i never being
// negative - confirm against the callers.
int WebRtcIsacfix_AutocorrMIPS(int32_t* __restrict r,
|
||||
const int16_t* __restrict x,
|
||||
int16_t N,
|
||||
int16_t order,
|
||||
int16_t* __restrict scale) {
|
||||
int i = 0;
|
||||
int16_t scaling = 0;
|
||||
int16_t* in = (int16_t*)x;
|
||||
// Number of 8-sample groups handled by the unrolled loop, and the leftover.
int loop_size = (int)(N >> 3);
|
||||
int count = (int)(N & 7);
|
||||
// Declare temporary variables used as register values.
|
||||
int32_t r0, r1, r2, r3;
|
||||
#if !defined(MIPS_DSP_R2_LE)
|
||||
// For non-DSPR2 optimizations 4 more registers are used.
|
||||
int32_t r4, r5, r6, r7;
|
||||
#endif
|
||||
|
||||
// Calculate r[0] and scaling needed.
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"mult $0, $0 \n\t"
|
||||
// Loop is unrolled 8 times, set accumulator to zero in branch delay slot.
|
||||
"beqz %[loop_size], 2f \n\t"
|
||||
" mult $0, $0 \n\t"
|
||||
"1: \n\t"
|
||||
// Load 8 samples per loop iteration.
|
||||
#if defined(MIPS_DSP_R2_LE)
|
||||
"ulw %[r0], 0(%[in]) \n\t"
|
||||
"ulw %[r1], 4(%[in]) \n\t"
|
||||
"ulw %[r2], 8(%[in]) \n\t"
|
||||
"ulw %[r3], 12(%[in]) \n\t"
|
||||
#else
|
||||
"lh %[r0], 0(%[in]) \n\t"
|
||||
"lh %[r1], 2(%[in]) \n\t"
|
||||
"lh %[r2], 4(%[in]) \n\t"
|
||||
"lh %[r3], 6(%[in]) \n\t"
|
||||
"lh %[r4], 8(%[in]) \n\t"
|
||||
"lh %[r5], 10(%[in]) \n\t"
|
||||
"lh %[r6], 12(%[in]) \n\t"
|
||||
"lh %[r7], 14(%[in]) \n\t"
|
||||
#endif
|
||||
"addiu %[loop_size], %[loop_size], -1 \n\t"
|
||||
// Multiply and accumulate.
|
||||
#if defined(MIPS_DSP_R2_LE)
|
||||
"dpa.w.ph $ac0, %[r0], %[r0] \n\t"
|
||||
"dpa.w.ph $ac0, %[r1], %[r1] \n\t"
|
||||
"dpa.w.ph $ac0, %[r2], %[r2] \n\t"
|
||||
"dpa.w.ph $ac0, %[r3], %[r3] \n\t"
|
||||
#else
|
||||
"madd %[r0], %[r0] \n\t"
|
||||
"madd %[r1], %[r1] \n\t"
|
||||
"madd %[r2], %[r2] \n\t"
|
||||
"madd %[r3], %[r3] \n\t"
|
||||
"madd %[r4], %[r4] \n\t"
|
||||
"madd %[r5], %[r5] \n\t"
|
||||
"madd %[r6], %[r6] \n\t"
|
||||
"madd %[r7], %[r7] \n\t"
|
||||
#endif
|
||||
"bnez %[loop_size], 1b \n\t"
|
||||
" addiu %[in], %[in], 16 \n\t"
|
||||
"2: \n\t"
|
||||
// The delay slot below already reads the accumulator, so the value is
// available at label 4 when there are no leftover samples.
"beqz %[count], 4f \n\t"
|
||||
#if defined(MIPS_DSP_R1_LE)
|
||||
" extr.w %[r0], $ac0, 31 \n\t"
|
||||
#else
|
||||
" mfhi %[r2] \n\t"
|
||||
#endif
|
||||
// Process remaining samples (if any).
|
||||
"3: \n\t"
|
||||
"lh %[r0], 0(%[in]) \n\t"
|
||||
"addiu %[count], %[count], -1 \n\t"
|
||||
"madd %[r0], %[r0] \n\t"
|
||||
"bnez %[count], 3b \n\t"
|
||||
" addiu %[in], %[in], 2 \n\t"
|
||||
#if defined(MIPS_DSP_R1_LE)
|
||||
"extr.w %[r0], $ac0, 31 \n\t"
|
||||
#else
|
||||
"mfhi %[r2] \n\t"
|
||||
#endif
|
||||
"4: \n\t"
|
||||
#if !defined(MIPS_DSP_R1_LE)
|
||||
// Without DSP instructions, build (accumulator >> 31) from the hi/lo words.
"mflo %[r3] \n\t"
|
||||
"sll %[r0], %[r2], 1 \n\t"
|
||||
"srl %[r1], %[r3], 31 \n\t"
|
||||
"addu %[r0], %[r0], %[r1] \n\t"
|
||||
#endif
|
||||
// Calculate scaling (the value of shifting).
|
||||
// scaling = 32 - clz(accumulator >> 31), forced to 0 when that value is < 1.
"clz %[r1], %[r0] \n\t"
|
||||
"addiu %[r1], %[r1], -32 \n\t"
|
||||
"subu %[scaling], $0, %[r1] \n\t"
|
||||
"slti %[r1], %[r0], 0x1 \n\t"
|
||||
"movn %[scaling], $0, %[r1] \n\t"
|
||||
// Shift the accumulator right by scaling to obtain r[0].
#if defined(MIPS_DSP_R1_LE)
|
||||
"extrv.w %[r0], $ac0, %[scaling] \n\t"
|
||||
"mfhi %[r2], $ac0 \n\t"
|
||||
#else
|
||||
"addiu %[r1], %[scaling], -32 \n\t"
|
||||
"subu %[r1], $0, %[r1] \n\t"
|
||||
"sllv %[r1], %[r2], %[r1] \n\t"
|
||||
"srlv %[r0], %[r3], %[scaling] \n\t"
|
||||
"addu %[r0], %[r0], %[r1] \n\t"
|
||||
#endif
|
||||
// When scaling is not below 32, the high accumulator word is the result.
"slti %[r1], %[scaling], 32 \n\t"
|
||||
"movz %[r0], %[r2], %[r1] \n\t"
|
||||
".set pop \n\t"
|
||||
: [loop_size] "+r" (loop_size), [in] "+r" (in), [r0] "=&r" (r0),
|
||||
[r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
|
||||
#if !defined(MIPS_DSP_R2_LE)
|
||||
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
|
||||
#endif
|
||||
[count] "+r" (count), [scaling] "=r" (scaling)
|
||||
// N is bound as an input but is not referenced by the assembly text above.
: [N] "r" (N)
|
||||
: "memory", "hi", "lo"
|
||||
);
|
||||
r[0] = r0;
|
||||
|
||||
// Correlation calculation is divided in 3 cases depending on the scaling
|
||||
// value (different accumulator manipulation needed). Three slightly different
|
||||
// loops are written in order to avoid branches inside the loop.
|
||||
if (scaling == 0) {
|
||||
// In this case, the result will be in low part of the accumulator.
|
||||
for (i = 1; i < order + 1; i++) {
|
||||
in = (int16_t*)x;
|
||||
int16_t* in1 = (int16_t*)x + i;
|
||||
// N - i products are summed for lag i: groups of 4, then the remainder.
count = N - i;
|
||||
loop_size = (count) >> 2;
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"mult $0, $0 \n\t"
|
||||
"beqz %[loop_size], 2f \n\t"
|
||||
" andi %[count], %[count], 0x3 \n\t"
|
||||
// Loop processing 4 pairs of samples per iteration.
|
||||
"1: \n\t"
|
||||
#if defined(MIPS_DSP_R2_LE)
|
||||
"ulw %[r0], 0(%[in]) \n\t"
|
||||
"ulw %[r1], 0(%[in1]) \n\t"
|
||||
"ulw %[r2], 4(%[in]) \n\t"
|
||||
"ulw %[r3], 4(%[in1]) \n\t"
|
||||
#else
|
||||
"lh %[r0], 0(%[in]) \n\t"
|
||||
"lh %[r1], 0(%[in1]) \n\t"
|
||||
"lh %[r2], 2(%[in]) \n\t"
|
||||
"lh %[r3], 2(%[in1]) \n\t"
|
||||
"lh %[r4], 4(%[in]) \n\t"
|
||||
"lh %[r5], 4(%[in1]) \n\t"
|
||||
"lh %[r6], 6(%[in]) \n\t"
|
||||
"lh %[r7], 6(%[in1]) \n\t"
|
||||
#endif
|
||||
"addiu %[loop_size], %[loop_size], -1 \n\t"
|
||||
#if defined(MIPS_DSP_R2_LE)
|
||||
"dpa.w.ph $ac0, %[r0], %[r1] \n\t"
|
||||
"dpa.w.ph $ac0, %[r2], %[r3] \n\t"
|
||||
#else
|
||||
"madd %[r0], %[r1] \n\t"
|
||||
"madd %[r2], %[r3] \n\t"
|
||||
"madd %[r4], %[r5] \n\t"
|
||||
"madd %[r6], %[r7] \n\t"
|
||||
#endif
|
||||
"addiu %[in], %[in], 8 \n\t"
|
||||
"bnez %[loop_size], 1b \n\t"
|
||||
" addiu %[in1], %[in1], 8 \n\t"
|
||||
"2: \n\t"
|
||||
"beqz %[count], 4f \n\t"
|
||||
" mflo %[r0] \n\t"
|
||||
// Process remaining samples (if any).
|
||||
"3: \n\t"
|
||||
"lh %[r0], 0(%[in]) \n\t"
|
||||
"lh %[r1], 0(%[in1]) \n\t"
|
||||
"addiu %[count], %[count], -1 \n\t"
|
||||
"addiu %[in], %[in], 2 \n\t"
|
||||
"madd %[r0], %[r1] \n\t"
|
||||
"bnez %[count], 3b \n\t"
|
||||
" addiu %[in1], %[in1], 2 \n\t"
|
||||
"mflo %[r0] \n\t"
|
||||
"4: \n\t"
|
||||
".set pop \n\t"
|
||||
: [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
|
||||
#if !defined(MIPS_DSP_R2_LE)
|
||||
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
|
||||
#endif
|
||||
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
|
||||
[count] "+r" (count)
|
||||
:
|
||||
: "memory", "hi", "lo"
|
||||
);
|
||||
r[i] = r0;
|
||||
}
|
||||
} else if (scaling == 32) {
|
||||
// In this case, the result will be high part of the accumulator.
|
||||
for (i = 1; i < order + 1; i++) {
|
||||
in = (int16_t*)x;
|
||||
int16_t* in1 = (int16_t*)x + i;
|
||||
count = N - i;
|
||||
loop_size = (count) >> 2;
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"mult $0, $0 \n\t"
|
||||
"beqz %[loop_size], 2f \n\t"
|
||||
" andi %[count], %[count], 0x3 \n\t"
|
||||
// Loop processing 4 pairs of samples per iteration.
|
||||
"1: \n\t"
|
||||
#if defined(MIPS_DSP_R2_LE)
|
||||
"ulw %[r0], 0(%[in]) \n\t"
|
||||
"ulw %[r1], 0(%[in1]) \n\t"
|
||||
"ulw %[r2], 4(%[in]) \n\t"
|
||||
"ulw %[r3], 4(%[in1]) \n\t"
|
||||
#else
|
||||
"lh %[r0], 0(%[in]) \n\t"
|
||||
"lh %[r1], 0(%[in1]) \n\t"
|
||||
"lh %[r2], 2(%[in]) \n\t"
|
||||
"lh %[r3], 2(%[in1]) \n\t"
|
||||
"lh %[r4], 4(%[in]) \n\t"
|
||||
"lh %[r5], 4(%[in1]) \n\t"
|
||||
"lh %[r6], 6(%[in]) \n\t"
|
||||
"lh %[r7], 6(%[in1]) \n\t"
|
||||
#endif
|
||||
"addiu %[loop_size], %[loop_size], -1 \n\t"
|
||||
#if defined(MIPS_DSP_R2_LE)
|
||||
"dpa.w.ph $ac0, %[r0], %[r1] \n\t"
|
||||
"dpa.w.ph $ac0, %[r2], %[r3] \n\t"
|
||||
#else
|
||||
"madd %[r0], %[r1] \n\t"
|
||||
"madd %[r2], %[r3] \n\t"
|
||||
"madd %[r4], %[r5] \n\t"
|
||||
"madd %[r6], %[r7] \n\t"
|
||||
#endif
|
||||
"addiu %[in], %[in], 8 \n\t"
|
||||
"bnez %[loop_size], 1b \n\t"
|
||||
" addiu %[in1], %[in1], 8 \n\t"
|
||||
"2: \n\t"
|
||||
"beqz %[count], 4f \n\t"
|
||||
" mfhi %[r0] \n\t"
|
||||
// Process remaining samples (if any).
|
||||
"3: \n\t"
|
||||
"lh %[r0], 0(%[in]) \n\t"
|
||||
"lh %[r1], 0(%[in1]) \n\t"
|
||||
"addiu %[count], %[count], -1 \n\t"
|
||||
"addiu %[in], %[in], 2 \n\t"
|
||||
"madd %[r0], %[r1] \n\t"
|
||||
"bnez %[count], 3b \n\t"
|
||||
" addiu %[in1], %[in1], 2 \n\t"
|
||||
"mfhi %[r0] \n\t"
|
||||
"4: \n\t"
|
||||
".set pop \n\t"
|
||||
: [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
|
||||
#if !defined(MIPS_DSP_R2_LE)
|
||||
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
|
||||
#endif
|
||||
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
|
||||
[count] "+r" (count)
|
||||
:
|
||||
: "memory", "hi", "lo"
|
||||
);
|
||||
r[i] = r0;
|
||||
}
|
||||
} else {
|
||||
// In this case, the result is obtained by combining low and high parts
|
||||
// of the accumulator.
|
||||
#if !defined(MIPS_DSP_R1_LE)
|
||||
// Left shift applied to the high word before it is added to the shifted
// low word.
int32_t tmp_shift = 32 - scaling;
|
||||
#endif
|
||||
for (i = 1; i < order + 1; i++) {
|
||||
in = (int16_t*)x;
|
||||
int16_t* in1 = (int16_t*)x + i;
|
||||
count = N - i;
|
||||
loop_size = (count) >> 2;
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"mult $0, $0 \n\t"
|
||||
"beqz %[loop_size], 2f \n\t"
|
||||
" andi %[count], %[count], 0x3 \n\t"
|
||||
// Loop processing 4 pairs of samples per iteration.
"1: \n\t"
|
||||
#if defined(MIPS_DSP_R2_LE)
|
||||
"ulw %[r0], 0(%[in]) \n\t"
|
||||
"ulw %[r1], 0(%[in1]) \n\t"
|
||||
"ulw %[r2], 4(%[in]) \n\t"
|
||||
"ulw %[r3], 4(%[in1]) \n\t"
|
||||
#else
|
||||
"lh %[r0], 0(%[in]) \n\t"
|
||||
"lh %[r1], 0(%[in1]) \n\t"
|
||||
"lh %[r2], 2(%[in]) \n\t"
|
||||
"lh %[r3], 2(%[in1]) \n\t"
|
||||
"lh %[r4], 4(%[in]) \n\t"
|
||||
"lh %[r5], 4(%[in1]) \n\t"
|
||||
"lh %[r6], 6(%[in]) \n\t"
|
||||
"lh %[r7], 6(%[in1]) \n\t"
|
||||
#endif
|
||||
"addiu %[loop_size], %[loop_size], -1 \n\t"
|
||||
#if defined(MIPS_DSP_R2_LE)
|
||||
"dpa.w.ph $ac0, %[r0], %[r1] \n\t"
|
||||
"dpa.w.ph $ac0, %[r2], %[r3] \n\t"
|
||||
#else
|
||||
"madd %[r0], %[r1] \n\t"
|
||||
"madd %[r2], %[r3] \n\t"
|
||||
"madd %[r4], %[r5] \n\t"
|
||||
"madd %[r6], %[r7] \n\t"
|
||||
#endif
|
||||
"addiu %[in], %[in], 8 \n\t"
|
||||
"bnez %[loop_size], 1b \n\t"
|
||||
" addiu %[in1], %[in1], 8 \n\t"
|
||||
"2: \n\t"
|
||||
"beqz %[count], 4f \n\t"
|
||||
#if defined(MIPS_DSP_R1_LE)
|
||||
" extrv.w %[r0], $ac0, %[scaling] \n\t"
|
||||
#else
|
||||
" mfhi %[r0] \n\t"
|
||||
#endif
|
||||
// Process remaining samples (if any).
"3: \n\t"
|
||||
"lh %[r0], 0(%[in]) \n\t"
|
||||
"lh %[r1], 0(%[in1]) \n\t"
|
||||
"addiu %[count], %[count], -1 \n\t"
|
||||
"addiu %[in], %[in], 2 \n\t"
|
||||
"madd %[r0], %[r1] \n\t"
|
||||
"bnez %[count], 3b \n\t"
|
||||
" addiu %[in1], %[in1], 2 \n\t"
|
||||
#if defined(MIPS_DSP_R1_LE)
|
||||
"extrv.w %[r0], $ac0, %[scaling] \n\t"
|
||||
#else
|
||||
"mfhi %[r0] \n\t"
|
||||
#endif
|
||||
"4: \n\t"
|
||||
#if !defined(MIPS_DSP_R1_LE)
|
||||
// Combine (hi << tmp_shift) with (lo >> scaling).
"mflo %[r1] \n\t"
|
||||
"sllv %[r0], %[r0], %[tmp_shift] \n\t"
|
||||
"srlv %[r1], %[r1], %[scaling] \n\t"
|
||||
"addu %[r0], %[r0], %[r1] \n\t"
|
||||
#endif
|
||||
".set pop \n\t"
|
||||
: [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
|
||||
#if !defined(MIPS_DSP_R2_LE)
|
||||
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
|
||||
#endif
|
||||
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
|
||||
[count] "+r" (count)
|
||||
: [scaling] "r" (scaling)
|
||||
#if !defined(MIPS_DSP_R1_LE)
|
||||
, [tmp_shift] "r" (tmp_shift)
|
||||
#endif
|
||||
: "memory", "hi", "lo"
|
||||
);
|
||||
r[i] = r0;
|
||||
}
|
||||
}
|
||||
*scale = scaling;
|
||||
|
||||
return (order + 1);
|
||||
}
|
@ -179,7 +179,7 @@ int16_t WebRtcIsacfix_FreeInternal(ISACFIX_MainStruct *ISAC_main_inst)
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
* WebRtcAecm_InitNeon(...)
|
||||
* WebRtcIsacfix_InitNeon(...)
|
||||
*
|
||||
* This function initializes function pointers for ARM Neon platform.
|
||||
*/
|
||||
@ -199,6 +199,23 @@ static void WebRtcIsacfix_InitNeon(void) {
|
||||
}
|
||||
#endif
|
||||
|
||||
/****************************************************************************
|
||||
* WebRtcIsacfix_InitMIPS(...)
|
||||
*
|
||||
* This function initializes function pointers for MIPS platform.
|
||||
*/
|
||||
|
||||
#if defined(MIPS32_LE)
|
||||
static void WebRtcIsacfix_InitMIPS(void) {
|
||||
WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrMIPS;
|
||||
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopMIPS;
|
||||
#if defined(MIPS_DSP_R1_LE)
|
||||
WebRtcIsacfix_AllpassFilter2FixDec16 =
|
||||
WebRtcIsacfix_AllpassFilter2FixDec16MIPS;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
/****************************************************************************
|
||||
* WebRtcIsacfix_EncoderInit(...)
|
||||
*
|
||||
@ -296,6 +313,10 @@ int16_t WebRtcIsacfix_EncoderInit(ISACFIX_MainStruct *ISAC_main_inst,
|
||||
WebRtcIsacfix_InitNeon();
|
||||
#endif
|
||||
|
||||
#if defined(MIPS32_LE)
|
||||
WebRtcIsacfix_InitMIPS();
|
||||
#endif
|
||||
|
||||
return statusInit;
|
||||
}
|
||||
|
||||
|
@ -85,6 +85,30 @@
|
||||
'pitch_filter_c.c',
|
||||
],
|
||||
}],
|
||||
['target_arch=="mipsel"', {
|
||||
'sources': [
|
||||
'filters_mips.c',
|
||||
'lattice_mips.c',
|
||||
],
|
||||
'sources!': [
|
||||
'lattice_c.c',
|
||||
],
|
||||
'conditions': [
|
||||
['mips_dsp_rev>0', {
|
||||
'sources': [
|
||||
'filterbanks_mips.c',
|
||||
],
|
||||
}],
|
||||
['mips_dsp_rev>1', {
|
||||
'sources': [
|
||||
'pitch_filter_mips.c',
|
||||
],
|
||||
'sources!': [
|
||||
'pitch_filter_c.c',
|
||||
],
|
||||
}],
|
||||
],
|
||||
}],
|
||||
],
|
||||
},
|
||||
],
|
||||
|
@ -0,0 +1,327 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
|
||||
#include "webrtc/typedefs.h"
|
||||
|
||||
// Filter ar_g_Q0[] and ar_f_Q0[] through an AR filter with coefficients
|
||||
// cth_Q15[] and sth_Q15[].
|
||||
// AR lattice filter loop, MIPS inline-assembly version.
//
// For every n in [0, HALF_SUBFRAMELEN - 1) a running value t_ar is loaded from
// ar_f_Q0[n + 1] and then, for k = order_coef - 1 down to 0:
//   t0          = sat16(round((cth_Q15[k] * t_ar - sth_Q15[k] * ar_g_Q0[k]) >> 15))
//   ar_g_Q0[k+1] = sat16(round((sth_Q15[k] * t_ar + cth_Q15[k] * ar_g_Q0[k]) >> 15))
//   t_ar        = t0
// Finally t_ar is stored to both ar_f_Q0[n + 1] and ar_g_Q0[0].
// Both sample buffers are therefore updated in place; the coefficient arrays
// are only read.
void WebRtcIsacfix_FilterArLoop(int16_t* ar_g_Q0, // Input samples
|
||||
int16_t* ar_f_Q0, // Input samples
|
||||
int16_t* cth_Q15, // Filter coefficients
|
||||
int16_t* sth_Q15, // Filter coefficients
|
||||
int16_t order_coef) { // order of the filter
|
||||
int n = 0;
|
||||
|
||||
for (n = 0; n < HALF_SUBFRAMELEN - 1; n++) {
|
||||
// Index of the coefficient handled first; the inner loop counts down to 0.
int count = order_coef - 1;
|
||||
int offset;
|
||||
#if !defined(MIPS_DSP_R1_LE)
|
||||
int16_t* tmp_cth;
|
||||
|
||||
int16_t* tmp_sth;
|
||||
|
||||
int16_t* tmp_arg;
|
||||
|
||||
// Clamp limits used by the non-DSP saturation sequence below.
int32_t max_q16 = 0x7fff;
|
||||
int32_t min_q16 = 0xffff8000;
|
||||
#endif
|
||||
// Declare variables used as temporary registers.
|
||||
int32_t r0, r1, r2, t0, t1, t2, t_ar;
|
||||
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
// Skip the inner loop when count is negative; the load in the delay slot
// is executed either way.
"bltz %[count], 2f \n\t"
|
||||
" lh %[t_ar], 0(%[tmp]) \n\t"
|
||||
// Inner loop
|
||||
"1: \n\t"
|
||||
// Byte offset of element `count` in the int16_t arrays.
"sll %[offset], %[count], 1 \n\t"
|
||||
#if defined(MIPS_DSP_R1_LE)
|
||||
"lhx %[r0], %[offset](%[cth_Q15]) \n\t"
|
||||
"lhx %[r1], %[offset](%[sth_Q15]) \n\t"
|
||||
"lhx %[r2], %[offset](%[ar_g_Q0]) \n\t"
|
||||
#else
|
||||
"addu %[tmp_cth], %[cth_Q15], %[offset] \n\t"
|
||||
"addu %[tmp_sth], %[sth_Q15], %[offset] \n\t"
|
||||
"addu %[tmp_arg], %[ar_g_Q0], %[offset] \n\t"
|
||||
"lh %[r0], 0(%[tmp_cth]) \n\t"
|
||||
"lh %[r1], 0(%[tmp_sth]) \n\t"
|
||||
"lh %[r2], 0(%[tmp_arg]) \n\t"
|
||||
#endif
|
||||
"mul %[t0], %[r0], %[t_ar] \n\t"
|
||||
"mul %[t1], %[r1], %[t_ar] \n\t"
|
||||
"mul %[t2], %[r1], %[r2] \n\t"
|
||||
"mul %[r0], %[r0], %[r2] \n\t"
|
||||
"subu %[t0], %[t0], %[t2] \n\t"
|
||||
"addu %[t1], %[t1], %[r0] \n\t"
|
||||
// Rounding right shift by 15.
#if defined(MIPS_DSP_R1_LE)
|
||||
"shra_r.w %[t1], %[t1], 15 \n\t"
|
||||
"shra_r.w %[t0], %[t0], 15 \n\t"
|
||||
#else
|
||||
"addiu %[t1], %[t1], 0x4000 \n\t"
|
||||
"sra %[t1], %[t1], 15 \n\t"
|
||||
"addiu %[t0], %[t0], 0x4000 \n\t"
|
||||
"sra %[t0], %[t0], 15 \n\t"
|
||||
#endif
|
||||
// offset now addresses element count + 1 (turned into a pointer further down).
"addiu %[offset], %[offset], 2 \n\t"
|
||||
// Saturate both results to the 16-bit range (upper bound first).
#if defined(MIPS_DSP_R1_LE)
|
||||
"shll_s.w %[t1], %[t1], 16 \n\t"
|
||||
"shll_s.w %[t_ar], %[t0], 16 \n\t"
|
||||
#else
|
||||
"slt %[r0], %[t1], %[max_q16] \n\t"
|
||||
"slt %[r1], %[t0], %[max_q16] \n\t"
|
||||
"movz %[t1], %[max_q16], %[r0] \n\t"
|
||||
"movz %[t0], %[max_q16], %[r1] \n\t"
|
||||
#endif
|
||||
"addu %[offset], %[offset], %[ar_g_Q0] \n\t"
|
||||
#if defined(MIPS_DSP_R1_LE)
|
||||
"sra %[t1], %[t1], 16 \n\t"
|
||||
"sra %[t_ar], %[t_ar], 16 \n\t"
|
||||
#else
|
||||
"slt %[r0], %[t1], %[min_q16] \n\t"
|
||||
"slt %[r1], %[t0], %[min_q16] \n\t"
|
||||
"movn %[t1], %[min_q16], %[r0] \n\t"
|
||||
"movn %[t0], %[min_q16], %[r1] \n\t"
|
||||
"addu %[t_ar], $zero, %[t0] \n\t"
|
||||
#endif
|
||||
// Store ar_g_Q0[count + 1].
"sh %[t1], 0(%[offset]) \n\t"
|
||||
// Loop while count (tested before the delay-slot decrement) is positive.
"bgtz %[count], 1b \n\t"
|
||||
" addiu %[count], %[count], -1 \n\t"
|
||||
"2: \n\t"
|
||||
"sh %[t_ar], 0(%[tmp]) \n\t"
|
||||
"sh %[t_ar], 0(%[ar_g_Q0]) \n\t"
|
||||
".set pop \n\t"
|
||||
: [t_ar] "=&r" (t_ar), [count] "+r" (count), [offset] "=&r" (offset),
|
||||
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [t0] "=&r" (t0),
|
||||
#if !defined(MIPS_DSP_R1_LE)
|
||||
[tmp_cth] "=&r" (tmp_cth), [tmp_sth] "=&r" (tmp_sth),
|
||||
[tmp_arg] "=&r" (tmp_arg),
|
||||
#endif
|
||||
[t1] "=&r" (t1), [t2] "=&r" (t2)
|
||||
: [tmp] "r" (&ar_f_Q0[n+1]), [cth_Q15] "r" (cth_Q15),
|
||||
#if !defined(MIPS_DSP_R1_LE)
|
||||
[max_q16] "r" (max_q16), [min_q16] "r" (min_q16),
|
||||
#endif
|
||||
[sth_Q15] "r" (sth_Q15), [ar_g_Q0] "r" (ar_g_Q0)
|
||||
: "memory", "hi", "lo"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// MIPS optimization of the inner loop used for function
|
||||
// WebRtcIsacfix_NormLatticeFilterMa(). It does:
|
||||
//
|
||||
// for 0 <= n < HALF_SUBFRAMELEN - 1:
|
||||
// *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
|
||||
// *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
|
||||
//
|
||||
// Note, function WebRtcIsacfix_FilterMaLoopMIPS and WebRtcIsacfix_FilterMaLoopC
|
||||
// are not bit-exact. The accuracy of the MIPS function is same or better.
|
||||
void WebRtcIsacfix_FilterMaLoopMIPS(int16_t input0, // Filter coefficient
|
||||
int16_t input1, // Filter coefficient
|
||||
int32_t input2, // Inverse coeff (1/input1)
|
||||
int32_t* ptr0, // Sample buffer
|
||||
int32_t* ptr1, // Sample buffer
|
||||
int32_t* ptr2) { // Sample buffer
|
||||
#if defined(MIPS_DSP_R2_LE)
|
||||
// MIPS DSPR2 version. 4 available accumulators allows loop unrolling 4 times.
|
||||
// This variant is not bit-exact with WebRtcIsacfix_FilterMaLoopC, since we
|
||||
// are exploiting 64-bit accumulators. The accuracy of the MIPS DSPR2 function
|
||||
// is same or better.
|
||||
int n = (HALF_SUBFRAMELEN - 1) >> 2;
|
||||
int m = (HALF_SUBFRAMELEN - 1) & 3;
|
||||
|
||||
int r0, r1, r2, r3;
|
||||
int t0, t1, t2, t3;
|
||||
int s0, s1, s2, s3;
|
||||
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"1: \n\t"
|
||||
"lw %[r0], 0(%[ptr0]) \n\t"
|
||||
"lw %[r1], 4(%[ptr0]) \n\t"
|
||||
"lw %[r2], 8(%[ptr0]) \n\t"
|
||||
"lw %[r3], 12(%[ptr0]) \n\t"
|
||||
"mult $ac0, %[r0], %[input0] \n\t"
|
||||
"mult $ac1, %[r1], %[input0] \n\t"
|
||||
"mult $ac2, %[r2], %[input0] \n\t"
|
||||
"mult $ac3, %[r3], %[input0] \n\t"
|
||||
"lw %[t0], 0(%[ptr2]) \n\t"
|
||||
"extr_rs.w %[s0], $ac0, 15 \n\t"
|
||||
"extr_rs.w %[s1], $ac1, 15 \n\t"
|
||||
"extr_rs.w %[s2], $ac2, 15 \n\t"
|
||||
"extr_rs.w %[s3], $ac3, 15 \n\t"
|
||||
"lw %[t1], 4(%[ptr2]) \n\t"
|
||||
"lw %[t2], 8(%[ptr2]) \n\t"
|
||||
"lw %[t3], 12(%[ptr2]) \n\t"
|
||||
"addu %[t0], %[t0], %[s0] \n\t"
|
||||
"addu %[t1], %[t1], %[s1] \n\t"
|
||||
"addu %[t2], %[t2], %[s2] \n\t"
|
||||
"addu %[t3], %[t3], %[s3] \n\t"
|
||||
"mult $ac0, %[t0], %[input2] \n\t"
|
||||
"mult $ac1, %[t1], %[input2] \n\t"
|
||||
"mult $ac2, %[t2], %[input2] \n\t"
|
||||
"mult $ac3, %[t3], %[input2] \n\t"
|
||||
"addiu %[ptr0], %[ptr0], 16 \n\t"
|
||||
"extr_rs.w %[t0], $ac0, 16 \n\t"
|
||||
"extr_rs.w %[t1], $ac1, 16 \n\t"
|
||||
"extr_rs.w %[t2], $ac2, 16 \n\t"
|
||||
"extr_rs.w %[t3], $ac3, 16 \n\t"
|
||||
"addiu %[n], %[n], -1 \n\t"
|
||||
"mult $ac0, %[r0], %[input1] \n\t"
|
||||
"mult $ac1, %[r1], %[input1] \n\t"
|
||||
"mult $ac2, %[r2], %[input1] \n\t"
|
||||
"mult $ac3, %[r3], %[input1] \n\t"
|
||||
"sw %[t0], 0(%[ptr2]) \n\t"
|
||||
"extr_rs.w %[s0], $ac0, 15 \n\t"
|
||||
"extr_rs.w %[s1], $ac1, 15 \n\t"
|
||||
"extr_rs.w %[s2], $ac2, 15 \n\t"
|
||||
"extr_rs.w %[s3], $ac3, 15 \n\t"
|
||||
"sw %[t1], 4(%[ptr2]) \n\t"
|
||||
"sw %[t2], 8(%[ptr2]) \n\t"
|
||||
"sw %[t3], 12(%[ptr2]) \n\t"
|
||||
"mult $ac0, %[t0], %[input0] \n\t"
|
||||
"mult $ac1, %[t1], %[input0] \n\t"
|
||||
"mult $ac2, %[t2], %[input0] \n\t"
|
||||
"mult $ac3, %[t3], %[input0] \n\t"
|
||||
"addiu %[ptr2], %[ptr2], 16 \n\t"
|
||||
"extr_rs.w %[t0], $ac0, 15 \n\t"
|
||||
"extr_rs.w %[t1], $ac1, 15 \n\t"
|
||||
"extr_rs.w %[t2], $ac2, 15 \n\t"
|
||||
"extr_rs.w %[t3], $ac3, 15 \n\t"
|
||||
"addu %[t0], %[t0], %[s0] \n\t"
|
||||
"addu %[t1], %[t1], %[s1] \n\t"
|
||||
"addu %[t2], %[t2], %[s2] \n\t"
|
||||
"addu %[t3], %[t3], %[s3] \n\t"
|
||||
"sw %[t0], 0(%[ptr1]) \n\t"
|
||||
"sw %[t1], 4(%[ptr1]) \n\t"
|
||||
"sw %[t2], 8(%[ptr1]) \n\t"
|
||||
"sw %[t3], 12(%[ptr1]) \n\t"
|
||||
"bgtz %[n], 1b \n\t"
|
||||
" addiu %[ptr1], %[ptr1], 16 \n\t"
|
||||
"beq %[m], %0, 3f \n\t"
|
||||
" nop \n\t"
|
||||
"2: \n\t"
|
||||
"lw %[r0], 0(%[ptr0]) \n\t"
|
||||
"lw %[t0], 0(%[ptr2]) \n\t"
|
||||
"addiu %[ptr0], %[ptr0], 4 \n\t"
|
||||
"mult $ac0, %[r0], %[input0] \n\t"
|
||||
"mult $ac1, %[r0], %[input1] \n\t"
|
||||
"extr_rs.w %[r1], $ac0, 15 \n\t"
|
||||
"extr_rs.w %[t1], $ac1, 15 \n\t"
|
||||
"addu %[t0], %[t0], %[r1] \n\t"
|
||||
"mult $ac0, %[t0], %[input2] \n\t"
|
||||
"extr_rs.w %[t0], $ac0, 16 \n\t"
|
||||
"sw %[t0], 0(%[ptr2]) \n\t"
|
||||
"mult $ac0, %[t0], %[input0] \n\t"
|
||||
"addiu %[ptr2], %[ptr2], 4 \n\t"
|
||||
"addiu %[m], %[m], -1 \n\t"
|
||||
"extr_rs.w %[t0], $ac0, 15 \n\t"
|
||||
"addu %[t0], %[t0], %[t1] \n\t"
|
||||
"sw %[t0], 0(%[ptr1]) \n\t"
|
||||
"bgtz %[m], 2b \n\t"
|
||||
" addiu %[ptr1], %[ptr1], 4 \n\t"
|
||||
"3: \n\t"
|
||||
".set pop \n\t"
|
||||
: [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2),
|
||||
[r3] "=&r" (r3), [t0] "=&r" (t0), [t1] "=&r" (t1),
|
||||
[t2] "=&r" (t2), [t3] "=&r" (t3), [s0] "=&r" (s0),
|
||||
[s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3),
|
||||
[ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [m] "+r" (m),
|
||||
[ptr2] "+r" (ptr2), [n] "+r" (n)
|
||||
: [input0] "r" (input0), [input1] "r" (input1),
|
||||
[input2] "r" (input2)
|
||||
: "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi",
|
||||
"$ac2lo", "$ac3hi", "$ac3lo"
|
||||
);
|
||||
#else
|
||||
// Non-DSPR2 version of the function. Avoiding the accumulator usage due to
|
||||
// large latencies. This variant is bit-exact with C code.
|
||||
int n = HALF_SUBFRAMELEN - 1;
|
||||
int32_t t16a, t16b;
|
||||
int32_t r0, r1, r2, r3, r4;
|
||||
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"sra %[t16a], %[input2], 16 \n\t"
|
||||
"andi %[t16b], %[input2], 0xFFFF \n\t"
|
||||
#if defined(MIPS32R2_LE)
|
||||
"seh %[t16b], %[t16b] \n\t"
|
||||
"seh %[input0], %[input0] \n\t"
|
||||
"seh %[input1], %[input1] \n\t"
|
||||
#else
|
||||
"sll %[t16b], %[t16b], 16 \n\t"
|
||||
"sra %[t16b], %[t16b], 16 \n\t"
|
||||
"sll %[input0], %[input0], 16 \n\t"
|
||||
"sra %[input0], %[input0], 16 \n\t"
|
||||
"sll %[input1], %[input1], 16 \n\t"
|
||||
"sra %[input1], %[input1], 16 \n\t"
|
||||
#endif
|
||||
"addiu %[r0], %[t16a], 1 \n\t"
|
||||
"slt %[r1], %[t16b], $zero \n\t"
|
||||
"movn %[t16a], %[r0], %[r1] \n\t"
|
||||
"1: \n\t"
|
||||
"lw %[r0], 0(%[ptr0]) \n\t"
|
||||
"lw %[r1], 0(%[ptr2]) \n\t"
|
||||
"addiu %[ptr0], %[ptr0], 4 \n\t"
|
||||
"sra %[r2], %[r0], 16 \n\t"
|
||||
"andi %[r0], %[r0], 0xFFFF \n\t"
|
||||
"mul %[r3], %[r2], %[input0] \n\t"
|
||||
"mul %[r4], %[r0], %[input0] \n\t"
|
||||
"mul %[r2], %[r2], %[input1] \n\t"
|
||||
"mul %[r0], %[r0], %[input1] \n\t"
|
||||
"addiu %[ptr2], %[ptr2], 4 \n\t"
|
||||
"sll %[r3], %[r3], 1 \n\t"
|
||||
"sra %[r4], %[r4], 1 \n\t"
|
||||
"addiu %[r4], %[r4], 0x2000 \n\t"
|
||||
"sra %[r4], %[r4], 14 \n\t"
|
||||
"addu %[r3], %[r3], %[r4] \n\t"
|
||||
"addu %[r1], %[r1], %[r3] \n\t"
|
||||
"sra %[r3], %[r1], 16 \n\t"
|
||||
"andi %[r4], %[r1], 0xFFFF \n\t"
|
||||
"sra %[r4], %[r4], 1 \n\t"
|
||||
"mul %[r1], %[r1], %[t16a] \n\t"
|
||||
"mul %[r3], %[r3], %[t16b] \n\t"
|
||||
"mul %[r4], %[r4], %[t16b] \n\t"
|
||||
"sll %[r2], %[r2], 1 \n\t"
|
||||
"sra %[r0], %[r0], 1 \n\t"
|
||||
"addiu %[r0], %[r0], 0x2000 \n\t"
|
||||
"sra %[r0], %[r0], 14 \n\t"
|
||||
"addu %[r0], %[r0], %[r2] \n\t"
|
||||
"addiu %[n], %[n], -1 \n\t"
|
||||
"addu %[r1], %[r1], %[r3] \n\t"
|
||||
"addiu %[r4], %[r4], 0x4000 \n\t"
|
||||
"sra %[r4], %[r4], 15 \n\t"
|
||||
"addu %[r1], %[r1], %[r4] \n\t"
|
||||
"sra %[r2], %[r1], 16 \n\t"
|
||||
"andi %[r3], %[r1], 0xFFFF \n\t"
|
||||
"mul %[r3], %[r3], %[input0] \n\t"
|
||||
"mul %[r2], %[r2], %[input0] \n\t"
|
||||
"sw %[r1], -4(%[ptr2]) \n\t"
|
||||
"sra %[r3], %[r3], 1 \n\t"
|
||||
"addiu %[r3], %[r3], 0x2000 \n\t"
|
||||
"sra %[r3], %[r3], 14 \n\t"
|
||||
"addu %[r0], %[r0], %[r3] \n\t"
|
||||
"sll %[r2], %[r2], 1 \n\t"
|
||||
"addu %[r0], %[r0], %[r2] \n\t"
|
||||
"sw %[r0], 0(%[ptr1]) \n\t"
|
||||
"bgtz %[n], 1b \n\t"
|
||||
" addiu %[ptr1], %[ptr1], 4 \n\t"
|
||||
".set pop \n\t"
|
||||
: [t16a] "=&r" (t16a), [t16b] "=&r" (t16b), [r0] "=&r" (r0),
|
||||
[r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
|
||||
[r4] "=&r" (r4), [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1),
|
||||
[ptr2] "+r" (ptr2), [n] "+r" (n)
|
||||
: [input0] "r" (input0), [input1] "r" (input1),
|
||||
[input2] "r" (input2)
|
||||
: "hi", "lo", "memory"
|
||||
);
|
||||
#endif
|
||||
}
|
@ -0,0 +1,133 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_estimator.h"
|
||||
|
||||
// MIPS inline-assembly implementation of the pitch filter inner loop (the
// commit description lists this routine as "only MIPS DSPR2"; it uses the DSP
// ASE instructions dpa.w.ph, extr_s.h, shra_r.w, shll_s.w and the extra
// accumulator $ac1).
//
// Processes |loopNumber| samples. For each sample it:
//   1. Computes a 9-tap dot product between nine consecutive int16_t values
//      read from |outputBuf2| and the nine values in |coefficient|, starting
//      from a bias of 8192, then extracts the result with a right shift of 14
//      (extr_s.h, i.e. as a saturated halfword).
//   2. Multiplies that value by |gain|, shifts right by 12 with rounding, moves
//      inputState[0..3] to inputState[1..4] and stores the low 16 bits of the
//      product in inputState[0].
//   3. Computes a 5-tap dot product of the updated inputState[0..4] with the
//      damp factors packed in kDampF0..2, starting from a bias of 16384, and
//      extracts it with a right shift of 15 (extr_s.h).
//   4. Multiplies the result by |sign|, subtracts it from the current input
//      sample, saturates to 16 bits and writes it to |outputBuf|; then adds
//      the input sample to that stored value, saturates again and writes the
//      sum to |outputBuf2| at offset PITCH_BUFFSIZE + position.
//
// Parameters:
//   loopNumber  - number of samples (loop iterations) to process.
//   gain        - 16-bit gain; sign-extended to 32 bits before use.
//   index       - selects the first filter tap in |outputBuf2|:
//                 element PITCH_BUFFSIZE - (index + 2), plus *index2.
//   sign        - 16-bit multiplier applied to the low-pass output;
//                 sign-extended to 32 bits before use.
//   inputState  - low-pass filter state; elements [0..3] are read and
//                 elements [0..4] are written on every iteration.
//   outputBuf2  - read for the 9-tap filter and written at
//                 outputBuf2[PITCH_BUFFSIZE + *index2 + i].
//   coefficient - nine int16_t filter coefficients (bytes 0..17 are read).
//   inputBuf    - input samples, read from inputBuf[*index2 + i].
//   outputBuf   - output samples, written to outputBuf[*index2 + i].
//   index2      - in/out sample position; incremented by |loopNumber| on exit.
//
// NOTE(review): the lwl/lwr (offset 3 / offset 0) pairs are the little-endian
// form of an unaligned word load, so the "low half = lower address" reading
// used in the comments below assumes a little-endian target -- confirm.
void WebRtcIsacfix_PitchFilterCore(int loopNumber,
|
||||
int16_t gain,
|
||||
int index,
|
||||
int16_t sign,
|
||||
int16_t* inputState,
|
||||
int16_t* outputBuf2,
|
||||
const int16_t* coefficient,
|
||||
int16_t* inputBuf,
|
||||
int16_t* outputBuf,
|
||||
int* index2) {
|
||||
// Starting sample position, shared by all buffer pointers below.
int ind2t = *index2;
|
||||
int i = 0;
|
||||
// Read pointer for the 9-tap filter; advanced by one int16_t per iteration.
int16_t* out2_pos2 = &outputBuf2[PITCH_BUFFSIZE - (index + 2)] + ind2t;
|
||||
// w1..w5 are scratch registers for the loop; gain32/sign32 hold the
// sign-extended |gain| and |sign|.
int32_t w1, w2, w3, w4, w5, gain32, sign32;
|
||||
// coef1..coef4 each hold two packed int16_t coefficients, coef5 holds the
// ninth one. Only coef5 has an initializer; all five are written by the first
// asm block before they are used.
int32_t coef1, coef2, coef3, coef4, coef5 = 0;
|
||||
// Define damp factors as int32_t (pair of int16_t)
// As 16-bit halves: kDampF0 = {0x0000, 0xF70A}, kDampF1 = {0x51EC, 0x2000},
// kDampF2 = {0xF70A, 0x2000} (high, low). 0xF70A is -2294 as int16_t,
// 0x51EC is 20972 and 0x2000 is 8192.
// NOTE(review): 0xF70A2000 does not fit in a signed 32-bit int, so the
// initializer relies on the implementation's unsigned-to-signed conversion.
|
||||
int32_t kDampF0 = 0x0000F70A;
|
||||
int32_t kDampF1 = 0x51EC2000;
|
||||
int32_t kDampF2 = 0xF70A2000;
|
||||
// Per-sample cursors into the input and the two output buffers.
int16_t* input1 = inputBuf + ind2t;
|
||||
int16_t* output1 = outputBuf + ind2t;
|
||||
int16_t* output2 = outputBuf2 + ind2t + PITCH_BUFFSIZE;
|
||||
|
||||
// Load coefficients outside the loop and sign-extend gain and sign
// Each lwl/lwr pair performs one unaligned 32-bit load (two coefficients);
// lhu loads the ninth coefficient zero-extended, so the upper half of coef5
// is 0.
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
"lwl %[coef1], 3(%[coefficient]) \n\t"
|
||||
"lwl %[coef2], 7(%[coefficient]) \n\t"
|
||||
"lwl %[coef3], 11(%[coefficient]) \n\t"
|
||||
"lwl %[coef4], 15(%[coefficient]) \n\t"
|
||||
"lwr %[coef1], 0(%[coefficient]) \n\t"
|
||||
"lwr %[coef2], 4(%[coefficient]) \n\t"
|
||||
"lwr %[coef3], 8(%[coefficient]) \n\t"
|
||||
"lwr %[coef4], 12(%[coefficient]) \n\t"
|
||||
"lhu %[coef5], 16(%[coefficient]) \n\t"
|
||||
"seh %[gain32], %[gain] \n\t"
|
||||
"seh %[sign32], %[sign] \n\t"
|
||||
".set pop \n\t"
|
||||
: [coef1] "=&r" (coef1), [coef2] "=&r" (coef2), [coef3] "=&r" (coef3),
|
||||
[coef4] "=&r" (coef4), [coef5] "=&r" (coef5), [gain32] "=&r" (gain32),
|
||||
[sign32] "=&r" (sign32)
|
||||
: [coefficient] "r" (coefficient), [gain] "r" (gain),
|
||||
[sign] "r" (sign)
|
||||
: "memory"
|
||||
);
|
||||
|
||||
// One asm block per output sample; pointer operands are "+r" so their
// increments carry over to the next iteration.
for (i = 0; i < loopNumber; i++) {
|
||||
__asm __volatile (
|
||||
".set push \n\t"
|
||||
".set noreorder \n\t"
|
||||
// Filter to get fractional pitch
// Preload hi/lo ($ac0) with 8192 (= 1 << 13), the rounding term for the
// later right shift by 14.
|
||||
"li %[w1], 8192 \n\t"
|
||||
"mtlo %[w1] \n\t"
|
||||
"mthi $0 \n\t"
|
||||
// Unaligned loads of eight samples (two per register) plus a ninth,
// zero-extended, sample in w5.
"lwl %[w1], 3(%[out2_pos2]) \n\t"
|
||||
"lwl %[w2], 7(%[out2_pos2]) \n\t"
|
||||
"lwl %[w3], 11(%[out2_pos2]) \n\t"
|
||||
"lwl %[w4], 15(%[out2_pos2]) \n\t"
|
||||
"lwr %[w1], 0(%[out2_pos2]) \n\t"
|
||||
"lwr %[w2], 4(%[out2_pos2]) \n\t"
|
||||
"lwr %[w3], 8(%[out2_pos2]) \n\t"
|
||||
"lwr %[w4], 12(%[out2_pos2]) \n\t"
|
||||
"lhu %[w5], 16(%[out2_pos2]) \n\t"
|
||||
// Accumulate the nine sample * coefficient products into $ac0, two
// halfword products per instruction.
"dpa.w.ph $ac0, %[w1], %[coef1] \n\t"
|
||||
"dpa.w.ph $ac0, %[w2], %[coef2] \n\t"
|
||||
"dpa.w.ph $ac0, %[w3], %[coef3] \n\t"
|
||||
"dpa.w.ph $ac0, %[w4], %[coef4] \n\t"
|
||||
"dpa.w.ph $ac0, %[w5], %[coef5] \n\t"
|
||||
"addiu %[out2_pos2], %[out2_pos2], 2 \n\t"
|
||||
// Start setting up $ac1 for the low-pass sum and begin loading the current
// state inputState[0..3] into w2 (elements 0,1) and w3 (elements 2,3).
// These instructions are interleaved with the $ac0 extraction below.
// NOTE(review): presumably a second accumulator is used so the `mul`
// instructions below cannot disturb the running low-pass sum -- confirm
// against the MIPS32 description of MUL and HI/LO.
"mthi $0, $ac1 \n\t"
|
||||
"lwl %[w2], 3(%[inputState]) \n\t"
|
||||
"lwl %[w3], 7(%[inputState]) \n\t"
|
||||
// Fractional pitch shift & saturation
|
||||
"extr_s.h %[w1], $ac0, 14 \n\t"
|
||||
// 16384 (= 1 << 14) is the rounding term for the right shift by 15 applied
// to $ac1 further down.
"li %[w4], 16384 \n\t"
|
||||
"lwr %[w2], 0(%[inputState]) \n\t"
|
||||
"lwr %[w3], 4(%[inputState]) \n\t"
|
||||
"mtlo %[w4], $ac1 \n\t"
|
||||
// Shift low pass filter state
// The swl/swr pairs write w2 to bytes 2..5 and w3 to bytes 6..9, i.e. the
// old inputState[0..3] lands in inputState[1..4].
|
||||
"swl %[w2], 5(%[inputState]) \n\t"
|
||||
"swl %[w3], 9(%[inputState]) \n\t"
|
||||
// Apply the gain to the fractional-pitch value (rounded shift by 12
// follows a few instructions later).
"mul %[w1], %[gain32], %[w1] \n\t"
|
||||
"swr %[w2], 2(%[inputState]) \n\t"
|
||||
"swr %[w3], 6(%[inputState]) \n\t"
|
||||
// Low pass filter accumulation
// w2/w3 still hold the pre-shift state values, which now sit in
// inputState[1..4]; they are weighted by the halves of kDampF1 and kDampF2.
|
||||
"dpa.w.ph $ac1, %[kDampF1], %[w2] \n\t"
|
||||
"dpa.w.ph $ac1, %[kDampF2], %[w3] \n\t"
|
||||
// Fetch the current input sample (sign-extended) and advance the cursor.
"lh %[w4], 0(%[input1]) \n\t"
|
||||
"addiu %[input1], %[input1], 2 \n\t"
|
||||
// Rounded right shift by 12 of gain * pitch value; sh stores only the low
// 16 bits as the new inputState[0].
"shra_r.w %[w1], %[w1], 12 \n\t"
|
||||
"sh %[w1], 0(%[inputState]) \n\t"
|
||||
// The upper half of kDampF0 is 0, so only the low halfword of w1 (the value
// just stored in inputState[0]) contributes to the sum.
"dpa.w.ph $ac1, %[kDampF0], %[w1] \n\t"
|
||||
// Low pass filter shift & saturation
|
||||
"extr_s.h %[w2], $ac1, 15 \n\t"
|
||||
"mul %[w2], %[w2], %[sign32] \n\t"
|
||||
// Buffer update
// outputBuf sample = saturate16(input - sign * lowpass): shll_s.w by 16
// saturates, and the arithmetic shift right by 16 brings the value back.
|
||||
"subu %[w2], %[w4], %[w2] \n\t"
|
||||
"shll_s.w %[w2], %[w2], 16 \n\t"
|
||||
"sra %[w2], %[w2], 16 \n\t"
|
||||
"sh %[w2], 0(%[output1]) \n\t"
|
||||
// outputBuf2 sample = saturate16(input + the value just written to
// outputBuf), using the same shift-left-saturate / shift-right sequence.
"addu %[w2], %[w2], %[w4] \n\t"
|
||||
"shll_s.w %[w2], %[w2], 16 \n\t"
|
||||
"addiu %[output1], %[output1], 2 \n\t"
|
||||
"sra %[w2], %[w2], 16 \n\t"
|
||||
"sh %[w2], 0(%[output2]) \n\t"
|
||||
"addiu %[output2], %[output2], 2 \n\t"
|
||||
".set pop \n\t"
|
||||
: [w1] "=&r" (w1), [w2] "=&r" (w2), [w3] "=&r" (w3), [w4] "=&r" (w4),
|
||||
[w5] "=&r" (w5), [input1] "+r" (input1), [out2_pos2] "+r" (out2_pos2),
|
||||
[output1] "+r" (output1), [output2] "+r" (output2)
|
||||
// NOTE(review): [coefficient] is listed as an input but is not referenced by
// any instruction in this asm block.
: [coefficient] "r" (coefficient), [inputState] "r" (inputState),
|
||||
[gain32] "r" (gain32), [sign32] "r" (sign32), [kDampF0] "r" (kDampF0),
|
||||
[kDampF1] "r" (kDampF1), [kDampF2] "r" (kDampF2),
|
||||
[coef1] "r" (coef1), [coef2] "r" (coef2), [coef3] "r" (coef3),
|
||||
[coef4] "r" (coef4), [coef5] "r" (coef5)
|
||||
// hi/lo ($ac0) and $ac1 are both written above; "memory" covers the stores
// made through inputState, output1 and output2.
: "hi", "lo", "$ac1hi", "$ac1lo", "memory"
|
||||
);
|
||||
}
|
||||
// Report the new sample position back to the caller.
(*index2) += loopNumber;
|
||||
}
|
Loading…
Reference in New Issue
Block a user