MIPS optimizations for ISAC (patch #1)

Implemented functions:
    - WebRtcIsacfix_AutocorrMIPS
    - WebRtcIsacfix_FilterArLoop
    - WebRtcIsacfix_FilterMaLoopMIPS
    - WebRtcIsacfix_AllpassFilter2FixDec16MIPS (only MIPS DSP)
    - WebRtcIsacfix_PitchFilterCore (only MIPS DSPR2)

Gain achieved: from approx. 15% (MIPS32) up to approx. 40% (MIPS DSPR2)

R=andrew@webrtc.org, tina.legrand@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/17559005

Patch from Ljubomir Papuga <lpapuga@mips.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@6387 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
andrew@webrtc.org 2014-06-10 18:13:15 +00:00
parent 0d7ab0a634
commit 919914d71b
8 changed files with 999 additions and 1 deletions

View File

@ -179,6 +179,21 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
int32_t* ptr2);
#endif
// MIPS implementations; declared only when MIPS32_LE is defined.
#if defined(MIPS32_LE)
// Autocorrelation of the N samples in x for lags 0..order.
// Writes order + 1 values to r, stores the right shift that was applied to
// them in *scale, and returns order + 1.
int WebRtcIsacfix_AutocorrMIPS(int32_t* __restrict r,
const int16_t* __restrict x,
int16_t N,
int16_t order,
int16_t* __restrict scale);
// Inner loop of the MA lattice filter: reads the ptr0 sample buffer and
// updates the ptr2 and ptr1 sample buffers using the coefficients input0,
// input1 and input2.
void WebRtcIsacfix_FilterMaLoopMIPS(int16_t input0,
int16_t input1,
int32_t input2,
int32_t* ptr0,
int32_t* ptr1,
int32_t* ptr2);
#endif
/* Function pointers associated with the above functions. */
typedef int (*AutocorrFix)(int32_t* __restrict r,

View File

@ -58,6 +58,17 @@ void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
int32_t *filter_state_ch2);
#endif
// MIPS DSP implementation; declared only when MIPS_DSP_R1_LE is defined.
#if defined(MIPS_DSP_R1_LE)
// In-place two-channel all-pass filter.
// data_ch1 / data_ch2 hold `length` samples each and are overwritten with the
// filtered output. factor_chN supplies two scaling factors per channel and
// filter_state_chN two 32-bit state words per channel, which are updated.
void WebRtcIsacfix_AllpassFilter2FixDec16MIPS(
int16_t *data_ch1,
int16_t *data_ch2,
const int16_t *factor_ch1,
const int16_t *factor_ch2,
const int length,
int32_t *filter_state_ch1,
int32_t *filter_state_ch2);
#endif
#if defined(__cplusplus) || defined(c_plusplus)
}
#endif

View File

@ -0,0 +1,102 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/filterbank_internal.h"
// WebRtcIsacfix_AllpassFilter2FixDec16 function optimized for MIPSDSP platform
// Bit-exact with WebRtcIsacfix_AllpassFilter2FixDec16C from filterbanks.c
//
// Filters both channels in place through two cascaded all-pass sections.
// For every sample `in` of a channel, with factors f0, f1 and states st0, st1
// (all additions/subtractions saturate to 32 bits):
//   a   = (2 * in * f0 + st0) >> 16
//   st0 = (in << 16) - 2 * f0 * a
//   out = (2 * a * f1 + st1) >> 16
//   st1 = (a << 16) - 2 * f1 * out
// The two states of each channel are read from and written back to
// filter_state_chN[0] and filter_state_chN[1].
void WebRtcIsacfix_AllpassFilter2FixDec16MIPS(
    int16_t *data_ch1,            // Input and output in channel 1, in Q0
    int16_t *data_ch2,            // Input and output in channel 2, in Q0
    const int16_t *factor_ch1,    // Scaling factor for channel 1, in Q15
    const int16_t *factor_ch2,    // Scaling factor for channel 2, in Q15
    const int length,             // Length of the data buffers
    int32_t *filter_state_ch1,    // Filter state for channel 1, in Q16
    int32_t *filter_state_ch2) {  // Filter state for channel 2, in Q16
  int32_t st0_ch1, st1_ch1;                // channel1 state variables
  int32_t st0_ch2, st1_ch2;                // channel2 state variables
  int32_t f_ch10, f_ch11, f_ch20, f_ch21;  // factor variables
  int32_t r0, r1, r2, r3, r4, r5;          // temporary register variables
  // The assembly counts this copy down; |length| itself is const and must not
  // be handed to the asm as a register that gets modified.
  int remaining = length;

  // The loop below tests its counter only after the body has run, so without
  // this guard an empty request would still filter (and overwrite) one sample.
  if (length <= 0) {
    return;
  }

  __asm __volatile (
    ".set push \n\t"
    ".set noreorder \n\t"
    // Load all the state and factor variables
    "lh %[f_ch10], 0(%[factor_ch1]) \n\t"
    "lh %[f_ch20], 0(%[factor_ch2]) \n\t"
    "lh %[f_ch11], 2(%[factor_ch1]) \n\t"
    "lh %[f_ch21], 2(%[factor_ch2]) \n\t"
    "lw %[st0_ch1], 0(%[filter_state_ch1]) \n\t"
    "lw %[st1_ch1], 4(%[filter_state_ch1]) \n\t"
    "lw %[st0_ch2], 0(%[filter_state_ch2]) \n\t"
    "lw %[st1_ch2], 4(%[filter_state_ch2]) \n\t"
    // Allpass filtering loop
    "1: \n\t"
    "lh %[r0], 0(%[data_ch1]) \n\t"
    "lh %[r1], 0(%[data_ch2]) \n\t"
    "addiu %[length], %[length], -1 \n\t"
    // First section: a = (2 * in * f0 + st0) >> 16
    "mul %[r2], %[r0], %[f_ch10] \n\t"
    "mul %[r3], %[r1], %[f_ch20] \n\t"
    "sll %[r0], %[r0], 16 \n\t"
    "sll %[r1], %[r1], 16 \n\t"
    "sll %[r2], %[r2], 1 \n\t"
    "addq_s.w %[r2], %[r2], %[st0_ch1] \n\t"
    "sll %[r3], %[r3], 1 \n\t"
    "addq_s.w %[r3], %[r3], %[st0_ch2] \n\t"
    "sra %[r2], %[r2], 16 \n\t"
    "mul %[st0_ch1], %[f_ch10], %[r2] \n\t"
    "sra %[r3], %[r3], 16 \n\t"
    "mul %[st0_ch2], %[f_ch20], %[r3] \n\t"
    "mul %[r4], %[r2], %[f_ch11] \n\t"
    "mul %[r5], %[r3], %[f_ch21] \n\t"
    // st0 = (in << 16) - 2 * f0 * a
    "sll %[st0_ch1], %[st0_ch1], 1 \n\t"
    "subq_s.w %[st0_ch1], %[r0], %[st0_ch1] \n\t"
    "sll %[st0_ch2], %[st0_ch2], 1 \n\t"
    "subq_s.w %[st0_ch2], %[r1], %[st0_ch2] \n\t"
    // Second section: out = (2 * a * f1 + st1) >> 16
    "sll %[r4], %[r4], 1 \n\t"
    "addq_s.w %[r4], %[r4], %[st1_ch1] \n\t"
    "sll %[r5], %[r5], 1 \n\t"
    "addq_s.w %[r5], %[r5], %[st1_ch2] \n\t"
    "sra %[r4], %[r4], 16 \n\t"
    "mul %[r0], %[r4], %[f_ch11] \n\t"
    "sra %[r5], %[r5], 16 \n\t"
    "mul %[r1], %[r5], %[f_ch21] \n\t"
    "sh %[r4], 0(%[data_ch1]) \n\t"
    "sh %[r5], 0(%[data_ch2]) \n\t"
    "addiu %[data_ch1], %[data_ch1], 2 \n\t"
    // st1 = (a << 16) - 2 * f1 * out
    "sll %[r2], %[r2], 16 \n\t"
    "sll %[r0], %[r0], 1 \n\t"
    "subq_s.w %[st1_ch1], %[r2], %[r0] \n\t"
    "sll %[r3], %[r3], 16 \n\t"
    "sll %[r1], %[r1], 1 \n\t"
    "subq_s.w %[st1_ch2], %[r3], %[r1] \n\t"
    "bgtz %[length], 1b \n\t"
    " addiu %[data_ch2], %[data_ch2], 2 \n\t"
    // Store channel states
    "sw %[st0_ch1], 0(%[filter_state_ch1]) \n\t"
    "sw %[st1_ch1], 4(%[filter_state_ch1]) \n\t"
    "sw %[st0_ch2], 0(%[filter_state_ch2]) \n\t"
    "sw %[st1_ch2], 4(%[filter_state_ch2]) \n\t"
    ".set pop \n\t"
    : [f_ch10] "=&r" (f_ch10), [f_ch20] "=&r" (f_ch20),
      [f_ch11] "=&r" (f_ch11), [f_ch21] "=&r" (f_ch21),
      [st0_ch1] "=&r" (st0_ch1), [st1_ch1] "=&r" (st1_ch1),
      [st0_ch2] "=&r" (st0_ch2), [st1_ch2] "=&r" (st1_ch2),
      [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2),
      [r3] "=&r" (r3), [r4] "=&r" (r4), [r5] "=&r" (r5),
      // Modified inside the loop, so they must be read-write operands.
      [data_ch1] "+r" (data_ch1), [data_ch2] "+r" (data_ch2),
      [length] "+r" (remaining)
    : [factor_ch1] "r" (factor_ch1), [factor_ch2] "r" (factor_ch2),
      [filter_state_ch1] "r" (filter_state_ch1),
      [filter_state_ch2] "r" (filter_state_ch2)
    : "memory", "hi", "lo"
  );
}

View File

@ -0,0 +1,365 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
// MIPS optimized implementation of the Autocorrelation function in fixed point.
// NOTE! Different from SPLIB-version in how it scales the signal.
//
// Computes r[i] = (sum over n of x[n] * x[n + i]) >> scaling for i = 0..order.
// |scaling| is derived once from the unscaled r[0] sum and reused for all lags.
//
// Parameters:
//   r     - output; receives order + 1 correlation values.
//   x     - input samples; N values are read.
//   N     - number of input samples.
//   order - highest lag that is computed.
//   scale - output; receives the right shift applied to every r[i].
// Returns order + 1.
int WebRtcIsacfix_AutocorrMIPS(int32_t* __restrict r,
const int16_t* __restrict x,
int16_t N,
int16_t order,
int16_t* __restrict scale) {
int i = 0;
int16_t scaling = 0;
// Read cursor over x; advanced by the assembly below.
int16_t* in = (int16_t*)x;
// Number of 8-sample groups, and the leftover samples handled one at a time.
int loop_size = (int)(N >> 3);
int count = (int)(N & 7);
// Declare temporary variables used as register values.
int32_t r0, r1, r2, r3;
#if !defined(MIPS_DSP_R2_LE)
// For non-DSPR2 optimizations 4 more registers are used.
int32_t r4, r5, r6, r7;
#endif
// Calculate r[0] and scaling needed.
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"mult $0, $0 \n\t"
// Loop is unrolled 8 times, set accumulator to zero in branch delay slot.
"beqz %[loop_size], 2f \n\t"
" mult $0, $0 \n\t"
"1: \n\t"
// Load 8 samples per loop iteration.
#if defined(MIPS_DSP_R2_LE)
"ulw %[r0], 0(%[in]) \n\t"
"ulw %[r1], 4(%[in]) \n\t"
"ulw %[r2], 8(%[in]) \n\t"
"ulw %[r3], 12(%[in]) \n\t"
#else
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 2(%[in]) \n\t"
"lh %[r2], 4(%[in]) \n\t"
"lh %[r3], 6(%[in]) \n\t"
"lh %[r4], 8(%[in]) \n\t"
"lh %[r5], 10(%[in]) \n\t"
"lh %[r6], 12(%[in]) \n\t"
"lh %[r7], 14(%[in]) \n\t"
#endif
"addiu %[loop_size], %[loop_size], -1 \n\t"
// Multiply and accumulate.
#if defined(MIPS_DSP_R2_LE)
"dpa.w.ph $ac0, %[r0], %[r0] \n\t"
"dpa.w.ph $ac0, %[r1], %[r1] \n\t"
"dpa.w.ph $ac0, %[r2], %[r2] \n\t"
"dpa.w.ph $ac0, %[r3], %[r3] \n\t"
#else
"madd %[r0], %[r0] \n\t"
"madd %[r1], %[r1] \n\t"
"madd %[r2], %[r2] \n\t"
"madd %[r3], %[r3] \n\t"
"madd %[r4], %[r4] \n\t"
"madd %[r5], %[r5] \n\t"
"madd %[r6], %[r6] \n\t"
"madd %[r7], %[r7] \n\t"
#endif
"bnez %[loop_size], 1b \n\t"
" addiu %[in], %[in], 16 \n\t"
"2: \n\t"
// Skip the tail loop when no samples are left. The delay slot reads the
// accumulator so that the value is already available at label 4.
"beqz %[count], 4f \n\t"
#if defined(MIPS_DSP_R1_LE)
" extr.w %[r0], $ac0, 31 \n\t"
#else
" mfhi %[r2] \n\t"
#endif
// Process remaining samples (if any).
"3: \n\t"
"lh %[r0], 0(%[in]) \n\t"
"addiu %[count], %[count], -1 \n\t"
"madd %[r0], %[r0] \n\t"
"bnez %[count], 3b \n\t"
" addiu %[in], %[in], 2 \n\t"
// Re-read the accumulator after the tail samples were added.
#if defined(MIPS_DSP_R1_LE)
"extr.w %[r0], $ac0, 31 \n\t"
#else
"mfhi %[r2] \n\t"
#endif
"4: \n\t"
#if !defined(MIPS_DSP_R1_LE)
// Without DSP: build r0 = accumulator >> 31 from the hi (r2) and lo (r3) words.
"mflo %[r3] \n\t"
"sll %[r0], %[r2], 1 \n\t"
"srl %[r1], %[r3], 31 \n\t"
"addu %[r0], %[r0], %[r1] \n\t"
#endif
// Calculate scaling (the value of shifting).
// scaling = 32 - clz(r0), forced to 0 when r0 < 1.
"clz %[r1], %[r0] \n\t"
"addiu %[r1], %[r1], -32 \n\t"
"subu %[scaling], $0, %[r1] \n\t"
"slti %[r1], %[r0], 0x1 \n\t"
"movn %[scaling], $0, %[r1] \n\t"
// r0 = accumulator >> scaling.
#if defined(MIPS_DSP_R1_LE)
"extrv.w %[r0], $ac0, %[scaling] \n\t"
"mfhi %[r2], $ac0 \n\t"
#else
"addiu %[r1], %[scaling], -32 \n\t"
"subu %[r1], $0, %[r1] \n\t"
"sllv %[r1], %[r2], %[r1] \n\t"
"srlv %[r0], %[r3], %[scaling] \n\t"
"addu %[r0], %[r0], %[r1] \n\t"
#endif
// When scaling is not below 32, the result is the high accumulator word (r2).
"slti %[r1], %[scaling], 32 \n\t"
"movz %[r0], %[r2], %[r1] \n\t"
".set pop \n\t"
: [loop_size] "+r" (loop_size), [in] "+r" (in), [r0] "=&r" (r0),
[r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
#if !defined(MIPS_DSP_R2_LE)
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
#endif
[count] "+r" (count), [scaling] "=r" (scaling)
: [N] "r" (N)
: "memory", "hi", "lo"
);
r[0] = r0;
// Correlation calculation is divided in 3 cases depending on the scaling
// value (different accumulator manipulation needed). Three slightly different
// loops are written in order to avoid branches inside the loop.
if (scaling == 0) {
// In this case, the result will be in low part of the accumulator.
for (i = 1; i < order + 1; i++) {
// in walks x[n], in1 walks x[n + i]; N - i products are accumulated.
in = (int16_t*)x;
int16_t* in1 = (int16_t*)x + i;
count = N - i;
loop_size = (count) >> 2;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"mult $0, $0 \n\t"
// The delay slot reduces count to the number of leftover pairs (count & 3).
"beqz %[loop_size], 2f \n\t"
" andi %[count], %[count], 0x3 \n\t"
// Loop processing 4 pairs of samples per iteration.
"1: \n\t"
#if defined(MIPS_DSP_R2_LE)
"ulw %[r0], 0(%[in]) \n\t"
"ulw %[r1], 0(%[in1]) \n\t"
"ulw %[r2], 4(%[in]) \n\t"
"ulw %[r3], 4(%[in1]) \n\t"
#else
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"lh %[r2], 2(%[in]) \n\t"
"lh %[r3], 2(%[in1]) \n\t"
"lh %[r4], 4(%[in]) \n\t"
"lh %[r5], 4(%[in1]) \n\t"
"lh %[r6], 6(%[in]) \n\t"
"lh %[r7], 6(%[in1]) \n\t"
#endif
"addiu %[loop_size], %[loop_size], -1 \n\t"
#if defined(MIPS_DSP_R2_LE)
"dpa.w.ph $ac0, %[r0], %[r1] \n\t"
"dpa.w.ph $ac0, %[r2], %[r3] \n\t"
#else
"madd %[r0], %[r1] \n\t"
"madd %[r2], %[r3] \n\t"
"madd %[r4], %[r5] \n\t"
"madd %[r6], %[r7] \n\t"
#endif
"addiu %[in], %[in], 8 \n\t"
"bnez %[loop_size], 1b \n\t"
" addiu %[in1], %[in1], 8 \n\t"
"2: \n\t"
// No leftovers: the low word read in the delay slot is already the result.
"beqz %[count], 4f \n\t"
" mflo %[r0] \n\t"
// Process remaining samples (if any).
"3: \n\t"
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"addiu %[count], %[count], -1 \n\t"
"addiu %[in], %[in], 2 \n\t"
"madd %[r0], %[r1] \n\t"
"bnez %[count], 3b \n\t"
" addiu %[in1], %[in1], 2 \n\t"
"mflo %[r0] \n\t"
"4: \n\t"
".set pop \n\t"
: [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
#if !defined(MIPS_DSP_R2_LE)
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
#endif
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
[count] "+r" (count)
:
: "memory", "hi", "lo"
);
r[i] = r0;
}
} else if (scaling == 32) {
// In this case, the result will be high part of the accumulator.
for (i = 1; i < order + 1; i++) {
// in walks x[n], in1 walks x[n + i]; N - i products are accumulated.
in = (int16_t*)x;
int16_t* in1 = (int16_t*)x + i;
count = N - i;
loop_size = (count) >> 2;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"mult $0, $0 \n\t"
// The delay slot reduces count to the number of leftover pairs (count & 3).
"beqz %[loop_size], 2f \n\t"
" andi %[count], %[count], 0x3 \n\t"
// Loop processing 4 pairs of samples per iteration.
"1: \n\t"
#if defined(MIPS_DSP_R2_LE)
"ulw %[r0], 0(%[in]) \n\t"
"ulw %[r1], 0(%[in1]) \n\t"
"ulw %[r2], 4(%[in]) \n\t"
"ulw %[r3], 4(%[in1]) \n\t"
#else
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"lh %[r2], 2(%[in]) \n\t"
"lh %[r3], 2(%[in1]) \n\t"
"lh %[r4], 4(%[in]) \n\t"
"lh %[r5], 4(%[in1]) \n\t"
"lh %[r6], 6(%[in]) \n\t"
"lh %[r7], 6(%[in1]) \n\t"
#endif
"addiu %[loop_size], %[loop_size], -1 \n\t"
#if defined(MIPS_DSP_R2_LE)
"dpa.w.ph $ac0, %[r0], %[r1] \n\t"
"dpa.w.ph $ac0, %[r2], %[r3] \n\t"
#else
"madd %[r0], %[r1] \n\t"
"madd %[r2], %[r3] \n\t"
"madd %[r4], %[r5] \n\t"
"madd %[r6], %[r7] \n\t"
#endif
"addiu %[in], %[in], 8 \n\t"
"bnez %[loop_size], 1b \n\t"
" addiu %[in1], %[in1], 8 \n\t"
"2: \n\t"
// No leftovers: the high word read in the delay slot is already the result.
"beqz %[count], 4f \n\t"
" mfhi %[r0] \n\t"
// Process remaining samples (if any).
"3: \n\t"
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"addiu %[count], %[count], -1 \n\t"
"addiu %[in], %[in], 2 \n\t"
"madd %[r0], %[r1] \n\t"
"bnez %[count], 3b \n\t"
" addiu %[in1], %[in1], 2 \n\t"
"mfhi %[r0] \n\t"
"4: \n\t"
".set pop \n\t"
: [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
#if !defined(MIPS_DSP_R2_LE)
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
#endif
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
[count] "+r" (count)
:
: "memory", "hi", "lo"
);
r[i] = r0;
}
} else {
// In this case, the result is obtained by combining low and high parts
// of the accumulator.
#if !defined(MIPS_DSP_R1_LE)
// Left shift that moves the high word into place: (hi << tmp_shift) is added
// to (lo >> scaling) below.
int32_t tmp_shift = 32 - scaling;
#endif
for (i = 1; i < order + 1; i++) {
// in walks x[n], in1 walks x[n + i]; N - i products are accumulated.
in = (int16_t*)x;
int16_t* in1 = (int16_t*)x + i;
count = N - i;
loop_size = (count) >> 2;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"mult $0, $0 \n\t"
// The delay slot reduces count to the number of leftover pairs (count & 3).
"beqz %[loop_size], 2f \n\t"
" andi %[count], %[count], 0x3 \n\t"
// Loop processing 4 pairs of samples per iteration.
"1: \n\t"
#if defined(MIPS_DSP_R2_LE)
"ulw %[r0], 0(%[in]) \n\t"
"ulw %[r1], 0(%[in1]) \n\t"
"ulw %[r2], 4(%[in]) \n\t"
"ulw %[r3], 4(%[in1]) \n\t"
#else
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"lh %[r2], 2(%[in]) \n\t"
"lh %[r3], 2(%[in1]) \n\t"
"lh %[r4], 4(%[in]) \n\t"
"lh %[r5], 4(%[in1]) \n\t"
"lh %[r6], 6(%[in]) \n\t"
"lh %[r7], 6(%[in1]) \n\t"
#endif
"addiu %[loop_size], %[loop_size], -1 \n\t"
#if defined(MIPS_DSP_R2_LE)
"dpa.w.ph $ac0, %[r0], %[r1] \n\t"
"dpa.w.ph $ac0, %[r2], %[r3] \n\t"
#else
"madd %[r0], %[r1] \n\t"
"madd %[r2], %[r3] \n\t"
"madd %[r4], %[r5] \n\t"
"madd %[r6], %[r7] \n\t"
#endif
"addiu %[in], %[in], 8 \n\t"
"bnez %[loop_size], 1b \n\t"
" addiu %[in1], %[in1], 8 \n\t"
"2: \n\t"
// No leftovers: the delay slot already extracts what label 4 needs.
"beqz %[count], 4f \n\t"
#if defined(MIPS_DSP_R1_LE)
" extrv.w %[r0], $ac0, %[scaling] \n\t"
#else
" mfhi %[r0] \n\t"
#endif
// Process remaining samples (if any).
"3: \n\t"
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"addiu %[count], %[count], -1 \n\t"
"addiu %[in], %[in], 2 \n\t"
"madd %[r0], %[r1] \n\t"
"bnez %[count], 3b \n\t"
" addiu %[in1], %[in1], 2 \n\t"
#if defined(MIPS_DSP_R1_LE)
"extrv.w %[r0], $ac0, %[scaling] \n\t"
#else
"mfhi %[r0] \n\t"
#endif
"4: \n\t"
#if !defined(MIPS_DSP_R1_LE)
// Without DSP: r0 = (hi << tmp_shift) + (lo >> scaling).
"mflo %[r1] \n\t"
"sllv %[r0], %[r0], %[tmp_shift] \n\t"
"srlv %[r1], %[r1], %[scaling] \n\t"
"addu %[r0], %[r0], %[r1] \n\t"
#endif
".set pop \n\t"
: [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
#if !defined(MIPS_DSP_R2_LE)
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
#endif
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
[count] "+r" (count)
: [scaling] "r" (scaling)
#if !defined(MIPS_DSP_R1_LE)
, [tmp_shift] "r" (tmp_shift)
#endif
: "memory", "hi", "lo"
);
r[i] = r0;
}
}
*scale = scaling;
return (order + 1);
}

View File

@ -179,7 +179,7 @@ int16_t WebRtcIsacfix_FreeInternal(ISACFIX_MainStruct *ISAC_main_inst)
}
/****************************************************************************
* WebRtcAecm_InitNeon(...)
* WebRtcIsacfix_InitNeon(...)
*
* This function initializes function pointers for ARM Neon platform.
*/
@ -199,6 +199,23 @@ static void WebRtcIsacfix_InitNeon(void) {
}
#endif
/****************************************************************************
* WebRtcIsacfix_InitMIPS(...)
*
* This function initializes function pointers for MIPS platform.
*
* Compiled only when MIPS32_LE is defined. It redirects the autocorrelation
* and MA-loop function pointers to their MIPS implementations; the all-pass
* filter pointer is redirected only when MIPS_DSP_R1_LE is also defined.
*/
#if defined(MIPS32_LE)
static void WebRtcIsacfix_InitMIPS(void) {
WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrMIPS;
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopMIPS;
#if defined(MIPS_DSP_R1_LE)
WebRtcIsacfix_AllpassFilter2FixDec16 =
WebRtcIsacfix_AllpassFilter2FixDec16MIPS;
#endif
}
#endif
/****************************************************************************
* WebRtcIsacfix_EncoderInit(...)
*
@ -296,6 +313,10 @@ int16_t WebRtcIsacfix_EncoderInit(ISACFIX_MainStruct *ISAC_main_inst,
WebRtcIsacfix_InitNeon();
#endif
#if defined(MIPS32_LE)
WebRtcIsacfix_InitMIPS();
#endif
return statusInit;
}

View File

@ -85,6 +85,30 @@
'pitch_filter_c.c',
],
}],
['target_arch=="mipsel"', {
'sources': [
'filters_mips.c',
'lattice_mips.c',
],
'sources!': [
'lattice_c.c',
],
'conditions': [
['mips_dsp_rev>0', {
'sources': [
'filterbanks_mips.c',
],
}],
['mips_dsp_rev>1', {
'sources': [
'pitch_filter_mips.c',
],
'sources!': [
'pitch_filter_c.c',
],
}],
],
}],
],
},
],

View File

@ -0,0 +1,327 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
#include "webrtc/typedefs.h"
// Filter ar_g_Q0[] and ar_f_Q0[] through an AR filter with coefficients
// cth_Q15[] and sth_Q15[].
//
// For each n in [0, HALF_SUBFRAMELEN - 1), t_ar starts as ar_f_Q0[n + 1] and
// then, for k = order_coef - 1 down to 0 (both results use the old t_ar):
//   ar_g_Q0[k + 1] = sat16(round((sth_Q15[k] * t_ar +
//                                 cth_Q15[k] * ar_g_Q0[k]) >> 15))
//   t_ar           = sat16(round((cth_Q15[k] * t_ar -
//                                 sth_Q15[k] * ar_g_Q0[k]) >> 15))
// Afterwards t_ar is stored to both ar_f_Q0[n + 1] and ar_g_Q0[0].
void WebRtcIsacfix_FilterArLoop(int16_t* ar_g_Q0, // Input samples
int16_t* ar_f_Q0, // Input samples
int16_t* cth_Q15, // Filter coefficients
int16_t* sth_Q15, // Filter coefficients
int16_t order_coef) { // order of the filter
int n = 0;
for (n = 0; n < HALF_SUBFRAMELEN - 1; n++) {
// Index k of the coefficient pair being applied; counts down to 0.
int count = order_coef - 1;
// Scratch: byte offset of element k, later the address of ar_g_Q0[k + 1].
int offset;
#if !defined(MIPS_DSP_R1_LE)
// Without DSP there is no indexed halfword load, so element addresses are
// formed explicitly, and 16-bit saturation is done by comparing against
// these limits.
int16_t* tmp_cth;
int16_t* tmp_sth;
int16_t* tmp_arg;
int32_t max_q16 = 0x7fff;
int32_t min_q16 = 0xffff8000;
#endif
// Declare variables used as temporary registers.
int32_t r0, r1, r2, t0, t1, t2, t_ar;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
// Skip the inner loop when count is negative; the delay slot loads
// t_ar = ar_f_Q0[n + 1] either way.
"bltz %[count], 2f \n\t"
" lh %[t_ar], 0(%[tmp]) \n\t"
// Inner loop
"1: \n\t"
"sll %[offset], %[count], 1 \n\t"
// r0 = cth_Q15[k], r1 = sth_Q15[k], r2 = ar_g_Q0[k]
#if defined(MIPS_DSP_R1_LE)
"lhx %[r0], %[offset](%[cth_Q15]) \n\t"
"lhx %[r1], %[offset](%[sth_Q15]) \n\t"
"lhx %[r2], %[offset](%[ar_g_Q0]) \n\t"
#else
"addu %[tmp_cth], %[cth_Q15], %[offset] \n\t"
"addu %[tmp_sth], %[sth_Q15], %[offset] \n\t"
"addu %[tmp_arg], %[ar_g_Q0], %[offset] \n\t"
"lh %[r0], 0(%[tmp_cth]) \n\t"
"lh %[r1], 0(%[tmp_sth]) \n\t"
"lh %[r2], 0(%[tmp_arg]) \n\t"
#endif
// t0 = cth * t_ar - sth * ar_g, t1 = sth * t_ar + cth * ar_g
"mul %[t0], %[r0], %[t_ar] \n\t"
"mul %[t1], %[r1], %[t_ar] \n\t"
"mul %[t2], %[r1], %[r2] \n\t"
"mul %[r0], %[r0], %[r2] \n\t"
"subu %[t0], %[t0], %[t2] \n\t"
"addu %[t1], %[t1], %[r0] \n\t"
// Rounded right shift by 15.
#if defined(MIPS_DSP_R1_LE)
"shra_r.w %[t1], %[t1], 15 \n\t"
"shra_r.w %[t0], %[t0], 15 \n\t"
#else
"addiu %[t1], %[t1], 0x4000 \n\t"
"sra %[t1], %[t1], 15 \n\t"
"addiu %[t0], %[t0], 0x4000 \n\t"
"sra %[t0], %[t0], 15 \n\t"
#endif
// offset becomes the byte offset of element k + 1.
"addiu %[offset], %[offset], 2 \n\t"
// Saturate to 16 bits, upper bound: a saturating shift left by 16 with DSP
// (shifted back below), a clamp against max_q16 otherwise.
#if defined(MIPS_DSP_R1_LE)
"shll_s.w %[t1], %[t1], 16 \n\t"
"shll_s.w %[t_ar], %[t0], 16 \n\t"
#else
"slt %[r0], %[t1], %[max_q16] \n\t"
"slt %[r1], %[t0], %[max_q16] \n\t"
"movz %[t1], %[max_q16], %[r0] \n\t"
"movz %[t0], %[max_q16], %[r1] \n\t"
#endif
// offset becomes the address of ar_g_Q0[k + 1].
"addu %[offset], %[offset], %[ar_g_Q0] \n\t"
// Finish the saturation (shift back, or clamp against min_q16) and update
// t_ar with the new value.
#if defined(MIPS_DSP_R1_LE)
"sra %[t1], %[t1], 16 \n\t"
"sra %[t_ar], %[t_ar], 16 \n\t"
#else
"slt %[r0], %[t1], %[min_q16] \n\t"
"slt %[r1], %[t0], %[min_q16] \n\t"
"movn %[t1], %[min_q16], %[r0] \n\t"
"movn %[t0], %[min_q16], %[r1] \n\t"
"addu %[t_ar], $zero, %[t0] \n\t"
#endif
// ar_g_Q0[k + 1] = t1; loop while k > 0, decrementing k in the delay slot.
"sh %[t1], 0(%[offset]) \n\t"
"bgtz %[count], 1b \n\t"
" addiu %[count], %[count], -1 \n\t"
"2: \n\t"
// Store the final t_ar to ar_f_Q0[n + 1] and ar_g_Q0[0].
"sh %[t_ar], 0(%[tmp]) \n\t"
"sh %[t_ar], 0(%[ar_g_Q0]) \n\t"
".set pop \n\t"
: [t_ar] "=&r" (t_ar), [count] "+r" (count), [offset] "=&r" (offset),
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [t0] "=&r" (t0),
#if !defined(MIPS_DSP_R1_LE)
[tmp_cth] "=&r" (tmp_cth), [tmp_sth] "=&r" (tmp_sth),
[tmp_arg] "=&r" (tmp_arg),
#endif
[t1] "=&r" (t1), [t2] "=&r" (t2)
: [tmp] "r" (&ar_f_Q0[n+1]), [cth_Q15] "r" (cth_Q15),
#if !defined(MIPS_DSP_R1_LE)
[max_q16] "r" (max_q16), [min_q16] "r" (min_q16),
#endif
[sth_Q15] "r" (sth_Q15), [ar_g_Q0] "r" (ar_g_Q0)
: "memory", "hi", "lo"
);
}
}
// MIPS optimization of the inner loop used for function
// WebRtcIsacfix_NormLatticeFilterMa(). It does:
//
// for 0 <= n < HALF_SUBFRAMELEN - 1:
//   *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
//   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
//
// with ptr0, ptr1 and ptr2 advancing by one element per step and every
// product scaled back down by a rounded right shift (15 bits for input0 and
// input1, 16 bits for input2).
//
// Note, the MIPS DSPR2 variant of WebRtcIsacfix_FilterMaLoopMIPS and
// WebRtcIsacfix_FilterMaLoopC are not bit-exact; the accuracy of the DSPR2
// variant is same or better. The non-DSPR2 variant is bit-exact with the C
// code.
void WebRtcIsacfix_FilterMaLoopMIPS(int16_t input0,  // Filter coefficient
                                    int16_t input1,  // Filter coefficient
                                    int32_t input2,  // Inverse coeff (1/input1)
                                    int32_t* ptr0,   // Sample buffer
                                    int32_t* ptr1,   // Sample buffer
                                    int32_t* ptr2) { // Sample buffer
#if defined(MIPS_DSP_R2_LE)
  // MIPS DSPR2 version. 4 available accumulators allows loop unrolling 4 times.
  // This variant is not bit-exact with WebRtcIsacfix_FilterMaLoopC, since we
  // are exploiting 64-bit accumulators. The accuracy of the MIPS DSPR2 function
  // is same or better.
  int n = (HALF_SUBFRAMELEN - 1) >> 2;  // Number of 4-sample groups.
  int m = (HALF_SUBFRAMELEN - 1) & 3;   // Leftover samples after the groups.
  int r0, r1, r2, r3;
  int t0, t1, t2, t3;
  int s0, s1, s2, s3;
  __asm __volatile (
    ".set push \n\t"
    ".set noreorder \n\t"
    // Main loop: four samples per iteration, one accumulator per sample.
    "1: \n\t"
    "lw %[r0], 0(%[ptr0]) \n\t"
    "lw %[r1], 4(%[ptr0]) \n\t"
    "lw %[r2], 8(%[ptr0]) \n\t"
    "lw %[r3], 12(%[ptr0]) \n\t"
    // s = round((*ptr0 * input0) >> 15)
    "mult $ac0, %[r0], %[input0] \n\t"
    "mult $ac1, %[r1], %[input0] \n\t"
    "mult $ac2, %[r2], %[input0] \n\t"
    "mult $ac3, %[r3], %[input0] \n\t"
    "lw %[t0], 0(%[ptr2]) \n\t"
    "extr_rs.w %[s0], $ac0, 15 \n\t"
    "extr_rs.w %[s1], $ac1, 15 \n\t"
    "extr_rs.w %[s2], $ac2, 15 \n\t"
    "extr_rs.w %[s3], $ac3, 15 \n\t"
    "lw %[t1], 4(%[ptr2]) \n\t"
    "lw %[t2], 8(%[ptr2]) \n\t"
    "lw %[t3], 12(%[ptr2]) \n\t"
    // *ptr2 = round(((*ptr2 + s) * input2) >> 16)
    "addu %[t0], %[t0], %[s0] \n\t"
    "addu %[t1], %[t1], %[s1] \n\t"
    "addu %[t2], %[t2], %[s2] \n\t"
    "addu %[t3], %[t3], %[s3] \n\t"
    "mult $ac0, %[t0], %[input2] \n\t"
    "mult $ac1, %[t1], %[input2] \n\t"
    "mult $ac2, %[t2], %[input2] \n\t"
    "mult $ac3, %[t3], %[input2] \n\t"
    "addiu %[ptr0], %[ptr0], 16 \n\t"
    "extr_rs.w %[t0], $ac0, 16 \n\t"
    "extr_rs.w %[t1], $ac1, 16 \n\t"
    "extr_rs.w %[t2], $ac2, 16 \n\t"
    "extr_rs.w %[t3], $ac3, 16 \n\t"
    "addiu %[n], %[n], -1 \n\t"
    // s = round((*ptr0 * input1) >> 15)
    "mult $ac0, %[r0], %[input1] \n\t"
    "mult $ac1, %[r1], %[input1] \n\t"
    "mult $ac2, %[r2], %[input1] \n\t"
    "mult $ac3, %[r3], %[input1] \n\t"
    "sw %[t0], 0(%[ptr2]) \n\t"
    "extr_rs.w %[s0], $ac0, 15 \n\t"
    "extr_rs.w %[s1], $ac1, 15 \n\t"
    "extr_rs.w %[s2], $ac2, 15 \n\t"
    "extr_rs.w %[s3], $ac3, 15 \n\t"
    "sw %[t1], 4(%[ptr2]) \n\t"
    "sw %[t2], 8(%[ptr2]) \n\t"
    "sw %[t3], 12(%[ptr2]) \n\t"
    // *ptr1 = round((*ptr2 * input0) >> 15) + s
    "mult $ac0, %[t0], %[input0] \n\t"
    "mult $ac1, %[t1], %[input0] \n\t"
    "mult $ac2, %[t2], %[input0] \n\t"
    "mult $ac3, %[t3], %[input0] \n\t"
    "addiu %[ptr2], %[ptr2], 16 \n\t"
    "extr_rs.w %[t0], $ac0, 15 \n\t"
    "extr_rs.w %[t1], $ac1, 15 \n\t"
    "extr_rs.w %[t2], $ac2, 15 \n\t"
    "extr_rs.w %[t3], $ac3, 15 \n\t"
    "addu %[t0], %[t0], %[s0] \n\t"
    "addu %[t1], %[t1], %[s1] \n\t"
    "addu %[t2], %[t2], %[s2] \n\t"
    "addu %[t3], %[t3], %[s3] \n\t"
    "sw %[t0], 0(%[ptr1]) \n\t"
    "sw %[t1], 4(%[ptr1]) \n\t"
    "sw %[t2], 8(%[ptr1]) \n\t"
    "sw %[t3], 12(%[ptr1]) \n\t"
    "bgtz %[n], 1b \n\t"
    " addiu %[ptr1], %[ptr1], 16 \n\t"
    // Skip the remainder loop when no samples are left. The comparison must
    // be against the hardware zero register $0: "%0" would name asm operand 0
    // (the scratch variable r0, still holding a sample value).
    "beq %[m], $0, 3f \n\t"
    " nop \n\t"
    // Remainder loop: one sample per iteration.
    "2: \n\t"
    "lw %[r0], 0(%[ptr0]) \n\t"
    "lw %[t0], 0(%[ptr2]) \n\t"
    "addiu %[ptr0], %[ptr0], 4 \n\t"
    "mult $ac0, %[r0], %[input0] \n\t"
    "mult $ac1, %[r0], %[input1] \n\t"
    "extr_rs.w %[r1], $ac0, 15 \n\t"
    "extr_rs.w %[t1], $ac1, 15 \n\t"
    "addu %[t0], %[t0], %[r1] \n\t"
    "mult $ac0, %[t0], %[input2] \n\t"
    "extr_rs.w %[t0], $ac0, 16 \n\t"
    "sw %[t0], 0(%[ptr2]) \n\t"
    "mult $ac0, %[t0], %[input0] \n\t"
    "addiu %[ptr2], %[ptr2], 4 \n\t"
    "addiu %[m], %[m], -1 \n\t"
    "extr_rs.w %[t0], $ac0, 15 \n\t"
    "addu %[t0], %[t0], %[t1] \n\t"
    "sw %[t0], 0(%[ptr1]) \n\t"
    "bgtz %[m], 2b \n\t"
    " addiu %[ptr1], %[ptr1], 4 \n\t"
    "3: \n\t"
    ".set pop \n\t"
    : [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2),
      [r3] "=&r" (r3), [t0] "=&r" (t0), [t1] "=&r" (t1),
      [t2] "=&r" (t2), [t3] "=&r" (t3), [s0] "=&r" (s0),
      [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3),
      [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [m] "+r" (m),
      [ptr2] "+r" (ptr2), [n] "+r" (n)
    : [input0] "r" (input0), [input1] "r" (input1),
      [input2] "r" (input2)
    : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi",
      "$ac2lo", "$ac3hi", "$ac3lo"
  );
#else
  // Non-DSPR2 version of the function. Avoiding the accumulator usage due to
  // large latencies. This variant is bit-exact with C code.
  int n = HALF_SUBFRAMELEN - 1;
  int32_t t16a, t16b;
  int32_t r0, r1, r2, r3, r4;
  // The assembly sign-extends both 16-bit coefficients in place, so they are
  // passed as read-write copies instead of input-only operands.
  int32_t coef0 = input0;
  int32_t coef1 = input1;
  __asm __volatile (
    ".set push \n\t"
    ".set noreorder \n\t"
    // Split input2: t16a = input2 >> 16, t16b = its sign-extended low half.
    "sra %[t16a], %[input2], 16 \n\t"
    "andi %[t16b], %[input2], 0xFFFF \n\t"
#if defined(MIPS32R2_LE)
    "seh %[t16b], %[t16b] \n\t"
    "seh %[input0], %[input0] \n\t"
    "seh %[input1], %[input1] \n\t"
#else
    "sll %[t16b], %[t16b], 16 \n\t"
    "sra %[t16b], %[t16b], 16 \n\t"
    "sll %[input0], %[input0], 16 \n\t"
    "sra %[input0], %[input0], 16 \n\t"
    "sll %[input1], %[input1], 16 \n\t"
    "sra %[input1], %[input1], 16 \n\t"
#endif
    // A negative low half borrows from the high half: add 1 to t16a.
    "addiu %[r0], %[t16a], 1 \n\t"
    "slt %[r1], %[t16b], $zero \n\t"
    "movn %[t16a], %[r0], %[r1] \n\t"
    "1: \n\t"
    "lw %[r0], 0(%[ptr0]) \n\t"
    "lw %[r1], 0(%[ptr2]) \n\t"
    "addiu %[ptr0], %[ptr0], 4 \n\t"
    // Split *ptr0 into high (r2) and low (r0) halves and multiply each half by
    // input0 and input1 separately.
    "sra %[r2], %[r0], 16 \n\t"
    "andi %[r0], %[r0], 0xFFFF \n\t"
    "mul %[r3], %[r2], %[input0] \n\t"
    "mul %[r4], %[r0], %[input0] \n\t"
    "mul %[r2], %[r2], %[input1] \n\t"
    "mul %[r0], %[r0], %[input1] \n\t"
    "addiu %[ptr2], %[ptr2], 4 \n\t"
    // r1 = *ptr2 + (input0 * *ptr0 scaled by 2^-15, rounded)
    "sll %[r3], %[r3], 1 \n\t"
    "sra %[r4], %[r4], 1 \n\t"
    "addiu %[r4], %[r4], 0x2000 \n\t"
    "sra %[r4], %[r4], 14 \n\t"
    "addu %[r3], %[r3], %[r4] \n\t"
    "addu %[r1], %[r1], %[r3] \n\t"
    // Multiply r1 by input2 using the t16a / t16b halves.
    "sra %[r3], %[r1], 16 \n\t"
    "andi %[r4], %[r1], 0xFFFF \n\t"
    "sra %[r4], %[r4], 1 \n\t"
    "mul %[r1], %[r1], %[t16a] \n\t"
    "mul %[r3], %[r3], %[t16b] \n\t"
    "mul %[r4], %[r4], %[t16b] \n\t"
    // r0 = input1 * *ptr0 scaled by 2^-15, rounded
    "sll %[r2], %[r2], 1 \n\t"
    "sra %[r0], %[r0], 1 \n\t"
    "addiu %[r0], %[r0], 0x2000 \n\t"
    "sra %[r0], %[r0], 14 \n\t"
    "addu %[r0], %[r0], %[r2] \n\t"
    "addiu %[n], %[n], -1 \n\t"
    "addu %[r1], %[r1], %[r3] \n\t"
    "addiu %[r4], %[r4], 0x4000 \n\t"
    "sra %[r4], %[r4], 15 \n\t"
    "addu %[r1], %[r1], %[r4] \n\t"
    // r1 is the new *ptr2; multiply it by input0 for the *ptr1 result.
    "sra %[r2], %[r1], 16 \n\t"
    "andi %[r3], %[r1], 0xFFFF \n\t"
    "mul %[r3], %[r3], %[input0] \n\t"
    "mul %[r2], %[r2], %[input0] \n\t"
    // ptr2 was already advanced, hence the -4 offset.
    "sw %[r1], -4(%[ptr2]) \n\t"
    "sra %[r3], %[r3], 1 \n\t"
    "addiu %[r3], %[r3], 0x2000 \n\t"
    "sra %[r3], %[r3], 14 \n\t"
    "addu %[r0], %[r0], %[r3] \n\t"
    "sll %[r2], %[r2], 1 \n\t"
    "addu %[r0], %[r0], %[r2] \n\t"
    "sw %[r0], 0(%[ptr1]) \n\t"
    "bgtz %[n], 1b \n\t"
    " addiu %[ptr1], %[ptr1], 4 \n\t"
    ".set pop \n\t"
    : [t16a] "=&r" (t16a), [t16b] "=&r" (t16b), [r0] "=&r" (r0),
      [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
      [r4] "=&r" (r4), [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1),
      [ptr2] "+r" (ptr2), [n] "+r" (n),
      [input0] "+r" (coef0), [input1] "+r" (coef1)
    : [input2] "r" (input2)
    : "hi", "lo", "memory"
  );
#endif
}

View File

@ -0,0 +1,133 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_estimator.h"
// MIPS DSP assembly implementation of the pitch filter core loop.
//
// Runs loopNumber iterations starting at position *index2. Each iteration:
//   1. Forms a 9-tap dot product of outputBuf2 (starting at
//      PITCH_BUFFSIZE - (index + 2) + position) with coefficient[0..8], adds
//      8192 and extracts it shifted right by 14 with 16-bit saturation.
//   2. Moves inputState[0..3] to inputState[1..4] and stores
//      round((gain * value) >> 12) in inputState[0].
//   3. Low-pass filters the five state values with the packed damp factors,
//      adds 16384 and extracts the sum shifted right by 15 with saturation.
//   4. Writes sat16(inputBuf[pos] - sign * filtered) to outputBuf[pos] and
//      sat16(that value + inputBuf[pos]) to outputBuf2[pos + PITCH_BUFFSIZE].
// On return *index2 has been advanced by loopNumber.
void WebRtcIsacfix_PitchFilterCore(int loopNumber,
int16_t gain,
int index,
int16_t sign,
int16_t* inputState,
int16_t* outputBuf2,
const int16_t* coefficient,
int16_t* inputBuf,
int16_t* outputBuf,
int* index2) {
int ind2t = *index2;
int i = 0;
// Start of the 9-sample outputBuf2 window used for the fractional pitch filter.
int16_t* out2_pos2 = &outputBuf2[PITCH_BUFFSIZE - (index + 2)] + ind2t;
int32_t w1, w2, w3, w4, w5, gain32, sign32;
int32_t coef1, coef2, coef3, coef4, coef5 = 0;
// Define damp factors as int32_t (pair of int16_t)
// kDampF0 multiplies inputState[0] (upper halfword is zero); kDampF1 and
// kDampF2 multiply the state pairs held in w2 and w3 respectively.
int32_t kDampF0 = 0x0000F70A;
int32_t kDampF1 = 0x51EC2000;
int32_t kDampF2 = 0xF70A2000;
// Per-sample cursors, each advanced by one element per iteration.
int16_t* input1 = inputBuf + ind2t;
int16_t* output1 = outputBuf + ind2t;
int16_t* output2 = outputBuf2 + ind2t + PITCH_BUFFSIZE;
// Load coefficients outside the loop and sign-extend gain and sign
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
// lwl at +3 paired with lwr at +0 loads one word from a possibly unaligned
// address; coef1..coef4 each hold two coefficients, coef5 holds the ninth.
"lwl %[coef1], 3(%[coefficient]) \n\t"
"lwl %[coef2], 7(%[coefficient]) \n\t"
"lwl %[coef3], 11(%[coefficient]) \n\t"
"lwl %[coef4], 15(%[coefficient]) \n\t"
"lwr %[coef1], 0(%[coefficient]) \n\t"
"lwr %[coef2], 4(%[coefficient]) \n\t"
"lwr %[coef3], 8(%[coefficient]) \n\t"
"lwr %[coef4], 12(%[coefficient]) \n\t"
"lhu %[coef5], 16(%[coefficient]) \n\t"
"seh %[gain32], %[gain] \n\t"
"seh %[sign32], %[sign] \n\t"
".set pop \n\t"
: [coef1] "=&r" (coef1), [coef2] "=&r" (coef2), [coef3] "=&r" (coef3),
[coef4] "=&r" (coef4), [coef5] "=&r" (coef5), [gain32] "=&r" (gain32),
[sign32] "=&r" (sign32)
: [coefficient] "r" (coefficient), [gain] "r" (gain),
[sign] "r" (sign)
: "memory"
);
for (i = 0; i < loopNumber; i++) {
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
// Filter to get fractional pitch
// Accumulator ac0 is preset to the rounding constant 8192.
"li %[w1], 8192 \n\t"
"mtlo %[w1] \n\t"
"mthi $0 \n\t"
"lwl %[w1], 3(%[out2_pos2]) \n\t"
"lwl %[w2], 7(%[out2_pos2]) \n\t"
"lwl %[w3], 11(%[out2_pos2]) \n\t"
"lwl %[w4], 15(%[out2_pos2]) \n\t"
"lwr %[w1], 0(%[out2_pos2]) \n\t"
"lwr %[w2], 4(%[out2_pos2]) \n\t"
"lwr %[w3], 8(%[out2_pos2]) \n\t"
"lwr %[w4], 12(%[out2_pos2]) \n\t"
"lhu %[w5], 16(%[out2_pos2]) \n\t"
"dpa.w.ph $ac0, %[w1], %[coef1] \n\t"
"dpa.w.ph $ac0, %[w2], %[coef2] \n\t"
"dpa.w.ph $ac0, %[w3], %[coef3] \n\t"
"dpa.w.ph $ac0, %[w4], %[coef4] \n\t"
"dpa.w.ph $ac0, %[w5], %[coef5] \n\t"
"addiu %[out2_pos2], %[out2_pos2], 2 \n\t"
// Accumulator ac1 is preset to the rounding constant 16384 (high word here,
// low word a few instructions below).
"mthi $0, $ac1 \n\t"
// w2 = inputState[0..1], w3 = inputState[2..3] (loads interleaved with the
// extraction below).
"lwl %[w2], 3(%[inputState]) \n\t"
"lwl %[w3], 7(%[inputState]) \n\t"
// Fractional pitch shift & saturation
"extr_s.h %[w1], $ac0, 14 \n\t"
"li %[w4], 16384 \n\t"
"lwr %[w2], 0(%[inputState]) \n\t"
"lwr %[w3], 4(%[inputState]) \n\t"
"mtlo %[w4], $ac1 \n\t"
// Shift low pass filter state
// The two words are written back one element (2 bytes) further on.
"swl %[w2], 5(%[inputState]) \n\t"
"swl %[w3], 9(%[inputState]) \n\t"
"mul %[w1], %[gain32], %[w1] \n\t"
"swr %[w2], 2(%[inputState]) \n\t"
"swr %[w3], 6(%[inputState]) \n\t"
// Low pass filter accumulation
"dpa.w.ph $ac1, %[kDampF1], %[w2] \n\t"
"dpa.w.ph $ac1, %[kDampF2], %[w3] \n\t"
"lh %[w4], 0(%[input1]) \n\t"
"addiu %[input1], %[input1], 2 \n\t"
// New inputState[0] = round((gain * pitch value) >> 12).
"shra_r.w %[w1], %[w1], 12 \n\t"
"sh %[w1], 0(%[inputState]) \n\t"
"dpa.w.ph $ac1, %[kDampF0], %[w1] \n\t"
// Low pass filter shift & saturation
"extr_s.h %[w2], $ac1, 15 \n\t"
"mul %[w2], %[w2], %[sign32] \n\t"
// Buffer update
// shll_s.w by 16 followed by sra by 16 saturates the value to 16 bits.
"subu %[w2], %[w4], %[w2] \n\t"
"shll_s.w %[w2], %[w2], 16 \n\t"
"sra %[w2], %[w2], 16 \n\t"
"sh %[w2], 0(%[output1]) \n\t"
"addu %[w2], %[w2], %[w4] \n\t"
"shll_s.w %[w2], %[w2], 16 \n\t"
"addiu %[output1], %[output1], 2 \n\t"
"sra %[w2], %[w2], 16 \n\t"
"sh %[w2], 0(%[output2]) \n\t"
"addiu %[output2], %[output2], 2 \n\t"
".set pop \n\t"
: [w1] "=&r" (w1), [w2] "=&r" (w2), [w3] "=&r" (w3), [w4] "=&r" (w4),
[w5] "=&r" (w5), [input1] "+r" (input1), [out2_pos2] "+r" (out2_pos2),
[output1] "+r" (output1), [output2] "+r" (output2)
: [coefficient] "r" (coefficient), [inputState] "r" (inputState),
[gain32] "r" (gain32), [sign32] "r" (sign32), [kDampF0] "r" (kDampF0),
[kDampF1] "r" (kDampF1), [kDampF2] "r" (kDampF2),
[coef1] "r" (coef1), [coef2] "r" (coef2), [coef3] "r" (coef3),
[coef4] "r" (coef4), [coef5] "r" (coef5)
: "hi", "lo", "$ac1hi", "$ac1lo", "memory"
);
}
(*index2) += loopNumber;
}