diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h b/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h
index 88c7e1abe..2f649324e 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h
@@ -179,6 +179,21 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
                                     int32_t* ptr2);
 #endif
 
+#if defined(MIPS32_LE)
+int WebRtcIsacfix_AutocorrMIPS(int32_t* __restrict r,
+                               const int16_t* __restrict x,
+                               int16_t N,
+                               int16_t order,
+                               int16_t* __restrict scale);
+
+void WebRtcIsacfix_FilterMaLoopMIPS(int16_t input0,
+                                    int16_t input1,
+                                    int32_t input2,
+                                    int32_t* ptr0,
+                                    int32_t* ptr1,
+                                    int32_t* ptr2);
+#endif
+
 /* Function pointers associated with the above functions. */
 
 typedef int (*AutocorrFix)(int32_t* __restrict r,
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbank_internal.h b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbank_internal.h
index 28d103572..3fefc1a5d 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbank_internal.h
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbank_internal.h
@@ -58,6 +58,17 @@ void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
     int32_t *filter_state_ch2);
 #endif
 
+#if defined(MIPS_DSP_R1_LE)
+void WebRtcIsacfix_AllpassFilter2FixDec16MIPS(
+    int16_t *data_ch1,
+    int16_t *data_ch2,
+    const int16_t *factor_ch1,
+    const int16_t *factor_ch2,
+    const int length,
+    int32_t *filter_state_ch1,
+    int32_t *filter_state_ch2);
+#endif
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_mips.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_mips.c
new file mode 100644
index 000000000..1887745b7
--- /dev/null
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_mips.c
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_coding/codecs/isac/fix/source/filterbank_internal.h"
+
+// WebRtcIsacfix_AllpassFilter2FixDec16 function optimized for MIPSDSP platform
+// Bit-exact with WebRtcIsacfix_AllpassFilter2FixDec16C from filterbanks.c
+// The loop is bottom-tested: one sample is processed even if |length| <= 0.
+void WebRtcIsacfix_AllpassFilter2FixDec16MIPS(
+    int16_t *data_ch1,  // Input and output in channel 1, in Q0
+    int16_t *data_ch2,  // Input and output in channel 2, in Q0
+    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
+    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
+    const int length,  // Length of the data buffers
+    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
+    int32_t *filter_state_ch2) {  // Filter state for channel 2, in Q16
+
+  int32_t st0_ch1, st1_ch1;  // channel1 state variables
+  int32_t st0_ch2, st1_ch2;  // channel2 state variables
+  int32_t f_ch10, f_ch11, f_ch20, f_ch21;  // factor variables
+  int32_t r0, r1, r2, r3, r4, r5;  // temporary register variables
+  int32_t n = length;  // Writable loop counter; |length| itself is const.
+
+  __asm __volatile (
+    ".set push \n\t"
+    ".set noreorder \n\t"
+    // Load all the state and factor variables
+    "lh %[f_ch10], 0(%[factor_ch1]) \n\t"
+    "lh %[f_ch20], 0(%[factor_ch2]) \n\t"
+    "lh %[f_ch11], 2(%[factor_ch1]) \n\t"
+    "lh %[f_ch21], 2(%[factor_ch2]) \n\t"
+    "lw %[st0_ch1], 0(%[filter_state_ch1]) \n\t"
+    "lw %[st1_ch1], 4(%[filter_state_ch1]) \n\t"
+    "lw %[st0_ch2], 0(%[filter_state_ch2]) \n\t"
+    "lw %[st1_ch2], 4(%[filter_state_ch2]) \n\t"
+    // Allpass filtering loop
+    "1: \n\t"
+    "lh %[r0], 0(%[data_ch1]) \n\t"
+    "lh %[r1], 0(%[data_ch2]) \n\t"
+    "addiu %[n], %[n], -1 \n\t"
+    "mul %[r2], %[r0], %[f_ch10] \n\t"
+    "mul %[r3], %[r1], %[f_ch20] \n\t"
+    "sll %[r0], %[r0], 16 \n\t"
+    "sll %[r1], %[r1], 16 \n\t"
+    "sll %[r2], %[r2], 1 \n\t"
+    "addq_s.w %[r2], %[r2], %[st0_ch1] \n\t"
+    "sll %[r3], %[r3], 1 \n\t"
+    "addq_s.w %[r3], %[r3], %[st0_ch2] \n\t"
+    "sra %[r2], %[r2], 16 \n\t"
+    "mul %[st0_ch1], %[f_ch10], %[r2] \n\t"
+    "sra %[r3], %[r3], 16 \n\t"
+    "mul %[st0_ch2], %[f_ch20], %[r3] \n\t"
+    "mul %[r4], %[r2], %[f_ch11] \n\t"
+    "mul %[r5], %[r3], %[f_ch21] \n\t"
+    "sll %[st0_ch1], %[st0_ch1], 1 \n\t"
+    "subq_s.w %[st0_ch1], %[r0], %[st0_ch1] \n\t"
+    "sll %[st0_ch2], %[st0_ch2], 1 \n\t"
+    "subq_s.w %[st0_ch2], %[r1], %[st0_ch2] \n\t"
+    "sll %[r4], %[r4], 1 \n\t"
+    "addq_s.w %[r4], %[r4], %[st1_ch1] \n\t"
+    "sll %[r5], %[r5], 1 \n\t"
+    "addq_s.w %[r5], %[r5], %[st1_ch2] \n\t"
+    "sra %[r4], %[r4], 16 \n\t"
+    "mul %[r0], %[r4], %[f_ch11] \n\t"
+    "sra %[r5], %[r5], 16 \n\t"
+    "mul %[r1], %[r5], %[f_ch21] \n\t"
+    "sh %[r4], 0(%[data_ch1]) \n\t"
+    "sh %[r5], 0(%[data_ch2]) \n\t"
+    "addiu %[data_ch1], %[data_ch1], 2 \n\t"
+    "sll %[r2], %[r2], 16 \n\t"
+    "sll %[r0], %[r0], 1 \n\t"
+    "subq_s.w %[st1_ch1], %[r2], %[r0] \n\t"
+    "sll %[r3], %[r3], 16 \n\t"
+    "sll %[r1], %[r1], 1 \n\t"
+    "subq_s.w %[st1_ch2], %[r3], %[r1] \n\t"
+    "bgtz %[n], 1b \n\t"
+    " addiu %[data_ch2], %[data_ch2], 2 \n\t"
+    // Store channel states
+    "sw %[st0_ch1], 0(%[filter_state_ch1]) \n\t"
+    "sw %[st1_ch1], 4(%[filter_state_ch1]) \n\t"
+    "sw %[st0_ch2], 0(%[filter_state_ch2]) \n\t"
+    "sw %[st1_ch2], 4(%[filter_state_ch2]) \n\t"
+    ".set pop \n\t"
+    : [f_ch10] "=&r" (f_ch10), [f_ch20] "=&r" (f_ch20),
+      [f_ch11] "=&r" (f_ch11), [f_ch21] "=&r" (f_ch21),
+      [st0_ch1] "=&r" (st0_ch1), [st1_ch1] "=&r" (st1_ch1),
+      [st0_ch2] "=&r" (st0_ch2), [st1_ch2] "=&r" (st1_ch2),
+      [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2),
+      [r3] "=&r" (r3), [r4] "=&r" (r4), [r5] "=&r" (r5),
+      // Advanced/decremented by the loop, so they must be read-write operands.
+      [data_ch1] "+r" (data_ch1), [data_ch2] "+r" (data_ch2),
+      [n] "+r" (n)
+    : [factor_ch1] "r" (factor_ch1), [factor_ch2] "r" (factor_ch2),
+      [filter_state_ch1] "r" (filter_state_ch1),
+      [filter_state_ch2] "r" (filter_state_ch2)
+    : "memory", "hi", "lo"
+  );
+}
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_mips.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_mips.c
new file mode 100644
index 000000000..056dc275d
--- /dev/null
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/filters_mips.c
@@ -0,0 +1,365 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
+
+// MIPS optimized implementation of the Autocorrelation function in fixed point.
+// NOTE! Different from SPLIB-version in how it scales the signal.
+int WebRtcIsacfix_AutocorrMIPS(int32_t* __restrict r,
+                               const int16_t* __restrict x,
+                               int16_t N,
+                               int16_t order,
+                               int16_t* __restrict scale) {
+  int i = 0;
+  int16_t scaling = 0;
+  int16_t* in = (int16_t*)x;
+  int loop_size = (int)(N >> 3);
+  int count = (int)(N & 7);
+  // Declare temporary variables used as register values.
+  int32_t r0, r1, r2, r3;
+#if !defined(MIPS_DSP_R2_LE)
+  // For non-DSPR2 optimizations 4 more registers are used.
+  int32_t r4, r5, r6, r7;
+#endif
+
+  // Calculate r[0] and scaling needed.
+  __asm __volatile (
+    ".set push \n\t"
+    ".set noreorder \n\t"
+    "mult $0, $0 \n\t"
+    // Loop is unrolled 8 times, set accumulator to zero in branch delay slot.
+    "beqz %[loop_size], 2f \n\t"
+    " mult $0, $0 \n\t"
+    "1: \n\t"
+    // Load 8 samples per loop iteration.
+#if defined(MIPS_DSP_R2_LE)
+    "ulw %[r0], 0(%[in]) \n\t"
+    "ulw %[r1], 4(%[in]) \n\t"
+    "ulw %[r2], 8(%[in]) \n\t"
+    "ulw %[r3], 12(%[in]) \n\t"
+#else
+    "lh %[r0], 0(%[in]) \n\t"
+    "lh %[r1], 2(%[in]) \n\t"
+    "lh %[r2], 4(%[in]) \n\t"
+    "lh %[r3], 6(%[in]) \n\t"
+    "lh %[r4], 8(%[in]) \n\t"
+    "lh %[r5], 10(%[in]) \n\t"
+    "lh %[r6], 12(%[in]) \n\t"
+    "lh %[r7], 14(%[in]) \n\t"
+#endif
+    "addiu %[loop_size], %[loop_size], -1 \n\t"
+    // Multiply and accumulate.
+#if defined(MIPS_DSP_R2_LE)
+    "dpa.w.ph $ac0, %[r0], %[r0] \n\t"
+    "dpa.w.ph $ac0, %[r1], %[r1] \n\t"
+    "dpa.w.ph $ac0, %[r2], %[r2] \n\t"
+    "dpa.w.ph $ac0, %[r3], %[r3] \n\t"
+#else
+    "madd %[r0], %[r0] \n\t"
+    "madd %[r1], %[r1] \n\t"
+    "madd %[r2], %[r2] \n\t"
+    "madd %[r3], %[r3] \n\t"
+    "madd %[r4], %[r4] \n\t"
+    "madd %[r5], %[r5] \n\t"
+    "madd %[r6], %[r6] \n\t"
+    "madd %[r7], %[r7] \n\t"
+#endif
+    "bnez %[loop_size], 1b \n\t"
+    " addiu %[in], %[in], 16 \n\t"
+    "2: \n\t"
+    "beqz %[count], 4f \n\t"
+#if defined(MIPS_DSP_R1_LE)
+    " extr.w %[r0], $ac0, 31 \n\t"
+#else
+    " mfhi %[r2] \n\t"
+#endif
+    // Process remaining samples (if any).
+    "3: \n\t"
+    "lh %[r0], 0(%[in]) \n\t"
+    "addiu %[count], %[count], -1 \n\t"
+    "madd %[r0], %[r0] \n\t"
+    "bnez %[count], 3b \n\t"
+    " addiu %[in], %[in], 2 \n\t"
+#if defined(MIPS_DSP_R1_LE)
+    "extr.w %[r0], $ac0, 31 \n\t"
+#else
+    "mfhi %[r2] \n\t"
+#endif
+    "4: \n\t"
+#if !defined(MIPS_DSP_R1_LE)
+    "mflo %[r3] \n\t"
+    "sll %[r0], %[r2], 1 \n\t"
+    "srl %[r1], %[r3], 31 \n\t"
+    "addu %[r0], %[r0], %[r1] \n\t"
+#endif
+    // Calculate scaling (the value of shifting).
+    "clz %[r1], %[r0] \n\t"
+    "addiu %[r1], %[r1], -32 \n\t"
+    "subu %[scaling], $0, %[r1] \n\t"
+    "slti %[r1], %[r0], 0x1 \n\t"
+    "movn %[scaling], $0, %[r1] \n\t"
+#if defined(MIPS_DSP_R1_LE)
+    "extrv.w %[r0], $ac0, %[scaling] \n\t"
+    "mfhi %[r2], $ac0 \n\t"
+#else
+    "addiu %[r1], %[scaling], -32 \n\t"
+    "subu %[r1], $0, %[r1] \n\t"
+    "sllv %[r1], %[r2], %[r1] \n\t"
+    "srlv %[r0], %[r3], %[scaling] \n\t"
+    "addu %[r0], %[r0], %[r1] \n\t"
+#endif
+    "slti %[r1], %[scaling], 32 \n\t"
+    "movz %[r0], %[r2], %[r1] \n\t"
+    ".set pop \n\t"
+    : [loop_size] "+r" (loop_size), [in] "+r" (in), [r0] "=&r" (r0),
+      [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
+#if !defined(MIPS_DSP_R2_LE)
+      [r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
+#endif
+      [count] "+r" (count), [scaling] "=r" (scaling)
+    : [N] "r" (N)
+    : "memory", "hi", "lo"
+  );
+  r[0] = r0;
+
+  // Correlation calculation is divided in 3 cases depending on the scaling
+  // value (different accumulator manipulation needed). Three slightly different
+  // loops are written in order to avoid branches inside the loop.
+  if (scaling == 0) {
+    // In this case, the result will be in low part of the accumulator.
+    for (i = 1; i < order + 1; i++) {
+      in = (int16_t*)x;
+      int16_t* in1 = (int16_t*)x + i;
+      count = N - i;
+      loop_size = (count) >> 2;
+      __asm __volatile (
+        ".set push \n\t"
+        ".set noreorder \n\t"
+        "mult $0, $0 \n\t"
+        "beqz %[loop_size], 2f \n\t"
+        " andi %[count], %[count], 0x3 \n\t"
+        // Loop processing 4 pairs of samples per iteration.
+        "1: \n\t"
+#if defined(MIPS_DSP_R2_LE)
+        "ulw %[r0], 0(%[in]) \n\t"
+        "ulw %[r1], 0(%[in1]) \n\t"
+        "ulw %[r2], 4(%[in]) \n\t"
+        "ulw %[r3], 4(%[in1]) \n\t"
+#else
+        "lh %[r0], 0(%[in]) \n\t"
+        "lh %[r1], 0(%[in1]) \n\t"
+        "lh %[r2], 2(%[in]) \n\t"
+        "lh %[r3], 2(%[in1]) \n\t"
+        "lh %[r4], 4(%[in]) \n\t"
+        "lh %[r5], 4(%[in1]) \n\t"
+        "lh %[r6], 6(%[in]) \n\t"
+        "lh %[r7], 6(%[in1]) \n\t"
+#endif
+        "addiu %[loop_size], %[loop_size], -1 \n\t"
+#if defined(MIPS_DSP_R2_LE)
+        "dpa.w.ph $ac0, %[r0], %[r1] \n\t"
+        "dpa.w.ph $ac0, %[r2], %[r3] \n\t"
+#else
+        "madd %[r0], %[r1] \n\t"
+        "madd %[r2], %[r3] \n\t"
+        "madd %[r4], %[r5] \n\t"
+        "madd %[r6], %[r7] \n\t"
+#endif
+        "addiu %[in], %[in], 8 \n\t"
+        "bnez %[loop_size], 1b \n\t"
+        " addiu %[in1], %[in1], 8 \n\t"
+        "2: \n\t"
+        "beqz %[count], 4f \n\t"
+        " mflo %[r0] \n\t"
+        // Process remaining samples (if any).
+        "3: \n\t"
+        "lh %[r0], 0(%[in]) \n\t"
+        "lh %[r1], 0(%[in1]) \n\t"
+        "addiu %[count], %[count], -1 \n\t"
+        "addiu %[in], %[in], 2 \n\t"
+        "madd %[r0], %[r1] \n\t"
+        "bnez %[count], 3b \n\t"
+        " addiu %[in1], %[in1], 2 \n\t"
+        "mflo %[r0] \n\t"
+        "4: \n\t"
+        ".set pop \n\t"
+        : [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
+#if !defined(MIPS_DSP_R2_LE)
+          [r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
+#endif
+          [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
+          [count] "+r" (count)
+        :
+        : "memory", "hi", "lo"
+      );
+      r[i] = r0;
+    }
+  } else if (scaling == 32) {
+    // In this case, the result will be high part of the accumulator.
+    for (i = 1; i < order + 1; i++) {
+      in = (int16_t*)x;
+      int16_t* in1 = (int16_t*)x + i;
+      count = N - i;
+      loop_size = (count) >> 2;
+      __asm __volatile (
+        ".set push \n\t"
+        ".set noreorder \n\t"
+        "mult $0, $0 \n\t"
+        "beqz %[loop_size], 2f \n\t"
+        " andi %[count], %[count], 0x3 \n\t"
+        // Loop processing 4 pairs of samples per iteration.
+        "1: \n\t"
+#if defined(MIPS_DSP_R2_LE)
+        "ulw %[r0], 0(%[in]) \n\t"
+        "ulw %[r1], 0(%[in1]) \n\t"
+        "ulw %[r2], 4(%[in]) \n\t"
+        "ulw %[r3], 4(%[in1]) \n\t"
+#else
+        "lh %[r0], 0(%[in]) \n\t"
+        "lh %[r1], 0(%[in1]) \n\t"
+        "lh %[r2], 2(%[in]) \n\t"
+        "lh %[r3], 2(%[in1]) \n\t"
+        "lh %[r4], 4(%[in]) \n\t"
+        "lh %[r5], 4(%[in1]) \n\t"
+        "lh %[r6], 6(%[in]) \n\t"
+        "lh %[r7], 6(%[in1]) \n\t"
+#endif
+        "addiu %[loop_size], %[loop_size], -1 \n\t"
+#if defined(MIPS_DSP_R2_LE)
+        "dpa.w.ph $ac0, %[r0], %[r1] \n\t"
+        "dpa.w.ph $ac0, %[r2], %[r3] \n\t"
+#else
+        "madd %[r0], %[r1] \n\t"
+        "madd %[r2], %[r3] \n\t"
+        "madd %[r4], %[r5] \n\t"
+        "madd %[r6], %[r7] \n\t"
+#endif
+        "addiu %[in], %[in], 8 \n\t"
+        "bnez %[loop_size], 1b \n\t"
+        " addiu %[in1], %[in1], 8 \n\t"
+        "2: \n\t"
+        "beqz %[count], 4f \n\t"
+        " mfhi %[r0] \n\t"
+        // Process remaining samples (if any).
+        "3: \n\t"
+        "lh %[r0], 0(%[in]) \n\t"
+        "lh %[r1], 0(%[in1]) \n\t"
+        "addiu %[count], %[count], -1 \n\t"
+        "addiu %[in], %[in], 2 \n\t"
+        "madd %[r0], %[r1] \n\t"
+        "bnez %[count], 3b \n\t"
+        " addiu %[in1], %[in1], 2 \n\t"
+        "mfhi %[r0] \n\t"
+        "4: \n\t"
+        ".set pop \n\t"
+        : [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
+#if !defined(MIPS_DSP_R2_LE)
+          [r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
+#endif
+          [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
+          [count] "+r" (count)
+        :
+        : "memory", "hi", "lo"
+      );
+      r[i] = r0;
+    }
+  } else {
+    // In this case, the result is obtained by combining low and high parts
+    // of the accumulator.
+#if !defined(MIPS_DSP_R1_LE)
+    int32_t tmp_shift = 32 - scaling;
+#endif
+    for (i = 1; i < order + 1; i++) {
+      in = (int16_t*)x;
+      int16_t* in1 = (int16_t*)x + i;
+      count = N - i;
+      loop_size = (count) >> 2;
+      __asm __volatile (
+        ".set push \n\t"
+        ".set noreorder \n\t"
+        "mult $0, $0 \n\t"
+        "beqz %[loop_size], 2f \n\t"
+        " andi %[count], %[count], 0x3 \n\t"
+        "1: \n\t"
+#if defined(MIPS_DSP_R2_LE)
+        "ulw %[r0], 0(%[in]) \n\t"
+        "ulw %[r1], 0(%[in1]) \n\t"
+        "ulw %[r2], 4(%[in]) \n\t"
+        "ulw %[r3], 4(%[in1]) \n\t"
+#else
+        "lh %[r0], 0(%[in]) \n\t"
+        "lh %[r1], 0(%[in1]) \n\t"
+        "lh %[r2], 2(%[in]) \n\t"
+        "lh %[r3], 2(%[in1]) \n\t"
+        "lh %[r4], 4(%[in]) \n\t"
+        "lh %[r5], 4(%[in1]) \n\t"
+        "lh %[r6], 6(%[in]) \n\t"
+        "lh %[r7], 6(%[in1]) \n\t"
+#endif
+        "addiu %[loop_size], %[loop_size], -1 \n\t"
+#if defined(MIPS_DSP_R2_LE)
+        "dpa.w.ph $ac0, %[r0], %[r1] \n\t"
+        "dpa.w.ph $ac0, %[r2], %[r3] \n\t"
+#else
+        "madd %[r0], %[r1] \n\t"
+        "madd %[r2], %[r3] \n\t"
+        "madd %[r4], %[r5] \n\t"
+        "madd %[r6], %[r7] \n\t"
+#endif
+        "addiu %[in], %[in], 8 \n\t"
+        "bnez %[loop_size], 1b \n\t"
+        " addiu %[in1], %[in1], 8 \n\t"
+        "2: \n\t"
+        "beqz %[count], 4f \n\t"
+#if defined(MIPS_DSP_R1_LE)
+        " extrv.w %[r0], $ac0, %[scaling] \n\t"
+#else
+        " mfhi %[r0] \n\t"
+#endif
+        "3: \n\t"
+        "lh %[r0], 0(%[in]) \n\t"
+        "lh %[r1], 0(%[in1]) \n\t"
+        "addiu %[count], %[count], -1 \n\t"
+        "addiu %[in], %[in], 2 \n\t"
+        "madd %[r0], %[r1] \n\t"
+        "bnez %[count], 3b \n\t"
+        " addiu %[in1], %[in1], 2 \n\t"
+#if defined(MIPS_DSP_R1_LE)
+        "extrv.w %[r0], $ac0, %[scaling] \n\t"
+#else
+        "mfhi %[r0] \n\t"
+#endif
+        "4: \n\t"
+#if !defined(MIPS_DSP_R1_LE)
+        "mflo %[r1] \n\t"
+        "sllv %[r0], %[r0], %[tmp_shift] \n\t"
+        "srlv %[r1], %[r1], %[scaling] \n\t"
+        "addu %[r0], %[r0], %[r1] \n\t"
+#endif
+        ".set pop \n\t"
+        : [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
+#if !defined(MIPS_DSP_R2_LE)
+          [r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
+#endif
+          [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
+          [count] "+r" (count)
+        : [scaling] "r" (scaling)
+#if !defined(MIPS_DSP_R1_LE)
+          , [tmp_shift] "r" (tmp_shift)
+#endif
+        : "memory", "hi", "lo"
+      );
+      r[i] = r0;
+    }
+  }
+  *scale = scaling;
+
+  return (order + 1);
+}
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c
index 30e6f67e7..688ec07a5 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.c
@@ -179,7 +179,7 @@ int16_t WebRtcIsacfix_FreeInternal(ISACFIX_MainStruct *ISAC_main_inst)
 }
 
 /****************************************************************************
- * WebRtcAecm_InitNeon(...)
+ * WebRtcIsacfix_InitNeon(...)
  *
  * This function initializes function pointers for ARM Neon platform.
  */
@@ -199,6 +199,23 @@ static void WebRtcIsacfix_InitNeon(void) {
 }
 #endif
 
+/****************************************************************************
+ * WebRtcIsacfix_InitMIPS(...)
+ *
+ * This function initializes function pointers for MIPS platform.
+ */
+
+#if defined(MIPS32_LE)
+static void WebRtcIsacfix_InitMIPS(void) {
+  WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrMIPS;
+  WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopMIPS;
+#if defined(MIPS_DSP_R1_LE)
+  WebRtcIsacfix_AllpassFilter2FixDec16 =
+      WebRtcIsacfix_AllpassFilter2FixDec16MIPS;
+#endif
+}
+#endif
+
 /****************************************************************************
  * WebRtcIsacfix_EncoderInit(...)
  *
@@ -296,6 +313,10 @@ int16_t WebRtcIsacfix_EncoderInit(ISACFIX_MainStruct *ISAC_main_inst,
   WebRtcIsacfix_InitNeon();
 #endif
 
+#if defined(MIPS32_LE)
+  WebRtcIsacfix_InitMIPS();
+#endif
+
   return statusInit;
 }
 
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi
index 87c98606a..a18a803d6 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/isacfix.gypi
@@ -85,6 +85,30 @@
             'pitch_filter_c.c',
           ],
         }],
+        ['target_arch=="mipsel"', {
+          'sources': [
+            'filters_mips.c',
+            'lattice_mips.c',
+          ],
+          'sources!': [
+            'lattice_c.c',
+          ],
+          'conditions': [
+            ['mips_dsp_rev>0', {
+              'sources': [
+                'filterbanks_mips.c',
+              ],
+            }],
+            ['mips_dsp_rev>1', {
+              'sources': [
+                'pitch_filter_mips.c',
+              ],
+              'sources!': [
+                'pitch_filter_c.c',
+              ],
+            }],
+          ],
+        }],
       ],
     },
   ],
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_mips.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_mips.c
new file mode 100644
index 000000000..c59692216
--- /dev/null
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_mips.c
@@ -0,0 +1,329 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
+#include "webrtc/typedefs.h"
+
+// Filter ar_g_Q0[] and ar_f_Q0[] through an AR filter with coefficients
+// cth_Q15[] and sth_Q15[].
+void WebRtcIsacfix_FilterArLoop(int16_t* ar_g_Q0,     // Input samples
+                                int16_t* ar_f_Q0,     // Input samples
+                                int16_t* cth_Q15,     // Filter coefficients
+                                int16_t* sth_Q15,     // Filter coefficients
+                                int16_t order_coef) { // order of the filter
+  int n = 0;
+
+  for (n = 0; n < HALF_SUBFRAMELEN - 1; n++) {
+    int count = order_coef - 1;
+    int offset;
+#if !defined(MIPS_DSP_R1_LE)
+    int16_t* tmp_cth;
+    int16_t* tmp_sth;
+    int16_t* tmp_arg;
+    int32_t max_q16 = 0x7fff;
+    int32_t min_q16 = 0xffff8000;
+#endif
+    // Declare variables used as temporary registers.
+    int32_t r0, r1, r2, t0, t1, t2, t_ar;
+
+    __asm __volatile (
+      ".set push \n\t"
+      ".set noreorder \n\t"
+      "bltz %[count], 2f \n\t"
+      " lh %[t_ar], 0(%[tmp]) \n\t"
+      // Inner loop
+      "1: \n\t"
+      "sll %[offset], %[count], 1 \n\t"
+#if defined(MIPS_DSP_R1_LE)
+      "lhx %[r0], %[offset](%[cth_Q15]) \n\t"
+      "lhx %[r1], %[offset](%[sth_Q15]) \n\t"
+      "lhx %[r2], %[offset](%[ar_g_Q0]) \n\t"
+#else
+      "addu %[tmp_cth], %[cth_Q15], %[offset] \n\t"
+      "addu %[tmp_sth], %[sth_Q15], %[offset] \n\t"
+      "addu %[tmp_arg], %[ar_g_Q0], %[offset] \n\t"
+      "lh %[r0], 0(%[tmp_cth]) \n\t"
+      "lh %[r1], 0(%[tmp_sth]) \n\t"
+      "lh %[r2], 0(%[tmp_arg]) \n\t"
+#endif
+      "mul %[t0], %[r0], %[t_ar] \n\t"
+      "mul %[t1], %[r1], %[t_ar] \n\t"
+      "mul %[t2], %[r1], %[r2] \n\t"
+      "mul %[r0], %[r0], %[r2] \n\t"
+      "subu %[t0], %[t0], %[t2] \n\t"
+      "addu %[t1], %[t1], %[r0] \n\t"
+#if defined(MIPS_DSP_R1_LE)
+      "shra_r.w %[t1], %[t1], 15 \n\t"
+      "shra_r.w %[t0], %[t0], 15 \n\t"
+#else
+      "addiu %[t1], %[t1], 0x4000 \n\t"
+      "sra %[t1], %[t1], 15 \n\t"
+      "addiu %[t0], %[t0], 0x4000 \n\t"
+      "sra %[t0], %[t0], 15 \n\t"
+#endif
+      "addiu %[offset], %[offset], 2 \n\t"
+#if defined(MIPS_DSP_R1_LE)
+      "shll_s.w %[t1], %[t1], 16 \n\t"
+      "shll_s.w %[t_ar], %[t0], 16 \n\t"
+#else
+      "slt %[r0], %[t1], %[max_q16] \n\t"
+      "slt %[r1], %[t0], %[max_q16] \n\t"
+      "movz %[t1], %[max_q16], %[r0] \n\t"
+      "movz %[t0], %[max_q16], %[r1] \n\t"
+#endif
+      "addu %[offset], %[offset], %[ar_g_Q0] \n\t"
+#if defined(MIPS_DSP_R1_LE)
+      "sra %[t1], %[t1], 16 \n\t"
+      "sra %[t_ar], %[t_ar], 16 \n\t"
+#else
+      "slt %[r0], %[t1], %[min_q16] \n\t"
+      "slt %[r1], %[t0], %[min_q16] \n\t"
+      "movn %[t1], %[min_q16], %[r0] \n\t"
+      "movn %[t0], %[min_q16], %[r1] \n\t"
+      "addu %[t_ar], $zero, %[t0] \n\t"
+#endif
+      "sh %[t1], 0(%[offset]) \n\t"
+      "bgtz %[count], 1b \n\t"
+      " addiu %[count], %[count], -1 \n\t"
+      "2: \n\t"
+      "sh %[t_ar], 0(%[tmp]) \n\t"
+      "sh %[t_ar], 0(%[ar_g_Q0]) \n\t"
+      ".set pop \n\t"
+      : [t_ar] "=&r" (t_ar), [count] "+r" (count), [offset] "=&r" (offset),
+        [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [t0] "=&r" (t0),
+#if !defined(MIPS_DSP_R1_LE)
+        [tmp_cth] "=&r" (tmp_cth), [tmp_sth] "=&r" (tmp_sth),
+        [tmp_arg] "=&r" (tmp_arg),
+#endif
+        [t1] "=&r" (t1), [t2] "=&r" (t2)
+      : [tmp] "r" (&ar_f_Q0[n+1]), [cth_Q15] "r" (cth_Q15),
+#if !defined(MIPS_DSP_R1_LE)
+        [max_q16] "r" (max_q16), [min_q16] "r" (min_q16),
+#endif
+        [sth_Q15] "r" (sth_Q15), [ar_g_Q0] "r" (ar_g_Q0)
+      : "memory", "hi", "lo"
+    );
+  }
+}
+
+// MIPS optimization of the inner loop used for function
+// WebRtcIsacfix_NormLatticeFilterMa(). It does:
+//
+// for 0 <= n < HALF_SUBFRAMELEN - 1:
+//   *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
+//   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
+//
+// Note: the DSPR2 variant is not bit-exact with WebRtcIsacfix_FilterMaLoopC (its
+// accuracy is the same or better); the non-DSPR2 variant is bit-exact.
+void WebRtcIsacfix_FilterMaLoopMIPS(int16_t input0,  // Filter coefficient
+                                    int16_t input1,  // Filter coefficient
+                                    int32_t input2,  // Inverse coeff (1/input1)
+                                    int32_t* ptr0,   // Sample buffer
+                                    int32_t* ptr1,   // Sample buffer
+                                    int32_t* ptr2) { // Sample buffer
+#if defined(MIPS_DSP_R2_LE)
+  // MIPS DSPR2 version. 4 available accumulators allows loop unrolling 4 times.
+  // This variant is not bit-exact with WebRtcIsacfix_FilterMaLoopC, since we
+  // are exploiting 64-bit accumulators. The accuracy of the MIPS DSPR2 function
+  // is same or better.
+  int n = (HALF_SUBFRAMELEN - 1) >> 2;
+  int m = (HALF_SUBFRAMELEN - 1) & 3;
+
+  int r0, r1, r2, r3;
+  int t0, t1, t2, t3;
+  int s0, s1, s2, s3;
+
+  __asm __volatile (
+    ".set push \n\t"
+    ".set noreorder \n\t"
+    "1: \n\t"
+    "lw %[r0], 0(%[ptr0]) \n\t"
+    "lw %[r1], 4(%[ptr0]) \n\t"
+    "lw %[r2], 8(%[ptr0]) \n\t"
+    "lw %[r3], 12(%[ptr0]) \n\t"
+    "mult $ac0, %[r0], %[input0] \n\t"
+    "mult $ac1, %[r1], %[input0] \n\t"
+    "mult $ac2, %[r2], %[input0] \n\t"
+    "mult $ac3, %[r3], %[input0] \n\t"
+    "lw %[t0], 0(%[ptr2]) \n\t"
+    "extr_rs.w %[s0], $ac0, 15 \n\t"
+    "extr_rs.w %[s1], $ac1, 15 \n\t"
+    "extr_rs.w %[s2], $ac2, 15 \n\t"
+    "extr_rs.w %[s3], $ac3, 15 \n\t"
+    "lw %[t1], 4(%[ptr2]) \n\t"
+    "lw %[t2], 8(%[ptr2]) \n\t"
+    "lw %[t3], 12(%[ptr2]) \n\t"
+    "addu %[t0], %[t0], %[s0] \n\t"
+    "addu %[t1], %[t1], %[s1] \n\t"
+    "addu %[t2], %[t2], %[s2] \n\t"
+    "addu %[t3], %[t3], %[s3] \n\t"
+    "mult $ac0, %[t0], %[input2] \n\t"
+    "mult $ac1, %[t1], %[input2] \n\t"
+    "mult $ac2, %[t2], %[input2] \n\t"
+    "mult $ac3, %[t3], %[input2] \n\t"
+    "addiu %[ptr0], %[ptr0], 16 \n\t"
+    "extr_rs.w %[t0], $ac0, 16 \n\t"
+    "extr_rs.w %[t1], $ac1, 16 \n\t"
+    "extr_rs.w %[t2], $ac2, 16 \n\t"
+    "extr_rs.w %[t3], $ac3, 16 \n\t"
+    "addiu %[n], %[n], -1 \n\t"
+    "mult $ac0, %[r0], %[input1] \n\t"
+    "mult $ac1, %[r1], %[input1] \n\t"
+    "mult $ac2, %[r2], %[input1] \n\t"
+    "mult $ac3, %[r3], %[input1] \n\t"
+    "sw %[t0], 0(%[ptr2]) \n\t"
+    "extr_rs.w %[s0], $ac0, 15 \n\t"
+    "extr_rs.w %[s1], $ac1, 15 \n\t"
+    "extr_rs.w %[s2], $ac2, 15 \n\t"
+    "extr_rs.w %[s3], $ac3, 15 \n\t"
+    "sw %[t1], 4(%[ptr2]) \n\t"
+    "sw %[t2], 8(%[ptr2]) \n\t"
+    "sw %[t3], 12(%[ptr2]) \n\t"
+    "mult $ac0, %[t0], %[input0] \n\t"
+    "mult $ac1, %[t1], %[input0] \n\t"
+    "mult $ac2, %[t2], %[input0] \n\t"
+    "mult $ac3, %[t3], %[input0] \n\t"
+    "addiu %[ptr2], %[ptr2], 16 \n\t"
+    "extr_rs.w %[t0], $ac0, 15 \n\t"
+    "extr_rs.w %[t1], $ac1, 15 \n\t"
+    "extr_rs.w %[t2], $ac2, 15 \n\t"
+    "extr_rs.w %[t3], $ac3, 15 \n\t"
+    "addu %[t0], %[t0], %[s0] \n\t"
+    "addu %[t1], %[t1], %[s1] \n\t"
+    "addu %[t2], %[t2], %[s2] \n\t"
+    "addu %[t3], %[t3], %[s3] \n\t"
+    "sw %[t0], 0(%[ptr1]) \n\t"
+    "sw %[t1], 4(%[ptr1]) \n\t"
+    "sw %[t2], 8(%[ptr1]) \n\t"
+    "sw %[t3], 12(%[ptr1]) \n\t"
+    "bgtz %[n], 1b \n\t"
+    " addiu %[ptr1], %[ptr1], 16 \n\t"
+    // Skip the remainder loop if m == 0 ($0 is the zero register).
+    "beq %[m], $0, 3f \n\t"
+    " nop \n\t"
+    "2: \n\t"
+    "lw %[r0], 0(%[ptr0]) \n\t"
+    "lw %[t0], 0(%[ptr2]) \n\t"
+    "addiu %[ptr0], %[ptr0], 4 \n\t"
+    "mult $ac0, %[r0], %[input0] \n\t"
+    "mult $ac1, %[r0], %[input1] \n\t"
+    "extr_rs.w %[r1], $ac0, 15 \n\t"
+    "extr_rs.w %[t1], $ac1, 15 \n\t"
+    "addu %[t0], %[t0], %[r1] \n\t"
+    "mult $ac0, %[t0], %[input2] \n\t"
+    "extr_rs.w %[t0], $ac0, 16 \n\t"
+    "sw %[t0], 0(%[ptr2]) \n\t"
+    "mult $ac0, %[t0], %[input0] \n\t"
+    "addiu %[ptr2], %[ptr2], 4 \n\t"
+    "addiu %[m], %[m], -1 \n\t"
+    "extr_rs.w %[t0], $ac0, 15 \n\t"
+    "addu %[t0], %[t0], %[t1] \n\t"
+    "sw %[t0], 0(%[ptr1]) \n\t"
+    "bgtz %[m], 2b \n\t"
+    " addiu %[ptr1], %[ptr1], 4 \n\t"
+    "3: \n\t"
+    ".set pop \n\t"
+    : [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2),
+      [r3] "=&r" (r3), [t0] "=&r" (t0), [t1] "=&r" (t1),
+      [t2] "=&r" (t2), [t3] "=&r" (t3), [s0] "=&r" (s0),
+      [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3),
+      [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [m] "+r" (m),
+      [ptr2] "+r" (ptr2), [n] "+r" (n)
+    : [input0] "r" (input0), [input1] "r" (input1),
+      [input2] "r" (input2)
+    : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi",
+      "$ac2lo", "$ac3hi", "$ac3lo"
+  );
+#else
+  // Non-DSPR2 version of the function. Avoiding the accumulator usage due to
+  // large latencies. This variant is bit-exact with C code.
+  int n = HALF_SUBFRAMELEN - 1;
+  int32_t t16a, t16b;
+  int32_t r0, r1, r2, r3, r4;
+
+  __asm __volatile (
+    ".set push \n\t"
+    ".set noreorder \n\t"
+    "sra %[t16a], %[input2], 16 \n\t"
+    "andi %[t16b], %[input2], 0xFFFF \n\t"
+#if defined(MIPS32R2_LE)
+    "seh %[t16b], %[t16b] \n\t"
+    "seh %[input0], %[input0] \n\t"
+    "seh %[input1], %[input1] \n\t"
+#else
+    "sll %[t16b], %[t16b], 16 \n\t"
+    "sra %[t16b], %[t16b], 16 \n\t"
+    "sll %[input0], %[input0], 16 \n\t"
+    "sra %[input0], %[input0], 16 \n\t"
+    "sll %[input1], %[input1], 16 \n\t"
+    "sra %[input1], %[input1], 16 \n\t"
+#endif
+    "addiu %[r0], %[t16a], 1 \n\t"
+    "slt %[r1], %[t16b], $zero \n\t"
+    "movn %[t16a], %[r0], %[r1] \n\t"
+    "1: \n\t"
+    "lw %[r0], 0(%[ptr0]) \n\t"
+    "lw %[r1], 0(%[ptr2]) \n\t"
+    "addiu %[ptr0], %[ptr0], 4 \n\t"
+    "sra %[r2], %[r0], 16 \n\t"
+    "andi %[r0], %[r0], 0xFFFF \n\t"
+    "mul %[r3], %[r2], %[input0] \n\t"
+    "mul %[r4], %[r0], %[input0] \n\t"
+    "mul %[r2], %[r2], %[input1] \n\t"
+    "mul %[r0], %[r0], %[input1] \n\t"
+    "addiu %[ptr2], %[ptr2], 4 \n\t"
+    "sll %[r3], %[r3], 1 \n\t"
+    "sra %[r4], %[r4], 1 \n\t"
+    "addiu %[r4], %[r4], 0x2000 \n\t"
+    "sra %[r4], %[r4], 14 \n\t"
+    "addu %[r3], %[r3], %[r4] \n\t"
+    "addu %[r1], %[r1], %[r3] \n\t"
+    "sra %[r3], %[r1], 16 \n\t"
+    "andi %[r4], %[r1], 0xFFFF \n\t"
+    "sra %[r4], %[r4], 1 \n\t"
+    "mul %[r1], %[r1], %[t16a] \n\t"
+    "mul %[r3], %[r3], %[t16b] \n\t"
+    "mul %[r4], %[r4], %[t16b] \n\t"
+    "sll %[r2], %[r2], 1 \n\t"
+    "sra %[r0], %[r0], 1 \n\t"
+    "addiu %[r0], %[r0], 0x2000 \n\t"
+    "sra %[r0], %[r0], 14 \n\t"
+    "addu %[r0], %[r0], %[r2] \n\t"
+    "addiu %[n], %[n], -1 \n\t"
+    "addu %[r1], %[r1], %[r3] \n\t"
+    "addiu %[r4], %[r4], 0x4000 \n\t"
+    "sra %[r4], %[r4], 15 \n\t"
+    "addu %[r1], %[r1], %[r4] \n\t"
+    "sra %[r2], %[r1], 16 \n\t"
+    "andi %[r3], %[r1], 0xFFFF \n\t"
+    "mul %[r3], %[r3], %[input0] \n\t"
+    "mul %[r2], %[r2], %[input0] \n\t"
+    "sw %[r1], -4(%[ptr2]) \n\t"
+    "sra %[r3], %[r3], 1 \n\t"
+    "addiu %[r3], %[r3], 0x2000 \n\t"
+    "sra %[r3], %[r3], 14 \n\t"
+    "addu %[r0], %[r0], %[r3] \n\t"
+    "sll %[r2], %[r2], 1 \n\t"
+    "addu %[r0], %[r0], %[r2] \n\t"
+    "sw %[r0], 0(%[ptr1]) \n\t"
+    "bgtz %[n], 1b \n\t"
+    " addiu %[ptr1], %[ptr1], 4 \n\t"
+    ".set pop \n\t"
+    : [t16a] "=&r" (t16a), [t16b] "=&r" (t16b), [r0] "=&r" (r0),
+      [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
+      [r4] "=&r" (r4), [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1),
+      [ptr2] "+r" (ptr2), [n] "+r" (n),
+      // input0/input1 are sign-extended in place, hence read-write.
+      [input0] "+r" (input0), [input1] "+r" (input1)
+    : [input2] "r" (input2)
+    : "hi", "lo", "memory"
+  );
+#endif
+}
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_mips.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_mips.c
new file mode 100644
index 000000000..8334f7eb1
--- /dev/null
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_filter_mips.c
@@ -0,0 +1,133 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_coding/codecs/isac/fix/source/pitch_estimator.h"
+
+void WebRtcIsacfix_PitchFilterCore(int loopNumber,
+                                   int16_t gain,
+                                   int index,
+                                   int16_t sign,
+                                   int16_t* inputState,
+                                   int16_t* outputBuf2,
+                                   const int16_t* coefficient,
+                                   int16_t* inputBuf,
+                                   int16_t* outputBuf,
+                                   int* index2) {
+  int ind2t = *index2;
+  int i = 0;
+  int16_t* out2_pos2 = &outputBuf2[PITCH_BUFFSIZE - (index + 2)] + ind2t;
+  int32_t w1, w2, w3, w4, w5, gain32, sign32;
+  int32_t coef1, coef2, coef3, coef4, coef5 = 0;
+  // Define damp factors as int32_t (pair of int16_t)
+  int32_t kDampF0 = 0x0000F70A;
+  int32_t kDampF1 = 0x51EC2000;
+  int32_t kDampF2 = 0xF70A2000;
+  int16_t* input1 = inputBuf + ind2t;
+  int16_t* output1 = outputBuf + ind2t;
+  int16_t* output2 = outputBuf2 + ind2t + PITCH_BUFFSIZE;
+
+  // Load coefficients outside the loop and sign-extend gain and sign
+  __asm __volatile (
+    ".set push \n\t"
+    ".set noreorder \n\t"
+    "lwl %[coef1], 3(%[coefficient]) \n\t"
+    "lwl %[coef2], 7(%[coefficient]) \n\t"
+    "lwl %[coef3], 11(%[coefficient]) \n\t"
+    "lwl %[coef4], 15(%[coefficient]) \n\t"
+    "lwr %[coef1], 0(%[coefficient]) \n\t"
+    "lwr %[coef2], 4(%[coefficient]) \n\t"
+    "lwr %[coef3], 8(%[coefficient]) \n\t"
+    "lwr %[coef4], 12(%[coefficient]) \n\t"
+    "lhu %[coef5], 16(%[coefficient]) \n\t"
+    "seh %[gain32], %[gain] \n\t"
+    "seh %[sign32], %[sign] \n\t"
+    ".set pop \n\t"
+    : [coef1] "=&r" (coef1), [coef2] "=&r" (coef2), [coef3] "=&r" (coef3),
+      [coef4] "=&r" (coef4), [coef5] "=&r" (coef5), [gain32] "=&r" (gain32),
+      [sign32] "=&r" (sign32)
+    : [coefficient] "r" (coefficient), [gain] "r" (gain),
+      [sign] "r" (sign)
+    : "memory"
+  );
+
+  for (i = 0; i < loopNumber; i++) {
+    __asm __volatile (
+      ".set push \n\t"
+      ".set noreorder \n\t"
+      // Filter to get fractional pitch
+      "li %[w1], 8192 \n\t"
+      "mtlo %[w1] \n\t"
+      "mthi $0 \n\t"
+      "lwl %[w1], 3(%[out2_pos2]) \n\t"
+      "lwl %[w2], 7(%[out2_pos2]) \n\t"
+      "lwl %[w3], 11(%[out2_pos2]) \n\t"
+      "lwl %[w4], 15(%[out2_pos2]) \n\t"
+      "lwr %[w1], 0(%[out2_pos2]) \n\t"
+      "lwr %[w2], 4(%[out2_pos2]) \n\t"
+      "lwr %[w3], 8(%[out2_pos2]) \n\t"
+      "lwr %[w4], 12(%[out2_pos2]) \n\t"
+      "lhu %[w5], 16(%[out2_pos2]) \n\t"
+      "dpa.w.ph $ac0, %[w1], %[coef1] \n\t"
+      "dpa.w.ph $ac0, %[w2], %[coef2] \n\t"
+      "dpa.w.ph $ac0, %[w3], %[coef3] \n\t"
+      "dpa.w.ph $ac0, %[w4], %[coef4] \n\t"
+      "dpa.w.ph $ac0, %[w5], %[coef5] \n\t"
+      "addiu %[out2_pos2], %[out2_pos2], 2 \n\t"
+      "mthi $0, $ac1 \n\t"
+      "lwl %[w2], 3(%[inputState]) \n\t"
+      "lwl %[w3], 7(%[inputState]) \n\t"
+      // Fractional pitch shift & saturation
+      "extr_s.h %[w1], $ac0, 14 \n\t"
+      "li %[w4], 16384 \n\t"
+      "lwr %[w2], 0(%[inputState]) \n\t"
+      "lwr %[w3], 4(%[inputState]) \n\t"
+      "mtlo %[w4], $ac1 \n\t"
+      // Shift low pass filter state
+      "swl %[w2], 5(%[inputState]) \n\t"
+      "swl %[w3], 9(%[inputState]) \n\t"
+      "mul %[w1], %[gain32], %[w1] \n\t"
+      "swr %[w2], 2(%[inputState]) \n\t"
+      "swr %[w3], 6(%[inputState]) \n\t"
+      // Low pass filter accumulation
+      "dpa.w.ph $ac1, %[kDampF1], %[w2] \n\t"
+      "dpa.w.ph $ac1, %[kDampF2], %[w3] \n\t"
+      "lh %[w4], 0(%[input1]) \n\t"
+      "addiu %[input1], %[input1], 2 \n\t"
+      "shra_r.w %[w1], %[w1], 12 \n\t"
+      "sh %[w1], 0(%[inputState]) \n\t"
+      "dpa.w.ph $ac1, %[kDampF0], %[w1] \n\t"
+      // Low pass filter shift & saturation
+      "extr_s.h %[w2], $ac1, 15 \n\t"
+      "mul %[w2], %[w2], %[sign32] \n\t"
+      // Buffer update
+      "subu %[w2], %[w4], %[w2] \n\t"
+      "shll_s.w %[w2], %[w2], 16 \n\t"
+      "sra %[w2], %[w2], 16 \n\t"
+      "sh %[w2], 0(%[output1]) \n\t"
+      "addu %[w2], %[w2], %[w4] \n\t"
+      "shll_s.w %[w2], %[w2], 16 \n\t"
+      "addiu %[output1], %[output1], 2 \n\t"
+      "sra %[w2], %[w2], 16 \n\t"
+      "sh %[w2], 0(%[output2]) \n\t"
+      "addiu %[output2], %[output2], 2 \n\t"
+      ".set pop \n\t"
+      : [w1] "=&r" (w1), [w2] "=&r" (w2), [w3] "=&r" (w3), [w4] "=&r" (w4),
+        [w5] "=&r" (w5), [input1] "+r" (input1), [out2_pos2] "+r" (out2_pos2),
+        [output1] "+r" (output1), [output2] "+r" (output2)
+      : [coefficient] "r" (coefficient), [inputState] "r" (inputState),
+        [gain32] "r" (gain32), [sign32] "r" (sign32), [kDampF0] "r" (kDampF0),
+        [kDampF1] "r" (kDampF1), [kDampF2] "r" (kDampF2),
+        [coef1] "r" (coef1), [coef2] "r" (coef2), [coef3] "r" (coef3),
+        [coef4] "r" (coef4), [coef5] "r" (coef5)
+      : "hi", "lo", "$ac1hi", "$ac1lo", "memory"
+    );
+  }
+  (*index2) += loopNumber;
+}