MIPS optimizations for ISAC (patch #3)
Implemented functions: - WebRtcIsacfix_MatrixProduct1 - WebRtcIsacfix_MatrixProduct2 The optimizations are bit-exact to the C code. R=andrew@webrtc.org, tina.legrand@webrtc.org Review URL: https://webrtc-codereview.appspot.com/18019004 Patch from Ljubomir Papuga <lpapuga@mips.com>. git-svn-id: http://webrtc.googlecode.com/svn/trunk@6919 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
@ -166,4 +166,24 @@ void WebRtcIsacfix_MatrixProduct2Neon(const int16_t matrix0[],
const int matrix0_index_step);
// MIPS-specific variants of the matrix product routines; only declared when
// MIPS32_LE is defined.
// NOTE(review): the matching #endif is not visible in this excerpt - confirm
// it is present in the real header.
#if defined(MIPS32_LE)
// Matrix product with configurable element strides for both inputs, a
// selectable loop arrangement (matrix1_index_init_case) and a left shift
// applied to the 32-bit operand.
void WebRtcIsacfix_MatrixProduct1MIPS(const int16_t matrix0[],
const int32_t matrix1[],
int32_t matrix_product[],
const int matrix1_index_factor1,
const int matrix0_index_factor1,
const int matrix1_index_init_case,
const int matrix1_index_step,
const int matrix0_index_step,
const int inner_loop_count,
const int mid_loop_count,
const int shift);
// Matrix product producing two 32-bit results per outer iteration, with
// configurable element strides for matrix0 only.
void WebRtcIsacfix_MatrixProduct2MIPS(const int16_t matrix0[],
const int32_t matrix1[],
int32_t matrix_product[],
const int matrix0_index_factor,
const int matrix0_index_step);
@ -0,0 +1,249 @@
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
#include "modules/audio_coding/codecs/isac/fix/source/entropy_coding.h"
#include "modules/audio_coding/codecs/isac/fix/source/settings.h"
// MIPS optimization of the function WebRtcIsacfix_MatrixProduct1.
// Bit-exact with the function WebRtcIsacfix_MatrixProduct1C from
// entropy_coding.c file.
//
// Writes SUBFRAMES * mid_loop_count consecutive 32-bit sums to
// matrix_product. Each sum accumulates inner_loop_count terms; a term is built
// from one 16-bit value m read from matrix0 and one 32-bit value v read from
// matrix1 (v is first shifted left by |shift|):
//   (v >> 16) * m  +  rounded(((v & 0xffff) >> 1) * m, 15 bits)
//
// Parameters:
//   matrix0                 - 16-bit input, read with signed halfword loads.
//   matrix1                 - 32-bit input, read with word loads.
//   matrix_product          - output buffer, one word stored per mid-loop pass.
//   matrix1_index_factor1   - element stride applied to the matrix1 start
//                             pointer (per mid-loop pass when
//                             matrix1_index_init_case != 0, otherwise per
//                             outer-loop pass).
//   matrix0_index_factor1   - element stride applied to the matrix0 start
//                             pointer (per outer-loop pass when
//                             matrix1_index_init_case != 0, otherwise per
//                             mid-loop pass).
//   matrix1_index_init_case - selects which of the two loop arrangements runs.
//   matrix1_index_step      - element stride of matrix1 per inner-loop pass.
//   matrix0_index_step      - element stride of matrix0 per inner-loop pass.
//   inner_loop_count        - number of terms accumulated per output word.
//   mid_loop_count          - number of output words per outer-loop pass.
//   shift                   - left shift applied to every matrix1 value.
//
// All three loops test their counter at the bottom, so each body runs at
// least once even if the corresponding count is zero or negative.
void WebRtcIsacfix_MatrixProduct1MIPS(const int16_t matrix0[],
const int32_t matrix1[],
int32_t matrix_product[],
const int matrix1_index_factor1,
const int matrix0_index_factor1,
const int matrix1_index_init_case,
const int matrix1_index_step,
const int matrix0_index_step,
const int inner_loop_count,
const int mid_loop_count,
const int shift) {
// Arrangement A: the matrix1 start pointer is rewound to |matrix1| on every
// outer pass and advanced per mid pass; the matrix0 start pointer advances
// once per outer pass.
if (matrix1_index_init_case != 0) {
int j = SUBFRAMES, k = 0, n = 0;
int32_t r0, r1, r2, sum32;
int32_t* product_start = matrix_product;
int32_t* product_ptr;
// Element strides converted to byte strides: 4 bytes per int32_t element,
// 2 bytes per int16_t element. product_step doubles as the mid-loop counter
// (decremented by 4 per stored word).
const uint32_t product_step = 4 * mid_loop_count;
const uint32_t matrix0_step = 2 * matrix0_index_step;
const uint32_t matrix1_step = 4 * matrix1_index_step;
const uint32_t matrix0_step2 = 2 * matrix0_index_factor1;
const uint32_t matrix1_step2 = 4 * matrix1_index_factor1;
const int16_t* matrix0_start = matrix0;
// This initializer is not consumed by the asm: matrix1_start is an
// output-only ("=&r") operand and is reloaded from |matrix1| at label 1.
const int32_t* matrix1_start = matrix1;
int16_t* matrix0_ptr;
int32_t* matrix1_ptr;
__asm __volatile (
".set push \n\t"
// noreorder: the assembler does not fill branch delay slots, so the
// instruction written right after each branch below is its delay slot.
".set noreorder \n\t"
// Outer loop (label 1): one pass per subframe, j counts down from SUBFRAMES.
"1: \n\t"
"addu %[product_ptr], %[product_start], $0 \n\t"
"addu %[k], %[product_step], $0 \n\t"
"addiu %[j], %[j], -1 \n\t"
"addu %[matrix1_start], %[matrix1], $0 \n\t"
// Mid loop (label 2): one output word per pass, k counts bytes remaining.
"2: \n\t"
"addu %[matrix1_ptr], %[matrix1_start], $0 \n\t"
"addu %[matrix0_ptr], %[matrix0_start], $0 \n\t"
"addu %[n], %[inner_loop_count], $0 \n\t"
// 0 * 0: clears the accumulator.
"mul %[sum32], $0, $0 \n\t"
// Inner loop (label 3): accumulate one term per pass.
"3: \n\t"
"lw %[r0], 0(%[matrix1_ptr]) \n\t"
"lh %[r1], 0(%[matrix0_ptr]) \n\t"
"addu %[matrix1_ptr], %[matrix1_ptr], %[matrix1_step] \n\t"
"sllv %[r0], %[r0], %[shift] \n\t"
// r2 = ((v & 0xffff) >> 1) * m  (low-half contribution, before rounding).
"andi %[r2], %[r0], 0xffff \n\t"
"sra %[r2], %[r2], 1 \n\t"
"mul %[r2], %[r2], %[r1] \n\t"
// r0 = (v >> 16) * m  (high-half contribution).
"sra %[r0], %[r0], 16 \n\t"
"mul %[r0], %[r0], %[r1] \n\t"
"addu %[matrix0_ptr], %[matrix0_ptr], %[matrix0_step] \n\t"
"addiu %[n], %[n], -1 \n\t"
// Rounded shift of r2 right by 15 bits. shra_r.w is presumably the DSP ASE
// rounding arithmetic shift; addiu 0x4000 followed by sra 15 is the plain
// MIPS32 way to round the same shift.
// NOTE(review): no #else / #endif is visible for this conditional in this
// excerpt. As shown, a DSP build would round and shift twice and the #if is
// unterminated - presumably the addiu/sra pair belongs in an #else branch
// closed by #endif; confirm against the upstream file.
#if defined(MIPS_DSP_R1_LE)
"shra_r.w %[r2], %[r2], 15 \n\t"
"addiu %[r2], %[r2], 0x4000 \n\t"
"sra %[r2], %[r2], 15 \n\t"
"addu %[sum32], %[sum32], %[r2] \n\t"
"bgtz %[n], 3b \n\t"
// Delay slot: adds the high-half contribution on every pass, taken or not.
" addu %[sum32], %[sum32], %[r0] \n\t"
"addiu %[k], %[k], -4 \n\t"
"addu %[matrix1_start], %[matrix1_start], %[matrix1_step2] \n\t"
"sw %[sum32], 0(%[product_ptr]) \n\t"
"bgtz %[k], 2b \n\t"
// Delay slot: step to the next output word.
" addiu %[product_ptr], %[product_ptr], 4 \n\t"
"addu %[matrix0_start], %[matrix0_start], %[matrix0_step2] \n\t"
"bgtz %[j], 1b \n\t"
// Delay slot: step the output row start by mid_loop_count words.
" addu %[product_start], %[product_start], %[product_step] \n\t"
".set pop \n\t"
// Outputs: "+r" operands carry a C-side initial value into the asm; "=&r"
// operands are early-clobber scratch registers initialized inside the asm.
: [product_ptr] "=&r" (product_ptr), [product_start] "+r" (product_start),
[k] "=&r" (k), [j] "+r" (j), [matrix1_start] "=&r"(matrix1_start),
[matrix1_ptr] "=&r" (matrix1_ptr), [matrix0_ptr] "=&r" (matrix0_ptr),
[matrix0_start] "+r" (matrix0_start), [n] "=&r" (n), [r0] "=&r" (r0),
[sum32] "=&r" (sum32), [r1] "=&r" (r1),[r2] "=&r" (r2)
// Inputs (read-only registers).
: [product_step] "r" (product_step), [matrix1] "r" (matrix1),
[inner_loop_count] "r" (inner_loop_count),
[matrix1_step] "r" (matrix1_step), [shift] "r" (shift),
[matrix0_step] "r" (matrix0_step), [matrix1_step2] "r" (matrix1_step2),
[matrix0_step2] "r" (matrix0_step2)
// Clobbers: hi/lo are declared for the mul instructions; "memory" because
// the asm stores to matrix_product.
: "hi", "lo", "memory"
// Arrangement B: mirror image of the branch above. The matrix0 start pointer
// is rewound to |matrix0| on every outer pass and advanced per mid pass; the
// matrix1 start pointer advances once per outer pass.
} else {
int j = SUBFRAMES, k = 0, n = 0;
int32_t r0, r1, r2, sum32;
int32_t* product_start = matrix_product;
int32_t* product_ptr;
// Byte strides, as in the branch above.
const uint32_t product_step = 4 * mid_loop_count;
const uint32_t matrix0_step = 2 * matrix0_index_step;
const uint32_t matrix1_step = 4 * matrix1_index_step;
const uint32_t matrix0_step2 = 2 * matrix0_index_factor1;
const uint32_t matrix1_step2 = 4 * matrix1_index_factor1;
// This initializer is not consumed by the asm: matrix0_start is an
// output-only ("=&r") operand and is reloaded from |matrix0| at label 1.
const int16_t* matrix0_start = matrix0;
const int32_t* matrix1_start = matrix1;
int16_t* matrix0_ptr;
int32_t* matrix1_ptr;
__asm __volatile (
".set push \n\t"
// noreorder: the instruction written after each branch is its delay slot.
".set noreorder \n\t"
// Outer loop (label 1): j counts down from SUBFRAMES.
"1: \n\t"
"addu %[product_ptr], %[product_start], $0 \n\t"
"addu %[k], %[product_step], $0 \n\t"
"addiu %[j], %[j], -1 \n\t"
"addu %[matrix0_start], %[matrix0], $0 \n\t"
// Mid loop (label 2): one output word per pass.
"2: \n\t"
"addu %[matrix1_ptr], %[matrix1_start], $0 \n\t"
"addu %[matrix0_ptr], %[matrix0_start], $0 \n\t"
"addu %[n], %[inner_loop_count], $0 \n\t"
// 0 * 0: clears the accumulator.
"mul %[sum32], $0, $0 \n\t"
// Inner loop (label 3): same per-term arithmetic as the branch above.
"3: \n\t"
"lw %[r0], 0(%[matrix1_ptr]) \n\t"
"lh %[r1], 0(%[matrix0_ptr]) \n\t"
"addu %[matrix1_ptr], %[matrix1_ptr], %[matrix1_step] \n\t"
"sllv %[r0], %[r0], %[shift] \n\t"
"andi %[r2], %[r0], 0xffff \n\t"
"sra %[r2], %[r2], 1 \n\t"
"mul %[r2], %[r2], %[r1] \n\t"
"sra %[r0], %[r0], 16 \n\t"
"mul %[r0], %[r0], %[r1] \n\t"
"addu %[matrix0_ptr], %[matrix0_ptr], %[matrix0_step] \n\t"
"addiu %[n], %[n], -1 \n\t"
// NOTE(review): as in the branch above, no #else / #endif is visible for this
// conditional in this excerpt - confirm against the upstream file.
#if defined(MIPS_DSP_R1_LE)
"shra_r.w %[r2], %[r2], 15 \n\t"
"addiu %[r2], %[r2], 0x4000 \n\t"
"sra %[r2], %[r2], 15 \n\t"
"addu %[sum32], %[sum32], %[r2] \n\t"
"bgtz %[n], 3b \n\t"
// Delay slot: adds the high-half contribution on every pass.
" addu %[sum32], %[sum32], %[r0] \n\t"
"addiu %[k], %[k], -4 \n\t"
"addu %[matrix0_start], %[matrix0_start], %[matrix0_step2] \n\t"
"sw %[sum32], 0(%[product_ptr]) \n\t"
"bgtz %[k], 2b \n\t"
// Delay slot: step to the next output word.
" addiu %[product_ptr], %[product_ptr], 4 \n\t"
"addu %[matrix1_start], %[matrix1_start], %[matrix1_step2] \n\t"
"bgtz %[j], 1b \n\t"
// Delay slot: step the output row start by mid_loop_count words.
" addu %[product_start], %[product_start], %[product_step] \n\t"
".set pop \n\t"
// Here matrix1_start is "+r" (its C initializer is used) and matrix0_start is
// "=&r" - the reverse of the branch above.
: [product_ptr] "=&r" (product_ptr), [product_start] "+r" (product_start),
[k] "=&r" (k), [j] "+r" (j), [matrix1_start] "+r"(matrix1_start),
[matrix1_ptr] "=&r" (matrix1_ptr), [matrix0_ptr] "=&r" (matrix0_ptr),
[matrix0_start] "=&r" (matrix0_start), [n] "=&r" (n), [r0] "=&r" (r0),
[sum32] "=&r" (sum32), [r1] "=&r" (r1),[r2] "=&r" (r2)
: [product_step] "r" (product_step), [matrix0] "r" (matrix0),
[inner_loop_count] "r" (inner_loop_count),
[matrix1_step] "r" (matrix1_step), [shift] "r" (shift),
[matrix0_step] "r" (matrix0_step), [matrix1_step2] "r" (matrix1_step2),
[matrix0_step2] "r" (matrix0_step2)
: "hi", "lo", "memory"
// MIPS optimization of the function WebRtcIsacfix_MatrixProduct2.
// Bit-exact with the function WebRtcIsacfix_MatrixProduct2C from
// entropy_coding.c file.
//
// Writes 2 * SUBFRAMES consecutive 32-bit words to matrix_product, two per
// outer pass. Each outer pass walks matrix1 from its beginning in pairs of
// words (2 * SUBFRAMES words in total) and multiplies both words of a pair by
// the same 16-bit value from matrix0, keeping two running sums. Both sums are
// shifted right by 3 before being stored.
//
// Parameters:
//   matrix0              - 16-bit input, read with signed halfword loads.
//   matrix1              - 32-bit input, read as consecutive word pairs.
//   matrix_product       - output buffer, two words stored per outer pass.
//   matrix0_index_factor - element stride applied to the matrix0 start pointer
//                          per outer pass.
//   matrix0_index_step   - element stride of matrix0 per inner pass.
void WebRtcIsacfix_MatrixProduct2MIPS(const int16_t matrix0[],
const int32_t matrix1[],
int32_t matrix_product[],
const int matrix0_index_factor,
const int matrix0_index_step) {
int j = 0, n = 0;
// Both the outer and the inner loop run SUBFRAMES passes.
int loop_count = SUBFRAMES;
const int16_t* matrix0_ptr;
const int32_t* matrix1_ptr;
// This initializer is not consumed by the asm: matrix0_start is an
// output-only ("=&r") operand and is loaded from |matrix0| before label 1.
const int16_t* matrix0_start = matrix0;
// Element strides converted to byte strides (2 bytes per int16_t element).
const int matrix0_step = 2 * matrix0_index_step;
const int matrix0_step2 = 2 * matrix0_index_factor;
int32_t r0, r1, r2, r3, r4, sum32, sum32_2;
__asm __volatile (
".set push \n\t"
// noreorder: the instruction written after each branch is its delay slot.
".set noreorder \n\t"
"addu %[j], %[loop_count], $0 \n\t"
"addu %[matrix0_start], %[matrix0], $0 \n\t"
// Outer loop (label 1): rewinds matrix1 and clears both accumulators.
"1: \n\t"
"addu %[matrix1_ptr], %[matrix1], $0 \n\t"
"addu %[matrix0_ptr], %[matrix0_start], $0 \n\t"
"addu %[n], %[loop_count], $0 \n\t"
// 0 * 0: clears each accumulator.
"mul %[sum32], $0, $0 \n\t"
"mul %[sum32_2], $0, $0 \n\t"
// Inner loop (label 2): one matrix0 value m against two matrix1 words.
"2: \n\t"
"lw %[r0], 0(%[matrix1_ptr]) \n\t"
"lw %[r1], 4(%[matrix1_ptr]) \n\t"
"lh %[r2], 0(%[matrix0_ptr]) \n\t"
// r3 / r4 = ((v & 0xffff) >> 1) * m for the first / second word.
"andi %[r3], %[r0], 0xffff \n\t"
"sra %[r3], %[r3], 1 \n\t"
"mul %[r3], %[r3], %[r2] \n\t"
"andi %[r4], %[r1], 0xffff \n\t"
"sra %[r4], %[r4], 1 \n\t"
"mul %[r4], %[r4], %[r2] \n\t"
// r0 / r1 = (v >> 16) * m for the first / second word.
"sra %[r0], %[r0], 16 \n\t"
"mul %[r0], %[r0], %[r2] \n\t"
"sra %[r1], %[r1], 16 \n\t"
"mul %[r1], %[r1], %[r2] \n\t"
// Rounded shift of r3 and r4 right by 15 bits. shra_r.w is presumably the DSP
// ASE rounding arithmetic shift; addiu 0x4000 followed by sra 15 is the plain
// MIPS32 way to round the same shift.
// NOTE(review): no #else / #endif is visible for this conditional in this
// excerpt. As shown, a DSP build would round and shift twice and the #if is
// unterminated - presumably the addiu/sra pairs belong in an #else branch
// closed by #endif; confirm against the upstream file.
#if defined(MIPS_DSP_R1_LE)
"shra_r.w %[r3], %[r3], 15 \n\t"
"shra_r.w %[r4], %[r4], 15 \n\t"
"addiu %[r3], %[r3], 0x4000 \n\t"
"sra %[r3], %[r3], 15 \n\t"
"addiu %[r4], %[r4], 0x4000 \n\t"
"sra %[r4], %[r4], 15 \n\t"
// Advance past the word pair (8 bytes) and to the next matrix0 element.
"addiu %[matrix1_ptr], %[matrix1_ptr], 8 \n\t"
"addu %[matrix0_ptr], %[matrix0_ptr], %[matrix0_step] \n\t"
"addiu %[n], %[n], -1 \n\t"
"addu %[sum32], %[sum32], %[r3] \n\t"
"addu %[sum32_2], %[sum32_2], %[r4] \n\t"
"addu %[sum32], %[sum32], %[r0] \n\t"
"bgtz %[n], 2b \n\t"
// Delay slot: adds the second word's high-half contribution on every pass.
" addu %[sum32_2], %[sum32_2], %[r1] \n\t"
// Scale both sums down by 3 bits before storing.
"sra %[sum32], %[sum32], 3 \n\t"
"sra %[sum32_2], %[sum32_2], 3 \n\t"
"addiu %[j], %[j], -1 \n\t"
"addu %[matrix0_start], %[matrix0_start], %[matrix0_step2] \n\t"
"sw %[sum32], 0(%[matrix_product]) \n\t"
"sw %[sum32_2], 4(%[matrix_product]) \n\t"
"bgtz %[j], 1b \n\t"
// Delay slot: step the output pointer past the two words just stored.
" addiu %[matrix_product], %[matrix_product], 8 \n\t"
".set pop \n\t"
// Outputs: everything is early-clobber scratch initialized inside the asm,
// except matrix_product ("+r"), which is read and advanced in place.
: [j] "=&r" (j), [matrix0_start] "=&r" (matrix0_start),
[matrix1_ptr] "=&r" (matrix1_ptr), [matrix0_ptr] "=&r" (matrix0_ptr),
[n] "=&r" (n), [sum32] "=&r" (sum32), [sum32_2] "=&r" (sum32_2),
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
[r4] "=&r" (r4), [matrix_product] "+r" (matrix_product)
// Inputs (read-only registers).
: [loop_count] "r" (loop_count), [matrix0] "r" (matrix0),
[matrix1] "r" (matrix1), [matrix0_step] "r" (matrix0_step),
[matrix0_step2] "r" (matrix0_step2)
// Clobbers: hi/lo are declared for the mul instructions; "memory" because
// the asm stores to matrix_product.
: "hi", "lo", "memory"
@ -211,6 +211,8 @@ static void WebRtcIsacfix_InitMIPS(void) {
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopMIPS;
WebRtcIsacfix_Spec2Time = WebRtcIsacfix_Spec2TimeMIPS;
WebRtcIsacfix_Time2Spec = WebRtcIsacfix_Time2SpecMIPS;
WebRtcIsacfix_MatrixProduct1 = WebRtcIsacfix_MatrixProduct1MIPS;
WebRtcIsacfix_MatrixProduct2 = WebRtcIsacfix_MatrixProduct2MIPS;
#if defined(MIPS_DSP_R1_LE)
WebRtcIsacfix_AllpassFilter2FixDec16 =
@ -89,6 +89,7 @@
['target_arch=="mipsel"', {
'sources': [
Reference in New Issue
Block a user