Optimized WebRtcIsacfix_Time2Spec() for iSAC-Fix on ARM NEON processors.

Review URL: https://webrtc-codereview.appspot.com/1005004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@3404 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org 2013-01-24 01:37:33 +00:00
parent 5dfb1f2cd3
commit 4782911572
6 changed files with 640 additions and 326 deletions

View File

@ -25,68 +25,76 @@
extern "C" {
#endif
int WebRtcIsacfix_EstimateBandwidth(BwEstimatorstr *bwest_str,
Bitstr_dec *streamdata,
WebRtc_Word32 packet_size,
WebRtc_UWord16 rtp_seq_number,
WebRtc_UWord32 send_ts,
WebRtc_UWord32 arr_ts);
int WebRtcIsacfix_EstimateBandwidth(BwEstimatorstr* bwest_str,
Bitstr_dec* streamdata,
WebRtc_Word32 packet_size,
WebRtc_UWord16 rtp_seq_number,
WebRtc_UWord32 send_ts,
WebRtc_UWord32 arr_ts);
WebRtc_Word16 WebRtcIsacfix_DecodeImpl(WebRtc_Word16 *signal_out16,
ISACFIX_DecInst_t *ISACdec_obj,
WebRtc_Word16 *current_framesamples);
WebRtc_Word16 WebRtcIsacfix_DecodeImpl(WebRtc_Word16* signal_out16,
ISACFIX_DecInst_t* ISACdec_obj,
WebRtc_Word16* current_framesamples);
WebRtc_Word16 WebRtcIsacfix_DecodePlcImpl(WebRtc_Word16 *decoded,
ISACFIX_DecInst_t *ISACdec_obj,
WebRtc_Word16 *current_framesample );
WebRtc_Word16 WebRtcIsacfix_DecodePlcImpl(WebRtc_Word16* decoded,
ISACFIX_DecInst_t* ISACdec_obj,
WebRtc_Word16* current_framesample );
int WebRtcIsacfix_EncodeImpl(WebRtc_Word16 *in,
ISACFIX_EncInst_t *ISACenc_obj,
BwEstimatorstr *bw_estimatordata,
WebRtc_Word16 CodingMode);
int WebRtcIsacfix_EncodeImpl(WebRtc_Word16* in,
ISACFIX_EncInst_t* ISACenc_obj,
BwEstimatorstr* bw_estimatordata,
WebRtc_Word16 CodingMode);
int WebRtcIsacfix_EncodeStoredData(ISACFIX_EncInst_t *ISACenc_obj,
int BWnumber,
float scale);
int WebRtcIsacfix_EncodeStoredData(ISACFIX_EncInst_t* ISACenc_obj,
int BWnumber,
float scale);
/* initialization functions */
void WebRtcIsacfix_InitMaskingEnc(MaskFiltstr_enc *maskdata);
void WebRtcIsacfix_InitMaskingDec(MaskFiltstr_dec *maskdata);
void WebRtcIsacfix_InitMaskingEnc(MaskFiltstr_enc* maskdata);
void WebRtcIsacfix_InitMaskingDec(MaskFiltstr_dec* maskdata);
void WebRtcIsacfix_InitPreFilterbank(PreFiltBankstr *prefiltdata);
void WebRtcIsacfix_InitPreFilterbank(PreFiltBankstr* prefiltdata);
void WebRtcIsacfix_InitPostFilterbank(PostFiltBankstr *postfiltdata);
void WebRtcIsacfix_InitPostFilterbank(PostFiltBankstr* postfiltdata);
void WebRtcIsacfix_InitPitchFilter(PitchFiltstr *pitchfiltdata);
void WebRtcIsacfix_InitPitchFilter(PitchFiltstr* pitchfiltdata);
void WebRtcIsacfix_InitPitchAnalysis(PitchAnalysisStruct *State);
void WebRtcIsacfix_InitPitchAnalysis(PitchAnalysisStruct* State);
void WebRtcIsacfix_InitPlc( PLCstr *State );
void WebRtcIsacfix_InitPlc(PLCstr* State);
/* transform functions */
void WebRtcIsacfix_InitTransform();
void WebRtcIsacfix_Time2Spec(WebRtc_Word16 *inre1Q9,
WebRtc_Word16 *inre2Q9,
WebRtc_Word16 *outre,
WebRtc_Word16 *outim);
typedef void (*Time2Spec)(WebRtc_Word16* inre1Q9,
WebRtc_Word16* inre2Q9,
WebRtc_Word16* outre,
WebRtc_Word16* outim);
typedef void (*Spec2Time)(WebRtc_Word16* inreQ7,
WebRtc_Word16* inimQ7,
WebRtc_Word32* outre1Q16,
WebRtc_Word32* outre2Q16);
extern Time2Spec WebRtcIsacfix_Time2Spec;
extern Spec2Time WebRtcIsacfix_Spec2Time;
void WebRtcIsacfix_Time2SpecC(WebRtc_Word16* inre1Q9,
WebRtc_Word16* inre2Q9,
WebRtc_Word16* outre,
WebRtc_Word16* outim);
void WebRtcIsacfix_Spec2TimeC(WebRtc_Word16* inreQ7,
WebRtc_Word16* inimQ7,
WebRtc_Word32* outre1Q16,
WebRtc_Word32* outre2Q16);
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
void WebRtcIsacfix_Time2SpecNeon(WebRtc_Word16* inre1Q9,
WebRtc_Word16* inre2Q9,
WebRtc_Word16* outre,
WebRtc_Word16* outim);
void WebRtcIsacfix_Spec2TimeNeon(WebRtc_Word16* inreQ7,
WebRtc_Word16* inimQ7,
WebRtc_Word32* outre1Q16,
@ -94,52 +102,50 @@ void WebRtcIsacfix_Spec2TimeNeon(WebRtc_Word16* inreQ7,
#endif
/* filterbank functions */
void WebRtcIsacfix_SplitAndFilter1(WebRtc_Word16 *in,
WebRtc_Word16 *LP16,
WebRtc_Word16 *HP16,
PreFiltBankstr *prefiltdata);
void WebRtcIsacfix_SplitAndFilter1(WebRtc_Word16* in,
WebRtc_Word16* LP16,
WebRtc_Word16* HP16,
PreFiltBankstr* prefiltdata);
void WebRtcIsacfix_FilterAndCombine1(WebRtc_Word16 *tempin_ch1,
WebRtc_Word16 *tempin_ch2,
WebRtc_Word16 *out16,
PostFiltBankstr *postfiltdata);
void WebRtcIsacfix_FilterAndCombine1(WebRtc_Word16* tempin_ch1,
WebRtc_Word16* tempin_ch2,
WebRtc_Word16* out16,
PostFiltBankstr* postfiltdata);
#ifdef WEBRTC_ISAC_FIX_NB_CALLS_ENABLED
void WebRtcIsacfix_SplitAndFilter2(WebRtc_Word16 *in,
WebRtc_Word16 *LP16,
WebRtc_Word16 *HP16,
PreFiltBankstr *prefiltdata);
void WebRtcIsacfix_SplitAndFilter2(WebRtc_Word16* in,
WebRtc_Word16* LP16,
WebRtc_Word16* HP16,
PreFiltBankstr* prefiltdata);
void WebRtcIsacfix_FilterAndCombine2(WebRtc_Word16 *tempin_ch1,
WebRtc_Word16 *tempin_ch2,
WebRtc_Word16 *out16,
PostFiltBankstr *postfiltdata,
WebRtc_Word16 len);
void WebRtcIsacfix_FilterAndCombine2(WebRtc_Word16* tempin_ch1,
WebRtc_Word16* tempin_ch2,
WebRtc_Word16* out16,
PostFiltBankstr* postfiltdata,
WebRtc_Word16 len);
#endif
/* normalized lattice filters */
void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
WebRtc_Word32 *stateGQ15,
WebRtc_Word16 *lat_inQ0,
WebRtc_Word16 *filt_coefQ15,
WebRtc_Word32 *gain_lo_hiQ17,
WebRtc_Word32* stateGQ15,
WebRtc_Word16* lat_inQ0,
WebRtc_Word16* filt_coefQ15,
WebRtc_Word32* gain_lo_hiQ17,
WebRtc_Word16 lo_hi,
WebRtc_Word16 *lat_outQ9);
WebRtc_Word16* lat_outQ9);
void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
WebRtc_Word16 *stateGQ0,
WebRtc_Word32 *lat_inQ25,
WebRtc_Word16 *filt_coefQ15,
WebRtc_Word32 *gain_lo_hiQ17,
WebRtc_Word16* stateGQ0,
WebRtc_Word32* lat_inQ25,
WebRtc_Word16* filt_coefQ15,
WebRtc_Word32* gain_lo_hiQ17,
WebRtc_Word16 lo_hi,
WebRtc_Word16 *lat_outQ0);
WebRtc_Word16* lat_outQ0);
/* TODO(kma): Move the following functions into individual header files. */

View File

@ -183,6 +183,7 @@ static void WebRtcIsacfix_InitNeon(void) {
WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrNeon;
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon;
WebRtcIsacfix_Spec2Time = WebRtcIsacfix_Spec2TimeNeon;
WebRtcIsacfix_Time2Spec = WebRtcIsacfix_Time2SpecNeon;
WebRtcIsacfix_CalculateResidualEnergy =
WebRtcIsacfix_CalculateResidualEnergyNeon;
WebRtcIsacfix_AllpassFilter2FixDec16 =
@ -273,10 +274,9 @@ WebRtc_Word16 WebRtcIsacfix_EncoderInit(ISACFIX_MainStruct *ISAC_main_inst,
WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopC;
WebRtcIsacfix_CalculateResidualEnergy =
WebRtcIsacfix_CalculateResidualEnergyC;
WebRtcIsacfix_AllpassFilter2FixDec16 =
WebRtcIsacfix_AllpassFilter2FixDec16C;
WebRtcIsacfix_Spec2Time =
WebRtcIsacfix_Spec2TimeC;
WebRtcIsacfix_AllpassFilter2FixDec16 = WebRtcIsacfix_AllpassFilter2FixDec16C;
WebRtcIsacfix_Time2Spec = WebRtcIsacfix_Time2SpecC;
WebRtcIsacfix_Spec2Time = WebRtcIsacfix_Spec2TimeC;
#ifdef WEBRTC_DETECT_ARM_NEON
if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {

View File

@ -79,23 +79,6 @@ const WebRtc_Word16 kSinTab1[FRAMESAMPLES/2] = {
};
/* Cosine table 2 in Q14 */
const WebRtc_Word16 kCosTab2[FRAMESAMPLES/4] = {
107, -322, 536, -750, 965, -1179, 1392, -1606, 1819, -2032,
2245, -2457, 2669, -2880, 3091, -3301, 3511, -3720, 3929, -4137,
4344, -4550, 4756, -4961, 5165, -5368, 5570, -5771, 5971, -6171,
6369, -6566, 6762, -6957, 7150, -7342, 7534, -7723, 7912, -8099,
8285, -8469, 8652, -8833, 9013, -9191, 9368, -9543, 9717, -9889,
10059, -10227, 10394, -10559, 10722, -10883, 11042, -11200, 11356, -11509,
11661, -11810, 11958, -12104, 12247, -12389, 12528, -12665, 12800, -12933,
13063, -13192, 13318, -13441, 13563, -13682, 13799, -13913, 14025, -14135,
14242, -14347, 14449, -14549, 14647, -14741, 14834, -14924, 15011, -15095,
15178, -15257, 15334, -15408, 15480, -15549, 15615, -15679, 15739, -15798,
15853, -15906, 15956, -16003, 16048, -16090, 16129, -16165, 16199, -16229,
16257, -16283, 16305, -16325, 16342, -16356, 16367, -16375, 16381, -16384
};
/* Sine table 2 in Q14 */
const WebRtc_Word16 kSinTab2[FRAMESAMPLES/4] = {
16384, -16381, 16375, -16367, 16356, -16342, 16325, -16305, 16283, -16257,
@ -112,10 +95,11 @@ const WebRtc_Word16 kSinTab2[FRAMESAMPLES/4] = {
2032, -1819, 1606, -1392, 1179, -965, 750, -536, 322, -107
};
// Declare a function pointer.
// Declare function pointers.
Spec2Time WebRtcIsacfix_Spec2Time;
Time2Spec WebRtcIsacfix_Time2Spec;
void WebRtcIsacfix_Time2Spec(WebRtc_Word16 *inre1Q9,
void WebRtcIsacfix_Time2SpecC(WebRtc_Word16 *inre1Q9,
WebRtc_Word16 *inre2Q9,
WebRtc_Word16 *outreQ7,
WebRtc_Word16 *outimQ7)
@ -187,7 +171,7 @@ void WebRtcIsacfix_Time2Spec(WebRtc_Word16 *inre1Q9,
yiQ16 = -tmpreQ16[k] + tmpreQ16[FRAMESAMPLES/2 - 1 - k];
xiQ16 = tmpimQ16[k] - tmpimQ16[FRAMESAMPLES/2 - 1 - k];
yrQ16 = tmpimQ16[k] + tmpimQ16[FRAMESAMPLES/2 - 1 - k];
tmp1rQ14 = kCosTab2[k];
tmp1rQ14 = -kSinTab2[FRAMESAMPLES/4 - 1 - k];
tmp1iQ14 = kSinTab2[k];
v1Q16 = WEBRTC_SPL_MUL_16_32_RSFT14(tmp1rQ14, xrQ16) - WEBRTC_SPL_MUL_16_32_RSFT14(tmp1iQ14, xiQ16);
v2Q16 = WEBRTC_SPL_MUL_16_32_RSFT14(tmp1iQ14, xrQ16) + WEBRTC_SPL_MUL_16_32_RSFT14(tmp1rQ14, xiQ16);
@ -214,7 +198,7 @@ void WebRtcIsacfix_Spec2TimeC(WebRtc_Word16 *inreQ7, WebRtc_Word16 *inimQ7, WebR
for (k = 0; k < FRAMESAMPLES/4; k++) {
/* Move zero in time to beginning of frames */
tmp1rQ14 = kCosTab2[k];
tmp1rQ14 = -kSinTab2[FRAMESAMPLES/4 - 1 - k];
tmp1iQ14 = kSinTab2[k];
tmpInRe = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32) inreQ7[k], 9); // Q7 -> Q16

View File

@ -15,6 +15,302 @@
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_FUNCTION WebRtcIsacfix_Spec2TimeNeon
GLOBAL_FUNCTION WebRtcIsacfix_Time2SpecNeon
@ void WebRtcIsacfix_Time2SpecNeon(WebRtc_Word16* inre1Q9,
@ WebRtc_Word16* inre2Q9,
@ WebRtc_Word16* outreQ7,
@ WebRtc_Word16* outimQ7);
@
@ NEON implementation of the time-to-spectrum transform. The routine runs in
@ four stages:
@ 1. Time2Spec_TransformAndFindMax: multiply the two input vectors by
@ kCosTab1[]/kSinTab1[], scale by 0.5 / sqrt(240), store the 32-bit results
@ in the stack buffers tmpreQ16[]/tmpimQ16[] and track their largest
@ absolute value.
@ 2. Time2Spec_PreFftShift: shift the 32-bit values by sh (derived from that
@ maximum), narrow them to 16 bits and write them back over
@ inre1Q9[]/inre2Q9[].
@ 3. Call WebRtcIsacfix_FftRadix16Fastest on inre1Q9[]/inre2Q9[] in place.
@ 4. Time2Spec_PostFftTransform: combine sample k with sample
@ FRAMESAMPLES / 2 - 1 - k, rotate by kSinTab2[] and write
@ outreQ7[]/outimQ7[] from both ends towards the middle.
@
@ Arguments on entry: r0 = inre1Q9, r1 = inre2Q9, r2 = outreQ7, r3 = outimQ7.
@ Both input arrays are overwritten (stage 2 stores into them).
@
@ Stack frame (after the pushes):
@ [sp, #0..#12] saved copies of the four arguments
@ [sp, #16] tmpreQ16[], FRAMESAMPLES * 2 bytes
@ [sp, #16 + FRAMESAMPLES * 2] tmpimQ16[], FRAMESAMPLES * 2 bytes
DEFINE_FUNCTION WebRtcIsacfix_Time2SpecNeon
.align 2
push {r4-r11,lr}
vpush {q4-q7}
sub sp, sp, #(16 + FRAMESAMPLES * 4)
str r0, [sp] @ inre1Q9
str r1, [sp, #4] @ inre2Q9
str r2, [sp, #8] @ outreQ7
str r3, [sp, #12] @ outimQ7
mov r8, #(FRAMESAMPLES - 16) @ Byte offset of 16-bit element FRAMESAMPLES / 2 - 8.
@ NOTE(review): r12 and r11 are not read before they are recomputed ahead of
@ Time2Spec_PostFftTransform, so these two adds look like leftovers.
add r12, r0, r8 @ &inre1Q9[FRAMESAMPLES / 2 - 8]
add r11, r1, r8 @ &inre2Q9[FRAMESAMPLES / 2 - 8]
add r4, sp, #16 @ tmpreQ16;
add r5, sp, #(16 + FRAMESAMPLES * 2) @ tmpimQ16;
adr r9, kCosTab1
mov r6, #(kSinTab1 - kCosTab1)
add r10, r9, r6 @ kSinTab1
vmov.u32 q6, #0 @ Running maximum of |tmpreQ16[]|.
vmov.u32 q7, #0 @ Running maximum of |tmpimQ16[]|.
movw r6, #16921 @ 0.5 / sqrt(240) in Q19
lsl r6, #5 @ Together with vqdmulh, net effect is ">> 26".
mov r8, #(FRAMESAMPLES / 2) @ loop counter
vdup.s32 q4, r6
Time2Spec_TransformAndFindMax:
@ Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
@ Each iteration handles 8 samples.
subs r8, #8
vld1.16 {q0}, [r9:64]! @ kCosTab1[]
vld1.16 {q2}, [r0]! @ inre1Q9[]
vmull.s16 q8, d0, d4 @ kCosTab1[k] * inre1Q9[k]
vld1.16 {q1}, [r10:64]! @ kSinTab1[]
vmull.s16 q9, d1, d5 @ kCosTab1[k] * inre1Q9[k]
vld1.16 {q3}, [r1]! @ inre2Q9[]
vmlal.s16 q8, d2, d6 @ += kSinTab1[k] * inre2Q9[k]
vmlal.s16 q9, d3, d7 @ += kSinTab1[k] * inre2Q9[k]
vmull.s16 q12, d0, d6 @ kCosTab1[k] * inre2Q9[k]
vmull.s16 q13, d1, d7 @ kCosTab1[k] * inre2Q9[k]
vmlsl.s16 q12, d2, d4 @ -= kSinTab1[k] * inre1Q9[k]
vmlsl.s16 q13, d3, d5 @ -= kSinTab1[k] * inre1Q9[k]
vqdmulh.s32 q0, q8, q4 @ Real part * factQ19 (stored to tmpreQ16[])
vqdmulh.s32 q1, q9, q4 @ Real part * factQ19 (stored to tmpreQ16[])
vqdmulh.s32 q2, q12, q4 @ Imaginary part * factQ19 (stored to tmpimQ16[])
vqdmulh.s32 q3, q13, q4 @ Imaginary part * factQ19 (stored to tmpimQ16[])
@ Find the absolute maximum in the vectors and store them in q6 and q7.
vabs.s32 q10, q0
vabs.s32 q11, q1
vabs.s32 q12, q2
vst1.32 {q0, q1}, [r4]! @ tmpreQ16[k]
vabs.s32 q13, q3
vmax.u32 q6, q10 @ Use u32 so we don't lose the value 0x80000000.
vmax.u32 q7, q12
vst1.32 {q2, q3}, [r5]! @ tmpimQ16[k]
vmax.u32 q7, q13 @ Maximum for tmpimQ16[].
vmax.u32 q6, q11 @ Maximum for tmpreQ16[].
bgt Time2Spec_TransformAndFindMax
@ Find the maximum value in the Neon registers
vmax.u32 d12, d13
vmax.u32 d14, d15
vpmax.u32 d12, d12, d12 @ Both 32 bits words hold the maximum of |tmpreQ16[]|.
vpmax.u32 d14, d14, d14 @ Both 32 bits words hold the maximum of |tmpimQ16[]|.
vmax.s32 d14, d12, d14 @ Larger of the two maxima (signed compare).
ldr r4, [sp] @ inre1Q9
vcls.s32 d15, d14 @ sh = WebRtcSpl_NormW32(maximum);
ldr r5, [sp, #4] @ inre2Q9
vmov.i32 d14, #24
add r6, sp, #16 @ tmpreQ16;
vsub.s32 d15, d15, d14 @ sh = sh - 24;
add r7, sp, #(16 + FRAMESAMPLES * 2) @ tmpimQ16;
vdup.s32 q8, d15[0] @ sh
mov r8, #(FRAMESAMPLES / 2) @ loop counter
Time2Spec_PreFftShift:
@ Shift tmpreQ16[]/tmpimQ16[] by sh, narrow to 16 bits and store the result
@ over inre1Q9[]/inre2Q9[]. Each iteration handles 16 samples per array.
subs r8, #16
vld1.32 {q0, q1}, [r6]! @ tmpreQ16[]
vrshl.s32 q0, q0, q8
vld1.32 {q2, q3}, [r6]! @ tmpreQ16[]
vrshl.s32 q1, q1, q8
vld1.32 {q4, q5}, [r7]! @ tmpimQ16[]
vrshl.s32 q2, q2, q8
vld1.32 {q6, q7}, [r7]! @ tmpimQ16[]
vrshl.s32 q3, q3, q8
vrshl.s32 q4, q4, q8
vrshl.s32 q5, q5, q8
vrshl.s32 q6, q6, q8
vrshl.s32 q7, q7, q8
vmovn.s32 d0, q0
vmovn.s32 d1, q1
vmovn.s32 d2, q2
vmovn.s32 d3, q3
vmovn.s32 d4, q4
vmovn.s32 d5, q5
vmovn.s32 d6, q6
vmovn.s32 d7, q7
vst1.16 {q0, q1}, [r4]! @ inre1Q9[]
vst1.16 {q2, q3}, [r5]! @ inre2Q9[]
bgt Time2Spec_PreFftShift
ldr r0, [sp] @ inre1Q9
ldr r1, [sp, #4] @ inre2Q9
mov r2, #-1 @ Third argument of the FFT; presumably the transform direction - TODO confirm.
CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest
@ NOTE(review): q8 (sh) is read after the call, so this relies on
@ WebRtcIsacfix_FftRadix16Fastest leaving q8 untouched - confirm.
vneg.s32 q15, q8 @ -sh
vmov.i32 q0, #23
vsub.s32 q15, q15, q0 @ -sh - 23
mov r8, #(FRAMESAMPLES - 8) @ Byte offset of 16-bit element FRAMESAMPLES / 2 - 4.
ldr r2, [sp, #8] @ outreQ7
ldr r3, [sp, #12] @ outimQ7
add r11, r2, r8 @ &outreQ7[FRAMESAMPLES / 2 - 4]
add r12, r3, r8 @ &outimQ7[FRAMESAMPLES / 2 - 4]
ldr r6, [sp] @ inre1Q9
ldr r7, [sp, #4] @ inre2Q9
add r4, r6, r8 @ &inre1Q9[FRAMESAMPLES / 2 - 4]
add r5, r7, r8 @ &inre2Q9[FRAMESAMPLES / 2 - 4]
adr r10, kSinTab2
add r9, r10, #(120*2 - 8) @ &kSinTab2[120 - 4], i.e. the last four entries.
mov r8, #(FRAMESAMPLES / 4) @ loop counter
@ Pre-load variables.
vld1.16 {d2}, [r4] @ inre1Q9[FRAMESAMPLES / 2 - 4 - i]
vld1.16 {d3}, [r5] @ inre2Q9[FRAMESAMPLES / 2 - 4 - i]
vld1.16 {d0}, [r6]! @ inre1Q9
vld1.16 {d1}, [r7]! @ inre2Q9
Time2Spec_PostFftTransform:
@ By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)",
@ ">> 14" and then ">> 9" as in the C code.
@ Each iteration handles 4 samples from the front and 4 from the back.
@ kCosTab2[k] is rebuilt as -kSinTab2[FRAMESAMPLES / 4 - 1 - k]: load from the
@ tail of kSinTab2[], negate, and reverse the four lanes.
vld1.16 {d6}, [r9:64] @ kSinTab2[] read from the tail (becomes kCosTab2[])
vneg.s16 d6, d6
vld1.16 {d7}, [r10:64]! @ kSinTab2[]
vrev64.16 q1, q1 @ Reverse the samples loaded from the 2nd half of the inputs.
vqadd.s16 d4, d0, d2 @ xrQ16
vqsub.s16 d5, d1, d3 @ xiQ16
vrev64.16 d6, d6
sub r9, #8 @ Update pointers for kCosTab2[].
sub r4, #8 @ Update pointers for inre1Q9[].
sub r5, #8 @ Update pointers for inre2Q9[].
subs r8, #4 @ Update loop counter.
vqadd.s16 d1, d1, d3 @ yrQ16
vqsub.s16 d0, d2, d0 @ yiQ16
vmull.s16 q12, d6, d4 @ kCosTab2[k] * xrQ16
vmlsl.s16 q12, d7, d5 @ -= kSinTab2[k] * xiQ16
vmull.s16 q13, d7, d4 @ kSinTab2[k] * xrQ16
vmlal.s16 q13, d6, d5 @ += kCosTab2[k] * xiQ16
vmull.s16 q6, d7, d1 @ kSinTab2[k] * yrQ16
vmlal.s16 q6, d6, d0 @ += kCosTab2[k] * yiQ16
vmull.s16 q7, d7, d0 @ kSinTab2[k] * yiQ16
vmlsl.s16 q7, d6, d1 @ -= kCosTab2[k] * yrQ16
vshl.s32 q12, q12, q15
vshl.s32 q13, q13, q15
vshl.s32 q6, q6, q15
vshl.s32 q7, q7, q15
vneg.s32 q8, q6 @ The mirrored real output is the negated q6.
vld1.16 {d0}, [r6]! @ inre1Q9
vmovn.s32 d8, q12
vld1.16 {d1}, [r7]! @ inre2Q9
vmovn.s32 d9, q13
vld1.16 {d2}, [r4] @ inre1Q9[FRAMESAMPLES / 2 - 4 - i]
vmovn.s32 d5, q7
vld1.16 {d3}, [r5] @ inre2Q9[FRAMESAMPLES / 2 - 4 - i]
vmovn.s32 d4, q8
vst1.16 {d8}, [r2]! @ outreQ7[k]
vrev64.16 q2, q2 @ Reverse the order of the samples.
vst1.16 {d9}, [r3]! @ outimQ7[k]
vst1.16 {d4}, [r11] @ outreQ7[FRAMESAMPLES / 2 - 1 - k]
vst1.16 {d5}, [r12] @ outimQ7[FRAMESAMPLES / 2 - 1 - k]
sub r11, #8 @ Update pointers for outreQ7[].
sub r12, #8 @ Update pointers for outimQ7[].
bgt Time2Spec_PostFftTransform
@ Release the stack frame and restore the saved registers.
add sp, sp, #(16 + FRAMESAMPLES * 4)
vpop {q4-q7}
pop {r4-r11,pc}
.align 8
@ Cosine table 1 in Q14
kCosTab1:
.short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315
.short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069
.short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647
.short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053
.short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295
.short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380
.short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318
.short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121
.short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803
.short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377
.short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859
.short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266
.short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616
.short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926
.short 1713, 1499, 1285, 1072, 857, 643, 429, 214
.short 0, -214, -429, -643, -857, -1072, -1285, -1499
.short -1713, -1926, -2139, -2351, -2563, -2775, -2986, -3196
.short -3406, -3616, -3825, -4033, -4240, -4447, -4653, -4859
.short -5063, -5266, -5469, -5671, -5872, -6071, -6270, -6467
.short -6664, -6859, -7053, -7246, -7438, -7629, -7818, -8006
.short -8192, -8377, -8561, -8743, -8923, -9102, -9280, -9456
.short -9630, -9803, -9974, -10143, -10311, -10477, -10641, -10803
.short -10963, -11121, -11278, -11433, -11585, -11736, -11885, -12031
.short -12176, -12318, -12458, -12597, -12733, -12867, -12998, -13128
.short -13255, -13380, -13502, -13623, -13741, -13856, -13970, -14081
.short -14189, -14295, -14399, -14500, -14598, -14694, -14788, -14879
.short -14968, -15053, -15137, -15218, -15296, -15371, -15444, -15515
.short -15582, -15647, -15709, -15769, -15826, -15880, -15931, -15980
.short -16026, -16069, -16110, -16147, -16182, -16214, -16244, -16270
.short -16294, -16315, -16333, -16349, -16362, -16371, -16378, -16383
.align 8
@ Sine table 2 in Q14
kSinTab2:
.short 16384, -16381, 16375, -16367, 16356, -16342, 16325, -16305
.short 16283, -16257, 16229, -16199, 16165, -16129, 16090, -16048
.short 16003, -15956, 15906, -15853, 15798, -15739, 15679, -15615
.short 15549, -15480, 15408, -15334, 15257, -15178, 15095, -15011
.short 14924, -14834, 14741, -14647, 14549, -14449, 14347, -14242
.short 14135, -14025, 13913, -13799, 13682, -13563, 13441, -13318
.short 13192, -13063, 12933, -12800, 12665, -12528, 12389, -12247
.short 12104, -11958, 11810, -11661, 11509, -11356, 11200, -11042
.short 10883, -10722, 10559, -10394, 10227, -10059, 9889, -9717
.short 9543, -9368, 9191, -9013, 8833, -8652, 8469, -8285
.short 8099, -7912, 7723, -7534, 7342, -7150, 6957, -6762
.short 6566, -6369, 6171, -5971, 5771, -5570, 5368, -5165
.short 4961, -4756, 4550, -4344, 4137, -3929, 3720, -3511
.short 3301, -3091, 2880, -2669, 2457, -2245, 2032, -1819
.short 1606, -1392, 1179, -965, 750, -536, 322, -107
@ Table kCosTab2 was removed since its data is redundant with kSinTab2:
@ kCosTab2[k] == -kSinTab2[FRAMESAMPLES / 4 - 1 - k].
.align 8
@ Sine table 1 in Q14
kSinTab1:
.short 0, 214, 429, 643, 857, 1072, 1285, 1499
.short 1713, 1926, 2139, 2351, 2563, 2775, 2986, 3196
.short 3406, 3616, 3825, 4033, 4240, 4447, 4653, 4859
.short 5063, 5266, 5469, 5671, 5872, 6071, 6270, 6467
.short 6664, 6859, 7053, 7246, 7438, 7629, 7818, 8006
.short 8192, 8377, 8561, 8743, 8923, 9102, 9280, 9456
.short 9630, 9803, 9974, 10143, 10311, 10477, 10641, 10803
.short 10963, 11121, 11278, 11433, 11585, 11736, 11885, 12031
.short 12176, 12318, 12458, 12597, 12733, 12867, 12998, 13128
.short 13255, 13380, 13502, 13623, 13741, 13856, 13970, 14081
.short 14189, 14295, 14399, 14500, 14598, 14694, 14788, 14879
.short 14968, 15053, 15137, 15218, 15296, 15371, 15444, 15515
.short 15582, 15647, 15709, 15769, 15826, 15880, 15931, 15980
.short 16026, 16069, 16110, 16147, 16182, 16214, 16244, 16270
.short 16294, 16315, 16333, 16349, 16362, 16371, 16378, 16383
.short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315
.short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069
.short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647
.short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053
.short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295
.short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380
.short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318
.short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121
.short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803
.short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377
.short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859
.short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266
.short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616
.short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926
.short 1713, 1499, 1285, 1072, 857, 643, 429, 214
@ void WebRtcIsacfix_Spec2TimeNeon(WebRtc_Word16 *inreQ7,
@ WebRtc_Word16 *inimQ7,
@ -32,70 +328,66 @@ DEFINE_FUNCTION WebRtcIsacfix_Spec2TimeNeon
str r3, [sp, #12] @ outre2Q16
mov r8, #(FRAMESAMPLES - 16)
add r12, r0, r8 @ &inreQ7[FRAMESAMPLES/2 - 8]
add r11, r1, r8 @ &inimQ7[FRAMESAMPLES/2 - 8]
add r4, r2, r8, lsl #1 @ &outRe1Q16[FRAMESAMPLES/2 - 8]
add r6, r3, r8, lsl #1 @ &outRe2Q16[FRAMESAMPLES/2 - 8]
add r12, r0, r8 @ &inreQ7[FRAMESAMPLES / 2 - 8]
add r11, r1, r8 @ &inimQ7[FRAMESAMPLES / 2 - 8]
add r4, r2, r8, lsl #1 @ &outRe1Q16[FRAMESAMPLES / 2 - 8]
add r6, r3, r8, lsl #1 @ &outRe2Q16[FRAMESAMPLES / 2 - 8]
mov r8, #(FRAMESAMPLES / 2) @ loop counter
ldr r9, =kCosTab2
ldr r10, =kSinTab2
adr r10, kSinTab2
add r9, r10, #(120*2 - 16) @ &kSinTab2[119 - 8]
mov r5, #-32
mov r7, #-16
vmov.u32 q6, #0 @ Initialize the maximum values for tmpInIm.
vmov.u32 q7, #0 @ Initialize the maximum values for tmpInRe.
TRANSFORM_AND_FIND_MAX:
TransformAndFindMax:
@ Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
@ Bit-exact.
vld1.16 {q0}, [r9]! @ kCosTab2[]
vld1.16 {q1}, [r10]! @ kSinTab2[]
vld1.16 {q2}, [r0]! @ inreQ7[]
vld1.16 {q3}, [r1]! @ inimQ7[]
vmull.s16 q8, d0, d4 @ kCosTab2[k] x inreQ7[k]
vmull.s16 q9, d1, d5 @ kCosTab2[k] x inreQ7[k]
vmull.s16 q10, d2, d6 @ kSinTab2[k] x inimQ7[k]
vmull.s16 q11, d3, d7 @ kSinTab2[k] x inimQ7[k]
vmull.s16 q12, d0, d6 @ kCosTab2[k] x inimQ7[k]
vmull.s16 q13, d1, d7 @ kCosTab2[k] x inimQ7[k]
vmull.s16 q14, d2, d4 @ kSinTab2[k] x inreQ7[k]
vmull.s16 q15, d3, d5 @ kSinTab2[k] x inreQ7[k]
vld1.16 {q2}, [r11], r7 @ inimQ7[FRAMESAMPLES/2 - 9 - i]
vld1.16 {q3}, [r12], r7 @ inreQ7[FRAMESAMPLES/2 - 9 - i]
vadd.s32 q8, q8, q10
vadd.s32 q9, q9, q11
vsub.s32 q12, q12, q14
vsub.s32 q13, q13, q15
subs r8, #16
vld1.16 {q0}, [r9:64] @ kCosTab2[]
sub r9, #16
vld1.16 {q2}, [r0]! @ inreQ7[]
vneg.s16 q0, q0
vld1.16 {q3}, [r1]! @ inimQ7[]
vrev64.16 d0, d0
vrev64.16 d1, d1
vld1.16 {q1}, [r10:64]! @ kSinTab2[]
vswp d0, d1
vmull.s16 q8, d2, d6 @ kSinTab2[k] * inimQ7[k]
vmull.s16 q9, d3, d7 @ kSinTab2[k] * inimQ7[k]
vmlal.s16 q8, d0, d4 @ kCosTab2[k] * inreQ7[k]
vmlal.s16 q9, d1, d5 @ kCosTab2[k] * inreQ7[k]
vmull.s16 q12, d0, d6 @ kCosTab2[k] * inimQ7[k]
vmull.s16 q13, d1, d7 @ kCosTab2[k] * inimQ7[k]
vmlsl.s16 q12, d2, d4 @ kSinTab2[k] * inreQ7[k]
vmlsl.s16 q13, d3, d5 @ kSinTab2[k] * inreQ7[k]
vld1.16 {q2}, [r11], r7 @ inimQ7[FRAMESAMPLES / 2 - 8 + i]
vld1.16 {q3}, [r12], r7 @ inreQ7[FRAMESAMPLES / 2 - 8 + i]
vrev64.16 q2, q2 @ Reverse the order of the samples
vrev64.16 q3, q3 @ Reverse the order of the samples
vmull.s16 q14, d2, d5 @ kSinTab2[k] * inimQ7[k]
vmull.s16 q15, d3, d4 @ kSinTab2[k] * inimQ7[k]
vmlsl.s16 q14, d0, d7 @ kSinTab2[k] * inimQ7[k] - kCosTab2[k] *inreQ7[k]
vmlsl.s16 q15, d1, d6 @ kSinTab2[k] * inimQ7[k] - kCosTab2[k] *inreQ7[k]
vmull.s16 q10, d0, d5 @ kCosTab2[k] * inimQ7[]
vmull.s16 q11, d1, d4 @ kCosTab2[k] * inimQ7[]
vmlal.s16 q10, d2, d7 @ kCosTab2[k] * inimQ7[] + kSinTab2[k] * inreQ7[]
vmlal.s16 q11, d3, d6 @ kCosTab2[k] * inimQ7[] + kSinTab2[k] * inreQ7[]
vshr.s32 q8, q8, #5 @ xrQ16
vshr.s32 q9, q9, #5 @ xrQ16
vshr.s32 q12, q12, #5 @ xiQ16
vshr.s32 q13, q13, #5 @ xiQ16
vmull.s16 q10, d0, d7 @ kCosTab2[k] * inreQ7[k]
vmull.s16 q11, d1, d6 @ kCosTab2[k] * inreQ7[k]
vmull.s16 q14, d2, d5 @ kSinTab2[k] * inimQ7[k]
vmull.s16 q15, d3, d4 @ kSinTab2[k] * inimQ7[k]
vmull.s16 q4, d0, d5 @ kCosTab2[k] * inimQ7[]
vmull.s16 q5, d1, d4 @ kCosTab2[k] * inimQ7[]
vmull.s16 q0, d2, d7 @ kSinTab2[k] * inreQ7[]
vmull.s16 q2, d3, d6 @ kSinTab2[k] * inreQ7[]
vsub.s32 q14, q14, q10 @ kSinTab2[k] * inimQ7[k] -kCosTab2[k] * inreQ7[k]
vsub.s32 q15, q15, q11 @ kSinTab2[k] * inimQ7[k] -kCosTab2[k] * inreQ7[k]
vadd.s32 q10, q4, q0 @ kCosTab2[k] * inimQ7[] + kSinTab2[k] * inreQ7[]
vadd.s32 q11, q5, q2 @ kCosTab2[k] * inimQ7[] + kSinTab2[k] * inreQ7[]
vshr.s32 q14, q14, #5 @ yiQ16
vshr.s32 q15, q15, #5 @ yiQ16
@ -118,8 +410,8 @@ TRANSFORM_AND_FIND_MAX:
vadd.s32 q5, q11, q13
@ yrQ16 - xiQ16
vsub.s32 q9, q10, q12
vsub.s32 q8, q11, q13
vsub.s32 q9, q10, q12
@ Reverse the order of the samples
vrev64.32 q2, q2
@ -128,33 +420,37 @@ TRANSFORM_AND_FIND_MAX:
vrev64.32 q9, q9
vswp d4, d5
vswp d6, d7
vswp d16, d17
vswp d18, d19
vst1.32 {q0, q1}, [r2]! @ outre1Q16[k]
vst1.32 {q2, q3}, [r4], r5 @ outre1Q16[FRAMESAMPLES/2 - 1 - k]
vst1.32 {q4, q5}, [r3]! @ outre2Q16[k]
vst1.32 {q8, q9}, [r6], r5 @ outre2Q16[FRAMESAMPLES/2 - 1 - k]
vswp d16, d17
vswp d18, d19
vst1.32 {q2, q3}, [r4], r5 @ outre1Q16[FRAMESAMPLES / 2 - 1 - k]
@ Find the absolute maximum in the vectors and store them in q6 and q7.
vabs.s32 q10, q0
vabs.s32 q11, q1
vabs.s32 q12, q2
vabs.s32 q13, q3
vabs.s32 q14, q4
vmax.u32 q6, q10 @ Use u32 so we don't lose the value 0x80000000.
vmax.u32 q7, q14 @ Maximum for outre2Q16[].
vabs.s32 q11, q1
vabs.s32 q15, q5
vmax.u32 q6, q11 @ Maximum for outre1Q16[].
vmax.u32 q7, q15
vabs.s32 q12, q2
vmax.u32 q6, q10 @ Use u32 so we don't lose the value 0x80000000.
vmax.u32 q7, q14 @ Maximum for outre2Q16[].
vabs.s32 q0, q8
vmax.u32 q6, q11 @ Maximum for outre1Q16[].
vmax.u32 q7, q15
vabs.s32 q13, q3
vmax.u32 q6, q12
vmax.u32 q7, q0
vabs.s32 q1, q9
vst1.32 {q4, q5}, [r3]! @ outre2Q16[k]
vst1.32 {q8, q9}, [r6], r5 @ outre2Q16[FRAMESAMPLES / 2 - 1 - k]
vmax.u32 q6, q13
vmax.u32 q7, q1
bgt TRANSFORM_AND_FIND_MAX
bgt TransformAndFindMax
adr r10, kSinTab1
mov r2, #(kSinTab1 - kCosTab1)
sub r9, r10, r2 @ kCosTab1
@ Find the maximum value in the Neon registers
vmax.u32 d12, d13
@ -174,18 +470,16 @@ TRANSFORM_AND_FIND_MAX:
mov r8, #(FRAMESAMPLES / 2)
PRE_FFT_SHIFT:
PreFftShift:
subs r8, #16
vld1.32 {q0, q1}, [r6]! @ outre1Q16[]
vld1.32 {q2, q3}, [r6]! @ outre1Q16[]
vld1.32 {q4, q5}, [r7]! @ outre2Q16[]
vld1.32 {q6, q7}, [r7]! @ outre2Q16[]
subs r8, #16
vrshl.s32 q0, q0, q8
vrshl.s32 q1, q1, q8
vrshl.s32 q2, q2, q8
vrshl.s32 q3, q3, q8
vld1.32 {q4, q5}, [r7]! @ outre2Q16[]
vld1.32 {q6, q7}, [r7]! @ outre2Q16[]
vrshl.s32 q4, q4, q8
vrshl.s32 q5, q5, q8
vrshl.s32 q6, q6, q8
@ -203,12 +497,12 @@ PRE_FFT_SHIFT:
vst1.16 {q0, q1}, [r4]! @ inreQ7[]
vst1.16 {q2, q3}, [r5]! @ inimQ7[]
bgt PRE_FFT_SHIFT
bgt PreFftShift
ldr r0, [sp] @ inreQ7
ldr r1, [sp, #4] @ inimQ7
mov r2, #1
bl WebRtcIsacfix_FftRadix16Fastest(PLT)
CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest
ldr r4, [sp] @ inreQ7
ldr r5, [sp, #4] @ inimQ7
@ -217,93 +511,67 @@ PRE_FFT_SHIFT:
mov r8, #(FRAMESAMPLES / 2)
vneg.s32 q5, q8 @ -sh
movw r0, #273
vdup.s32 d8, r0
POST_FFT_SHIFT_DIVIDE:
vld1.16 {q0, q1}, [r4]! @ inreQ7
vld1.16 {q2, q3}, [r5]! @ inimQ7
lsl r0, #15 @ Together with vqdmulh, net effect is ">> 16".
vdup.s32 q4, r0
PostFftShiftDivide:
subs r8, #16
vld1.16 {q0, q1}, [r4]! @ inreQ7
vmovl.s16 q6, d0
vmovl.s16 q7, d1
vld1.16 {q2, q3}, [r5]! @ inimQ7
vmovl.s16 q8, d2
vmovl.s16 q9, d3
vmovl.s16 q0, d4
vmovl.s16 q1, d5
vmovl.s16 q2, d6
vmovl.s16 q3, d7
vshl.s32 q6, q6, q5
vshl.s32 q7, q7, q5
vshl.s32 q8, q8, q5
vshl.s32 q9, q9, q5
vqdmulh.s32 q6, q6, q4
vqdmulh.s32 q7, q7, q4
vqdmulh.s32 q8, q8, q4
vqdmulh.s32 q9, q9, q4
vmovl.s16 q0, d4
vmovl.s16 q1, d5
vmovl.s16 q2, d6
vmovl.s16 q3, d7
vshl.s32 q0, q0, q5
vshl.s32 q1, q1, q5
vshl.s32 q2, q2, q5
vshl.s32 q3, q3, q5
@ WEBRTC_SPL_MUL_16_32_RSFT16(273, outre1Q16[k])
vmull.s32 q10, d12, d8
vmull.s32 q11, d13, d8
vmull.s32 q12, d14, d8
vmull.s32 q13, d15, d8
vshrn.s64 d12, q10, #16
vshrn.s64 d13, q11, #16
vshrn.s64 d14, q12, #16
vshrn.s64 d15, q13, #16
vmull.s32 q10, d16, d8
vmull.s32 q11, d17, d8
vmull.s32 q12, d18, d8
vmull.s32 q13, d19, d8
vshrn.s64 d16, q10, #16
vshrn.s64 d17, q11, #16
vshrn.s64 d18, q12, #16
vshrn.s64 d19, q13, #16
@ WEBRTC_SPL_MUL_16_32_RSFT16(273, outre2Q16[k])
vmull.s32 q10, d0, d8
vmull.s32 q11, d1, d8
vmull.s32 q12, d2, d8
vmull.s32 q13, d3, d8
vshrn.s64 d0, q10, #16
vshrn.s64 d1, q11, #16
vshrn.s64 d2, q12, #16
vshrn.s64 d3, q13, #16
vmull.s32 q10, d4, d8
vmull.s32 q11, d5, d8
vmull.s32 q12, d6, d8
vmull.s32 q13, d7, d8
vshrn.s64 d4, q10, #16
vshrn.s64 d5, q11, #16
vshrn.s64 d6, q12, #16
vshrn.s64 d7, q13, #16
vqdmulh.s32 q0, q0, q4
vqdmulh.s32 q1, q1, q4
vst1.32 {q6, q7}, [r6]! @ outre1Q16[]
vqdmulh.s32 q2, q2, q4
vqdmulh.s32 q3, q3, q4
vst1.32 {q8, q9}, [r6]! @ outre1Q16[]
vst1.32 {q0, q1}, [r7]! @ outre2Q16[]
vst1.32 {q2, q3}, [r7]! @ outre2Q16[]
bgt POST_FFT_SHIFT_DIVIDE
bgt PostFftShiftDivide
mov r8, #(FRAMESAMPLES / 2)
ldr r9, =kCosTab1
ldr r10, =kSinTab1
ldr r2, [sp, #8] @ outre1Q16
ldr r3, [sp, #12] @ outre2Q16
movw r0, #31727
lsl r0, #16 @ With vqdmulh and vrshrn, net effect is ">> 25".
DEMODULATE_AND_SEPARATE:
vld1.16 {q0}, [r9]! @ kCosTab1[]
vld1.16 {q1}, [r10]! @ kSinTab1[]
vld1.32 {q2, q3}, [r2] @ outre1Q16
vld1.32 {q4, q5}, [r3] @ outre2Q16
DemodulateAndSeparate:
subs r8, #8
vld1.16 {q0}, [r9:64]! @ kCosTab1[]
vmovl.s16 q6, d0 @ kCosTab1[]
vld1.16 {q1}, [r10:64]! @ kSinTab1[]
vmovl.s16 q7, d1 @ kCosTab1[]
vld1.32 {q2, q3}, [r2] @ outre1Q16
vmovl.s16 q8, d2 @ kSinTab1[]
vld1.32 {q4, q5}, [r3] @ outre2Q16
vmovl.s16 q9, d3 @ kSinTab1[]
vmull.s32 q10, d12, d4 @ kCosTab1[k] * outre1Q16[k]
@ -311,72 +579,47 @@ DEMODULATE_AND_SEPARATE:
vmull.s32 q12, d14, d6 @ kCosTab1[k] * outre1Q16[k]
vmull.s32 q13, d15, d7 @ kCosTab1[k] * outre1Q16[k]
vmull.s32 q0, d16, d8 @ kSinTab1[k] * outre2Q16[k]
vmull.s32 q1, d17, d9 @ kSinTab1[k] * outre2Q16[k]
vmull.s32 q14, d18, d10 @ kSinTab1[k] * outre2Q16[k]
vmull.s32 q15, d19, d11 @ kSinTab1[k] * outre2Q16[k]
vmlsl.s32 q10, d16, d8 @ += kSinTab1[k] * outre2Q16[k]
vmlsl.s32 q11, d17, d9 @ += kSinTab1[k] * outre2Q16[k]
vmlsl.s32 q12, d18, d10 @ += kSinTab1[k] * outre2Q16[k]
vmlsl.s32 q13, d19, d11 @ += kSinTab1[k] * outre2Q16[k]
vsub.s64 q10, q10, q0
vsub.s64 q11, q11, q1
vsub.s64 q12, q12, q14
vsub.s64 q13, q13, q15
vrshrn.s64 d20, q10, #14 @ xrQ16
vrshrn.s64 d21, q11, #14 @ xrQ16
vrshrn.s64 d22, q12, #14 @ xrQ16
vrshrn.s64 d23, q13, #14 @ xrQ16
subs r8, #8
vrshrn.s64 d20, q10, #10 @ xrQ16
vrshrn.s64 d21, q11, #10 @ xrQ16
vrshrn.s64 d22, q12, #10 @ xrQ16
vrshrn.s64 d23, q13, #10 @ xrQ16
vmull.s32 q12, d12, d8 @ kCosTab1[k] * outre2Q16[k]
vmull.s32 q13, d13, d9 @ kCosTab1[k] * outre2Q16[k]
vmull.s32 q14, d14, d10 @ kCosTab1[k] * outre2Q16[k]
vmull.s32 q15, d15, d11 @ kCosTab1[k] * outre2Q16[k]
vdup.s32 d9, r0 @ generic -> Neon doesn't cost extra cycles.
vmlal.s32 q12, d16, d4 @ += kSinTab1[k] * outre1Q16[k]
vmlal.s32 q13, d17, d5 @ += kSinTab1[k] * outre1Q16[k]
vmlal.s32 q14, d18, d6 @ += kSinTab1[k] * outre1Q16[k]
vmlal.s32 q15, d19, d7 @ += kSinTab1[k] * outre1Q16[k]
vmull.s32 q0, d16, d4 @ kSinTab1[k] * outre1Q16[k]
vmull.s32 q1, d17, d5 @ kSinTab1[k] * outre1Q16[k]
vmull.s32 q6, d18, d6 @ kSinTab1[k] * outre1Q16[k]
vmull.s32 q7, d19, d7 @ kSinTab1[k] * outre1Q16[k]
vdup.s32 q4, r0 @ generic -> Neon doesn't cost extra cycles.
vadd.s64 q12, q12, q0
vadd.s64 q13, q13, q1
vadd.s64 q14, q14, q6
vadd.s64 q15, q15, q7
vrshrn.s64 d24, q12, #14 @ xiQ16
vrshrn.s64 d25, q13, #14 @ xiQ16
vrshrn.s64 d26, q14, #14 @ xiQ16
vrshrn.s64 d27, q15, #14 @ xiQ16
vrshrn.s64 d24, q12, #10 @ xiQ16
vrshrn.s64 d25, q13, #10 @ xiQ16
vrshrn.s64 d26, q14, #10 @ xiQ16
vrshrn.s64 d27, q15, #10 @ xiQ16
@ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xrQ16)
vmull.s32 q0, d20, d9
vmull.s32 q1, d21, d9
vmull.s32 q2, d22, d9
vmull.s32 q3, d23, d9
vrshrn.s64 d0, q0, #11
vrshrn.s64 d1, q1, #11
vrshrn.s64 d2, q2, #11
vrshrn.s64 d3, q3, #11
@ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xiQ16)
vmull.s32 q6, d24, d9
vmull.s32 q7, d25, d9
vmull.s32 q8, d26, d9
vmull.s32 q9, d27, d9
vrshrn.s64 d4, q6, #11
vrshrn.s64 d5, q7, #11
vrshrn.s64 d6, q8, #11
vrshrn.s64 d7, q9, #11
vqdmulh.s32 q0, q10, q4
vqdmulh.s32 q1, q11, q4
vqdmulh.s32 q2, q12, q4
vqdmulh.s32 q3, q13, q4
vst1.16 {q0, q1}, [r2]! @ outre1Q16[]
vst1.16 {q2, q3}, [r3]! @ outre2Q16[]
bgt DEMODULATE_AND_SEPARATE
bgt DemodulateAndSeparate
add sp, sp, #16
vpop {q4-q7}
pop {r4-r11,pc}

View File

@ -11,89 +11,152 @@
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h"
// Number of elements in every input, output and reference buffer used by the
// tests in this file: half of FRAMESAMPLES.
static const int kSamples = FRAMESAMPLES/2;
// Reference values for the Spec2Time test. Spec2TimeTester compares these
// element-wise against data_out_1 and accepts an absolute difference of up
// to 16.
static int32_t spec2time_out_expected_1[kSamples] = {-3366470, -2285227,
-3415765, -2310215, -3118030, -2222470, -3030254, -2192091, -3423170,
-2216041, -3305541, -2171936, -3195767, -2095779, -3153304, -2157560,
-3071167, -2032108, -3101190, -1972016, -3103824, -2089118, -3139811,
-1898337, -3102801, -2055082, -3029665, -1854140, -2962586, -1966454,
-3071167, -1894588, -2851743, -1917315, -2848087, -1594932, -2799242,
-1462184, -2845887, -1437599, -2691776, -1329637, -2770659, -1268491,
-2625161, -1578991, -2460299, -1186385, -2365613, -1039354, -2322608,
-958518, -2271749, -789860, -2254538, -850308, -2384436, -850959, -2133734,
-587678, -2093316, -495115, -1973364, -475177, -1801282, -173507,
-1848516, -158015, -1792018, -62648, -1643313, 214746, -1500758, 267077,
-1450193, 560521, -1521579, 675283, -1345408, 857559, -1300822, 1116332,
-1294533, 1241117, -1070027, 1263503, -983816, 1529821, -1019586,
1910421, -955420, 2073688, -836459, 2401105, -653905, 2690474, -731425,
2930131, -935234, 3299500, -875978, 3523432, -878906, 3924822, -1081630,
4561267, -1203023, 5105274, -1510983, 6052762, -2294646, 7021597,
-3108053, 8826736, -4935222, 11678789, -8442713, 18725700, -21526692,
25420577, 19589811, -28108666, 12634054, -14483066, 6263217, -9979706,
3665661, -7909736, 2531530, -6434896, 1700772, -5525393, 1479473,
-4894262, 1231760, -4353044, 1032940, -3786590, 941152, -3331614,
665090, -2851619, 830696, -2762201, 958007, -2483118, 788233, -2184965,
804825, -1967306, 1007255, -1862474, 920889, -1457506, 755406, -1405841,
890230, -1302124, 1161599, -701867, 1154163, -1083366, 1204743, -513581,
1547264, -650636, 1493384, -285543, 1771863, -277906, 1841343, -9078,
1751863, 230222, 1819578, 207170, 1978972, 398137, 2106468, 552155,
1997624, 685213, 2129520, 601078, 2238736, 944591, 2441879, 1194178,
2355280, 986124, 2393328, 1049005, 2417944, 1208368, 2489516, 1352023,
2572118, 1445283, 2856081, 1532997, 2742279, 1615877, 2915274, 1808036,
2856871, 1806936, 3241747, 1622461, 2978558, 1841297, 3010378, 1923666,
3271367, 2126700, 3070935, 1956958, 3107588, 2128405, 3288872, 2114911,
3315952, 2406651, 3344038, 2370199, 3368980, 2144361, 3305030, 2183803,
3401450, 2523102, 3405463, 2452475, 3463355, 2421678, 3551968, 2431949,
3477251, 2148125, 3244489, 2174090};
// Reference values for the Spec2Time test. Spec2TimeTester compares these
// element-wise against data_out_2 and accepts an absolute difference of up
// to 16.
static int32_t spec2time_out_expected_2[kSamples]= {1691694, -2499988, -2035547,
1060469, 988634, -2044502, -306271, 2041000, 201454, -2289456, 93694,
2129427, -369152, -1887834, 860796, 2089102, -929424, -1673956, 1395291,
1785651, -1619673, -1380109, 1963449, 1093311, -2111007, -840456,
2372786, 578119, -2242702, 89774, 2463304, -132717, -2121480, 643634,
2277636, -1125999, -1995858, 1543748, 2227861, -1483779, -1495491,
2102642, 1833876, -1920568, -958378, 2485101, 772261, -2454257, -24942,
2918714, 136838, -2500453, 816118, 3039735, -746560, -2365815, 1586396,
2714951, -1511696, -1942334, 2571792, 2182827, -2325335, -1311543,
3055970, 1367220, -2737182, -110626, 3889222, 631008, -3280879, 853066,
4122279, -706638, -3334449, 2148311, 3993512, -1846301, -3004894,
3426779, 3329522, -3165264, -2242423, 4756866, 2557711, -4131280,
-805259, 5702711, 1120592, -4852821, 743664, 6476444, -621186, -5465828,
2815787, 6768835, -3017442, -5338409, 5658126, 6838454, -5492288,
-4682382, 8874947, 6153814, -8832561, -2649251, 12817398, 4237692,
-13000247, 1190661, 18986363, -115738, -19693978, 9908367, 30660381,
-10632635, -37962068, 47022884, 89744622, -42087632, 40279224,
-88869341, -47542383, 38572364, 10441576, -30339718, -9926740, 19896578,
28009, -18886612, -1124047, 13232498, -4150304, -12770551, 2637074,
9051831, -6162211, -8713972, 4557937, 5489716, -6862312, -5532349,
5415449, 2791310, -6999367, -2790102, 5375806, 546222, -6486452,
-821261, 4994973, -1278840, -5645501, 1060484, 3996285, -2503954,
-4653629, 2220549, 3036977, -3282133, -3318585, 2780636, 1789880,
-4004589, -2041031, 3105373, 574819, -3992722, -971004, 3001703,
-676739, -3841508, 417284, 2897970, -1427018, -3058480, 1189948,
2210960, -2268992, -2603272, 1949785, 1576172, -2720404, -1891738,
2309456, 769178, -2975646, -707150, 2424652, -88039, -2966660, -65452,
2320780, -957557, -2798978, 744640, 1879794, -1672081, -2365319,
1253309, 1366383, -2204082, -1544367, 1801452, 613828, -2531994,
-983847, 2064842, 118326, -2613790, -203220, 2219635, -730341, -2641861,
563557, 1765434, -1329916, -2272927, 1037138, 1266725, -1939220,
-1588643, 1754528, 816552, -2376303, -1099167, 1864999, 122477,
-2422762, -400027, 1889228, -579916, -2490353, 287139, 2011318,
-1176657, -2502978, 812896, 1116502, -1940211};
// Reference values for the Time2Spec test. Time2SpecTester compares these
// element-wise against data_out_1 and accepts an absolute difference of up
// to 1.
static int16_t time2spec_out_expected_1[kSamples]= {20342, 23889, -10063, -9419,
3242, 7280, -2012, -5029, 332, 4478, -97, -3244, -891, 3117, 773, -2204,
-1335, 2009, 1236, -1469, -1562, 1277, 1366, -815, -1619, 599, 1449, -177,
-1507, 116, 1294, 263, -1338, -244, 1059, 553, -1045, -549, 829, 826,
-731, -755, 516, 909, -427, -853, 189, 1004, -184, -828, -108, 888, 72,
-700, -280, 717, 342, -611, -534, 601, 534, -374, -646, 399, 567, -171,
-720, 234, 645, -11, -712, -26, 593, 215, -643, -172, 536, 361, -527,
-403, 388, 550, -361, -480, 208, 623, -206, -585, 41, 578, 12, -504,
-182, 583, 218, -437, -339, 499, 263, -354, -450, 347, 456, -193, -524,
212, 475, -74, -566, 94, 511, 112, -577, -201, 408, 217, -546, -295, 338,
387, -13, 4, -46, 2, -76, 103, -83, 108, -55, 100, -150, 131, -156, 141,
-171, 179, -190, 128, -227, 172, -214, 215, -189, 265, -244, 322, -335,
337, -352, 358, -368, 362, -355, 366, -381, 403, -395, 411, -392, 446,
-458, 504, -449, 507, -464, 452, -491, 481, -534, 486, -516, 560, -535,
525, -537, 559, -554, 570, -616, 591, -585, 627, -509, 588, -584, 547,
-610, 580, -614, 635, -620, 655, -554, 546, -591, 642, -590, 660, -656,
629, -604, 620, -580, 617, -645, 648, -573, 612, -604, 584, -571, 597,
-562, 627, -550, 560, -606, 529, -584, 568, -503, 532, -463, 512, -440,
399, -457, 437, -349, 278, -317, 257, -220, 163, -8, -61, 18, -161, 367,
-1306};
// Reference values for the Time2Spec test. Time2SpecTester compares these
// element-wise against data_out_2 and accepts an absolute difference of up
// to 1.
static int16_t time2spec_out_expected_2[kSamples]= {14283, -11552, -15335, 6626,
7554, -2150, -6309, 1307, 4523, -4, -3908, -314, 3001, 914, -2715, -1042,
2094, 1272, -1715, -1399, 1263, 1508, -1021, -1534, 735, 1595, -439, -1447,
155, 1433, 22, -1325, -268, 1205, 424, -1030, -608, 950, 643, -733, -787,
661, 861, -502, -888, 331, 852, -144, -849, 19, 833, 99, -826, -154,
771, 368, -735, -459, 645, 513, -491, -604, 431, 630, -314, -598, 183,
622, -78, -612, -48, 641, 154, -645, -257, 610, 281, -529, -444, 450,
441, -327, -506, 274, 476, -232, -570, 117, 554, -86, -531, -21, 572,
151, -606, -221, 496, 322, -407, -388, 407, 394, -268, -428, 280, 505,
-115, -588, 19, 513, -29, -539, -109, 468, 173, -501, -242, 442, 278,
-478, -680, 656, -659, 656, -669, 602, -688, 612, -667, 612, -642, 627,
-648, 653, -676, 596, -680, 655, -649, 678, -672, 587, -608, 637, -645,
637, -620, 556, -580, 553, -635, 518, -599, 583, -501, 536, -544, 473,
-552, 583, -511, 541, -532, 563, -486, 461, -453, 486, -388, 424, -416,
432, -374, 399, -462, 364, -346, 293, -329, 331, -313, 281, -247, 309,
-337, 241, -190, 207, -194, 179, -163, 155, -156, 117, -135, 107, -126,
29, -22, 81, -8, 17, -61, -10, 8, -37, 80, -44, 72, -88, 65, -89, 130,
-114, 181, -215, 189, -245, 260, -288, 294, -339, 344, -396, 407, -429,
438, -439, 485, -556, 629, -612, 637, -645, 661, -737, 829, -830, 831,
-1041};
class TransformTest : public testing::Test {
protected:
TransformTest() {
WebRtcSpl_Init();
}
// Runs the Time2Spec implementation passed in as a function pointer on a
// fixed synthetic input and checks both outputs against the stored
// time2spec reference tables.
void Time2SpecTester(Time2Spec Time2SpecFunction) {
  // The WebRtcIsacfix_Time2Spec functions hard code their buffer lengths, so
  // the test has to supply full kSamples-sized arrays even though they are
  // large.
  int16_t data_in_1[kSamples] = {0};
  int16_t data_in_2[kSamples] = {0};
  int16_t data_out_1[kSamples] = {0};
  int16_t data_out_2[kSamples] = {0};

  // Deterministic input pattern; each value is converted to int16_t on
  // storage, exactly as a plain assignment would do.
  for (int n = 0; n != kSamples; ++n) {
    data_in_1[n] = static_cast<int16_t>(n * n + 1777);
    data_in_2[n] = static_cast<int16_t>(WEBRTC_SPL_WORD16_MAX / (n + 1) + 17);
  }

  Time2SpecFunction(data_in_1, data_in_2, data_out_1, data_out_2);

  // The ARM assembly version is not required to be bit-exact, so each sample
  // may deviate from the reference by at most 1.
  for (int i = 0; i < kSamples; ++i) {
    EXPECT_LE(abs(time2spec_out_expected_1[i] - data_out_1[i]), 1);
    EXPECT_LE(abs(time2spec_out_expected_2[i] - data_out_2[i]), 1);
  }
}
// Pass a function pointer to the Tester function.
void Spec2TimeTester(Spec2Time Spec2TimeFunction) {
// WebRtcIsacfix_Spec2Time functions hard coded the buffer lengths. It's a
// large buffer but we have to test it here.
const int kSamples = FRAMESAMPLES/2;
int16_t data_in_1[kSamples] = {0};
int16_t data_in_2[kSamples] = {0};
int32_t data_out_1[kSamples] = {0};
int32_t data_out_2[kSamples] = {0};
int32_t out_expected_1[kSamples]= {-3366470, -2285227, -3415765,
-2310215, -3118030, -2222470, -3030254, -2192091, -3423170, -2216041,
-3305541, -2171936, -3195767, -2095779, -3153304, -2157560, -3071167,
-2032108, -3101190, -1972016, -3103824, -2089118, -3139811, -1898337,
-3102801, -2055082, -3029665, -1854140, -2962586, -1966454, -3071167,
-1894588, -2851743, -1917315, -2848087, -1594932, -2799242, -1462184,
-2845887, -1437599, -2691776, -1329637, -2770659, -1268491, -2625161,
-1578991, -2460299, -1186385, -2365613, -1039354, -2322608, -958518,
-2271749, -789860, -2254538, -850308, -2384436, -850959, -2133734,
-587678, -2093316, -495115, -1973364, -475177, -1801282, -173507,
-1848516, -158015, -1792018, -62648, -1643313, 214746, -1500758, 267077,
-1450193, 560521, -1521579, 675283, -1345408, 857559, -1300822, 1116332,
-1294533, 1241117, -1070027, 1263503, -983816, 1529821, -1019586,
1910421, -955420, 2073688, -836459, 2401105, -653905, 2690474, -731425,
2930131, -935234, 3299500, -875978, 3523432, -878906, 3924822, -1081630,
4561267, -1203023, 5105274, -1510983, 6052762, -2294646, 7021597,
-3108053, 8826736, -4935222, 11678789, -8442713, 18725700, -21526692,
25420577, 19589811, -28108666, 12634054, -14483066, 6263217, -9979706,
3665661, -7909736, 2531530, -6434896, 1700772, -5525393, 1479473,
-4894262, 1231760, -4353044, 1032940, -3786590, 941152, -3331614,
665090, -2851619, 830696, -2762201, 958007, -2483118, 788233, -2184965,
804825, -1967306, 1007255, -1862474, 920889, -1457506, 755406, -1405841,
890230, -1302124, 1161599, -701867, 1154163, -1083366, 1204743, -513581,
1547264, -650636, 1493384, -285543, 1771863, -277906, 1841343, -9078,
1751863, 230222, 1819578, 207170, 1978972, 398137, 2106468, 552155,
1997624, 685213, 2129520, 601078, 2238736, 944591, 2441879, 1194178,
2355280, 986124, 2393328, 1049005, 2417944, 1208368, 2489516, 1352023,
2572118, 1445283, 2856081, 1532997, 2742279, 1615877, 2915274, 1808036,
2856871, 1806936, 3241747, 1622461, 2978558, 1841297, 3010378, 1923666,
3271367, 2126700, 3070935, 1956958, 3107588, 2128405, 3288872, 2114911,
3315952, 2406651, 3344038, 2370199, 3368980, 2144361, 3305030, 2183803,
3401450, 2523102, 3405463, 2452475, 3463355, 2421678, 3551968, 2431949,
3477251, 2148125, 3244489, 2174090};
int32_t out_expected_2[kSamples]= {1691694, -2499988, -2035547,
1060469, 988634, -2044502, -306271, 2041000, 201454, -2289456, 93694,
2129427, -369152, -1887834, 860796, 2089102, -929424, -1673956, 1395291,
1785651, -1619673, -1380109, 1963449, 1093311, -2111007, -840456,
2372786, 578119, -2242702, 89774, 2463304, -132717, -2121480, 643634,
2277636, -1125999, -1995858, 1543748, 2227861, -1483779, -1495491,
2102642, 1833876, -1920568, -958378, 2485101, 772261, -2454257, -24942,
2918714, 136838, -2500453, 816118, 3039735, -746560, -2365815, 1586396,
2714951, -1511696, -1942334, 2571792, 2182827, -2325335, -1311543,
3055970, 1367220, -2737182, -110626, 3889222, 631008, -3280879, 853066,
4122279, -706638, -3334449, 2148311, 3993512, -1846301, -3004894,
3426779, 3329522, -3165264, -2242423, 4756866, 2557711, -4131280,
-805259, 5702711, 1120592, -4852821, 743664, 6476444, -621186, -5465828,
2815787, 6768835, -3017442, -5338409, 5658126, 6838454, -5492288,
-4682382, 8874947, 6153814, -8832561, -2649251, 12817398, 4237692,
-13000247, 1190661, 18986363, -115738, -19693978, 9908367, 30660381,
-10632635, -37962068, 47022884, 89744622, -42087632, 40279224,
-88869341, -47542383, 38572364, 10441576, -30339718, -9926740, 19896578,
28009, -18886612, -1124047, 13232498, -4150304, -12770551, 2637074,
9051831, -6162211, -8713972, 4557937, 5489716, -6862312, -5532349,
5415449, 2791310, -6999367, -2790102, 5375806, 546222, -6486452,
-821261, 4994973, -1278840, -5645501, 1060484, 3996285, -2503954,
-4653629, 2220549, 3036977, -3282133, -3318585, 2780636, 1789880,
-4004589, -2041031, 3105373, 574819, -3992722, -971004, 3001703,
-676739, -3841508, 417284, 2897970, -1427018, -3058480, 1189948,
2210960, -2268992, -2603272, 1949785, 1576172, -2720404, -1891738,
2309456, 769178, -2975646, -707150, 2424652, -88039, -2966660, -65452,
2320780, -957557, -2798978, 744640, 1879794, -1672081, -2365319,
1253309, 1366383, -2204082, -1544367, 1801452, 613828, -2531994,
-983847, 2064842, 118326, -2613790, -203220, 2219635, -730341, -2641861,
563557, 1765434, -1329916, -2272927, 1037138, 1266725, -1939220,
-1588643, 1754528, 816552, -2376303, -1099167, 1864999, 122477,
-2422762, -400027, 1889228, -579916, -2490353, 287139, 2011318,
-1176657, -2502978, 812896, 1116502, -1940211};
for(int i = 0; i < kSamples; i++) {
data_in_1[i] = i * i + 1777;
data_in_2[i] = WEBRTC_SPL_WORD16_MAX / (i + 1) + 17;
@ -103,12 +166,24 @@ class TransformTest : public testing::Test {
for (int i = 0; i < kSamples; i++) {
// We don't require bit-exact for ARM assembly code.
EXPECT_LE(abs(out_expected_1[i] - data_out_1[i]), 16);
EXPECT_LE(abs(out_expected_2[i] - data_out_2[i]), 16);
EXPECT_LE(abs(spec2time_out_expected_1[i] - data_out_1[i]), 16);
EXPECT_LE(abs(spec2time_out_expected_2[i] - data_out_2[i]), 16);
}
}
};
// Checks the C version of Time2Spec unconditionally, and the NEON version
// whenever the build (or the CPU, with run-time detection) provides it.
TEST_F(TransformTest, Time2SpecTest) {
  Time2SpecTester(WebRtcIsacfix_Time2SpecC);
#if defined(WEBRTC_DETECT_ARM_NEON)
  // Run-time detection build: only call the NEON routine if the CPU reports
  // the feature.
  const bool neon_available =
      (WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0;
  if (neon_available) {
    Time2SpecTester(WebRtcIsacfix_Time2SpecNeon);
  }
#elif defined(WEBRTC_ARCH_ARM_NEON)
  Time2SpecTester(WebRtcIsacfix_Time2SpecNeon);
#endif
}
TEST_F(TransformTest, Spec2TimeTest) {
Spec2TimeTester(WebRtcIsacfix_Spec2TimeC);
#ifdef WEBRTC_DETECT_ARM_NEON

View File

@ -24,6 +24,9 @@
@ DEFINE_FUNCTION name: emits the label that starts the function body.
@ This variant prepends an underscore to the symbol name.
.macro DEFINE_FUNCTION name
_\name:
.endm
@ CALL_FUNCTION name: emits a bl (branch with link) to the given function.
@ This variant targets the underscore-prefixed symbol, matching the label
@ produced by the DEFINE_FUNCTION macro of the same branch.
.macro CALL_FUNCTION name
bl _\name
.endm
#else
.macro GLOBAL_FUNCTION name
.global \name
@ -31,6 +34,9 @@ _\name:
@ DEFINE_FUNCTION name: emits the label that starts the function body.
@ This variant uses the symbol name unchanged (no underscore prefix).
.macro DEFINE_FUNCTION name
\name:
.endm
@ CALL_FUNCTION name: emits a bl (branch with link) to the given function.
@ This variant targets the symbol name unchanged (no underscore prefix).
.macro CALL_FUNCTION name
bl \name
.endm
#endif
.text