Porting ARM optimizations from Android to iOS.
Tested APM and iSAC on Android. Bit-exact with original versions. Changes include removing or changing some GCC directives (e.g. .fnstart, .hword), instruction syntax, etc. Review URL: https://webrtc-codereview.appspot.com/934009 git-svn-id: http://webrtc.googlecode.com/svn/trunk@3124 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
2ec58dc4d1
commit
55cd78cfc2
@ -60,7 +60,7 @@ LOCAL_C_INCLUDES := \
|
||||
|
||||
ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
|
||||
LOCAL_SRC_FILES += \
|
||||
filter_ar_fast_q12_armv7.s
|
||||
filter_ar_fast_q12_armv7.S
|
||||
else
|
||||
LOCAL_SRC_FILES += \
|
||||
filter_ar_fast_q12.c
|
||||
@ -68,8 +68,8 @@ endif
|
||||
|
||||
ifeq ($(TARGET_ARCH),arm)
|
||||
LOCAL_SRC_FILES += \
|
||||
complex_bit_reverse_arm.s \
|
||||
spl_sqrt_floor_arm.s
|
||||
complex_bit_reverse_arm.S \
|
||||
spl_sqrt_floor_arm.S
|
||||
else
|
||||
LOCAL_SRC_FILES += \
|
||||
complex_bit_reverse.c \
|
||||
@ -102,10 +102,10 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
|
||||
LOCAL_MODULE := libwebrtc_spl_neon
|
||||
LOCAL_MODULE_TAGS := optional
|
||||
LOCAL_SRC_FILES := \
|
||||
cross_correlation_neon.s \
|
||||
downsample_fast_neon.s \
|
||||
min_max_operations_neon.s \
|
||||
vector_scaling_operations_neon.s
|
||||
cross_correlation_neon.S \
|
||||
downsample_fast_neon.S \
|
||||
min_max_operations_neon.S \
|
||||
vector_scaling_operations_neon.S
|
||||
|
||||
# Flags passed to both C and C++ files.
|
||||
LOCAL_CFLAGS := \
|
||||
|
@ -12,15 +12,11 @@
|
||||
@ for ARMv5 platforms.
|
||||
@ Reference C code is in file complex_bit_reverse.c. Bit-exact.
|
||||
|
||||
.arch armv5
|
||||
|
||||
.global WebRtcSpl_ComplexBitReverse
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcSpl_ComplexBitReverse
|
||||
.align 2
|
||||
|
||||
WebRtcSpl_ComplexBitReverse:
|
||||
.fnstart
|
||||
|
||||
DEFINE_FUNCTION WebRtcSpl_ComplexBitReverse
|
||||
push {r4-r7}
|
||||
|
||||
cmp r1, #7
|
||||
@ -88,39 +84,36 @@ END:
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
||||
|
||||
@ The index tables. Note the values are doubles of the actual indexes for 16-bit
|
||||
@ elements, different from the generic C code. It actually provides byte offsets
|
||||
@ for the indexes.
|
||||
|
||||
.align 2
|
||||
index_7: @ Indexes for stages == 7.
|
||||
.hword 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288
|
||||
.hword 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144
|
||||
.hword 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116
|
||||
.hword 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156
|
||||
.hword 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204
|
||||
.hword 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268
|
||||
.hword 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348
|
||||
.hword 468, 364, 436, 380, 500, 412, 460, 444, 492
|
||||
.short 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288
|
||||
.short 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144
|
||||
.short 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116
|
||||
.short 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156
|
||||
.short 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204
|
||||
.short 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268
|
||||
.short 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348
|
||||
.short 468, 364, 436, 380, 500, 412, 460, 444, 492
|
||||
|
||||
index_8: @ Indexes for stages == 8.
|
||||
.hword 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64
|
||||
.hword 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544
|
||||
.hword 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104
|
||||
.hword 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136
|
||||
.hword 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172
|
||||
.hword 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204
|
||||
.hword 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244
|
||||
.hword 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284
|
||||
.hword 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324
|
||||
.hword 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372
|
||||
.hword 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420
|
||||
.hword 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468
|
||||
.hword 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532
|
||||
.hword 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596
|
||||
.hword 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684
|
||||
.hword 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796
|
||||
.hword 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988
|
||||
.short 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64
|
||||
.short 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544
|
||||
.short 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104
|
||||
.short 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136
|
||||
.short 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172
|
||||
.short 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204
|
||||
.short 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244
|
||||
.short 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284
|
||||
.short 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324
|
||||
.short 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372
|
||||
.short 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420
|
||||
.short 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468
|
||||
.short 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532
|
||||
.short 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596
|
||||
.short 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684
|
||||
.short 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796
|
||||
.short 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988
|
@ -29,24 +29,18 @@
|
||||
@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
|
||||
@ r8, r9, r10, r11, r12: scratch
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
|
||||
.align 2
|
||||
.global WebRtcSpl_CrossCorrelationNeon
|
||||
|
||||
WebRtcSpl_CrossCorrelationNeon:
|
||||
|
||||
.fnstart
|
||||
|
||||
.save {r4-r11}
|
||||
DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
|
||||
push {r4-r11}
|
||||
|
||||
@ Put the shift value (-right_shifts) into a Neon register.
|
||||
ldrsh r10, [sp, #36]
|
||||
rsb r10, r10, #0
|
||||
mov r8, r10, asr #31
|
||||
vmov.32 d16, r10, r8
|
||||
vmov d16, r10, r8
|
||||
|
||||
@ Initialize loop counters.
|
||||
and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8;
|
||||
@ -63,7 +57,7 @@ LOOP_DIM_CROSS_CORRELATION:
|
||||
|
||||
LOOP_DIM_SEQ:
|
||||
vld1.16 {d20, d21}, [r6]! @ seq1_ptr
|
||||
vld1.16 {d22, d23}, [r5]! @ seq2_ptr
|
||||
vld1.16 {d22, d23}, [r5]! @ seq2_ptr
|
||||
subs r8, r8, #1
|
||||
vmull.s16 q12, d20, d22
|
||||
vmull.s16 q13, d21, d23
|
||||
@ -105,9 +99,6 @@ POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift.
|
||||
pop {r4-r11}
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
||||
|
||||
@ TODO(kma): Place this piece of reference code into a C code file.
|
||||
@ void WebRtcSpl_CrossCorrelationNeon(WebRtc_Word32* cross_correlation,
|
||||
@ WebRtc_Word16* seq1,
|
||||
@ -120,15 +111,15 @@ POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift.
|
||||
@ int j = 0;
|
||||
@ int inner_loop_len1 = dim_seq >> 3;
|
||||
@ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
|
||||
@
|
||||
@
|
||||
@ assert(dim_cross_correlation > 0);
|
||||
@ assert(dim_seq > 0);
|
||||
@
|
||||
@
|
||||
@ for (i = 0; i < dim_cross_correlation; i++) {
|
||||
@ int16_t *seq1_ptr = seq1;
|
||||
@ int16_t *seq2_ptr = seq2 + (step_seq2 * i);
|
||||
@ int64_t sum = 0;
|
||||
@
|
||||
@
|
||||
@ for (j = inner_loop_len1; j > 0; j -= 1) {
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ -155,14 +146,14 @@ POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift.
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ }
|
||||
@
|
||||
@
|
||||
@ // Calculate the rest of the samples.
|
||||
@ for (j = inner_loop_len2; j > 0; j -= 1) {
|
||||
@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
|
||||
@ seq1_ptr++;
|
||||
@ seq2_ptr++;
|
||||
@ }
|
||||
@
|
||||
@
|
||||
@ *cross_correlation++ = (int32_t)(sum >> right_shifts);
|
||||
@ }
|
||||
@ }
|
@ -14,17 +14,11 @@
|
||||
@
|
||||
@ The reference C code is in file downsample_fast.c. Bit-exact.
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
|
||||
.align 2
|
||||
.global WebRtcSpl_DownsampleFastNeon
|
||||
|
||||
WebRtcSpl_DownsampleFastNeon:
|
||||
|
||||
.fnstart
|
||||
|
||||
.save {r4-r11}
|
||||
DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
|
||||
push {r4-r11}
|
||||
|
||||
cmp r3, #0 @ data_out_length <= 0?
|
||||
@ -168,14 +162,15 @@ LOOP_COEFF_LENGTH_FACTOR4:
|
||||
vmlal.s16 q3, d18, d17
|
||||
bge LOOP_COEFF_LENGTH_FACTOR4
|
||||
|
||||
add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
|
||||
add r9, r5, asl #3 @ Counter i = delay + factor * 8.
|
||||
|
||||
@ Shift, saturate, and store the result.
|
||||
vqshrn.s32 d0, q2, #12
|
||||
vqshrn.s32 d1, q3, #12
|
||||
cmp r9, r3 @ i < endpos - factor * 7 ?
|
||||
vst1.16 {d0, d1}, [r2]!
|
||||
|
||||
add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
|
||||
add r9, r5, asl #3 @ Counter i = delay + factor * 8.
|
||||
cmp r9, r3 @ i < endpos - factor * 7 ?
|
||||
blt LOOP_ENDPOS_FACTOR4
|
||||
|
||||
@
|
||||
@ -218,5 +213,3 @@ LOOP2_COEFF_LENGTH:
|
||||
END:
|
||||
pop {r4-r11}
|
||||
bx lr
|
||||
|
||||
.fnend
|
@ -35,16 +35,11 @@
|
||||
@ r11: Scratch
|
||||
@ r12: &coefficients[j]
|
||||
|
||||
.arch armv7-a
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
|
||||
.align 2
|
||||
.global WebRtcSpl_FilterARFastQ12
|
||||
|
||||
WebRtcSpl_FilterARFastQ12:
|
||||
|
||||
.fnstart
|
||||
|
||||
.save {r4-r11}
|
||||
DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
|
||||
push {r4-r11}
|
||||
|
||||
ldrsh r12, [sp, #32] @ data_length
|
||||
@ -155,9 +150,6 @@ END:
|
||||
pop {r4-r11}
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
||||
|
||||
@Reference C code:
|
||||
@
|
||||
@void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
|
@ -166,7 +166,7 @@ static __inline int WebRtcSpl_NormW16(WebRtc_Word16 a) {
|
||||
static __inline WebRtc_Word16 WebRtcSpl_SatW32ToW16(WebRtc_Word32 value32) {
|
||||
WebRtc_Word16 out16 = 0;
|
||||
|
||||
__asm __volatile ("ssat %r0, #16, %r1" : "=r"(out16) : "r"(value32));
|
||||
__asm __volatile ("ssat %0, #16, %1" : "=r"(out16) : "r"(value32));
|
||||
|
||||
return out16;
|
||||
}
|
||||
|
@ -15,20 +15,18 @@
|
||||
@ The reference C code is in file min_max_operations.c. Code here is basically
|
||||
@ a loop unrolling by 8 with Neon instructions. Bit-exact.
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.global WebRtcSpl_MaxAbsValueW16Neon
|
||||
.global WebRtcSpl_MaxAbsValueW32Neon
|
||||
.global WebRtcSpl_MaxValueW16Neon
|
||||
.global WebRtcSpl_MaxValueW32Neon
|
||||
.global WebRtcSpl_MinValueW16Neon
|
||||
.global WebRtcSpl_MinValueW32Neon
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
|
||||
GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
|
||||
GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
|
||||
GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
|
||||
GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
|
||||
GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon
|
||||
|
||||
.align 2
|
||||
|
||||
@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
|
||||
WebRtcSpl_MaxAbsValueW16Neon:
|
||||
.fnstart
|
||||
|
||||
DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
|
||||
mov r2, #-1 @ Initialize the return value.
|
||||
cmp r0, #0
|
||||
beq END_MAX_ABS_VALUE_W16
|
||||
@ -50,8 +48,8 @@ LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
|
||||
|
||||
@ Find the maximum value in the Neon registers and move it to r2.
|
||||
vmax.u16 d24, d25
|
||||
vpmax.u16 d24, d24
|
||||
vpmax.u16 d24, d24
|
||||
vpmax.u16 d24, d24, d24
|
||||
vpmax.u16 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.u16 r2, d24[0]
|
||||
beq END_MAX_ABS_VALUE_W16
|
||||
@ -71,12 +69,10 @@ END_MAX_ABS_VALUE_W16:
|
||||
mov r0, r2
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
||||
|
||||
@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
|
||||
WebRtcSpl_MaxAbsValueW32Neon:
|
||||
.fnstart
|
||||
|
||||
DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
|
||||
cmp r0, #0
|
||||
moveq r0, #-1
|
||||
beq EXIT @ Return -1 for a NULL pointer.
|
||||
@ -103,7 +99,7 @@ LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
|
||||
@ Find the maximum value in the Neon registers and move it to r2.
|
||||
vmax.u32 q12, q11
|
||||
vmax.u32 d24, d25
|
||||
vpmax.u32 d24, d24
|
||||
vpmax.u32 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.u32 r2, d24[0]
|
||||
beq END_MAX_ABS_VALUE_W32
|
||||
@ -125,12 +121,8 @@ END_MAX_ABS_VALUE_W32:
|
||||
EXIT:
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
||||
@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
|
||||
WebRtcSpl_MaxValueW16Neon:
|
||||
.fnstart
|
||||
|
||||
DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
|
||||
mov r2, #0x8000 @ Initialize the return value.
|
||||
cmp r0, #0
|
||||
beq END_MAX_VALUE_W16
|
||||
@ -151,8 +143,8 @@ LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
|
||||
|
||||
@ Find the maximum value in the Neon registers and move it to r2.
|
||||
vmax.s16 d24, d25
|
||||
vpmax.s16 d24, d24
|
||||
vpmax.s16 d24, d24
|
||||
vpmax.s16 d24, d24, d24
|
||||
vpmax.s16 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.u16 r2, d24[0]
|
||||
beq END_MAX_VALUE_W16
|
||||
@ -168,12 +160,8 @@ END_MAX_VALUE_W16:
|
||||
mov r0, r2
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
||||
@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
|
||||
WebRtcSpl_MaxValueW32Neon:
|
||||
.fnstart
|
||||
|
||||
DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
|
||||
mov r2, #0x80000000 @ Initialize the return value.
|
||||
cmp r0, #0
|
||||
beq END_MAX_VALUE_W32
|
||||
@ -196,8 +184,8 @@ LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
|
||||
|
||||
@ Find the maximum value in the Neon registers and move it to r2.
|
||||
vmax.s32 q12, q11
|
||||
vpmax.s32 d24, d25
|
||||
vpmax.s32 d24, d24
|
||||
vpmax.s32 d24, d24, d25
|
||||
vpmax.s32 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.s32 r2, d24[0]
|
||||
beq END_MAX_VALUE_W32
|
||||
@ -213,12 +201,8 @@ END_MAX_VALUE_W32:
|
||||
mov r0, r2
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
||||
@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
|
||||
WebRtcSpl_MinValueW16Neon:
|
||||
.fnstart
|
||||
|
||||
DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
|
||||
movw r2, #0x7FFF @ Initialize the return value.
|
||||
cmp r0, #0
|
||||
beq END_MIN_VALUE_W16
|
||||
@ -239,8 +223,8 @@ LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
|
||||
|
||||
@ Find the minimum value in the Neon registers and move it to r2.
|
||||
vmin.s16 d24, d25
|
||||
vpmin.s16 d24, d24
|
||||
vpmin.s16 d24, d24
|
||||
vpmin.s16 d24, d24, d24
|
||||
vpmin.s16 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.s16 r2, d24[0]
|
||||
sxth r2, r2
|
||||
@ -257,12 +241,8 @@ END_MIN_VALUE_W16:
|
||||
mov r0, r2
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
||||
@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
|
||||
WebRtcSpl_MinValueW32Neon:
|
||||
.fnstart
|
||||
|
||||
DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
|
||||
mov r2, #0x7FFFFFFF @ Initialize the return value.
|
||||
cmp r0, #0
|
||||
beq END_MIN_VALUE_W32
|
||||
@ -285,8 +265,8 @@ LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
|
||||
|
||||
@ Find the minimum value in the Neon registers and move it to r2.
|
||||
vmin.s32 q12, q11
|
||||
vpmin.s32 d24, d25
|
||||
vpmin.s32 d24, d24
|
||||
vpmin.s32 d24, d24, d25
|
||||
vpmin.s32 d24, d24, d24
|
||||
adds r1, #8
|
||||
vmov.s32 r2, d24[0]
|
||||
beq END_MIN_VALUE_W32
|
||||
@ -301,5 +281,3 @@ LOOP_MIN_VALUE_W32:
|
||||
END_MIN_VALUE_W32:
|
||||
mov r0, r2
|
||||
bx lr
|
||||
|
||||
.fnend
|
@ -31,8 +31,8 @@ static __inline WebRtc_Word32 MUL_ACCUM_1(WebRtc_Word32 tbl_value,
|
||||
WebRtc_Word32 diff,
|
||||
WebRtc_Word32 state) {
|
||||
WebRtc_Word32 result;
|
||||
__asm__("smlawb %r0, %r1, %r2, %r3": "=r"(result): "r"(diff),
|
||||
"r"(tbl_value), "r"(state));
|
||||
__asm __volatile ("smlawb %0, %1, %2, %3": "=r"(result): "r"(diff),
|
||||
"r"(tbl_value), "r"(state));
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -47,8 +47,8 @@ static __inline WebRtc_Word32 MUL_ACCUM_2(WebRtc_Word32 tbl_value,
|
||||
WebRtc_Word32 diff,
|
||||
WebRtc_Word32 state) {
|
||||
WebRtc_Word32 result;
|
||||
__asm__("smmla %r0, %r1, %r2, %r3": "=r"(result): "r"(diff << 1),
|
||||
"r"(tbl_value), "r"(state));
|
||||
__asm __volatile ("smmla %0, %1, %2, %3": "=r"(result): "r"(diff << 1),
|
||||
"r"(tbl_value), "r"(state));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -65,8 +65,8 @@
|
||||
'conditions': [
|
||||
['target_arch=="arm"', {
|
||||
'sources': [
|
||||
'complex_bit_reverse_arm.s',
|
||||
'spl_sqrt_floor_arm.s',
|
||||
'complex_bit_reverse_arm.S',
|
||||
'spl_sqrt_floor_arm.S',
|
||||
],
|
||||
'sources!': [
|
||||
'complex_bit_reverse.c',
|
||||
@ -76,7 +76,7 @@
|
||||
['armv7==1', {
|
||||
'dependencies': ['signal_processing_neon',],
|
||||
'sources': [
|
||||
'filter_ar_fast_q12_armv7.s',
|
||||
'filter_ar_fast_q12_armv7.S',
|
||||
],
|
||||
'sources!': [
|
||||
'filter_ar_fast_q12.c',
|
||||
@ -112,10 +112,10 @@
|
||||
'type': '<(library)',
|
||||
'includes': ['../../build/arm_neon.gypi',],
|
||||
'sources': [
|
||||
'cross_correlation_neon.s',
|
||||
'downsample_fast_neon.s',
|
||||
'min_max_operations_neon.s',
|
||||
'vector_scaling_operations_neon.s',
|
||||
'cross_correlation_neon.S',
|
||||
'downsample_fast_neon.S',
|
||||
'min_max_operations_neon.S',
|
||||
'vector_scaling_operations_neon.S',
|
||||
],
|
||||
},
|
||||
],
|
||||
|
@ -8,10 +8,11 @@
|
||||
@ Output: r0 = INT (SQRT (r0)), precision is 16 bits
|
||||
@ Registers touched: r1, r2
|
||||
|
||||
.global WebRtcSpl_SqrtFloor
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcSpl_SqrtFloor
|
||||
.align 2
|
||||
WebRtcSpl_SqrtFloor:
|
||||
DEFINE_FUNCTION WebRtcSpl_SqrtFloor
|
||||
mov r1, #3 << 30
|
||||
mov r2, #1 << 30
|
||||
|
@ -13,15 +13,11 @@
|
||||
@ optimized for ARM Neon platform. Output is bit-exact with the reference
|
||||
@ C code in vector_scaling_operations.c.
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
|
||||
.align 2
|
||||
.global WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
|
||||
|
||||
WebRtcSpl_ScaleAndAddVectorsWithRoundNeon:
|
||||
.fnstart
|
||||
|
||||
DEFINE_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
|
||||
push {r4-r9}
|
||||
|
||||
ldr r4, [sp, #32] @ length
|
||||
@ -84,5 +80,3 @@ LOOP_NO_UNROLLING:
|
||||
END:
|
||||
pop {r4-r9}
|
||||
bx lr
|
||||
|
||||
.fnend
|
@ -13,9 +13,9 @@
|
||||
@ WebRtcIsacfix_AllpassFilter2FixDec16Neon() in filterbanks.c. Prototype
|
||||
@ C code is at end of this file.
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.global WebRtcIsacfix_AllpassFilter2FixDec16Neon
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
|
||||
.align 2
|
||||
|
||||
@void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
|
||||
@ -27,7 +27,7 @@
|
||||
@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
|
||||
@ int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
|
||||
|
||||
WebRtcIsacfix_AllpassFilter2FixDec16Neon:
|
||||
DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
|
||||
push {r4 - r7}
|
||||
|
||||
ldr r5, [sp, #24] @ filter_state_ch2
|
||||
|
@ -9,9 +9,9 @@
|
||||
@
|
||||
@ Reference code in filters.c. Output is bit-exact.
|
||||
|
||||
#include "settings.h"
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
.global WebRtcIsacfix_AutocorrNeon
|
||||
GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
|
||||
.align 2
|
||||
|
||||
@ int WebRtcIsacfix_AutocorrNeon(
|
||||
@ -21,7 +21,7 @@
|
||||
@ WebRtc_Word16 order,
|
||||
@ WebRtc_Word16* __restrict scale);
|
||||
|
||||
WebRtcIsacfix_AutocorrNeon:
|
||||
DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
|
||||
push {r3 - r12}
|
||||
|
||||
@ Constant initializations
|
||||
|
@ -97,8 +97,8 @@
|
||||
'<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
|
||||
],
|
||||
'sources': [
|
||||
'filters_neon.S',
|
||||
'filterbanks_neon.S',
|
||||
'filters_neon.S',
|
||||
'lattice_neon.S',
|
||||
'lpc_masking_model_neon.S',
|
||||
],
|
||||
|
@ -25,16 +25,12 @@
|
||||
@ r12: constant #16384
|
||||
@ r6, r7, r8, r10, r11: scratch
|
||||
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
#include "settings.h"
|
||||
|
||||
.arch armv7-a
|
||||
.global WebRtcIsacfix_FilterArLoop
|
||||
GLOBAL_FUNCTION WebRtcIsacfix_FilterArLoop
|
||||
.align 2
|
||||
|
||||
WebRtcIsacfix_FilterArLoop:
|
||||
.fnstart
|
||||
|
||||
.save {r4-r11}
|
||||
DEFINE_FUNCTION WebRtcIsacfix_FilterArLoop
|
||||
push {r4-r11}
|
||||
|
||||
add r1, #2 @ &ar_f_Q0[1]
|
||||
@ -77,6 +73,3 @@ ORDER_COEF_LOOP: @ for(k = order_coef - 1 ; k >= 0; k--)
|
||||
|
||||
pop {r4-r11}
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
||||
|
@ -29,19 +29,12 @@
|
||||
@ instructions, smulwb, and smull. Speech quality was not degraded by
|
||||
@ testing speech and tone vectors.
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
#include "settings.h"
|
||||
|
||||
.global WebRtcIsacfix_FilterMaLoopNeon
|
||||
|
||||
GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
|
||||
.align 2
|
||||
|
||||
WebRtcIsacfix_FilterMaLoopNeon:
|
||||
.fnstart
|
||||
|
||||
.save {r4-r8}
|
||||
DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
|
||||
push {r4-r8}
|
||||
|
||||
vdup.32 d28, r0 @ Initialize Neon register with input0
|
||||
@ -151,5 +144,3 @@ LAST_SAMPLE:
|
||||
END:
|
||||
pop {r4-r8}
|
||||
bx lr
|
||||
|
||||
.fnend
|
||||
|
@ -12,9 +12,9 @@
|
||||
@ iSAC codec, optimized for ARM Neon platform. Reference code in
|
||||
@ lpc_masking_model.c.
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.global WebRtcIsacfix_CalculateResidualEnergyNeon
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
GLOBAL_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
|
||||
.align 2
|
||||
|
||||
@ int32_t WebRtcIsacfix_CalculateResidualEnergyNeon(int lpc_order,
|
||||
@ -23,10 +23,7 @@
|
||||
@ int16_t* a_polynomial,
|
||||
@ int32_t* corr_coeffs,
|
||||
@ int* q_val_residual_energy);
|
||||
|
||||
WebRtcIsacfix_CalculateResidualEnergyNeon:
|
||||
.fnstart
|
||||
.save {r4-r11}
|
||||
DEFINE_FUNCTION WebRtcIsacfix_CalculateResidualEnergyNeon
|
||||
push {r4-r11}
|
||||
|
||||
sub r13, r13, #16
|
||||
@ -173,5 +170,4 @@ GET_SHIFT_NORM:
|
||||
pop {r4-r11}
|
||||
bx r14
|
||||
|
||||
.fnend
|
||||
|
||||
|
@ -13,12 +13,11 @@
|
||||
@
|
||||
@ Output is bit-exact with the reference C code in pitch_filter.c.
|
||||
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
#include "settings.h"
|
||||
|
||||
.arch armv6
|
||||
GLOBAL_FUNCTION WebRtcIsacfix_PitchFilterCore
|
||||
.align 2
|
||||
.global WebRtcIsacfix_PitchFilterCore
|
||||
|
||||
|
||||
@ void WebRtcIsacfix_PitchFilterCore(int loopNumber,
|
||||
@ WebRtc_Word16 gain,
|
||||
@ -30,9 +29,7 @@
|
||||
@ WebRtc_Word16* inputBuf,
|
||||
@ WebRtc_Word16* outputBuf,
|
||||
@ int* index2) {
|
||||
|
||||
WebRtcIsacfix_PitchFilterCore:
|
||||
.fnstart
|
||||
DEFINE_FUNCTION WebRtcIsacfix_PitchFilterCore
|
||||
push {r4-r11}
|
||||
sub sp, #8
|
||||
|
||||
@ -140,7 +137,6 @@ LOOP:
|
||||
add sp, #8
|
||||
pop {r4-r11}
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
.align 2
|
||||
kDampFilter:
|
||||
|
@ -12,19 +12,17 @@
|
||||
@ This file contains some functions in AECM, optimized for ARM Neon
|
||||
@ platforms. Reference C code is in file aecm_core.c. Bit-exact.
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
#include "aecm_defines.h"
|
||||
#include "aecm_core_neon_offsets.h"
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
|
||||
.extern WebRtcAecm_kSqrtHanning
|
||||
|
||||
.global WebRtcAecm_WindowAndFFTNeon
|
||||
.global WebRtcAecm_InverseFFTAndWindowNeon
|
||||
.global WebRtcAecm_CalcLinearEnergiesNeon
|
||||
.global WebRtcAecm_StoreAdaptiveChannelNeon
|
||||
.global WebRtcAecm_ResetAdaptiveChannelNeon
|
||||
GLOBAL_FUNCTION WebRtcAecm_WindowAndFFTNeon
|
||||
GLOBAL_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon
|
||||
GLOBAL_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
|
||||
GLOBAL_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
|
||||
GLOBAL_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon
|
||||
|
||||
@ void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm,
|
||||
@ WebRtc_Word16* fft,
|
||||
@ -32,9 +30,7 @@
|
||||
@ complex16_t* freq_signal,
|
||||
@ int time_signal_scaling);
|
||||
.align 2
|
||||
WebRtcAecm_WindowAndFFTNeon:
|
||||
.fnstart
|
||||
.save {r4, r5, r6, lr}
|
||||
DEFINE_FUNCTION WebRtcAecm_WindowAndFFTNeon
|
||||
push {r4, r5, r6, lr}
|
||||
|
||||
ldr r12, [sp, #16] @ time_signal_scaling
|
||||
@ -84,7 +80,6 @@ LOOP_PART_LEN2:
|
||||
bgt LOOP_PART_LEN2
|
||||
|
||||
pop {r4, r5, r6, pc}
|
||||
.fnend
|
||||
|
||||
@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||
@ WebRtc_Word16* fft,
|
||||
@ -92,9 +87,7 @@ LOOP_PART_LEN2:
|
||||
@ WebRtc_Word16* output,
|
||||
@ const WebRtc_Word16* nearendClean);
|
||||
.align 2
|
||||
WebRtcAecm_InverseFFTAndWindowNeon:
|
||||
.fnstart
|
||||
.save {r4-r8, lr}
|
||||
DEFINE_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon
|
||||
push {r4-r8, lr}
|
||||
|
||||
@ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT
|
||||
@ -158,12 +151,12 @@ LOOP_POST_IFFT:
|
||||
vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i]
|
||||
vadd.i32 q8, q10
|
||||
vmull.s16 q0, d0, d1
|
||||
vqshrn.s32 d4, q8, #0
|
||||
vqmovn.s32 d16, q8
|
||||
vshr.s32 q0, q0, #14
|
||||
vst2.16 {d4, d5}, [r4, :128]! @ &efw[i];
|
||||
vshl.s32 q0, q0, q9
|
||||
vst1.16 d16, [r7, :64]! @ output[i]
|
||||
vqshrn.s32 d0, q0, #0
|
||||
vqmovn.s32 d0, q0
|
||||
subs r3, #1
|
||||
vst1.16 d0, [r8, :64]! @ aecm->outBuf[i]
|
||||
bgt LOOP_POST_IFFT
|
||||
@ -203,7 +196,6 @@ LOOP_COPY:
|
||||
|
||||
END:
|
||||
pop {r4-r8, pc}
|
||||
.fnend
|
||||
|
||||
@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
|
||||
@ const WebRtc_UWord16* far_spectrum,
|
||||
@ -212,9 +204,7 @@ END:
|
||||
@ WebRtc_UWord32* echo_energy_adapt,
|
||||
@ WebRtc_UWord32* echo_energy_stored);
|
||||
.align 2
|
||||
WebRtcAecm_CalcLinearEnergiesNeon:
|
||||
.fnstart
|
||||
.save {r4-r7}
|
||||
DEFINE_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
|
||||
push {r4-r7}
|
||||
|
||||
vmov.i32 q14, #0
|
||||
@ -274,14 +264,12 @@ LOOP_CALC_LINEAR_ENERGIES:
|
||||
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
|
||||
@ const uint16_t* far_spectrum,
|
||||
@ int32_t* echo_est);
|
||||
.align 2
|
||||
WebRtcAecm_StoreAdaptiveChannelNeon:
|
||||
.fnstart
|
||||
DEFINE_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
|
||||
ldr r3, =offset_aecm_channelAdapt16
|
||||
ldr r12, =offset_aecm_channelStored
|
||||
ldr r3, [r0, r3]
|
||||
@ -305,12 +293,10 @@ LOOP_STORE_ADAPTIVE_CHANNEL:
|
||||
str r3, [r2]
|
||||
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm);
|
||||
.align 2
|
||||
WebRtcAecm_ResetAdaptiveChannelNeon:
|
||||
.fnstart
|
||||
DEFINE_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon
|
||||
ldr r1, =offset_aecm_channelAdapt16
|
||||
ldr r2, =offset_aecm_channelAdapt32
|
||||
movw r3, #offset_aecm_channelStored
|
||||
@ -334,15 +320,14 @@ LOOP_RESET_ADAPTIVE_CHANNEL:
|
||||
str r0, [r2]
|
||||
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ Square root of Hanning window in Q14. Compared to WebRtcAecm_kSqrtHanning,
|
||||
@ the order was reversed and one useless element (0) was removed.
|
||||
.align 3
|
||||
kSqrtHanningReversed:
|
||||
.hword 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947
|
||||
.hword 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571
|
||||
.hword 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335
|
||||
.hword 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
|
||||
.hword 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101
|
||||
.hword 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399
|
||||
.short 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, 16034, 15947
|
||||
.short 15851, 15746, 15631, 15506, 15373, 15231, 15079, 14918, 14749, 14571
|
||||
.short 14384, 14189, 13985, 13773, 13553, 13325, 13089, 12845, 12594, 12335
|
||||
.short 12068, 11795, 11514, 11227, 10933, 10633, 10326, 10013, 9695, 9370
|
||||
.short 9040, 8705, 8364, 8019, 7668, 7313, 6954, 6591, 6224, 5853, 5478, 5101
|
||||
.short 4720, 4337, 3951, 3562, 3172, 2780, 2386, 1990, 1594, 1196, 798, 399
|
||||
|
@ -139,7 +139,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||
"vneg.s16 d23, d23\n\t"
|
||||
"vst2.16 {d22, d23}, [%[p_fft], :128]!\n\t"
|
||||
"vrev64.16 q10, q10\n\t"
|
||||
"vst2.16 {q10}, [%[p_fft_offset], %[offset]]\n\t"
|
||||
"vst2.16 {q10}, [%[p_fft_offset]], %[offset]\n\t"
|
||||
:[p_efw]"+r"(p_efw),
|
||||
[p_fft]"+r"(p_fft),
|
||||
[p_fft_offset]"+r"(p_fft_offset)
|
||||
@ -181,7 +181,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||
__asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
|
||||
__asm __volatile("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
|
||||
__asm __volatile("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
|
||||
__asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
|
||||
__asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
|
||||
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&efw[i].real));
|
||||
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
|
||||
|
||||
@ -196,7 +196,7 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
|
||||
__asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
|
||||
// aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
|
||||
// WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
|
||||
__asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
|
||||
__asm __volatile("vqmovn.s32 %P0, %q1" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
|
||||
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
|
||||
}
|
||||
|
||||
|
@ -12,18 +12,16 @@
|
||||
@ This file contains some functions in NS, optimized for ARM Neon
|
||||
@ platforms. Reference C code is in file nsx_core.c. Bit-exact.
|
||||
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||||
#include "nsx_defines.h"
|
||||
#include "nsx_core_neon_offsets.h"
|
||||
|
||||
.global WebRtcNsx_NoiseEstimationNeon
|
||||
.global WebRtcNsx_PrepareSpectrumNeon
|
||||
.global WebRtcNsx_SynthesisUpdateNeon
|
||||
.global WebRtcNsx_AnalysisUpdateNeon
|
||||
.global WebRtcNsx_DenormalizeNeon
|
||||
.global WebRtcNsx_CreateComplexBufferNeon
|
||||
GLOBAL_FUNCTION WebRtcNsx_NoiseEstimationNeon
|
||||
GLOBAL_FUNCTION WebRtcNsx_PrepareSpectrumNeon
|
||||
GLOBAL_FUNCTION WebRtcNsx_SynthesisUpdateNeon
|
||||
GLOBAL_FUNCTION WebRtcNsx_AnalysisUpdateNeon
|
||||
GLOBAL_FUNCTION WebRtcNsx_DenormalizeNeon
|
||||
GLOBAL_FUNCTION WebRtcNsx_CreateComplexBufferNeon
|
||||
|
||||
@ void NoiseEstimationNeon(NsxInst_t* inst,
|
||||
@ uint16_t* magn,
|
||||
@ -42,12 +40,7 @@
|
||||
@ r11: countDiv
|
||||
@ r12: i, the loop counter for LOOP_NOISEESTIMATION_MAGNLEN_INNER
|
||||
|
||||
WebRtcNsx_NoiseEstimationNeon:
|
||||
.fnstart
|
||||
.save {r4-r11, r14}
|
||||
.vsave {d8-d15}
|
||||
.pad #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
|
||||
|
||||
DEFINE_FUNCTION WebRtcNsx_NoiseEstimationNeon
|
||||
push {r4-r11, r14}
|
||||
vpush {d8-d15}
|
||||
sub sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
|
||||
@ -312,14 +305,10 @@ UPDATE_Q_NOISE:
|
||||
add sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
|
||||
vpop {d8-d15}
|
||||
pop {r4-r11, pc}
|
||||
.fnend
|
||||
|
||||
@ static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset);
|
||||
@ Neon registers touched: q0-q3, q8-q13.
|
||||
UpdateNoiseEstimateNeon:
|
||||
.fnstart
|
||||
.save {r4, r5, r6, r14}
|
||||
|
||||
DEFINE_FUNCTION UpdateNoiseEstimateNeon
|
||||
push {r4, r5, r6, r14}
|
||||
mov r5, r0
|
||||
|
||||
@ -385,13 +374,9 @@ POST_LOOP_MAGNLEN:
|
||||
strh r3, [r2]
|
||||
|
||||
pop {r4, r5, r6, pc}
|
||||
.fnend
|
||||
|
||||
@ void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf);
|
||||
WebRtcNsx_PrepareSpectrumNeon:
|
||||
.fnstart
|
||||
.save {r4-r8}
|
||||
|
||||
DEFINE_FUNCTION WebRtcNsx_PrepareSpectrumNeon
|
||||
push {r4-r8}
|
||||
|
||||
movw r2, #offset_nsx_real
|
||||
@ -478,11 +463,9 @@ LOOP_ANALEN2:
|
||||
|
||||
pop {r4-r8}
|
||||
bx r14
|
||||
.fnend
|
||||
|
||||
@ void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor);
|
||||
WebRtcNsx_DenormalizeNeon:
|
||||
.fnstart
|
||||
DEFINE_FUNCTION WebRtcNsx_DenormalizeNeon
|
||||
movw r12, #offset_nsx_normData
|
||||
movw r3, #offset_nsx_real
|
||||
ldr r12, [r0, r12] @ inst->normData
|
||||
@ -508,14 +491,11 @@ LOOP_ANALEN:
|
||||
blt LOOP_ANALEN
|
||||
|
||||
bx r14
|
||||
.fnend
|
||||
|
||||
@ void SynthesisUpdateNeon(NsxInst_t* inst,
|
||||
@ int16_t* out_frame,
|
||||
@ int16_t gain_factor);
|
||||
WebRtcNsx_SynthesisUpdateNeon:
|
||||
.fnstart
|
||||
.save {r4, r5}
|
||||
DEFINE_FUNCTION WebRtcNsx_SynthesisUpdateNeon
|
||||
push {r4, r5}
|
||||
|
||||
vdup.16 d31, r2
|
||||
@ -586,12 +566,8 @@ EXIT_SYNTHESISUPDATE:
|
||||
pop {r4, r5}
|
||||
bx r14
|
||||
|
||||
.fnend
|
||||
|
||||
@ void AnalysisUpdateNeon(NsxInst_t* inst, int16_t* out, int16_t* new_speech);
|
||||
WebRtcNsx_AnalysisUpdateNeon:
|
||||
.fnstart
|
||||
.save {r4-r6}
|
||||
DEFINE_FUNCTION WebRtcNsx_AnalysisUpdateNeon
|
||||
push {r4-r6}
|
||||
|
||||
movw r3, #offset_nsx_analysisBuffer
|
||||
@ -647,11 +623,9 @@ LOOP_WINDOW_DATA:
|
||||
POST_LOOP_WINDOW_DATA:
|
||||
pop {r4-r6}
|
||||
bx r14
|
||||
.fnend
|
||||
|
||||
@ void CreateComplexBufferNeon(NsxInst_t* inst, int16_t* in, int16_t* out);
|
||||
WebRtcNsx_CreateComplexBufferNeon:
|
||||
.fnstart
|
||||
DEFINE_FUNCTION WebRtcNsx_CreateComplexBufferNeon
|
||||
movw r3, #offset_nsx_anaLen
|
||||
movw r12, #offset_nsx_normData
|
||||
ldrsh r3, [r0, r3] @ inst->anaLen
|
||||
@ -678,4 +652,3 @@ LOOP_CREATE_COMPLEX_BUFFER: @ Unrolled by 16.
|
||||
blt LOOP_CREATE_COMPLEX_BUFFER
|
||||
|
||||
bx r14
|
||||
.fnend
|
||||
|
32
webrtc/system_wrappers/interface/asm_defines.h
Normal file
32
webrtc/system_wrappers/interface/asm_defines.h
Normal file
@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_
|
||||
#define WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_
|
||||
|
||||
// Define the macros used in ARM assembly code, so that for Mac or iOS builds
|
||||
// we add leading underscores for the function names.
|
||||
#ifdef __APPLE__
|
||||
.macro GLOBAL_FUNCTION name
|
||||
.global _\name
|
||||
.endm
|
||||
.macro DEFINE_FUNCTION name
|
||||
_\name:
|
||||
.endm
|
||||
#else
|
||||
.macro GLOBAL_FUNCTION name
|
||||
.global \name
|
||||
.endm
|
||||
.macro DEFINE_FUNCTION name
|
||||
\name:
|
||||
.endm
|
||||
#endif
|
||||
|
||||
#endif // WEBRTC_SYSTEM_WRAPPERS_INTERFACE_ASM_DEFINES_H_
|
Loading…
x
Reference in New Issue
Block a user