diff --git a/src/modules/audio_processing/aec/main/source/aec_rdft.c b/src/modules/audio_processing/aec/main/source/aec_rdft.c index 8489d4347..255021210 100644 --- a/src/modules/audio_processing/aec/main/source/aec_rdft.c +++ b/src/modules/audio_processing/aec/main/source/aec_rdft.c @@ -38,6 +38,7 @@ ALIGN16_BEG float ALIGN16_END rdft_wk3r[32]; ALIGN16_BEG float ALIGN16_END rdft_wk1i[32]; ALIGN16_BEG float ALIGN16_END rdft_wk2i[32]; ALIGN16_BEG float ALIGN16_END rdft_wk3i[32]; +ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4]; static int ip[16]; @@ -138,7 +139,11 @@ static void makewt_32(void) { } bitrv2_32or128(nw, ip + 2, rdft_w); - // pre-calculate constants used by cft1st_128 ... + // pre-calculate constants used by cft1st_128 and cftmdl_128... + cftmdl_wk1r[0] = rdft_w[2]; + cftmdl_wk1r[1] = rdft_w[2]; + cftmdl_wk1r[2] = rdft_w[2]; + cftmdl_wk1r[3] = -rdft_w[2]; { int k1; @@ -306,58 +311,59 @@ static void cft1st_128_C(float *a) { } } -static void cftmdl_128(int l, float *a) { +static void cftmdl_128_C(float *a) { + const int l = 8; const int n = 128; - int j, j1, j2, j3, k, k1, k2, m, m2; + const int m = 32; + int j0, j1, j2, j3, k, k1, k2, m2; float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; - m = l << 2; - for (j = 0; j < l; j += 2) { - j1 = j + l; - j2 = j1 + l; - j3 = j2 + l; - x0r = a[j] + a[j1]; - x0i = a[j + 1] + a[j1 + 1]; - x1r = a[j] - a[j1]; - x1i = a[j + 1] - a[j1 + 1]; - x2r = a[j2] + a[j3]; + for (j0 = 0; j0 < l; j0 += 2) { + j1 = j0 + 8; + j2 = j0 + 16; + j3 = j0 + 24; + x0r = a[j0 + 0] + a[j1 + 0]; + x0i = a[j0 + 1] + a[j1 + 1]; + x1r = a[j0 + 0] - a[j1 + 0]; + x1i = a[j0 + 1] - a[j1 + 1]; + x2r = a[j2 + 0] + a[j3 + 0]; x2i = a[j2 + 1] + a[j3 + 1]; - x3r = a[j2] - a[j3]; + x3r = a[j2 + 0] - a[j3 + 0]; x3i = a[j2 + 1] - a[j3 + 1]; - a[j] = x0r + x2r; - a[j + 1] = x0i + x2i; - a[j2] = x0r - x2r; + a[j0 + 0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j2 + 0] = x0r - x2r; a[j2 + 1] = x0i - x2i; - a[j1] = x1r - x3i; + a[j1 + 0] = x1r - x3i; a[j1 + 1] = x1i + x3r; - a[j3] = x1r + x3i; + a[j3 + 0] = x1r + x3i; a[j3 + 1] = x1i - x3r; } wk1r = rdft_w[2]; - for (j = m; j < l + m; j += 2) { - j1 = j + l; - j2 = j1 + l; - j3 = j2 + l; - x0r = a[j] + a[j1]; - x0i = a[j + 1] + a[j1 + 1]; - x1r = a[j] - a[j1]; - x1i = a[j + 1] - a[j1 + 1]; - x2r = a[j2] + a[j3]; + for (j0 = m; j0 < l + m; j0 += 2) { + j1 = j0 + 8; + j2 = j0 + 16; + j3 = j0 + 24; + x0r = a[j0 + 0] + a[j1 + 0]; + x0i = a[j0 + 1] + a[j1 + 1]; + x1r = a[j0 + 0] - a[j1 + 0]; + x1i = a[j0 + 1] - a[j1 + 1]; + x2r = a[j2 + 0] + a[j3 + 0]; x2i = a[j2 + 1] + a[j3 + 1]; - x3r = a[j2] - a[j3]; + x3r = a[j2 + 0] - a[j3 + 0]; x3i = a[j2 + 1] - a[j3 + 1]; - a[j] = x0r + x2r; - a[j + 1] = x0i + x2i; - a[j2] = x2i - x0i; + a[j0 + 0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j2 + 0] = x2i - x0i; a[j2 + 1] = x0r - x2r; x0r = x1r - x3i; x0i = x1i + x3r; - a[j1] = wk1r * (x0r - x0i); + a[j1 + 0] = wk1r * (x0r - x0i); a[j1 + 1] = wk1r * (x0r + x0i); x0r = x3i + x1r; x0i = x3r - x1i; - a[j3] = wk1r * (x0i - x0r); + a[j3 + 0] = wk1r * (x0i - x0r); a[j3 + 1] = wk1r * (x0i + x0r); } k1 = 0; @@ -365,68 +371,68 @@ static void cftmdl_128(int l, float *a) { for (k = m2; k < n; k += m2) { k1 += 2; k2 = 2 * k1; - wk2r = rdft_w[k1]; + wk2r = rdft_w[k1 + 0]; wk2i = rdft_w[k1 + 1]; - wk1r = rdft_w[k2]; + wk1r = rdft_w[k2 + 0]; wk1i = rdft_w[k2 + 1]; - wk3r = wk1r - 2 * wk2i * wk1i; - wk3i = 2 * wk2i * wk1r - wk1i; - for (j = k; j < l + k; j += 2) { - j1 = j + l; - j2 = j1 + l; - j3 = j2 + l; - x0r = a[j] + a[j1]; - x0i = a[j + 1] + a[j1 + 1]; - x1r = a[j] - a[j1]; - x1i = a[j + 1] - a[j1 + 1]; - x2r = a[j2] + a[j3]; + wk3r = rdft_wk3ri_first[k1 + 0]; + wk3i = rdft_wk3ri_first[k1 + 1]; + for (j0 = k; j0 < l + k; j0 += 2) { + j1 = j0 + 8; + j2 = j0 + 16; + j3 = j0 + 24; + x0r = a[j0 + 0] + a[j1 + 0]; + x0i = a[j0 + 1] + a[j1 + 1]; + x1r = a[j0 + 0] - a[j1 + 0]; + x1i = a[j0 + 1] - a[j1 + 1]; + x2r = a[j2 + 0] + a[j3 + 0]; x2i = a[j2 + 1] + a[j3 + 1]; - x3r = a[j2] - a[j3]; + x3r = a[j2 + 0] - a[j3 + 0]; x3i = a[j2 + 1] - a[j3 + 1]; - a[j] = x0r + x2r; - a[j + 1] = x0i + x2i; + a[j0 + 0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; x0r -= x2r; x0i -= x2i; - a[j2] = wk2r * x0r - wk2i * x0i; + a[j2 + 0] = wk2r * x0r - wk2i * x0i; a[j2 + 1] = wk2r * x0i + wk2i * x0r; x0r = x1r - x3i; x0i = x1i + x3r; - a[j1] = wk1r * x0r - wk1i * x0i; + a[j1 + 0] = wk1r * x0r - wk1i * x0i; a[j1 + 1] = wk1r * x0i + wk1i * x0r; x0r = x1r + x3i; x0i = x1i - x3r; - a[j3] = wk3r * x0r - wk3i * x0i; + a[j3 + 0] = wk3r * x0r - wk3i * x0i; a[j3 + 1] = wk3r * x0i + wk3i * x0r; } wk1r = rdft_w[k2 + 2]; wk1i = rdft_w[k2 + 3]; - wk3r = wk1r - 2 * wk2r * wk1i; - wk3i = 2 * wk2r * wk1r - wk1i; - for (j = k + m; j < l + (k + m); j += 2) { - j1 = j + l; - j2 = j1 + l; - j3 = j2 + l; - x0r = a[j] + a[j1]; - x0i = a[j + 1] + a[j1 + 1]; - x1r = a[j] - a[j1]; - x1i = a[j + 1] - a[j1 + 1]; - x2r = a[j2] + a[j3]; + wk3r = rdft_wk3ri_second[k1 + 0]; + wk3i = rdft_wk3ri_second[k1 + 1]; + for (j0 = k + m; j0 < l + (k + m); j0 += 2) { + j1 = j0 + 8; + j2 = j0 + 16; + j3 = j0 + 24; + x0r = a[j0 + 0] + a[j1 + 0]; + x0i = a[j0 + 1] + a[j1 + 1]; + x1r = a[j0 + 0] - a[j1 + 0]; + x1i = a[j0 + 1] - a[j1 + 1]; + x2r = a[j2 + 0] + a[j3 + 0]; x2i = a[j2 + 1] + a[j3 + 1]; - x3r = a[j2] - a[j3]; + x3r = a[j2 + 0] - a[j3 + 0]; x3i = a[j2 + 1] - a[j3 + 1]; - a[j] = x0r + x2r; - a[j + 1] = x0i + x2i; + a[j0 + 0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; x0r -= x2r; x0i -= x2i; - a[j2] = -wk2i * x0r - wk2r * x0i; + a[j2 + 0] = -wk2i * x0r - wk2r * x0i; a[j2 + 1] = -wk2i * x0i + wk2r * x0r; x0r = x1r - x3i; x0i = x1i + x3r; - a[j1] = wk1r * x0r - wk1i * x0i; + a[j1 + 0] = wk1r * x0r - wk1i * x0i; a[j1 + 1] = wk1r * x0i + wk1i * x0r; x0r = x1r + x3i; x0i = x1i - x3r; - a[j3] = wk3r * x0r - wk3i * x0i; + a[j3 + 0] = wk3r * x0r - wk3i * x0i; a[j3 + 1] = wk3r * x0i + wk3i * x0r; } } @@ -437,7 +443,7 @@ static void cftfsub_128(float *a) { float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; cft1st_128(a); - cftmdl_128(8, a); + cftmdl_128(a); l = 32; for (j = 0; j < l; j += 2) { j1 = j + l; @@ -467,7 +473,7 @@ static void cftbsub_128(float *a) { float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; cft1st_128(a); - cftmdl_128(8, a); + cftmdl_128(a); l = 32; for (j = 0; j < l; j += 2) { @@ -565,11 +571,13 @@ void aec_rdft_inverse_128(float *a) { // code path selection rft_sub_128_t cft1st_128; +rft_sub_128_t cftmdl_128; rft_sub_128_t rftfsub_128; rft_sub_128_t rftbsub_128; void aec_rdft_init(void) { cft1st_128 = cft1st_128_C; + cftmdl_128 = cftmdl_128_C; rftfsub_128 = rftfsub_128_C; rftbsub_128 = rftbsub_128_C; if (WebRtc_GetCPUInfo(kSSE2)) { diff --git a/src/modules/audio_processing/aec/main/source/aec_rdft.h b/src/modules/audio_processing/aec/main/source/aec_rdft.h index 05564d4c7..5f4085bdb 100644 --- a/src/modules/audio_processing/aec/main/source/aec_rdft.h +++ b/src/modules/audio_processing/aec/main/source/aec_rdft.h @@ -31,12 +31,14 @@ extern float rdft_wk3r[32]; extern float rdft_wk1i[32]; extern float rdft_wk2i[32]; extern float rdft_wk3i[32]; +extern float cftmdl_wk1r[4]; // code path selection function pointers typedef void (*rft_sub_128_t)(float *a); extern rft_sub_128_t rftfsub_128; extern rft_sub_128_t rftbsub_128; extern rft_sub_128_t cft1st_128; +extern rft_sub_128_t cftmdl_128; // entry points void aec_rdft_init(void); diff --git a/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c b/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c index eb6f8adc4..16503ba6f 100644 --- a/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c +++ b/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c @@ -15,9 +15,10 @@ #include "aec_rdft.h" +static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = + {-1.f, 1.f, -1.f, 1.f}; + static void cft1st_128_SSE2(float *a) { - static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = - {-1.f, 1.f, -1.f, 1.f}; const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); int j, k2; @@ -74,6 +75,138 @@ static void cft1st_128_SSE2(float *a) { } } +static void cftmdl_128_SSE2(float *a) { + const int l = 8; + const int m = 32; + const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); + int j0, k, k1, k2, m2; + + __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); + for (j0 = 0; j0 < l; j0 += 2) { + const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); + const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); + const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); + const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); + const __m128 a_00_32 = _mm_shuffle_ps((__m128)a_00, (__m128)a_32, + _MM_SHUFFLE(1, 0, 1 ,0)); + const __m128 a_08_40 = _mm_shuffle_ps((__m128)a_08, (__m128)a_40, + _MM_SHUFFLE(1, 0, 1 ,0)); + __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); + const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); + + const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); + const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); + const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); + const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); + const __m128 a_16_48 = _mm_shuffle_ps((__m128)a_16, (__m128)a_48, + _MM_SHUFFLE(1, 0, 1 ,0)); + const __m128 a_24_56 = _mm_shuffle_ps((__m128)a_24, (__m128)a_56, + _MM_SHUFFLE(1, 0, 1 ,0)); + const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); + const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); + + const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + _mm_storel_epi64((__m128i*)&a[j0 + 0], (__m128i)xx0); + _mm_storel_epi64((__m128i*)&a[j0 + 32], + _mm_shuffle_epi32((__m128i)xx0, _MM_SHUFFLE(3, 2, 3, 2))); + const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + _mm_storel_epi64((__m128i*)&a[j0 + 16], (__m128i)xx1); + _mm_storel_epi64((__m128i*)&a[j0 + 48], + _mm_shuffle_epi32((__m128i)xx1, _MM_SHUFFLE(2, 3, 2, 3))); + a[j0 + 48] = -a[j0 + 48]; + + const __m128 x3i0_3r0_3i1_x3r1 = (__m128) + _mm_shuffle_epi32((__m128i)x3r0_3i0_3r1_x3i1, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); + const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + _mm_storel_epi64((__m128i*)&a[j0 + 8], (__m128i)x1_x3_add); + _mm_storel_epi64((__m128i*)&a[j0 + 24], (__m128i)x1_x3_sub); + + const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, + _MM_SHUFFLE(2, 2, 2 ,2)); + const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, + _MM_SHUFFLE(3, 3, 3 ,3)); + const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1); + const __m128 yy3 = _mm_add_ps(yy0, yy2); + const __m128 yy4 = _mm_mul_ps(wk1rv, yy3); + _mm_storel_epi64((__m128i*)&a[j0 + 40], (__m128i)yy4); + _mm_storel_epi64((__m128i*)&a[j0 + 56], + _mm_shuffle_epi32((__m128i)yy4, _MM_SHUFFLE(2, 3, 2, 3))); + } + + k1 = 0; + m2 = 2 * m; + k = 64; + k1 += 2; + k2 = 2 * k1; + const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]); + const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]); + wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]); + const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]); + const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]); + const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]); + for (j0 = k; j0 < l + k; j0 += 2) { + const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); + const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); + const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); + const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); + const __m128 a_00_32 = _mm_shuffle_ps((__m128)a_00, (__m128)a_32, + _MM_SHUFFLE(1, 0, 1 ,0)); + const __m128 a_08_40 = _mm_shuffle_ps((__m128)a_08, (__m128)a_40, + _MM_SHUFFLE(1, 0, 1 ,0)); + __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); + const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); + + const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); + const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); + const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); + const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); + const __m128 a_16_48 = _mm_shuffle_ps((__m128)a_16, (__m128)a_48, + _MM_SHUFFLE(1, 0, 1 ,0)); + const __m128 a_24_56 = _mm_shuffle_ps((__m128)a_24, (__m128)a_56, + _MM_SHUFFLE(1, 0, 1 ,0)); + const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); + const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); + + const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + _mm_storel_epi64((__m128i*)&a[j0 + 0], (__m128i)xx); + _mm_storel_epi64((__m128i*)&a[j0 + 32], + _mm_shuffle_epi32((__m128i)xx, _MM_SHUFFLE(3, 2, 3, 2))); + + const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv); + const __m128 xx3 = _mm_mul_ps(wk2iv, + (__m128)_mm_shuffle_epi32((__m128i)xx1, _MM_SHUFFLE(2, 3, 0, 1))); + const __m128 xx4 = _mm_add_ps(xx2, xx3); + _mm_storel_epi64((__m128i*)&a[j0 + 16], (__m128i)xx4); + _mm_storel_epi64((__m128i*)&a[j0 + 48], + _mm_shuffle_epi32((__m128i)xx4, _MM_SHUFFLE(3, 2, 3, 2))); + + const __m128 x3i0_3r0_3i1_x3r1 = (__m128) + _mm_shuffle_epi32((__m128i)x3r0_3i0_3r1_x3i1, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); + const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + + const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); + const __m128 xx11 = _mm_mul_ps(wk1iv, + (__m128)_mm_shuffle_epi32((__m128i)x1_x3_add, _MM_SHUFFLE(2, 3, 0, 1))); + const __m128 xx12 = _mm_add_ps(xx10, xx11); + _mm_storel_epi64((__m128i*)&a[j0 + 8], (__m128i)xx12); + _mm_storel_epi64((__m128i*)&a[j0 + 40], + _mm_shuffle_epi32((__m128i)xx12, _MM_SHUFFLE(3, 2, 3, 2))); + + const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); + const __m128 xx21 = _mm_mul_ps(wk3iv, + (__m128)_mm_shuffle_epi32((__m128i)x1_x3_sub, _MM_SHUFFLE(2, 3, 0, 1))); + const __m128 xx22 = _mm_add_ps(xx20, xx21); + _mm_storel_epi64((__m128i*)&a[j0 + 24], (__m128i)xx22); + _mm_storel_epi64((__m128i*)&a[j0 + 56], + _mm_shuffle_epi32((__m128i)xx22, _MM_SHUFFLE(3, 2, 3, 2))); + } +} + static void rftfsub_128_SSE2(float *a) { const float *c = rdft_w + 32; int j1, j2, k1, k2; @@ -221,7 +354,6 @@ static void rftbsub_128_SSE2(float *a) { const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120, const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121, // Shuffle in right order and store. - // Shuffle in right order and store. const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); // 2, 3, 4, 5, const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); @@ -259,6 +391,7 @@ static void rftbsub_128_SSE2(float *a) { void aec_rdft_init_sse2(void) { cft1st_128 = cft1st_128_SSE2; + cftmdl_128 = cftmdl_128_SSE2; rftfsub_128 = rftfsub_128_SSE2; rftbsub_128 = rftbsub_128_SSE2; }