Optimization of 'cftmdl':
* scalar optimization, vectorization.
* 1.7% AEC overall speedup for the straight C path.
* 9.2% AEC overall speedup for the SSE2 path.

Review URL: http://webrtc-codereview.appspot.com/109008
git-svn-id: http://webrtc.googlecode.com/svn/trunk@416 4adac7df-926f-26a2-2b94-8c16560cd09d
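For context while reading the diff: the commit turns cftmdl_128 into a runtime-selected code path (a C version and an SSE2 version chosen in aec_rdft_init), mirroring the existing cft1st_128, rftfsub_128 and rftbsub_128 hooks. Below is a minimal, self-contained sketch of that dispatch pattern; the CPU check is a stub standing in for WebRtc_GetCPUInfo(kSSE2), so treat the scaffolding around the function pointer as an assumption for illustration, not the real module.

/* Sketch only: the type and variable names mirror the diff, but
 * cpu_has_sse2() and main() are stand-ins added for this example. */
#include <stdio.h>

typedef void (*rft_sub_128_t)(float *a);

static void cftmdl_128_C(float *a)    { (void)a; puts("scalar C path"); }
static void cftmdl_128_SSE2(float *a) { (void)a; puts("SSE2 path"); }

static rft_sub_128_t cftmdl_128;                 /* selected code path */

static int cpu_has_sse2(void) { return 1; }      /* stub for the real CPU query */

static void aec_rdft_init(void) {
  cftmdl_128 = cftmdl_128_C;                     /* portable default */
  if (cpu_has_sse2()) {
    cftmdl_128 = cftmdl_128_SSE2;                /* vectorized override */
  }
}

int main(void) {
  float a[128] = {0.0f};
  aec_rdft_init();
  cftmdl_128(a);                                 /* dispatches to the chosen path */
  return 0;
}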
		| @@ -38,6 +38,7 @@ ALIGN16_BEG float ALIGN16_END rdft_wk3r[32]; | |||||||
| ALIGN16_BEG float ALIGN16_END rdft_wk1i[32]; | ALIGN16_BEG float ALIGN16_END rdft_wk1i[32]; | ||||||
| ALIGN16_BEG float ALIGN16_END rdft_wk2i[32]; | ALIGN16_BEG float ALIGN16_END rdft_wk2i[32]; | ||||||
| ALIGN16_BEG float ALIGN16_END rdft_wk3i[32]; | ALIGN16_BEG float ALIGN16_END rdft_wk3i[32]; | ||||||
|  | ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4]; | ||||||
|  |  | ||||||
| static int ip[16]; | static int ip[16]; | ||||||
|  |  | ||||||
| @@ -138,7 +139,11 @@ static void makewt_32(void) { | |||||||
|   } |   } | ||||||
|   bitrv2_32or128(nw, ip + 2, rdft_w); |   bitrv2_32or128(nw, ip + 2, rdft_w); | ||||||
|  |  | ||||||
|   // pre-calculate constants used by cft1st_128 ... |   // pre-calculate constants used by cft1st_128 and cftmdl_128... | ||||||
|  |   cftmdl_wk1r[0] = rdft_w[2]; | ||||||
|  |   cftmdl_wk1r[1] = rdft_w[2]; | ||||||
|  |   cftmdl_wk1r[2] = rdft_w[2]; | ||||||
|  |   cftmdl_wk1r[3] = -rdft_w[2]; | ||||||
|   { |   { | ||||||
|     int k1; |     int k1; | ||||||
|  |  | ||||||
| @@ -306,58 +311,59 @@ static void cft1st_128_C(float *a) { | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| static void cftmdl_128(int l, float *a) { | static void cftmdl_128_C(float *a) { | ||||||
|  |   const int l = 8; | ||||||
|   const int n = 128; |   const int n = 128; | ||||||
|   int j, j1, j2, j3, k, k1, k2, m, m2; |   const int m = 32; | ||||||
|  |   int j0, j1, j2, j3, k, k1, k2, m2; | ||||||
|   float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; |   float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; | ||||||
|   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; | ||||||
|  |  | ||||||
|   m = l << 2; |   for (j0 = 0; j0 < l; j0 += 2) { | ||||||
|   for (j = 0; j < l; j += 2) { |     j1 = j0 +  8; | ||||||
|     j1 = j + l; |     j2 = j0 + 16; | ||||||
|     j2 = j1 + l; |     j3 = j0 + 24; | ||||||
|     j3 = j2 + l; |     x0r = a[j0 + 0] + a[j1 + 0]; | ||||||
|     x0r = a[j] + a[j1]; |     x0i = a[j0 + 1] + a[j1 + 1]; | ||||||
|     x0i = a[j + 1] + a[j1 + 1]; |     x1r = a[j0 + 0] - a[j1 + 0]; | ||||||
|     x1r = a[j] - a[j1]; |     x1i = a[j0 + 1] - a[j1 + 1]; | ||||||
|     x1i = a[j + 1] - a[j1 + 1]; |     x2r = a[j2 + 0] + a[j3 + 0]; | ||||||
|     x2r = a[j2] + a[j3]; |  | ||||||
|     x2i = a[j2 + 1] + a[j3 + 1]; |     x2i = a[j2 + 1] + a[j3 + 1]; | ||||||
|     x3r = a[j2] - a[j3]; |     x3r = a[j2 + 0] - a[j3 + 0]; | ||||||
|     x3i = a[j2 + 1] - a[j3 + 1]; |     x3i = a[j2 + 1] - a[j3 + 1]; | ||||||
|     a[j] = x0r + x2r; |     a[j0 + 0] = x0r + x2r; | ||||||
|     a[j + 1] = x0i + x2i; |     a[j0 + 1] = x0i + x2i; | ||||||
|     a[j2] = x0r - x2r; |     a[j2 + 0] = x0r - x2r; | ||||||
|     a[j2 + 1] = x0i - x2i; |     a[j2 + 1] = x0i - x2i; | ||||||
|     a[j1] = x1r - x3i; |     a[j1 + 0] = x1r - x3i; | ||||||
|     a[j1 + 1] = x1i + x3r; |     a[j1 + 1] = x1i + x3r; | ||||||
|     a[j3] = x1r + x3i; |     a[j3 + 0] = x1r + x3i; | ||||||
|     a[j3 + 1] = x1i - x3r; |     a[j3 + 1] = x1i - x3r; | ||||||
|   } |   } | ||||||
|   wk1r = rdft_w[2]; |   wk1r = rdft_w[2]; | ||||||
|   for (j = m; j < l + m; j += 2) { |   for (j0 = m; j0 < l + m; j0 += 2) { | ||||||
|     j1 = j + l; |     j1 = j0 +  8; | ||||||
|     j2 = j1 + l; |     j2 = j0 + 16; | ||||||
|     j3 = j2 + l; |     j3 = j0 + 24; | ||||||
|     x0r = a[j] + a[j1]; |     x0r = a[j0 + 0] + a[j1 + 0]; | ||||||
|     x0i = a[j + 1] + a[j1 + 1]; |     x0i = a[j0 + 1] + a[j1 + 1]; | ||||||
|     x1r = a[j] - a[j1]; |     x1r = a[j0 + 0] - a[j1 + 0]; | ||||||
|     x1i = a[j + 1] - a[j1 + 1]; |     x1i = a[j0 + 1] - a[j1 + 1]; | ||||||
|     x2r = a[j2] + a[j3]; |     x2r = a[j2 + 0] + a[j3 + 0]; | ||||||
|     x2i = a[j2 + 1] + a[j3 + 1]; |     x2i = a[j2 + 1] + a[j3 + 1]; | ||||||
|     x3r = a[j2] - a[j3]; |     x3r = a[j2 + 0] - a[j3 + 0]; | ||||||
|     x3i = a[j2 + 1] - a[j3 + 1]; |     x3i = a[j2 + 1] - a[j3 + 1]; | ||||||
|     a[j] = x0r + x2r; |     a[j0 + 0] = x0r + x2r; | ||||||
|     a[j + 1] = x0i + x2i; |     a[j0 + 1] = x0i + x2i; | ||||||
|     a[j2] = x2i - x0i; |     a[j2 + 0] = x2i - x0i; | ||||||
|     a[j2 + 1] = x0r - x2r; |     a[j2 + 1] = x0r - x2r; | ||||||
|     x0r = x1r - x3i; |     x0r = x1r - x3i; | ||||||
|     x0i = x1i + x3r; |     x0i = x1i + x3r; | ||||||
|     a[j1] = wk1r * (x0r - x0i); |     a[j1 + 0] = wk1r * (x0r - x0i); | ||||||
|     a[j1 + 1] = wk1r * (x0r + x0i); |     a[j1 + 1] = wk1r * (x0r + x0i); | ||||||
|     x0r = x3i + x1r; |     x0r = x3i + x1r; | ||||||
|     x0i = x3r - x1i; |     x0i = x3r - x1i; | ||||||
|     a[j3] = wk1r * (x0i - x0r); |     a[j3 + 0] = wk1r * (x0i - x0r); | ||||||
|     a[j3 + 1] = wk1r * (x0i + x0r); |     a[j3 + 1] = wk1r * (x0i + x0r); | ||||||
|   } |   } | ||||||
|   k1 = 0; |   k1 = 0; | ||||||
| @@ -365,68 +371,68 @@ static void cftmdl_128(int l, float *a) { | |||||||
|   for (k = m2; k < n; k += m2) { |   for (k = m2; k < n; k += m2) { | ||||||
|     k1 += 2; |     k1 += 2; | ||||||
|     k2 = 2 * k1; |     k2 = 2 * k1; | ||||||
|     wk2r = rdft_w[k1]; |     wk2r = rdft_w[k1 + 0]; | ||||||
|     wk2i = rdft_w[k1 + 1]; |     wk2i = rdft_w[k1 + 1]; | ||||||
|     wk1r = rdft_w[k2]; |     wk1r = rdft_w[k2 + 0]; | ||||||
|     wk1i = rdft_w[k2 + 1]; |     wk1i = rdft_w[k2 + 1]; | ||||||
|     wk3r = wk1r - 2 * wk2i * wk1i; |     wk3r = rdft_wk3ri_first[k1 + 0]; | ||||||
|     wk3i = 2 * wk2i * wk1r - wk1i; |     wk3i = rdft_wk3ri_first[k1 + 1]; | ||||||
|     for (j = k; j < l + k; j += 2) { |     for (j0 = k; j0 < l + k; j0 += 2) { | ||||||
|       j1 = j + l; |       j1 = j0 +  8; | ||||||
|       j2 = j1 + l; |       j2 = j0 + 16; | ||||||
|       j3 = j2 + l; |       j3 = j0 + 24; | ||||||
|       x0r = a[j] + a[j1]; |       x0r = a[j0 + 0] + a[j1 + 0]; | ||||||
|       x0i = a[j + 1] + a[j1 + 1]; |       x0i = a[j0 + 1] + a[j1 + 1]; | ||||||
|       x1r = a[j] - a[j1]; |       x1r = a[j0 + 0] - a[j1 + 0]; | ||||||
|       x1i = a[j + 1] - a[j1 + 1]; |       x1i = a[j0 + 1] - a[j1 + 1]; | ||||||
|       x2r = a[j2] + a[j3]; |       x2r = a[j2 + 0] + a[j3 + 0]; | ||||||
|       x2i = a[j2 + 1] + a[j3 + 1]; |       x2i = a[j2 + 1] + a[j3 + 1]; | ||||||
|       x3r = a[j2] - a[j3]; |       x3r = a[j2 + 0] - a[j3 + 0]; | ||||||
|       x3i = a[j2 + 1] - a[j3 + 1]; |       x3i = a[j2 + 1] - a[j3 + 1]; | ||||||
|       a[j] = x0r + x2r; |       a[j0 + 0] = x0r + x2r; | ||||||
|       a[j + 1] = x0i + x2i; |       a[j0 + 1] = x0i + x2i; | ||||||
|       x0r -= x2r; |       x0r -= x2r; | ||||||
|       x0i -= x2i; |       x0i -= x2i; | ||||||
|       a[j2] = wk2r * x0r - wk2i * x0i; |       a[j2 + 0] = wk2r * x0r - wk2i * x0i; | ||||||
|       a[j2 + 1] = wk2r * x0i + wk2i * x0r; |       a[j2 + 1] = wk2r * x0i + wk2i * x0r; | ||||||
|       x0r = x1r - x3i; |       x0r = x1r - x3i; | ||||||
|       x0i = x1i + x3r; |       x0i = x1i + x3r; | ||||||
|       a[j1] = wk1r * x0r - wk1i * x0i; |       a[j1 + 0] = wk1r * x0r - wk1i * x0i; | ||||||
|       a[j1 + 1] = wk1r * x0i + wk1i * x0r; |       a[j1 + 1] = wk1r * x0i + wk1i * x0r; | ||||||
|       x0r = x1r + x3i; |       x0r = x1r + x3i; | ||||||
|       x0i = x1i - x3r; |       x0i = x1i - x3r; | ||||||
|       a[j3] = wk3r * x0r - wk3i * x0i; |       a[j3 + 0] = wk3r * x0r - wk3i * x0i; | ||||||
|       a[j3 + 1] = wk3r * x0i + wk3i * x0r; |       a[j3 + 1] = wk3r * x0i + wk3i * x0r; | ||||||
|     } |     } | ||||||
|     wk1r = rdft_w[k2 + 2]; |     wk1r = rdft_w[k2 + 2]; | ||||||
|     wk1i = rdft_w[k2 + 3]; |     wk1i = rdft_w[k2 + 3]; | ||||||
|     wk3r = wk1r - 2 * wk2r * wk1i; |     wk3r = rdft_wk3ri_second[k1 + 0]; | ||||||
|     wk3i = 2 * wk2r * wk1r - wk1i; |     wk3i = rdft_wk3ri_second[k1 + 1]; | ||||||
|     for (j = k + m; j < l + (k + m); j += 2) { |     for (j0 = k + m; j0 < l + (k + m); j0 += 2) { | ||||||
|       j1 = j + l; |       j1 = j0 +  8; | ||||||
|       j2 = j1 + l; |       j2 = j0 + 16; | ||||||
|       j3 = j2 + l; |       j3 = j0 + 24; | ||||||
|       x0r = a[j] + a[j1]; |       x0r = a[j0 + 0] + a[j1 + 0]; | ||||||
|       x0i = a[j + 1] + a[j1 + 1]; |       x0i = a[j0 + 1] + a[j1 + 1]; | ||||||
|       x1r = a[j] - a[j1]; |       x1r = a[j0 + 0] - a[j1 + 0]; | ||||||
|       x1i = a[j + 1] - a[j1 + 1]; |       x1i = a[j0 + 1] - a[j1 + 1]; | ||||||
|       x2r = a[j2] + a[j3]; |       x2r = a[j2 + 0] + a[j3 + 0]; | ||||||
|       x2i = a[j2 + 1] + a[j3 + 1]; |       x2i = a[j2 + 1] + a[j3 + 1]; | ||||||
|       x3r = a[j2] - a[j3]; |       x3r = a[j2 + 0] - a[j3 + 0]; | ||||||
|       x3i = a[j2 + 1] - a[j3 + 1]; |       x3i = a[j2 + 1] - a[j3 + 1]; | ||||||
|       a[j] = x0r + x2r; |       a[j0 + 0] = x0r + x2r; | ||||||
|       a[j + 1] = x0i + x2i; |       a[j0 + 1] = x0i + x2i; | ||||||
|       x0r -= x2r; |       x0r -= x2r; | ||||||
|       x0i -= x2i; |       x0i -= x2i; | ||||||
|       a[j2] = -wk2i * x0r - wk2r * x0i; |       a[j2 + 0] = -wk2i * x0r - wk2r * x0i; | ||||||
|       a[j2 + 1] = -wk2i * x0i + wk2r * x0r; |       a[j2 + 1] = -wk2i * x0i + wk2r * x0r; | ||||||
|       x0r = x1r - x3i; |       x0r = x1r - x3i; | ||||||
|       x0i = x1i + x3r; |       x0i = x1i + x3r; | ||||||
|       a[j1] = wk1r * x0r - wk1i * x0i; |       a[j1 + 0] = wk1r * x0r - wk1i * x0i; | ||||||
|       a[j1 + 1] = wk1r * x0i + wk1i * x0r; |       a[j1 + 1] = wk1r * x0i + wk1i * x0r; | ||||||
|       x0r = x1r + x3i; |       x0r = x1r + x3i; | ||||||
|       x0i = x1i - x3r; |       x0i = x1i - x3r; | ||||||
|       a[j3] = wk3r * x0r - wk3i * x0i; |       a[j3 + 0] = wk3r * x0r - wk3i * x0i; | ||||||
|       a[j3 + 1] = wk3r * x0i + wk3i * x0r; |       a[j3 + 1] = wk3r * x0i + wk3i * x0r; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| @@ -437,7 +443,7 @@ static void cftfsub_128(float *a) { | |||||||
|   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; | ||||||
|  |  | ||||||
|   cft1st_128(a); |   cft1st_128(a); | ||||||
|   cftmdl_128(8, a); |   cftmdl_128(a); | ||||||
|   l = 32; |   l = 32; | ||||||
|   for (j = 0; j < l; j += 2) { |   for (j = 0; j < l; j += 2) { | ||||||
|     j1 = j + l; |     j1 = j + l; | ||||||
| @@ -467,7 +473,7 @@ static void cftbsub_128(float *a) { | |||||||
|   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; |   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; | ||||||
|  |  | ||||||
|   cft1st_128(a); |   cft1st_128(a); | ||||||
|   cftmdl_128(8, a); |   cftmdl_128(a); | ||||||
|   l = 32; |   l = 32; | ||||||
|  |  | ||||||
|   for (j = 0; j < l; j += 2) { |   for (j = 0; j < l; j += 2) { | ||||||
| @@ -565,11 +571,13 @@ void aec_rdft_inverse_128(float *a) { | |||||||
|  |  | ||||||
| // code path selection | // code path selection | ||||||
| rft_sub_128_t cft1st_128; | rft_sub_128_t cft1st_128; | ||||||
|  | rft_sub_128_t cftmdl_128; | ||||||
| rft_sub_128_t rftfsub_128; | rft_sub_128_t rftfsub_128; | ||||||
| rft_sub_128_t rftbsub_128; | rft_sub_128_t rftbsub_128; | ||||||
|  |  | ||||||
| void aec_rdft_init(void) { | void aec_rdft_init(void) { | ||||||
|   cft1st_128 = cft1st_128_C; |   cft1st_128 = cft1st_128_C; | ||||||
|  |   cftmdl_128 = cftmdl_128_C; | ||||||
|   rftfsub_128 = rftfsub_128_C; |   rftfsub_128 = rftfsub_128_C; | ||||||
|   rftbsub_128 = rftbsub_128_C; |   rftbsub_128 = rftbsub_128_C; | ||||||
|   if (WebRtc_GetCPUInfo(kSSE2)) { |   if (WebRtc_GetCPUInfo(kSSE2)) { | ||||||
|   | |||||||
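Two things account for the scalar speedup in the hunks above: cftmdl_128_C hard-codes l = 8 and m = 32, so j1/j2/j3 become fixed offsets (j0 + 8, j0 + 16, j0 + 24) instead of values recomputed from a runtime l, and the per-stage recurrences wk3r = wk1r - 2 * wk2i * wk1i and wk3i = 2 * wk2i * wk1r - wk1i are replaced by lookups into the rdft_wk3ri_first/second tables that makewt_32 already precomputes for cft1st_128. The self-contained sketch below only illustrates what such a table entry holds; the array names, sizes and twiddle values are made up for the example, and the real fill code lives in makewt_32, which this diff does not show.

#include <stdio.h>

/* Illustrative stand-ins for rdft_w and rdft_wk3ri_first (assumed layout). */
static float w[8];
static float wk3ri[8];

static void fill_wk3_first(void) {
  const int k1 = 2;                    /* only k1 == 2 is reached for n == 128 */
  const float wk2i = w[k1 + 1];
  const float wk1r = w[2 * k1 + 0];
  const float wk1i = w[2 * k1 + 1];
  /* Same identities the old cftmdl_128 evaluated inside its k-loop. */
  wk3ri[k1 + 0] = wk1r - 2.0f * wk2i * wk1i;   /* wk3r */
  wk3ri[k1 + 1] = 2.0f * wk2i * wk1r - wk1i;   /* wk3i */
}

int main(void) {
  /* Arbitrary example twiddles, just to exercise the formula. */
  w[3] = 0.25f;   /* wk2i */
  w[4] = 0.90f;   /* wk1r */
  w[5] = 0.40f;   /* wk1i */
  fill_wk3_first();
  printf("wk3r = %f, wk3i = %f\n", wk3ri[2], wk3ri[3]);
  return 0;
}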
| @@ -31,12 +31,14 @@ extern float rdft_wk3r[32]; | |||||||
| extern float rdft_wk1i[32]; | extern float rdft_wk1i[32]; | ||||||
| extern float rdft_wk2i[32]; | extern float rdft_wk2i[32]; | ||||||
| extern float rdft_wk3i[32]; | extern float rdft_wk3i[32]; | ||||||
|  | extern float cftmdl_wk1r[4]; | ||||||
|  |  | ||||||
| // code path selection function pointers | // code path selection function pointers | ||||||
| typedef void (*rft_sub_128_t)(float *a); | typedef void (*rft_sub_128_t)(float *a); | ||||||
| extern rft_sub_128_t rftfsub_128; | extern rft_sub_128_t rftfsub_128; | ||||||
| extern rft_sub_128_t rftbsub_128; | extern rft_sub_128_t rftbsub_128; | ||||||
| extern rft_sub_128_t cft1st_128; | extern rft_sub_128_t cft1st_128; | ||||||
|  | extern rft_sub_128_t cftmdl_128; | ||||||
|  |  | ||||||
| // entry points | // entry points | ||||||
| void aec_rdft_init(void); | void aec_rdft_init(void); | ||||||
|   | |||||||
| @@ -15,9 +15,10 @@ | |||||||
|  |  | ||||||
| #include "aec_rdft.h" | #include "aec_rdft.h" | ||||||
|  |  | ||||||
| static void cft1st_128_SSE2(float *a) { | static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = | ||||||
|   static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = |  | ||||||
|   {-1.f, 1.f, -1.f, 1.f}; |   {-1.f, 1.f, -1.f, 1.f}; | ||||||
|  |  | ||||||
|  | static void cft1st_128_SSE2(float *a) { | ||||||
|   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); |   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | ||||||
|   int j, k2; |   int j, k2; | ||||||
|  |  | ||||||
| @@ -74,6 +75,138 @@ static void cft1st_128_SSE2(float *a) { | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static void cftmdl_128_SSE2(float *a) { | ||||||
|  |   const int l = 8; | ||||||
|  |   const int m = 32; | ||||||
|  |   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); | ||||||
|  |   int j0, k, k1, k2, m2; | ||||||
|  |  | ||||||
|  |   __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); | ||||||
|  |   for (j0 = 0; j0 < l; j0 += 2) { | ||||||
|  |     const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 +  0]); | ||||||
|  |     const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 +  8]); | ||||||
|  |     const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | ||||||
|  |     const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | ||||||
|  |     const __m128  a_00_32 = _mm_shuffle_ps((__m128)a_00, (__m128)a_32, | ||||||
|  |                                            _MM_SHUFFLE(1, 0, 1 ,0)); | ||||||
|  |     const __m128  a_08_40 = _mm_shuffle_ps((__m128)a_08, (__m128)a_40, | ||||||
|  |                                            _MM_SHUFFLE(1, 0, 1 ,0)); | ||||||
|  |           __m128  x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | ||||||
|  |     const __m128  x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | ||||||
|  |  | ||||||
|  |     const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | ||||||
|  |     const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | ||||||
|  |     const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | ||||||
|  |     const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | ||||||
|  |     const __m128  a_16_48 = _mm_shuffle_ps((__m128)a_16, (__m128)a_48, | ||||||
|  |                                            _MM_SHUFFLE(1, 0, 1 ,0)); | ||||||
|  |     const __m128  a_24_56 = _mm_shuffle_ps((__m128)a_24, (__m128)a_56, | ||||||
|  |                                            _MM_SHUFFLE(1, 0, 1 ,0)); | ||||||
|  |     const __m128  x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | ||||||
|  |     const __m128  x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | ||||||
|  |  | ||||||
|  |     const __m128  xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 +  0], (__m128i)xx0); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 32], | ||||||
|  |                      _mm_shuffle_epi32((__m128i)xx0, _MM_SHUFFLE(3, 2, 3, 2))); | ||||||
|  |     const __m128  xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 16], (__m128i)xx1); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 48], | ||||||
|  |                      _mm_shuffle_epi32((__m128i)xx1, _MM_SHUFFLE(2, 3, 2, 3))); | ||||||
|  |     a[j0 + 48] = -a[j0 + 48]; | ||||||
|  |  | ||||||
|  |     const __m128  x3i0_3r0_3i1_x3r1 = (__m128) | ||||||
|  |         _mm_shuffle_epi32((__m128i)x3r0_3i0_3r1_x3i1, _MM_SHUFFLE(2, 3, 0, 1)); | ||||||
|  |     const __m128  x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | ||||||
|  |     const __m128  x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | ||||||
|  |     const __m128  x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 +  8], (__m128i)x1_x3_add); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 24], (__m128i)x1_x3_sub); | ||||||
|  |  | ||||||
|  |     const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, | ||||||
|  |                                       _MM_SHUFFLE(2, 2, 2 ,2)); | ||||||
|  |     const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, | ||||||
|  |                                       _MM_SHUFFLE(3, 3, 3 ,3)); | ||||||
|  |     const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1); | ||||||
|  |     const __m128 yy3 = _mm_add_ps(yy0, yy2); | ||||||
|  |     const __m128 yy4 = _mm_mul_ps(wk1rv, yy3); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 40], (__m128i)yy4); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 56], | ||||||
|  |                      _mm_shuffle_epi32((__m128i)yy4, _MM_SHUFFLE(2, 3, 2, 3))); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   k1 = 0; | ||||||
|  |   m2 = 2 * m; | ||||||
|  |   k = 64; | ||||||
|  |   k1 += 2; | ||||||
|  |   k2 = 2 * k1; | ||||||
|  |   const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]); | ||||||
|  |   const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]); | ||||||
|  |                wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]); | ||||||
|  |   const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]); | ||||||
|  |   const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]); | ||||||
|  |   const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]); | ||||||
|  |   for (j0 = k; j0 < l + k; j0 += 2) { | ||||||
|  |     const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 +  0]); | ||||||
|  |     const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 +  8]); | ||||||
|  |     const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); | ||||||
|  |     const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); | ||||||
|  |     const __m128 a_00_32 = _mm_shuffle_ps((__m128)a_00, (__m128)a_32, | ||||||
|  |                                           _MM_SHUFFLE(1, 0, 1 ,0)); | ||||||
|  |     const __m128 a_08_40 = _mm_shuffle_ps((__m128)a_08, (__m128)a_40, | ||||||
|  |                                           _MM_SHUFFLE(1, 0, 1 ,0)); | ||||||
|  |           __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); | ||||||
|  |     const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); | ||||||
|  |  | ||||||
|  |     const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); | ||||||
|  |     const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); | ||||||
|  |     const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); | ||||||
|  |     const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); | ||||||
|  |     const __m128 a_16_48 = _mm_shuffle_ps((__m128)a_16, (__m128)a_48, | ||||||
|  |                                           _MM_SHUFFLE(1, 0, 1 ,0)); | ||||||
|  |     const __m128 a_24_56 = _mm_shuffle_ps((__m128)a_24, (__m128)a_56, | ||||||
|  |                                           _MM_SHUFFLE(1, 0, 1 ,0)); | ||||||
|  |     const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); | ||||||
|  |     const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); | ||||||
|  |  | ||||||
|  |     const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 +  0], (__m128i)xx); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 32], | ||||||
|  |                        _mm_shuffle_epi32((__m128i)xx, _MM_SHUFFLE(3, 2, 3, 2))); | ||||||
|  |  | ||||||
|  |     const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); | ||||||
|  |     const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv); | ||||||
|  |     const __m128 xx3 = _mm_mul_ps(wk2iv, | ||||||
|  |               (__m128)_mm_shuffle_epi32((__m128i)xx1, _MM_SHUFFLE(2, 3, 0, 1))); | ||||||
|  |     const __m128 xx4 = _mm_add_ps(xx2, xx3); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 16], (__m128i)xx4); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 48], | ||||||
|  |                       _mm_shuffle_epi32((__m128i)xx4, _MM_SHUFFLE(3, 2, 3, 2))); | ||||||
|  |  | ||||||
|  |     const __m128  x3i0_3r0_3i1_x3r1 = (__m128) | ||||||
|  |          _mm_shuffle_epi32((__m128i)x3r0_3i0_3r1_x3i1, _MM_SHUFFLE(2, 3, 0, 1)); | ||||||
|  |     const __m128  x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); | ||||||
|  |     const __m128  x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | ||||||
|  |     const __m128  x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); | ||||||
|  |  | ||||||
|  |     const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); | ||||||
|  |     const __m128 xx11 = _mm_mul_ps(wk1iv, | ||||||
|  |         (__m128)_mm_shuffle_epi32((__m128i)x1_x3_add, _MM_SHUFFLE(2, 3, 0, 1))); | ||||||
|  |     const __m128 xx12 = _mm_add_ps(xx10, xx11); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 +  8], (__m128i)xx12); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 40], | ||||||
|  |                      _mm_shuffle_epi32((__m128i)xx12, _MM_SHUFFLE(3, 2, 3, 2))); | ||||||
|  |  | ||||||
|  |     const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); | ||||||
|  |     const __m128 xx21 = _mm_mul_ps(wk3iv, | ||||||
|  |         (__m128)_mm_shuffle_epi32((__m128i)x1_x3_sub, _MM_SHUFFLE(2, 3, 0, 1))); | ||||||
|  |     const __m128 xx22 = _mm_add_ps(xx20, xx21); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 24], (__m128i)xx22); | ||||||
|  |     _mm_storel_epi64((__m128i*)&a[j0 + 56], | ||||||
|  |                      _mm_shuffle_epi32((__m128i)xx22, _MM_SHUFFLE(3, 2, 3, 2))); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| static void rftfsub_128_SSE2(float *a) { | static void rftfsub_128_SSE2(float *a) { | ||||||
|   const float *c = rdft_w + 32; |   const float *c = rdft_w + 32; | ||||||
|   int j1, j2, k1, k2; |   int j1, j2, k1, k2; | ||||||
| @@ -221,7 +354,6 @@ static void rftbsub_128_SSE2(float *a) { | |||||||
|     const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120, |     const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120, | ||||||
|     const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121, |     const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121, | ||||||
|     // Shuffle in right order and store. |     // Shuffle in right order and store. | ||||||
|     // Shuffle in right order and store. |  | ||||||
|     const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); |     const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); | ||||||
|                                                        //   2,   3,   4,   5, |                                                        //   2,   3,   4,   5, | ||||||
|     const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); |     const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); | ||||||
| @@ -259,6 +391,7 @@ static void rftbsub_128_SSE2(float *a) { | |||||||
|  |  | ||||||
| void aec_rdft_init_sse2(void) { | void aec_rdft_init_sse2(void) { | ||||||
|   cft1st_128 = cft1st_128_SSE2; |   cft1st_128 = cft1st_128_SSE2; | ||||||
|  |   cftmdl_128 = cftmdl_128_SSE2; | ||||||
|   rftfsub_128 = rftfsub_128_SSE2; |   rftfsub_128 = rftfsub_128_SSE2; | ||||||
|   rftbsub_128 = rftbsub_128_SSE2; |   rftbsub_128 = rftbsub_128_SSE2; | ||||||
| } | } | ||||||
|   | |||||||
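The SSE2 version above gets most of its win by pairing up iterations that the C path runs in separate loops: for each j0 it packs the complex value at a[j0] together with the one 32 floats later into a single __m128 (via _mm_loadl_epi64 plus _mm_shuffle_ps), so each butterfly add/sub handles two complex elements at once, and results are written back in 64-bit halves with _mm_storel_epi64. Below is a self-contained sketch of just that packing step, with made-up array contents; it uses _mm_castsi128_ps where the diff relies on a compiler-specific (__m128) cast.

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  float a[64];
  int i;
  for (i = 0; i < 64; ++i) a[i] = (float)i;     /* dummy data */

  /* Load two complex floats (64 bits each): a[0..1] and a[32..33]. */
  const __m128i lo = _mm_loadl_epi64((const __m128i*)&a[0]);
  const __m128i hi = _mm_loadl_epi64((const __m128i*)&a[32]);

  /* Pack them into one register: { a[0], a[1], a[32], a[33] }. */
  const __m128 packed = _mm_shuffle_ps(_mm_castsi128_ps(lo),
                                       _mm_castsi128_ps(hi),
                                       _MM_SHUFFLE(1, 0, 1, 0));

  float out[4];
  _mm_storeu_ps(out, packed);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 0 1 32 33 */
  return 0;
}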
Author: cduvivier@google.com