Fix MSVC issues in AEC to enable SSE2 optimization on Windows.

Variables are now declared at the top of their scope, and C-style casts between vector types are replaced with the intrinsic cast functions. Review URL: http://webrtc-codereview.appspot.com/160001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@611 4adac7df-926f-26a2-2b94-8c16560cd09d
2011-09-16 22:51:57 +00:00 · 2011-09-16 22:51:57 +00:00 · 7b7c045b75
commit 7b7c045b75
parent b37ec71dbd
4 changed files with 138 additions and 108 deletions
--- a/src/modules/audio_processing/aec/main/source/aec_core_sse2.c
+++ b/src/modules/audio_processing/aec/main/source/aec_core_sse2.c
@ -235,10 +235,9 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
        {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
    static const int shift_exponent_into_top_mantissa = 8;
    const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask));
-    const __m128 n_1 = (__m128)_mm_srli_epi32((__m128i)two_n,
-        shift_exponent_into_top_mantissa);
-    const __m128 n_0 = _mm_or_ps(
-        (__m128)n_1, *((__m128 *)eight_biased_exponent));
+    const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(two_n),
+        shift_exponent_into_top_mantissa));
+    const __m128 n_0 = _mm_or_ps(n_1, *((__m128 *)eight_biased_exponent));
    const __m128 n   = _mm_sub_ps(n_0,  *((__m128 *)implicit_leading_one));

    // Compute y.
@ -317,8 +316,8 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
    static const int float_exponent_shift = 23;
    const __m128i two_n_exponent = _mm_add_epi32(
        x_minus_half_floor, *((__m128i *)float_exponent_bias));
-    const __m128  two_n = (__m128)_mm_slli_epi32(
-        two_n_exponent, float_exponent_shift);
+    const __m128  two_n = _mm_castsi128_ps(_mm_slli_epi32(
+        two_n_exponent, float_exponent_shift));
    // Compute y.
    const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
--- a/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c
+++ b/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c
@ -42,27 +42,33 @@ static void cft1st_128_SSE2(float *a) {
    const __m128 x1v    = _mm_sub_ps(a01v, a23v);
    const __m128 x2v    = _mm_add_ps(a45v, a67v);
    const __m128 x3v    = _mm_sub_ps(a45v, a67v);
+          __m128 x0w;
                 a01v   = _mm_add_ps(x0v, x2v);
                 x0v    = _mm_sub_ps(x0v, x2v);
-          __m128 x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
-
-    const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
-    const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
-                 a45v   = _mm_add_ps(a45_0v, a45_1v);
-
-    const __m128 x3w    = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0 ,1));
-    const __m128 x3s    = _mm_mul_ps(mm_swap_sign, x3w);
-                 x0v    = _mm_add_ps(x1v, x3s);
                 x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
-    const __m128 a23_0v = _mm_mul_ps(wk1rv, x0v);
-    const __m128 a23_1v = _mm_mul_ps(wk1iv, x0w);
-                 a23v   = _mm_add_ps(a23_0v, a23_1v);
+    {
+      const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
+      const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
+                   a45v   = _mm_add_ps(a45_0v, a45_1v);
+    }
+    {
+            __m128 a23_0v, a23_1v;
+      const __m128 x3w    = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0 ,1));
+      const __m128 x3s    = _mm_mul_ps(mm_swap_sign, x3w);
+                   x0v    = _mm_add_ps(x1v, x3s);
+                   x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
+                   a23_0v = _mm_mul_ps(wk1rv, x0v);
+                   a23_1v = _mm_mul_ps(wk1iv, x0w);
+                   a23v   = _mm_add_ps(a23_0v, a23_1v);

-                 x0v    = _mm_sub_ps(x1v, x3s);
-                 x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
-    const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
-    const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
-                 a67v   = _mm_add_ps(a67_0v, a67_1v);
+                   x0v    = _mm_sub_ps(x1v, x3s);
+                   x0w    = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
+    }
+    {
+      const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
+      const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
+                   a67v   = _mm_add_ps(a67_0v, a67_1v);
+    }

                 a00v   = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1 ,0));
                 a04v   = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1 ,0));
@ -78,7 +84,7 @@ static void cft1st_128_SSE2(float *a) {
 static void cftmdl_128_SSE2(float *a) {
  const int l = 8;
  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
-  int j0, k, k1, k2;
+  int j0;

  __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
  for (j0 = 0; j0 < l; j0 += 2) {
@ -86,9 +92,11 @@ static void cftmdl_128_SSE2(float *a) {
    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 +  8]);
    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
-    const __m128  a_00_32 = _mm_shuffle_ps((__m128)a_00, (__m128)a_32,
+    const __m128  a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
+                                           _mm_castsi128_ps(a_32),
                                           _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128  a_08_40 = _mm_shuffle_ps((__m128)a_08, (__m128)a_40,
+    const __m128  a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
+                                           _mm_castsi128_ps(a_40),
                                           _MM_SHUFFLE(1, 0, 1 ,0));
          __m128  x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
    const __m128  x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
@ -97,30 +105,24 @@ static void cftmdl_128_SSE2(float *a) {
    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
-    const __m128  a_16_48 = _mm_shuffle_ps((__m128)a_16, (__m128)a_48,
+    const __m128  a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
+                                           _mm_castsi128_ps(a_48),
                                           _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128  a_24_56 = _mm_shuffle_ps((__m128)a_24, (__m128)a_56,
+    const __m128  a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
+                                           _mm_castsi128_ps(a_56),
                                           _MM_SHUFFLE(1, 0, 1 ,0));
    const __m128  x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
    const __m128  x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

    const __m128  xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
-    _mm_storel_epi64((__m128i*)&a[j0 +  0], (__m128i)xx0);
-    _mm_storel_epi64((__m128i*)&a[j0 + 32],
-                     _mm_shuffle_epi32((__m128i)xx0, _MM_SHUFFLE(3, 2, 3, 2)));
    const __m128  xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
-    _mm_storel_epi64((__m128i*)&a[j0 + 16], (__m128i)xx1);
-    _mm_storel_epi64((__m128i*)&a[j0 + 48],
-                     _mm_shuffle_epi32((__m128i)xx1, _MM_SHUFFLE(2, 3, 2, 3)));
-    a[j0 + 48] = -a[j0 + 48];

-    const __m128  x3i0_3r0_3i1_x3r1 = (__m128)
-        _mm_shuffle_epi32((__m128i)x3r0_3i0_3r1_x3i1, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128  x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
+        _mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
+                          _MM_SHUFFLE(2, 3, 0, 1)));
    const __m128  x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
    const __m128  x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
    const __m128  x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
-    _mm_storel_epi64((__m128i*)&a[j0 +  8], (__m128i)x1_x3_add);
-    _mm_storel_epi64((__m128i*)&a[j0 + 24], (__m128i)x1_x3_sub);

    const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
                                      _MM_SHUFFLE(2, 2, 2 ,2));
@ -129,79 +131,111 @@ static void cftmdl_128_SSE2(float *a) {
    const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
    const __m128 yy3 = _mm_add_ps(yy0, yy2);
    const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
-    _mm_storel_epi64((__m128i*)&a[j0 + 40], (__m128i)yy4);
+
+    _mm_storel_epi64((__m128i*)&a[j0 +  0], _mm_castps_si128(xx0));
+    _mm_storel_epi64((__m128i*)&a[j0 + 32],
+                     _mm_shuffle_epi32(_mm_castps_si128(xx0),
+                                       _MM_SHUFFLE(3, 2, 3, 2)));
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
+    _mm_storel_epi64((__m128i*)&a[j0 + 48],
+                     _mm_shuffle_epi32(_mm_castps_si128(xx1),
+                                       _MM_SHUFFLE(2, 3, 2, 3)));
+    a[j0 + 48] = -a[j0 + 48];
+
+    _mm_storel_epi64((__m128i*)&a[j0 +  8], _mm_castps_si128(x1_x3_add));
+    _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
    _mm_storel_epi64((__m128i*)&a[j0 + 56],
-                     _mm_shuffle_epi32((__m128i)yy4, _MM_SHUFFLE(2, 3, 2, 3)));
+                     _mm_shuffle_epi32(_mm_castps_si128(yy4),
+                     _MM_SHUFFLE(2, 3, 2, 3)));
  }

-  k1 = 0;
-  k = 64;
-  k1 += 2;
-  k2 = 2 * k1;
-  const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]);
-  const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]);
-               wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]);
-  const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]);
-  const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]);
-  const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]);
-  for (j0 = k; j0 < l + k; j0 += 2) {
-    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 +  0]);
-    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 +  8]);
-    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
-    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
-    const __m128 a_00_32 = _mm_shuffle_ps((__m128)a_00, (__m128)a_32,
-                                          _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128 a_08_40 = _mm_shuffle_ps((__m128)a_08, (__m128)a_40,
-                                          _MM_SHUFFLE(1, 0, 1 ,0));
-          __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
-    const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
+  {
+    int k = 64;
+    int k1 = 2;
+    int k2 = 2 * k1;
+    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]);
+    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]);
+    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]);
+    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]);
+    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]);
+                 wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]);
+    for (j0 = k; j0 < l + k; j0 += 2) {
+      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 +  0]);
+      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 +  8]);
+      const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
+      const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
+      const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
+                                            _mm_castsi128_ps(a_32),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+      const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
+                                            _mm_castsi128_ps(a_40),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+            __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
+      const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

-    const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
-    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
-    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
-    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
-    const __m128 a_16_48 = _mm_shuffle_ps((__m128)a_16, (__m128)a_48,
-                                          _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128 a_24_56 = _mm_shuffle_ps((__m128)a_24, (__m128)a_56,
-                                          _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
-    const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
+      const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
+      const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
+      const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
+      const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
+      const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
+                                            _mm_castsi128_ps(a_48),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+      const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
+                                            _mm_castsi128_ps(a_56),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+      const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
+      const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

-    const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
-    _mm_storel_epi64((__m128i*)&a[j0 +  0], (__m128i)xx);
-    _mm_storel_epi64((__m128i*)&a[j0 + 32],
-                       _mm_shuffle_epi32((__m128i)xx, _MM_SHUFFLE(3, 2, 3, 2)));
+      const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv);
+      const __m128 xx3 = _mm_mul_ps(wk2iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
+                                             _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx4 = _mm_add_ps(xx2, xx3);

-    const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
-    const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv);
-    const __m128 xx3 = _mm_mul_ps(wk2iv,
-              (__m128)_mm_shuffle_epi32((__m128i)xx1, _MM_SHUFFLE(2, 3, 0, 1)));
-    const __m128 xx4 = _mm_add_ps(xx2, xx3);
-    _mm_storel_epi64((__m128i*)&a[j0 + 16], (__m128i)xx4);
-    _mm_storel_epi64((__m128i*)&a[j0 + 48],
-                      _mm_shuffle_epi32((__m128i)xx4, _MM_SHUFFLE(3, 2, 3, 2)));
+      const __m128  x3i0_3r0_3i1_x3r1 =  _mm_castsi128_ps(
+          _mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
+                            _MM_SHUFFLE(2, 3, 0, 1)));
+      const __m128  x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
+      const __m128  x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+      const __m128  x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

-    const __m128  x3i0_3r0_3i1_x3r1 = (__m128)
-         _mm_shuffle_epi32((__m128i)x3r0_3i0_3r1_x3i1, _MM_SHUFFLE(2, 3, 0, 1));
-    const __m128  x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
-    const __m128  x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
-    const __m128  x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+      const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
+      const __m128 xx11 = _mm_mul_ps(wk1iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
+                                             _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx12 = _mm_add_ps(xx10, xx11);

-    const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
-    const __m128 xx11 = _mm_mul_ps(wk1iv,
-        (__m128)_mm_shuffle_epi32((__m128i)x1_x3_add, _MM_SHUFFLE(2, 3, 0, 1)));
-    const __m128 xx12 = _mm_add_ps(xx10, xx11);
-    _mm_storel_epi64((__m128i*)&a[j0 +  8], (__m128i)xx12);
-    _mm_storel_epi64((__m128i*)&a[j0 + 40],
-                     _mm_shuffle_epi32((__m128i)xx12, _MM_SHUFFLE(3, 2, 3, 2)));
+      const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
+      const __m128 xx21 = _mm_mul_ps(wk3iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
+                           _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx22 = _mm_add_ps(xx20, xx21);

-    const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
-    const __m128 xx21 = _mm_mul_ps(wk3iv,
-        (__m128)_mm_shuffle_epi32((__m128i)x1_x3_sub, _MM_SHUFFLE(2, 3, 0, 1)));
-    const __m128 xx22 = _mm_add_ps(xx20, xx21);
-    _mm_storel_epi64((__m128i*)&a[j0 + 24], (__m128i)xx22);
-    _mm_storel_epi64((__m128i*)&a[j0 + 56],
-                     _mm_shuffle_epi32((__m128i)xx22, _MM_SHUFFLE(3, 2, 3, 2)));
+      _mm_storel_epi64((__m128i*)&a[j0 +  0], _mm_castps_si128(xx));
+      _mm_storel_epi64((__m128i*)&a[j0 + 32],
+                         _mm_shuffle_epi32(_mm_castps_si128(xx),
+                                           _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
+      _mm_storel_epi64((__m128i*)&a[j0 + 48],
+                        _mm_shuffle_epi32(_mm_castps_si128(xx4),
+                                          _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 +  8], _mm_castps_si128(xx12));
+      _mm_storel_epi64((__m128i*)&a[j0 + 40],
+                       _mm_shuffle_epi32(_mm_castps_si128(xx12),
+                                         _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
+      _mm_storel_epi64((__m128i*)&a[j0 + 56],
+                       _mm_shuffle_epi32(_mm_castps_si128(xx22),
+                                         _MM_SHUFFLE(3, 2, 3, 2)));
+    }
  }
 }

--- a/src/modules/video_processing/main/source/content_analysis.cc
+++ b/src/modules/video_processing/main/source/content_analysis.cc
@ -14,7 +14,7 @@
 #include <math.h>
 #include <stdlib.h>
 #if defined(WEBRTC_USE_SSE2)
-#include <xmmintrin.h>
+#include <emmintrin.h>
 #endif
 namespace webrtc {

--- a/src/typedefs.h
+++ b/src/typedefs.h
@ -80,10 +80,7 @@
 #error Please add support for your architecture in typedefs.h
 #endif

-// TODO(andrew): SSE2 is disabled on Windows for the moment, because AEC
-// optimization is broken. Enable it as soon as AEC is fixed.
-//#if defined(__SSE2__) || defined(_MSC_VER)
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_MSC_VER)
 #define WEBRTC_USE_SSE2
 #endif