Use -msse2 for SSE2 optimized code.

When targeting 32-bit Linux, we need to pass -msse2 to gcc to compile SSE2 intrinsics. However, -msse2 also gives gcc license to automatically generate SSE2 instructions wherever it pleases. This will crash our code on processors without SSE2 support. This change breaks the files with SSE2 intrinsics into separate targets, such that we can limit the scope of -msse2 to where it's needed. We no longer need to employ the WEBRTC_USE_SSE2 define; the build system decides when SSE2 is supported and compiles the appropriate files. TBR=bjornv@webrtc.org TEST=audioproc (performance testing), audioproc_unittest, video_processing_unittests, build on Linux (targeting ia32/x64, with disable_sse2==0/1), Mac, Windows Review URL: http://webrtc-codereview.appspot.com/352008 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1425 4adac7df-926f-26a2-2b94-8c16560cd09d
2012-01-13 19:43:09 +00:00
parent ee3fe5b982
commit c8d012fb32
16 changed files with 382 additions and 335 deletions
--- a/android-webrtc.mk
+++ b/android-webrtc.mk
@@ -42,8 +42,3 @@ ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
 MY_WEBRTC_COMMON_DEFS += \
    '-DWEBRTC_ARCH_ARM_V7A'
 endif
-
-else ifeq ($(TARGET_ARCH),x86)
-MY_WEBRTC_COMMON_DEFS += \
-    '-DWEBRTC_USE_SSE2'
-endif
--- a/src/modules/audio_processing/aec/Android.mk
+++ b/src/modules/audio_processing/aec/Android.mk
@@ -20,9 +20,12 @@ LOCAL_SRC_FILES := \
    aec_resampler.c \
    aec_core.c \
    aec_rdft.c \
+
+ifeq ($(TARGET_ARCH),x86)
+LOCAL_SRC_FILES += \
    aec_core_sse2.c \
    aec_rdft_sse2.c
-
+endif

 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
--- a/src/modules/audio_processing/aec/aec.gypi
+++ b/src/modules/audio_processing/aec/aec.gypi
@@ -16,8 +16,8 @@
        'aec_debug_dump%': 0,
      },
      'dependencies': [
+        'apm_util',
        '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
-        'apm_util'
      ],
      'include_dirs': [
        'interface',
@@ -32,18 +32,37 @@
        'echo_cancellation.c',
        'aec_core.h',
        'aec_core.c',
-        'aec_core_sse2.c',
        'aec_rdft.h',
        'aec_rdft.c',
-        'aec_rdft_sse2.c',
        'aec_resampler.h',
        'aec_resampler.c',
      ],
      'conditions': [
+        ['target_arch=="ia32" or target_arch=="x64"', {
+          'dependencies': [ 'aec_sse2', ],
+        }],
        ['aec_debug_dump==1', {
          'defines': [ 'WEBRTC_AEC_DEBUG_DUMP', ],
        }],
      ],
    },
+    {
+      'target_name': 'aec_sse2',  # Only these sources are built with -msse2.
+      'type': '<(library)',
+      'sources': [
+        'aec_core_sse2.c',
+        'aec_rdft_sse2.c',
+      ],
+      'conditions': [
+        ['os_posix==1 and OS!="mac"', {
+          'cflags': [ '-msse2', ],
+        }],
+        ['OS=="mac"', {
+          'xcode_settings': {
+            'OTHER_CFLAGS': [ '-msse2', ],  # Mac gets the flag via Xcode.
+          },
+        }],
+      ],
+    },
  ],
 }
--- a/src/modules/audio_processing/aec/aec_core.c
+++ b/src/modules/audio_processing/aec/aec_core.c
@@ -21,6 +21,7 @@
 #include <string.h>

 #include "aec_rdft.h"
+#include "common_audio/signal_processing/include/signal_processing_library.h"
 #include "delay_estimator_wrapper.h"
 #include "ring_buffer.h"
 #include "system_wrappers/interface/cpu_features_wrapper.h"
@@ -516,11 +517,13 @@ int WebRtcAec_InitAec(aec_t *aec, int sampFreq)
    WebRtcAec_ScaleErrorSignal = ScaleErrorSignal;
    WebRtcAec_FilterAdaptation = FilterAdaptation;
    WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress;
+
+#if defined(WEBRTC_ARCH_X86_FAMILY)
    if (WebRtc_GetCPUInfo(kSSE2)) {
-#if defined(WEBRTC_USE_SSE2)
      WebRtcAec_InitAec_SSE2();
-#endif
    }
+#endif
+
    aec_rdft_init();

    return 0;
--- a/src/modules/audio_processing/aec/aec_core.h
+++ b/src/modules/audio_processing/aec/aec_core.h
@@ -15,9 +15,10 @@
 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
 #define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_

+#ifdef WEBRTC_AEC_DEBUG_DUMP
 #include <stdio.h>
+#endif

-#include "signal_processing_library.h"
 #include "typedefs.h"

 #define FRAME_LEN 80
--- a/src/modules/audio_processing/aec/aec_core_sse2.c
+++ b/src/modules/audio_processing/aec/aec_core_sse2.c
@@ -12,13 +12,12 @@
 * The core AEC algorithm, SSE2 version of speed-critical functions.
 */

-#include "typedefs.h"
+#include "aec_core.h"

-#if defined(WEBRTC_USE_SSE2)
 #include <emmintrin.h>
 #include <math.h>
+#include <string.h>  // memset

-#include "aec_core.h"
 #include "aec_rdft.h"

 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm)
@@ -414,4 +413,3 @@ void WebRtcAec_InitAec_SSE2(void) {
  WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
 }

-#endif   // WEBRTC_USE_SSE2
--- a/src/modules/audio_processing/aec/aec_rdft.c
+++ b/src/modules/audio_processing/aec/aec_rdft.c
@@ -576,11 +576,11 @@ void aec_rdft_init(void) {
  cftmdl_128 = cftmdl_128_C;
  rftfsub_128 = rftfsub_128_C;
  rftbsub_128 = rftbsub_128_C;
+#if defined(WEBRTC_ARCH_X86_FAMILY)
  if (WebRtc_GetCPUInfo(kSSE2)) {
-#if defined(WEBRTC_USE_SSE2)
    aec_rdft_init_sse2();
-#endif
  }
+#endif
  // init library constants.
  makewt_32();
  makect_32();
--- a/src/modules/audio_processing/aec/aec_rdft_sse2.c
+++ b/src/modules/audio_processing/aec/aec_rdft_sse2.c
@@ -8,13 +8,10 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "typedefs.h"
-
-#if defined(WEBRTC_USE_SSE2)
-#include <emmintrin.h>
-
 #include "aec_rdft.h"

+#include <emmintrin.h>
+
 static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] =
  {-1.f, 1.f, -1.f, 1.f};

@@ -428,4 +425,3 @@ void aec_rdft_init_sse2(void) {
  rftbsub_128 = rftbsub_128_SSE2;
 }

-#endif  // WEBRTC_USE_SS2
--- a/src/modules/audio_processing/aec/echo_cancellation.c
+++ b/src/modules/audio_processing/aec/echo_cancellation.c
@@ -22,6 +22,7 @@

 #include "aec_core.h"
 #include "aec_resampler.h"
+#include "common_audio/signal_processing/include/signal_processing_library.h"
 #include "ring_buffer.h"
 #include "typedefs.h"

--- a/src/modules/video_processing/main/source/Android.mk
+++ b/src/modules/video_processing/main/source/Android.mk
@@ -18,7 +18,6 @@ LOCAL_MODULE := libwebrtc_video_processing
 LOCAL_MODULE_TAGS := optional
 LOCAL_CPP_EXTENSION := .cc
 LOCAL_SRC_FILES := \
-    video_processing_impl.cc \
    brightness_detection.cc \
    color_enhancement.cc \
    content_analysis.cc \
@@ -27,6 +26,12 @@ LOCAL_SRC_FILES := \
    frame_preprocessor.cc \
    spatial_resampler.cc \
    video_decimator.cc
+    video_processing_impl.cc \
+
+ifeq ($(TARGET_ARCH),x86)
+LOCAL_SRC_FILES += \
+    content_analysis_sse2.cc
+endif

 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
--- a/src/modules/video_processing/main/source/content_analysis.cc
+++ b/src/modules/video_processing/main/source/content_analysis.cc
@@ -13,12 +13,10 @@

 #include <math.h>
 #include <stdlib.h>
-#if defined(WEBRTC_USE_SSE2)
-#include <emmintrin.h>
-#endif
+
 namespace webrtc {

-VPMContentAnalysis::VPMContentAnalysis(bool RTCD):
+VPMContentAnalysis::VPMContentAnalysis(bool runtime_cpu_detection):
 _origFrame(NULL),
 _prevFrame(NULL),
 _width(0),
@@ -40,16 +38,16 @@ _cMetrics(NULL)
    ComputeSpatialMetrics = &VPMContentAnalysis::ComputeSpatialMetrics_C;
    TemporalDiffMetric = &VPMContentAnalysis::TemporalDiffMetric_C;

-    if (RTCD)
+    if (runtime_cpu_detection)
    {
-        if(WebRtc_GetCPUInfo(kSSE2))
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+        if (WebRtc_GetCPUInfo(kSSE2))
        {
-#if defined(WEBRTC_USE_SSE2)
            ComputeSpatialMetrics =
                          &VPMContentAnalysis::ComputeSpatialMetrics_SSE2;
            TemporalDiffMetric = &VPMContentAnalysis::TemporalDiffMetric_SSE2;
-#endif
        }
+#endif
    }

    Release();
@@ -249,110 +247,6 @@ VPMContentAnalysis::TemporalDiffMetric_C()

 }

-#if defined(WEBRTC_USE_SSE2)
-WebRtc_Word32
-VPMContentAnalysis::TemporalDiffMetric_SSE2()
-{
-    WebRtc_UWord32 numPixels = 0;       // counter for # of pixels
-
-    const WebRtc_UWord8* imgBufO = _origFrame + _border*_width + _border;
-    const WebRtc_UWord8* imgBufP = _prevFrame + _border*_width + _border;
-
-    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
-
-    __m128i sad_64   = _mm_setzero_si128();
-    __m128i sum_64   = _mm_setzero_si128();
-    __m128i sqsum_64 = _mm_setzero_si128();
-    const __m128i z  = _mm_setzero_si128();
-
-    for(WebRtc_UWord16 i = 0; i < (_height - 2*_border); i += _skipNum)
-    {
-        __m128i sqsum_32  = _mm_setzero_si128();
-
-        const WebRtc_UWord8 *lineO = imgBufO;
-        const WebRtc_UWord8 *lineP = imgBufP;
-
-        // Work on 16 pixels at a time.  For HD content with a width of 1920
-        // this loop will run ~67 times (depending on border).  Maximum for
-        // abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit
-        // results which are then accumulated.  There is no chance of
-        // rollover for these two accumulators.
-        // o*o will have a maximum of 255*255 = 65025.  This will roll over
-        // a 16 bit accumulator as 67*65025 > 65535, but will fit in a
-        // 32 bit accumulator.
-        for(WebRtc_UWord16 j = 0; j < width_end - _border; j += 16)
-        {
-            const __m128i o = _mm_loadu_si128((__m128i*)(lineO));
-            const __m128i p = _mm_loadu_si128((__m128i*)(lineP));
-
-            lineO += 16;
-            lineP += 16;
-
-            // abs pixel difference between frames
-            sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p));
-
-            // sum of all pixels in frame
-            sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z));
-
-            // squared sum of all pixels in frame
-            const __m128i olo = _mm_unpacklo_epi8(o,z);
-            const __m128i ohi = _mm_unpackhi_epi8(o,z);
-
-            const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo);
-            const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi);
-
-            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo);
-            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi);
-        }
-
-        // Add to 64 bit running sum as to not roll over.
-        sqsum_64 = _mm_add_epi64(sqsum_64,
-                                _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z),
-                                              _mm_unpacklo_epi32(sqsum_32,z)));
-
-        imgBufO += _width * _skipNum;
-        imgBufP += _width * _skipNum;
-        numPixels += (width_end - _border);
-    }
-
-    WebRtc_Word64 sad_final_64[2];
-    WebRtc_Word64 sum_final_64[2];
-    WebRtc_Word64 sqsum_final_64[2];
-
-    // bring sums out of vector registers and into integer register
-    // domain, summing them along the way
-    _mm_store_si128 ((__m128i*)sad_final_64, sad_64);
-    _mm_store_si128 ((__m128i*)sum_final_64, sum_64);
-    _mm_store_si128 ((__m128i*)sqsum_final_64, sqsum_64);
-
-    const WebRtc_UWord32 pixelSum = sum_final_64[0] + sum_final_64[1];
-    const WebRtc_UWord64 pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1];
-    const WebRtc_UWord32 tempDiffSum = sad_final_64[0] + sad_final_64[1];
-
-    // default
-    _motionMagnitudeNZ = 0.0f;
-
-    if (tempDiffSum == 0)
-    {
-        return VPM_OK;
-    }
-
-    // normalize over all pixels
-    const float tempDiffAvg = (float)tempDiffSum / (float)(numPixels);
-    const float pixelSumAvg = (float)pixelSum / (float)(numPixels);
-    const float pixelSqSumAvg = (float)pixelSqSum / (float)(numPixels);
-    float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg);
-
-    if (contrast > 0.0)
-    {
-        contrast = sqrt(contrast);
-       _motionMagnitudeNZ = tempDiffAvg/contrast;
-    }
-
-    return VPM_OK;
-}
-#endif
-
 // Compute spatial metrics:
 // To reduce complexity, we compute the metric for a reduced set of points.
 // The spatial metrics are rough estimates of the prediction error cost for
@@ -427,172 +321,6 @@ VPMContentAnalysis::ComputeSpatialMetrics_C()
    return VPM_OK;
 }

-#if defined(WEBRTC_USE_SSE2)
-WebRtc_Word32
-VPMContentAnalysis::ComputeSpatialMetrics_SSE2()
-{
-    const WebRtc_UWord8* imgBuf = _origFrame + _border*_width;
-    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
-
-    __m128i se_32  = _mm_setzero_si128();
-    __m128i sev_32 = _mm_setzero_si128();
-    __m128i seh_32 = _mm_setzero_si128();
-    __m128i msa_32 = _mm_setzero_si128();
-    const __m128i z = _mm_setzero_si128();
-
-    // Error is accumulated as a 32 bit value.  Looking at HD content with a
-    // height of 1080 lines, or about 67 macro blocks.  If the 16 bit row
-    // value is maxed out at 65529 for every row, 65529*1080 = 70777800, which
-    // will not roll over a 32 bit accumulator.
-    // _skipNum is also used to reduce the number of rows
-    for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum)
-    {
-        __m128i se_16  = _mm_setzero_si128();
-        __m128i sev_16 = _mm_setzero_si128();
-        __m128i seh_16 = _mm_setzero_si128();
-        __m128i msa_16 = _mm_setzero_si128();
-
-        // Row error is accumulated as a 16 bit value.  There are 8
-        // accumulators.  Max value of a 16 bit number is 65529.  Looking
-        // at HD content, 1080p, has a width of 1920, 120 macro blocks.
-        // A mb at a time is processed at a time.  Absolute max error at
-        // a point would be abs(0-255+255+255+255) which equals 1020.
-        // 120*1020 = 122400.  The probability of hitting this is quite low
-        // on well behaved content.  A specially crafted image could roll over.
-        // _border could also be adjusted to concentrate on just the center of
-        // the images for an HD capture in order to reduce the possiblity of
-        // rollover.
-        const WebRtc_UWord8 *lineTop = imgBuf - _width + _border;
-        const WebRtc_UWord8 *lineCen = imgBuf + _border;
-        const WebRtc_UWord8 *lineBot = imgBuf + _width + _border;
-
-        for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16)
-        {
-            const __m128i t = _mm_loadu_si128((__m128i*)(lineTop));
-            const __m128i l = _mm_loadu_si128((__m128i*)(lineCen - 1));
-            const __m128i c = _mm_loadu_si128((__m128i*)(lineCen));
-            const __m128i r = _mm_loadu_si128((__m128i*)(lineCen + 1));
-            const __m128i b = _mm_loadu_si128((__m128i*)(lineBot));
-
-            lineTop += 16;
-            lineCen += 16;
-            lineBot += 16;
-
-            // center pixel unpacked
-            __m128i clo = _mm_unpacklo_epi8(c,z);
-            __m128i chi = _mm_unpackhi_epi8(c,z);
-
-            // left right pixels unpacked and added together
-            const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z),
-                                               _mm_unpacklo_epi8(r,z));
-            const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z),
-                                               _mm_unpackhi_epi8(r,z));
-
-            // top & bottom pixels unpacked and added together
-            const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z),
-                                               _mm_unpacklo_epi8(b,z));
-            const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z),
-                                               _mm_unpackhi_epi8(b,z));
-
-            // running sum of all pixels
-            msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo));
-
-            clo = _mm_slli_epi16(clo, 1);
-            chi = _mm_slli_epi16(chi, 1);
-            const __m128i sevtlo = _mm_subs_epi16(clo, tblo);
-            const __m128i sevthi = _mm_subs_epi16(chi, tbhi);
-            const __m128i sehtlo = _mm_subs_epi16(clo, lrlo);
-            const __m128i sehthi = _mm_subs_epi16(chi, lrhi);
-
-            clo = _mm_slli_epi16(clo, 1);
-            chi = _mm_slli_epi16(chi, 1);
-            const __m128i setlo = _mm_subs_epi16(clo,
-                                                 _mm_add_epi16(lrlo, tblo));
-            const __m128i sethi = _mm_subs_epi16(chi,
-                                                 _mm_add_epi16(lrhi, tbhi));
-
-            // Add to 16 bit running sum
-            se_16  = _mm_add_epi16(se_16,
-                                   _mm_max_epi16(setlo,
-                                                 _mm_subs_epi16(z, setlo)));
-            se_16  = _mm_add_epi16(se_16,
-                                   _mm_max_epi16(sethi,
-                                                 _mm_subs_epi16(z, sethi)));
-            sev_16 = _mm_add_epi16(sev_16,
-                                   _mm_max_epi16(sevtlo,
-                                                 _mm_subs_epi16(z, sevtlo)));
-            sev_16 = _mm_add_epi16(sev_16,
-                                   _mm_max_epi16(sevthi,
-                                                 _mm_subs_epi16(z, sevthi)));
-            seh_16 = _mm_add_epi16(seh_16,
-                                   _mm_max_epi16(sehtlo,
-                                                 _mm_subs_epi16(z, sehtlo)));
-            seh_16 = _mm_add_epi16(seh_16,
-                                   _mm_max_epi16(sehthi,
-                                                 _mm_subs_epi16(z, sehthi)));
-        }
-
-        // Add to 32 bit running sum as to not roll over.
-        se_32  = _mm_add_epi32(se_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(se_16,z),
-                                             _mm_unpacklo_epi16(se_16,z)));
-        sev_32 = _mm_add_epi32(sev_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(sev_16,z),
-                                             _mm_unpacklo_epi16(sev_16,z)));
-        seh_32 = _mm_add_epi32(seh_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(seh_16,z),
-                                             _mm_unpacklo_epi16(seh_16,z)));
-        msa_32 = _mm_add_epi32(msa_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(msa_16,z),
-                                             _mm_unpacklo_epi16(msa_16,z)));
-
-        imgBuf += _width * _skipNum;
-    }
-
-    WebRtc_Word64 se_64[2];
-    WebRtc_Word64 sev_64[2];
-    WebRtc_Word64 seh_64[2];
-    WebRtc_Word64 msa_64[2];
-
-    // bring sums out of vector registers and into integer register
-    // domain, summing them along the way
-    _mm_store_si128 ((__m128i*)se_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(se_32,z),
-                                   _mm_unpacklo_epi32(se_32,z)));
-    _mm_store_si128 ((__m128i*)sev_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(sev_32,z),
-                                   _mm_unpacklo_epi32(sev_32,z)));
-    _mm_store_si128 ((__m128i*)seh_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(seh_32,z),
-                                   _mm_unpacklo_epi32(seh_32,z)));
-    _mm_store_si128 ((__m128i*)msa_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(msa_32,z),
-                                   _mm_unpacklo_epi32(msa_32,z)));
-
-    const WebRtc_UWord32 spatialErrSum  = se_64[0] + se_64[1];
-    const WebRtc_UWord32 spatialErrVSum = sev_64[0] + sev_64[1];
-    const WebRtc_UWord32 spatialErrHSum = seh_64[0] + seh_64[1];
-    const WebRtc_UWord32 pixelMSA = msa_64[0] + msa_64[1];
-
-    // normalize over all pixels
-    const float spatialErr  = (float)(spatialErrSum >> 2);
-    const float spatialErrH = (float)(spatialErrHSum >> 1);
-    const float spatialErrV = (float)(spatialErrVSum >> 1);
-    const float norm = (float)pixelMSA;
-
-    // 2X2:
-    _spatialPredErr = spatialErr / norm;
-
-    // 1X2:
-    _spatialPredErrH = spatialErrH / norm;
-
-    // 2X1:
-    _spatialPredErrV = spatialErrV / norm;
-
-    return VPM_OK;
-}
-#endif // #if defined(WEBRTC_USE_SSE2)
-
 VideoContentMetrics*
 VPMContentAnalysis::ContentMetrics()
 {
--- a/src/modules/video_processing/main/source/content_analysis.h
+++ b/src/modules/video_processing/main/source/content_analysis.h
@@ -8,10 +8,6 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-/*
- * content_analysis.h
- */
-
 #ifndef VPM_CONTENT_ANALYSIS_H
 #define VPM_CONTENT_ANALYSIS_H

@@ -24,7 +20,9 @@ namespace webrtc {
 class VPMContentAnalysis
 {
 public:
-    VPMContentAnalysis(bool RTCD = true);
+    // When |runtime_cpu_detection| is true, runtime selection of an optimized
+    // code path is allowed.
+    VPMContentAnalysis(bool runtime_cpu_detection);
    ~VPMContentAnalysis();

    // Initialize ContentAnalysis - should be called prior to
@@ -62,7 +60,7 @@ private:
    ComputeSpatialMetricsFunc ComputeSpatialMetrics;
    WebRtc_Word32 ComputeSpatialMetrics_C();

-#if defined(WEBRTC_USE_SSE2)
+#if defined(WEBRTC_ARCH_X86_FAMILY)
    WebRtc_Word32 ComputeSpatialMetrics_SSE2();
    WebRtc_Word32 TemporalDiffMetric_SSE2();
 #endif
--- a/src/modules/video_processing/main/source/content_analysis_sse2.cc
+++ b/src/modules/video_processing/main/source/content_analysis_sse2.cc
@@ -0,0 +1,284 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "content_analysis.h"
+
+#include <emmintrin.h>
+#include <math.h>
+
+namespace webrtc {
+
+// SSE2 path: sets _motionMagnitudeNZ = mean |orig - prev| / frame contrast.
+WebRtc_Word32 VPMContentAnalysis::TemporalDiffMetric_SSE2()
+{
+    WebRtc_UWord32 numPixels = 0;       // counter for # of pixels
+
+    const WebRtc_UWord8* imgBufO = _origFrame + _border*_width + _border;
+    const WebRtc_UWord8* imgBufP = _prevFrame + _border*_width + _border;
+
+    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
+
+    __m128i sad_64   = _mm_setzero_si128();
+    __m128i sum_64   = _mm_setzero_si128();
+    __m128i sqsum_64 = _mm_setzero_si128();
+    const __m128i z  = _mm_setzero_si128();
+
+    for(WebRtc_UWord16 i = 0; i < (_height - 2*_border); i += _skipNum)
+    {
+        __m128i sqsum_32  = _mm_setzero_si128();
+
+        const WebRtc_UWord8 *lineO = imgBufO;
+        const WebRtc_UWord8 *lineP = imgBufP;
+
+        // Work on 16 pixels at a time.  For HD content with a width of 1920
+        // this loop will run at most 120 times (fewer with a border).
+        // abs(o-p) and o are at most 255 per pixel; _mm_sad_epu8 produces
+        // two partial sums which are accumulated in 64 bit lanes, so there
+        // is no chance of rollover for these two accumulators.
+        // o*o is at most 255*255 = 65025.  Each 32 bit lane of sqsum_32
+        // gains at most 4*65025 per iteration, i.e. 120*4*65025 = 31212000
+        // per row, which fits in a 32 bit accumulator.
+        for(WebRtc_UWord16 j = 0; j < width_end - _border; j += 16)
+        {
+            const __m128i o = _mm_loadu_si128((__m128i*)(lineO));
+            const __m128i p = _mm_loadu_si128((__m128i*)(lineP));
+
+            lineO += 16;
+            lineP += 16;
+
+            // abs pixel difference between frames
+            sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p));
+
+            // sum of all pixels in frame
+            sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z));
+
+            // squared sum of all pixels in frame
+            const __m128i olo = _mm_unpacklo_epi8(o,z);
+            const __m128i ohi = _mm_unpackhi_epi8(o,z);
+
+            const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo);
+            const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi);
+
+            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo);
+            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi);
+        }
+
+        // Add to 64 bit running sum as to not roll over.
+        sqsum_64 = _mm_add_epi64(sqsum_64,
+                                _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z),
+                                              _mm_unpacklo_epi32(sqsum_32,z)));
+
+        imgBufO += _width * _skipNum;
+        imgBufP += _width * _skipNum;
+        numPixels += (width_end - _border);
+    }
+
+    WebRtc_Word64 sad_final_64[2];
+    WebRtc_Word64 sum_final_64[2];
+    WebRtc_Word64 sqsum_final_64[2];
+
+    // Bring sums out of vector registers and into the integer domain.  The
+    // arrays are not guaranteed 16-byte aligned, so use unaligned stores.
+    _mm_storeu_si128 ((__m128i*)sad_final_64, sad_64);
+    _mm_storeu_si128 ((__m128i*)sum_final_64, sum_64);
+    _mm_storeu_si128 ((__m128i*)sqsum_final_64, sqsum_64);
+
+    const WebRtc_UWord32 pixelSum = sum_final_64[0] + sum_final_64[1];
+    const WebRtc_UWord64 pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1];
+    const WebRtc_UWord32 tempDiffSum = sad_final_64[0] + sad_final_64[1];
+
+    // default
+    _motionMagnitudeNZ = 0.0f;
+
+    if (tempDiffSum == 0)
+    {
+        return VPM_OK;
+    }
+
+    // normalize over all pixels
+    const float tempDiffAvg = (float)tempDiffSum / (float)(numPixels);
+    const float pixelSumAvg = (float)pixelSum / (float)(numPixels);
+    const float pixelSqSumAvg = (float)pixelSqSum / (float)(numPixels);
+    float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg);
+
+    if (contrast > 0.0)
+    {
+        contrast = sqrt(contrast);
+        _motionMagnitudeNZ = tempDiffAvg/contrast;
+    }
+
+    return VPM_OK;
+}
+
+// SSE2 path: sets _spatialPredErr/H/V, normalized by the sampled pixel sum.
+WebRtc_Word32 VPMContentAnalysis::ComputeSpatialMetrics_SSE2()
+{
+    const WebRtc_UWord8* imgBuf = _origFrame + _border*_width;
+    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
+
+    __m128i se_32  = _mm_setzero_si128();
+    __m128i sev_32 = _mm_setzero_si128();
+    __m128i seh_32 = _mm_setzero_si128();
+    __m128i msa_32 = _mm_setzero_si128();
+    const __m128i z = _mm_setzero_si128();
+
+    // Error is accumulated as a 32 bit value.  Looking at HD content with a
+    // height of 1080 lines, or about 67 macro blocks.  If the 16 bit row
+    // value is maxed out at 65535 for every row, 65535*1080 = 70777800, which
+    // will not roll over a 32 bit accumulator.
+    // _skipNum is also used to reduce the number of rows
+    for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum)
+    {
+        __m128i se_16  = _mm_setzero_si128();
+        __m128i sev_16 = _mm_setzero_si128();
+        __m128i seh_16 = _mm_setzero_si128();
+        __m128i msa_16 = _mm_setzero_si128();
+
+        // Row error is accumulated as a 16 bit value.  There are 8
+        // accumulators.  Max value of a 16 bit number is 65535.  Looking
+        // at HD content, 1080p, has a width of 1920, 120 macro blocks.
+        // One macro block (16 pixels) is processed per iteration.  Absolute
+        // max error at a point would be abs(4*0-(255+255+255+255)) which
+        // equals 1020.  120*1020 = 122400.  The probability of hitting this
+        // is quite low on well behaved content.  A specially crafted image
+        // could roll over.  _border could also be adjusted to concentrate on
+        // just the center of the images for an HD capture in order to reduce
+        // the possibility of rollover.
+        const WebRtc_UWord8 *lineTop = imgBuf - _width + _border;
+        const WebRtc_UWord8 *lineCen = imgBuf + _border;
+        const WebRtc_UWord8 *lineBot = imgBuf + _width + _border;
+
+        for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16)
+        {
+            const __m128i t = _mm_loadu_si128((__m128i*)(lineTop));
+            const __m128i l = _mm_loadu_si128((__m128i*)(lineCen - 1));
+            const __m128i c = _mm_loadu_si128((__m128i*)(lineCen));
+            const __m128i r = _mm_loadu_si128((__m128i*)(lineCen + 1));
+            const __m128i b = _mm_loadu_si128((__m128i*)(lineBot));
+
+            lineTop += 16;
+            lineCen += 16;
+            lineBot += 16;
+
+            // center pixel unpacked
+            __m128i clo = _mm_unpacklo_epi8(c,z);
+            __m128i chi = _mm_unpackhi_epi8(c,z);
+
+            // left right pixels unpacked and added together
+            const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z),
+                                               _mm_unpacklo_epi8(r,z));
+            const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z),
+                                               _mm_unpackhi_epi8(r,z));
+
+            // top & bottom pixels unpacked and added together
+            const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z),
+                                               _mm_unpacklo_epi8(b,z));
+            const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z),
+                                               _mm_unpackhi_epi8(b,z));
+
+            // running sum of all pixels
+            msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo));
+
+            clo = _mm_slli_epi16(clo, 1);
+            chi = _mm_slli_epi16(chi, 1);
+            const __m128i sevtlo = _mm_subs_epi16(clo, tblo);
+            const __m128i sevthi = _mm_subs_epi16(chi, tbhi);
+            const __m128i sehtlo = _mm_subs_epi16(clo, lrlo);
+            const __m128i sehthi = _mm_subs_epi16(chi, lrhi);
+
+            clo = _mm_slli_epi16(clo, 1);
+            chi = _mm_slli_epi16(chi, 1);
+            const __m128i setlo = _mm_subs_epi16(clo,
+                                                 _mm_add_epi16(lrlo, tblo));
+            const __m128i sethi = _mm_subs_epi16(chi,
+                                                 _mm_add_epi16(lrhi, tbhi));
+
+            // Add to 16 bit running sum
+            se_16  = _mm_add_epi16(se_16,
+                                   _mm_max_epi16(setlo,
+                                                 _mm_subs_epi16(z, setlo)));
+            se_16  = _mm_add_epi16(se_16,
+                                   _mm_max_epi16(sethi,
+                                                 _mm_subs_epi16(z, sethi)));
+            sev_16 = _mm_add_epi16(sev_16,
+                                   _mm_max_epi16(sevtlo,
+                                                 _mm_subs_epi16(z, sevtlo)));
+            sev_16 = _mm_add_epi16(sev_16,
+                                   _mm_max_epi16(sevthi,
+                                                 _mm_subs_epi16(z, sevthi)));
+            seh_16 = _mm_add_epi16(seh_16,
+                                   _mm_max_epi16(sehtlo,
+                                                 _mm_subs_epi16(z, sehtlo)));
+            seh_16 = _mm_add_epi16(seh_16,
+                                   _mm_max_epi16(sehthi,
+                                                 _mm_subs_epi16(z, sehthi)));
+        }
+
+        // Add to 32 bit running sum as to not roll over.
+        se_32  = _mm_add_epi32(se_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(se_16,z),
+                                             _mm_unpacklo_epi16(se_16,z)));
+        sev_32 = _mm_add_epi32(sev_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(sev_16,z),
+                                             _mm_unpacklo_epi16(sev_16,z)));
+        seh_32 = _mm_add_epi32(seh_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(seh_16,z),
+                                             _mm_unpacklo_epi16(seh_16,z)));
+        msa_32 = _mm_add_epi32(msa_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(msa_16,z),
+                                             _mm_unpacklo_epi16(msa_16,z)));
+
+        imgBuf += _width * _skipNum;
+    }
+
+    WebRtc_Word64 se_64[2];
+    WebRtc_Word64 sev_64[2];
+    WebRtc_Word64 seh_64[2];
+    WebRtc_Word64 msa_64[2];
+
+    // Widen the 32 bit lane sums to 64 bits and store them.  The arrays are
+    // not guaranteed 16-byte aligned, so use unaligned stores.
+    _mm_storeu_si128 ((__m128i*)se_64,
+                      _mm_add_epi64(_mm_unpackhi_epi32(se_32,z),
+                                    _mm_unpacklo_epi32(se_32,z)));
+    _mm_storeu_si128 ((__m128i*)sev_64,
+                      _mm_add_epi64(_mm_unpackhi_epi32(sev_32,z),
+                                    _mm_unpacklo_epi32(sev_32,z)));
+    _mm_storeu_si128 ((__m128i*)seh_64,
+                      _mm_add_epi64(_mm_unpackhi_epi32(seh_32,z),
+                                    _mm_unpacklo_epi32(seh_32,z)));
+    _mm_storeu_si128 ((__m128i*)msa_64,
+                      _mm_add_epi64(_mm_unpackhi_epi32(msa_32,z),
+                                    _mm_unpacklo_epi32(msa_32,z)));
+
+    const WebRtc_UWord32 spatialErrSum  = se_64[0] + se_64[1];
+    const WebRtc_UWord32 spatialErrVSum = sev_64[0] + sev_64[1];
+    const WebRtc_UWord32 spatialErrHSum = seh_64[0] + seh_64[1];
+    const WebRtc_UWord32 pixelMSA = msa_64[0] + msa_64[1];
+
+    // normalize over all pixels
+    const float spatialErr  = (float)(spatialErrSum >> 2);
+    const float spatialErrH = (float)(spatialErrHSum >> 1);
+    const float spatialErrV = (float)(spatialErrVSum >> 1);
+    const float norm = (float)pixelMSA;
+
+    // 2X2:
+    _spatialPredErr = spatialErr / norm;
+
+    // 1X2:
+    _spatialPredErrH = spatialErrH / norm;
+
+    // 2X1:
+    _spatialPredErrV = spatialErrV / norm;
+
+    return VPM_OK;
+}
+
+}  // namespace webrtc
--- a/src/modules/video_processing/main/source/frame_preprocessor.cc
+++ b/src/modules/video_processing/main/source/frame_preprocessor.cc
@@ -22,7 +22,7 @@ _resampledFrame(),
 _enableCA(false)
 {
    _spatialResampler = new VPMSimpleSpatialResampler();
-    _ca = new VPMContentAnalysis();
+    _ca = new VPMContentAnalysis(true);
    _vd = new VPMVideoDecimator();
 }

--- a/src/modules/video_processing/main/source/video_processing.gypi
+++ b/src/modules/video_processing/main/source/video_processing.gypi
@@ -14,7 +14,7 @@
      'dependencies': [
        'webrtc_utility',
        '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
-         '<(webrtc_root)/common_video/common_video.gyp:webrtc_libyuv',
+        '<(webrtc_root)/common_video/common_video.gyp:webrtc_libyuv',
        '<(webrtc_root)/system_wrappers/source/system_wrappers.gyp:system_wrappers',
      ],
      'include_dirs': [
@@ -26,41 +26,57 @@
        ],
      },
      'sources': [
-        # interfaces
        '../interface/video_processing.h',
        '../interface/video_processing_defines.h',
-
-        # headers
-        'video_processing_impl.h',
+        'brighten.cc',
+        'brighten.h',
+        'brightness_detection.cc',
        'brightness_detection.h',
-	'brighten.h',
+        'color_enhancement.cc',
        'color_enhancement.h',
        'color_enhancement_private.h',
-        'content_analysis.h',
-        'deflickering.h',
-        'denoising.h',
-        'frame_preprocessor.h',
-        'spatial_resampler.h',
-        'video_decimator.h',
-
-        # sources
-        'video_processing_impl.cc',
-        'brightness_detection.cc',
-	'brighten.cc',
-        'color_enhancement.cc',
        'content_analysis.cc',
+        'content_analysis.h',
        'deflickering.cc',
+        'deflickering.h',
        'denoising.cc',
+        'denoising.h',
        'frame_preprocessor.cc',
+        'frame_preprocessor.h',
        'spatial_resampler.cc',
+        'spatial_resampler.h',
        'video_decimator.cc',
-      ], # source
+        'video_decimator.h',
+        'video_processing_impl.cc',
+        'video_processing_impl.h',
+      ],
+      'conditions': [
+        ['target_arch=="ia32" or target_arch=="x64"', {
+          'dependencies': [ 'video_processing_sse2', ],
+        }],
+      ],
+    },
+    {
+      'target_name': 'video_processing_sse2',  # SSE2 code, split out so -msse2 is scoped to it.
+      'type': '<(library)',
+      'sources': [
+        'content_analysis_sse2.cc',  # Only this file is built with -msse2.
+      ],
+      'include_dirs': [
+        '../interface',
+        '../../../interface',
+      ],
+      'conditions': [
+        ['os_posix==1 and OS!="mac"', {  # Non-Mac POSIX: pass the flag via cflags.
+          'cflags': [ '-msse2', ],
+        }],
+        ['OS=="mac"', {  # Mac: pass the same flag via xcode_settings.
+          'xcode_settings': {
+            'OTHER_CFLAGS': [ '-msse2', ],
+          },
+        }],
+      ],
    },
  ],
 }

-# Local Variables:
-# tab-width:2
-# indent-tabs-mode:nil
-# End:
-# vim: set expandtab tabstop=2 shiftwidth=2:
--- a/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc
+++ b/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc
@@ -17,7 +17,7 @@ namespace webrtc {
 TEST_F(VideoProcessingModuleTest, ContentAnalysis)
 {
    VPMContentAnalysis    _ca_c(false);
-    VPMContentAnalysis    _ca_sse;
+    VPMContentAnalysis    _ca_sse(true);
    VideoContentMetrics  *_cM_c, *_cM_SSE;

    _ca_c.Initialize(_width,_height);