Use -msse2 for SSE2 optimized code.

When targeting 32-bit Linux, we need to pass -msse2 to gcc to compile
SSE2 intrinsics. However, -msse2 also gives gcc license to automatically
generate SSE2 instructions wherever it pleases. This will crash our code
on processors without SSE2 support.

This change breaks the files with SSE2 intrinsics into separate targets,
such that we can limit the scope of -msse2 to where it's needed.

We no longer need to employ the WEBRTC_USE_SSE2 define; the build system
decides when SSE2 is supported and compiles the appropriate files.

TBR=bjornv@webrtc.org
TEST=audioproc (performance testing), audioproc_unittest, video_processing_unittests, build on Linux (targeting ia32/x64, with disable_sse2==0/1), Mac, Windows

Review URL: http://webrtc-codereview.appspot.com/352008

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1425 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
andrew@webrtc.org
2012-01-13 19:43:09 +00:00
parent ee3fe5b982
commit c8d012fb32
16 changed files with 382 additions and 335 deletions

View File

@@ -42,8 +42,3 @@ ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
MY_WEBRTC_COMMON_DEFS += \
'-DWEBRTC_ARCH_ARM_V7A'
endif
else ifeq ($(TARGET_ARCH),x86)
MY_WEBRTC_COMMON_DEFS += \
'-DWEBRTC_USE_SSE2'
endif

View File

@@ -20,9 +20,12 @@ LOCAL_SRC_FILES := \
aec_resampler.c \
aec_core.c \
aec_rdft.c \
ifeq ($(TARGET_ARCH),x86)
LOCAL_SRC_FILES += \
aec_core_sse2.c \
aec_rdft_sse2.c
endif
# Flags passed to both C and C++ files.
LOCAL_CFLAGS := \

View File

@@ -16,8 +16,8 @@
'aec_debug_dump%': 0,
},
'dependencies': [
'apm_util',
'<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
'apm_util'
],
'include_dirs': [
'interface',
@@ -32,18 +32,37 @@
'echo_cancellation.c',
'aec_core.h',
'aec_core.c',
'aec_core_sse2.c',
'aec_rdft.h',
'aec_rdft.c',
'aec_rdft_sse2.c',
'aec_resampler.h',
'aec_resampler.c',
],
'conditions': [
['target_arch=="ia32" or target_arch=="x64"', {
'dependencies': [ 'aec_sse2', ],
}],
['aec_debug_dump==1', {
'defines': [ 'WEBRTC_AEC_DEBUG_DUMP', ],
}],
],
},
{
# Separate library holding only the AEC sources that use SSE2 intrinsics, so
# that the -msse2 flag below applies to these files and not to the rest of
# the AEC code.
'target_name': 'aec_sse2',
'type': '<(library)',
'sources': [
'aec_core_sse2.c',
'aec_rdft_sse2.c',
],
'conditions': [
# POSIX toolchains other than Mac receive the flag through 'cflags'.
['os_posix==1 and OS!="mac"', {
'cflags': [ '-msse2', ],
}],
# Mac builds receive the same flag through the Xcode settings instead.
['OS=="mac"', {
'xcode_settings': {
'OTHER_CFLAGS': [ '-msse2', ],
},
}],
# No flag is added here for any other platform.
],
},
],
}

View File

@@ -21,6 +21,7 @@
#include <string.h>
#include "aec_rdft.h"
#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "delay_estimator_wrapper.h"
#include "ring_buffer.h"
#include "system_wrappers/interface/cpu_features_wrapper.h"
@@ -516,11 +517,13 @@ int WebRtcAec_InitAec(aec_t *aec, int sampFreq)
WebRtcAec_ScaleErrorSignal = ScaleErrorSignal;
WebRtcAec_FilterAdaptation = FilterAdaptation;
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress;
#if defined(WEBRTC_ARCH_X86_FAMILY)
if (WebRtc_GetCPUInfo(kSSE2)) {
#if defined(WEBRTC_USE_SSE2)
WebRtcAec_InitAec_SSE2();
#endif
}
#endif
aec_rdft_init();
return 0;

View File

@@ -15,9 +15,10 @@
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
#ifdef WEBRTC_AEC_DEBUG_DUMP
#include <stdio.h>
#endif
#include "signal_processing_library.h"
#include "typedefs.h"
#define FRAME_LEN 80

View File

@@ -12,13 +12,12 @@
* The core AEC algorithm, SSE2 version of speed-critical functions.
*/
#include "typedefs.h"
#include "aec_core.h"
#if defined(WEBRTC_USE_SSE2)
#include <emmintrin.h>
#include <math.h>
#include <string.h> // memset
#include "aec_core.h"
#include "aec_rdft.h"
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm)
@@ -414,4 +413,3 @@ void WebRtcAec_InitAec_SSE2(void) {
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
}
#endif // WEBRTC_USE_SSE2

View File

@@ -576,11 +576,11 @@ void aec_rdft_init(void) {
cftmdl_128 = cftmdl_128_C;
rftfsub_128 = rftfsub_128_C;
rftbsub_128 = rftbsub_128_C;
#if defined(WEBRTC_ARCH_X86_FAMILY)
if (WebRtc_GetCPUInfo(kSSE2)) {
#if defined(WEBRTC_USE_SSE2)
aec_rdft_init_sse2();
#endif
}
#endif
// init library constants.
makewt_32();
makect_32();

View File

@@ -8,13 +8,10 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "typedefs.h"
#if defined(WEBRTC_USE_SSE2)
#include <emmintrin.h>
#include "aec_rdft.h"
#include <emmintrin.h>
static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] =
{-1.f, 1.f, -1.f, 1.f};
@@ -428,4 +425,3 @@ void aec_rdft_init_sse2(void) {
rftbsub_128 = rftbsub_128_SSE2;
}
#endif // WEBRTC_USE_SS2

View File

@@ -22,6 +22,7 @@
#include "aec_core.h"
#include "aec_resampler.h"
#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "ring_buffer.h"
#include "typedefs.h"

View File

@@ -18,7 +18,6 @@ LOCAL_MODULE := libwebrtc_video_processing
LOCAL_MODULE_TAGS := optional
LOCAL_CPP_EXTENSION := .cc
LOCAL_SRC_FILES := \
video_processing_impl.cc \
brightness_detection.cc \
color_enhancement.cc \
content_analysis.cc \
@@ -27,6 +26,12 @@ LOCAL_SRC_FILES := \
frame_preprocessor.cc \
spatial_resampler.cc \
video_decimator.cc
video_processing_impl.cc \
ifeq ($(TARGET_ARCH),x86)
LOCAL_SRC_FILES += \
content_analysis_sse2.cc
endif
# Flags passed to both C and C++ files.
LOCAL_CFLAGS := \

View File

@@ -13,12 +13,10 @@
#include <math.h>
#include <stdlib.h>
#if defined(WEBRTC_USE_SSE2)
#include <emmintrin.h>
#endif
namespace webrtc {
VPMContentAnalysis::VPMContentAnalysis(bool RTCD):
VPMContentAnalysis::VPMContentAnalysis(bool runtime_cpu_detection):
_origFrame(NULL),
_prevFrame(NULL),
_width(0),
@@ -40,16 +38,16 @@ _cMetrics(NULL)
ComputeSpatialMetrics = &VPMContentAnalysis::ComputeSpatialMetrics_C;
TemporalDiffMetric = &VPMContentAnalysis::TemporalDiffMetric_C;
if (RTCD)
if (runtime_cpu_detection)
{
if(WebRtc_GetCPUInfo(kSSE2))
#if defined(WEBRTC_ARCH_X86_FAMILY)
if (WebRtc_GetCPUInfo(kSSE2))
{
#if defined(WEBRTC_USE_SSE2)
ComputeSpatialMetrics =
&VPMContentAnalysis::ComputeSpatialMetrics_SSE2;
TemporalDiffMetric = &VPMContentAnalysis::TemporalDiffMetric_SSE2;
#endif
}
#endif
}
Release();
@@ -249,110 +247,6 @@ VPMContentAnalysis::TemporalDiffMetric_C()
}
#if defined(WEBRTC_USE_SSE2)
WebRtc_Word32
VPMContentAnalysis::TemporalDiffMetric_SSE2()
{
WebRtc_UWord32 numPixels = 0; // counter for # of pixels
const WebRtc_UWord8* imgBufO = _origFrame + _border*_width + _border;
const WebRtc_UWord8* imgBufP = _prevFrame + _border*_width + _border;
const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
__m128i sad_64 = _mm_setzero_si128();
__m128i sum_64 = _mm_setzero_si128();
__m128i sqsum_64 = _mm_setzero_si128();
const __m128i z = _mm_setzero_si128();
for(WebRtc_UWord16 i = 0; i < (_height - 2*_border); i += _skipNum)
{
__m128i sqsum_32 = _mm_setzero_si128();
const WebRtc_UWord8 *lineO = imgBufO;
const WebRtc_UWord8 *lineP = imgBufP;
// Work on 16 pixels at a time. For HD content with a width of 1920
// this loop will run ~67 times (depending on border). Maximum for
// abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit
// results which are then accumulated. There is no chance of
// rollover for these two accumulators.
// o*o will have a maximum of 255*255 = 65025. This will roll over
// a 16 bit accumulator as 67*65025 > 65535, but will fit in a
// 32 bit accumulator.
for(WebRtc_UWord16 j = 0; j < width_end - _border; j += 16)
{
const __m128i o = _mm_loadu_si128((__m128i*)(lineO));
const __m128i p = _mm_loadu_si128((__m128i*)(lineP));
lineO += 16;
lineP += 16;
// abs pixel difference between frames
sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p));
// sum of all pixels in frame
sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z));
// squared sum of all pixels in frame
const __m128i olo = _mm_unpacklo_epi8(o,z);
const __m128i ohi = _mm_unpackhi_epi8(o,z);
const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo);
const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi);
sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo);
sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi);
}
// Add to 64 bit running sum as to not roll over.
sqsum_64 = _mm_add_epi64(sqsum_64,
_mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z),
_mm_unpacklo_epi32(sqsum_32,z)));
imgBufO += _width * _skipNum;
imgBufP += _width * _skipNum;
numPixels += (width_end - _border);
}
WebRtc_Word64 sad_final_64[2];
WebRtc_Word64 sum_final_64[2];
WebRtc_Word64 sqsum_final_64[2];
// bring sums out of vector registers and into integer register
// domain, summing them along the way
_mm_store_si128 ((__m128i*)sad_final_64, sad_64);
_mm_store_si128 ((__m128i*)sum_final_64, sum_64);
_mm_store_si128 ((__m128i*)sqsum_final_64, sqsum_64);
const WebRtc_UWord32 pixelSum = sum_final_64[0] + sum_final_64[1];
const WebRtc_UWord64 pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1];
const WebRtc_UWord32 tempDiffSum = sad_final_64[0] + sad_final_64[1];
// default
_motionMagnitudeNZ = 0.0f;
if (tempDiffSum == 0)
{
return VPM_OK;
}
// normalize over all pixels
const float tempDiffAvg = (float)tempDiffSum / (float)(numPixels);
const float pixelSumAvg = (float)pixelSum / (float)(numPixels);
const float pixelSqSumAvg = (float)pixelSqSum / (float)(numPixels);
float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg);
if (contrast > 0.0)
{
contrast = sqrt(contrast);
_motionMagnitudeNZ = tempDiffAvg/contrast;
}
return VPM_OK;
}
#endif
// Compute spatial metrics:
// To reduce complexity, we compute the metric for a reduced set of points.
// The spatial metrics are rough estimates of the prediction error cost for
@@ -427,172 +321,6 @@ VPMContentAnalysis::ComputeSpatialMetrics_C()
return VPM_OK;
}
#if defined(WEBRTC_USE_SSE2)
WebRtc_Word32
VPMContentAnalysis::ComputeSpatialMetrics_SSE2()
{
const WebRtc_UWord8* imgBuf = _origFrame + _border*_width;
const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
__m128i se_32 = _mm_setzero_si128();
__m128i sev_32 = _mm_setzero_si128();
__m128i seh_32 = _mm_setzero_si128();
__m128i msa_32 = _mm_setzero_si128();
const __m128i z = _mm_setzero_si128();
// Error is accumulated as a 32 bit value. Looking at HD content with a
// height of 1080 lines, or about 67 macro blocks. If the 16 bit row
// value is maxed out at 65529 for every row, 65529*1080 = 70777800, which
// will not roll over a 32 bit accumulator.
// _skipNum is also used to reduce the number of rows
for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum)
{
__m128i se_16 = _mm_setzero_si128();
__m128i sev_16 = _mm_setzero_si128();
__m128i seh_16 = _mm_setzero_si128();
__m128i msa_16 = _mm_setzero_si128();
// Row error is accumulated as a 16 bit value. There are 8
// accumulators. Max value of a 16 bit number is 65529. Looking
// at HD content, 1080p, has a width of 1920, 120 macro blocks.
// A mb at a time is processed at a time. Absolute max error at
// a point would be abs(0-255+255+255+255) which equals 1020.
// 120*1020 = 122400. The probability of hitting this is quite low
// on well behaved content. A specially crafted image could roll over.
// _border could also be adjusted to concentrate on just the center of
// the images for an HD capture in order to reduce the possiblity of
// rollover.
const WebRtc_UWord8 *lineTop = imgBuf - _width + _border;
const WebRtc_UWord8 *lineCen = imgBuf + _border;
const WebRtc_UWord8 *lineBot = imgBuf + _width + _border;
for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16)
{
const __m128i t = _mm_loadu_si128((__m128i*)(lineTop));
const __m128i l = _mm_loadu_si128((__m128i*)(lineCen - 1));
const __m128i c = _mm_loadu_si128((__m128i*)(lineCen));
const __m128i r = _mm_loadu_si128((__m128i*)(lineCen + 1));
const __m128i b = _mm_loadu_si128((__m128i*)(lineBot));
lineTop += 16;
lineCen += 16;
lineBot += 16;
// center pixel unpacked
__m128i clo = _mm_unpacklo_epi8(c,z);
__m128i chi = _mm_unpackhi_epi8(c,z);
// left right pixels unpacked and added together
const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z),
_mm_unpacklo_epi8(r,z));
const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z),
_mm_unpackhi_epi8(r,z));
// top & bottom pixels unpacked and added together
const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z),
_mm_unpacklo_epi8(b,z));
const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z),
_mm_unpackhi_epi8(b,z));
// running sum of all pixels
msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo));
clo = _mm_slli_epi16(clo, 1);
chi = _mm_slli_epi16(chi, 1);
const __m128i sevtlo = _mm_subs_epi16(clo, tblo);
const __m128i sevthi = _mm_subs_epi16(chi, tbhi);
const __m128i sehtlo = _mm_subs_epi16(clo, lrlo);
const __m128i sehthi = _mm_subs_epi16(chi, lrhi);
clo = _mm_slli_epi16(clo, 1);
chi = _mm_slli_epi16(chi, 1);
const __m128i setlo = _mm_subs_epi16(clo,
_mm_add_epi16(lrlo, tblo));
const __m128i sethi = _mm_subs_epi16(chi,
_mm_add_epi16(lrhi, tbhi));
// Add to 16 bit running sum
se_16 = _mm_add_epi16(se_16,
_mm_max_epi16(setlo,
_mm_subs_epi16(z, setlo)));
se_16 = _mm_add_epi16(se_16,
_mm_max_epi16(sethi,
_mm_subs_epi16(z, sethi)));
sev_16 = _mm_add_epi16(sev_16,
_mm_max_epi16(sevtlo,
_mm_subs_epi16(z, sevtlo)));
sev_16 = _mm_add_epi16(sev_16,
_mm_max_epi16(sevthi,
_mm_subs_epi16(z, sevthi)));
seh_16 = _mm_add_epi16(seh_16,
_mm_max_epi16(sehtlo,
_mm_subs_epi16(z, sehtlo)));
seh_16 = _mm_add_epi16(seh_16,
_mm_max_epi16(sehthi,
_mm_subs_epi16(z, sehthi)));
}
// Add to 32 bit running sum as to not roll over.
se_32 = _mm_add_epi32(se_32,
_mm_add_epi32(_mm_unpackhi_epi16(se_16,z),
_mm_unpacklo_epi16(se_16,z)));
sev_32 = _mm_add_epi32(sev_32,
_mm_add_epi32(_mm_unpackhi_epi16(sev_16,z),
_mm_unpacklo_epi16(sev_16,z)));
seh_32 = _mm_add_epi32(seh_32,
_mm_add_epi32(_mm_unpackhi_epi16(seh_16,z),
_mm_unpacklo_epi16(seh_16,z)));
msa_32 = _mm_add_epi32(msa_32,
_mm_add_epi32(_mm_unpackhi_epi16(msa_16,z),
_mm_unpacklo_epi16(msa_16,z)));
imgBuf += _width * _skipNum;
}
WebRtc_Word64 se_64[2];
WebRtc_Word64 sev_64[2];
WebRtc_Word64 seh_64[2];
WebRtc_Word64 msa_64[2];
// bring sums out of vector registers and into integer register
// domain, summing them along the way
_mm_store_si128 ((__m128i*)se_64,
_mm_add_epi64(_mm_unpackhi_epi32(se_32,z),
_mm_unpacklo_epi32(se_32,z)));
_mm_store_si128 ((__m128i*)sev_64,
_mm_add_epi64(_mm_unpackhi_epi32(sev_32,z),
_mm_unpacklo_epi32(sev_32,z)));
_mm_store_si128 ((__m128i*)seh_64,
_mm_add_epi64(_mm_unpackhi_epi32(seh_32,z),
_mm_unpacklo_epi32(seh_32,z)));
_mm_store_si128 ((__m128i*)msa_64,
_mm_add_epi64(_mm_unpackhi_epi32(msa_32,z),
_mm_unpacklo_epi32(msa_32,z)));
const WebRtc_UWord32 spatialErrSum = se_64[0] + se_64[1];
const WebRtc_UWord32 spatialErrVSum = sev_64[0] + sev_64[1];
const WebRtc_UWord32 spatialErrHSum = seh_64[0] + seh_64[1];
const WebRtc_UWord32 pixelMSA = msa_64[0] + msa_64[1];
// normalize over all pixels
const float spatialErr = (float)(spatialErrSum >> 2);
const float spatialErrH = (float)(spatialErrHSum >> 1);
const float spatialErrV = (float)(spatialErrVSum >> 1);
const float norm = (float)pixelMSA;
// 2X2:
_spatialPredErr = spatialErr / norm;
// 1X2:
_spatialPredErrH = spatialErrH / norm;
// 2X1:
_spatialPredErrV = spatialErrV / norm;
return VPM_OK;
}
#endif // #if defined(WEBRTC_USE_SSE2)
VideoContentMetrics*
VPMContentAnalysis::ContentMetrics()
{

View File

@@ -8,10 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* content_analysis.h
*/
#ifndef VPM_CONTENT_ANALYSIS_H
#define VPM_CONTENT_ANALYSIS_H
@@ -24,7 +20,9 @@ namespace webrtc {
class VPMContentAnalysis
{
public:
VPMContentAnalysis(bool RTCD = true);
// When |runtime_cpu_detection| is true, runtime selection of an optimized
// code path is allowed.
VPMContentAnalysis(bool runtime_cpu_detection);
~VPMContentAnalysis();
// Initialize ContentAnalysis - should be called prior to
@@ -62,7 +60,7 @@ private:
ComputeSpatialMetricsFunc ComputeSpatialMetrics;
WebRtc_Word32 ComputeSpatialMetrics_C();
#if defined(WEBRTC_USE_SSE2)
#if defined(WEBRTC_ARCH_X86_FAMILY)
WebRtc_Word32 ComputeSpatialMetrics_SSE2();
WebRtc_Word32 TemporalDiffMetric_SSE2();
#endif

View File

@@ -0,0 +1,284 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "content_analysis.h"
#include <emmintrin.h>
#include <math.h>
namespace webrtc {
// SSE2 implementation of the temporal-difference (motion) metric.
//
// Compares _origFrame against _prevFrame inside the region that remains
// after removing _border pixels on every side. Every _skipNum-th row is
// visited, 16 pixels per step; the inner width is rounded down to a multiple
// of 16, so trailing columns that do not fill a 16-pixel group are skipped.
//
// On return _motionMagnitudeNZ holds the mean absolute frame difference
// divided by the standard deviation of the current frame's sampled pixels.
// It is left at 0 when the sampled frames are identical or when the computed
// variance is not positive.
//
// Always returns VPM_OK.
WebRtc_Word32
VPMContentAnalysis::TemporalDiffMetric_SSE2()
{
    WebRtc_UWord32 numPixels = 0;       // counter for # of pixels

    const WebRtc_UWord8* imgBufO = _origFrame + _border*_width + _border;
    const WebRtc_UWord8* imgBufP = _prevFrame + _border*_width + _border;

    // Inner width rounded down to a multiple of 16, offset by the border.
    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;

    __m128i sad_64   = _mm_setzero_si128();
    __m128i sum_64   = _mm_setzero_si128();
    __m128i sqsum_64 = _mm_setzero_si128();
    const __m128i z  = _mm_setzero_si128();

    for(WebRtc_UWord16 i = 0; i < (_height - 2*_border); i += _skipNum)
    {
        __m128i sqsum_32 = _mm_setzero_si128();

        const WebRtc_UWord8 *lineO = imgBufO;
        const WebRtc_UWord8 *lineP = imgBufP;

        // Work on 16 pixels at a time. For HD content with a width of 1920
        // this loop will run ~67 times (depending on border). Maximum for
        // abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit
        // results which are then accumulated. There is no chance of
        // rollover for these two accumulators.
        // o*o will have a maximum of 255*255 = 65025. This will roll over
        // a 16 bit accumulator as 67*65025 > 65535, but will fit in a
        // 32 bit accumulator.
        for(WebRtc_UWord16 j = 0; j < width_end - _border; j += 16)
        {
            const __m128i o =
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(lineO));
            const __m128i p =
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(lineP));

            lineO += 16;
            lineP += 16;

            // abs pixel difference between frames
            sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p));

            // sum of all pixels in frame
            sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z));

            // squared sum of all pixels in frame
            const __m128i olo = _mm_unpacklo_epi8(o,z);
            const __m128i ohi = _mm_unpackhi_epi8(o,z);

            const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo);
            const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi);

            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo);
            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi);
        }

        // Add to 64 bit running sum as to not roll over.
        sqsum_64 = _mm_add_epi64(sqsum_64,
                                 _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z),
                                               _mm_unpacklo_epi32(sqsum_32,z)));

        imgBufO += _width * _skipNum;
        imgBufP += _width * _skipNum;
        numPixels += (width_end - _border);
    }

    WebRtc_Word64 sad_final_64[2];
    WebRtc_Word64 sum_final_64[2];
    WebRtc_Word64 sqsum_final_64[2];

    // Bring sums out of vector registers and into integer register
    // domain, summing them along the way.
    // The destination arrays are ordinary locals with no 16-byte alignment
    // guarantee, so the unaligned store is required; the aligned
    // _mm_store_si128 faults on a misaligned address.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(sad_final_64), sad_64);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(sum_final_64), sum_64);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(sqsum_final_64), sqsum_64);

    const WebRtc_UWord32 pixelSum = sum_final_64[0] + sum_final_64[1];
    const WebRtc_UWord64 pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1];
    const WebRtc_UWord32 tempDiffSum = sad_final_64[0] + sad_final_64[1];

    // default
    _motionMagnitudeNZ = 0.0f;

    // Identical frames (or an empty sampled region, which also yields a zero
    // sum): keep the default and avoid dividing by numPixels.
    if (tempDiffSum == 0)
    {
        return VPM_OK;
    }

    // normalize over all pixels
    const float tempDiffAvg = (float)tempDiffSum / (float)(numPixels);
    const float pixelSumAvg = (float)pixelSum / (float)(numPixels);
    const float pixelSqSumAvg = (float)pixelSqSum / (float)(numPixels);

    // Variance of the sampled pixels: E[x^2] - (E[x])^2.
    float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg);

    if (contrast > 0.0)
    {
        contrast = sqrt(contrast);
        _motionMagnitudeNZ = tempDiffAvg/contrast;
    }

    return VPM_OK;
}
// SSE2 implementation of the spatial prediction-error metrics.
//
// For each sampled pixel c with left/right neighbours l, r and top/bottom
// neighbours t, b, the function accumulates
//   |4c - (l + r + t + b)|   (2x2 error),
//   |2c - (l + r)|           (horizontal error),
//   |2c - (t + b)|           (vertical error),
// together with the sum of the centre pixels. Rows are visited in steps of
// _skipNum inside the area bounded by _border, 16 pixels per step; the inner
// width is rounded down to a multiple of 16.
//
// Results are written to _spatialPredErr, _spatialPredErrH and
// _spatialPredErrV, each being the corresponding error sum (scaled down by
// 4, 2 and 2 respectively) divided by the sum of the centre pixels.
//
// Always returns VPM_OK.
WebRtc_Word32
VPMContentAnalysis::ComputeSpatialMetrics_SSE2()
{
    const WebRtc_UWord8* imgBuf = _origFrame + _border*_width;

    // Inner width rounded down to a multiple of 16, offset by the border.
    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;

    __m128i se_32  = _mm_setzero_si128();
    __m128i sev_32 = _mm_setzero_si128();
    __m128i seh_32 = _mm_setzero_si128();
    __m128i msa_32 = _mm_setzero_si128();
    const __m128i z = _mm_setzero_si128();

    // Error is accumulated as a 32 bit value. Looking at HD content with a
    // height of 1080 lines, or about 67 macro blocks. If the 16 bit row
    // value is maxed out at 65535 for every row, 65535*1080 = 70777800, which
    // will not roll over a 32 bit accumulator.
    // _skipNum is also used to reduce the number of rows
    for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum)
    {
        __m128i se_16  = _mm_setzero_si128();
        __m128i sev_16 = _mm_setzero_si128();
        __m128i seh_16 = _mm_setzero_si128();
        __m128i msa_16 = _mm_setzero_si128();

        // Row error is accumulated as a 16 bit value. There are 8
        // accumulators. Max value of a 16 bit number is 65535. Looking
        // at HD content, 1080p, has a width of 1920, 120 macro blocks.
        // One mb is processed at a time. Absolute max error at
        // a point would be abs(0-255+255+255+255) which equals 1020.
        // 120*1020 = 122400. The probability of hitting this is quite low
        // on well behaved content. A specially crafted image could roll over.
        // _border could also be adjusted to concentrate on just the center of
        // the images for an HD capture in order to reduce the possibility of
        // rollover.
        const WebRtc_UWord8 *lineTop = imgBuf - _width + _border;
        const WebRtc_UWord8 *lineCen = imgBuf + _border;
        const WebRtc_UWord8 *lineBot = imgBuf + _width + _border;

        for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16)
        {
            const __m128i t =
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(lineTop));
            const __m128i l =
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(lineCen - 1));
            const __m128i c =
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(lineCen));
            const __m128i r =
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(lineCen + 1));
            const __m128i b =
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(lineBot));

            lineTop += 16;
            lineCen += 16;
            lineBot += 16;

            // center pixel unpacked
            __m128i clo = _mm_unpacklo_epi8(c,z);
            __m128i chi = _mm_unpackhi_epi8(c,z);

            // left right pixels unpacked and added together
            const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z),
                                               _mm_unpacklo_epi8(r,z));
            const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z),
                                               _mm_unpackhi_epi8(r,z));

            // top & bottom pixels unpacked and added together
            const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z),
                                               _mm_unpacklo_epi8(b,z));
            const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z),
                                               _mm_unpackhi_epi8(b,z));

            // running sum of all pixels
            msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo));

            // 2*c, used for the vertical and horizontal errors
            clo = _mm_slli_epi16(clo, 1);
            chi = _mm_slli_epi16(chi, 1);
            const __m128i sevtlo = _mm_subs_epi16(clo, tblo);
            const __m128i sevthi = _mm_subs_epi16(chi, tbhi);
            const __m128i sehtlo = _mm_subs_epi16(clo, lrlo);
            const __m128i sehthi = _mm_subs_epi16(chi, lrhi);

            // 4*c, used for the 2x2 error
            clo = _mm_slli_epi16(clo, 1);
            chi = _mm_slli_epi16(chi, 1);
            const __m128i setlo = _mm_subs_epi16(clo,
                                                 _mm_add_epi16(lrlo, tblo));
            const __m128i sethi = _mm_subs_epi16(chi,
                                                 _mm_add_epi16(lrhi, tbhi));

            // Add to 16 bit running sum. max(x, 0 - x) yields abs(x).
            se_16  = _mm_add_epi16(se_16,
                                   _mm_max_epi16(setlo,
                                                 _mm_subs_epi16(z, setlo)));
            se_16  = _mm_add_epi16(se_16,
                                   _mm_max_epi16(sethi,
                                                 _mm_subs_epi16(z, sethi)));
            sev_16 = _mm_add_epi16(sev_16,
                                   _mm_max_epi16(sevtlo,
                                                 _mm_subs_epi16(z, sevtlo)));
            sev_16 = _mm_add_epi16(sev_16,
                                   _mm_max_epi16(sevthi,
                                                 _mm_subs_epi16(z, sevthi)));
            seh_16 = _mm_add_epi16(seh_16,
                                   _mm_max_epi16(sehtlo,
                                                 _mm_subs_epi16(z, sehtlo)));
            seh_16 = _mm_add_epi16(seh_16,
                                   _mm_max_epi16(sehthi,
                                                 _mm_subs_epi16(z, sehthi)));
        }

        // Add to 32 bit running sum as to not roll over.
        se_32  = _mm_add_epi32(se_32,
                               _mm_add_epi32(_mm_unpackhi_epi16(se_16,z),
                                             _mm_unpacklo_epi16(se_16,z)));
        sev_32 = _mm_add_epi32(sev_32,
                               _mm_add_epi32(_mm_unpackhi_epi16(sev_16,z),
                                             _mm_unpacklo_epi16(sev_16,z)));
        seh_32 = _mm_add_epi32(seh_32,
                               _mm_add_epi32(_mm_unpackhi_epi16(seh_16,z),
                                             _mm_unpacklo_epi16(seh_16,z)));
        msa_32 = _mm_add_epi32(msa_32,
                               _mm_add_epi32(_mm_unpackhi_epi16(msa_16,z),
                                             _mm_unpacklo_epi16(msa_16,z)));

        imgBuf += _width * _skipNum;
    }

    WebRtc_Word64 se_64[2];
    WebRtc_Word64 sev_64[2];
    WebRtc_Word64 seh_64[2];
    WebRtc_Word64 msa_64[2];

    // Bring sums out of vector registers and into integer register
    // domain, summing them along the way.
    // The destination arrays are ordinary locals with no 16-byte alignment
    // guarantee, so the unaligned store is required; the aligned
    // _mm_store_si128 faults on a misaligned address.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(se_64),
                     _mm_add_epi64(_mm_unpackhi_epi32(se_32,z),
                                   _mm_unpacklo_epi32(se_32,z)));
    _mm_storeu_si128(reinterpret_cast<__m128i*>(sev_64),
                     _mm_add_epi64(_mm_unpackhi_epi32(sev_32,z),
                                   _mm_unpacklo_epi32(sev_32,z)));
    _mm_storeu_si128(reinterpret_cast<__m128i*>(seh_64),
                     _mm_add_epi64(_mm_unpackhi_epi32(seh_32,z),
                                   _mm_unpacklo_epi32(seh_32,z)));
    _mm_storeu_si128(reinterpret_cast<__m128i*>(msa_64),
                     _mm_add_epi64(_mm_unpackhi_epi32(msa_32,z),
                                   _mm_unpacklo_epi32(msa_32,z)));

    const WebRtc_UWord32 spatialErrSum  = se_64[0] + se_64[1];
    const WebRtc_UWord32 spatialErrVSum = sev_64[0] + sev_64[1];
    const WebRtc_UWord32 spatialErrHSum = seh_64[0] + seh_64[1];
    const WebRtc_UWord32 pixelMSA = msa_64[0] + msa_64[1];

    // normalize over all pixels
    const float spatialErr  = (float)(spatialErrSum >> 2);
    const float spatialErrH = (float)(spatialErrHSum >> 1);
    const float spatialErrV = (float)(spatialErrVSum >> 1);
    const float norm = (float)pixelMSA;

    // 2X2:
    _spatialPredErr = spatialErr / norm;

    // 1X2:
    _spatialPredErrH = spatialErrH / norm;

    // 2X1:
    _spatialPredErrV = spatialErrV / norm;

    return VPM_OK;
}
} // namespace webrtc

View File

@@ -22,7 +22,7 @@ _resampledFrame(),
_enableCA(false)
{
_spatialResampler = new VPMSimpleSpatialResampler();
_ca = new VPMContentAnalysis();
_ca = new VPMContentAnalysis(true);
_vd = new VPMVideoDecimator();
}

View File

@@ -14,7 +14,7 @@
'dependencies': [
'webrtc_utility',
'<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
'<(webrtc_root)/common_video/common_video.gyp:webrtc_libyuv',
'<(webrtc_root)/common_video/common_video.gyp:webrtc_libyuv',
'<(webrtc_root)/system_wrappers/source/system_wrappers.gyp:system_wrappers',
],
'include_dirs': [
@@ -26,41 +26,57 @@
],
},
'sources': [
# interfaces
'../interface/video_processing.h',
'../interface/video_processing_defines.h',
# headers
'video_processing_impl.h',
'brighten.cc',
'brighten.h',
'brightness_detection.cc',
'brightness_detection.h',
'brighten.h',
'color_enhancement.cc',
'color_enhancement.h',
'color_enhancement_private.h',
'content_analysis.h',
'deflickering.h',
'denoising.h',
'frame_preprocessor.h',
'spatial_resampler.h',
'video_decimator.h',
# sources
'video_processing_impl.cc',
'brightness_detection.cc',
'brighten.cc',
'color_enhancement.cc',
'content_analysis.cc',
'content_analysis.h',
'deflickering.cc',
'deflickering.h',
'denoising.cc',
'denoising.h',
'frame_preprocessor.cc',
'frame_preprocessor.h',
'spatial_resampler.cc',
'spatial_resampler.h',
'video_decimator.cc',
], # source
'video_decimator.h',
'video_processing_impl.cc',
'video_processing_impl.h',
],
'conditions': [
['target_arch=="ia32" or target_arch=="x64"', {
'dependencies': [ 'video_processing_sse2', ],
}],
],
},
{
# Separate library holding only the video processing source that uses SSE2
# intrinsics, so that the -msse2 flag below applies to this file and not to
# the rest of the module.
'target_name': 'video_processing_sse2',
'type': '<(library)',
'sources': [
'content_analysis_sse2.cc',
],
'include_dirs': [
'../interface',
'../../../interface',
],
'conditions': [
# POSIX toolchains other than Mac receive the flag through 'cflags'.
['os_posix==1 and OS!="mac"', {
'cflags': [ '-msse2', ],
}],
# Mac builds receive the same flag through the Xcode settings instead.
['OS=="mac"', {
'xcode_settings': {
'OTHER_CFLAGS': [ '-msse2', ],
},
}],
# No flag is added here for any other platform.
],
},
],
}
# Local Variables:
# tab-width:2
# indent-tabs-mode:nil
# End:
# vim: set expandtab tabstop=2 shiftwidth=2:

View File

@@ -17,7 +17,7 @@ namespace webrtc {
TEST_F(VideoProcessingModuleTest, ContentAnalysis)
{
VPMContentAnalysis _ca_c(false);
VPMContentAnalysis _ca_sse;
VPMContentAnalysis _ca_sse(true);
VideoContentMetrics *_cM_c, *_cM_SSE;
_ca_c.Initialize(_width,_height);