From 9139fddf0ecb7d234d6b0bd6c65a769f3f9fd9a6 Mon Sep 17 00:00:00 2001 From: "frkoenig@google.com" Date: Fri, 19 Aug 2011 22:33:08 +0000 Subject: [PATCH] Optimize ssim_8x8 for SSE2. Code was pulled from libvpx assembly and converted to intrinsics. Review URL: http://webrtc-codereview.appspot.com/122001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@409 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../video_coding/main/test/test_util.cc | 145 ++++++++++++++---- 1 file changed, 112 insertions(+), 33 deletions(-) diff --git a/src/modules/video_coding/main/test/test_util.cc b/src/modules/video_coding/main/test/test_util.cc index b7da9f5c7..3b34d873a 100644 --- a/src/modules/video_coding/main/test/test_util.cc +++ b/src/modules/video_coding/main/test/test_util.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "system_wrappers/interface/cpu_features_wrapper.h" #include "test_util.h" #include "test_macros.h" #include "rtp_dump.h" @@ -440,34 +441,6 @@ PSNRfromFiles(const WebRtc_Word8 *refFileName, const WebRtc_Word8 *testFileName, return 0; } -void -ssim_parms_8x8_c -( - unsigned char *s, - int sp, - unsigned char *r, - int rp, - unsigned long *sum_s, - unsigned long *sum_r, - unsigned long *sum_sq_s, - unsigned long *sum_sq_r, - unsigned long *sum_sxr -) -{ - int i,j; - for(i=0;i<8;i++,s+=sp,r+=rp) - { - for(j=0;j<8;j++) - { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] * s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; - } - } -} - static double similarity ( @@ -499,13 +472,110 @@ similarity } static double -ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp) +ssim_8x8_c +( + unsigned char *s, + int sp, + unsigned char *r, + int rp +) { - unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; - ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + unsigned long sum_s = 0; + unsigned long sum_r = 0; + unsigned long sum_sq_s = 0; + unsigned long sum_sq_r = 0; + unsigned long sum_sxr = 0; + + int i,j; + for(i=0;i<8;i++,s+=sp,r+=rp) + { + for(j=0;j<8;j++) + { + sum_s += s[j]; + sum_r += r[j]; + sum_sq_s += s[j] * s[j]; + sum_sq_r += r[j] * r[j]; + sum_sxr += s[j] * r[j]; + } + } + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); } +#if defined(WEBRTC_USE_SSE2) +#include +static double +ssim_8x8_sse2 +( + unsigned char *s, + int sp, + unsigned char *r, + int rp +) +{ + int i; + const __m128i z = _mm_setzero_si128(); + __m128i sum_s_16 = _mm_setzero_si128(); + __m128i sum_r_16 = _mm_setzero_si128(); + __m128i sum_sq_s_32 = _mm_setzero_si128(); + __m128i sum_sq_r_32 = _mm_setzero_si128(); + __m128i sum_sxr_32 = _mm_setzero_si128(); + + for(i=0;i<8;i++,s+=sp,r+=rp) + { + const __m128i s_8 = _mm_loadl_epi64((__m128i*)(s)); + const __m128i r_8 = _mm_loadl_epi64((__m128i*)(r)); + + const __m128i s_16 = _mm_unpacklo_epi8(s_8,z); + const __m128i r_16 = _mm_unpacklo_epi8(r_8,z); + + sum_s_16 = _mm_adds_epu16(sum_s_16, s_16); + sum_r_16 = _mm_adds_epu16(sum_r_16, r_16); + const __m128i sq_s_32 = _mm_madd_epi16(s_16, s_16); + sum_sq_s_32 = _mm_add_epi32(sum_sq_s_32, sq_s_32); + const __m128i sq_r_32 = _mm_madd_epi16(r_16, r_16); + sum_sq_r_32 = _mm_add_epi32(sum_sq_r_32, sq_r_32); + const __m128i sxr_32 = _mm_madd_epi16(s_16, r_16); + sum_sxr_32 = _mm_add_epi32(sum_sxr_32, sxr_32); + } + + const __m128i sum_s_32 = _mm_add_epi32(_mm_unpackhi_epi16(sum_s_16, z), + _mm_unpacklo_epi16(sum_s_16, z)); + const __m128i sum_r_32 = _mm_add_epi32(_mm_unpackhi_epi16(sum_r_16, z), + _mm_unpacklo_epi16(sum_r_16, z)); + + unsigned long sum_s_64[2]; + unsigned long sum_r_64[2]; + unsigned long sum_sq_s_64[2]; + unsigned long sum_sq_r_64[2]; + unsigned long sum_sxr_64[2]; + + _mm_store_si128 ((__m128i*)sum_s_64, + _mm_add_epi64(_mm_unpackhi_epi32(sum_s_32, z), + _mm_unpacklo_epi32(sum_s_32, z))); + _mm_store_si128 ((__m128i*)sum_r_64, + _mm_add_epi64(_mm_unpackhi_epi32(sum_r_32, z), + _mm_unpacklo_epi32(sum_r_32, z))); + _mm_store_si128 ((__m128i*)sum_sq_s_64, + _mm_add_epi64(_mm_unpackhi_epi32(sum_sq_s_32, z), + _mm_unpacklo_epi32(sum_sq_s_32, z))); + _mm_store_si128 ((__m128i*)sum_sq_r_64, + _mm_add_epi64(_mm_unpackhi_epi32(sum_sq_r_32, z), + _mm_unpacklo_epi32(sum_sq_r_32, z))); + _mm_store_si128 ((__m128i*)sum_sxr_64, + _mm_add_epi64(_mm_unpackhi_epi32(sum_sxr_32, z), + _mm_unpacklo_epi32(sum_sxr_32, z))); + + const unsigned long sum_s = sum_s_64[0] + sum_s_64[1]; + const unsigned long sum_r = sum_r_64[0] + sum_r_64[1]; + const unsigned long sum_sq_s = sum_sq_s_64[0] + sum_sq_s_64[1]; + const unsigned long sum_sq_r = sum_sq_r_64[0] + sum_sq_r_64[1]; + const unsigned long sum_sxr = sum_sxr_64[0] + sum_sxr_64[1]; + + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); +} +#endif + double SSIM_frame ( @@ -516,9 +586,18 @@ SSIM_frame int width, int height) { - WebRtc_Word32 i,j; - WebRtc_UWord32 samples = 0; + int i,j; + unsigned int samples = 0; double ssim_total = 0; + double (*ssim_8x8)(unsigned char*, int, unsigned char*, int rp); + + ssim_8x8 = ssim_8x8_c; + if(WebRtc_GetCPUInfo(kSSE2)) + { +#if defined(WEBRTC_USE_SSE2) + ssim_8x8 = ssim_8x8_sse2; +#endif + } // sample point start with each 4x4 location for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4)