From d4cdb683a48fc25bd3d32f53964023828f13cdbf Mon Sep 17 00:00:00 2001 From: Jim Bankoski Date: Mon, 28 Mar 2011 16:39:05 -0700 Subject: [PATCH] fixed an overflow in ssim calculation This commit fixed an overflow in ssim calculation, added register save and restore to make sure assembly code working for x64 platform. It also changed the sampling points to every 4x4 instead of 8x8 and adjusted the constants in SSIM calculation to match the scale of previous VPXSSIM. Change-Id: Ia4dbb8c69eac55812f4662c88ab4653b6720537b --- vp8/encoder/ssim.c | 66 +++++++++++++++++++++++++----------- vp8/encoder/x86/ssim_opt.asm | 37 ++++++++++---------- 2 files changed, 65 insertions(+), 38 deletions(-) diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c index 64d67c6dd..c78be37a3 100644 --- a/vp8/encoder/ssim.c +++ b/vp8/encoder/ssim.c @@ -290,8 +290,8 @@ void ssim_parms_8x8_c } } -const static long long c1 = 426148; // (256^2*(.01*255)^2 -const static long long c2 = 3835331; //(256^2*(.03*255)^2 +const static long long cc1 = 26634; // (64^2*(.01*255)^2 +const static long long cc2 = 239708; // (64^2*(.03*255)^2 static double similarity ( @@ -303,10 +303,19 @@ static double similarity int count ) { - long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2); + long long ssim_n, ssim_d; + long long c1, c2; - long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* - (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ; + //scale the constants by number of pixels + c1 = (cc1*count*count)>>12; + c2 = (cc2*count*count)>>12; + + ssim_n = (2*sum_s*sum_r+ c1)*((long long) 2*count*sum_sxr- + (long long) 2*sum_s*sum_r+c2); + + ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* + ((long long)count*sum_sq_s-(long long)sum_s*sum_s + + (long long)count*sum_sq_r-(long long) sum_r*sum_r +c2) ; return ssim_n * 1.0 / ssim_d; } @@ -332,18 +341,33 @@ long dssim(unsigned char *s,int sp, unsigned char *r,int rp, const vp8_variance_rtcd_vtable_t *rtcd) { unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; - double ssim3; - long long ssim_n; - long long ssim_d; + long long ssim3; + long long ssim_n,ssim_n1,ssim_n2; + long long ssim_d,ssim_d1,ssim_d2; + long long ssim_t1,ssim_t2; + long long c1, c2; + + // normalize by 256/64 + c1 = cc1*16; + c2 = cc2*16; rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); - ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2); + ssim_n1 = (2*sum_s*sum_r+ c1); - ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* - (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ; + ssim_n2 =((long long) 2*256*sum_sxr-(long long) 2*sum_s*sum_r+c2); - ssim3 = 256 * (ssim_d-ssim_n) / ssim_d; - return (long)( 256*ssim3 * ssim3 ); + ssim_d1 =((long long)sum_s*sum_s +(long long)sum_r*sum_r+c1); + + ssim_d2 = (256 * (long long) sum_sq_s-(long long) sum_s*sum_s + + (long long) 256*sum_sq_r-(long long) sum_r*sum_r +c2) ; + + ssim_t1 = 256 - 256 * ssim_n1 / ssim_d1; + ssim_t2 = 256 - 256 * ssim_n2 / ssim_d2; + + ssim3 = 256 *ssim_t1 * ssim_t2; + if(ssim3 <0 ) + ssim3=0; + return (long)( ssim3 ); } // TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels // such that the window regions overlap block boundaries to penalize blocking @@ -361,18 +385,20 @@ double vp8_ssim2 ) { int i,j; - + int samples =0; double ssim_total=0; - // we can sample points as frequently as we like start with 1 per 8x8 - for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8) + // we can sample points as frequently as we like start with 1 per 4x4 + for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4) { - for(j=0; j < width; j+=8 ) + for(j=0; j < width-8; j+=4 ) { - ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd); + double v = ssim_8x8(img1+j, stride_img1, img2+j, stride_img2, rtcd); + ssim_total += v; + samples++; } } - ssim_total /= (width/8 * height /8); + ssim_total /= samples; return ssim_total; } @@ -405,4 +431,4 @@ double vp8_calc_ssim *weight = 1; return ssimv; -} +} \ No newline at end of file diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm index c267cdb54..d6cebf33d 100644 --- a/vp8/encoder/x86/ssim_opt.asm +++ b/vp8/encoder/x86/ssim_opt.asm @@ -16,12 +16,12 @@ paddusw xmm14, xmm4 ; sum_r movdqa xmm1, xmm3 pmaddwd xmm1, xmm1 - paddq xmm13, xmm1 ; sum_sq_s + paddd xmm13, xmm1 ; sum_sq_s movdqa xmm2, xmm4 pmaddwd xmm2, xmm2 - paddq xmm12, xmm2 ; sum_sq_r + paddd xmm12, xmm2 ; sum_sq_r pmaddwd xmm3, xmm4 - paddq xmm11, xmm3 ; sum_sxr + paddd xmm11, xmm3 ; sum_sxr %endmacro ; Sum across the register %1 starting with q words @@ -66,6 +66,7 @@ sym(vp8_ssim_parms_16x16_sse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 + SAVE_XMM push rsi push rdi ; end prolog @@ -115,19 +116,20 @@ NextRow: SUM_ACROSS_Q xmm11 mov rdi,arg(4) - movq [rdi], xmm15; + movd [rdi], xmm15; mov rdi,arg(5) - movq [rdi], xmm14; + movd [rdi], xmm14; mov rdi,arg(6) - movq [rdi], xmm13; + movd [rdi], xmm13; mov rdi,arg(7) - movq [rdi], xmm12; + movd [rdi], xmm12; mov rdi,arg(8) - movq [rdi], xmm11; + movd [rdi], xmm11; ; begin epilog pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -154,6 +156,7 @@ sym(vp8_ssim_parms_8x8_sse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 + SAVE_XMM push rsi push rdi ; end prolog @@ -174,11 +177,8 @@ sym(vp8_ssim_parms_8x8_sse3): NextRow2: ;grab source and reference pixels - movq xmm5, [rsi] - movq xmm6, [rdi] - - movdqa xmm3, xmm5 - movdqa xmm4, xmm6 + movq xmm3, [rsi] + movq xmm4, [rdi] punpcklbw xmm3, xmm0 ; low_s punpcklbw xmm4, xmm0 ; low_r @@ -197,19 +197,20 @@ NextRow2: SUM_ACROSS_Q xmm11 mov rdi,arg(4) - movq [rdi], xmm15; + movd [rdi], xmm15; mov rdi,arg(5) - movq [rdi], xmm14; + movd [rdi], xmm14; mov rdi,arg(6) - movq [rdi], xmm13; + movd [rdi], xmm13; mov rdi,arg(7) - movq [rdi], xmm12; + movd [rdi], xmm12; mov rdi,arg(8) - movq [rdi], xmm11; + movd [rdi], xmm11; ; begin epilog pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret