From fea3556e20f63ed1a03daa45804f724e70705aa0 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 9 Feb 2012 12:38:31 -0800 Subject: [PATCH] Fix variance overflow In the variance calculations the difference is summed and later squared. When the sum exceeds sqrt(2^31) the value is treated as a negative when it is shifted which gives incorrect results. To fix this we cast the result of the multiplication as unsigned. The alternative fix is to shift sum down by 4 before multiplying. However that will reduce precision. For 16x16 blocks the maximum sum is 65280 and sqrt(2^31) is 46340 (and change). PPC change is untested. Change-Id: I1bad27ea0720067def6d71a6da5f789508cec265 --- .../arm/armv6/vp8_variance16x16_armv6.asm | 2 +- .../vp8_variance_halfpixvar16x16_h_armv6.asm | 2 +- .../vp8_variance_halfpixvar16x16_hv_armv6.asm | 2 +- .../vp8_variance_halfpixvar16x16_v_armv6.asm | 2 +- vp8/encoder/arm/neon/variance_neon.asm | 22 +++++++++---------- .../neon/vp8_subpixelvariance16x16_neon.asm | 4 ++-- .../neon/vp8_subpixelvariance16x16s_neon.asm | 16 +++++++------- .../arm/neon/vp8_subpixelvariance8x8_neon.asm | 4 ++-- vp8/encoder/ppc/variance_altivec.asm | 6 ++--- vp8/encoder/ppc/variance_subpixel_altivec.asm | 2 +- vp8/encoder/variance_c.c | 10 ++++----- vp8/encoder/x86/variance_mmx.c | 10 ++++----- vp8/encoder/x86/variance_sse2.c | 8 +++---- vp8/encoder/x86/variance_ssse3.c | 4 ++-- 14 files changed, 47 insertions(+), 47 deletions(-) diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm index 5feaa8bc2..dc84c30da 100644 --- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm +++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm @@ -144,7 +144,7 @@ loop ldr r6, [sp, #40] ; get address of sse mul r0, r8, r8 ; sum * sum str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) ldmfd sp!, {r4-r12, pc} diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm index 1b5489795..dd2ce685c 100644 --- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm +++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm @@ -169,7 +169,7 @@ loop ldr r6, [sp, #40] ; get address of sse mul r0, r8, r8 ; sum * sum str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) ldmfd sp!, {r4-r12, pc} diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm index 38c55edf8..f972d9b5b 100644 --- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm +++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm @@ -210,7 +210,7 @@ loop ldr r6, [sp, #40] ; get address of sse mul r0, r8, r8 ; sum * sum str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) ldmfd sp!, {r4-r12, pc} diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm index 22a50eb00..f5da9c09e 100644 --- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm +++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm @@ -171,7 +171,7 @@ loop ldr r6, [sp, #40] ; get address of sse mul r0, r8, r8 ; sum * sum str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) ldmfd sp!, {r4-r12, pc} diff --git a/vp8/encoder/arm/neon/variance_neon.asm b/vp8/encoder/arm/neon/variance_neon.asm index e1a46869a..e3b48327d 100644 --- a/vp8/encoder/arm/neon/variance_neon.asm +++ b/vp8/encoder/arm/neon/variance_neon.asm @@ -77,14 +77,14 @@ variance16x16_neon_loop ;vmov.32 r1, d1[0] ;mul r0, r0, r0 ;str r1, [r12] - ;sub r0, r1, r0, asr #8 + ;sub r0, r1, r0, lsr #8 - ;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should - ;have sign-bit exension, which is vshr.s. Have to use s32 to make it right. + ; while sum is signed, sum * sum is always positive and must be treated as + ; unsigned to avoid propagating the sign bit. vmull.s32 q5, d0, d0 vst1.32 {d1[0]}, [r12] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return bx lr @@ -145,8 +145,8 @@ variance16x8_neon_loop vmull.s32 q5, d0, d0 vst1.32 {d1[0]}, [r12] ;store sse - vshr.s32 d10, d10, #7 - vsub.s32 d0, d1, d10 + vshr.u32 d10, d10, #7 + vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return bx lr @@ -200,8 +200,8 @@ variance8x16_neon_loop vmull.s32 q5, d0, d0 vst1.32 {d1[0]}, [r12] ;store sse - vshr.s32 d10, d10, #7 - vsub.s32 d0, d1, d10 + vshr.u32 d10, d10, #7 + vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return bx lr @@ -265,8 +265,8 @@ variance8x8_neon_loop vmull.s32 q5, d0, d0 vst1.32 {d1[0]}, [r12] ;store sse - vshr.s32 d10, d10, #6 - vsub.s32 d0, d1, d10 + vshr.u32 d10, d10, #6 + vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return bx lr diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm index 5107d8b99..d753ad129 100644 --- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm +++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm @@ -405,8 +405,8 @@ sub_pixel_variance16x16_neon_loop vmull.s32 q5, d0, d0 vst1.32 {d1[0]}, [r6] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 add sp, sp, #528 vmov.32 r0, d0[0] ;return diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm index 0a2b71c49..155be4fc5 100644 --- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm +++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm @@ -112,8 +112,8 @@ vp8_filt_fpo16x16s_4_0_loop_neon vmull.s32 q5, d0, d0 vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return pop {pc} @@ -208,8 +208,8 @@ vp8_filt_spo16x16s_0_4_loop_neon vmull.s32 q5, d0, d0 vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return pop {pc} @@ -327,8 +327,8 @@ vp8_filt16x16s_4_4_loop_neon vmull.s32 q5, d0, d0 vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return pop {pc} @@ -560,8 +560,8 @@ sub_pixel_variance16x16s_neon_loop vmull.s32 q5, d0, d0 vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 + vshr.u32 d10, d10, #8 + vsub.u32 d0, d1, d10 add sp, sp, #256 vmov.32 r0, d0[0] ;return diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm index 38b58780a..cc7ae52c9 100644 --- a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm +++ b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm @@ -206,8 +206,8 @@ sub_pixel_variance8x8_neon_loop vmull.s32 q5, d0, d0 vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #6 - vsub.s32 d0, d1, d10 + vshr.u32 d10, d10, #6 + vsub.u32 d0, d1, d10 vmov.32 r0, d0[0] ;return pop {r4-r5, pc} diff --git a/vp8/encoder/ppc/variance_altivec.asm b/vp8/encoder/ppc/variance_altivec.asm index a1ebf663a..fb8d5bb1d 100644 --- a/vp8/encoder/ppc/variance_altivec.asm +++ b/vp8/encoder/ppc/variance_altivec.asm @@ -98,7 +98,7 @@ stw r4, 0(r7) ;# sse mullw r3, r3, r3 ;# sum*sum - srawi r3, r3, \DS ;# (sum*sum) >> DS + srlwi r3, r3, \DS ;# (sum*sum) >> DS subf r3, r3, r4 ;# sse - ((sum*sum) >> DS) .endm @@ -142,7 +142,7 @@ stw r4, 0(r7) ;# sse mullw r3, r3, r3 ;# sum*sum - srawi r3, r3, \DS ;# (sum*sum) >> 8 + srlwi r3, r3, \DS ;# (sum*sum) >> 8 subf r3, r3, r4 ;# sse - ((sum*sum) >> 8) .endm @@ -367,7 +367,7 @@ vp8_variance4x4_ppc: stw r4, 0(r7) ;# sse mullw r3, r3, r3 ;# sum*sum - srawi r3, r3, 4 ;# (sum*sum) >> 4 + srlwi r3, r3, 4 ;# (sum*sum) >> 4 subf r3, r3, r4 ;# sse - ((sum*sum) >> 4) epilogue diff --git a/vp8/encoder/ppc/variance_subpixel_altivec.asm b/vp8/encoder/ppc/variance_subpixel_altivec.asm index 301360b1d..2308373a1 100644 --- a/vp8/encoder/ppc/variance_subpixel_altivec.asm +++ b/vp8/encoder/ppc/variance_subpixel_altivec.asm @@ -157,7 +157,7 @@ stw r4, 0(r9) ;# sse mullw r3, r3, r3 ;# sum*sum - srawi r3, r3, \DS ;# (sum*sum) >> 8 + srlwi r3, r3, \DS ;# (sum*sum) >> 8 subf r3, r3, r4 ;# sse - ((sum*sum) >> 8) .endm diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c index c7b9c2209..dbcbc7a01 100644 --- a/vp8/encoder/variance_c.c +++ b/vp8/encoder/variance_c.c @@ -75,7 +75,7 @@ unsigned int vp8_variance16x16_c( variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); *sse = var; - return (var - ((avg * avg) >> 8)); + return (var - ((unsigned int)(avg * avg) >> 8)); } unsigned int vp8_variance8x16_c( @@ -91,7 +91,7 @@ unsigned int vp8_variance8x16_c( variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg); *sse = var; - return (var - ((avg * avg) >> 7)); + return (var - ((unsigned int)(avg * avg) >> 7)); } unsigned int vp8_variance16x8_c( @@ -107,7 +107,7 @@ unsigned int vp8_variance16x8_c( variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg); *sse = var; - return (var - ((avg * avg) >> 7)); + return (var - ((unsigned int)(avg * avg) >> 7)); } @@ -124,7 +124,7 @@ unsigned int vp8_variance8x8_c( variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg); *sse = var; - return (var - ((avg * avg) >> 6)); + return (var - ((unsigned int)(avg * avg) >> 6)); } unsigned int vp8_variance4x4_c( @@ -140,7 +140,7 @@ unsigned int vp8_variance4x4_c( variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg); *sse = var; - return (var - ((avg * avg) >> 4)); + return (var - ((unsigned int)(avg * avg) >> 4)); } diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c index e2524b46a..53466e9ec 100644 --- a/vp8/encoder/x86/variance_mmx.c +++ b/vp8/encoder/x86/variance_mmx.c @@ -91,7 +91,7 @@ unsigned int vp8_variance4x4_mmx( vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; *sse = var; - return (var - ((avg * avg) >> 4)); + return (var - ((unsigned int)(avg * avg) >> 4)); } @@ -108,7 +108,7 @@ unsigned int vp8_variance8x8_mmx( vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; *sse = var; - return (var - ((avg * avg) >> 6)); + return (var - ((unsigned int)(avg * avg) >> 6)); } @@ -153,7 +153,7 @@ unsigned int vp8_variance16x16_mmx( var = sse0 + sse1 + sse2 + sse3; avg = sum0 + sum1 + sum2 + sum3; *sse = var; - return (var - ((avg * avg) >> 8)); + return (var - ((unsigned int)(avg * avg) >> 8)); } unsigned int vp8_variance16x8_mmx( @@ -172,7 +172,7 @@ unsigned int vp8_variance16x8_mmx( var = sse0 + sse1; avg = sum0 + sum1; *sse = var; - return (var - ((avg * avg) >> 7)); + return (var - ((unsigned int)(avg * avg) >> 7)); } @@ -194,7 +194,7 @@ unsigned int vp8_variance8x16_mmx( avg = sum0 + sum1; *sse = var; - return (var - ((avg * avg) >> 7)); + return (var - ((unsigned int)(avg * avg) >> 7)); } diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c index 39213b03d..8fbd07fbe 100644 --- a/vp8/encoder/x86/variance_sse2.c +++ b/vp8/encoder/x86/variance_sse2.c @@ -148,7 +148,7 @@ unsigned int vp8_variance4x4_wmt( vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; *sse = var; - return (var - ((avg * avg) >> 4)); + return (var - ((unsigned int)(avg * avg) >> 4)); } @@ -165,7 +165,7 @@ unsigned int vp8_variance8x8_wmt vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; *sse = var; - return (var - ((avg * avg) >> 6)); + return (var - ((unsigned int)(avg * avg) >> 6)); } @@ -220,7 +220,7 @@ unsigned int vp8_variance16x8_wmt var = sse0 + sse1; avg = sum0 + sum1; *sse = var; - return (var - ((avg * avg) >> 7)); + return (var - ((unsigned int)(avg * avg) >> 7)); } @@ -241,7 +241,7 @@ unsigned int vp8_variance8x16_wmt var = sse0 + sse1; avg = sum0 + sum1; *sse = var; - return (var - ((avg * avg) >> 7)); + return (var - ((unsigned int)(avg * avg) >> 7)); } diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c index 73f2e01a2..8b8b87e2d 100644 --- a/vp8/encoder/x86/variance_ssse3.c +++ b/vp8/encoder/x86/variance_ssse3.c @@ -112,7 +112,7 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3 } *sse = xxsum0; - return (xxsum0 - ((xsum0 * xsum0) >> 8)); + return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8)); } unsigned int vp8_sub_pixel_variance16x8_ssse3 @@ -161,5 +161,5 @@ unsigned int vp8_sub_pixel_variance16x8_ssse3 } *sse = xxsum0; - return (xxsum0 - ((xsum0 * xsum0) >> 7)); + return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7)); }