Fix variance overflow
In the variance calculations the differences are summed and the sum is later squared. When the magnitude of the sum exceeds sqrt(2^31), the square no longer fits in a signed 32-bit integer, so it is treated as a negative value when it is shifted, which gives incorrect results. To fix this we cast the result of the multiplication to unsigned before shifting. The alternative fix is to shift the sum down by 4 before multiplying; however, that would reduce precision. For 16x16 blocks the maximum sum is 65280 (255 * 256), while sqrt(2^31) is 46340 (and change); 65280^2 is still below 2^32, so the product fits in an unsigned 32-bit value. PPC change is untested. Change-Id: I1bad27ea0720067def6d71a6da5f789508cec265
This commit is contained in:
parent
2e0d55314c
commit
fea3556e20
@ -144,7 +144,7 @@ loop
|
|||||||
ldr r6, [sp, #40] ; get address of sse
|
ldr r6, [sp, #40] ; get address of sse
|
||||||
mul r0, r8, r8 ; sum * sum
|
mul r0, r8, r8 ; sum * sum
|
||||||
str r11, [r6] ; store sse
|
str r11, [r6] ; store sse
|
||||||
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
|
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
|
||||||
|
|
||||||
ldmfd sp!, {r4-r12, pc}
|
ldmfd sp!, {r4-r12, pc}
|
||||||
|
|
||||||
|
@ -169,7 +169,7 @@ loop
|
|||||||
ldr r6, [sp, #40] ; get address of sse
|
ldr r6, [sp, #40] ; get address of sse
|
||||||
mul r0, r8, r8 ; sum * sum
|
mul r0, r8, r8 ; sum * sum
|
||||||
str r11, [r6] ; store sse
|
str r11, [r6] ; store sse
|
||||||
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
|
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
|
||||||
|
|
||||||
ldmfd sp!, {r4-r12, pc}
|
ldmfd sp!, {r4-r12, pc}
|
||||||
|
|
||||||
|
@ -210,7 +210,7 @@ loop
|
|||||||
ldr r6, [sp, #40] ; get address of sse
|
ldr r6, [sp, #40] ; get address of sse
|
||||||
mul r0, r8, r8 ; sum * sum
|
mul r0, r8, r8 ; sum * sum
|
||||||
str r11, [r6] ; store sse
|
str r11, [r6] ; store sse
|
||||||
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
|
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
|
||||||
|
|
||||||
ldmfd sp!, {r4-r12, pc}
|
ldmfd sp!, {r4-r12, pc}
|
||||||
|
|
||||||
|
@ -171,7 +171,7 @@ loop
|
|||||||
ldr r6, [sp, #40] ; get address of sse
|
ldr r6, [sp, #40] ; get address of sse
|
||||||
mul r0, r8, r8 ; sum * sum
|
mul r0, r8, r8 ; sum * sum
|
||||||
str r11, [r6] ; store sse
|
str r11, [r6] ; store sse
|
||||||
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
|
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
|
||||||
|
|
||||||
ldmfd sp!, {r4-r12, pc}
|
ldmfd sp!, {r4-r12, pc}
|
||||||
|
|
||||||
|
@ -77,14 +77,14 @@ variance16x16_neon_loop
|
|||||||
;vmov.32 r1, d1[0]
|
;vmov.32 r1, d1[0]
|
||||||
;mul r0, r0, r0
|
;mul r0, r0, r0
|
||||||
;str r1, [r12]
|
;str r1, [r12]
|
||||||
;sub r0, r1, r0, asr #8
|
;sub r0, r1, r0, lsr #8
|
||||||
|
|
||||||
;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should
|
; While sum is signed, sum * sum is always non-negative and must be treated as
|
||||||
;have sign-bit exension, which is vshr.s. Have to use s32 to make it right.
|
; unsigned to avoid propagating the sign bit.
|
||||||
vmull.s32 q5, d0, d0
|
vmull.s32 q5, d0, d0
|
||||||
vst1.32 {d1[0]}, [r12] ;store sse
|
vst1.32 {d1[0]}, [r12] ;store sse
|
||||||
vshr.s32 d10, d10, #8
|
vshr.u32 d10, d10, #8
|
||||||
vsub.s32 d0, d1, d10
|
vsub.u32 d0, d1, d10
|
||||||
|
|
||||||
vmov.32 r0, d0[0] ;return
|
vmov.32 r0, d0[0] ;return
|
||||||
bx lr
|
bx lr
|
||||||
@ -145,8 +145,8 @@ variance16x8_neon_loop
|
|||||||
|
|
||||||
vmull.s32 q5, d0, d0
|
vmull.s32 q5, d0, d0
|
||||||
vst1.32 {d1[0]}, [r12] ;store sse
|
vst1.32 {d1[0]}, [r12] ;store sse
|
||||||
vshr.s32 d10, d10, #7
|
vshr.u32 d10, d10, #7
|
||||||
vsub.s32 d0, d1, d10
|
vsub.u32 d0, d1, d10
|
||||||
|
|
||||||
vmov.32 r0, d0[0] ;return
|
vmov.32 r0, d0[0] ;return
|
||||||
bx lr
|
bx lr
|
||||||
@ -200,8 +200,8 @@ variance8x16_neon_loop
|
|||||||
|
|
||||||
vmull.s32 q5, d0, d0
|
vmull.s32 q5, d0, d0
|
||||||
vst1.32 {d1[0]}, [r12] ;store sse
|
vst1.32 {d1[0]}, [r12] ;store sse
|
||||||
vshr.s32 d10, d10, #7
|
vshr.u32 d10, d10, #7
|
||||||
vsub.s32 d0, d1, d10
|
vsub.u32 d0, d1, d10
|
||||||
|
|
||||||
vmov.32 r0, d0[0] ;return
|
vmov.32 r0, d0[0] ;return
|
||||||
bx lr
|
bx lr
|
||||||
@ -265,8 +265,8 @@ variance8x8_neon_loop
|
|||||||
|
|
||||||
vmull.s32 q5, d0, d0
|
vmull.s32 q5, d0, d0
|
||||||
vst1.32 {d1[0]}, [r12] ;store sse
|
vst1.32 {d1[0]}, [r12] ;store sse
|
||||||
vshr.s32 d10, d10, #6
|
vshr.u32 d10, d10, #6
|
||||||
vsub.s32 d0, d1, d10
|
vsub.u32 d0, d1, d10
|
||||||
|
|
||||||
vmov.32 r0, d0[0] ;return
|
vmov.32 r0, d0[0] ;return
|
||||||
bx lr
|
bx lr
|
||||||
|
@ -405,8 +405,8 @@ sub_pixel_variance16x16_neon_loop
|
|||||||
|
|
||||||
vmull.s32 q5, d0, d0
|
vmull.s32 q5, d0, d0
|
||||||
vst1.32 {d1[0]}, [r6] ;store sse
|
vst1.32 {d1[0]}, [r6] ;store sse
|
||||||
vshr.s32 d10, d10, #8
|
vshr.u32 d10, d10, #8
|
||||||
vsub.s32 d0, d1, d10
|
vsub.u32 d0, d1, d10
|
||||||
|
|
||||||
add sp, sp, #528
|
add sp, sp, #528
|
||||||
vmov.32 r0, d0[0] ;return
|
vmov.32 r0, d0[0] ;return
|
||||||
|
@ -112,8 +112,8 @@ vp8_filt_fpo16x16s_4_0_loop_neon
|
|||||||
|
|
||||||
vmull.s32 q5, d0, d0
|
vmull.s32 q5, d0, d0
|
||||||
vst1.32 {d1[0]}, [lr] ;store sse
|
vst1.32 {d1[0]}, [lr] ;store sse
|
||||||
vshr.s32 d10, d10, #8
|
vshr.u32 d10, d10, #8
|
||||||
vsub.s32 d0, d1, d10
|
vsub.u32 d0, d1, d10
|
||||||
|
|
||||||
vmov.32 r0, d0[0] ;return
|
vmov.32 r0, d0[0] ;return
|
||||||
pop {pc}
|
pop {pc}
|
||||||
@ -208,8 +208,8 @@ vp8_filt_spo16x16s_0_4_loop_neon
|
|||||||
|
|
||||||
vmull.s32 q5, d0, d0
|
vmull.s32 q5, d0, d0
|
||||||
vst1.32 {d1[0]}, [lr] ;store sse
|
vst1.32 {d1[0]}, [lr] ;store sse
|
||||||
vshr.s32 d10, d10, #8
|
vshr.u32 d10, d10, #8
|
||||||
vsub.s32 d0, d1, d10
|
vsub.u32 d0, d1, d10
|
||||||
|
|
||||||
vmov.32 r0, d0[0] ;return
|
vmov.32 r0, d0[0] ;return
|
||||||
pop {pc}
|
pop {pc}
|
||||||
@ -327,8 +327,8 @@ vp8_filt16x16s_4_4_loop_neon
|
|||||||
|
|
||||||
vmull.s32 q5, d0, d0
|
vmull.s32 q5, d0, d0
|
||||||
vst1.32 {d1[0]}, [lr] ;store sse
|
vst1.32 {d1[0]}, [lr] ;store sse
|
||||||
vshr.s32 d10, d10, #8
|
vshr.u32 d10, d10, #8
|
||||||
vsub.s32 d0, d1, d10
|
vsub.u32 d0, d1, d10
|
||||||
|
|
||||||
vmov.32 r0, d0[0] ;return
|
vmov.32 r0, d0[0] ;return
|
||||||
pop {pc}
|
pop {pc}
|
||||||
@ -560,8 +560,8 @@ sub_pixel_variance16x16s_neon_loop
|
|||||||
|
|
||||||
vmull.s32 q5, d0, d0
|
vmull.s32 q5, d0, d0
|
||||||
vst1.32 {d1[0]}, [lr] ;store sse
|
vst1.32 {d1[0]}, [lr] ;store sse
|
||||||
vshr.s32 d10, d10, #8
|
vshr.u32 d10, d10, #8
|
||||||
vsub.s32 d0, d1, d10
|
vsub.u32 d0, d1, d10
|
||||||
|
|
||||||
add sp, sp, #256
|
add sp, sp, #256
|
||||||
vmov.32 r0, d0[0] ;return
|
vmov.32 r0, d0[0] ;return
|
||||||
|
@ -206,8 +206,8 @@ sub_pixel_variance8x8_neon_loop
|
|||||||
|
|
||||||
vmull.s32 q5, d0, d0
|
vmull.s32 q5, d0, d0
|
||||||
vst1.32 {d1[0]}, [lr] ;store sse
|
vst1.32 {d1[0]}, [lr] ;store sse
|
||||||
vshr.s32 d10, d10, #6
|
vshr.u32 d10, d10, #6
|
||||||
vsub.s32 d0, d1, d10
|
vsub.u32 d0, d1, d10
|
||||||
|
|
||||||
vmov.32 r0, d0[0] ;return
|
vmov.32 r0, d0[0] ;return
|
||||||
pop {r4-r5, pc}
|
pop {r4-r5, pc}
|
||||||
|
@ -98,7 +98,7 @@
|
|||||||
stw r4, 0(r7) ;# sse
|
stw r4, 0(r7) ;# sse
|
||||||
|
|
||||||
mullw r3, r3, r3 ;# sum*sum
|
mullw r3, r3, r3 ;# sum*sum
|
||||||
srawi r3, r3, \DS ;# (sum*sum) >> DS
|
srlwi r3, r3, \DS ;# (sum*sum) >> DS
|
||||||
subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
|
subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
@ -142,7 +142,7 @@
|
|||||||
stw r4, 0(r7) ;# sse
|
stw r4, 0(r7) ;# sse
|
||||||
|
|
||||||
mullw r3, r3, r3 ;# sum*sum
|
mullw r3, r3, r3 ;# sum*sum
|
||||||
srawi r3, r3, \DS ;# (sum*sum) >> 8
|
srlwi r3, r3, \DS ;# (sum*sum) >> 8
|
||||||
subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
|
subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
@ -367,7 +367,7 @@ vp8_variance4x4_ppc:
|
|||||||
stw r4, 0(r7) ;# sse
|
stw r4, 0(r7) ;# sse
|
||||||
|
|
||||||
mullw r3, r3, r3 ;# sum*sum
|
mullw r3, r3, r3 ;# sum*sum
|
||||||
srawi r3, r3, 4 ;# (sum*sum) >> 4
|
srlwi r3, r3, 4 ;# (sum*sum) >> 4
|
||||||
subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
|
subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
|
||||||
|
|
||||||
epilogue
|
epilogue
|
||||||
|
@ -157,7 +157,7 @@
|
|||||||
stw r4, 0(r9) ;# sse
|
stw r4, 0(r9) ;# sse
|
||||||
|
|
||||||
mullw r3, r3, r3 ;# sum*sum
|
mullw r3, r3, r3 ;# sum*sum
|
||||||
srawi r3, r3, \DS ;# (sum*sum) >> 8
|
srlwi r3, r3, \DS ;# (sum*sum) >> 8
|
||||||
subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
|
subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
@ -75,7 +75,7 @@ unsigned int vp8_variance16x16_c(
|
|||||||
|
|
||||||
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
|
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 8));
|
return (var - ((unsigned int)(avg * avg) >> 8));
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int vp8_variance8x16_c(
|
unsigned int vp8_variance8x16_c(
|
||||||
@ -91,7 +91,7 @@ unsigned int vp8_variance8x16_c(
|
|||||||
|
|
||||||
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
|
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 7));
|
return (var - ((unsigned int)(avg * avg) >> 7));
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int vp8_variance16x8_c(
|
unsigned int vp8_variance16x8_c(
|
||||||
@ -107,7 +107,7 @@ unsigned int vp8_variance16x8_c(
|
|||||||
|
|
||||||
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
|
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 7));
|
return (var - ((unsigned int)(avg * avg) >> 7));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -124,7 +124,7 @@ unsigned int vp8_variance8x8_c(
|
|||||||
|
|
||||||
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
|
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 6));
|
return (var - ((unsigned int)(avg * avg) >> 6));
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int vp8_variance4x4_c(
|
unsigned int vp8_variance4x4_c(
|
||||||
@ -140,7 +140,7 @@ unsigned int vp8_variance4x4_c(
|
|||||||
|
|
||||||
variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
|
variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 4));
|
return (var - ((unsigned int)(avg * avg) >> 4));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -91,7 +91,7 @@ unsigned int vp8_variance4x4_mmx(
|
|||||||
|
|
||||||
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
|
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 4));
|
return (var - ((unsigned int)(avg * avg) >> 4));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -108,7 +108,7 @@ unsigned int vp8_variance8x8_mmx(
|
|||||||
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
|
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
|
||||||
*sse = var;
|
*sse = var;
|
||||||
|
|
||||||
return (var - ((avg * avg) >> 6));
|
return (var - ((unsigned int)(avg * avg) >> 6));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -153,7 +153,7 @@ unsigned int vp8_variance16x16_mmx(
|
|||||||
var = sse0 + sse1 + sse2 + sse3;
|
var = sse0 + sse1 + sse2 + sse3;
|
||||||
avg = sum0 + sum1 + sum2 + sum3;
|
avg = sum0 + sum1 + sum2 + sum3;
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 8));
|
return (var - ((unsigned int)(avg * avg) >> 8));
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int vp8_variance16x8_mmx(
|
unsigned int vp8_variance16x8_mmx(
|
||||||
@ -172,7 +172,7 @@ unsigned int vp8_variance16x8_mmx(
|
|||||||
var = sse0 + sse1;
|
var = sse0 + sse1;
|
||||||
avg = sum0 + sum1;
|
avg = sum0 + sum1;
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 7));
|
return (var - ((unsigned int)(avg * avg) >> 7));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -194,7 +194,7 @@ unsigned int vp8_variance8x16_mmx(
|
|||||||
avg = sum0 + sum1;
|
avg = sum0 + sum1;
|
||||||
*sse = var;
|
*sse = var;
|
||||||
|
|
||||||
return (var - ((avg * avg) >> 7));
|
return (var - ((unsigned int)(avg * avg) >> 7));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -148,7 +148,7 @@ unsigned int vp8_variance4x4_wmt(
|
|||||||
|
|
||||||
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
|
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 4));
|
return (var - ((unsigned int)(avg * avg) >> 4));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,7 +165,7 @@ unsigned int vp8_variance8x8_wmt
|
|||||||
|
|
||||||
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
|
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 6));
|
return (var - ((unsigned int)(avg * avg) >> 6));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -220,7 +220,7 @@ unsigned int vp8_variance16x8_wmt
|
|||||||
var = sse0 + sse1;
|
var = sse0 + sse1;
|
||||||
avg = sum0 + sum1;
|
avg = sum0 + sum1;
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 7));
|
return (var - ((unsigned int)(avg * avg) >> 7));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -241,7 +241,7 @@ unsigned int vp8_variance8x16_wmt
|
|||||||
var = sse0 + sse1;
|
var = sse0 + sse1;
|
||||||
avg = sum0 + sum1;
|
avg = sum0 + sum1;
|
||||||
*sse = var;
|
*sse = var;
|
||||||
return (var - ((avg * avg) >> 7));
|
return (var - ((unsigned int)(avg * avg) >> 7));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -112,7 +112,7 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3
|
|||||||
}
|
}
|
||||||
|
|
||||||
*sse = xxsum0;
|
*sse = xxsum0;
|
||||||
return (xxsum0 - ((xsum0 * xsum0) >> 8));
|
return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int vp8_sub_pixel_variance16x8_ssse3
|
unsigned int vp8_sub_pixel_variance16x8_ssse3
|
||||||
@ -161,5 +161,5 @@ unsigned int vp8_sub_pixel_variance16x8_ssse3
|
|||||||
}
|
}
|
||||||
|
|
||||||
*sse = xxsum0;
|
*sse = xxsum0;
|
||||||
return (xxsum0 - ((xsum0 * xsum0) >> 7));
|
return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user