diff --git a/extras/get_disto.c b/extras/get_disto.c
index 75ecd62c..172bb43f 100644
--- a/extras/get_disto.c
+++ b/extras/get_disto.c
@@ -94,26 +94,28 @@ typedef struct {
   uint32_t xxm, xym, yym;   // sum(w_i * x_i * x_i), etc.
 } DistoStats;
 
-static const double kMinValue = 1.e-10;  // minimal threshold
+// hat-shaped filter. Sum of coefficients is equal to 16.
+static const uint32_t kWeight[2 * SSIM_KERNEL + 1] = { 1, 2, 3, 4, 3, 2, 1 };
 
 static WEBP_INLINE double SSIMCalculation(const DistoStats* const stats) {
   const uint32_t N = stats->w;
-  const double w2 = N * N;
-  const double C1 = 6.5025 * w2;
-  const double C2 = 58.5225 * w2;
-  const double xmxm = stats->xm * stats->xm;
-  const double ymym = stats->ym * stats->ym;
-  const double xmym = stats->xm * stats->ym;
-  const double sxy = stats->xym * N - xmym;
-  const double fnum = (2 * xmym + C1) * (2 * sxy + C2);
-  double fden;
-  double sxx = stats->xxm * N - xmxm;
-  double syy = stats->yym * N - ymym;
-  // small errors are possible, due to rounding. Clamp to zero.
-  if (sxx < 0.) sxx = 0.;
-  if (syy < 0.) syy = 0.;
-  fden = (xmxm + ymym + C1) * (sxx + syy + C2);
-  return (fden != 0.) ? fnum / fden : kMinValue;
+  const uint32_t w2 = N * N;
+  const uint32_t C1 = 20 * w2;
+  const uint32_t C2 = 60 * w2;
+  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+  const int64_t xmym = (int64_t)stats->xm * stats->ym;
+  const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
+  const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+  const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+  // we descale by 8 to prevent overflow during the fnum/fden multiply.
+  const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+  const uint64_t den_S = (sxx + syy + C2) >> 8;
+  const uint64_t fnum = (2 * xmym + C1) * num_S;
+  const uint64_t fden = (xmxm + ymym + C1) * den_S;
+  const double r = (double)fnum / fden;
+  assert(r >= 0. && r <= 1.0);
+  return r;
 }
 
 static double SSIMGetClipped(const uint8_t* src1, int stride1,
@@ -129,16 +131,18 @@ static double SSIMGetClipped(const uint8_t* src1, int stride1,
   src2 += ymin * stride2;
   for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
     for (x = xmin; x <= xmax; ++x) {
-      const int s1 = src1[x];
-      const int s2 = src2[x];
-      stats.xm  += s1;
-      stats.ym  += s2;
-      stats.xxm += s1 * s1;
-      stats.xym += s1 * s2;
-      stats.yym += s2 * s2;
+      const uint32_t w = kWeight[SSIM_KERNEL + x - xo]
+                       * kWeight[SSIM_KERNEL + y - yo];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.w   += w;
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
     }
   }
-  stats.w = (ymax - ymin + 1) * (xmax - xmin + 1);
   return SSIMCalculation(&stats);
 }
 
diff --git a/src/dsp/enc.c b/src/dsp/enc.c
index 3013b87c..71cc5348 100644
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -693,31 +693,35 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) {
 //------------------------------------------------------------------------------
 // SSIM / PSNR
 
-static const double kMinValue = 1.e-10;  // minimal threshold
+// hat-shaped filter. Sum of coefficients is equal to 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+  1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16;  // sum{kWeight}^2
 
 static WEBP_INLINE double SSIMCalculation(
     const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
-  const double w2 = N * N;
-  const double C1 = 6.5025 * w2;
-  const double C2 = 58.5225 * w2;
-  const double xmxm = stats->xm * stats->xm;
-  const double ymym = stats->ym * stats->ym;
-  const double xmym = stats->xm * stats->ym;
-  const double sxy = stats->xym * N - xmym;
-  const double fnum = (2 * xmym + C1) * (2 * sxy + C2);
-  double fden;
-  double sxx = stats->xxm * N - xmxm;
-  double syy = stats->yym * N - ymym;
-  // small errors are possible, due to rounding. Clamp to zero.
-  if (sxx < 0.) sxx = 0.;
-  if (syy < 0.) syy = 0.;
-  fden = (xmxm + ymym + C1) * (sxx + syy + C2);
-  return (fden != 0.) ? fnum / fden : kMinValue;
+  const uint32_t w2 = N * N;
+  const uint32_t C1 = 20 * w2;
+  const uint32_t C2 = 60 * w2;
+  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+  const int64_t xmym = (int64_t)stats->xm * stats->ym;
+  const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
+  const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+  const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+  // we descale by 8 to prevent overflow during the fnum/fden multiply.
+  const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+  const uint64_t den_S = (sxx + syy + C2) >> 8;
+  const uint64_t fnum = (2 * xmym + C1) * num_S;
+  const uint64_t fden = (xmxm + ymym + C1) * den_S;
+  const double r = (double)fnum / fden;
+  assert(r >= 0. && r <= 1.0);
+  return r;
 }
 
 double VP8SSIMFromStats(const VP8DistoStats* const stats) {
-  const uint32_t w = (2 * VP8_SSIM_KERNEL + 1) * (2 * VP8_SSIM_KERNEL + 1);
-  return SSIMCalculation(stats, w);
+  return SSIMCalculation(stats, kWeightSum);
 }
 
 double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
@@ -739,16 +743,18 @@ static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
   src2 += ymin * stride2;
   for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
     for (x = xmin; x <= xmax; ++x) {
-      const int s1 = src1[x];
-      const int s2 = src2[x];
-      stats.xm  += s1;
-      stats.ym  += s2;
-      stats.xxm += s1 * s1;
-      stats.xym += s1 * s2;
-      stats.yym += s2 * s2;
+      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+                       * kWeight[VP8_SSIM_KERNEL + y - yo];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.w   += w;
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
     }
   }
-  stats.w = (ymax - ymin + 1) * (xmax - xmin + 1);
   return VP8SSIMFromStatsClipped(&stats);
 }
 
@@ -758,13 +764,14 @@ static double SSIMGet_C(const uint8_t* src1, int stride1,
   int x, y;
   for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
     for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
-      const int s1 = src1[x];
-      const int s2 = src2[x];
-      stats.xm  += s1;
-      stats.ym  += s2;
-      stats.xxm += s1 * s1;
-      stats.xym += s1 * s2;
-      stats.yym += s2 * s2;
+      const uint32_t w = kWeight[x] * kWeight[y];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
     }
   }
   return VP8SSIMFromStats(&stats);
diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c
index b5fb83ee..59d83de9 100644
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -1422,30 +1422,45 @@ static uint32_t HorizontalAdd32b(const __m128i* const m) {
   return (uint32_t)_mm_cvtsi128_si32(c);
 }
 
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
+
+#define ACCUMULATE_ROW(WEIGHT) do {                          \
+  /* compute row weight (Wx * Wy) */                         \
+  const __m128i Wy = _mm_set1_epi16((WEIGHT));               \
+  const __m128i W = _mm_mullo_epi16(Wx, Wy);                 \
+  /* process 8 bytes at a time (7 bytes, actually) */        \
+  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1);  \
+  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2);  \
+  /* convert to 16b and multiply by weight */                \
+  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);            \
+  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);            \
+  const __m128i wa1 = _mm_mullo_epi16(a1, W);                \
+  const __m128i wb1 = _mm_mullo_epi16(b1, W);                \
+  /* accumulate */                                           \
+  xm = _mm_add_epi16(xm, wa1);                               \
+  ym = _mm_add_epi16(ym, wb1);                               \
+  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));         \
+  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));         \
+  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));         \
+  src1 += stride1;                                           \
+  src2 += stride2;                                           \
+} while (0)
+
 static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
                            const uint8_t* src2, int stride2) {
   VP8DistoStats stats;
-  int y;
   const __m128i zero = _mm_setzero_si128();
-  __m128i xm = zero, ym = zero;  // 16b accums
+  __m128i xm = zero, ym = zero;                // 16b accums
   __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
+  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
   assert(2 * VP8_SSIM_KERNEL + 1 == 7);
-  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
-    // process 8 bytes at a time (7 bytes, actually)
-    const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1);
-    const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2);
-    // convert 8b -> 16b
-    const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
-    const __m128i b1 = _mm_unpacklo_epi8(b0, zero);
-    // zeroes the rightmost pixel we are not interested in:
-    const __m128i s1 = _mm_slli_si128(a1, 2);
-    const __m128i s2 = _mm_slli_si128(b1, 2);
-    xm = _mm_add_epi16(xm, s1);
-    ym = _mm_add_epi16(ym, s2);
-    xxm = _mm_add_epi32(xxm, _mm_madd_epi16(s1, s1));
-    xym = _mm_add_epi32(xym, _mm_madd_epi16(s1, s2));
-    yym = _mm_add_epi32(yym, _mm_madd_epi16(s2, s2));
-  }
+  ACCUMULATE_ROW(1);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(4);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(1);
   stats.xm  = HorizontalAdd16b(&xm);
   stats.ym  = HorizontalAdd16b(&ym);
   stats.xxm = HorizontalAdd32b(&xxm);
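
Not part of the patch above: a minimal standalone sketch working through the bound behind the "we descale by 8 to prevent overflow" comment in SSIMCalculation. It assumes 8-bit samples and the 7x7 hat filter whose per-axis weights sum to 16, so the total 2-D weight N (kWeightSum) is 16 * 16 = 256; the *_max names are illustrative only and do not exist in the source.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint64_t N = 16 * 16;          /* total 2-D weight, i.e. kWeightSum */
  const uint64_t max_px = 255;         /* 8-bit samples */
  const uint64_t C1 = 20 * N * N;
  const uint64_t C2 = 60 * N * N;
  /* worst-case accumulators, reached for a constant block of 255's */
  const uint64_t xm_max   = N * max_px;            /* sum(w * x)    ~2^16 */
  const uint64_t xxm_max  = N * max_px * max_px;   /* sum(w * x^2)  ~2^24 */
  const uint64_t xmym_max = xm_max * xm_max;       /*               ~2^32 */
  const uint64_t sxy_max  = xxm_max * N;           /* bound on xym * N    */
  const uint64_t lhs_max  = 2 * xmym_max + C1;            /* ~2^33 */
  const uint64_t rhs_max  = (2 * sxy_max + C2) >> 8;      /* ~2^25 */
  /* 2^33 * 2^25 = 2^58 < 2^64, so fnum = lhs * rhs cannot overflow;  */
  /* without the ">> 8" the product would need roughly 66 bits.       */
  assert(rhs_max != 0 && lhs_max <= UINT64_MAX / rhs_max);
  printf("max fnum ~ %.3g (fits in 64 bits)\n",
         (double)lhs_max * (double)rhs_max);
  return 0;
}

The same bound covers fden, since xmxm + ymym + C1 and (sxx + syy + C2) >> 8 are capped by the same worst-case values used for fnum here.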