diff --git a/src/enc/quant_enc.c b/src/enc/quant_enc.c
index 67288a74..35bfaf21 100644
--- a/src/enc/quant_enc.c
+++ b/src/enc/quant_enc.c
@@ -829,11 +829,12 @@ static int ReconstructIntra4(VP8EncIterator* const it,
 //------------------------------------------------------------------------------
 // DC-error diffusion
 
-// Diffusion weights. We under-correct a bit (3/4th of the error is actually
+// Diffusion weights. We under-correct a bit (15/16th of the error is actually
 // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
-#define C1 2    // fraction of error sent to the 4x4 block below
-#define C2 1    // fraction of error sent to the 4x4 block on the right
-#define DSHIFT 2
+#define C1 7    // fraction of error sent to the 4x4 block below
+#define C2 8    // fraction of error sent to the 4x4 block on the right
+#define DSHIFT 4
+#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
 
 // Quantize as usual, but also compute and return the quantization error.
 // Error is already divided by DSHIFT.
@@ -845,10 +846,10 @@ static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
     const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
     const int err = (V - qV);
     *v = sign ? -qV : qV;
-    return (sign ? -err : err) >> DSHIFT;
+    return (sign ? -err : err) >> DSCALE;
   }
   *v = 0;
-  return (sign ? -V : V) >> DSHIFT;
+  return (sign ? -V : V) >> DSCALE;
 }
 
 static void CorrectDCValues(const VP8EncIterator* const it,
@@ -863,21 +864,24 @@ static void CorrectDCValues(const VP8EncIterator* const it,
   // as top[]/left[] on the next block.
   int ch;
   for (ch = 0; ch <= 1; ++ch) {
-    const int16_t* const top = it->top_derr_[it->x_][ch];
-    const int16_t* const left = it->left_derr_[ch];
+    const int8_t* const top = it->top_derr_[it->x_][ch];
+    const int8_t* const left = it->left_derr_[ch];
     int16_t (* const c)[16] = &tmp[ch * 4];
     int err0, err1, err2, err3;
-    c[0][0] += C1 * top[0] + C2 * left[0];
+    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
     err0 = QuantizeSingle(&c[0][0], mtx);
-    c[1][0] += C1 * top[1] + C2 * err0;
+    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
     err1 = QuantizeSingle(&c[1][0], mtx);
-    c[2][0] += C1 * err0 + C2 * left[1];
+    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
     err2 = QuantizeSingle(&c[2][0], mtx);
-    c[3][0] += C1 * err1 + C2 * err2;
+    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
     err3 = QuantizeSingle(&c[3][0], mtx);
-    rd->derr[ch][0] = err1;
-    rd->derr[ch][1] = err2;
-    rd->derr[ch][2] = err3;
+    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
+    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
+    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
+    rd->derr[ch][0] = (int8_t)err1;
+    rd->derr[ch][1] = (int8_t)err2;
+    rd->derr[ch][2] = (int8_t)err3;
   }
 }
 
@@ -885,18 +889,19 @@ static void StoreDiffusionErrors(VP8EncIterator* const it,
                                  const VP8ModeScore* const rd) {
   int ch;
   for (ch = 0; ch <= 1; ++ch) {
-    int16_t* const top = it->top_derr_[it->x_][ch];
-    int16_t* const left = it->left_derr_[ch];
-    left[0] = rd->derr[ch][0];   // restore err1
-    left[1] = rd->derr[ch][2];   //     ... err3
-    top[0]  = rd->derr[ch][1];   //     ... err2
-    top[1]  = rd->derr[ch][2];   //     ... err3.
+    int8_t* const top = it->top_derr_[it->x_][ch];
+    int8_t* const left = it->left_derr_[ch];
+    left[0] = rd->derr[ch][0];            // restore err1
+    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
+    top[0]  = rd->derr[ch][1];            //     ... err2
+    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
   }
 }
 
 #undef C1
 #undef C2
 #undef DSHIFT
+#undef DSCALE
 
 //------------------------------------------------------------------------------
 
diff --git a/src/enc/vp8i_enc.h b/src/enc/vp8i_enc.h
index d2fce941..11ff3f1d 100644
--- a/src/enc/vp8i_enc.h
+++ b/src/enc/vp8i_enc.h
@@ -121,7 +121,7 @@ static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
 // #define DISABLE_TOKEN_BUFFER
 
 // quality below which error-diffusion is enabled
-#define ERROR_DIFFUSION_QUALITY 30
+#define ERROR_DIFFUSION_QUALITY 98
 
 //------------------------------------------------------------------------------
 // Headers
@@ -204,7 +204,7 @@ typedef struct {
   score_t i4_penalty_;   // penalty for using Intra4
 } VP8SegmentInfo;
 
-typedef int16_t DError[2 /* u/v */][2 /* top or left */];
+typedef int8_t DError[2 /* u/v */][2 /* top or left */];
 
 // Handy transient struct to accumulate score and info during RD-optimization
 // and mode evaluation.
@@ -218,7 +218,7 @@ typedef struct {
   uint8_t modes_i4[16];       // mode numbers for intra4 predictions
   int mode_uv;                // mode number of chroma prediction
   uint32_t nz;                // non-zero blocks
-  int16_t derr[2][3];         // DC diffusion errors for U/V for blocks #1/2/3
+  int8_t derr[2][3];          // DC diffusion errors for U/V for blocks #1/2/3
 } VP8ModeScore;
 
 // Iterator structure to iterate through macroblocks, pointing to the