diff --git a/examples/dwebp.c b/examples/dwebp.c
index eb40b747..36a95b6f 100644
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@@ -555,6 +555,8 @@ static void Help(void) {
          "  -version  .... print version number and exit.\n"
          "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
          "  -nofilter .... disable in-loop filtering.\n"
+         "  -nodither .... disable dithering.\n"
+         "  -dither <d> .. dithering strength (in 0..100)\n"
          "  -mt .......... use multi-threading\n"
          "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
          "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
@@ -625,6 +627,10 @@ int main(int argc, const char *argv[]) {
       format = YUV;
     } else if (!strcmp(argv[c], "-mt")) {
       config.options.use_threads = 1;
+    } else if (!strcmp(argv[c], "-nodither")) {
+      config.options.dithering_strength = 0;
+    } else if (!strcmp(argv[c], "-dither") && c < argc - 1) {
+      config.options.dithering_strength = strtol(argv[++c], NULL, 0);
     } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
       config.options.use_cropping = 1;
       config.options.crop_left   = strtol(argv[++c], NULL, 0);
@@ -719,7 +725,7 @@ int main(int argc, const char *argv[]) {
     if (!incremental) {
       status = WebPDecode(data, data_size, &config);
     } else {
-      WebPIDecoder* const idec = WebPINewDecoder(output_buffer);
+      WebPIDecoder* const idec = WebPIDecode(data, data_size, &config);
       if (idec == NULL) {
         fprintf(stderr, "Failed during WebPINewDecoder().\n");
         status = VP8_STATUS_OUT_OF_MEMORY;
diff --git a/examples/vwebp.c b/examples/vwebp.c
index b0f0771d..da08135e 100644
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@@ -376,6 +376,7 @@ static void Help(void) {
          "  -noicc ....... don't use the icc profile if present.\n"
          "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
          "  -nofilter .... disable in-loop filtering.\n"
+         "  -dither <int>  dithering strength (0..100). Default=50.\n"
          "  -mt .......... use multi-threading.\n"
          "  -info ........ print info.\n"
          "  -h     ....... this help message.\n"
@@ -397,6 +398,7 @@ int main(int argc, char *argv[]) {
     fprintf(stderr, "Library version mismatch!\n");
     return -1;
   }
+  config->options.dithering_strength = 50;
   kParams.use_color_profile = 1;
 
   for (c = 1; c < argc; ++c) {
@@ -409,6 +411,8 @@ int main(int argc, char *argv[]) {
       config->options.no_fancy_upsampling = 1;
     } else if (!strcmp(argv[c], "-nofilter")) {
       config->options.bypass_filtering = 1;
+    } else if (!strcmp(argv[c], "-dither") && c + 1 < argc) {
+      config->options.dithering_strength = strtol(argv[++c], NULL, 0);
     } else if (!strcmp(argv[c], "-info")) {
       kParams.print_info = 1;
     } else if (!strcmp(argv[c], "-version")) {
diff --git a/man/dwebp.1 b/man/dwebp.1
index 4426aedc..9a616763 100644
--- a/man/dwebp.1
+++ b/man/dwebp.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "May 10, 2013"
+.TH DWEBP 1 "November 26, 2013"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@@ -55,7 +55,15 @@ edges (especially the red ones), but should be faster.
 .B \-nofilter
 Don't use the in-loop filtering process even if it is required by
 the bitstream. This may produce visible blocks on the non-compliant output,
-but will make the decoding faster.
+but it will make the decoding faster.
+.TP
+.BI \-dither " strength
+Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
+post-processing effect applied to chroma components in lossy compression.
+It helps by smoothing gradients and avoiding banding artifacts.
+.TP
+.B \-nodither
+Disable all dithering (default).
 .TP
 .B \-mt
 Use multi-threading for decoding, if possible.
diff --git a/src/dec/frame.c b/src/dec/frame.c
index 88eb3c87..30248049 100644
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -148,6 +148,82 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
   }
 }
 
+//------------------------------------------------------------------------------
+// Dithering
+
+#define DITHER_AMP_TAB_SIZE 12   // number of entries in kQuantToDitherAmp[]
+static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
+  // Per-uv_quant_ amplitude, in 1/8th units. Roughly, it's dqm->uv_mat_[1].
+  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1   // 8 means "no attenuation"
+};
+
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec) {
+  assert(dec != NULL);
+  if (options != NULL) {   // without options, nothing is set up here
+    const int d = options->dithering_strength;   // nominally in [0..100]
+    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
+    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
+    if (f > 0) {   // 'f' is 'd' clamped and rescaled to [0..max_amp]
+      int s;
+      int all_amp = 0;   // OR of all the per-segment amplitudes
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        VP8QuantMatrix* const dqm = &dec->dqm_[s];
+        if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
+          // TODO(skal): should we specially dither more for uv_quant_ < 0?
+          const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
+          dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
+        }   // NOTE(review): else dither_ keeps its prior value (0? confirm)
+        all_amp |= dqm->dither_;
+      }
+      if (all_amp != 0) {   // at least one segment needs dithering
+        VP8InitRandom(&dec->dithering_rg_, 1.0f);
+        dec->dither_ = 1;   // makes FinishRow() call DitherRow()
+      }
+    }
+  }
+}
+
+// Minimal amp that will provide a non-zero dithering effect (see DitherRow).
+#define MIN_DITHER_AMP 4
+#define DITHER_DESCALE 4   // right-shift turning random bits into a delta
+#define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))   // rounding bias
+#define DITHER_AMP_BITS 8   // Dither8x8() draws DITHER_AMP_BITS + 1 bits
+#define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)   // subtracted to 0-center
+
+static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
+  int i, j;   // column and row inside the 8x8 block starting at 'dst'
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i) {
+      // TODO: could be made faster with SSE2
+      const int bits =   // random draw scaled by 'amp', then 0-centered
+          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
+      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
+      const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
+      const int v = (int)dst[i] + delta;
+      dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;   // saturate
+    }
+    dst += bps;   // move to the next row ('bps' bytes further)
+  }
+}
+
+static void DitherRow(VP8Decoder* const dec) {   // dithers cached U/V samples
+  int mb_x;
+  assert(dec->dither_);   // FinishRow() only calls this when the flag is set
+  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
+    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+    const VP8MBData* const data = ctx->mb_data_ + mb_x;   // current macroblock
+    const int cache_id = ctx->id_;
+    const int uv_bps = dec->cache_uv_stride_;
+    if (data->dither_ >= MIN_DITHER_AMP) {   // weaker amplitudes are skipped
+      uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
+      uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
+      Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);   // U
+      Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);   // V
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // This function is called after a row of macroblocks is finished decoding.
 // It also takes into account the following restrictions:
@@ -186,6 +262,10 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
     FilterRow(dec);
   }
 
+  if (dec->dither_) {
+    DitherRow(dec);
+  }
+
   if (io->put != NULL) {
     int y_start = MACROBLOCK_VPOS(mb_y);
     int y_end = MACROBLOCK_VPOS(mb_y + 1);
diff --git a/src/dec/idec.c b/src/dec/idec.c
index 78562e3e..a9b8acc8 100644
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@@ -423,6 +423,7 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
   // This change must be done before calling VP8InitFrame()
   dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
                                        io->width, io->height);
+  VP8InitDithering(params->options, dec);
   if (!CopyParts0Data(idec)) {
     return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
   }
diff --git a/src/dec/quant.c b/src/dec/quant.c
index a4cc693d..fea6c530 100644
--- a/src/dec/quant.c
+++ b/src/dec/quant.c
@@ -104,6 +104,8 @@ void VP8ParseQuant(VP8Decoder* const dec) {
 
       m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
       m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
+
+      m->uv_quant_ = q + dquv_ac;   // for dithering strength evaluation
     }
   }
 }
diff --git a/src/dec/vp8.c b/src/dec/vp8.c
index 3c17ae51..d2f4cfb4 100644
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@@ -561,6 +561,12 @@ static int ParseResiduals(VP8Decoder* const dec,
 
   block->non_zero_y_ = non_zero_y;
   block->non_zero_uv_ = non_zero_uv;
+
+  // We look at the mode-code of each U/V block: if any of them has three or
+  // more non-zero coeffs (code >= 2, bits 0xaaaa), dithering is switched off
+  // for this macroblock. Otherwise q->dither_ is used as its amplitude.
+  block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
+
   return !(non_zero_y | non_zero_uv);  // will be used for further optimization
 }
 
diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h
index abd765c9..bb438626 100644
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -17,6 +17,7 @@
 #include <string.h>     // for memcpy()
 #include "./vp8li.h"
 #include "../utils/bit_reader.h"
+#include "../utils/random.h"
 #include "../utils/thread.h"
 #include "../dsp/dsp.h"
 
@@ -173,6 +174,9 @@ typedef struct {  // Top/Left Contexts used for syntax-parsing
 typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
   quant_t y1_mat_, y2_mat_, uv_mat_;
+
+  int uv_quant_;   // U/V quantizer value
+  int dither_;     // dithering amplitude (0 = off, max=255)
 } VP8QuantMatrix;
 
 // Data needed to reconstruct a macroblock
@@ -190,6 +194,7 @@ typedef struct {
   // This allows to call specialized transform functions.
   uint32_t non_zero_y_;
   uint32_t non_zero_uv_;
+  uint8_t dither_;      // local dithering strength (deduced from non_zero_*)
 } VP8MBData;
 
 // Persistent information needed by the parallel processing
@@ -244,6 +249,10 @@ struct VP8Decoder {
   // per-partition boolean decoders.
   VP8BitReader parts_[MAX_NUM_PARTITIONS];
 
+  // Dithering strength, deduced from decoding options
+  int dither_;                // whether to use dithering or not
+  VP8Random dithering_rg_;    // random generator for dithering
+
   // dequantization (one set of DC/AC dequant factor per segment)
   VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
 
@@ -324,7 +333,10 @@ int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
 int VP8GetThreadMethod(const WebPDecoderOptions* const options,
                        const WebPHeaderStructure* const headers,
                        int width, int height);
-// Process the last decoded row (filtering + output)
+// Initialize dithering post-process if needed.
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec);
+// Process the last decoded row (filtering + output).
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
diff --git a/src/dec/webp.c b/src/dec/webp.c
index 21d16070..08f8bb77 100644
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@@ -474,6 +474,7 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
         // This change must be done before calling VP8Decode()
         dec->mt_method_ = VP8GetThreadMethod(params->options, &headers,
                                              io.width, io.height);
+        VP8InitDithering(params->options, dec);
         if (!VP8Decode(dec, &io)) {
           status = dec->status_;
         }
diff --git a/src/utils/random.h b/src/utils/random.h
index 9a755eb9..32632d3d 100644
--- a/src/utils/random.h
+++ b/src/utils/random.h
@@ -34,8 +34,10 @@ typedef struct {
 void VP8InitRandom(VP8Random* const rg, float dithering);
 
 // Returns a centered pseudo-random number with 'num_bits' amplitude.
-// (uses D.Knuth's Difference-based random generator)
-static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
+// (uses D.Knuth's Difference-based random generator).
+// 'amp' is in VP8_RANDOM_DITHER_FIX fixed-point precision.
+static WEBP_INLINE int VP8RandomBits2(VP8Random* const rg, int num_bits,
+                                      int amp) {
   int diff;
   assert(num_bits + VP8_RANDOM_DITHER_FIX <= 31);
   diff = rg->tab_[rg->index1_] - rg->tab_[rg->index2_];
@@ -43,12 +45,16 @@ static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
   rg->tab_[rg->index1_] = diff;
   if (++rg->index1_ == VP8_RANDOM_TABLE_SIZE) rg->index1_ = 0;
   if (++rg->index2_ == VP8_RANDOM_TABLE_SIZE) rg->index2_ = 0;
-  diff = (diff << 1) >> (32 - num_bits);    // sign-extend, 0-center
-  diff = (diff * rg->amp_) >> VP8_RANDOM_DITHER_FIX;   // restrict range
-  diff += 1 << (num_bits - 1);              // shift back to 0.5-center
+  diff = (diff << 1) >> (32 - num_bits);         // sign-extend, 0-center
+  diff = (diff * amp) >> VP8_RANDOM_DITHER_FIX;  // restrict range
+  diff += 1 << (num_bits - 1);                   // shift back to 0.5-center
   return diff;
 }
 
+static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
+  return VP8RandomBits2(rg, num_bits, rg->amp_);   // amplitude read from 'rg'
+}
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
diff --git a/src/webp/decode.h b/src/webp/decode.h
index 404dae18..ad5125ac 100644
--- a/src/webp/decode.h
+++ b/src/webp/decode.h
@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-#define WEBP_DECODER_ABI_VERSION 0x0202    // MAJOR(8b) + MINOR(8b)
+#define WEBP_DECODER_ABI_VERSION 0x0203    // MAJOR(8b) + MINOR(8b)
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@@ -441,11 +441,12 @@ struct WebPDecoderOptions {
   int use_scaling;                    // if true, scaling is applied _afterward_
   int scaled_width, scaled_height;    // final resolution
   int use_threads;                    // if true, use multi-threaded decoding
+  int dithering_strength;             // dithering strength (0=Off, 100=full)
 
   // Unused for now:
   int force_rotation;                 // forced rotation (to be applied _last_)
   int no_enhancement;                 // if true, discard enhancement layer
-  uint32_t pad[6];                    // padding for later use
+  uint32_t pad[5];                    // padding for later use
 };
 
 // Main object storing the configuration for advanced decoding.