Adding new test for packet losses

New test for executing a batch of packet loss tests. May need some more touching up. Change-Id: Ifd90ed5d1bc78ffe4c9cba2dfd1959d8da300cb2
Error concealment in decoder
2011-02-09 15:06:24 +01:00 · 2011-01-20 11:22:50 +01:00 · 2011-01-19 05:51:00 -08:00 · 2011-01-19 09:53:21 +01:00 · 2011-01-18 12:54:57 -08:00 · 2011-01-18 11:21:32 -05:00
84 changed files with 5586 additions and 3181 deletions
--- a/args.c
+++ b/args.c
@@ -135,6 +135,17 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs)
                     def->long_name, long_val);

        fprintf(fp, "  %-37s\t%s\n", option_text, def->desc);
+
+        if(def->enums)
+        {
+            const struct arg_enum_list *listptr;
+
+            fprintf(fp, "  %-37s\t  ", "");
+
+            for(listptr = def->enums; listptr->name; listptr++)
+                fprintf(fp, "%s%s", listptr->name,
+                        listptr[1].name ? ", " : "\n");
+        }
    }
 }

@@ -218,3 +229,37 @@ struct vpx_rational arg_parse_rational(const struct arg *arg)

    return rat;
 }
+
+
+/* Translate an option's value into one of the integers listed in the
+ * option's enum table (arg->def->enums, terminated by a NULL name).
+ *
+ * A value that is entirely a base-10 number is accepted when it equals the
+ * val of some table entry; otherwise the value is compared against the
+ * entry names. Calls die() when neither lookup succeeds.
+ */
+int arg_parse_enum(const struct arg *arg)
+{
+    const struct arg_enum_list *entry;
+    char                       *tail;
+    long int                    number;
+
+    /* Numeric form first: the whole string must have been consumed */
+    number = strtol(arg->val, &tail, 10);
+    if (arg->val[0] != '\0' && tail[0] == '\0')
+    {
+        entry = arg->def->enums;
+        while (entry->name)
+        {
+            if (entry->val == number)
+                return number;
+            entry++;
+        }
+    }
+
+    /* Fall back to matching the symbolic names */
+    entry = arg->def->enums;
+    while (entry->name)
+    {
+        if (strcmp(arg->val, entry->name) == 0)
+            return entry->val;
+        entry++;
+    }
+
+    die("Option %s: Invalid value '%s'\n", arg->name, arg->val);
+    return 0;
+}
+
+
+/* Parse the option's value through its enum table when it has one, and as
+ * a plain integer otherwise.
+ */
+int arg_parse_enum_or_int(const struct arg *arg)
+{
+    return arg->def->enums ? arg_parse_enum(arg) : arg_parse_int(arg);
+}
--- a/args.h
+++ b/args.h
@@ -22,14 +22,23 @@ struct arg
    const struct arg_def  *def;
 };

+/* One named value of an enumerated option. Tables of these are walked
+ * until an entry with a NULL name is reached (see ARG_ENUM_LIST_END).
+ */
+struct arg_enum_list
+{
+    const char *name;   /* text matched against the option's value */
+    int         val;    /* integer returned when this entry matches */
+};
+#define ARG_ENUM_LIST_END {0}
+
 typedef struct arg_def
 {
    const char *short_name;
    const char *long_name;
    int         has_val;
    const char *desc;
+    const struct arg_enum_list *enums;
 } arg_def_t;
-#define ARG_DEF(s,l,v,d) {s,l,v,d}
+#define ARG_DEF(s,l,v,d) {s,l,v,d, NULL}
+#define ARG_DEF_ENUM(s,l,v,d,e) {s,l,v,d,e}
 #define ARG_DEF_LIST_END {0}

 struct arg arg_init(char **argv);
@@ -41,4 +50,5 @@ char **argv_dup(int argc, const char **argv);
 unsigned int arg_parse_uint(const struct arg *arg);
 int arg_parse_int(const struct arg *arg);
 struct vpx_rational arg_parse_rational(const struct arg *arg);
+int arg_parse_enum_or_int(const struct arg *arg);
 #endif
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -547,6 +547,10 @@ process_common_toolchain() {
                tgt_isa=universal
                tgt_os=darwin9
                ;;
+            *darwin10*)
+                tgt_isa=x86_64
+                tgt_os=darwin10
+                ;;
            *mingw32*|*cygwin*)
                [ -z "$tgt_isa" ] && tgt_isa=x86
                tgt_os=win32
@@ -606,6 +610,12 @@ process_common_toolchain() {
            add_ldflags "-isysroot /Developer/SDKs/MacOSX10.5.sdk"
            add_ldflags "-mmacosx-version-min=10.5"
            ;;
+        *-darwin10-*)
+            add_cflags  "-isysroot /Developer/SDKs/MacOSX10.6.sdk"
+            add_cflags  "-mmacosx-version-min=10.6"
+            add_ldflags "-isysroot /Developer/SDKs/MacOSX10.6.sdk"
+            add_ldflags "-mmacosx-version-min=10.6"
+            ;;
    esac

    # Handle Solaris variants. Solaris 10 needs -lposix4
@@ -824,6 +834,7 @@ process_common_toolchain() {
        soft_enable sse2
        soft_enable sse3
        soft_enable ssse3
+        soft_enable sse4_1

        case  ${tgt_os} in
            win*)
@@ -879,7 +890,7 @@ process_common_toolchain() {
        case  ${tgt_os} in
            win*)
                add_asflags -f win${bits}
-                enabled debug && add_asflags -g dwarf2
+                enabled debug && add_asflags -g cv8
            ;;
            linux*|solaris*)
                add_asflags -f elf${bits}
--- a/configure
+++ b/configure
@@ -41,6 +41,7 @@ Advanced options:
  ${toggle_shared}                shared library support
  ${toggle_small}                 favor smaller size over speed
  ${toggle_arm_asm_detok}         assembly version of the detokenizer (ARM platforms only)
+  ${toggle_postproc_visualizer}   macro block / block level visualizers

 Codecs:
  Codecs can be selectively enabled or disabled individually, or by family:
@@ -114,6 +115,7 @@ all_platforms="${all_platforms} x86-win32-vs7"
 all_platforms="${all_platforms} x86-win32-vs8"
 all_platforms="${all_platforms} x86-win32-vs9"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
+all_platforms="${all_platforms} x86_64-darwin10-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -199,6 +201,7 @@ ARCH_EXT_LIST="
    sse2
    sse3
    ssse3
+    sse4_1

    altivec
 "
@@ -249,6 +252,7 @@ CONFIG_LIST="
    shared
    small
    arm_asm_detok
+    postproc_visualizer
 "
 CMDLINE_SELECT="
    extra_warnings
@@ -288,6 +292,7 @@ CMDLINE_SELECT="
    shared
    small
    arm_asm_detok
+    postproc_visualizer
 "

 process_cmdline() {
@@ -324,8 +329,6 @@ post_process_cmdline() {
    for c in ${CODECS}; do
        enabled ${c} && enable ${c##*_}s
    done
-
-
 }


@@ -535,6 +538,10 @@ process_toolchain() {

    # Other toolchain specific defaults
    case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
+
+    if enabled postproc_visualizer; then
+        enabled postproc || die "postproc_visualizer requires postproc to be enabled"
+    fi
 }


--- a/examples.mk
+++ b/examples.mk
@@ -17,6 +17,7 @@ vpxdec.SRCS                 += md5_utils.c md5_utils.h
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h vpx_ports/config.h
+vpxdec.SRCS                 += tools_common.c tools_common.h
 vpxdec.SRCS                 += nestegg/halloc/halloc.h
 vpxdec.SRCS                 += nestegg/halloc/src/align.h
 vpxdec.SRCS                 += nestegg/halloc/src/halloc.c
@@ -28,6 +29,7 @@ vpxdec.GUID                  = BA5FE66F-38DD-E034-F542-B1578C5FB950
 vpxdec.DESCRIPTION           = Full featured decoder
 UTILS-$(CONFIG_ENCODERS)    += vpxenc.c
 vpxenc.SRCS                 += args.c args.h y4minput.c y4minput.h
+vpxenc.SRCS                 += tools_common.c tools_common.h
 vpxenc.SRCS                 += vpx_ports/config.h vpx_ports/mem_ops.h
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
 vpxenc.SRCS                 += libmkv/EbmlIDs.h
@@ -75,6 +77,11 @@ GEN_EXAMPLES-$(CONFIG_ENCODERS) += decode_with_drops.c
 endif
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
+# Packet-loss evaluation example; the GUID/DESCRIPTION variables must carry
+# the same base name as the generated source file.
+ifeq ($(CONFIG_DECODERS),yes)
+GEN_EXAMPLES-$(CONFIG_ENCODERS) += decode_packetdrop_eval.c
+endif
+decode_packetdrop_eval.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D28
+decode_packetdrop_eval.DESCRIPTION    = Drops parts of frames while decoding and evaluate quality
 GEN_EXAMPLES-$(CONFIG_ENCODERS) += error_resilient.c
 error_resilient.GUID             = DF5837B9-4145-4F92-A031-44E4F832E00C
 error_resilient.DESCRIPTION      = Error Resiliency Feature
--- a/examples/decode_packetdrop_eval.txt
+++ b/examples/decode_packetdrop_eval.txt
@@ -0,0 +1,415 @@
+@TEMPLATE decoder_tmpl.c
+Decode With Packet Drops And Quality Evaluation Example
+=======================================================
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
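+This is an example utility which drops whole frames, or individual packets
+within frames, as specified on the command line, and then measures the
+quality of the decoded output against a reference sequence using SSIM.
+This is useful for observing the error recovery features of the codec.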
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_INCLUDES
+#include <math.h>
+#include <time.h>
+#include "vpx_scale/yv12config.h"
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_INCLUDES
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS
+extern double vp8_calc_ssim
+(
+    YV12_BUFFER_CONFIG *source,
+    YV12_BUFFER_CONFIG *dest,
+    int lumamask,
+    double *weight
+);
+
+extern double vp8_ssim
+(
+    const unsigned char *img1,
+    const unsigned char *img2,
+    int stride_img1,
+    int stride_img2,
+    int width,
+    int height
+);
+
+/* NOTE(review): nothing in this example's helpers or template sections
+ * refers to this struct -- presumably carried over from another tool;
+ * confirm before relying on or removing it.
+ */
+struct detect_buffer {
+    char buf[4];
+    size_t buf_read;
+    size_t position;
+};
+
+/* Fields unpacked by throw_packets() from the first three bytes of a
+ * compressed frame.
+ */
+struct parsed_header
+{
+    char key_frame;         /* non-zero when bit 0 of the tag is clear */
+    int version;            /* bits 1-3 of the tag */
+    char show_frame;        /* bit 4 of the tag */
+    int first_part_size;    /* bits 5-23 of the tag: first partition size */
+};
+
+/* Compute the size of the next packet to cut from a compressed frame.
+ *
+ *   hdr    - parsed frame tag (key_frame and first_part_size are read)
+ *   pos    - number of bytes of the frame already packetized
+ *   length - total size of the frame in bytes
+ *   mtu    - maximum packet size in bytes
+ *
+ * The uncompressed header plus the first partition are packetized first,
+ * never mixed with data from the remainder of the frame; the rest is cut
+ * into mtu-sized slices. Returns a value <= 0 once nothing is left.
+ *
+ * The result never exceeds the bytes remaining in the frame, so a
+ * first_part_size that is larger than the frame (corrupt or truncated
+ * input) cannot make the caller read past the end of the buffer.
+ */
+int next_packet(struct parsed_header* hdr, int pos, int length, int mtu)
+{
+    int size;
+    int remaining = length - pos;
+    /* Uncompressed part is 3 bytes for P frames and 10 bytes for I frames */
+    int uncomp_part_size = (hdr->key_frame ? 10 : 3);
+    /* number of bytes yet to send from header and the first partition */
+    int remainFirst = uncomp_part_size + hdr->first_part_size - pos;
+
+    /* whole frame consumed */
+    if (remaining <= 0)
+        return remaining;
+
+    /* first partition while any of it is left, otherwise the remainder */
+    size = (remainFirst > 0) ? remainFirst : remaining;
+
+    if (size > mtu)
+        size = mtu;
+
+    /* never hand out more than the frame actually contains */
+    if (size > remaining)
+        size = remaining;
+
+    return size;
+}
+
+/* Simulate packet loss on one compressed frame, in place.
+ *
+ *   frame     - compressed frame data; surviving packets are compacted to
+ *               the front and the tail is zeroed
+ *   size      - in: frame size in bytes; out: number of bytes kept
+ *   loss_rate - loss probability in percent
+ *   thrown    - incremented for every packet discarded; once it is
+ *               non-zero every later packet of the frame is discarded too
+ *   kept      - incremented for every packet kept
+ *
+ * Frames shorter than 3 bytes are left untouched. Key frames are never
+ * damaged; for them *kept is set to an approximate packet count. One
+ * character per packet ('.' kept, 'X' thrown) is written to stdout.
+ *
+ * Survivors are moved within the frame buffer itself. The write offset
+ * never runs ahead of the read offset, so no scratch buffer is needed and
+ * frames of any size are handled.
+ */
+void throw_packets(unsigned char* frame, int* size, int loss_rate, int* thrown, int* kept)
+{
+    int pkg_size = 1;
+    int pos = 0;
+    int loss_pos = 0;
+    struct parsed_header hdr;
+    unsigned int tmp;
+    int mtu = 100;
+
+    if (*size < 3)
+    {
+        return;
+    }
+    putc('|', stdout);
+    /* parse uncompressed 3 bytes */
+    tmp = (frame[2] << 16) | (frame[1] << 8) | frame[0];
+    hdr.key_frame = !(tmp & 0x1); /* inverse logic */
+    hdr.version = (tmp >> 1) & 0x7;
+    hdr.show_frame = (tmp >> 4) & 0x1;
+    hdr.first_part_size = (tmp >> 5) & 0x7FFFF;
+
+    /* don't drop key frames */
+    if (hdr.key_frame)
+    {
+        int i;
+        *kept = *size/mtu + ((*size % mtu > 0) ? 1 : 0); /* approximate */
+        for (i=0; i < *kept; i++)
+            putc('.', stdout);
+        return;
+    }
+
+    while ((pkg_size = next_packet(&hdr, pos, *size, mtu)) > 0)
+    {
+        int loss_event;
+
+        /* never touch bytes beyond the end of the frame */
+        if (pkg_size > *size - pos)
+            pkg_size = *size - pos;
+        if (pkg_size <= 0)
+            break;
+
+        loss_event = ((rand() + 1.0)/(RAND_MAX + 1.0) < loss_rate/100.0);
+        if (*thrown == 0 && !loss_event)
+        {
+            /* regions may overlap, hence memmove */
+            if (loss_pos != pos)
+                memmove(frame + loss_pos, frame + pos, pkg_size);
+            loss_pos += pkg_size;
+            (*kept)++;
+            putc('.', stdout);
+        }
+        else
+        {
+            (*thrown)++;
+            putc('X', stdout);
+        }
+        pos += pkg_size;
+    }
+    memset(frame + loss_pos, 0, *size - loss_pos);
+    *size = loss_pos;
+}
+
+/* Fill in the plane geometry and plane pointers of *cfg so that it views
+ * the w x h image stored contiguously at pixels (Y plane, then U, then V,
+ * chroma planes at half width and half height). All other fields of *cfg
+ * are left as they are.
+ */
+static void describe_packed_yuv(YV12_BUFFER_CONFIG *cfg,
+                                unsigned char *pixels,
+                                int w,
+                                int h)
+{
+    cfg->y_width = w;
+    cfg->y_height = h;
+    cfg->y_stride = w;
+    cfg->uv_width = w/2;
+    cfg->uv_height = h/2;
+    cfg->uv_stride = w/2;
+    cfg->y_buffer = pixels;
+    cfg->u_buffer = pixels + w*h;
+    cfg->v_buffer = pixels + w*h + (w*h)/4;
+}
+
+/* SSIM of one degraded frame against its reference.
+ *
+ * Both buffers hold a w x h frame with the three planes stored back to
+ * back. The value returned, and the weight stored through *weight, come
+ * from vp8_calc_ssim().
+ */
+double ssim_yuv(unsigned char *ptr_ref,
+                unsigned char *ptr_deg,
+                int w,
+                int h,
+                double *weight)
+{
+    /* only the fields set by describe_packed_yuv() are needed */
+    YV12_BUFFER_CONFIG ref, deg;
+
+    describe_packed_yuv(&ref, ptr_ref, w, h);
+    describe_packed_yuv(&deg, ptr_deg, w, h);
+
+    return vp8_calc_ssim(&ref, &deg, 1, weight);
+}
+
+/* Write one decoded frame to outfile, read the matching frame from
+ * reffile and add the frame's SSIM to the running totals.
+ *
+ *   img_buf    - frame to write, (w*h*3)/2 bytes
+ *   w, h       - frame dimensions
+ *   outfile    - destination for the decoded frame
+ *   reffile    - source of the reference frame
+ *   sum_ssim   - accumulates ssim * weight
+ *   sum_weight - accumulates weight
+ *
+ * Any failure (NULL argument, short write, allocation failure, short
+ * read) ends the program through die().
+ */
+void write_and_eval_frame(unsigned char *img_buf,
+                         int w,
+                         int h,
+                         FILE *outfile,
+                         FILE *reffile,
+                         double *sum_ssim,
+                         double *sum_weight)
+{
+    int frame_bytes = (w*h*3)/2;
+    unsigned char *reference;
+    double frame_weight = 0.0;
+    double frame_ssim;
+
+    if(img_buf == NULL || outfile == NULL || reffile == NULL)
+        die("Failure in write_and_eval_frame");
+
+    if(fwrite(img_buf, 1, frame_bytes, outfile) != frame_bytes)
+        die("Could not write to file");
+
+    /* Fetch the corresponding frame of the reference sequence */
+    reference = (unsigned char *) malloc(frame_bytes);
+    if(reference == NULL)
+        die("Error allocating memory");
+
+    if(fread(reference, 1, frame_bytes, reffile) != frame_bytes)
+        die("Failed to read complete reference frame");
+
+    /* Score the frame and fold it into the weighted sums */
+    frame_ssim = ssim_yuv(reference, img_buf, w, h, &frame_weight);
+    *sum_ssim += frame_ssim * frame_weight;
+    *sum_weight += frame_weight;
+
+    free(reference);
+}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS
+
+Usage
+-----
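+This example adds two arguments to the `simple_decoder` example: a
+reference file of raw frames that the decoded output is compared against,
+and a parameter which specifies the range or pattern of frames to drop, or
+the packet loss rate. The parameter is parsed as follows: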
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ USAGE
+/* Expect <infile> <outfile> <reffile> <pattern>. A sixth argument is
+ * accepted and is used as the seed when <pattern> is a bare number. */
+if(argc!=5 && argc != 6)
+    die("Usage: %s <infile> <outfile> <reffile> <N-M|N/M|L,S>\n",
+        argv[0]);
+{
+    char *nptr;
+
+    /* n is the leading number; the character that follows selects the mode:
+     *   '/'              -> 0: drop n of every m frames
+     *   '-'              -> 1: drop frames n through m
+     *   ',' or end of string -> 2: packet loss rate n (percent), seed m */
+    n = strtol(argv[4], &nptr, 0);
+    mode = (*nptr == '\0' || *nptr == ',') ? 2 : (*nptr == '-') ? 1 : 0;
+
+    /* Only look beyond the separator when there is one; nptr+1 is outside
+     * the string when the pattern ends right after the first number. */
+    if(*nptr != '\0')
+        m = strtol(nptr+1, NULL, 0);
+    else if(argc == 6)
+        m = strtol(argv[5], NULL, 0);
+    else
+        m = 0;
+
+    if((!n && !m) || (*nptr != '-' && *nptr != '/' &&
+        *nptr != '\0' && *nptr != ','))
+        die("Couldn't parse pattern %s\n", argv[4]);
+
+    /* m is used as a modulus in N/M mode, so it has to be positive */
+    if(mode == 0 && m <= 0)
+        die("Couldn't parse pattern %s\n", argv[4]);
+}
+/* Use the given seed when there is one, the current time otherwise */
+seed = (m > 0) ? m : (unsigned int)time(NULL);
+srand(seed);
+thrown_frame = 0;
+printf("Seed: %u\n", seed);
+
+if(!(reffile = fopen(argv[3], "rb")))
+    die("Failed to open %s for reading", argv[3]);
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ USAGE
+
+
+Dropping A Range Of Frames
+--------------------------
+To drop a range of frames, specify the starting frame and the ending
+frame to drop, separated by a dash. The following command will drop
+frames 5 through 10 (base 1).
+
+  $ ./decode_packetdrop_eval in.ivf out.i420 ref.i420 5-10
+
+
+Dropping A Pattern Of Frames
+----------------------------
+To drop a pattern of frames, specify the number of frames to drop and
+the number of frames after which to repeat the pattern, separated by
+a forward-slash. The following command will drop 3 of 7 frames.
+Specifically, it will decode 4 frames, then drop 3 frames, and then
+repeat.
+
+  $ ./decode_packetdrop_eval in.ivf out.i420 ref.i420 3/7
+
+
+Extra Variables
+---------------
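+This example maintains the pattern passed on the command line in the
+`n`, `m`, and `mode` variables. In packet loss mode (`L,S`), `n` holds the
+loss rate in percent and `m` the seed for the random number generator: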
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_VARS
+/* Numbers parsed from the pattern argument, and the drop mode it selects:
+ * 0 = N/M pattern, 1 = N-M range, 2 = packet loss */
+int              n, m, mode;
+unsigned int     seed;                  /* value passed to srand() */
+int              thrown=0, kept=0;      /* running totals over all frames */
+int              thrown_frame=0, kept_frame=0;  /* counts for the current frame */
+unsigned char   *last_yuv_buf = NULL;   /* packed copy of the latest decoded frame */
+int              last_sz = 0;           /* bytes copied into last_yuv_buf */
+int              last_alloc_sz = 0;     /* bytes allocated for last_yuv_buf */
+unsigned char   *temp_last = NULL;      /* write cursor into last_yuv_buf */
+int              expected_decode = 1;   /* frame number expected from the decoder next */
+FILE            *reffile, *sttfile;     /* reference input, loss.stt statistics output */
+/* NOTE(review): the next three are not used by the sections of this example */
+unsigned char   *ref_yuv_buf = NULL;
+int              ref_sz = 0;
+int              ref_alloc_sz = 0;
+double           sum_ssim = 0.0, sum_weight = 0.0;  /* weighted SSIM accumulators */
+double           total_ssim;            /* sum_ssim / sum_weight */
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_VARS
+
+
+Making The Drop Decision
+------------------------
+The example decides whether to drop the frame based on the current
+frame number, immediately before decoding the frame. In packet loss mode
+it instead discards randomly chosen packets of the frame.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
+/* Apply the selected drop mode to the frame about to be decoded and
+   update the per-frame and total thrown/kept counters */
+thrown_frame = 0;
+kept_frame = 0;
+if (mode == 2)
+{
+    /* packet loss: throw_packets prints and counts every packet itself */
+    throw_packets(frame, &frame_sz, n, &thrown_frame, &kept_frame);
+}
+else if (mode < 2)
+{
+    /* whole-frame modes: an emptied frame counts as thrown */
+    if (mode == 0 && m - (frame_cnt-1)%m <= n)
+        frame_sz = 0;
+    else if (mode == 1 && frame_cnt >= n && frame_cnt <= m)
+        frame_sz = 0;
+
+    if (frame_sz != 0)
+    {
+        putc('.', stdout);
+        kept_frame++;
+    }
+    else
+    {
+        putc('X', stdout);
+        thrown_frame++;
+    }
+}
+thrown += thrown_frame;
+kept += kept_frame;
+fflush(stdout);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
+
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DECODE
+/* Decode the (possibly emptied or shortened) frame. The user-data argument
+ * is NULL; passing frame_cnt there is commented out. */
+if(vpx_codec_decode(&codec, frame, frame_sz, NULL /*(void *) frame_cnt*/, 0))
+    die_codec(&codec, "Failed to decode frame");
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DECODE
+
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GET_FRAME
+while((img = vpx_codec_get_frame(&codec, &iter))) {
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GET_FRAME
+
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS_DX
+/* Frames the decoder produced no output for are filled in by repeating
+ * the last decoded frame, so that the output stays aligned with the
+ * reference file. */
+while(/*((int) img->user_priv)*/ frame_cnt > expected_decode) {
+    write_and_eval_frame(last_yuv_buf,
+                         img->d_w,
+                         img->d_h,
+                         outfile,
+                         reffile,
+                         &sum_ssim,
+                         &sum_weight);
+
+    expected_decode++;
+}
+
+expected_decode++;
+
+/* Grow last_yuv_buf when the frame does not fit */
+if(last_alloc_sz < (img->d_w * img->d_h * 3) / 2) {
+    /* Re-allocate; free(NULL) is a no-op */
+    free(last_yuv_buf);
+    last_yuv_buf = NULL;
+
+    last_alloc_sz = (img->d_w * img->d_h * 3) / 2;
+    last_yuv_buf = (unsigned char *) malloc(last_alloc_sz);
+    /* the copy below must not run on a failed allocation */
+    if(!last_yuv_buf)
+        die("Error allocating memory");
+    last_sz = 0;
+}
+
+/* First, write new frame to last_yuv_buf: each plane is copied row by
+ * row so that the stride padding is dropped */
+temp_last = last_yuv_buf;
+last_sz = 0;
+for(plane=0; plane < 3; plane++) {
+    unsigned char *buf =img->planes[plane];
+
+    for(y=0; y<img->d_h >> (plane?1:0); y++) {
+        memcpy(temp_last, buf, img->d_w >> (plane?1:0));
+        temp_last += img->d_w >> (plane?1:0);
+        last_sz += img->d_w >> (plane?1:0);
+        buf += img->stride[plane];
+    }
+}
+
+/* Then, write it to file and calculate SSIM*/
+write_and_eval_frame(last_yuv_buf,
+                     img->d_w,
+                     img->d_h,
+                     outfile,
+                     reffile,
+                     &sum_ssim,
+                     &sum_weight);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS_DX
+
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY
+
+/* Report the weighted mean SSIM on stdout and append it to loss.stt */
+total_ssim = sum_ssim / sum_weight;
+printf("SSIM = %f\n", total_ssim);
+
+sttfile = fopen("loss.stt", "at");
+if(sttfile == NULL)
+    die("Failed to open loss.stt for writing");
+fprintf(sttfile, "lossparam \tSSIM\n");
+fprintf(sttfile, "%s\t%f\n", argv[4], total_ssim);
+fclose(sttfile);
+
+/* free(NULL) is a no-op, so no guard is needed */
+free(last_yuv_buf);
+
+if(vpx_codec_destroy(&codec) != 0)
+    die_codec(&codec, "Failed to destroy codec");
+
+fclose(reffile);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY
--- a/examples/decoder_tmpl.c
+++ b/examples/decoder_tmpl.c
@@ -42,6 +42,8 @@ static void die(const char *fmt, ...) {

@DIE_CODEC

+@HELPERS
+
 int main(int argc, char **argv) {
    FILE            *infile, *outfile;
    vpx_codec_ctx_t  codec;
--- a/examples/vp8_set_maps.txt
+++ b/examples/vp8_set_maps.txt
@@ -78,8 +78,8 @@ if(frame_cnt + 1 == 22) {
 } else if(frame_cnt + 1 == 44) {
    vpx_active_map_t  active;

-    active.rows = 240/16;
-    active.cols = 320/16;
+    active.rows = cfg.g_h/16;
+    active.cols = cfg.g_w/16;

    /* pass in null map to disable active_map*/
    active.active_map = NULL;
--- a/tools_common.c
+++ b/tools_common.c
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdio.h>
+#include "tools_common.h"
+#ifdef _WIN32
+#include <io.h>
+#include <fcntl.h>
+#endif
+
+/* Put a stdio stream into binary mode and hand the same stream back.
+ * On _WIN32 builds this calls _setmode(); elsewhere nothing is done.
+ */
+FILE* set_binary_mode(FILE *stream)
+{
+#ifdef _WIN32
+    _setmode(_fileno(stream), _O_BINARY);
+#else
+    (void)stream;
+#endif
+    return stream;
+}
--- a/tools_common.h
+++ b/tools_common.h
@@ -0,0 +1,16 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TOOLS_COMMON_H
+#define TOOLS_COMMON_H
+
+/* FILE is used in the prototype below; include its definition so that this
+ * header can be included on its own. */
+#include <stdio.h>
+
+/* Sets a stdio stream into binary mode and returns the same stream */
+FILE* set_binary_mode(FILE *stream);
+
+#endif
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -44,9 +44,11 @@ void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)

    vpx_free(oci->above_context);
    vpx_free(oci->mip);
+    vpx_free(oci->prev_mip);

    oci->above_context = 0;
    oci->mip = 0;
+    oci->prev_mip = 0;

 }

@@ -111,6 +113,17 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)

    oci->mi = oci->mip + oci->mode_info_stride + 1;

+    /* allocate memory for last frame MODE_INFO array */
+    oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+    if (!oci->prev_mip)
+    {
+        vp8_de_alloc_frame_buffers(oci);
+        return ALLOC_FAILURE;
+    }
+
+    oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
+

    oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);

--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -282,6 +282,8 @@ typedef struct

    void *current_bc;

+    int corrupted;
+
 #if CONFIG_RUNTIME_CPU_DETECT
    struct VP8_COMMON_RTCD  *rtcd;
 #endif
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -36,6 +36,14 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
    7, 11, 14, 15,
 };

+/* Inverse zig-zag table. The 16 entries are 1-based (values 1..16).
+ * NOTE(review): presumably entry i is the scan position of raster index i
+ * in vp8_default_zig_zag1d, plus one -- confirm against the users of this
+ * table.
+ */
+DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
+{
+    1,  2,  6,  7,
+    3,  5,  8, 13,
+    4,  9, 12, 14,
+   10, 11, 15, 16
+};
+
 DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]);

 const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
@@ -106,23 +114,20 @@ static void init_bit_trees()
    init_bit_tree(cat6, 11);
 }

-
-static vp8bc_index_t bcc1[1], bcc2[2], bcc3[3], bcc4[4], bcc5[5], bcc6[11];
-
 vp8_extra_bit_struct vp8_extra_bits[12] =
 {
-    { 0, 0, 0, 0, 0},
-    { 0, 0, 0, 0, 1},
-    { 0, 0, 0, 0, 2},
-    { 0, 0, 0, 0, 3},
-    { 0, 0, 0, 0, 4},
-    { cat1, Pcat1, bcc1, 1, 5},
-    { cat2, Pcat2, bcc2, 2, 7},
-    { cat3, Pcat3, bcc3, 3, 11},
-    { cat4, Pcat4, bcc4, 4, 19},
-    { cat5, Pcat5, bcc5, 5, 35},
-    { cat6, Pcat6, bcc6, 11, 67},
-    { 0, 0, 0, 0, 0}
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 1},
+    { 0, 0, 0, 2},
+    { 0, 0, 0, 3},
+    { 0, 0, 0, 4},
+    { cat1, Pcat1, 1, 5},
+    { cat2, Pcat2, 2, 7},
+    { cat3, Pcat3, 3, 11},
+    { cat4, Pcat4, 4, 19},
+    { cat5, Pcat5, 5, 35},
+    { cat6, Pcat6, 11, 67},
+    { 0, 0, 0, 0}
 };
 #include "defaultcoefcounts.h"

--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@@ -24,10 +24,10 @@
 #define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
 #define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
 #define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
-#define DCT_VAL_CATEGORY3       7       /* 11-26     Extra Bits 4+1 */
-#define DCT_VAL_CATEGORY4       8       /* 11-26     Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY5       9       /* 27-58     Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY6       10      /* 59+       Extra Bits 11+1 */
+#define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 11+1 */
 #define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */

 #define vp8_coef_tokens 12
@@ -42,7 +42,6 @@ typedef struct
 {
    vp8_tree_p tree;
    const vp8_prob *prob;
-    vp8bc_index_t *prob_bc;
    int Len;
    int base_val;
 } vp8_extra_bit_struct;
@@ -95,6 +94,7 @@ struct VP8Common;
 void vp8_default_coef_probs(struct VP8Common *);

 extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
 extern short vp8_default_zig_zag_mask[16];
 extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];

--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -65,11 +65,13 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_c;

 #if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
-    rtcd->postproc.down        = vp8_mbpost_proc_down_c;
-    rtcd->postproc.across      = vp8_mbpost_proc_across_ip_c;
-    rtcd->postproc.downacross  = vp8_post_proc_down_and_across_c;
-    rtcd->postproc.addnoise    = vp8_plane_add_noise_c;
-    rtcd->postproc.blend_mb    = vp8_blend_mb_c;
+    rtcd->postproc.down             = vp8_mbpost_proc_down_c;
+    rtcd->postproc.across           = vp8_mbpost_proc_across_ip_c;
+    rtcd->postproc.downacross       = vp8_post_proc_down_and_across_c;
+    rtcd->postproc.addnoise         = vp8_plane_add_noise_c;
+    rtcd->postproc.blend_mb_inner   = vp8_blend_mb_inner_c;
+    rtcd->postproc.blend_mb_outer   = vp8_blend_mb_outer_c;
+    rtcd->postproc.blend_b          = vp8_blend_b_c;
 #endif

 #endif
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -18,6 +18,7 @@ extern "C"
 #endif

 #include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
 #include "vpx_scale/yv12config.h"
 #include "type_aliases.h"
 #include "ppflags.h"
@@ -45,7 +46,8 @@ extern "C"
    typedef enum
    {
        USAGE_STREAM_FROM_SERVER    = 0x0,
-        USAGE_LOCAL_FILE_PLAYBACK   = 0x1
+        USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
+        USAGE_CONSTRAINED_QUALITY   = 0x2
    } END_USAGE;


@@ -149,6 +151,7 @@ extern "C"
        int fixed_q;
        int worst_allowed_q;
        int best_allowed_q;
+        int cq_level;

        // allow internal resizing ( currently disabled in the build !!!!!)
        int allow_spatial_resampling;
@@ -186,9 +189,10 @@ extern "C"
        int arnr_strength ;
        int arnr_type     ;

-
        struct vpx_fixed_buf         two_pass_stats_in;
        struct vpx_codec_pkt_list  *output_pkt_list;
+
+        vp8e_tuning tuning;
    } VP8_CONFIG;


@@ -204,7 +208,7 @@ extern "C"
 // and not just a copy of the pointer..
    int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp);
    int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush);
-    int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags);
+    int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);

    int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);
    int vp8_update_reference(VP8_PTR comp, int ref_frame_flags);
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -105,7 +105,7 @@ typedef struct VP8Common
    YV12_BUFFER_CONFIG post_proc_buffer;
    YV12_BUFFER_CONFIG temp_scale_frame;

-    FRAME_TYPE last_frame_type;  /* Add to check if vp8_frame_init_loop_filter() can be skipped. */
+    FRAME_TYPE last_frame_type;  /* Save last frame's frame type for loopfilter init checking and motion search. */
    FRAME_TYPE frame_type;

    int show_frame;
@@ -140,6 +140,8 @@ typedef struct VP8Common

    MODE_INFO *mip; /* Base of allocated array */
    MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
+    MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+    MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */


    INTERPOLATIONFILTERTYPE mcomp_filter_type;
--- a/vp8/common/onyxd.h
+++ b/vp8/common/onyxd.h
@@ -51,7 +51,7 @@ extern "C"
    int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);

    int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp);
-    int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags);
+    int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags);

    int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
    int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -26,7 +26,7 @@
    ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)

 /* global constants */
-
+#if CONFIG_POSTPROC_VISUALIZER
 static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
 {
    { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
@@ -41,13 +41,32 @@ static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
    { RGB_TO_YUV(0xFF0000) }    /* Red */
 };

-static const unsigned char MV_REFERENCE_FRAME_colors[MB_MODE_COUNT][3] =
+/* Overlay colour per block prediction mode, written as RGB and converted
+ * by RGB_TO_YUV; one row per B_PREDICTION_MODE value.
+ */
+static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] =
+{
+    { RGB_TO_YUV(0x6633ff) },   /* Purple */
+    { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
+    { RGB_TO_YUV(0xff33cc) },   /* Pink */
+    { RGB_TO_YUV(0xff3366) },   /* Coral */
+    { RGB_TO_YUV(0x3366ff) },   /* Blue */
+    { RGB_TO_YUV(0xed00f5) },   /* Dark Blue -- NOTE(review): 0xed00f5 is a magenta, not a blue; value or label looks wrong, confirm */
+    { RGB_TO_YUV(0x2e00b8) },   /* Dark Purple */
+    { RGB_TO_YUV(0xff6633) },   /* Orange */
+    { RGB_TO_YUV(0x33ccff) },   /* Light Blue */
+    { RGB_TO_YUV(0x8ab800) },   /* Green */
+    { RGB_TO_YUV(0xffcc33) },   /* Light Orange */
+    { RGB_TO_YUV(0x33ffcc) },   /* Aqua */
+    { RGB_TO_YUV(0x66ff33) },   /* Light Green */
+    { RGB_TO_YUV(0xccff33) },   /* Yellow */
+};
+
+/* Overlay colour per reference frame, written as RGB (0xRRGGBB) */
+static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
 {
    { RGB_TO_YUV(0x00ff00) },   /* Green */
    { RGB_TO_YUV(0x0000ff) },   /* Blue */
    { RGB_TO_YUV(0xffff00) },   /* Yellow */
    { RGB_TO_YUV(0xff0000) },   /* Red */
 };
+#endif

 static const short kernel5[] =
 {
@@ -476,7 +495,7 @@ void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
 * edges unblended to give distinction to macro blocks in areas
 * filled with the same color block.
 */
-void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v,
+void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
                        int y1, int u1, int v1, int alpha, int stride)
 {
    int i, j;
@@ -484,10 +503,10 @@ void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v,
    int u1_const = u1*((1<<16)-alpha);
    int v1_const = v1*((1<<16)-alpha);

-    y += stride + 2;
-    for (i = 0; i < 14; i++)
+    y += 2*stride + 2;
+    for (i = 0; i < 12; i++)
    {
-        for (j = 0; j < 14; j++)
+        for (j = 0; j < 12; j++)
        {
            y[j] = (y[j]*alpha + y1_const)>>16;
        }
@@ -511,6 +530,104 @@ void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v,
    }
 }

+/* Blend the colour (y1, u1, v1) into the rim of one macro block only: a
+ * 2-pixel frame of the 16x16 luma area and a 1-pixel frame of each 8x8 chroma
+ * area.  The centre stays unblended so other visualizations can be layered. */
+void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v,
+                        int y1, int u1, int v1, int alpha, int stride)
+{
+    int i, j;
+    int y1_const = y1*((1<<16)-alpha);   /* new colour weighted by (65536 - alpha) */
+    int u1_const = u1*((1<<16)-alpha);
+    int v1_const = v1*((1<<16)-alpha);
+
+    for (i = 0; i < 2; i++)              /* top two luma rows, full width */
+    {
+        for (j = 0; j < 16; j++)
+        {
+            y[j] = (y[j]*alpha + y1_const)>>16;   /* old*alpha + new*(65536-alpha), scaled back */
+        }
+        y += stride;
+    }
+
+    for (i = 0; i < 12; i++)             /* middle rows: two leftmost and two rightmost pixels */
+    {
+        y[0]  = (y[0]*alpha  + y1_const)>>16;
+        y[1]  = (y[1]*alpha  + y1_const)>>16;
+        y[14] = (y[14]*alpha + y1_const)>>16;
+        y[15] = (y[15]*alpha + y1_const)>>16;
+        y += stride;
+    }
+
+    for (i = 0; i < 2; i++)              /* bottom two luma rows, full width */
+    {
+        for (j = 0; j < 16; j++)
+        {
+            y[j] = (y[j]*alpha + y1_const)>>16;
+        }
+        y += stride;
+    }
+
+    stride >>= 1;                        /* u and v rows are stepped with half the luma stride */
+
+    for (j = 0; j < 8; j++)              /* first chroma row, full width */
+    {
+        u[j] = (u[j]*alpha + u1_const)>>16;
+        v[j] = (v[j]*alpha + v1_const)>>16;
+    }
+    u += stride;
+    v += stride;
+
+    for (i = 0; i < 6; i++)              /* six middle chroma rows: first and last pixel only */
+    {
+        u[0] = (u[0]*alpha + u1_const)>>16;
+        v[0] = (v[0]*alpha + v1_const)>>16;
+
+        u[7] = (u[7]*alpha + u1_const)>>16;
+        v[7] = (v[7]*alpha + v1_const)>>16;
+
+        u += stride;
+        v += stride;
+    }
+
+    for (j = 0; j < 8; j++)              /* last chroma row, full width */
+    {
+        u[j] = (u[j]*alpha + u1_const)>>16;
+        v[j] = (v[j]*alpha + v1_const)>>16;
+    }
+}
+/* Blend (y1, u1, v1) into one 4x4 luma block and its 2x2 u and v blocks. */
+void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
+                        int y1, int u1, int v1, int alpha, int stride)
+{
+    int i, j;
+    int y1_const = y1*((1<<16)-alpha);   /* new colour weighted by (65536 - alpha) */
+    int u1_const = u1*((1<<16)-alpha);
+    int v1_const = v1*((1<<16)-alpha);
+
+    for (i = 0; i < 4; i++)              /* 4 luma rows of 4 pixels */
+    {
+        for (j = 0; j < 4; j++)
+        {
+            y[j] = (y[j]*alpha + y1_const)>>16;   /* old*alpha + new*(65536-alpha), scaled back */
+        }
+        y += stride;
+    }
+
+    stride >>= 1;                        /* u and v rows are stepped with half the luma stride */
+
+    for (i = 0; i < 2; i++)              /* 2 chroma rows of 2 pixels, both planes */
+    {
+        for (j = 0; j < 2; j++)
+        {
+            u[j] = (u[j]*alpha + u1_const)>>16;
+            v[j] = (v[j]*alpha + v1_const)>>16;
+        }
+        u += stride;
+        v += stride;
+    }
+}
+
 static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int height)
 {
    int dx;
@@ -522,7 +639,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
        dy = *y1 - y0;

        *x1 = width;
-        if (dy)
+        if (dx)
            *y1 = ((width-x0)*dy)/dx + y0;
    }
    if (*x1 < 0)
@@ -531,7 +648,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
        dy = *y1 - y0;

        *x1 = 0;
-        if (dy)
+        if (dx)
            *y1 = ((0-x0)*dy)/dx + y0;
    }
    if (*y1 > height)
@@ -540,7 +657,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
        dy = *y1 - y0;

        *y1 = height;
-        if (dx)
+        if (dy)
            *x1 = ((height-y0)*dx)/dy + x0;
    }
    if (*y1 < 0)
@@ -549,7 +666,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
        dy = *y1 - y0;

        *y1 = 0;
-        if (dx)
+        if (dy)
            *x1 = ((0-y0)*dx)/dy + x0;
    }
 }
@@ -561,10 +678,12 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
 #define RTCD_VTABLE(oci) NULL
 #endif

-int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
+int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
 {
-    char message[512];
    int q = oci->filter_level * 10 / 6;
+    int flags = ppflags->post_proc_flag;
+    int deblock_level = ppflags->deblocking_level;
+    int noise_level = ppflags->noise_level;

    if (!oci->frame_to_show)
        return -1;
@@ -621,8 +740,10 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
         oci->post_proc_buffer.y_stride);
    }

-    if (flags & VP8D_DEBUG_LEVEL1)
+#if CONFIG_POSTPROC_VISUALIZER
+    if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
    {
+        char message[512];
        sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
                (oci->frame_type == KEY_FRAME),
                oci->refresh_golden_frame,
@@ -633,7 +754,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
    }

-    if (flags & VP8D_DEBUG_LEVEL2)
+    if (flags & VP8D_DEBUG_TXT_MBLK_MODES)
    {
        int i, j;
        unsigned char *y_ptr;
@@ -665,7 +786,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
        }
    }

-    if (flags & VP8D_DEBUG_LEVEL3)
+    if (flags & VP8D_DEBUG_TXT_DC_DIFF)
    {
        int i, j;
        unsigned char *y_ptr;
@@ -700,45 +821,15 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
        }
    }

-    if (flags & VP8D_DEBUG_LEVEL4)
+    if (flags & VP8D_DEBUG_TXT_RATE_INFO)
    {
+        char message[512];
        sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
-#if 0
-        int i, j;
-        unsigned char *y_ptr;
-        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-        int mb_rows = post->y_height >> 4;
-        int mb_cols = post->y_width  >> 4;
-        int mb_index = 0;
-        MODE_INFO *mi = oci->mi;
-
-        y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
-        /* vp8_filter each macro block */
-        for (i = 0; i < mb_rows; i++)
-        {
-            for (j = 0; j < mb_cols; j++)
-            {
-                char zz[4];
-
-                sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
-                vp8_blit_text(zz, y_ptr, post->y_stride);
-                mb_index ++;
-                y_ptr += 16;
-            }
-
-            mb_index ++; /* border */
-            y_ptr += post->y_stride  * 16 - post->y_width;
-
-        }
-
-#endif
-
    }

    /* Draw motion vectors */
-    if (flags & VP8D_DEBUG_LEVEL5)
+    if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag)
    {
        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
        int width  = post->y_width;
@@ -749,29 +840,144 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
        MODE_INFO *mi = oci->mi;
        int x0, y0;

-        for (y0 = 8; y0 < (height + 8); y0 += 16)
+        for (y0 = 0; y0 < height; y0 += 16)
        {
-            for (x0 = 8; x0 < (width + 8); x0 += 16)
+            for (x0 = 0; x0 < width; x0 += 16)
            {
-               int x1, y1;
-               if (mi->mbmi.mode >= NEARESTMV)
+                int x1, y1;
+
+                if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode)))
+                {
+                    mi++;
+                    continue;
+                }
+
+                if (mi->mbmi.mode == SPLITMV)
+                {
+                    switch (mi->mbmi.partitioning)   /* one line per split partition, each from its own MV */
+                    {
+                        case 0 :    /* mv_top_bottom */
+                        {
+                            B_MODE_INFO *bmi = &mi->bmi[0];
+                            MV *mv = &bmi->mv.as_mv;   /* top half: sub-block 0 */
+
+                            x1 = x0 + 8 + (mv->col >> 3);
+                            y1 = y0 + 4 + (mv->row >> 3);
+
+                            constrain_line (x0+8, &x1, y0+4, &y1, width, height);
+                            vp8_blit_line  (x0+8,  x1, y0+4,  y1, y_buffer, y_stride);
+
+                            mv = &mi->bmi[8].mv.as_mv;   /* bottom half: re-read the MV (was left at sub-block 0) */
+
+                            x1 = x0 + 8 + (mv->col >> 3);
+                            y1 = y0 +12 + (mv->row >> 3);
+
+                            constrain_line (x0+8, &x1, y0+12, &y1, width, height);
+                            vp8_blit_line  (x0+8,  x1, y0+12,  y1, y_buffer, y_stride);
+
+                            break;
+                        }
+                        case 1 :    /* mv_left_right */
+                        {
+                            B_MODE_INFO *bmi = &mi->bmi[0];
+                            MV *mv = &bmi->mv.as_mv;   /* left half: sub-block 0 */
+
+                            x1 = x0 + 4 + (mv->col >> 3);
+                            y1 = y0 + 8 + (mv->row >> 3);
+
+                            constrain_line (x0+4, &x1, y0+8, &y1, width, height);
+                            vp8_blit_line  (x0+4,  x1, y0+8,  y1, y_buffer, y_stride);
+
+                            mv = &mi->bmi[2].mv.as_mv;   /* right half: re-read the MV (was left at sub-block 0) */
+
+                            x1 = x0 +12 + (mv->col >> 3);
+                            y1 = y0 + 8 + (mv->row >> 3);
+
+                            constrain_line (x0+12, &x1, y0+8, &y1, width, height);
+                            vp8_blit_line  (x0+12,  x1, y0+8,  y1, y_buffer, y_stride);
+
+                            break;
+                        }
+                        case 2 :    /* mv_quarters   */
+                        {
+                            B_MODE_INFO *bmi = &mi->bmi[0];
+                            MV *mv = &bmi->mv.as_mv;   /* top-left quarter: sub-block 0 */
+
+                            x1 = x0 + 4 + (mv->col >> 3);
+                            y1 = y0 + 4 + (mv->row >> 3);
+
+                            constrain_line (x0+4, &x1, y0+4, &y1, width, height);
+                            vp8_blit_line  (x0+4,  x1, y0+4,  y1, y_buffer, y_stride);
+
+                            mv = &mi->bmi[2].mv.as_mv;   /* top-right quarter */
+
+                            x1 = x0 +12 + (mv->col >> 3);
+                            y1 = y0 + 4 + (mv->row >> 3);
+
+                            constrain_line (x0+12, &x1, y0+4, &y1, width, height);
+                            vp8_blit_line  (x0+12,  x1, y0+4,  y1, y_buffer, y_stride);
+
+                            mv = &mi->bmi[8].mv.as_mv;   /* bottom-left quarter */
+
+                            x1 = x0 + 4 + (mv->col >> 3);
+                            y1 = y0 +12 + (mv->row >> 3);
+
+                            constrain_line (x0+4, &x1, y0+12, &y1, width, height);
+                            vp8_blit_line  (x0+4,  x1, y0+12,  y1, y_buffer, y_stride);
+
+                            mv = &mi->bmi[10].mv.as_mv;  /* bottom-right quarter */
+
+                            x1 = x0 +12 + (mv->col >> 3);
+                            y1 = y0 +12 + (mv->row >> 3);
+
+                            constrain_line (x0+12, &x1, y0+12, &y1, width, height);
+                            vp8_blit_line  (x0+12,  x1, y0+12,  y1, y_buffer, y_stride);
+                            break;
+                        }
+                        default :   /* any other partitioning: one line per 4x4 sub-block, raster order */
+                        {
+                            B_MODE_INFO *bmi = mi->bmi;
+                            int bx0, by0;
+
+                            for (by0 = y0; by0 < (y0+16); by0 += 4)
+                            {
+                                for (bx0 = x0; bx0 < (x0+16); bx0 += 4)
+                                {
+                                    MV *mv = &bmi->mv.as_mv;
+
+                                    x1 = bx0 + 2 + (mv->col >> 3);
+                                    y1 = by0 + 2 + (mv->row >> 3);
+
+                                    constrain_line (bx0+2, &x1, by0+2, &y1, width, height);
+                                    vp8_blit_line  (bx0+2,  x1, by0+2,  y1, y_buffer, y_stride);
+
+                                    bmi++;
+                                }
+                            }
+                        }
+                    }
+                }
+                else if (mi->mbmi.mode >= NEARESTMV)
                {
                    MV *mv = &mi->mbmi.mv.as_mv;
+                    const int lx0 = x0 + 8;
+                    const int ly0 = y0 + 8;

-                    x1 = x0 + (mv->col >> 3);
-                    y1 = y0 + (mv->row >> 3);
+                    x1 = lx0 + (mv->col >> 3);
+                    y1 = ly0 + (mv->row >> 3);

-                    if (x1 != x0 && y1 != y0)
+                    if (x1 != lx0 && y1 != ly0)
                    {
-                        constrain_line (x0, &x1, y0-1, &y1, width, height);
-                        vp8_blit_line  (x0,  x1, y0-1,  y1, y_buffer, y_stride);
+                        constrain_line (lx0, &x1, ly0-1, &y1, width, height);
+                        vp8_blit_line  (lx0,  x1, ly0-1,  y1, y_buffer, y_stride);

-                        constrain_line (x0, &x1, y0+1, &y1, width, height);
-                        vp8_blit_line  (x0,  x1, y0+1,  y1, y_buffer, y_stride);
+                        constrain_line (lx0, &x1, ly0+1, &y1, width, height);
+                        vp8_blit_line  (lx0,  x1, ly0+1,  y1, y_buffer, y_stride);
                    }
                    else
-                        vp8_blit_line  (x0,  x1, y0,  y1, y_buffer, y_stride);
+                        vp8_blit_line  (lx0,  x1, ly0,  y1, y_buffer, y_stride);
                }
+
                mi++;
            }
            mi++;
@@ -779,9 +985,10 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
    }

    /* Color in block modes */
-    if (flags & VP8D_DEBUG_LEVEL6)
+    if ((flags & VP8D_DEBUG_CLR_BLK_MODES)
+        && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag))
    {
-        int i, j;
+        int y, x;
        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
        int width  = post->y_width;
        int height = post->y_height;
@@ -791,18 +998,54 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
        int y_stride = oci->post_proc_buffer.y_stride;
        MODE_INFO *mi = oci->mi;

-        for (i = 0; i < height; i += 16)
+        for (y = 0; y < height; y += 16)
        {
-            for (j = 0; j < width; j += 16)
+            for (x = 0; x < width; x += 16)
            {
                int Y = 0, U = 0, V = 0;

-                Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
-                U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
-                V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
+                if (mi->mbmi.mode == B_PRED &&   /* B_PRED macro block: colour each 4x4 sub-block by its own mode */
+                    ((ppflags->display_mb_modes_flag & (1<<B_PRED)) || ppflags->display_b_modes_flag))
+                {
+                    int by, bx;
+                    unsigned char *yl, *ul, *vl;
+                    B_MODE_INFO *bmi = mi->bmi;

-                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)
-                    (&y_ptr[j], &u_ptr[j>>1], &v_ptr[j>>1], Y, U, V, 0xc000, y_stride);
+                    yl = y_ptr + x;
+                    ul = u_ptr + (x>>1);
+                    vl = v_ptr + (x>>1);
+
+                    for (by = 0; by < 16; by += 4)
+                    {
+                        for (bx = 0; bx < 16; bx += 4)
+                        {
+                            if ((ppflags->display_b_modes_flag & (1<<bmi->mode))        /* this sub-block's mode is selected */
+                                || (ppflags->display_mb_modes_flag & (1<<B_PRED)))      /* or B_PRED is selected as an MB mode */
+                            {
+                                Y = B_PREDICTION_MODE_colors[bmi->mode][0];
+                                U = B_PREDICTION_MODE_colors[bmi->mode][1];
+                                V = B_PREDICTION_MODE_colors[bmi->mode][2];
+
+                                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
+                                    (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
+                            }
+                            bmi++;
+                        }
+
+                        yl += y_stride*4;   /* down four luma rows */
+                        ul += y_stride*1;   /* down two chroma rows, assuming chroma stride is y_stride/2 as blend_b does */
+                        vl += y_stride*1;
+                    }
+                }
+                else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode))
+                {
+                    Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
+                    U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
+                    V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
+
+                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)
+                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+                }

                mi++;
            }
@@ -815,9 +1058,9 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
    }

    /* Color in frame reference blocks */
-    if (flags & VP8D_DEBUG_LEVEL7)
+    if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag)
    {
-        int i, j;
+        int y, x;
        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
        int width  = post->y_width;
        int height = post->y_height;
@@ -827,18 +1070,21 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
        int y_stride = oci->post_proc_buffer.y_stride;
        MODE_INFO *mi = oci->mi;

-        for (i = 0; i < height; i += 16)
+        for (y = 0; y < height; y += 16)
        {
-            for (j = 0; j < width; j +=16)
+            for (x = 0; x < width; x +=16)
            {
                int Y = 0, U = 0, V = 0;

-                Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
-                U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
-                V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+                if (ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame))
+                {
+                    Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+                    U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+                    V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];

-                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)
-                    (&y_ptr[j], &u_ptr[j>>1], &v_ptr[j>>1], Y, U, V, 0xc000, y_stride);
+                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)
+                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+                }

                mi++;
            }
@@ -849,6 +1095,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
            mi++;
        }
    }
+#endif

    *dest = oci->post_proc_buffer;

--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -24,7 +24,15 @@
              char whiteclamp[16], char bothclamp[16],\
              unsigned int w, unsigned int h, int pitch)

-#define prototype_postproc_blend_mb(sym)\
+#define prototype_postproc_blend_mb_inner(sym)\
+    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
+              int y1, int u1, int v1, int alpha, int stride)
+
+#define prototype_postproc_blend_mb_outer(sym)\
+    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
+              int y1, int u1, int v1, int alpha, int stride)
+
+#define prototype_postproc_blend_b(sym)\
    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
              int y1, int u1, int v1, int alpha, int stride)

@@ -52,22 +60,36 @@ extern prototype_postproc(vp8_postproc_downacross);
 #endif
 extern prototype_postproc_addnoise(vp8_postproc_addnoise);

-#ifndef vp8_postproc_blend_mb
-#define vp8_postproc_blend_mb vp8_blend_mb_c
+#ifndef vp8_postproc_blend_mb_inner
+#define vp8_postproc_blend_mb_inner vp8_blend_mb_inner_c
 #endif
-extern prototype_postproc_blend_mb(vp8_postproc_blend_mb);
+extern prototype_postproc_blend_mb_inner(vp8_postproc_blend_mb_inner);
+
+#ifndef vp8_postproc_blend_mb_outer
+#define vp8_postproc_blend_mb_outer vp8_blend_mb_outer_c
+#endif
+extern prototype_postproc_blend_mb_outer(vp8_postproc_blend_mb_outer);
+
+#ifndef vp8_postproc_blend_b
+#define vp8_postproc_blend_b vp8_blend_b_c
+#endif
+extern prototype_postproc_blend_b(vp8_postproc_blend_b);

 typedef prototype_postproc((*vp8_postproc_fn_t));
 typedef prototype_postproc_inplace((*vp8_postproc_inplace_fn_t));
 typedef prototype_postproc_addnoise((*vp8_postproc_addnoise_fn_t));
-typedef prototype_postproc_blend_mb((*vp8_postproc_blend_mb_fn_t));
+typedef prototype_postproc_blend_mb_inner((*vp8_postproc_blend_mb_inner_fn_t));
+typedef prototype_postproc_blend_mb_outer((*vp8_postproc_blend_mb_outer_fn_t));
+typedef prototype_postproc_blend_b((*vp8_postproc_blend_b_fn_t));
 typedef struct
 {
-    vp8_postproc_inplace_fn_t   down;
-    vp8_postproc_inplace_fn_t   across;
-    vp8_postproc_fn_t           downacross;
-    vp8_postproc_addnoise_fn_t  addnoise;
-    vp8_postproc_blend_mb_fn_t  blend_mb;
+    vp8_postproc_inplace_fn_t           down;
+    vp8_postproc_inplace_fn_t           across;
+    vp8_postproc_fn_t                   downacross;
+    vp8_postproc_addnoise_fn_t          addnoise;
+    vp8_postproc_blend_mb_inner_fn_t    blend_mb_inner;
+    vp8_postproc_blend_mb_outer_fn_t    blend_mb_outer;
+    vp8_postproc_blend_b_fn_t           blend_b;
 } vp8_postproc_rtcd_vtable_t;

 #if CONFIG_RUNTIME_CPU_DETECT
@@ -89,7 +111,7 @@ struct postproc_state
 #include "onyxc_int.h"
 #include "ppflags.h"
 int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
-                        int deblock_level, int noise_level, int flags);
+                        vp8_ppflags_t *flags);


 void vp8_de_noise(YV12_BUFFER_CONFIG         *source,
--- a/vp8/common/ppflags.h
+++ b/vp8/common/ppflags.h
@@ -13,17 +13,28 @@
 #define __INC_PPFLAGS_H
 enum
 {
-    VP8D_NOFILTERING    = 0,
-    VP8D_DEBLOCK        = 1<<0,
-    VP8D_DEMACROBLOCK   = 1<<1,
-    VP8D_ADDNOISE       = 1<<2,
-    VP8D_DEBUG_LEVEL1   = 1<<3,
-    VP8D_DEBUG_LEVEL2   = 1<<4,
-    VP8D_DEBUG_LEVEL3   = 1<<5,
-    VP8D_DEBUG_LEVEL4   = 1<<6,
-    VP8D_DEBUG_LEVEL5   = 1<<7,
-    VP8D_DEBUG_LEVEL6   = 1<<8,
-    VP8D_DEBUG_LEVEL7   = 1<<9
+    VP8D_NOFILTERING            = 0,
+    VP8D_DEBLOCK                = 1<<0,
+    VP8D_DEMACROBLOCK           = 1<<1,
+    VP8D_ADDNOISE               = 1<<2,
+    VP8D_DEBUG_TXT_FRAME_INFO   = 1<<3,
+    VP8D_DEBUG_TXT_MBLK_MODES   = 1<<4,
+    VP8D_DEBUG_TXT_DC_DIFF      = 1<<5,
+    VP8D_DEBUG_TXT_RATE_INFO    = 1<<6,
+    VP8D_DEBUG_DRAW_MV          = 1<<7,
+    VP8D_DEBUG_CLR_BLK_MODES    = 1<<8,
+    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
 };

+typedef struct
+{
+    int post_proc_flag;         /* OR of the VP8D_* bits above */
+    int deblocking_level;       /* read as deblock_level by vp8_post_proc_frame() */
+    int noise_level;            /* read as noise_level by vp8_post_proc_frame() */
+    int display_ref_frame_flag; /* bit (1<<ref_frame) enables colouring of that reference frame */
+    int display_mb_modes_flag;  /* bit (1<<MB mode) enables colouring of that macro block mode */
+    int display_b_modes_flag;   /* non-zero enables per-4x4 colouring inside B_PRED macro blocks */
+    int display_mv_flag;        /* bit (1<<MB mode) enables motion vector lines for that mode */
+} vp8_ppflags_t;
+
 #endif
--- a/vp8/common/preproc.h
+++ b/vp8/common/preproc.h
@@ -1,46 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     preproc.h
-*
-*   Description  :     simple preprocessor
-*
-****************************************************************************/
-
-#ifndef __INC_PREPROC_H
-#define __INC_PREPROC_H
-
-/****************************************************************************
-*  Types
-****************************************************************************/
-
-typedef struct
-{
-    unsigned char *frame_buffer;
-    int frame;
-    unsigned int *fixed_divide;
-
-    unsigned char *frame_buffer_alloc;
-    unsigned int *fixed_divide_alloc;
-} pre_proc_instance;
-
-/****************************************************************************
-*  Functions.
-****************************************************************************/
-void pre_proc_machine_specific_config(void);
-void delete_pre_proc(pre_proc_instance *ppi);
-int init_pre_proc(pre_proc_instance *ppi, int frame_size);
-extern void spatial_filter_c(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int width, int height, int pitch, int strength);
-extern void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
-
-#endif
--- a/vp8/common/preprocif.h
+++ b/vp8/common/preprocif.h
@@ -1,76 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     preproc_if.h
-*
-*   Description  :     Pre-processor interface header file.
-*
-****************************************************************************/
-
-#ifndef __PREPROC_IF_H
-#define __PREPROC_IF_H
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "type_aliases.h"
-
-/****************************************************************************
-*  Types
-****************************************************************************/
-
-typedef struct
-{
-    UINT8 *Yuv0ptr;
-    UINT8 *Yuv1ptr;
-
-    UINT8   *frag_info;              // blocks coded : passed in
-    UINT32   frag_info_element_size;   // size of each element
-    UINT32   frag_info_coded_mask;     // mask to get at whether fragment is coded
-
-    UINT32 *region_index;            // Gives pixel index for top left of each block
-    UINT32 video_frame_height;
-    UINT32 video_frame_width;
-    UINT8 hfrag_pixels;
-    UINT8 vfrag_pixels;
-
-} SCAN_CONFIG_DATA;
-
-typedef enum
-{
-    SCP_FILTER_ON_OFF,
-    SCP_SET_SRF_OFFSET,
-    SCP_SET_EBO_ON_OFF,
-    SCP_SET_VCAP_LEVEL_OFFSET,
-    SCP_SET_SHOW_LOCAL
-
-} SCP_SETTINGS;
-
-typedef struct PP_INSTANCE *x_pp_inst;
-
-/****************************************************************************
-*  Module statics
-****************************************************************************/
-/* Controls whether Early break out is on or off in default case */
-#define EARLY_BREAKOUT_DEFAULT  TRUE
-
-/****************************************************************************
-*  Functions
-****************************************************************************/
-extern  void set_scan_param(x_pp_inst ppi, UINT32 param_id, INT32 param_value);
-extern  UINT32 yuvanalyse_frame(x_pp_inst ppi, UINT32 *KFIndicator);
-extern  x_pp_inst create_pp_instance(void);
-extern  void delete_pp_instance(x_pp_inst *);
-extern  BOOL scan_yuvinit(x_pp_inst,  SCAN_CONFIG_DATA *scan_config_ptr);
-
-#endif
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -206,4 +206,29 @@ static int vp8_decode_value(BOOL_DECODER *br, int bits)

    return z;
 }
+
+static int vp8dx_bool_error(BOOL_DECODER *br)
+{
+    /* Return 1 if bits were requested past the end of the buffer, else 0.
+     *
+     * Variable 'count' stores the number of bits in the 'value' buffer,
+     * minus 8. So if count == 8, there are 16 bits available to be read.
+     * Normally, count is filled with 8 and one byte is filled into the
+     * value buffer. When we reach the end of the buffer, count is instead
+     * filled with VP8_LOTS_OF_BITS, 8 of which represent the last 8 real
+     * bits from the bitstream. So the last bit in the bitstream will be
+     * represented by count == VP8_LOTS_OF_BITS - 16.
+     */
+    if ((br->count > VP8_BD_VALUE_SIZE)
+        && (br->count <= VP8_LOTS_OF_BITS - 16))
+    {
+        /* We have tried to decode bits after the end of
+         * stream was encountered.
+         */
+        return 1;
+    }
+
+    /* No error. */
+    return 0;
+}
 #endif
--- a/vp8/decoder/decoderthreading.h
+++ b/vp8/decoder/decoderthreading.h
@@ -19,7 +19,7 @@
 extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
 extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
 extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
-extern int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+extern void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
 extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
 #endif

--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -381,6 +381,12 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
        xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
        xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;

+        if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
+        {
+            /* propagate errors from reference frames */
+            xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
+        }
+
        vp8_build_uvmvs(xd, pc->full_pixel);

        /*
@@ -391,6 +397,8 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
        */
        vp8_decode_macroblock(pbi, xd);

+        /* check if the boolean decoder has suffered an error */
+        xd->corrupted |= vp8dx_bool_error(xd->current_bc);

        recon_yoffset += 16;
        recon_uvoffset += 8;
@@ -461,7 +469,8 @@ static void setup_token_decoder(VP8D_COMP *pbi,
            partition_size = user_data_end - partition;
        }

-        if (user_data_end - partition < partition_size)
+        if (partition + partition_size > user_data_end
+            || partition + partition_size < partition)
            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                               "Truncated packet or corrupt partition "
                               "%d length", i + 1);
@@ -484,7 +493,6 @@ static void setup_token_decoder(VP8D_COMP *pbi,

 static void stop_token_decoder(VP8D_COMP *pbi)
 {
-    int i;
    VP8_COMMON *pc = &pbi->common;

    if (pc->multi_token_partition != ONE_PARTITION)
@@ -555,6 +563,7 @@ static void init_frame(VP8D_COMP *pbi)
    xd->frame_type = pc->frame_type;
    xd->mode_info_context->mbmi.mode = DC_PRED;
    xd->mode_info_stride = pc->mode_info_stride;
+    xd->corrupted = 0; /* init without corruption */
 }

 int vp8_decode_frame(VP8D_COMP *pbi)
@@ -570,6 +579,10 @@ int vp8_decode_frame(VP8D_COMP *pbi)
    int i, j, k, l;
    const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;

+    /* start with no corruption of current frame */
+    xd->corrupted = 0;
+    pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
+
    if (data_end - data < 3)
        vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                           "Truncated packet");
@@ -580,7 +593,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
        (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
    data += 3;

-    if (data_end - data < first_partition_length_in_bytes)
+    if (data + first_partition_length_in_bytes > data_end
+        || data + first_partition_length_in_bytes < data)
        vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                           "Truncated packet or corrupt partition 0 length");
    vp8_setup_version(pc);
@@ -890,6 +904,14 @@ int vp8_decode_frame(VP8D_COMP *pbi)

    stop_token_decoder(pbi);

+    /* Collect information about decoder corruption. */
+    /* 1. Check first boolean decoder for errors. */
+    pc->yv12_fb[pc->new_fb_idx].corrupted =
+        vp8dx_bool_error(bc);
+    /* 2. Check the macroblock information */
+    pc->yv12_fb[pc->new_fb_idx].corrupted |=
+        xd->corrupted;
+
    /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes  \n",bc->pos+pbi->bc2.pos); */

    /* If this was a kf or Gf note the Q used */
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -254,12 +254,7 @@ static void ref_cnt_fb (int *buf, int *idx, int new_idx)
 /* If any buffer copy / swapping is signalled it should be done here. */
 static int swap_frame_buffers (VP8_COMMON *cm)
 {
-    int fb_to_update_with, err = 0;
-
-    if (cm->refresh_last_frame)
-        fb_to_update_with = cm->lst_fb_idx;
-    else
-        fb_to_update_with = cm->new_fb_idx;
+    int err = 0;

    /* The alternate reference frame or golden frame can be updated
     *  using the new, last, or golden/alt ref frame.  If it
@@ -271,7 +266,7 @@ static int swap_frame_buffers (VP8_COMMON *cm)
        int new_fb = 0;

        if (cm->copy_buffer_to_arf == 1)
-            new_fb = fb_to_update_with;
+            new_fb = cm->lst_fb_idx;
        else if (cm->copy_buffer_to_arf == 2)
            new_fb = cm->gld_fb_idx;
        else
@@ -285,7 +280,7 @@ static int swap_frame_buffers (VP8_COMMON *cm)
        int new_fb = 0;

        if (cm->copy_buffer_to_gf == 1)
-            new_fb = fb_to_update_with;
+            new_fb = cm->lst_fb_idx;
        else if (cm->copy_buffer_to_gf == 2)
            new_fb = cm->alt_fb_idx;
        else
@@ -334,6 +329,23 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign

    pbi->common.error.error_code = VPX_CODEC_OK;

+    if (size == 0)
+    {
+       /* This is used to signal that we are missing frames.
+        * We do not know if the missing frame(s) were supposed to update
+        * any of the reference buffers, but we act conservatively and
+        * mark only the last buffer as corrupted.
+        */
+        cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+        /* Signal that we have no frame to show. */
+        cm->show_frame = 0;
+
+        /* Nothing more to do. */
+        return 0;
+    }
+
+
 #if HAVE_ARMV7
 #if CONFIG_RUNTIME_CPU_DETECT
    if (cm->rtcd.flags & HAS_NEON)
@@ -356,6 +368,13 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
        }
 #endif
        pbi->common.error.setjmp = 0;
+
+       /* We do not know if the missing frame(s) was supposed to update
+        * any of the reference buffers, but we act conservative and
+        * mark only the last buffer as corrupted.
+        */
+        cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
        if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
          cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
        return -1;
@@ -388,6 +407,16 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
        return retcode;
    }

+    /* copy mode info to storage for future error concealment */
+    if (pbi->common.prev_mip)
+    {
+        /* size allocated in vp8_alloc_frame_buffers() */
+        int size_of_mip = (pbi->common.mb_cols + 1) * (pbi->common.mb_rows + 1)
+            * sizeof(MODE_INFO);
+
+        memcpy(pbi->common.prev_mip, pbi->common.mip, size_of_mip);
+    }
+
    if (pbi->b_multithreaded_rd && cm->multi_token_partition != ONE_PARTITION)
    {
        if (swap_frame_buffers (cm))
@@ -506,7 +535,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
    pbi->common.error.setjmp = 0;
    return retcode;
 }
-int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags)
+int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags)
 {
    int ret = -1;
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -524,7 +553,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp,

    sd->clrtype = pbi->common.clr_type;
 #if CONFIG_POSTPROC
-    ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
+    ret = vp8_post_proc_frame(&pbi->common, sd, flags);
 #else

    if (pbi->common.frame_to_show)
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -451,7 +451,6 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
 #if CONFIG_MULTITHREAD
    int core_count = 0;
    int ithread;
-    int i;

    pbi->b_multithreaded_rd = 0;
    pbi->allocated_decoding_thread_count = 0;
@@ -596,7 +595,7 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
 }


-int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
+void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
 {
 #if CONFIG_MULTITHREAD
    VP8_COMMON *const pc = & pbi->common;
@@ -647,7 +646,6 @@ int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
        for (i=0; i< pc->mb_rows; i++)
            CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
    }
-    return 0;
 #else
    (void) pbi;
    (void) width;
@@ -722,7 +720,6 @@ void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
    /*int mb_row;
    int mb_col;
    int baseline_filter_level[MAX_MB_SEGMENTS];*/
-    int filter_level;
    int alt_flt_enabled = mbd->segmentation_enabled;

    int i;
@@ -770,7 +767,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)

    int ibc = 0;
    int num_part = 1 << pbi->common.multi_token_partition;
-    int i, j;
+    int i;
    volatile int *last_row_current_mb_col = NULL;
    int nsync = pbi->sync_range;

@@ -810,7 +807,6 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)

    for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
    {
-        int i;

        xd->current_bc = &pbi->mbc[mb_row%num_part];

@@ -894,9 +890,18 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
                xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
                xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;

+                if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
+                {
+                    /* propagate errors from reference frames */
+                    xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
+                }
+
                vp8_build_uvmvs(xd, pc->full_pixel);
                vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);

+                /* check if the boolean decoder has suffered an error */
+                xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
                if (pbi->common.filter_level)
                {
                    /* Save decoded MB last row data for next-row decoding */
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -29,10 +29,9 @@
    push    {r4-r11, lr}

    ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
-    ;  sizeof (TOKENEXTRA) is 20
-    add     r2, r2, r2, lsl #2          ; xcount
+    ;  sizeof (TOKENEXTRA) is 8
    sub     sp, sp, #12
-    add     r2, r1, r2, lsl #2          ; stop = p + xcount
+    add     r2, r1, r2, lsl #3          ; stop = p + xcount*sizeof(TOKENEXTRA)
    str     r2, [sp, #0]
    str     r3, [sp, #8]                ; save vp8_coef_encodings
    ldr     r2, [r0, #vp8_writer_lowvalue]
@@ -41,13 +40,13 @@
    b       check_p_lt_stop

 while_p_lt_stop
-    ldr     r6, [r1, #tokenextra_token] ; t
+    ldrb    r6, [r1, #tokenextra_token] ; t
    ldr     r4, [sp, #8]                ; vp8_coef_encodings
    mov     lr, #0
    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
    ldr     r9, [r1, #tokenextra_context_tree]   ; pp

-    ldr     r7, [r1, #tokenextra_skip_eob_node]
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]

    ldr     r6, [r4, #vp8_token_value]  ; v
    ldr     r8, [r4, #vp8_token_len]    ; n
@@ -142,12 +141,11 @@ token_count_lt_zero
    subs    r8, r8, #1                  ; --n
    bne     token_loop

-    ldr     r6, [r1, #tokenextra_token] ; t
+    ldrb    r6, [r1, #tokenextra_token] ; t
    ldr     r7, [sp, #48]               ; vp8_extra_bits
    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here vp8_extra_bit_struct == 20
-    add     r6, r6, r6, lsl #2          ; b = vp8_extra_bits + t
-    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
+    ;  element.  Here vp8_extra_bit_struct == 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t

    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
    cmp     r4, #0
@@ -155,7 +153,7 @@ token_count_lt_zero

 ;   if( b->base_val)
    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
    cmp     r8, #0                      ; if( L)
    beq     no_extra_bits

--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -62,13 +62,13 @@ mb_row_loop
    ; actuall work gets done here!

 while_p_lt_stop
-    ldr     r6, [r1, #tokenextra_token] ; t
+    ldrb    r6, [r1, #tokenextra_token] ; t
    ldr     r4, [sp, #20]               ; vp8_coef_encodings
    mov     lr, #0
    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
    ldr     r9, [r1, #tokenextra_context_tree]   ; pp

-    ldr     r7, [r1, #tokenextra_skip_eob_node]
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]

    ldr     r6, [r4, #vp8_token_value]  ; v
    ldr     r8, [r4, #vp8_token_len]    ; n
@@ -163,12 +163,11 @@ token_count_lt_zero
    subs    r8, r8, #1                  ; --n
    bne     token_loop

-    ldr     r6, [r1, #tokenextra_token] ; t
+    ldrb    r6, [r1, #tokenextra_token] ; t
    ldr     r7, [sp, #8]                ; vp8_extra_bits
    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here vp8_extra_bit_struct == 20
-    add     r6, r6, r6, lsl #2          ; b = vp8_extra_bits + t
-    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
+    ;  element.  Here vp8_extra_bit_struct == 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t

    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
    cmp     r4, #0
@@ -176,7 +175,7 @@ token_count_lt_zero

 ;   if( b->base_val)
    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
    cmp     r8, #0                      ; if( L)
    beq     no_extra_bits

--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -90,13 +90,13 @@ mb_row_loop
    ; actual work gets done here!

 while_p_lt_stop
-    ldr     r6, [r1, #tokenextra_token] ; t
+    ldrb    r6, [r1, #tokenextra_token] ; t
    ldr     r4, [sp, #80]               ; vp8_coef_encodings
    mov     lr, #0
    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
    ldr     r9, [r1, #tokenextra_context_tree]   ; pp

-    ldr     r7, [r1, #tokenextra_skip_eob_node]
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]

    ldr     r6, [r4, #vp8_token_value]  ; v
    ldr     r8, [r4, #vp8_token_len]    ; n
@@ -191,12 +191,11 @@ token_count_lt_zero
    subs    r8, r8, #1                  ; --n
    bne     token_loop

-    ldr     r6, [r1, #tokenextra_token] ; t
+    ldrb    r6, [r1, #tokenextra_token] ; t
    ldr     r7, [sp, #84]                ; vp8_extra_bits
    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here vp8_extra_bit_struct == 20
-    add     r6, r6, r6, lsl #2          ; b = vp8_extra_bits + t
-    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
+    ;  element.  Here vp8_extra_bit_struct == 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t

    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
    cmp     r4, #0
@@ -204,7 +203,7 @@ token_count_lt_zero

 ;   if( b->base_val)
    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
    cmp     r8, #0                      ; if( L)
    beq     no_extra_bits

--- a/vp8/encoder/arm/quantize_arm.c
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -29,7 +29,7 @@ extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, shor

 void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
 {
-    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant);
+    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast);
 }

 /*
--- a/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
+++ b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
@@ -51,7 +51,6 @@ DEFINE(vp8_token_len,                           offsetof(vp8_token, Len));

 DEFINE(vp8_extra_bit_struct_tree,                 offsetof(vp8_extra_bit_struct, tree));
 DEFINE(vp8_extra_bit_struct_prob,                 offsetof(vp8_extra_bit_struct, prob));
-DEFINE(vp8_extra_bit_struct_prob_bc,               offsetof(vp8_extra_bit_struct, prob_bc));
 DEFINE(vp8_extra_bit_struct_len,                  offsetof(vp8_extra_bit_struct, Len));
 DEFINE(vp8_extra_bit_struct_base_val,              offsetof(vp8_extra_bit_struct, base_val));

@@ -67,8 +66,8 @@ DEFINE(vp8_common_mb_rows,                       offsetof(VP8_COMMON, mb_rows));

 // These two sizes are used in vp7cx_pack_tokens.  They are hard coded
 //  so if the size changes this will have to be adjusted.
-ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 20)
-ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 20)
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
+ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)

 //add asserts for any offset that is not supported by assembly code
 //add asserts for any size that is not supported by assembly code
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -33,6 +33,7 @@ typedef struct

    // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
    short *quant;
+    short *quant_fast;
    short *quant_shift;
    short *zbin;
    short *zrun_zbin_boost;
@@ -81,6 +82,7 @@ typedef struct
    int errthresh;
    int rddiv;
    int rdmult;
+    INT64 activity_sum;

    int mvcosts[2][MVvals+1];
    int *mvcost[2];
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -62,7 +62,6 @@ unsigned int b_modes[14]  = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

 static const int qrounding_factors[129] =
 {
-    56, 56, 56, 56, 48, 48, 56, 56,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
@@ -78,12 +77,18 @@ static const int qrounding_factors[129] =
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
-    48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48
 };

 static const int qzbin_factors[129] =
 {
-    72, 72, 72, 72, 80, 80, 72, 72,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
@@ -94,17 +99,11 @@ static const int qzbin_factors[129] =
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
-    80, 80, 80, 80, 80, 80, 80, 80,
-    80, 80, 80, 80, 80, 80, 80, 80,
-    80, 80, 80, 80, 80, 80, 80, 80,
-    80, 80, 80, 80, 80, 80, 80, 80,
-    80, 80, 80, 80, 80, 80, 80, 80,
-    80,
+    80
 };

 static const int qrounding_factors_y2[129] =
 {
-    56, 56, 56, 56, 48, 48, 56, 56,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
@@ -120,12 +119,18 @@ static const int qrounding_factors_y2[129] =
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
-    48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48
 };

 static const int qzbin_factors_y2[129] =
 {
-    72, 72, 72, 72, 80, 80, 72, 72,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
@@ -136,26 +141,30 @@ static const int qzbin_factors_y2[129] =
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
-    80, 80, 80, 80, 80, 80, 80, 80,
-    80, 80, 80, 80, 80, 80, 80, 80,
-    80, 80, 80, 80, 80, 80, 80, 80,
-    80, 80, 80, 80, 80, 80, 80, 80,
-    80, 80, 80, 80, 80, 80, 80, 80,
-    80,
+    80
 };

-//#define EXACT_QUANT
+#define EXACT_QUANT
 #ifdef EXACT_QUANT
-static void vp8cx_invert_quant(short *quant, short *shift, short d)
+static void vp8cx_invert_quant(int improved_quant, short *quant,
+                               short *shift, short d)
 {
-    unsigned t;
-    int l;
-    t = d;
-    for(l = 0; t > 1; l++)
-        t>>=1;
-    t = 1 + (1<<(16+l))/d;
-    *quant = (short)(t - (1<<16));
-    *shift = l;
+    if(improved_quant)
+    {
+        unsigned t;
+        int l;
+        t = d;
+        for(l = 0; t > 1; l++)
+            t>>=1;
+        t = 1 + (1<<(16+l))/d;
+        *quant = (short)(t - (1<<16));
+        *shift = l;
+    }
+    else
+    {
+        *quant = (1 << 16) / d;
+        *shift = 0;
+    }
 }

 void vp8cx_init_quantizer(VP8_COMP *cpi)
@@ -170,7 +179,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
    {
        // dc values
        quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
-        vp8cx_invert_quant(cpi->Y1quant[Q] + 0,
+        cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val;
+        vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,
                           cpi->Y1quant_shift[Q] + 0, quant_val);
        cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
        cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -178,7 +188,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
        cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;

        quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
-        vp8cx_invert_quant(cpi->Y2quant[Q] + 0,
+        cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val;
+        vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0,
                           cpi->Y2quant_shift[Q] + 0, quant_val);
        cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
        cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
@@ -186,7 +197,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
        cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;

        quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
-        vp8cx_invert_quant(cpi->UVquant[Q] + 0,
+        cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val;
+        vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0,
                           cpi->UVquant_shift[Q] + 0, quant_val);
        cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
        cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -199,7 +211,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
            int rc = vp8_default_zig_zag1d[i];

            quant_val = vp8_ac_yquant(Q);
-            vp8cx_invert_quant(cpi->Y1quant[Q] + rc,
+            cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val;
+            vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc,
                               cpi->Y1quant_shift[Q] + rc, quant_val);
            cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
            cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -207,7 +220,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
            cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;

            quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
-            vp8cx_invert_quant(cpi->Y2quant[Q] + rc,
+            cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;
+            vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
                               cpi->Y2quant_shift[Q] + rc, quant_val);
            cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
            cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
@@ -215,7 +229,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
            cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;

            quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-            vp8cx_invert_quant(cpi->UVquant[Q] + rc,
+            cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;
+            vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
                               cpi->UVquant_shift[Q] + rc, quant_val);
            cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
            cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -316,6 +331,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
    for (i = 0; i < 16; i++)
    {
        x->block[i].quant = cpi->Y1quant[QIndex];
+        x->block[i].quant_fast = cpi->Y1quant_fast[QIndex];
        x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
        x->block[i].zbin = cpi->Y1zbin[QIndex];
        x->block[i].round = cpi->Y1round[QIndex];
@@ -330,6 +346,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
    for (i = 16; i < 24; i++)
    {
        x->block[i].quant = cpi->UVquant[QIndex];
+        x->block[i].quant_fast = cpi->UVquant_fast[QIndex];
        x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
        x->block[i].zbin = cpi->UVzbin[QIndex];
        x->block[i].round = cpi->UVround[QIndex];
@@ -340,6 +357,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)

    // Y2
    zbin_extra = (cpi->common.Y2dequant[QIndex][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
+    x->block[24].quant_fast = cpi->Y2quant_fast[QIndex];
    x->block[24].quant = cpi->Y2quant[QIndex];
    x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
    x->block[24].zbin = cpi->Y2zbin[QIndex];
@@ -351,6 +369,9 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)

 void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
 {
+    // Clear Zbin mode boost for default case
+    cpi->zbin_mode_boost = 0;
+
    // vp8cx_init_quantizer() is first called in vp8_create_compressor(). A check is added here so that vp8cx_init_quantizer() is only called
    // when these values are not all zero.
    if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q)
@@ -363,6 +384,61 @@ void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
 }


+/* activity_avg must be positive, or flat regions could get a zero weight
+ *  (infinite lambda), which confounds analysis.
+ * This also avoids the need for divide by zero checks in
+ *  vp8_activity_masking().
+ */
+#define VP8_ACTIVITY_AVG_MIN (64)
+
+/* This is used as a reference when computing the source variance for the
+ *  purposes of activity masking.
+ * Eventually this should be replaced by custom no-reference routines,
+ *  which will be faster.
+ */
+static const unsigned char VP8_VAR_OFFS[16]=
+{
+    128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+};
+
+unsigned int vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
+{
+    unsigned int act;
+    unsigned int sse;
+    int sum;
+    unsigned int a;
+    unsigned int b;
+    /* TODO: This could also be done over smaller areas (8x8), but that would
+     *  require extensive changes elsewhere, as lambda is assumed to be fixed
+     *  over an entire MB in most of the code.
+     * Another option is to compute four 8x8 variances, and pick a single
+     *  lambda using a non-linear combination (e.g., the smallest, or second
+     *  smallest, etc.).
+     */
+    VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer,
+     x->src.y_stride, VP8_VAR_OFFS, 0, &sse, &sum);
+    /* This requires a full 32 bits of precision. */
+    act = (sse<<8) - sum*sum;
+    /* Drop 4 bits (with rounding) to give us some headroom to work with. */
+    act = (act + 8) >> 4;
+    /* If the region is flat, lower the activity some more. */
+    if (act < 8<<12)
+        act = act < 5<<12 ? act : 5<<12;
+    /* TODO: For non-flat regions, edge regions should receive less masking
+     *  than textured regions, but identifying edge regions quickly and
+     *  reliably enough is still a subject of experimentation.
+     * This will be most noticeable near edges with a complex shape (e.g.,
+     *  text), but the 4x4 transform size should make this less of a problem
+     *  than it would be for an 8x8 transform.
+     */
+    /* Apply the masking to the RD multiplier. */
+    a = act + 4*cpi->activity_avg;
+    b = 4*act + cpi->activity_avg;
+    x->rdmult = (unsigned int)(((INT64)x->rdmult*b + (a>>1))/a);
+    return act;
+}
+
+

 static
 void encode_mb_row(VP8_COMP *cpi,
@@ -374,6 +450,7 @@ void encode_mb_row(VP8_COMP *cpi,
                   int *segment_counts,
                   int *totalrate)
 {
+    INT64 activity_sum = 0;
    int i;
    int recon_yoffset, recon_uvoffset;
    int mb_col;
@@ -402,14 +479,14 @@ void encode_mb_row(VP8_COMP *cpi,
    // Set up limit values for vertical motion vector components
    // to prevent them extending beyond the UMV borders
    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) 
+    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
                        + (VP8BORDERINPIXELS - 16);

    // for each macroblock col in image
    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
    {
-        // Distance of Mb to the left & right edges, specified in 
-        // 1/8th pel units as they are always compared to values 
+        // Distance of Mb to the left & right edges, specified in
+        // 1/8th pel units as they are always compared to values
        // that are in 1/8th pel units
        xd->mb_to_left_edge = -((mb_col * 16) << 3);
        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
@@ -417,7 +494,7 @@ void encode_mb_row(VP8_COMP *cpi,
        // Set up limit values for horizontal motion vector components
        // to prevent them extending beyond the UMV borders
        x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
-        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) 
+        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
                            + (VP8BORDERINPIXELS - 16);

        xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
@@ -425,6 +502,12 @@ void encode_mb_row(VP8_COMP *cpi,
        xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
        xd->left_available = (mb_col != 0);

+        x->rddiv = cpi->RDDIV;
+        x->rdmult = cpi->RDMULT;
+
+        if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+            activity_sum += vp8_activity_masking(cpi, x);
+
        // Is segmentation enabled
        // MB level adjutment to quantizer
        if (xd->segmentation_enabled)
@@ -531,6 +614,7 @@ void encode_mb_row(VP8_COMP *cpi,
    // this is to account for the border
    xd->mode_info_context++;
    x->partition_info++;
+    x->activity_sum += activity_sum;
 }


@@ -544,7 +628,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
    VP8_COMMON *const cm = & cpi->common;
    MACROBLOCKD *const xd = & x->e_mbd;

-    int i;
    TOKENEXTRA *tp = cpi->tok;
    int segment_counts[MAX_MB_SEGMENTS];
    int totalrate;
@@ -627,9 +710,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
    }

    vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
-    //vp8_initialize_rd_consts( cpi, vp8_dc_quant(cpi->avg_frame_qindex, cm->y1dc_delta_q) );
    vp8cx_initialize_me_consts(cpi, cm->base_qindex);
-    //vp8cx_initialize_me_consts( cpi, cpi->avg_frame_qindex);

    // Copy data over into macro block data sturctures.

@@ -647,22 +728,7 @@ void vp8_encode_frame(VP8_COMP *cpi)

    vp8_setup_block_ptrs(x);

-    x->rddiv = cpi->RDDIV;
-    x->rdmult = cpi->RDMULT;
-
-#if 0
-    // Experimental rd code
-    // 2 Pass - Possibly set Rdmult based on last frame distortion + this frame target bits or other metrics
-    // such as cpi->rate_correction_factor that indicate relative complexity.
-    /*if ( cpi->pass == 2 && (cpi->last_frame_distortion > 0) && (cpi->target_bits_per_mb > 0) )
-    {
-        //x->rdmult = ((cpi->last_frame_distortion * 256)/cpi->common.MBs)/ cpi->target_bits_per_mb;
-        x->rdmult = (int)(cpi->RDMULT * cpi->rate_correction_factor);
-    }
-    else
-        x->rdmult = cpi->RDMULT; */
-    //x->rdmult = (int)(cpi->RDMULT * pow( (cpi->rate_correction_factor * 2.0), 0.75 ));
-#endif
+    x->activity_sum = 0;

    xd->mode_info_context->mbmi.mode = DC_PRED;
    xd->mode_info_context->mbmi.uv_mode = DC_PRED;
@@ -703,11 +769,12 @@ void vp8_encode_frame(VP8_COMP *cpi)
        else
        {
 #if CONFIG_MULTITHREAD
+            int i;
+
            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);

            for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
            {
-                int i;
                cpi->current_mb_col_main = -1;

                for (i = 0; i < cpi->encoding_thread_count; i++)
@@ -785,6 +852,11 @@ void vp8_encode_frame(VP8_COMP *cpi)
                totalrate += cpi->mb_row_ei[i].totalrate;
            }

+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;
+            }
+
 #endif

        }
@@ -920,6 +992,14 @@ void vp8_encode_frame(VP8_COMP *cpi)
    cpi->last_frame_distortion = cpi->frame_distortion;
 #endif

+    /* Update the average activity for the next frame.
+     * This is feed-forward for now; it could also be saved in two-pass, or
+     *  done during lookahead when that is eventually added.
+     */
+    cpi->activity_avg = (unsigned int )(x->activity_sum/cpi->common.MBs);
+    if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN)
+        cpi->activity_avg = VP8_ACTIVITY_AVG_MIN;
+
 }
 void vp8_setup_block_ptrs(MACROBLOCK *x)
 {
@@ -1181,7 +1261,18 @@ int vp8cx_encode_inter_macroblock

    if (cpi->sf.RD)
    {
+        /* Are we using the fast quantizer for the mode selection? */
+        if(cpi->sf.use_fastquant_for_pick)
+            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
+
        inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
+
+        /* switch back to the regular quantizer for the encode */
+        if (cpi->sf.improved_quant)
+        {
+            cpi->mb.quantize_b    = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+        }
+
    }
    else
 #endif
@@ -1214,11 +1305,25 @@ int vp8cx_encode_inter_macroblock
        // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to supress noise
        if (cpi->zbin_mode_boost_enabled)
        {
-            if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME))
-                cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+            if ( xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME )
+                 cpi->zbin_mode_boost = 0;
            else
-                cpi->zbin_mode_boost = 0;
+            {
+                if (xd->mode_info_context->mbmi.mode == ZEROMV)
+                {
+                    if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+                        cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+                    else
+                        cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+                }
+                else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+                    cpi->zbin_mode_boost = 0;
+                else
+                    cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+            }
        }
+        else
+            cpi->zbin_mode_boost = 0;

        vp8cx_mb_init_quantizer(cpi,  x);
    }
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -105,7 +105,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

 #if !(CONFIG_REALTIME_ONLY)
 #if 1
-    if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
+    if (x->optimize)
        vp8_optimize_mby(x, rtcd);

 #endif
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -243,9 +243,9 @@ struct vp8_token_state{
 };

 // TODO: experiments to find optimal multiple numbers
-#define Y1_RD_MULT 1
-#define UV_RD_MULT 1
-#define Y2_RD_MULT 4
+#define Y1_RD_MULT 4
+#define UV_RD_MULT 2
+#define Y2_RD_MULT 16

 static const int plane_rd_mult[4]=
 {
@@ -273,7 +273,6 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
    int x;
    int sz;
    int next;
-    int path;
    int rdmult;
    int rddiv;
    int final_eob;
@@ -309,8 +308,10 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
    eob = d->eob;

    /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-    /* TODO: These should vary with the block type, since the quantizer does. */
-    rdmult = (mb->rdmult << 2)*err_mult;
+    rdmult = mb->rdmult * err_mult;
+    if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
+        rdmult = (rdmult * 9)>>4;
+
    rddiv = mb->rddiv;
    best_mask[0] = best_mask[1] = 0;
    /* Initialize the sentinel node of the trellis. */
@@ -633,7 +634,7 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    vp8_quantize_mb(x);

 #if !(CONFIG_REALTIME_ONLY)
-    if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
+    if (x->optimize)
        vp8_optimize_mb(x, rtcd);
 #endif

--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -61,6 +61,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                    int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
                    int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
                    volatile int *last_row_current_mb_col;
+                    INT64 activity_sum = 0;

                    if (ithread > 0)
                        last_row_current_mb_col = &cpi->mb_row_ei[ithread-1].current_mb_col;
@@ -111,6 +112,12 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                        xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
                        xd->left_available = (mb_col != 0);

+                        x->rddiv = cpi->RDDIV;
+                        x->rdmult = cpi->RDMULT;
+
+                        if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+                            activity_sum += vp8_activity_masking(cpi, x);
+
                        // Is segmentation enabled
                        // MB level adjutment to quantizer
                        if (xd->segmentation_enabled)
@@ -126,6 +133,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                        else
                            xd->mode_info_context->mbmi.segment_id = 0;         // Set to Segment 0 by default

+                        x->active_ptr = cpi->active_map + seg_map_index + mb_col;

                        if (cm->frame_type == KEY_FRAME)
                        {
@@ -157,8 +165,28 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                            if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
                                cpi->inter_zz_count ++;

-                        }
+                            // Special case code for cyclic refresh
+                            // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
+                            // during vp8cx_encode_inter_macroblock()) back into the global segmentation map
+                            if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
+                            {
+                                cpi->segmentation_map[seg_map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;

+                                // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
+                                // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
+                                // else mark it as dirty (1).
+                                if (xd->mode_info_context->mbmi.segment_id)
+                                    cpi->cyclic_refresh_map[seg_map_index+mb_col] = -1;
+                                else if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
+                                {
+                                    if (cpi->cyclic_refresh_map[seg_map_index+mb_col] == 1)
+                                        cpi->cyclic_refresh_map[seg_map_index+mb_col] = 0;
+                                }
+                                else
+                                    cpi->cyclic_refresh_map[seg_map_index+mb_col] = 1;
+
+                            }
+                        }
                        cpi->tplist[mb_row].stop = *tp;

                        x->gf_active_ptr++;      // Increment pointer into gf useage flags structure for next mb
@@ -197,6 +225,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                    // this is to account for the border
                    xd->mode_info_context++;
                    x->partition_info++;
+                    x->activity_sum += activity_sum;

                    x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
                    x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
@@ -240,8 +269,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    z->sadperbit16      = x->sadperbit16;
    z->sadperbit4       = x->sadperbit4;
    z->errthresh        = x->errthresh;
-    z->rddiv            = x->rddiv;
-    z->rdmult           = x->rdmult;

    /*
    z->mv_col_min    = x->mv_col_min;
@@ -255,6 +282,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    z->vp8_short_fdct8x4     = x->vp8_short_fdct8x4;
    z->short_walsh4x4    = x->short_walsh4x4;
    z->quantize_b        = x->quantize_b;
+    z->optimize          = x->optimize;

    /*
    z->mvc              = x->mvc;
@@ -282,6 +310,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    for (i = 0; i < 25; i++)
    {
        z->block[i].quant           = x->block[i].quant;
+        z->block[i].quant_fast      = x->block[i].quant_fast;
        z->block[i].quant_shift     = x->block[i].quant_shift;
        z->block[i].zbin            = x->block[i].zbin;
        z->block[i].zrun_zbin_boost   = x->block[i].zrun_zbin_boost;
@@ -392,8 +421,7 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,

        vp8_setup_block_ptrs(mb);

-        mb->rddiv = cpi->RDDIV;
-        mb->rdmult = cpi->RDMULT;
+        mb->activity_sum = 0;

        mbd->left_context = &cm->left_context;
        mb->mvc = cm->fc.mvc;
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -53,8 +53,11 @@ extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE];
 #define IIFACTOR   1.4
 #define IIKFACTOR1 1.40
 #define IIKFACTOR2 1.5
-#define RMAX    14.0
-#define GF_RMAX 48.0        // 128.0
+#define RMAX       14.0
+#define GF_RMAX    48.0
+
+#define KF_MB_INTRA_MIN 300
+#define GF_MB_INTRA_MIN 200

 #define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)

@@ -65,6 +68,18 @@ static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
 static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};


+const int cq_level[QINDEX_RANGE] =  /* 128 non-decreasing entries, 0..100; estimate_cq() returns cq_level[Q] */
+{
+    0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
+    9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
+    20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
+    32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
+    44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
+    57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
+    71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
+    86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
+};
+
 void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
 int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps);

@@ -247,7 +262,6 @@ extern size_t vp8_firstpass_stats_sz(unsigned int mb_count)
     * macroblock.
     */
    size_t stats_sz;
-    FIRSTPASS_STATS stats;

    stats_sz = sizeof(FIRSTPASS_STATS) + mb_count;
    stats_sz = (stats_sz + 7) & ~7;
@@ -374,8 +388,6 @@ unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi)
 }
 void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos)
 {
-    int Offset;
-
    cpi->fp_motion_map_stats = target_pos;
 }

@@ -472,7 +484,7 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;

    // Initial step/diamond search centred on best mv
-    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
+    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
    if ( tmp_err < INT_MAX-new_mv_mode_penalty )
        tmp_err += new_mv_mode_penalty;

@@ -495,7 +507,7 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
            num00--;
        else
        {
-            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
+            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
            if ( tmp_err < INT_MAX-new_mv_mode_penalty )
                tmp_err += new_mv_mode_penalty;

@@ -905,7 +917,7 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
    double pow_lowq = 0.40;

    if (section_target_bandwitdh <= 0)
-        return MAXQ;
+        return cpi->maxq_max_limit;          // Highest value allowed

    target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs);

@@ -941,10 +953,12 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_

    // Correction factor used for Q values >= 20
    corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
-    corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+    corr_high = (corr_high < 0.05)
+                    ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;

-    // Try and pick a Q that should be high enough to encode the content at the given rate.
-    for (Q = 0; Q < MAXQ; Q++)
+    // Try and pick a max Q that will be high enough to encode the
+    // content at the given rate.
+    for (Q = cpi->maxq_min_limit; Q < cpi->maxq_max_limit; Q++)
    {
        int bits_per_mb_at_this_q;

@@ -963,6 +977,28 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
            break;
    }

+    // Restriction on active max q for constrained quality mode.
+    if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+         (Q < cpi->cq_target_quality) )
+         //(Q < cpi->oxcf.cq_level;) )
+    {
+        Q = cpi->cq_target_quality;
+        //Q = cpi->oxcf.cq_level;
+    }
+
+    // Adjust maxq_min_limit and maxq_max_limit limits based on
+    // average q observed in clip for non kf/gf/arf frames.
+    // Give average a chance to settle though.
+    if ( (cpi->ni_frames >
+                  ((unsigned int)cpi->total_stats->count >> 8)) &&
+         (cpi->ni_frames > 150) )
+    {
+        cpi->maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
+                                  ? (cpi->ni_av_qi + 32) : cpi->worst_quality;
+        cpi->maxq_min_limit = ((cpi->ni_av_qi - 32) > cpi->best_quality)
+                                  ? (cpi->ni_av_qi - 32) : cpi->best_quality;
+    }
+
    return Q;
 }
 static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
@@ -1111,6 +1147,79 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta

    return Q;
 }
+
+// For cq mode: return cq_level[Q] for the first Q whose estimated bits per
+// MB fit the per-MB target (512 * section_target_bandwitdh / num_mbs).
+static int estimate_cq(VP8_COMP *cpi, double section_err,
+                       int section_target_bandwitdh, int Height, int Width)
+{
+    int Q;
+    int num_mbs = ((Height * Width) / (16 * 16));  // frame area in 16x16 units
+    int target_norm_bits_per_mb;
+
+    double err_per_mb = section_err / num_mbs;
+    double correction_factor;
+    double corr_high;
+    double speed_correction = 1.0;
+    double pow_highq = 0.90;
+    double pow_lowq = 0.40;
+    double clip_iiratio;
+    double clip_iifactor;
+
+    target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))  // large targets divide first, presumably to avoid int overflow
+                              ? (512 * section_target_bandwitdh) / num_mbs
+                              : 512 * (section_target_bandwitdh / num_mbs);
+
+    // Speed modes 1 and 3 scale the estimate up: 1.04 + 0.04 * cpu_used
+    // while cpu_used <= 5, otherwise 1.25. Other modes keep 1.0.
+    if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+    {
+        if (cpi->oxcf.cpu_used <= 5)
+            speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+        else
+            speed_correction = 1.25;
+    }
+    // Clip-wide intra/coded error ratio: factor is 1.0 at ratio 10, falls 0.025 per unit, floor 0.80
+    clip_iiratio = cpi->total_stats->intra_error /
+                   DOUBLE_DIVIDE_CHECK(cpi->total_stats->coded_error);
+    clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
+    if (clip_iifactor < 0.80)
+        clip_iifactor = 0.80;
+
+    // Correction factor used for Q values >= 50, clamped to [0.05, 5.0]
+    corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
+    corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+
+    // Stop at the lowest Q that meets the target; Q == MAXQ if none does.
+    for (Q = 0; Q < MAXQ; Q++)
+    {
+        int bits_per_mb_at_this_q;
+
+        if (Q < 50)
+        {
+            correction_factor =
+                pow( err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01));
+
+            correction_factor = (correction_factor < 0.05) ? 0.05
+                                    : (correction_factor > 5.0) ? 5.0
+                                        : correction_factor;
+        }
+        else
+            correction_factor = corr_high;
+
+        bits_per_mb_at_this_q =
+            (int)( .5 + correction_factor *
+                        speed_correction *
+                        clip_iifactor *
+                        (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0);
+
+        if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+            break;
+    }
+
+    return cq_level[Q];
+}
+
 extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);

 void vp8_init_second_pass(VP8_COMP *cpi)
@@ -1145,6 +1254,14 @@ void vp8_init_second_pass(VP8_COMP *cpi)
    cpi->output_frame_rate = cpi->oxcf.frame_rate;
    cpi->bits_left = (long long)(cpi->total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
    cpi->bits_left -= (long long)(cpi->total_stats->duration * two_pass_min_rate / 10000000.0);
+    cpi->clip_bits_total = cpi->bits_left;
+
+    // Calculate a minimum intra value to be used in determining the IIratio
+    // scores used in the second pass. We have this minimum to make sure
+    // that clips that are static but "low complexity" in the intra domain
+    // are still boosted appropriately for KF/GF/ARF
+    cpi->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+    cpi->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;

    vp8_avg_stats(cpi->total_stats);

@@ -1173,17 +1290,25 @@ void vp8_init_second_pass(VP8_COMP *cpi)
    {
        start_pos = cpi->stats_in;               // Note starting "file" position

-        cpi->modified_total_error_left = 0.0;
+        cpi->modified_error_total = 0.0;
+        cpi->modified_error_used = 0.0;

        while (vp8_input_stats(cpi, &this_frame) != EOF)
        {
-            cpi->modified_total_error_left += calculate_modified_err(cpi, &this_frame);
+            cpi->modified_error_total += calculate_modified_err(cpi, &this_frame);
        }
+        cpi->modified_error_left = cpi->modified_error_total;

        reset_fpf_position(cpi, start_pos);            // Reset file position

    }

+    // Calculate the clip target modified bits per error
+    // The observed bpe starts as the same number.
+    cpi->clip_bpe =  cpi->bits_left /
+                     DOUBLE_DIVIDE_CHECK(cpi->modified_error_total);
+    cpi->observed_bpe = cpi->clip_bpe;
+
    cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in;
 }

@@ -1250,7 +1375,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    // what level of boost is appropriate for the GF or ARF that will be coded with the group
    i = 0;

-    while (((i < cpi->max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key))
+    while (((i < cpi->static_scene_max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key))
    {
        double r;
        double this_frame_mvr_ratio;
@@ -1308,6 +1433,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        // Underlying boost factor is based on inter intra error ratio
        r = (boost_factor * (next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)));

+        if (next_frame.intra_error > cpi->gf_intra_err_min)
+            r = (IIKFACTOR2 * next_frame.intra_error /
+                     DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+        else
+            r = (IIKFACTOR2 * cpi->gf_intra_err_min /
+                     DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+
        // Increase boost for frames where new data coming into frame (eg zoom out)
        // Slightly reduce boost if there is a net balance of motion out of the frame (zoom in)
        // The range for this_frame_mv_in_out is -1.0 to +1.0
@@ -1353,18 +1485,20 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        boost_score += (decay_accumulator * r);

        // Break out conditions.
-        if (   /* i>4 || */
+        if  (   /* i>4 || */
+            // Break at cpi->max_gf_interval unless almost totally static
+            (i >= cpi->max_gf_interval && (loop_decay_rate < 0.99)) ||
            (
-                (i > MIN_GF_INTERVAL) &&                            // Dont break out with a very short interval
-                ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) &&      // Dont break out very close to a key frame
+                // Don't break out with a very short interval
+                (i > MIN_GF_INTERVAL) &&
+                // Don't break out very close to a key frame
+                ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) &&
                ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
                ((mv_ratio_accumulator > 100.0) ||
                 (abs_mv_in_out_accumulator > 3.0) ||
                 (mv_in_out_accumulator < -2.0) ||
-                 ((boost_score - old_boost_score) < 2.0)
-                )
-            )
-        )
+                 ((boost_score - old_boost_score) < 2.0))
+            ) )
        {
            boost_score = old_boost_score;
            break;
@@ -1439,7 +1573,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

        // Boost for arf frame
        Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
-        Boost += (cpi->baseline_gf_interval * 50);
+        Boost += (i * 50);
        allocation_chunks = (i * 100) + Boost;

        // Normalize Altboost and allocations chunck down to prevent overflow
@@ -1585,6 +1719,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    // Reset the file position
    reset_fpf_position(cpi, start_pos);

+    // Update the record of error used so far (only done once per gf group)
+    cpi->modified_error_used += gf_group_err;
+
    // Assign  bits to the arf or gf.
    {
        int Boost;
@@ -1738,17 +1875,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

        vp8_avg_stats(&sectionstats);

-        if (sectionstats.pcnt_motion < .17)
-            cpi->section_is_low_motion = 1;
-        else
-            cpi->section_is_low_motion = 0;
-
-        if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
-            cpi->section_is_fast_motion = 1;
-        else
-            cpi->section_is_fast_motion = 0;
-
-        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+        cpi->section_intra_rating =
+            sectionstats.intra_error /
+            DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);

        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
        //if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
@@ -1892,6 +2021,16 @@ void vp8_second_pass(VP8_COMP *cpi)
    // Is this a GF / ARF (Note that a KF is always also a GF)
    if (cpi->frames_till_gf_update_due == 0)
    {
+        // Update monitor of the bits per error observed so far.
+        // Done once per gf group based on what has gone before
+        // so do nothing if this is the first frame.
+        if (cpi->common.current_video_frame > 0)
+        {
+            cpi->observed_bpe =
+                (double)(cpi->clip_bits_total - cpi->bits_left) /
+                cpi->modified_error_used;
+        }
+
        // Define next gf group and assign bits to it
        vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
        define_gf_group(cpi, &this_frame_copy);
@@ -1965,22 +2104,56 @@ void vp8_second_pass(VP8_COMP *cpi)

    if (cpi->common.current_video_frame == 0)
    {
-        // guess at 2nd pass q
        cpi->est_max_qcorrection_factor = 1.0;
-        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);

-        if (tmp_q < cpi->worst_quality)
+        // Experimental code to try and set a cq_level in constrained
+        // quality mode.
+        if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
        {
-            cpi->active_worst_quality         = tmp_q;
-            cpi->ni_av_qi                     = tmp_q;
-        }
-        else
-        {
-            cpi->active_worst_quality         = cpi->worst_quality;
-            cpi->ni_av_qi                     = cpi->worst_quality;
+            int est_cq;
+
+            est_cq =
+                estimate_cq( cpi,
+                             (cpi->total_coded_error_left / frames_left),
+                             (int)(cpi->bits_left / frames_left),
+                             cpi->common.Height, cpi->common.Width);
+
+            cpi->cq_target_quality = cpi->oxcf.cq_level;
+            if ( est_cq > cpi->cq_target_quality )
+                cpi->cq_target_quality = est_cq;
        }
+
+        // guess at maxq needed in 2nd pass
+        cpi->maxq_max_limit = cpi->worst_quality;
+        cpi->maxq_min_limit = cpi->best_quality;
+        tmp_q = estimate_max_q( cpi,
+                                (cpi->total_coded_error_left / frames_left),
+                                (int)(cpi->bits_left / frames_left),
+                                cpi->common.Height,
+                                cpi->common.Width);
+
+        // Limit the maxq value returned subsequently.
+        // This increases the risk of overspend or underspend if the initial
+        // estimate for the clip is bad, but helps prevent excessive
+        // variation in Q, especially near the end of a clip
+        // where for example a small overspend may cause Q to crash
+        cpi->maxq_max_limit = ((tmp_q + 32) < cpi->worst_quality)
+                                  ? (tmp_q + 32) : cpi->worst_quality;
+        cpi->maxq_min_limit = ((tmp_q - 32) > cpi->best_quality)
+                                  ? (tmp_q - 32) : cpi->best_quality;
+
+        cpi->active_worst_quality         = tmp_q;
+        cpi->ni_av_qi                     = tmp_q;
    }
-    else
+
+    // The last few frames of a clip almost always have too few or too many
+    // bits and for the sake of over exact rate control we don't want to make
+    // radical adjustments to the allowed quantizer range just to use up a
+    // few surplus bits or get beneath the target rate.
+    else if ( (cpi->common.current_video_frame <
+                  (((unsigned int)cpi->total_stats->count * 255)>>8)) &&
+              ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+                  (unsigned int)cpi->total_stats->count) )
    {
        if (frames_left < 1)
            frames_left = 1;
@@ -1994,13 +2167,6 @@ void vp8_second_pass(VP8_COMP *cpi)
            cpi->active_worst_quality --;

        cpi->active_worst_quality = ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4;
-
-        // Clamp to user set limits
-        if (cpi->active_worst_quality > cpi->worst_quality)
-            cpi->active_worst_quality = cpi->worst_quality;
-        else if (cpi->active_worst_quality < cpi->best_quality)
-            cpi->active_worst_quality = cpi->best_quality;
-
    }

    cpi->frames_to_key --;
@@ -2122,6 +2288,9 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

    cpi->common.frame_type = KEY_FRAME;

+    // is this a forced key frame by interval
+    cpi->this_key_frame_forced = cpi->next_key_frame_forced;
+
    // Clear the alt ref active flag as this can never be active on a key frame
    cpi->source_alt_ref_active = FALSE;

@@ -2184,7 +2353,11 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        kf_group_err /= 2.0;
        kf_group_intra_err /= 2.0;
        kf_group_coded_err /= 2.0;
+
+        cpi->next_key_frame_forced = TRUE;
    }
+    else
+        cpi->next_key_frame_forced = FALSE;

    // Special case for the last frame of the file
    if (cpi->stats_in >= cpi->stats_in_end)
@@ -2199,7 +2372,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    }

    // Calculate the number of bits that should be assigned to the kf group.
-    if ((cpi->bits_left > 0) && ((int)cpi->modified_total_error_left > 0))
+    if ((cpi->bits_left > 0) && ((int)cpi->modified_error_left > 0))
    {
        // Max for a single normal frame (not key frame)
        int max_bits = frame_max_bits(cpi);
@@ -2211,7 +2384,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        // complexity of the section
        cpi->kf_group_bits = (long long)( cpi->bits_left *
                                          ( kf_group_err /
-                                            cpi->modified_total_error_left ));
+                                            cpi->modified_error_left ));

        // Clip based on maximum per frame rate defined by the user.
        max_grp_bits = (long long)max_bits * (long long)cpi->frames_to_key;
@@ -2283,7 +2456,12 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        if (EOF == vp8_input_stats(cpi, &next_frame))
            break;

-        r = (IIKFACTOR2 * next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)) ;
+        if (next_frame.intra_error > cpi->kf_intra_err_min)
+            r = (IIKFACTOR2 * next_frame.intra_error /
+                     DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+        else
+            r = (IIKFACTOR2 * cpi->kf_intra_err_min /
+                     DOUBLE_DIVIDE_CHECK(next_frame.coded_error));

        if (r > RMAX)
            r = RMAX;
@@ -2344,17 +2522,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

        vp8_avg_stats(&sectionstats);

-        if (sectionstats.pcnt_motion < .17)
-            cpi->section_is_low_motion = 1;
-        else
-            cpi->section_is_low_motion = 0;
-
-        if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
-            cpi->section_is_fast_motion = 1;
-        else
-            cpi->section_is_fast_motion = 0;
-
-        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+         cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);

        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
        // if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
@@ -2434,7 +2602,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        kf_boost = (int)((double)kf_boost * 100.0) >> 4;                          // Scale 16 to 100

        // Adjustment to boost based on recent average q
-        kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;
+        //kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;

        if (kf_boost < 250)                                                      // Min KF boost
            kf_boost = 250;
@@ -2474,7 +2642,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            double  alt_kf_grp_bits =
                        ((double)cpi->bits_left *
                         (kf_mod_err * (double)cpi->frames_to_key) /
-                         DOUBLE_DIVIDE_CHECK(cpi->modified_total_error_left));
+                         DOUBLE_DIVIDE_CHECK(cpi->modified_error_left));

            alt_kf_bits = (int)((double)kf_boost *
                                (alt_kf_grp_bits / (double)allocation_chunks));
@@ -2492,7 +2660,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            alt_kf_bits =
                (int)((double)cpi->bits_left *
                      (kf_mod_err /
-                       DOUBLE_DIVIDE_CHECK(cpi->modified_total_error_left)));
+                       DOUBLE_DIVIDE_CHECK(cpi->modified_error_left)));

            if (alt_kf_bits > cpi->kf_bits)
            {
@@ -2512,7 +2680,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

    // Adjust the count of total modified error left.
    // The count of bits left is adjusted elsewhere based on real coded frame sizes
-    cpi->modified_total_error_left -= kf_group_err;
+    cpi->modified_error_left -= kf_group_err;

    if (cpi->oxcf.allow_spatial_resampling)
    {
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -40,6 +40,12 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
    cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_c;
    cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_c;

+    cpi->rtcd.variance.sad16x16x8            = vp8_sad16x16x8_c;
+    cpi->rtcd.variance.sad16x8x8             = vp8_sad16x8x8_c;
+    cpi->rtcd.variance.sad8x16x8             = vp8_sad8x16x8_c;
+    cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_c;
+    cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_c;
+
    cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_c;
    cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_c;
    cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_c;
@@ -85,9 +91,12 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)

    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
-
+#if !(CONFIG_REALTIME_ONLY)
    cpi->rtcd.search.full_search             = vp8_full_search_sad;
+#endif
    cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;
+
+    cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_c;
 #endif

    // Pure C:
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -408,6 +408,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
        diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
        break;
    case 3:
+    default:
        this_mv.col += 4;
        this_mv.row += 4;
        diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
@@ -913,7 +914,8 @@ int vp8_diamond_search_sad
    int *num00,
    vp8_variance_fn_ptr_t *fn_ptr,
    int *mvsadcost[2],
-    int *mvcost[2]
+    int *mvcost[2],
+    MV *center_mv
 )
 {
    int i, j, step;
@@ -940,6 +942,8 @@ int vp8_diamond_search_sad
    unsigned char *check_here;
    int thissad;

+    *num00 = 0;
+
    // Work out the start point for the search
    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
    best_address = in_what;
@@ -949,7 +953,7 @@ int vp8_diamond_search_sad
    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
    {
        // Check the starting position
-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
    }

    // search_param determines the length of the initial step and hence the number of iterations
@@ -961,8 +965,6 @@ int vp8_diamond_search_sad
    best_mv->row = ref_row;
    best_mv->col = ref_col;

-    *num00 = 0;
-
    for (step = 0; step < tot_steps ; step++)
    {
        for (j = 0 ; j < x->searches_per_step ; j++)
@@ -982,7 +984,7 @@ int vp8_diamond_search_sad
                {
                    this_mv.row = this_row_offset << 3;
                    this_mv.col = this_col_offset << 3;
-                    thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                    thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

                    if (thissad < bestsad)
                    {
@@ -1013,7 +1015,7 @@ int vp8_diamond_search_sad
        return INT_MAX;

    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-    + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
 }

 int vp8_diamond_search_sadx4
@@ -1028,7 +1030,8 @@ int vp8_diamond_search_sadx4
    int *num00,
    vp8_variance_fn_ptr_t *fn_ptr,
    int *mvsadcost[2],
-    int *mvcost[2]
+    int *mvcost[2],
+    MV *center_mv
 )
 {
    int i, j, step;
@@ -1055,6 +1058,8 @@ int vp8_diamond_search_sadx4
    unsigned char *check_here;
    unsigned int thissad;

+    *num00 = 0;
+
    // Work out the start point for the search
    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
    best_address = in_what;
@@ -1064,7 +1069,7 @@ int vp8_diamond_search_sadx4
    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
    {
        // Check the starting position
-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
    }

    // search_param determines the length of the initial step and hence the number of iterations
@@ -1076,8 +1081,6 @@ int vp8_diamond_search_sadx4
    best_mv->row = ref_row;
    best_mv->col = ref_col;

-    *num00 = 0;
-
    for (step = 0; step < tot_steps ; step++)
    {
        int all_in = 1, t;
@@ -1108,7 +1111,7 @@ int vp8_diamond_search_sadx4
                    {
                        this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
                        this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
-                        sad_array[t] += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                        sad_array[t] += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

                        if (sad_array[t] < bestsad)
                        {
@@ -1137,7 +1140,7 @@ int vp8_diamond_search_sadx4
                    {
                        this_mv.row = this_row_offset << 3;
                        this_mv.col = this_col_offset << 3;
-                        thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                        thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

                        if (thissad < bestsad)
                        {
@@ -1168,12 +1171,12 @@ int vp8_diamond_search_sadx4
        return INT_MAX;

    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-    + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
 }


 #if !(CONFIG_REALTIME_ONLY)
-int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
 {
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
@@ -1211,7 +1214,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
        // Baseline value at the centre

        //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
    }

    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1239,7 +1242,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
            this_mv.col = c << 3;
            //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
            //thissad  += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
-            thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+            thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);

            if (thissad < bestsad)
            {
@@ -1258,12 +1261,12 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro

    if (bestsad < INT_MAX)
        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
    else
        return INT_MAX;
 }

-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
 {
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
@@ -1301,7 +1304,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
    {
        // Baseline value at the centre
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
    }

    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1323,7 +1326,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
        check_here = r * mv_stride + in_what + col_min;
        c = col_min;

-        while ((c + 3) < col_max)
+        while ((c + 2) < col_max)
        {
            int i;

@@ -1336,7 +1339,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
                if (thissad < bestsad)
                {
                    this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

                    if (thissad < bestsad)
                    {
@@ -1359,7 +1362,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
            if (thissad < bestsad)
            {
                this_mv.col = c << 3;
-                thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);

                if (thissad < bestsad)
                {
@@ -1381,12 +1384,163 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er

    if (bestsad < INT_MAX)
        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
    else
        return INT_MAX;
 }
-#endif

+// Exhaustive SAD search of a +/-distance window centred on ref_mv (MV cost measured against center_mv), stepping 8, then 3, then 1 column at a time.
+int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
+{
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int in_what_stride = d->pre_stride;
+    int mv_stride = d->pre_stride;
+    unsigned char *bestaddress;
+    MV *best_mv = &d->bmi.mv.as_mv;   // the winning vector is written straight into the block's MV
+    MV this_mv;
+    int bestsad = INT_MAX;
+    int r, c;
+    unsigned char *check_here;
+    unsigned int thissad;
+    // Search centre in buffer (whole-pel) units: ref_mv with its low 3 bits dropped
+    int ref_row = ref_mv->row >> 3;
+    int ref_col = ref_mv->col >> 3;
+    // Candidate window of +/- distance around the centre (clamped to the MV limits further down)
+    int row_min = ref_row - distance;
+    int row_max = ref_row + distance;
+    int col_min = ref_col - distance;
+    int col_max = ref_col + distance;
+    // Result buffers: 8 SADs filled by sdx8f, 3 SADs filled by sdx3f
+    unsigned short sad_array8[8];
+    unsigned int sad_array[3];
+
+    // Work out the mid point for the search
+    in_what = *(d->base_pre) + d->pre;
+    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+    best_mv->row = ref_row;
+    best_mv->col = ref_col;
+
+    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+    {
+        // Baseline value at the centre
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+    }
+
+    // Apply further limits to prevent us using vectors that stretch beyond the UMV border
+    if (col_min < x->mv_col_min)
+        col_min = x->mv_col_min;
+
+    if (col_max > x->mv_col_max)
+        col_max = x->mv_col_max;
+
+    if (row_min < x->mv_row_min)
+        row_min = x->mv_row_min;
+
+    if (row_max > x->mv_row_max)
+        row_max = x->mv_row_max;
+    // Scan the window row by row; row_max and col_max themselves are not tested
+    for (r = row_min; r < row_max ; r++)
+    {
+        this_mv.row = r << 3;
+        check_here = r * mv_stride + in_what + col_min;
+        c = col_min;
+        // Eight columns per sdx8f call while at least 8 remain before col_max
+        while ((c + 7) < col_max)
+        {
+            int i;
+
+            fn_ptr->sdx8f(what, what_stride, check_here , in_what_stride, sad_array8);
+
+            for (i = 0; i < 8; i++)
+            {
+                thissad = (unsigned int)sad_array8[i];
+                // Only add the MV cost when the raw SAD already beats the best so far
+                if (thissad < bestsad)
+                {
+                    this_mv.col = c << 3;
+                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_mv->row = r;
+                        best_mv->col = c;
+                        bestaddress = check_here;
+                    }
+                }
+
+                check_here++;
+                c++;
+            }
+        }
+        // Then three columns per sdx3f call while at least 3 remain
+        while ((c + 2) < col_max)
+        {
+            int i;
+
+            fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
+
+            for (i = 0; i < 3; i++)
+            {
+                thissad = sad_array[i];
+
+                if (thissad < bestsad)
+                {
+                    this_mv.col = c << 3;
+                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_mv->row = r;
+                        best_mv->col = c;
+                        bestaddress = check_here;
+                    }
+                }
+
+                check_here++;
+                c++;
+            }
+        }
+        // Finally the remaining columns of the row, one sdf call each
+        while (c < col_max)
+        {
+            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+            // NOTE(review): bestsad is passed as sdf's last argument, presumably an early-out limit - confirm
+            if (thissad < bestsad)
+            {
+                this_mv.col = c << 3;
+                thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+
+                if (thissad < bestsad)
+                {
+                    bestsad = thissad;
+                    best_mv->row = r;
+                    best_mv->col = c;
+                    bestaddress = check_here;
+                }
+            }
+
+            check_here ++;
+            c ++;
+        }
+    }
+    // Shift the best vector back up by 3 bits before costing it against center_mv
+    this_mv.row = best_mv->row << 3;
+    this_mv.col = best_mv->col << 3;
+    // Return vf() at the best position plus its MV cost; INT_MAX if no candidate was ever accepted
+    if (bestsad < INT_MAX)
+        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+    else
+        return INT_MAX;
+}
+#endif /* !(CONFIG_REALTIME_ONLY) */

 #ifdef ENTROPY_STATS
 void print_mode_context(void)
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -25,7 +25,6 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
 #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS+3)) - 8)    // Max full pel mv specified in 1/8 pel units
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units

-
 extern void print_mode_context(void);
 extern int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight);
 extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
@@ -67,7 +66,8 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
     int distance, \
     vp8_variance_fn_ptr_t *fn_ptr, \
     int *mvcost[2], \
-     int *mvsadcost[2] \
+     int *mvsadcost[2], \
+     MV *center_mv \
    )

 #define prototype_diamond_search_sad(sym)\
@@ -83,7 +83,8 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
     int *num00, \
     vp8_variance_fn_ptr_t *fn_ptr, \
     int *mvsadcost[2], \
-     int *mvcost[2] \
+     int *mvcost[2], \
+     MV *center_mv \
    )

 #if ARCH_X86 || ARCH_X86_64
@@ -93,6 +94,7 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
 typedef prototype_full_search_sad(*vp8_full_search_fn_t);
 extern prototype_full_search_sad(vp8_full_search_sad);
 extern prototype_full_search_sad(vp8_full_search_sadx3);
+extern prototype_full_search_sad(vp8_full_search_sadx8);

 typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
 extern prototype_diamond_search_sad(vp8_diamond_search_sad);
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -18,7 +18,6 @@
 #include "treewriter.h"
 #include "tokenize.h"
 #include "onyxc_int.h"
-#include "preproc.h"
 #include "variance.h"
 #include "dct.h"
 #include "encodemb.h"
@@ -28,6 +27,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "mcomp.h"
+#include "temporal_filter.h"

 //#define SPEEDSTATS 1
 #define MIN_GF_INTERVAL             4
@@ -46,6 +46,8 @@
 #define MAX_THRESHMULT  512

 #define GF_ZEROMV_ZBIN_BOOST 24
+#define LF_ZEROMV_ZBIN_BOOST 12
+#define MV_ZBIN_BOOST        4
 #define ZBIN_OQ_MAX 192

 #define VP8_TEMPORAL_ALT_REF 1
@@ -180,6 +182,9 @@ typedef struct
    int first_step;
    int optimize_coefficients;

+    int use_fastquant_for_pick;
+    int no_skip_block4x4_search;
+
 } SPEED_FEATURES;

 typedef struct
@@ -227,6 +232,7 @@ typedef struct VP8_ENCODER_RTCD
    vp8_encodemb_rtcd_vtable_t  encodemb;
    vp8_quantize_rtcd_vtable_t  quantize;
    vp8_search_rtcd_vtable_t    search;
+    vp8_temporal_rtcd_vtable_t  temporal;
 } VP8_ENCODER_RTCD;

 enum
@@ -239,6 +245,12 @@ enum
    BLOCK_MAX_SEGMENTS
 };

+typedef union
+{
+    unsigned int as_int;   /* raw integer view of the same storage (assumes MV fits in an unsigned int - TODO confirm) */
+    MV           as_mv;    /* motion vector view */
+} int_mv;        /* facilitates rapid equality tests */
+
 typedef struct
 {

@@ -260,6 +272,9 @@ typedef struct
    DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]);


    MACROBLOCK mb;
@@ -276,14 +291,14 @@ typedef struct
    unsigned int source_frame_flags;
    YV12_BUFFER_CONFIG scaled_source;

-    int source_buffer_count;
-    int source_encode_index;
-    int source_alt_ref_pending;
-    int source_alt_ref_active;
+    int source_buffer_count;    // number of src_buffers in use for lagged encoding
+    int source_encode_index;    // index of buffer in src_buffer to encode
+    int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
+    int source_alt_ref_active;  // an alt ref frame has been encoded and is usable

-    int last_alt_ref_sei;
-    int is_src_frame_alt_ref;
-    int is_next_src_alt_ref;
+    int last_alt_ref_sei;       // index into src_buffers of frame used as alt reference
+    int is_src_frame_alt_ref;   // source of frame to encode is an exact copy of an alt ref frame
+    int is_next_src_alt_ref;    // source of next frame to encode is an exact copy of an alt ref frame

    int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
    int alt_is_last;  // Alt reference frame same as last ( short circuit altref search)
@@ -302,7 +317,11 @@ typedef struct

    unsigned int frames_since_key;
    unsigned int key_frame_frequency;
-    unsigned int next_key;
+    unsigned int this_key_frame_forced;
+    unsigned int next_key_frame_forced;
+
+    // Ambient reconstruction error target for forced key frames
+    int ambient_err;

    unsigned int mode_check_freq[MAX_MODES];
    unsigned int mode_test_hit_counts[MAX_MODES];
@@ -319,6 +338,7 @@ typedef struct
    int mvcostmultiplier;
    int subseqblockweight;
    int errthresh;
+    unsigned int activity_avg;

    int RDMULT;
    int RDDIV ;
@@ -350,7 +370,6 @@ typedef struct
    int this_frame_target;
    int projected_frame_size;
    int last_q[2];                   // Separate values for Intra/Inter
-    int target_bits_per_mb;

    double rate_correction_factor;
    double key_frame_rate_correction_factor;
@@ -383,6 +402,7 @@ typedef struct
    int kf_overspend_bits;            // Extra bits spent on key frames that need to be recovered on inter frames
    int kf_bitrate_adjustment;        // Current number of bit s to try and recover on each inter frame.
    int max_gf_interval;
+    int static_scene_max_gf_interval;
    int baseline_gf_interval;
    int gf_decay_rate;
    int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
@@ -399,6 +419,7 @@ typedef struct
    int inter_frame_target;
    double output_frame_rate;
    long long last_time_stamp_seen;
+    long long last_end_time_stamp_seen;
    long long first_time_stamp_ever;

    int ni_av_qi;
@@ -431,6 +452,10 @@ typedef struct
    int best_quality;
    int active_best_quality;

+    int cq_target_quality;
+    int maxq_max_limit;
+    int maxq_min_limit;
+
    int drop_frames_allowed;          // Are we permitted to drop frames?
    int drop_frame;                  // Drop this frame?
    int drop_count;                  // How many frames have we dropped?
@@ -454,8 +479,6 @@ typedef struct
    unsigned char *output_partition2;
    size_t output_partition2size;

-    pre_proc_instance ppi;
-
    int frames_to_key;
    int gfu_boost;
    int kf_boost;
@@ -465,12 +488,20 @@ typedef struct
    double total_coded_error_left;
    double start_tot_err_left;
    double min_error;
+    double kf_intra_err_min;
+    double gf_intra_err_min;
+
+    double modified_error_total;
+    double modified_error_used;
+    double modified_error_left;
+    double clip_bpe;
+    double observed_bpe;

-    double modified_total_error_left;
    double avg_iiratio;

    int target_bandwidth;
    long long bits_left;
+    long long clip_bits_total;
    FIRSTPASS_STATS *total_stats;
    FIRSTPASS_STATS *this_frame_stats;
    FIRSTPASS_STATS *stats_in, *stats_in_end;
@@ -611,9 +642,6 @@ typedef struct
    unsigned int tempdata2;

    int base_skip_false_prob[128];
-    unsigned int section_is_low_motion;
-    unsigned int section_benefits_from_aggresive_q;
-    unsigned int section_is_fast_motion;
    unsigned int section_intra_rating;

    double section_max_qfactor;
@@ -661,6 +689,10 @@ typedef struct
    unsigned char *gf_active_flags;   // Record of which MBs still refer to last golden frame either directly or through 0,0
    int gf_active_count;

+    //Store last frame's MV info for next frame MV prediction
+    int_mv *lfmv;
+    int *lf_ref_frame_sign_bias;
+    int *lf_ref_frame;

 } VP8_COMP;

@@ -670,6 +702,8 @@ void vp8_encode_frame(VP8_COMP *cpi);

 void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size);

+unsigned int vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
+
 int rd_cost_intra_mb(MACROBLOCKD *x);

 void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **);
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -608,8 +608,10 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
            memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
        }

-        //Only consider ZEROMV/ALTREF_FRAME for alt ref frame.
-        if (cpi->is_src_frame_alt_ref)
+        // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+        // unless ARNR filtering is enabled in which case we want
+        // an unfiltered alternative
+        if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
        {
            if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
                continue;
@@ -685,7 +687,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
 #if 0

            // Initial step Search
-            bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost);
+            bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost, &best_ref_mv1);
            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;

@@ -698,7 +700,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                    num00--;
                else
                {
-                    thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost);
+                    thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost, &best_ref_mv1);

                    if (thissme < bestsme)
                    {
@@ -724,7 +726,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
            }
            else
            {
-                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb < 9
+                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb < 9
                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;

@@ -743,7 +745,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                        num00--;
                    else
                    {
-                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb = 9
+                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb = 9

                        if (thissme < bestsme)
                        {
--- a/vp8/encoder/preproc.c
+++ b/vp8/encoder/preproc.c
@@ -1,251 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     preproc.c
-*
-*   Description  :     Simple pre-processor.
-*
-****************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-
-#include "memory.h"
-#include "preproc7.h"
-#include "vpx_mem/vpx_mem.h"
-
-/****************************************************************************
-*  Macros
-****************************************************************************/
-#define FRAMECOUNT 7
-#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
-
-/****************************************************************************
-*  Imports
-****************************************************************************/
-extern void vp8_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
-
-/****************************************************************************
-*  Exported Global Variables
-****************************************************************************/
-void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
-void temp_filter_mmx
-(
-    pre_proc_instance *ppi,
-    unsigned char *s,
-    unsigned char *d,
-    int bytes,
-    int strength
-);
-void temp_filter_wmt
-(
-    pre_proc_instance *ppi,
-    unsigned char *s,
-    unsigned char *d,
-    int bytes,
-    int strength
-);
-
-/****************************************************************************
- *
- *  ROUTINE       : temp_filter_c
- *
- *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
- *                  unsigned char *s     : Pointer to source frame.
- *                  unsigned char *d     : Pointer to destination frame.
- *                  int bytes            : Number of bytes to filter.
- *                  int strength         : Strength of filter to apply.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs a closesness adjusted temporarl blur
- *
- *  SPECIAL NOTES : Destination frame can be same as source frame.
- *
- ****************************************************************************/
-void temp_filter_c
-(
-    pre_proc_instance *ppi,
-    unsigned char *s,
-    unsigned char *d,
-    int bytes,
-    int strength
-)
-{
-    int byte = 0;
-    unsigned char *frameptr = ppi->frame_buffer;
-
-    if (ppi->frame == 0)
-    {
-        do
-        {
-            int frame = 0;
-
-            do
-            {
-                *frameptr = s[byte];
-                ++frameptr;
-                ++frame;
-            }
-            while (frame < FRAMECOUNT);
-
-            d[byte] = s[byte];
-
-            ++byte;
-        }
-        while (byte < bytes);
-    }
-    else
-    {
-        int modifier;
-        int offset = (ppi->frame % FRAMECOUNT);
-
-        do
-        {
-            int accumulator = 0;
-            int count = 0;
-            int frame = 0;
-
-            frameptr[offset] = s[byte];
-
-            do
-            {
-                int pixel_value = *frameptr;
-
-                modifier   = s[byte];
-                modifier  -= pixel_value;
-                modifier  *= modifier;
-                modifier >>= strength;
-                modifier  *= 3;
-
-                if (modifier > 16)
-                    modifier = 16;
-
-                modifier = 16 - modifier;
-
-                accumulator += modifier * pixel_value;
-
-                count += modifier;
-
-                frameptr++;
-
-                ++frame;
-            }
-            while (frame < FRAMECOUNT);
-
-            accumulator += (count >> 1);
-            accumulator *= ppi->fixed_divide[count];
-            accumulator >>= 16;
-
-            d[byte] = accumulator;
-
-            ++byte;
-        }
-        while (byte < bytes);
-    }
-
-    ++ppi->frame;
-}
-/****************************************************************************
- *
- *  ROUTINE       : delete_pre_proc
- *
- *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Deletes a pre-processing instance.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void delete_pre_proc(pre_proc_instance *ppi)
-{
-    if (ppi->frame_buffer_alloc)
-        vpx_free(ppi->frame_buffer_alloc);
-
-    ppi->frame_buffer_alloc = 0;
-    ppi->frame_buffer      = 0;
-
-    if (ppi->fixed_divide_alloc)
-        vpx_free(ppi->fixed_divide_alloc);
-
-    ppi->fixed_divide_alloc = 0;
-    ppi->fixed_divide      = 0;
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : init_pre_proc
- *
- *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
- *                  int frame_size        : Number of bytes in one frame.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : int: 1 if successful, 0 if failed.
- *
- *  FUNCTION      : Initializes prepprocessor instance.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-int init_pre_proc7(pre_proc_instance *ppi, int frame_size)
-{
-    int i;
-    int mmx_enabled;
-    int xmm_enabled;
-    int wmt_enabled;
-
-    vp8_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
-
-    if (wmt_enabled)
-        temp_filter = temp_filter_wmt;
-    else if (mmx_enabled)
-        temp_filter = temp_filter_mmx;
-    else
-        temp_filter = temp_filter_c;
-
-
-    delete_pre_proc(ppi);
-
-    ppi->frame_buffer_alloc = vpx_malloc(32 + frame_size * FRAMECOUNT * sizeof(unsigned char));
-
-    if (!ppi->frame_buffer_alloc)
-    {
-        delete_pre_proc(ppi);
-        return 0;
-    }
-
-    ppi->frame_buffer = (unsigned char *) ROUNDUP32(ppi->frame_buffer_alloc);
-
-    ppi->fixed_divide_alloc = vpx_malloc(32 + 255 * sizeof(unsigned int));
-
-    if (!ppi->fixed_divide_alloc)
-    {
-        delete_pre_proc(ppi);
-        return 0;
-    }
-
-    ppi->fixed_divide = (unsigned int *) ROUNDUP32(ppi->fixed_divide_alloc);
-
-    for (i = 1; i < 255; i++)
-        ppi->fixed_divide[i] = 0x10000 / i;
-
-    return 1;
-}
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -16,8 +16,9 @@
 #include "entropy.h"
 #include "predictdc.h"

-//#define EXACT_QUANT
-#ifdef EXACT_QUANT
+#define EXACT_QUANT
+
+#ifdef EXACT_FASTQUANT
 void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 {
    int i, rc, eob;
@@ -26,7 +27,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
    short *coeff_ptr       = b->coeff;
    short *zbin_ptr        = b->zbin;
    short *round_ptr       = b->round;
-    short *quant_ptr       = b->quant;
+    short *quant_ptr       = b->quant_fast;
    short *quant_shift_ptr = b->quant_shift;
    short *qcoeff_ptr      = d->qcoeff;
    short *dqcoeff_ptr     = d->dqcoeff;
@@ -64,6 +65,44 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
    d->eob = eob + 1;
 }

+#else
+
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+    int i, rc, eob;
+    int x, y, z, sz;
+    short *coeff_ptr   = b->coeff;
+    short *round_ptr   = b->round;
+    short *quant_ptr   = b->quant_fast;
+    short *qcoeff_ptr  = d->qcoeff;
+    short *dqcoeff_ptr = d->dqcoeff;
+    short *dequant_ptr = d->dequant;
+
+    eob = -1;
+    for (i = 0; i < 16; i++)
+    {
+        rc   = vp8_default_zig_zag1d[i];
+        z    = coeff_ptr[rc];
+
+        sz = (z >> 31);                                 // sign of z
+        x  = (z ^ sz) - sz;                             // x = abs(z)
+
+        y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+        x  = (y ^ sz) - sz;                         // get the sign back
+        qcoeff_ptr[rc] = x;                          // write to destination
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+
+        if (y)
+        {
+            eob = i;                                // last nonzero coeffs
+        }
+    }
+    d->eob = eob + 1;
+}
+
+#endif
+
+#ifdef EXACT_QUANT
 void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
 {
    int i, rc, eob;
@@ -178,39 +217,6 @@ void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d)
 }

 #else
-void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-{
-    int i, rc, eob;
-    int zbin;
-    int x, y, z, sz;
-    short *coeff_ptr   = b->coeff;
-    short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant;
-    short *qcoeff_ptr  = d->qcoeff;
-    short *dqcoeff_ptr = d->dqcoeff;
-    short *dequant_ptr = d->dequant;
-
-    eob = -1;
-    for (i = 0; i < 16; i++)
-    {
-        rc   = vp8_default_zig_zag1d[i];
-        z    = coeff_ptr[rc];
-
-        sz = (z >> 31);                                 // sign of z
-        x  = (z ^ sz) - sz;                             // x = abs(z)
-
-        y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc] = x;                          // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
-
-        if (y)
-        {
-            eob = i;                                // last nonzero coeffs
-        }
-    }
-    d->eob = eob + 1;
-}

 void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
 {
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -45,46 +45,48 @@ extern int inter_b_modes[10];
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS    9

+// Work in progress recalibration of baseline rate tables based on
+// the assumption that bits per mb is inversely proportional to the
+// quantizer value.
 const int vp8_bits_per_mb[2][QINDEX_RANGE] =
 {
-    // (Updated 19 March 08) Baseline estimate of INTRA-frame Bits Per MB at each Q:
+    // Intra case 450000/Qintra
    {
-        674781, 606845, 553905, 524293, 500428, 452540, 435379, 414719,
-        390970, 371082, 359416, 341807, 336957, 317263, 303724, 298402,
-        285688, 275237, 268455, 262560, 256038, 248734, 241087, 237615,
-        229247, 225211, 219112, 213920, 211559, 202714, 198482, 193401,
-        187866, 183453, 179212, 175965, 171852, 167235, 163972, 160560,
-        156032, 154349, 151390, 148725, 145708, 142311, 139981, 137700,
-        134084, 131863, 129746, 128498, 126077, 123461, 121290, 117782,
-        114883, 112332, 108410, 105685, 103434, 101192,  98587,  95959,
-        94059,  92017,  89970,  87936,  86142,  84801,  82736,  81106,
-        79668,  78135,  76641,  75103,  73943,  72693,  71401,  70098,
-        69165,  67901,  67170,  65987,  64923,  63534,  62378,  61302,
-        59921,  58941,  57844,  56782,  55960,  54973,  54257,  53454,
-        52230,  50938,  49962,  49190,  48288,  47270,  46738,  46037,
-        45020,  44027,  43216,  42287,  41594,  40702,  40081,  39414,
-        38282,  37627,  36987,  36375,  35808,  35236,  34710,  34162,
-        33659,  33327,  32751,  32384,  31936,  31461,  30982,  30582,
+        1125000,900000, 750000, 642857, 562500, 500000, 450000, 450000,
+        409090, 375000, 346153, 321428, 300000, 281250, 264705, 264705,
+        250000, 236842, 225000, 225000, 214285, 214285, 204545, 204545,
+        195652, 195652, 187500, 180000, 180000, 173076, 166666, 160714,
+        155172, 150000, 145161, 140625, 136363, 132352, 128571, 125000,
+        121621, 121621, 118421, 115384, 112500, 109756, 107142, 104651,
+        102272, 100000, 97826,  97826,  95744,  93750,  91836,  90000,
+        88235,  86538,  84905,  83333,  81818,  80357,  78947,  77586,
+        76271,  75000,  73770,  72580,  71428,  70312,  69230,  68181,
+        67164,  66176,  65217,  64285,  63380,  62500,  61643,  60810,
+        60000,  59210,  59210,  58441,  57692,  56962,  56250,  55555,
+        54878,  54216,  53571,  52941,  52325,  51724,  51136,  50561,
+        49450,  48387,  47368,  46875,  45918,  45000,  44554,  44117,
+        43269,  42452,  41666,  40909,  40178,  39473,  38793,  38135,
+        36885,  36290,  35714,  35156,  34615,  34090,  33582,  33088,
+        32608,  32142,  31468,  31034,  30405,  29801,  29220,  28662,
    },
-
-    // (Updated 19 March 08) Baseline estimate of INTER-frame Bits Per MB at each Q:
+    // Inter case 285000/Qinter
    {
-        497401, 426316, 372064, 352732, 335763, 283921, 273848, 253321,
-        233181, 217727, 210030, 196685, 194836, 178396, 167753, 164116,
-        154119, 146929, 142254, 138488, 133591, 127741, 123166, 120226,
-        114188, 111756, 107882, 104749, 102522,  96451,  94424,  90905,
-        87286,  84931,  82111,  80534,  77610,  74700,  73037,  70715,
-        68006,  67235,  65374,  64009,  62134,  60180,  59105,  57691,
-        55509,  54512,  53318,  52693,  51194,  49840,  48944,  46980,
-        45668,  44177,  42348,  40994,  39859,  38889,  37717,  36391,
-        35482,  34622,  33795,  32756,  32002,  31492,  30573,  29737,
-        29152,  28514,  27941,  27356,  26859,  26329,  25874,  25364,
-        24957,  24510,  24290,  23689,  23380,  22845,  22481,  22066,
-        21587,  21219,  20880,  20452,  20260,  19926,  19661,  19334,
-        18915,  18391,  18046,  17833,  17441,  17105,  16888,  16729,
-        16383,  16023,  15706,  15442,  15222,  14938,  14673,  14452,
-        14005,  13807,  13611,  13447,  13223,  13102,  12963,  12801,
-        12627,  12534,  12356,  12228,  12056,  11907,  11746,  11643,
+        712500, 570000, 475000, 407142, 356250, 316666, 285000, 259090,
+        237500, 219230, 203571, 190000, 178125, 167647, 158333, 150000,
+        142500, 135714, 129545, 123913, 118750, 114000, 109615, 105555,
+        101785, 98275,  95000,  91935,  89062,  86363,  83823,  81428,
+        79166,  77027,  75000,  73076,  71250,  69512,  67857,  66279,
+        64772,  63333,  61956,  60638,  59375,  58163,  57000,  55882,
+        54807,  53773,  52777,  51818,  50892,  50000,  49137,  47500,
+        45967,  44531,  43181,  41911,  40714,  39583,  38513,  37500,
+        36538,  35625,  34756,  33928,  33139,  32386,  31666,  30978,
+        30319,  29687,  29081,  28500,  27941,  27403,  26886,  26388,
+        25909,  25446,  25000,  24568,  23949,  23360,  22800,  22265,
+        21755,  21268,  20802,  20357,  19930,  19520,  19127,  18750,
+        18387,  18037,  17701,  17378,  17065,  16764,  16473,  16101,
+        15745,  15405,  15079,  14766,  14467,  14179,  13902,  13636,
+        13380,  13133,  12895,  12666,  12445,  12179,  11924,  11632,
+        11445,  11220,  11003,  10795,  10594,  10401,  10215,  10035,
    }
 };

@@ -324,6 +326,7 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
        cpi->frames_till_gf_update_due = cpi->goldfreq;

    cpi->common.refresh_golden_frame = TRUE;
+    cpi->common.refresh_alt_ref_frame = TRUE;
 }

 void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi)
@@ -1034,9 +1037,7 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
            gf_frame_useage = pct_gf_active;

        // Is a fixed manual GF frequency being used
-        if (!cpi->auto_gold)
-            cpi->common.refresh_golden_frame = TRUE;
-        else
+        if (cpi->auto_gold)
        {
            // For one pass throw a GF if recent frame intra useage is low or the GF useage is high
            if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))
@@ -1549,12 +1550,21 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit,
                        *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
                    }
                }
-                // VBR
+                // VBR and CQ mode
                // Note that tighter restrictions here can help quality but hurt encode speed
                else
                {
-                    *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
-                    *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+                    // Strong overshoot limit for constrained quality
+                    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+                    {
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+                    }
+                    else
+                    {
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+                    }
                }
            }
        }
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
--- a/vp8/encoder/sad_c.c
+++ b/vp8/encoder/sad_c.c
@@ -126,6 +126,24 @@ void vp8_sad16x16x3_c(
    sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
 }

+void vp8_sad16x16x8_c(
+    const unsigned char *src_ptr,
+    int  src_stride,
+    const unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned short *sad_array
+)
+{
+    sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
+    sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+    sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+    sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+    sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+    sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+    sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+    sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
 void vp8_sad16x8x3_c(
    const unsigned char *src_ptr,
    int  src_stride,
@@ -139,6 +157,24 @@ void vp8_sad16x8x3_c(
    sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
 }

+void vp8_sad16x8x8_c(
+    const unsigned char *src_ptr,
+    int  src_stride,
+    const unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned short *sad_array
+)
+{
+    sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
+    sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+    sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+    sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+    sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+    sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+    sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+    sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
 void vp8_sad8x8x3_c(
    const unsigned char *src_ptr,
    int  src_stride,
@@ -152,6 +188,24 @@ void vp8_sad8x8x3_c(
    sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
 }

+void vp8_sad8x8x8_c(
+    const unsigned char *src_ptr,
+    int  src_stride,
+    const unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned short *sad_array
+)
+{
+    sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
+    sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+    sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+    sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+    sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+    sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+    sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+    sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
 void vp8_sad8x16x3_c(
    const unsigned char *src_ptr,
    int  src_stride,
@@ -165,6 +219,24 @@ void vp8_sad8x16x3_c(
    sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
 }

+void vp8_sad8x16x8_c(
+    const unsigned char *src_ptr,
+    int  src_stride,
+    const unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned short *sad_array
+)
+{
+    sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
+    sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+    sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+    sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+    sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+    sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+    sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+    sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
 void vp8_sad4x4x3_c(
    const unsigned char *src_ptr,
    int  src_stride,
@@ -178,6 +250,24 @@ void vp8_sad4x4x3_c(
    sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
 }

+void vp8_sad4x4x8_c(
+    const unsigned char *src_ptr,
+    int  src_stride,
+    const unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned short *sad_array
+)
+{
+    sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
+    sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+    sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+    sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+    sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+    sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+    sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+    sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
 void vp8_sad16x16x4d_c(
    const unsigned char *src_ptr,
    int  src_stride,
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -37,29 +37,9 @@
 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
 #define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering

-#define USE_FILTER_LUT 1
 #if VP8_TEMPORAL_ALT_REF

-#if USE_FILTER_LUT
-static int modifier_lut[7][19] =
-{
-    // Strength=0
-    {16, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    // Strength=1
-    {16, 15, 10, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    // Strength=2
-    {16, 15, 13, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    // Strength=3
-    {16, 16, 15, 13, 10, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    // Strength=4
-    {16, 16, 15, 14, 13, 11, 9, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    // Strength=5
-    {16, 16, 16, 15, 15, 14, 13, 11, 10, 8, 7, 5, 3, 0, 0, 0, 0, 0, 0},
-    // Strength=6
-    {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 2, 1}
-};
-#endif
-static void build_predictors_mb
+static void vp8_temporal_filter_predictors_mb_c
 (
    MACROBLOCKD *x,
    unsigned char *y_mb_ptr,
@@ -79,14 +59,11 @@ static void build_predictors_mb

    if ((mv_row | mv_col) & 7)
    {
-//        vp8_sixtap_predict16x16_c(yptr, stride,
-//                                    mv_col & 7, mv_row & 7, &pred[0], 16);
        x->subpixel_predict16x16(yptr, stride,
                                    mv_col & 7, mv_row & 7, &pred[0], 16);
    }
    else
    {
-        //vp8_copy_mem16x16_c (yptr, stride, &pred[0], 16);
        RECON_INVOKE(&x->rtcd->recon, copy16x16)(yptr, stride, &pred[0], 16);
    }

@@ -111,7 +88,7 @@ static void build_predictors_mb
        RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, stride, &pred[320], 8);
    }
 }
-static void apply_temporal_filter
+void vp8_temporal_filter_apply_c
 (
    unsigned char *frame1,
    unsigned int stride,
@@ -120,17 +97,13 @@ static void apply_temporal_filter
    int strength,
    int filter_weight,
    unsigned int *accumulator,
-    unsigned int *count
+    unsigned short *count
 )
 {
    int i, j, k;
    int modifier;
    int byte = 0;

-#if USE_FILTER_LUT
-    int *lut = modifier_lut[strength];
-#endif
-
    for (i = 0,k = 0; i < block_size; i++)
    {
        for (j = 0; j < block_size; j++, k++)
@@ -139,23 +112,19 @@ static void apply_temporal_filter
            int src_byte = frame1[byte];
            int pixel_value = *frame2++;

-#if USE_FILTER_LUT
-            // LUT implementation --
-            // improves precision of filter
-            modifier = abs(src_byte-pixel_value);
-            modifier = modifier>18 ? 0 : lut[modifier];
-#else
-            modifier   = src_byte;
-            modifier  -= pixel_value;
+            modifier   = src_byte - pixel_value;
+            // This is an integer approximation of:
+            // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+            // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
            modifier  *= modifier;
-            modifier >>= strength;
            modifier  *= 3;
+            modifier  += 1 << (strength - 1);
+            modifier >>= strength;

            if (modifier > 16)
                modifier = 16;

            modifier = 16 - modifier;
-#endif
            modifier *= filter_weight;

            count[k] += modifier;
@@ -171,7 +140,7 @@ static void apply_temporal_filter
 #if ALT_REF_MC_ENABLED
 static int dummy_cost[2*mv_max+1];

-static int find_matching_mb
+static int vp8_temporal_filter_find_matching_mb_c
 (
    VP8_COMP *cpi,
    YV12_BUFFER_CONFIG *arf_frame,
@@ -246,7 +215,7 @@ static int find_matching_mb
            step_param,
            sadpb / 2/*x->errorperbit*/,
            &num00, &cpi->fn_ptr[BLOCK_16X16],
-            mvsadcost, mvcost); //sadpb < 9
+            mvsadcost, mvcost, &best_ref_mv1); //sadpb < 9

        // Further step/diamond searches as necessary
        n = 0;
@@ -268,7 +237,7 @@ static int find_matching_mb
                    step_param + n,
                    sadpb / 4/*x->errorperbit*/,
                    &num00, &cpi->fn_ptr[BLOCK_16X16],
-                    mvsadcost, mvcost); //sadpb = 9
+                    mvsadcost, mvcost, &best_ref_mv1); //sadpb = 9

                if (thissme < bestsme)
                {
@@ -292,7 +261,7 @@ static int find_matching_mb
        bestsme = cpi->find_fractional_mv_step(x, b, d,
                    &d->bmi.mv.as_mv, &best_ref_mv1,
                    x->errorperbit, &cpi->fn_ptr[BLOCK_16X16],
-                    cpi->mb.mvcost);
+                    mvcost);
    }
 #endif

@@ -308,7 +277,7 @@ static int find_matching_mb
 }
 #endif

-static void vp8cx_temp_blur1_c
+static void vp8_temporal_filter_iterate_c
 (
    VP8_COMP *cpi,
    int frame_count,
@@ -321,17 +290,17 @@ static void vp8cx_temp_blur1_c
    int mb_col, mb_row;
    unsigned int filter_weight[MAX_LAG_BUFFERS];
    unsigned char *mm_ptr = cpi->fp_motion_map;
-    int cols = cpi->common.mb_cols;
-    int rows = cpi->common.mb_rows;
+    int mb_cols = cpi->common.mb_cols;
+    int mb_rows = cpi->common.mb_rows;
    int MBs  = cpi->common.MBs;
    int mb_y_offset = 0;
    int mb_uv_offset = 0;
-    unsigned int accumulator[384];
-    unsigned int count[384];
+    DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
    MACROBLOCKD *mbd = &cpi->mb.e_mbd;
    YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
    unsigned char *dst1, *dst2;
-    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
+    DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16*16 + 8*8 + 8*8);

    // Save input state
    unsigned char *y_buffer = mbd->pre.y_buffer;
@@ -345,7 +314,7 @@ static void vp8cx_temp_blur1_c
            filter_weight[frame] = 1;
    }

-    for (mb_row = 0; mb_row < rows; mb_row++)
+    for (mb_row = 0; mb_row < mb_rows; mb_row++)
    {
 #if ALT_REF_MC_ENABLED
        // Reduced search extent by 3 for 6-tap filter & smaller UMV border
@@ -354,14 +323,14 @@ static void vp8cx_temp_blur1_c
                                + (VP8BORDERINPIXELS - 19);
 #endif

-        for (mb_col = 0; mb_col < cols; mb_col++)
+        for (mb_col = 0; mb_col < mb_cols; mb_col++)
        {
            int i, j, k, w;
            int weight_cap;
            int stride;

            vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
-            vpx_memset(count, 0, 384*sizeof(unsigned int));
+            vpx_memset(count, 0, 384*sizeof(unsigned short));

 #if ALT_REF_MC_ENABLED
            // Reduced search extent by 3 for 6-tap filter & smaller UMV border
@@ -412,11 +381,12 @@ static void vp8cx_temp_blur1_c
 #define THRESH_HIGH  20000

                    // Correlation has been lost try MC
-                    err = find_matching_mb ( cpi,
-                                             cpi->frames[alt_ref_index],
-                                             cpi->frames[frame],
-                                             mb_y_offset,
-                                             THRESH_LOW );
+                    err = vp8_temporal_filter_find_matching_mb_c
+                        (cpi,
+                         cpi->frames[alt_ref_index],
+                         cpi->frames[frame],
+                         mb_y_offset,
+                         THRESH_LOW);

                    if (filter_weight[frame] < 2)
                    {
@@ -429,43 +399,46 @@ static void vp8cx_temp_blur1_c
                if (filter_weight[frame] != 0)
                {
                    // Construct the predictors
-                    build_predictors_mb (
-                              mbd,
-                              cpi->frames[frame]->y_buffer + mb_y_offset,
-                              cpi->frames[frame]->u_buffer + mb_uv_offset,
-                              cpi->frames[frame]->v_buffer + mb_uv_offset,
-                              cpi->frames[frame]->y_stride,
-                              mbd->block[0].bmi.mv.as_mv.row,
-                              mbd->block[0].bmi.mv.as_mv.col,
-                              predictor );
+                    vp8_temporal_filter_predictors_mb_c
+                        (mbd,
+                         cpi->frames[frame]->y_buffer + mb_y_offset,
+                         cpi->frames[frame]->u_buffer + mb_uv_offset,
+                         cpi->frames[frame]->v_buffer + mb_uv_offset,
+                         cpi->frames[frame]->y_stride,
+                         mbd->block[0].bmi.mv.as_mv.row,
+                         mbd->block[0].bmi.mv.as_mv.col,
+                         predictor);

                    // Apply the filter (YUV)
-                    apply_temporal_filter ( f->y_buffer + mb_y_offset,
-                                            f->y_stride,
-                                            predictor,
-                                            16,
-                                            strength,
-                                            filter_weight[frame],
-                                            accumulator,
-                                            count );
+                    TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+                        (f->y_buffer + mb_y_offset,
+                         f->y_stride,
+                         predictor,
+                         16,
+                         strength,
+                         filter_weight[frame],
+                         accumulator,
+                         count);

-                    apply_temporal_filter ( f->u_buffer + mb_uv_offset,
-                                            f->uv_stride,
-                                            predictor + 256,
-                                            8,
-                                            strength,
-                                            filter_weight[frame],
-                                            accumulator + 256,
-                                            count + 256 );
+                    TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+                        (f->u_buffer + mb_uv_offset,
+                         f->uv_stride,
+                         predictor + 256,
+                         8,
+                         strength,
+                         filter_weight[frame],
+                         accumulator + 256,
+                         count + 256);

-                    apply_temporal_filter ( f->v_buffer + mb_uv_offset,
-                                            f->uv_stride,
-                                            predictor + 320,
-                                            8,
-                                            strength,
-                                            filter_weight[frame],
-                                            accumulator + 320,
-                                            count + 320 );
+                    TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+                        (f->v_buffer + mb_uv_offset,
+                         f->uv_stride,
+                         predictor + 320,
+                         8,
+                         strength,
+                         filter_weight[frame],
+                         accumulator + 320,
+                         count + 320);
                }
            }

@@ -524,8 +497,8 @@ static void vp8cx_temp_blur1_c
            mb_uv_offset += 8;
        }

-        mb_y_offset += 16*f->y_stride-f->y_width;
-        mb_uv_offset += 8*f->uv_stride-f->uv_width;
+        mb_y_offset += 16*(f->y_stride-mb_cols);
+        mb_uv_offset += 8*(f->uv_stride-mb_cols);
    }

    // Restore input state
@@ -534,7 +507,7 @@ static void vp8cx_temp_blur1_c
    mbd->pre.v_buffer = v_buffer;
 }

-void vp8cx_temp_filter_c
+void vp8_temporal_filter_prepare_c
 (
    VP8_COMP *cpi
 )
@@ -642,7 +615,7 @@ void vp8cx_temp_filter_c
                = &cpi->src_buffer[which_buffer].source_buffer;
    }

-    vp8cx_temp_blur1_c (
+    vp8_temporal_filter_iterate_c (
        cpi,
        frames_to_blur,
        frames_to_blur_backward,
--- a/vp8/encoder/temporal_filter.h
+++ b/vp8/encoder/temporal_filter.h
@@ -12,8 +12,37 @@
 #ifndef __INC_VP8_TEMPORAL_FILTER_H
 #define __INC_VP8_TEMPORAL_FILTER_H

-#include "onyx_int.h"
+#define prototype_apply(sym)\
+    void (sym) \
+    ( \
+     unsigned char *frame1, \
+     unsigned int stride, \
+     unsigned char *frame2, \
+     unsigned int block_size, \
+     int strength, \
+     int filter_weight, \
+     unsigned int *accumulator, \
+     unsigned short *count \
+    )

-void vp8cx_temp_filter_c(VP8_COMP *cpi);
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/temporal_filter_x86.h"
+#endif
+
+#ifndef vp8_temporal_filter_apply
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
+#endif
+extern prototype_apply(vp8_temporal_filter_apply);
+
+typedef struct
+{
+    prototype_apply(*apply);
+} vp8_temporal_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define TEMPORAL_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define TEMPORAL_INVOKE(ctx,fn) vp8_temporal_filter_##fn
+#endif

 #endif // __INC_VP8_TEMPORAL_FILTER_H
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -132,8 +132,6 @@ static void tokenize2nd_order_b
        t->Token = x;
        t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];

-        t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0);
-
        t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));

        ++cpi->coef_counts       [type] [band] [pt] [x];
@@ -185,7 +183,6 @@ static void tokenize1st_order_b
        t->Token = x;
        t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];

-        t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0);
        t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));

        ++cpi->coef_counts       [type] [band] [pt] [x];
@@ -434,7 +431,6 @@ static __inline void stuff2nd_order_b

    t->Token = DCT_EOB_TOKEN;
    t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
-    t->section = 11;
    t->skip_eob_node = 0;
    ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
    ++t;
@@ -465,7 +461,6 @@ static __inline void stuff1st_order_b

    t->Token = DCT_EOB_TOKEN;
    t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt];
-    t->section = 8;
    t->skip_eob_node = 0;
    ++cpi->coef_counts       [0] [1] [pt] [DCT_EOB_TOKEN];
    ++t;
@@ -495,7 +490,6 @@ void stuff1st_order_buv

    t->Token = DCT_EOB_TOKEN;
    t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
-    t->section = 13;
    t->skip_eob_node = 0;
    ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
    ++t;
--- a/vp8/encoder/tokenize.h
+++ b/vp8/encoder/tokenize.h
@@ -25,11 +25,10 @@ typedef struct

 typedef struct
 {
-    int Token;
-    int Extra;
    const vp8_prob *context_tree;
-    int skip_eob_node;
-    int section;
+    short           Extra;
+    unsigned char   Token;
+    unsigned char   skip_eob_node;
 } TOKENEXTRA;

 int rd_cost_mby(MACROBLOCKD *);
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -32,6 +32,16 @@
     unsigned int *sad_array\
    )

+#define prototype_sad_multi_same_address_1(sym)\
+    void (sym)\
+    (\
+     const unsigned char *src_ptr, \
+     int source_stride, \
+     const unsigned char *ref_ptr, \
+     int  ref_stride, \
+     unsigned short *sad_array\
+    )
+
 #define prototype_sad_multi_dif_address(sym)\
    void (sym)\
    (\
@@ -138,6 +148,31 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
 #endif
 extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);

+#ifndef vp8_variance_sad16x16x8
+#define vp8_variance_sad16x16x8 vp8_sad16x16x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad16x16x8);
+
+#ifndef vp8_variance_sad16x8x8
+#define vp8_variance_sad16x8x8 vp8_sad16x8x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad16x8x8);
+
+#ifndef vp8_variance_sad8x8x8
+#define vp8_variance_sad8x8x8 vp8_sad8x8x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad8x8x8);
+
+#ifndef vp8_variance_sad8x16x8
+#define vp8_variance_sad8x16x8 vp8_sad8x16x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad8x16x8);
+
+#ifndef vp8_variance_sad4x4x8
+#define vp8_variance_sad4x4x8 vp8_sad4x4x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8);
+
 //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

 #ifndef vp8_variance_sad16x16x4d
@@ -274,6 +309,7 @@ extern prototype_sad(vp8_variance_get4x4sse_cs);

 typedef prototype_sad(*vp8_sad_fn_t);
 typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
+typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
 typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t);
 typedef prototype_variance(*vp8_variance_fn_t);
 typedef prototype_variance2(*vp8_variance2_fn_t);
@@ -317,6 +353,12 @@ typedef struct
    vp8_sad_multi_fn_t       sad8x8x3;
    vp8_sad_multi_fn_t       sad4x4x3;

+    vp8_sad_multi1_fn_t      sad16x16x8;
+    vp8_sad_multi1_fn_t      sad16x8x8;
+    vp8_sad_multi1_fn_t      sad8x16x8;
+    vp8_sad_multi1_fn_t      sad8x8x8;
+    vp8_sad_multi1_fn_t      sad4x4x8;
+
    vp8_sad_multi_d_fn_t     sad16x16x4d;
    vp8_sad_multi_d_fn_t     sad16x8x4d;
    vp8_sad_multi_d_fn_t     sad8x16x4d;
@@ -334,6 +376,7 @@ typedef struct
    vp8_variance_fn_t       svf_halfpix_v;
    vp8_variance_fn_t       svf_halfpix_hv;
    vp8_sad_multi_fn_t      sdx3f;
+    vp8_sad_multi1_fn_t     sdx8f;
    vp8_sad_multi_d_fn_t    sdx4df;
 } vp8_variance_fn_ptr_t;

--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -11,511 +11,231 @@

 %include "vpx_ports/x86_abi_support.asm"

-section .text
-    global sym(vp8_short_fdct4x4_mmx)
-    global sym(vp8_short_fdct8x4_wmt)
-
-
-%define         DCTCONSTANTSBITS         (16)
-%define         DCTROUNDINGVALUE         (1<< (DCTCONSTANTSBITS-1))
-%define         x_c1                      (60547)          ; cos(pi  /8) * (1<<15)
-%define         x_c2                      (46341)          ; cos(pi*2/8) * (1<<15)
-%define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<15)
-
-
 ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_mmx)
 sym(vp8_short_fdct4x4_mmx):
    push        rbp
-    mov         rbp, rsp
+    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
-    push rsi
-    push rdi
+    push        rsi
+    push        rdi
    ; end prolog
-        mov     rsi,    arg(0) ;input
-        mov     rdi,    arg(1) ;output

-        lea     rdx,    [GLOBAL(dct_const_mmx)]
-        movsxd  rax,    dword ptr arg(2) ;pitch
+        mov         rsi,        arg(0)      ; input
+        mov         rdi,        arg(1)      ; output

-        lea     rcx,    [rsi + rax*2]
+        movsxd      rax,        dword ptr arg(2) ;pitch
+
+        lea         rcx,        [rsi + rax*2]
        ; read the input data
-        movq    mm0,    [rsi]
-        movq    mm1,    [rsi + rax    ]
+        movq        mm0,        [rsi]
+        movq        mm1,        [rsi + rax]

-        movq    mm2,    [rcx]
-        movq    mm3,    [rcx + rax]
-        ; get the constants
-        ;shift to left by 1 for prescision
-        psllw   mm0,    3
-        psllw   mm1,    3
+        movq        mm2,        [rcx]
+        movq        mm4,        [rcx + rax]

-        psllw   mm2,    3
-        psllw   mm3,    3
+        ; transpose for the first stage
+        movq        mm3,        mm0         ; 00 01 02 03
+        movq        mm5,        mm2         ; 20 21 22 23

-        ; transpose for the second stage
-        movq    mm4,    mm0         ; 00 01 02 03
-        movq    mm5,    mm2         ; 10 11 12 03
+        punpcklwd   mm0,        mm1         ; 00 10 01 11
+        punpckhwd   mm3,        mm1         ; 02 12 03 13

-        punpcklwd   mm0,    mm1     ; 00 10 01 11
-        punpckhwd   mm4,    mm1     ; 02 12 03 13
+        punpcklwd   mm2,        mm4         ; 20 30 21 31
+        punpckhwd   mm5,        mm4         ; 22 32 23 33

-        punpcklwd   mm2,    mm3     ; 20 30 21 31
-        punpckhwd   mm5,    mm3     ; 22 32 23 33
+        movq        mm1,        mm0         ; 00 10 01 11
+        punpckldq   mm0,        mm2         ; 00 10 20 30

+        punpckhdq   mm1,        mm2         ; 01 11 21 31

-        movq        mm1,    mm0     ; 00 10 01 11
-        punpckldq   mm0,    mm2     ; 00 10 20 30
+        movq        mm2,        mm3         ; 02 12 03 13
+        punpckldq   mm2,        mm5         ; 02 12 22 32

-        punpckhdq   mm1,    mm2     ; 01 11 21 31
-
-        movq        mm2,    mm4     ; 02 12 03 13
-        punpckldq   mm2,    mm5     ; 02 12 22 32
-
-        punpckhdq   mm4,    mm5     ; 03 13 23 33
-        movq        mm3,    mm4
+        punpckhdq   mm3,        mm5         ; 03 13 23 33

+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm3 3

        ; first stage
-        movq    mm5,    mm0
-        movq    mm4,    mm1
+        movq        mm5,        mm0
+        movq        mm4,        mm1

-        paddw   mm0,    mm3         ; a = 0 + 3
-        paddw   mm1,    mm2         ; b = 1 + 2
+        paddw       mm0,        mm3         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2

-        psubw   mm4,    mm2         ; c = 1 - 2
-        psubw   mm5,    mm3         ; d = 0 - 3
+        psubw       mm4,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm3         ; d1 = 0 - 3

+        psllw       mm5,        3
+        psllw       mm4,        3
+
+        psllw       mm0,        3
+        psllw       mm1,        3

        ; output 0 and 2
-        movq    mm6,    [rdx +  16] ; c2
-        movq    mm2,    mm0         ; a
+        movq        mm2,        mm0         ; a1

-        paddw   mm0,    mm1         ; a + b
-        psubw   mm2,    mm1         ; a - b
-
-        movq    mm1,    mm0         ; a + b
-        pmulhw  mm0,    mm6         ; 00 01 02 03
-
-        paddw   mm0,    mm1         ; output 00 01 02 03
-        pmulhw  mm6,    mm2         ; 20 21 22 23
-
-        paddw   mm2,    mm6         ; output 20 21 22 23
+        paddw       mm0,        mm1         ; op[0] = a1 + b1
+        psubw       mm2,        mm1         ; op[2] = a1 - b1

        ; output 1 and 3
-        movq    mm6,    [rdx +  8]  ; c1
-        movq    mm7,    [rdx + 24]  ; c3
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm4         ; c1 d1
+        punpckhwd   mm5,        mm4         ; c1 d1

-        movq    mm1,    mm4         ; c
-        movq    mm3,    mm5         ; d
+        movq        mm3,        mm1
+        movq        mm4,        mm5

-        pmulhw  mm1,    mm7         ; c * c3
-        pmulhw  mm3,    mm6         ; d * c1
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

-        paddw   mm3,    mm5         ; d * c1 rounded
-        paddw   mm1,    mm3         ; output 10 11 12 13
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

-        movq    mm3,    mm4         ; c
-        pmulhw  mm5,    mm7         ; d * c3
+        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]

-        pmulhw  mm4,    mm6         ; c * c1
-        paddw   mm3,    mm4         ; round c* c1
-
-        psubw   mm5,    mm3         ; output 30 31 32 33
-        movq    mm3,    mm5
+        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

+        packssdw    mm1,        mm4         ; op[1]
+        packssdw    mm3,        mm5         ; op[3]

        ; done with vertical
        ; transpose for the second stage
-        movq    mm4,    mm0         ; 00 01 02 03
-        movq    mm5,    mm2         ; 10 11 12 03
+        movq        mm4,        mm0         ; 00 10 20 30
+        movq        mm5,        mm2         ; 02 12 22 32

-        punpcklwd   mm0,    mm1     ; 00 10 01 11
-        punpckhwd   mm4,    mm1     ; 02 12 03 13
+        punpcklwd   mm0,        mm1         ; 00 01 10 11
+        punpckhwd   mm4,        mm1         ; 20 21 30 31

-        punpcklwd   mm2,    mm3     ; 20 30 21 31
-        punpckhwd   mm5,    mm3     ; 22 32 23 33
+        punpcklwd   mm2,        mm3         ; 02 03 12 13
+        punpckhwd   mm5,        mm3         ; 22 23 32 33

+        movq        mm1,        mm0         ; 00 01 10 11
+        punpckldq   mm0,        mm2         ; 00 01 02 03

-        movq        mm1,    mm0     ; 00 10 01 11
-        punpckldq   mm0,    mm2     ; 00 10 20 30
+        punpckhdq   mm1,        mm2         ; 10 11 12 13

-        punpckhdq   mm1,    mm2     ; 01 11 21 31
+        movq        mm2,        mm4         ; 20 21 30 31
+        punpckldq   mm2,        mm5         ; 20 21 22 23

-        movq        mm2,    mm4     ; 02 12 03 13
-        punpckldq   mm2,    mm5     ; 02 12 22 32
+        punpckhdq   mm4,        mm5         ; 30 31 32 33

-        punpckhdq   mm4,    mm5     ; 03 13 23 33
-        movq        mm3,    mm4
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm4 3

+        movq        mm5,        mm0
+        movq        mm3,        mm1

-        ; first stage
-        movq    mm5,    mm0
-        movq    mm4,    mm1
+        paddw       mm0,        mm4         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2

-        paddw   mm0,    mm3         ; a = 0 + 3
-        paddw   mm1,    mm2         ; b = 1 + 2
+        psubw       mm3,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm4         ; d1 = 0 - 3

-        psubw   mm4,    mm2         ; c = 1 - 2
-        psubw   mm5,    mm3         ; d = 0 - 3
+        pxor        mm6,        mm6         ; zero out for compare

+        pcmpeqw     mm6,        mm5         ; 0xffff where d1 == 0
+
+        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; invert and keep bit 0:
+                                                                ; 1 where d1 != 0

        ; output 0 and 2
-        movq    mm6,    [rdx +  16] ; c2
-        movq    mm2,    mm0         ; a
-        paddw   mm0,    mm1         ; a + b
+        movq        mm2,        mm0         ; a1

-        psubw   mm2,    mm1         ; a - b
+        paddw       mm0,        mm1         ; a1 + b1
+        psubw       mm2,        mm1         ; a1 - b1

-        movq    mm1,    mm0         ; a + b
-        pmulhw  mm0,    mm6         ; 00 01 02 03
+        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
+        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]

-        paddw   mm0,    mm1         ; output 00 01 02 03
-        pmulhw  mm6,    mm2         ; 20 21 22 23
-
-        paddw   mm2,    mm6         ; output 20 21 22 23
+        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4

+        movq        MMWORD PTR[rdi + 0 ],  mm0
+        movq        MMWORD PTR[rdi + 16],  mm2

        ; output 1 and 3
-        movq    mm6,    [rdx +  8]  ; c1
-        movq    mm7,    [rdx + 24]  ; c3
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm3         ; c1 d1
+        punpckhwd   mm5,        mm3         ; c1 d1

-        movq    mm1,    mm4         ; c
-        movq    mm3,    mm5         ; d
+        movq        mm3,        mm1
+        movq        mm4,        mm5

-        pmulhw  mm1,    mm7         ; c * c3
-        pmulhw  mm3,    mm6         ; d * c1
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

-        paddw   mm3,    mm5         ; d * c1 rounded
-        paddw   mm1,    mm3         ; output 10 11 12 13
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

-        movq    mm3,    mm4         ; c
-        pmulhw  mm5,    mm7         ; d * c3
+        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]

-        pmulhw  mm4,    mm6         ; c * c1
-        paddw   mm3,    mm4         ; round c* c1
+        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

-        psubw   mm5,    mm3         ; output 30 31 32 33
-        movq    mm3,    mm5
-        ; done with vertical
+        packssdw    mm1,        mm4         ; op[4]
+        packssdw    mm3,        mm5         ; op[12]

-        pcmpeqw mm4,    mm4
-        pcmpeqw mm5,    mm5
-        psrlw   mm4,    15
-        psrlw   mm5,    15
+        paddw       mm1,        mm6         ; op[4] += (d1!=0)

-        psllw   mm4,    2
-        psllw   mm5,    2
+        movq        MMWORD PTR[rdi + 8 ],  mm1
+        movq        MMWORD PTR[rdi + 24],  mm3

-        paddw   mm0,    mm4
-        paddw   mm1,    mm5
-        paddw   mm2,    mm4
-        paddw   mm3,    mm5
-
-        psraw   mm0, 3
-        psraw   mm1, 3
-        psraw   mm2, 3
-        psraw   mm3, 3
-
-        movq        [rdi   ],   mm0
-        movq        [rdi+ 8],   mm1
-        movq        [rdi+16],   mm2
-        movq        [rdi+24],   mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
+     ; begin epilog
+    pop         rdi
+    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

-
-;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
-sym(vp8_short_fdct8x4_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-        mov         rsi,    arg(0) ;input
-        mov         rdi,    arg(1) ;output
-
-        lea         rdx,    [GLOBAL(dct_const_xmm)]
-        movsxd      rax,    dword ptr arg(2) ;pitch
-
-        lea         rcx,    [rsi + rax*2]
-        ; read the input data
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm2,       [rsi + rax]
-
-        movdqa      xmm4,       [rcx]
-        movdqa      xmm3,       [rcx + rax]
-        ; get the constants
-        ;shift to left by 1 for prescision
-        psllw       xmm0,        3
-        psllw       xmm2,        3
-
-        psllw       xmm4,        3
-        psllw       xmm3,        3
-
-        ; transpose for the second stage
-        movdqa      xmm1,       xmm0         ; 00 01 02 03 04 05 06 07
-        movdqa      xmm5,       xmm4         ; 20 21 22 23 24 25 26 27
-
-        punpcklwd   xmm0,       xmm2         ; 00 10 01 11 02 12 03 13
-        punpckhwd   xmm1,       xmm2         ; 04 14 05 15 06 16 07 17
-
-        punpcklwd   xmm4,       xmm3         ; 20 30 21 31 22 32 23 33
-        punpckhwd   xmm5,       xmm3         ; 24 34 25 35 26 36 27 37
-
-        movdqa      xmm2,       xmm0         ; 00 10 01 11 02 12 03 13
-        punpckldq   xmm0,       xmm4         ; 00 10 20 30 01 11 21 31
-
-        punpckhdq   xmm2,       xmm4         ; 02 12 22 32 03 13 23 33
-
-
-        movdqa      xmm4,       xmm1         ; 04 14 05 15 06 16 07 17
-        punpckldq   xmm4,       xmm5         ; 04 14 24 34 05 15 25 35
-
-        punpckhdq   xmm1,       xmm5         ; 06 16 26 36 07 17 27 37
-        movdqa      xmm3,       xmm2         ; 02 12 22 32 03 13 23 33
-
-        punpckhqdq  xmm3,       xmm1         ; 03 13 23 33 07 17 27 37
-        punpcklqdq  xmm2,       xmm1         ; 02 12 22 32 06 16 26 36
-
-        movdqa      xmm1,       xmm0         ; 00 10 20 30 01 11 21 31
-        punpcklqdq  xmm0,       xmm4         ; 00 10 20 30 04 14 24 34
-
-        punpckhqdq  xmm1,       xmm4         ; 01 11 21 32 05 15 25 35
-
-        ; xmm0 0
-        ; xmm1 1
-        ; xmm2 2
-        ; xmm3 3
-
-        ; first stage
-        movdqa      xmm5,       xmm0
-        movdqa      xmm4,       xmm1
-
-        paddw       xmm0,       xmm3         ; a = 0 + 3
-        paddw       xmm1,       xmm2         ; b = 1 + 2
-
-        psubw       xmm4,       xmm2         ; c = 1 - 2
-        psubw       xmm5,       xmm3         ; d = 0 - 3
-
-
-        ; output 0 and 2
-        movdqa      xmm6,       [rdx +  32] ; c2
-        movdqa      xmm2,       xmm0         ; a
-
-        paddw       xmm0,       xmm1         ; a + b
-        psubw       xmm2,       xmm1         ; a - b
-
-        movdqa      xmm1,       xmm0         ; a + b
-        pmulhw      xmm0,       xmm6         ; 00 01 02 03
-
-        paddw       xmm0,       xmm1         ; output 00 01 02 03
-        pmulhw      xmm6,       xmm2         ; 20 21 22 23
-
-        paddw       xmm2,       xmm6         ; output 20 21 22 23
-
-        ; output 1 and 3
-        movdqa      xmm6,       [rdx + 16]  ; c1
-        movdqa      xmm7,       [rdx + 48]  ; c3
-
-        movdqa      xmm1,       xmm4         ; c
-        movdqa      xmm3,       xmm5         ; d
-
-        pmulhw      xmm1,       xmm7         ; c * c3
-        pmulhw      xmm3,       xmm6         ; d * c1
-
-        paddw       xmm3,       xmm5         ; d * c1 rounded
-        paddw       xmm1,       xmm3         ; output 10 11 12 13
-
-        movdqa      xmm3,       xmm4         ; c
-        pmulhw      xmm5,       xmm7         ; d * c3
-
-        pmulhw      xmm4,       xmm6         ; c * c1
-        paddw       xmm3,       xmm4         ; round c* c1
-
-        psubw       xmm5,       xmm3         ; output 30 31 32 33
-        movdqa      xmm3,       xmm5
-
-
-        ; done with vertical
-        ; transpose for the second stage
-        movdqa      xmm4,       xmm2         ; 02 12 22 32 06 16 26 36
-        movdqa      xmm2,       xmm1         ; 01 11 21 31 05 15 25 35
-
-        movdqa      xmm1,       xmm0         ; 00 10 20 30 04 14 24 34
-        movdqa      xmm5,       xmm4         ; 02 12 22 32 06 16 26 36
-
-        punpcklwd   xmm0,       xmm2         ; 00 01 10 11 20 21 30 31
-        punpckhwd   xmm1,       xmm2         ; 04 05 14 15 24 25 34 35
-
-        punpcklwd   xmm4,       xmm3         ; 02 03 12 13 22 23 32 33
-        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
-
-        movdqa      xmm2,       xmm0         ; 00 01 10 11 20 21 30 31
-        punpckldq   xmm0,       xmm4         ; 00 01 02 03 10 11 12 13
-
-        punpckhdq   xmm2,       xmm4         ; 20 21 22 23 30 31 32 33
-
-
-        movdqa      xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
-        punpckldq   xmm4,       xmm5         ; 04 05 06 07 14 15 16 17
-
-        punpckhdq   xmm1,       xmm5         ; 24 25 26 27 34 35 36 37
-        movdqa      xmm3,       xmm2         ; 20 21 22 23 30 31 32 33
-
-        punpckhqdq  xmm3,       xmm1         ; 30 31 32 33 34 35 36 37
-        punpcklqdq  xmm2,       xmm1         ; 20 21 22 23 24 25 26 27
-
-        movdqa      xmm1,       xmm0         ; 00 01 02 03 10 11 12 13
-        punpcklqdq  xmm0,       xmm4         ; 00 01 02 03 04 05 06 07
-
-        punpckhqdq  xmm1,       xmm4         ; 10 11 12 13 14 15 16 17
-
-        ; first stage
-        movdqa      xmm5,       xmm0
-        movdqa      xmm4,       xmm1
-
-        paddw       xmm0,       xmm3         ; a = 0 + 3
-        paddw       xmm1,       xmm2         ; b = 1 + 2
-
-        psubw       xmm4,       xmm2         ; c = 1 - 2
-        psubw       xmm5,       xmm3         ; d = 0 - 3
-
-
-        ; output 0 and 2
-        movdqa      xmm6,       [rdx +  32] ; c2
-        movdqa      xmm2,       xmm0         ; a
-
-        paddw       xmm0,       xmm1         ; a + b
-        psubw       xmm2,       xmm1         ; a - b
-
-        movdqa      xmm1,       xmm0         ; a + b
-        pmulhw      xmm0,       xmm6         ; 00 01 02 03
-
-        paddw       xmm0,       xmm1         ; output 00 01 02 03
-        pmulhw      xmm6,       xmm2         ; 20 21 22 23
-
-        paddw       xmm2,       xmm6         ; output 20 21 22 23
-
-        ; output 1 and 3
-        movdqa      xmm6,       [rdx + 16]  ; c1
-        movdqa      xmm7,       [rdx + 48]  ; c3
-
-        movdqa      xmm1,       xmm4         ; c
-        movdqa      xmm3,       xmm5         ; d
-
-        pmulhw      xmm1,       xmm7         ; c * c3
-        pmulhw      xmm3,       xmm6         ; d * c1
-
-        paddw       xmm3,       xmm5         ; d * c1 rounded
-        paddw       xmm1,       xmm3         ; output 10 11 12 13
-
-        movdqa      xmm3,       xmm4         ; c
-        pmulhw      xmm5,       xmm7         ; d * c3
-
-        pmulhw      xmm4,       xmm6         ; c * c1
-        paddw       xmm3,       xmm4         ; round c* c1
-
-        psubw       xmm5,       xmm3         ; output 30 31 32 33
-        movdqa      xmm3,       xmm5
-        ; done with vertical
-
-
-        pcmpeqw     xmm4,       xmm4
-        pcmpeqw     xmm5,       xmm5;
-        psrlw       xmm4,       15
-        psrlw       xmm5,       15
-
-        psllw       xmm4,       2
-        psllw       xmm5,       2
-
-        paddw       xmm0,       xmm4
-        paddw       xmm1,       xmm5
-        paddw       xmm2,       xmm4
-        paddw       xmm3,       xmm5
-
-        psraw       xmm0,       3
-        psraw       xmm1,       3
-        psraw       xmm2,       3
-        psraw       xmm3,       3
-
-        movq        QWORD PTR[rdi   ],   xmm0
-        movq        QWORD PTR[rdi+ 8],   xmm1
-        movq        QWORD PTR[rdi+16],   xmm2
-        movq        QWORD PTR[rdi+24],   xmm3
-
-        psrldq      xmm0,       8
-        psrldq      xmm1,       8
-        psrldq      xmm2,       8
-        psrldq      xmm3,       8
-
-        movq        QWORD PTR[rdi+32],   xmm0
-        movq        QWORD PTR[rdi+40],   xmm1
-        movq        QWORD PTR[rdi+48],   xmm2
-        movq        QWORD PTR[rdi+56],   xmm3
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
-;static const unsigned int dct1st_stage_rounding_mmx[2] =
-align 16
-dct1st_stage_rounding_mmx:
-    times 2 dd 8192
-
-
-;static const unsigned int dct2nd_stage_rounding_mmx[2] =
-align 16
-dct2nd_stage_rounding_mmx:
-    times 2 dd 32768
-
-
-;static const short dct_matrix[4][4]=
-align 16
-dct_matrix:
-    times 4 dw 23170
-
-    dw  30274
-    dw  12540
-    dw -12540
-    dw -30274
-
-    dw 23170
-    times 2 dw -23170
-    dw 23170
-
-    dw  12540
-    dw -30274
-    dw  30274
-    dw -12540
-
-
-;static const unsigned short dct_const_mmx[4 * 4]=
-align 16
-dct_const_mmx:
-    times 4 dw 0
-    times 4 dw 60547
-    times 4 dw 46341
-    times 4 dw 25080
-
-
-;static const unsigned short dct_const_xmm[8 * 4]=
-align 16
-dct_const_xmm:
-    times 8 dw 0
-    times 8 dw 60547
-    times 8 dw 46341
-    times 8 dw 25080
+align 8
+_5352_2217:
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+align 8
+_2217_neg5352:
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+align 8
+_cmp_mask:
+    times 4 dw 1
+align 8
+_7w:
+    times 4 dw 7
+align 8
+_14500:
+    times 2 dd 14500
+align 8
+_7500:
+    times 2 dd 7500
+align 8
+_12000:
+    times 2 dd 12000
+align 8
+_51000:
+    times 2 dd 51000
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -11,32 +11,68 @@

 %include "vpx_ports/x86_abi_support.asm"

-;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct4x4_sse2)
-sym(vp8_short_fdct4x4_sse2):
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+  %define       input       rsi
+  %define       output      rdi
+  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-;;    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
-    movsxd      rax, DWORD PTR arg(2)
-    lea         rdi, [rsi + rax*2]
+    mov         rdi, arg(1)

-    movq        xmm0, MMWORD PTR[rsi   ]        ;03 02 01 00
-    movq        xmm2, MMWORD PTR[rsi + rax]     ;13 12 11 10
-    movq        xmm1, MMWORD PTR[rsi + rax*2]   ;23 22 21 20
-    movq        xmm3, MMWORD PTR[rdi + rax]     ;33 32 31 30
+    movsxd      rax, dword ptr arg(2)
+    lea         rcx, [rsi + rax*2]
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    %define     input       rcx
+    %define     output      rdx
+    %define     pitch       r8
+  %else
+    %define     input       rdi
+    %define     output      rsi
+    %define     pitch       rdx
+  %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY 0
+  %define     input
+  %define     output
+  %define     pitch
+
+%if ABI_IS_32BIT
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    pop         rbp
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+  %endif
+%endif
+    ret
+%endmacro
+
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
+
+    STACK_FRAME_CREATE
+
+    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
+    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
+    lea         input,          [input+2*pitch]
+    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
+    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

-    mov         rdi, arg(1)
-
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
@@ -51,6 +87,7 @@ sym(vp8_short_fdct4x4_sse2):
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
+
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
@@ -121,17 +158,216 @@ sym(vp8_short_fdct4x4_sse2):
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

-    movdqa      XMMWORD PTR[rdi + 0], xmm0
-    movdqa      XMMWORD PTR[rdi + 16], xmm1
+    movdqa      XMMWORD PTR[output +  0], xmm0
+    movdqa      XMMWORD PTR[output + 16], xmm1

-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-;;    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
+    STACK_FRAME_DESTROY
+
+;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct8x4_sse2)
+sym(vp8_short_fdct8x4_sse2):
+    ; two side-by-side 4x4 transforms: 4 rows of 8 shorts in, 2 x 16 coefficients out
+    STACK_FRAME_CREATE
+
+        ; read four rows of eight shorts (movdqa: each row must be 16-byte aligned)
+        movdqa      xmm0,       [input        ]     ; row 0
+        movdqa      xmm2,       [input+  pitch]     ; row 1 (pitch is used as a byte offset)
+        lea         input,      [input+2*pitch]
+        movdqa      xmm4,       [input        ]     ; row 2
+        movdqa      xmm3,       [input+  pitch]     ; row 3
+
+        ; transpose for the first stage
+        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
+        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
+
+        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
+        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
+
+        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
+        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
+
+        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
+        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
+
+        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
+
+        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
+        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
+
+        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
+        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
+
+        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
+        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
+
+        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
+        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
+
+        ; xmm0 0
+        ; xmm1 1
+        ; xmm2 2
+        ; xmm3 3
+
+        ; first stage
+        movdqa      xmm5,       xmm0
+        movdqa      xmm4,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm2        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        psllw       xmm5,        3          ; d1 <<= 3
+        psllw       xmm4,        3          ; c1 <<= 3
+
+        psllw       xmm0,        3          ; a1 <<= 3
+        psllw       xmm1,        3          ; b1 <<= 3
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
+        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
+
+        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+
+        packssdw    xmm1,       xmm4        ; op[1]
+        packssdw    xmm3,       xmm5        ; op[3]
+
+        ; done with vertical
+        ; transpose for the second stage
+        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
+        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
+
+        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
+        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
+
+        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
+        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
+
+        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
+        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
+
+        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
+
+        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
+        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
+
+        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
+        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
+
+        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
+        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
+
+        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
+        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
+
+        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
+
+        ; xmm0 0
+        ; xmm4 1
+        ; xmm1 2
+        ; xmm3 3
+
+        movdqa      xmm5,       xmm0        ; copy of 0
+        movdqa      xmm2,       xmm1        ; copy of 2
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm4        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        pxor        xmm6,       xmm6        ; zero out for compare
+
+        pcmpeqw     xmm6,       xmm5        ; 0xffff in each word where d1 == 0
+
+        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; ~(d1 == 0) & 1, i.e.
+                                                                    ; 1 in each word where d1 != 0
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; a1 + b1
+        psubw       xmm2,       xmm1        ; a1 - b1
+
+        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
+        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
+
+        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
+
+        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 + _12000)>>16
+        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 + _12000)>>16
+        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 + _51000)>>16
+        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 + _51000)>>16
+
+        packssdw    xmm1,       xmm4        ; op[4]
+        packssdw    xmm3,       xmm5        ; op[12]
+
+        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
+
+        movdqa      xmm4,       xmm0
+        movdqa      xmm5,       xmm2
+
+        punpcklqdq  xmm0,       xmm1        ; low halves:  op[0..3]  op[4..7]   of columns 0-3
+        punpckhqdq  xmm4,       xmm1        ; high halves: op[0..3]  op[4..7]   of columns 4-7
+
+        punpcklqdq  xmm2,       xmm3        ; low halves:  op[8..11] op[12..15] of columns 0-3
+        punpckhqdq  xmm5,       xmm3        ; high halves: op[8..11] op[12..15] of columns 4-7
+
+        movdqa      XMMWORD PTR[output + 0 ],  xmm0
+        movdqa      XMMWORD PTR[output + 16],  xmm2
+        movdqa      XMMWORD PTR[output + 32],  xmm4
+        movdqa      XMMWORD PTR[output + 48],  xmm5
+
+    STACK_FRAME_DESTROY

 SECTION_RODATA
 align 16
@@ -161,7 +397,9 @@ align 16
 _cmp_mask:
    times 4 dw 1
    times 4 dw 0
-
+align 16
+_cmp_mask8x4:
+    times 8 dw 1
 align 16
 _mult_sub:
    dw 1
@@ -176,6 +414,9 @@ align 16
 _7:
    times 4 dd 7
 align 16
+_7w:
+    times 8 dw 7
+align 16
 _14500:
    times 4 dd 14500
 align 16
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -24,33 +24,31 @@ extern prototype_fdct(vp8_short_fdct4x4_mmx);
 extern prototype_fdct(vp8_short_fdct8x4_mmx);

 #if !CONFIG_RUNTIME_CPU_DETECT
-#if 0
+
 #undef  vp8_fdct_short4x4
 #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx

 #undef  vp8_fdct_short8x4
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
-#endif

 #endif
+
 #endif


 #if HAVE_SSE2
-extern prototype_fdct(vp8_short_fdct8x4_wmt);
+extern prototype_fdct(vp8_short_fdct8x4_sse2);
 extern prototype_fdct(vp8_short_walsh4x4_sse2);

 extern prototype_fdct(vp8_short_fdct4x4_sse2);

 #if !CONFIG_RUNTIME_CPU_DETECT
-#if 1
-/* short SSE2 DCT currently disabled, does not match the MMX version */
+
 #undef  vp8_fdct_short4x4
 #define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2

 #undef  vp8_fdct_short8x4
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
-#endif

 #undef  vp8_fdct_fast4x4
 #define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
@@ -58,7 +56,7 @@ extern prototype_fdct(vp8_short_fdct4x4_sse2);
 #undef  vp8_fdct_fast8x4
 #define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2

-#undef vp8_fdct_walsh_short4x4
+#undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4  vp8_short_walsh4x4_sse2

 #endif
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -17,6 +17,7 @@ sym(vp8_short_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
@@ -143,6 +144,7 @@ sym(vp8_short_walsh4x4_sse2):
    pop rdi
    pop rsi
    RESTORE_GOT
+    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
--- a/vp8/encoder/x86/mcomp_x86.h
+++ b/vp8/encoder/x86/mcomp_x86.h
@@ -24,5 +24,14 @@
 #endif
 #endif

+#if HAVE_SSE4_1
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sadx8
+
+#endif
+#endif
+
 #endif

--- a/vp8/encoder/x86/preproc_mmx.c
+++ b/vp8/encoder/x86/preproc_mmx.c
@@ -1,298 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "memory.h"
-#include "preproc.h"
-#include "pragmas.h"
-
-/****************************************************************************
-*  Macros
-****************************************************************************/
-#define FRAMECOUNT 7
-#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
-
-/****************************************************************************
-*  Imports
-****************************************************************************/
-extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
-
-/****************************************************************************
-*  Exported Global Variables
-****************************************************************************/
-void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
-
-/****************************************************************************
- *
- *  ROUTINE       : temp_filter_wmt
- *
- *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
- *                  unsigned char *s     : Pointer to source frame.
- *                  unsigned char *d     : Pointer to destination frame.
- *                  int bytes            : Number of bytes to filter.
- *                  int strength         : Strength of filter to apply.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs a closesness adjusted temporarl blur
- *
- *  SPECIAL NOTES : Destination frame can be same as source frame.
- *
- ****************************************************************************/
-void temp_filter_wmt
-(
-    pre_proc_instance *ppi,
-    unsigned char *s,
-    unsigned char *d,
-    int bytes,
-    int strength
-)
-{
-    int byte = 0;
-    unsigned char *frameptr = ppi->frame_buffer;
-
-    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3, 3, 3, 3, 3};
-    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
-
-    if (ppi->frame == 0)
-    {
-        do
-        {
-            int i;
-            int frame = 0;
-
-            do
-            {
-                for (i = 0; i < 8; i++)
-                {
-                    *frameptr = s[byte+i];
-                    ++frameptr;
-                }
-
-                ++frame;
-            }
-            while (frame < FRAMECOUNT);
-
-            for (i = 0; i < 8; i++)
-                d[byte+i] = s[byte+i];
-
-            byte += 8;
-
-        }
-        while (byte < bytes);
-    }
-    else
-    {
-        int i;
-        int offset2 = (ppi->frame % FRAMECOUNT);
-
-        do
-        {
-            __declspec(align(16)) unsigned short counts[8];
-            __declspec(align(16)) unsigned short sums[8];
-            __asm
-            {
-                mov         eax, offset2
-                mov         edi, s                  // source pixels
-                pxor        xmm1, xmm1              // accumulator
-
-                pxor        xmm7, xmm7
-
-                mov         esi, frameptr           // accumulator
-                pxor        xmm2, xmm2              // count
-
-                movq        xmm3, QWORD PTR [edi]
-
-                movq        QWORD PTR [esi+8*eax], xmm3
-
-                punpcklbw   xmm3, xmm2              // xmm3 source pixels
-                mov         ecx,  FRAMECOUNT
-
-                next_frame:
-                movq        xmm4, QWORD PTR [esi]   // get frame buffer values
-                punpcklbw   xmm4, xmm7              // xmm4 frame buffer pixels
-                movdqa      xmm6, xmm4              // save the pixel values
-                psubsw      xmm4, xmm3              // subtracted pixel values
-                pmullw      xmm4, xmm4              // square xmm4
-                movd        xmm5, strength
-                psrlw       xmm4, xmm5              // should be strength
-                pmullw      xmm4, threes            // 3 * modifier
-                movdqa      xmm5, sixteens          // 16s
-                psubusw     xmm5, xmm4              // 16 - modifiers
-                movdqa      xmm4, xmm5              // save the modifiers
-                pmullw      xmm4, xmm6              // multiplier values
-                paddusw     xmm1, xmm4              // accumulator
-                paddusw     xmm2, xmm5              // count
-                add         esi, 8                  // next frame
-                dec         ecx                     // next set of eight pixels
-                jnz         next_frame
-
-                movdqa      counts, xmm2
-                psrlw       xmm2, 1                 // divide count by 2 for rounding
-                paddusw     xmm1, xmm2              // rounding added in
-
-                mov         frameptr, esi
-
-                movdqa      sums, xmm1
-            }
-
-            for (i = 0; i < 8; i++)
-            {
-                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
-                blurvalue >>= 16;
-                d[i] = blurvalue;
-            }
-
-            s += 8;
-            d += 8;
-            byte += 8;
-        }
-        while (byte < bytes);
-    }
-
-    ++ppi->frame;
-    __asm emms
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : temp_filter_mmx
- *
- *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
- *                  unsigned char *s     : Pointer to source frame.
- *                  unsigned char *d     : Pointer to destination frame.
- *                  int bytes            : Number of bytes to filter.
- *                  int strength         : Strength of filter to apply.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs a closesness adjusted temporarl blur
- *
- *  SPECIAL NOTES : Destination frame can be same as source frame.
- *
- ****************************************************************************/
-void temp_filter_mmx
-(
-    pre_proc_instance *ppi,
-    unsigned char *s,
-    unsigned char *d,
-    int bytes,
-    int strength
-)
-{
-    int byte = 0;
-    unsigned char *frameptr = ppi->frame_buffer;
-
-    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3};
-    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
-
-    if (ppi->frame == 0)
-    {
-        do
-        {
-            int i;
-            int frame = 0;
-
-            do
-            {
-                for (i = 0; i < 4; i++)
-                {
-                    *frameptr = s[byte+i];
-                    ++frameptr;
-                }
-
-                ++frame;
-            }
-            while (frame < FRAMECOUNT);
-
-            for (i = 0; i < 4; i++)
-                d[byte+i] = s[byte+i];
-
-            byte += 4;
-
-        }
-        while (byte < bytes);
-    }
-    else
-    {
-        int i;
-        int offset2 = (ppi->frame % FRAMECOUNT);
-
-        do
-        {
-            __declspec(align(16)) unsigned short counts[8];
-            __declspec(align(16)) unsigned short sums[8];
-            __asm
-            {
-
-                mov         eax, offset2
-                mov         edi, s                  // source pixels
-                pxor        mm1, mm1                // accumulator
-                pxor        mm7, mm7
-
-                mov         esi, frameptr           // accumulator
-                pxor        mm2, mm2                // count
-
-                movd        mm3, DWORD PTR [edi]
-                movd        DWORD PTR [esi+4*eax], mm3
-
-                punpcklbw   mm3, mm2                // mm3 source pixels
-                mov         ecx,  FRAMECOUNT
-
-                next_frame:
-                movd        mm4, DWORD PTR [esi]    // get frame buffer values
-                punpcklbw   mm4, mm7                // mm4 frame buffer pixels
-                movq        mm6, mm4                // save the pixel values
-                psubsw      mm4, mm3                // subtracted pixel values
-                pmullw      mm4, mm4                // square mm4
-                movd        mm5, strength
-                psrlw       mm4, mm5                // should be strength
-                pmullw      mm4, threes             // 3 * modifier
-                movq        mm5, sixteens           // 16s
-                psubusw     mm5, mm4                // 16 - modifiers
-                movq        mm4, mm5                // save the modifiers
-                pmullw      mm4, mm6                // multiplier values
-                paddusw     mm1, mm4                // accumulator
-                paddusw     mm2, mm5                // count
-                add         esi, 4                  // next frame
-                dec         ecx                     // next set of eight pixels
-                jnz         next_frame
-
-                movq        counts, mm2
-                psrlw       mm2, 1                  // divide count by 2 for rounding
-                paddusw     mm1, mm2                // rounding added in
-
-                mov         frameptr, esi
-
-                movq        sums, mm1
-
-            }
-
-            for (i = 0; i < 4; i++)
-            {
-                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
-                blurvalue >>= 16;
-                d[i] = blurvalue;
-            }
-
-            s += 4;
-            d += 4;
-            byte += 4;
-        }
-        while (byte < bytes);
-    }
-
-    ++ppi->frame;
-    __asm emms
-}
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -253,10 +253,9 @@ rq_zigzag_1c:
    pop         rbp
    ret

-
 ;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
 ;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
+;                           short *inv_scan_order, short *round_ptr,
 ;                           short *quant_ptr, short *dqcoeff_ptr);
 global sym(vp8_fast_quantize_b_impl_sse2)
 sym(vp8_fast_quantize_b_impl_sse2):
@@ -265,32 +264,18 @@ sym(vp8_fast_quantize_b_impl_sse2):
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
-    push        rbx
    ; end prolog

-    ALIGN_STACK 16, rax
-
-    %define save_xmm6  0
-    %define save_xmm7 16
-
-    %define vp8_fastquantizeb_stack_size save_xmm7 + 16
-
-    sub         rsp, vp8_fastquantizeb_stack_size
-
-    movdqa      XMMWORD PTR[rsp + save_xmm6], xmm6
-    movdqa      XMMWORD PTR[rsp + save_xmm7], xmm7
-
    mov         rdx, arg(0)                 ;coeff_ptr
    mov         rcx, arg(2)                 ;dequant_ptr
-    mov         rax, arg(3)                 ;scan_mask
    mov         rdi, arg(4)                 ;round_ptr
    mov         rsi, arg(5)                 ;quant_ptr

    movdqa      xmm0, XMMWORD PTR[rdx]
    movdqa      xmm4, XMMWORD PTR[rdx + 16]

-    movdqa      xmm6, XMMWORD PTR[rdi]      ;round lo
-    movdqa      xmm7, XMMWORD PTR[rdi + 16] ;round hi
+    movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo
+    movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4
@@ -303,8 +288,8 @@ sym(vp8_fast_quantize_b_impl_sse2):
    psubw       xmm1, xmm0                  ;x = abs(z)
    psubw       xmm5, xmm4                  ;x = abs(z)

-    paddw       xmm1, xmm6
-    paddw       xmm5, xmm7
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm3

    pmulhw      xmm1, XMMWORD PTR[rsi]
    pmulhw      xmm5, XMMWORD PTR[rsi + 16]
@@ -312,8 +297,8 @@ sym(vp8_fast_quantize_b_impl_sse2):
    mov         rdi, arg(1)                 ;qcoeff_ptr
    mov         rsi, arg(6)                 ;dqcoeff_ptr

-    movdqa      xmm6, XMMWORD PTR[rcx]
-    movdqa      xmm7, XMMWORD PTR[rcx + 16]
+    movdqa      xmm2, XMMWORD PTR[rcx]
+    movdqa      xmm3, XMMWORD PTR[rcx + 16]

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
@@ -323,64 +308,47 @@ sym(vp8_fast_quantize_b_impl_sse2):
    movdqa      XMMWORD PTR[rdi], xmm1
    movdqa      XMMWORD PTR[rdi + 16], xmm5

-    pmullw      xmm6, xmm1
-    pmullw      xmm7, xmm5
+    pmullw      xmm2, xmm1
+    pmullw      xmm3, xmm5

-    movdqa      xmm2, XMMWORD PTR[rax]
-    movdqa      xmm3, XMMWORD PTR[rax+16];
+    mov         rdi, arg(3)                 ;inv_scan_order

-    pxor        xmm4, xmm4            ;clear all bits
+    ; Start with 16
+    pxor        xmm4, xmm4                  ;clear all bits
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm4

-    pcmpeqw     xmm4, xmm4            ;set all bits
+    pcmpeqw     xmm4, xmm4                  ;set all bits
    pxor        xmm1, xmm4
    pxor        xmm5, xmm4

-    psrlw       xmm1, 15
-    psrlw       xmm5, 15
+    pand        xmm1, XMMWORD PTR[rdi]
+    pand        xmm5, XMMWORD PTR[rdi+16]

-    pmaddwd     xmm1, xmm2
-    pmaddwd     xmm5, xmm3
+    pmaxsw      xmm1, xmm5

-    movq        xmm2, xmm1
-    movq        xmm3, xmm5
+    ; now down to 8
+    pshufd      xmm5, xmm1, 00001110b

-    psrldq      xmm1, 8
-    psrldq      xmm5, 8
+    pmaxsw      xmm1, xmm5

-    paddd       xmm1, xmm5
-    paddd       xmm2, xmm3
+    ; only 4 left
+    pshuflw     xmm5, xmm1, 00001110b

-    paddd       xmm1, xmm2
-    movq        xmm5, xmm1
+    pmaxsw      xmm1, xmm5

-    psrldq      xmm1, 4
-    paddd       xmm5, xmm1
+    ; okay, just 2!
+    pshuflw     xmm5, xmm1, 00000001b

-    movq        rcx,  xmm5
-    and         rcx,  0xffff
+    pmaxsw      xmm1, xmm5

-    xor         rdx,  rdx
-    sub         rdx,  rcx
+    movd        rax, xmm1
+    and         rax, 0xff

-    bsr         rax,  rcx
-    inc         rax
-
-    sar         rdx,  31
-    and         rax,  rdx
-
-    movdqa      XMMWORD PTR[rsi], xmm6        ;store dqcoeff
-    movdqa      XMMWORD PTR[rsi + 16], xmm7   ;store dqcoeff
-
-    movdqa      xmm6, XMMWORD PTR[rsp + save_xmm6]
-    movdqa      xmm7, XMMWORD PTR[rsp + save_xmm7]
-
-    add         rsp, vp8_fastquantizeb_stack_size
-    pop         rsp
+    movdqa      XMMWORD PTR[rsi], xmm2        ;store dqcoeff
+    movdqa      XMMWORD PTR[rsi + 16], xmm3   ;store dqcoeff

    ; begin epilog
-    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -0,0 +1,114 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
+;               short *qcoeff_ptr,short *dequant_ptr,
+;               short *round_ptr,
+;               short *quant_ptr, short *dqcoeff_ptr);
+; eax on exit: 1 + index (in zz_shuf order) of the last non-zero qcoeff, or 0 if all are zero
+global sym(vp8_fast_quantize_b_impl_ssse3)
+sym(vp8_fast_quantize_b_impl_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rdx, arg(0)                 ;coeff_ptr
+    mov         rdi, arg(3)                 ;round_ptr
+    mov         rsi, arg(4)                 ;quant_ptr
+
+    movdqa      xmm0, [rdx]                 ;z, coefficients 0-7
+    movdqa      xmm4, [rdx + 16]            ;z, coefficients 8-15
+
+    movdqa      xmm2, [rdi]                 ;round lo
+    movdqa      xmm3, [rdi + 16]            ;round hi
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    psraw       xmm0, 15                    ;sign of z (aka sz)
+    psraw       xmm4, 15                    ;sign of z (aka sz)
+
+    pabsw       xmm1, xmm1                  ;x = abs(z)
+    pabsw       xmm5, xmm5                  ;x = abs(z)
+
+    paddw       xmm1, xmm2                  ;x + round
+    paddw       xmm5, xmm3                  ;x + round
+
+    pmulhw      xmm1, [rsi]                 ;y = ((x + round) * quant) >> 16
+    pmulhw      xmm5, [rsi + 16]            ;y = ((x + round) * quant) >> 16
+
+    mov         rdi, arg(1)                 ;qcoeff_ptr
+    mov         rcx, arg(2)                 ;dequant_ptr
+    mov         rsi, arg(5)                 ;dqcoeff_ptr
+
+    pxor        xmm1, xmm0                  ;(y ^ sz) - sz puts the sign of z back on y
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    movdqa      [rdi], xmm1                 ;store qcoeff
+    movdqa      [rdi + 16], xmm5            ;store qcoeff
+
+    movdqa      xmm2, [rcx]                 ;dequant lo
+    movdqa      xmm3, [rcx + 16]            ;dequant hi
+
+    pxor        xmm4, xmm4                  ;zero for the compare below
+    pmullw      xmm2, xmm1                  ;dqcoeff = qcoeff * dequant
+    pmullw      xmm3, xmm5                  ;dqcoeff = qcoeff * dequant
+
+    pcmpeqw     xmm1, xmm4                  ;0xffff where qcoeff == 0
+    pcmpeqw     xmm5, xmm4                  ;0xffff where qcoeff == 0
+    packsswb    xmm1, xmm5                  ;one byte per coefficient, 0xff where zero
+    pshufb      xmm1, [ GLOBAL(zz_shuf)]    ;reorder the bytes by the zz_shuf table
+
+    pmovmskb    edx, xmm1                   ;bit i set when shuffled byte i marks a zero
+
+;    xor         ecx, ecx
+;    mov         eax, -1
+;find_eob_loop:
+;    shr         edx, 1
+;    jc          fq_skip
+;    mov         eax, ecx
+;fq_skip:
+;    inc         ecx
+;    cmp         ecx, 16
+;    jne         find_eob_loop
+    xor         rdi, rdi                    ;edi = 0 for the all-zero test below
+    mov         eax, -1
+    xor         dx, ax                      ;flip the bits for bsr
+    bsr         eax, edx                    ;index of the highest set bit (result not
+                                            ;defined when edx == 0; masked out below)
+    movdqa      [rsi], xmm2                 ;store dqcoeff
+    movdqa      [rsi + 16], xmm3            ;store dqcoeff
+
+    sub         edi, edx                    ;check for all zeros in bit mask
+    sar         edi, 31                     ;0 or -1
+    add         eax, 1
+    and         eax, edi                    ;if the bit mask was all zero,
+                                            ;then eob = 0
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+zz_shuf:
+    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
--- a/vp8/encoder/x86/sad_sse4.asm
+++ b/vp8/encoder/x86/sad_sse4.asm
@@ -0,0 +1,353 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X8 1                     ; two 16-byte src rows vs. 8 consecutive ref offsets; %1=1 starts the sums in xmm1, %1=0 adds to them
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]           ; src[0-15], aligned load
+        movq            xmm1,       MMWORD PTR [rdi]            ; ref[0-7]
+        movq            xmm3,       MMWORD PTR [rdi+8]          ; ref[8-15]
+        movq            xmm2,       MMWORD PTR [rdi+16]         ; ref[16-23]
+        punpcklqdq      xmm1,       xmm3                        ; xmm1 = ref[0-15]
+        punpcklqdq      xmm3,       xmm2                        ; xmm3 = ref[8-23]
+
+        movdqa          xmm2,       xmm1                        ; mpsadbw overwrites its destination, keep a copy
+        mpsadbw         xmm1,       xmm0,  0x0                  ; src[0-3]   vs ref[k..k+3],     k = 0..7
+        mpsadbw         xmm2,       xmm0,  0x5                  ; src[4-7]   vs ref[4+k..7+k]
+
+        psrldq          xmm0,       8                           ; move src[8-15] into the low qword
+
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0                  ; src[8-11]  vs ref[8+k..11+k]
+        mpsadbw         xmm4,       xmm0,  0x5                  ; src[12-15] vs ref[12+k..15+k]
+
+        paddw           xmm1,       xmm2
+        paddw           xmm1,       xmm3
+        paddw           xmm1,       xmm4                        ; xmm1 = eight word SADs for the first row
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]           ; same as above, but the row total is built in xmm5
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        movq            xmm2,       MMWORD PTR [rdi+16]
+        punpcklqdq      xmm5,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm5,       xmm2
+        paddw           xmm5,       xmm3
+        paddw           xmm5,       xmm4
+
+        paddw           xmm1,       xmm5                        ; fold the row into the running totals
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi + rax]     ; second row: src + src_stride (aligned load)
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]       ; second row: ref + ref_stride
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
+        punpcklqdq      xmm5,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        lea             rsi,        [rsi+rax*2]                 ; advance both pointers by two rows
+        lea             rdi,        [rdi+rdx*2]
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm5,       xmm2
+        paddw           xmm5,       xmm3
+        paddw           xmm5,       xmm4
+
+        paddw           xmm1,       xmm5                        ; xmm1 += second-row SADs
+%endmacro
+
+%macro PROCESS_8X2X8 1                      ; two 8-byte src rows vs. 8 consecutive ref offsets; %1=1 starts the sums in xmm1, %1=0 adds to them
+%if %1
+        movq            xmm0,       MMWORD PTR [rsi]            ; src[0-7]
+        movq            xmm1,       MMWORD PTR [rdi]            ; ref[0-7]
+        movq            xmm3,       MMWORD PTR [rdi+8]          ; ref[8-15]
+        punpcklqdq      xmm1,       xmm3                        ; xmm1 = ref[0-15]
+
+        movdqa          xmm2,       xmm1                        ; mpsadbw overwrites its destination, keep a copy
+        mpsadbw         xmm1,       xmm0,  0x0                  ; src[0-3] vs ref[k..k+3],   k = 0..7
+        mpsadbw         xmm2,       xmm0,  0x5                  ; src[4-7] vs ref[4+k..7+k]
+        paddw           xmm1,       xmm2                        ; xmm1 = eight word SADs for the first row
+%else
+        movq            xmm0,       MMWORD PTR [rsi]            ; same as above, but the row total is built in xmm5
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm5,       xmm3
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm5,       xmm2
+
+        paddw           xmm1,       xmm5                        ; fold the row into the running totals
+%endif
+        movq            xmm0,       MMWORD PTR [rsi + rax]      ; second row: src + src_stride
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]       ; second row: ref + ref_stride
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        punpcklqdq      xmm5,       xmm3
+
+        lea             rsi,        [rsi+rax*2]                 ; advance both pointers by two rows
+        lea             rdi,        [rdi+rdx*2]
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm5,       xmm2
+
+        paddw           xmm1,       xmm5                        ; xmm1 += second-row SADs
+%endmacro
+
+%macro PROCESS_4X2X8 1                      ; two 4-byte src rows vs. 8 consecutive ref offsets; %1=1 starts the sums in xmm1, %1=0 adds to them
+%if %1
+        movd            xmm0,       [rsi]                       ; src[0-3]
+        movq            xmm1,       MMWORD PTR [rdi]            ; ref[0-7]
+        movq            xmm3,       MMWORD PTR [rdi+8]          ; ref[8-15]
+        punpcklqdq      xmm1,       xmm3                        ; xmm1 = ref[0-15]
+
+        mpsadbw         xmm1,       xmm0,  0x0                  ; src[0-3] vs ref[k..k+3], k = 0..7
+%else
+        movd            xmm0,       [rsi]                       ; same as above, but the row total is built in xmm5
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm5,       xmm3
+
+        mpsadbw         xmm5,       xmm0,  0x0
+
+        paddw           xmm1,       xmm5                        ; fold the row into the running totals
+%endif
+        movd            xmm0,       [rsi + rax]                 ; second row: src + src_stride
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]       ; second row: ref + ref_stride
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        punpcklqdq      xmm5,       xmm3
+
+        lea             rsi,        [rsi+rax*2]                 ; advance both pointers by two rows
+        lea             rdi,        [rdi+rdx*2]
+
+        mpsadbw         xmm5,       xmm0,  0x0
+
+        paddw           xmm1,       xmm5                        ; xmm1 += second-row SADs
+%endmacro
+
+
+;void vp8_sad16x16x8_sse4(                   ; 16x16 SAD of src against ref at 8 consecutive byte offsets
+;    const unsigned char *src_ptr,           ; rows are loaded with movdqa, so each row must be 16-byte aligned
+;    int  src_stride,
+;    const unsigned char *ref_ptr,           ; 24 bytes are read per row, with movq (no alignment needed)
+;    int  ref_stride,
+;    unsigned short *sad_array);             ; out: 8 word sums, sad_array[k] is the SAD at ref_ptr + k
+global sym(vp8_sad16x16x8_sse4)
+sym(vp8_sad16x16x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_16X2X8 1                             ; rows 0-1, initializes xmm1
+        PROCESS_16X2X8 0                             ; each further call adds two more rows
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0                             ; rows 14-15
+
+        mov             rdi,        arg(4)           ;Results
+        movdqu          XMMWORD PTR [rdi],    xmm1   ; unaligned store of the 8 sums
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_sad16x8x8_sse4(                    ; 16-wide, 8-row SAD of src against ref at 8 consecutive byte offsets
+;    const unsigned char *src_ptr,           ; rows are loaded with movdqa, so each row must be 16-byte aligned
+;    int  src_stride,
+;    const unsigned char *ref_ptr,           ; 24 bytes are read per row, with movq (no alignment needed)
+;    int  ref_stride,
+;    unsigned short *sad_array               ; out: 8 word sums, sad_array[k] is the SAD at ref_ptr + k
+;);
+global sym(vp8_sad16x8x8_sse4)
+sym(vp8_sad16x8x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_16X2X8 1                             ; rows 0-1, initializes xmm1
+        PROCESS_16X2X8 0                             ; each further call adds two more rows
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0                             ; rows 6-7
+
+        mov             rdi,        arg(4)           ;Results
+        movdqu          XMMWORD PTR [rdi],    xmm1   ; unaligned store of the 8 sums
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_sad8x8x8_sse4(                     ; 8x8 SAD of src against ref at 8 consecutive byte offsets
+;    const unsigned char *src_ptr,           ; 8 bytes per row, loaded with movq (no alignment needed)
+;    int  src_stride,
+;    const unsigned char *ref_ptr,           ; 16 bytes are read per row
+;    int  ref_stride,
+;    unsigned short *sad_array               ; out: 8 word sums, sad_array[k] is the SAD at ref_ptr + k
+;);
+global sym(vp8_sad8x8x8_sse4)
+sym(vp8_sad8x8x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_8X2X8 1                              ; rows 0-1, initializes xmm1
+        PROCESS_8X2X8 0                              ; each further call adds two more rows
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0                              ; rows 6-7
+
+        mov             rdi,        arg(4)           ;Results
+        movdqu          XMMWORD PTR [rdi],    xmm1   ; unaligned store of the 8 sums
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_sad8x16x8_sse4(                    ; 8-wide, 16-row SAD of src against ref at 8 consecutive byte offsets
+;    const unsigned char *src_ptr,           ; 8 bytes per row, loaded with movq (no alignment needed)
+;    int  src_stride,
+;    const unsigned char *ref_ptr,           ; 16 bytes are read per row
+;    int  ref_stride,
+;    unsigned short *sad_array               ; out: 8 word sums, sad_array[k] is the SAD at ref_ptr + k
+;);
+global sym(vp8_sad8x16x8_sse4)
+sym(vp8_sad8x16x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_8X2X8 1                              ; rows 0-1, initializes xmm1
+        PROCESS_8X2X8 0                              ; each further call adds two more rows
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0                              ; rows 14-15
+        mov             rdi,        arg(4)           ;Results
+        movdqu          XMMWORD PTR [rdi],    xmm1   ; unaligned store of the 8 sums
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_sad4x4x8_sse4(                     ; 4x4 SAD of src against ref at 8 consecutive byte offsets
+;    const unsigned char *src_ptr,           ; 4 bytes per row, loaded with movd
+;    int  src_stride,
+;    const unsigned char *ref_ptr,           ; 16 bytes are read per row, with movq
+;    int  ref_stride,
+;    unsigned short *sad_array               ; out: 8 word sums, sad_array[k] is the SAD at ref_ptr + k
+;);
+global sym(vp8_sad4x4x8_sse4)
+sym(vp8_sad4x4x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_4X2X8 1                              ; rows 0-1, initializes xmm1
+        PROCESS_4X2X8 0                              ; rows 2-3
+
+        mov             rdi,        arg(4)           ;Results
+        movdqu          XMMWORD PTR [rdi],    xmm1   ; unaligned store of the 8 sums
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -77,6 +77,7 @@ sym(vp8_subtract_mby_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM
    GET_GOT     rbx
    push rsi
    push rdi
@@ -138,6 +139,7 @@ submby_loop:
    pop rsi
    ; begin epilog
    RESTORE_GOT
+    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp8_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0  source block, rows are `stride` bytes apart
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2  predictor, contiguous, read 16 bytes at a time with movdqa (aligned)
+;   unsigned int    block_size,       |  3  8 or 16
+;   int             strength,         |  4  right-shift applied to 3*(src-pred)^2
+;   int             filter_weight,    |  5
+;   unsigned int   *accumulator,      |  6  32 bits per pixel, += modifier * pred (aligned)
+;   unsigned short *count)            |  7  16 bits per pixel, += modifier        (aligned)
+global sym(vp8_temporal_filter_apply_sse2)
+sym(vp8_temporal_filter_apply_sse2):
+    ; per pixel: modifier = (16 -sat ((3*(src-pred)^2 + round) >> strength)) * filter_weight
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    %define block_size    0
+    %define strength      16
+    %define filter_weight 32
+    %define rounding_bit  48
+    %define rbp_backup    64
+    %define stack_size    80
+    sub         rsp,           stack_size
+    mov         [rsp + rbp_backup], rbp    ; rbp is reused for the stride below
+    ; end prolog
+
+        mov         rdx,            arg(3)
+        mov         [rsp + block_size], rdx
+        movd        xmm6,            arg(4)
+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+        ; calculate the rounding bit outside the loop
+        ; 0x8000 >> (16 - strength)
+        mov         rdx,            16
+        sub         rdx,            arg(4) ; 16 - strength
+        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
+        psrlw       xmm5,           xmm4
+        movdqa      [rsp + rounding_bit], xmm5
+
+        mov         rsi,            arg(0) ; src/frame1
+        mov         rdx,            arg(2) ; predictor frame
+        mov         rdi,            arg(6) ; accumulator
+        mov         rax,            arg(7) ; count
+
+        ; dup the filter weight and store for later
+        movd        xmm0,           arg(5) ; filter_weight
+        pshuflw     xmm0,           xmm0, 0 ; low four words = filter_weight
+        punpcklwd   xmm0,           xmm0    ; all eight words = filter_weight
+        movdqa      [rsp + filter_weight], xmm0
+
+        mov         rbp,            arg(1) ; stride
+        pxor        xmm7,           xmm7   ; zero for extraction
+
+        lea         rcx,            [rdx + 16*16*1] ; end of a 16x16 predictor
+        cmp         dword ptr [rsp + block_size], 8
+        jne         temporal_filter_apply_load_16
+        lea         rcx,            [rdx + 8*8*1]   ; end of an 8x8 predictor
+
+temporal_filter_apply_load_8:
+        movq        xmm0,           [rsi]  ; first row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        movq        xmm1,           [rsi]  ; second row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
+        jmp         temporal_filter_apply_load_finished
+
+temporal_filter_apply_load_16:
+        movdqu      xmm0,           [rsi]  ; src (frame1)
+        lea         rsi,            [rsi + rbp] ; += stride
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
+
+temporal_filter_apply_load_finished:
+        movdqa      xmm2,           [rdx]  ; predictor (frame2)
+        movdqa      xmm3,           xmm2
+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
+
+        ; modifier = src_byte - pixel_value
+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
+
+        ; modifier *= modifier
+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
+
+        ; modifier *= 3
+        pmullw      xmm0,           [GLOBAL(_const_3w)]
+        pmullw      xmm1,           [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+        paddw       xmm0,           [rsp + rounding_bit]
+        paddw       xmm1,           [rsp + rounding_bit]
+
+        ; modifier >>= strength
+        psrlw       xmm0,           [rsp + strength]
+        psrlw       xmm1,           [rsp + strength]
+
+        ; modifier = 16 - modifier
+        ; saturation takes care of modifier > 16
+        movdqa      xmm3,           [GLOBAL(_const_16w)]
+        movdqa      xmm2,           [GLOBAL(_const_16w)]
+        psubusw     xmm3,           xmm1
+        psubusw     xmm2,           xmm0
+
+        ; modifier *= filter_weight
+        pmullw      xmm2,           [rsp + filter_weight]
+        pmullw      xmm3,           [rsp + filter_weight]
+
+        ; count
+        movdqa      xmm4,           [rax]
+        movdqa      xmm5,           [rax+16]
+        ; += modifier
+        paddw       xmm4,           xmm2
+        paddw       xmm5,           xmm3
+        ; write back
+        movdqa      [rax],          xmm4
+        movdqa      [rax+16],       xmm5
+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
+
+        ; load and extract the predictor up to shorts
+        pxor        xmm7,           xmm7
+        movdqa      xmm0,           [rdx]
+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
+
+        ; modifier *= pixel_value
+        pmullw      xmm0,           xmm2
+        pmullw      xmm1,           xmm3
+
+        ; expand to double words
+        movdqa      xmm2,           xmm0
+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
+        movdqa      xmm3,           xmm1
+        punpcklwd   xmm1,           xmm7   ; [ 8-11]
+        punpckhwd   xmm3,           xmm7   ; [12-15]
+
+        ; accumulator
+        movdqa      xmm4,           [rdi]
+        movdqa      xmm5,           [rdi+16]
+        movdqa      xmm6,           [rdi+32]
+        movdqa      xmm7,           [rdi+48]
+        ; += modifier * pixel_value; dword adds so carries reach the upper 16 bits of each 32-bit sum
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
+        ; write back
+        movdqa      [rdi],          xmm4
+        movdqa      [rdi+16],       xmm5
+        movdqa      [rdi+32],       xmm6
+        movdqa      [rdi+48],       xmm7
+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+        cmp         rdx,            rcx    ; reached the end of the predictor?
+        je          temporal_filter_apply_epilog
+        pxor        xmm7,           xmm7   ; zero for extraction
+        cmp         dword ptr [rsp + block_size], 16
+        je          temporal_filter_apply_load_16
+        jmp         temporal_filter_apply_load_8
+
+temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,            [rsp + rbp_backup]
+    add         rsp,            stack_size
+    pop         rsp                        ; undo ALIGN_STACK
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+_const_3w:                                  ; eight words of 3: scales the squared difference
+    times 8 dw 3
+align 16
+_const_top_bit:                             ; eight words of 0x8000: shifted right to form the rounding bit
+    times 8 dw 1<<15
+align 16
+_const_16w:                                 ; eight words of 16: upper bound of the per-pixel modifier
+    times 8 dw 16
--- a/vp8/encoder/x86/temporal_filter_x86.h
+++ b/vp8/encoder/x86/temporal_filter_x86.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_TEMPORAL_FILTER_X86_H
+#define __INC_VP8_TEMPORAL_FILTER_X86_H
+
+#if HAVE_SSE2 /* x86 temporal-filter entry points; only an SSE2 version is declared */
+extern prototype_apply(vp8_temporal_filter_apply_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+/* No runtime CPU detection: bind the generic name to the SSE2 version at compile time. */
+#undef  vp8_temporal_filter_apply
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE2 */
+
+#endif /* __INC_VP8_TEMPORAL_FILTER_X86_H */
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -297,4 +297,31 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
 #endif
 #endif

+
+#if HAVE_SSE4_1 /* SSE4.1 SAD routines that each produce 8 results per call (the "x8" variants) */
+extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4);
+
+#if !CONFIG_RUNTIME_CPU_DETECT /* static dispatch: rebind the generic names to the SSE4 versions */
+#undef  vp8_variance_sad16x16x8
+#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4
+
+#undef  vp8_variance_sad16x8x8
+#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4
+
+#undef  vp8_variance_sad8x16x8
+#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4
+
+#undef  vp8_variance_sad8x8x8
+#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4
+
+#undef  vp8_variance_sad4x4x8
+#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+#endif /* HAVE_SSE4_1 */
+
 #endif
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -18,11 +18,10 @@
 #if HAVE_MMX
 void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
 {
-    vp8_short_fdct4x4_c(input,   output,    pitch);
-    vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
+    vp8_short_fdct4x4_mmx(input,   output,    pitch);
+    vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
 }

-
 int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
                                 short *scan_mask, short *round_ptr,
@@ -33,7 +32,7 @@ void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
    short *coeff_ptr   = b->coeff;
    short *zbin_ptr    = b->zbin;
    short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant;
+    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;
@@ -82,22 +81,16 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
 #endif

 #if HAVE_SSE2
-void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
-{
-    vp8_short_fdct4x4_sse2(input,   output,    pitch);
-    vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
-}
-
 int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
-                                 short *scan_mask, short *round_ptr,
+                                 const short *inv_scan_order, short *round_ptr,
                                 short *quant_ptr, short *dqcoeff_ptr);
 void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
    short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
    short *coeff_ptr   = b->coeff;
    short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant;
+    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;
@@ -106,8 +99,7 @@ void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
                 coeff_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
-                 scan_mask,
-
+                 vp8_default_inv_zig_zag,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
@@ -179,6 +171,25 @@ void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)

 #endif

+#if HAVE_SSSE3
+/* SSSE3 kernel, defined outside this file; its int result is stored as d->eob below. */
+int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr, short *qcoeff_ptr,
+                                   short *dequant_ptr, short *round_ptr,
+                                   short *quant_ptr, short *dqcoeff_ptr);
+
+/* Runs the SSSE3 fast quantizer on block b / descriptor d and records the result in d->eob. */
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
+{
+    const int eob = vp8_fast_quantize_b_impl_ssse3(b->coeff, d->qcoeff,
+                                                   d->dequant, b->round,
+                                                   b->quant_fast,
+                                                   d->dqcoeff);
+
+    d->eob = eob;
+}
+#endif
+
+
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -188,6 +199,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
    int wmt_enabled = flags & HAS_SSE2;
    int SSE3Enabled = flags & HAS_SSE3;
    int SSSE3Enabled = flags & HAS_SSSE3;
+    int SSE4_1Enabled = flags & HAS_SSE4_1;

    /* Note:
     *
@@ -198,7 +210,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)

    /* Override default functions with fastest ones for this CPU. */
 #if HAVE_MMX
-
    if (mmx_enabled)
    {
        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_mmx;
@@ -230,18 +241,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_mmx;
        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_mmx;
        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_mmx;
-#if 0 // new fdct
+
        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_mmx;
        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_mmx;
-#else
-        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
-        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_c;
-
-#endif

        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;

@@ -254,10 +258,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)

        /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_mmx;*/
    }
-
 #endif
-#if HAVE_SSE2

+#if HAVE_SSE2
    if (wmt_enabled)
    {
        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_wmt;
@@ -306,11 +309,12 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)

        /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;
+
+        cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_sse2;
    }
-
 #endif
-#if HAVE_SSE3

+#if HAVE_SSE3
    if (SSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_sse3;
@@ -319,8 +323,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
        cpi->rtcd.variance.sad8x16x3             = vp8_sad8x16x3_sse3;
        cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_sse3;
        cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_sse3;
+#if !(CONFIG_REALTIME_ONLY)
        cpi->rtcd.search.full_search             = vp8_full_search_sadx3;
-
+#endif
        cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_sse3;
        cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_sse3;
        cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_sse3;
@@ -328,16 +333,32 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
        cpi->rtcd.variance.sad4x4x4d             = vp8_sad4x4x4d_sse3;
        cpi->rtcd.search.diamond_search          = vp8_diamond_search_sadx4;
    }
-
 #endif
-#if HAVE_SSSE3

+#if HAVE_SSSE3
    if (SSSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_ssse3;
        cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_ssse3;
+
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;
+
    }
+#endif
+
+#if HAVE_SSE4_1
+    if (SSE4_1Enabled)
+    {
+        cpi->rtcd.variance.sad16x16x8            = vp8_sad16x16x8_sse4;
+        cpi->rtcd.variance.sad16x8x8             = vp8_sad16x8x8_sse4;
+        cpi->rtcd.variance.sad8x16x8             = vp8_sad8x16x8_sse4;
+        cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_sse4;
+        cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_sse4;
+#if !(CONFIG_REALTIME_ONLY)
+        cpi->rtcd.search.full_search             = vp8_full_search_sadx8;
+#endif
+    }
+#endif

-#endif
 #endif
 }
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -17,7 +17,6 @@ VP8_COMMON_SRCS-yes += common/type_aliases.h
 VP8_COMMON_SRCS-yes += common/pragmas.h

 CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)common
-VP8_COMMON_SRCS-yes += common/preproc.h
 VP8_COMMON_SRCS-yes += common/vpxerrors.h

 CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)common
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -37,6 +37,8 @@ struct vp8_extracfg
    unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */
    unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
    unsigned int                arnr_type;        /* alt_ref filter type */
+    vp8e_tuning                 tuning;
+    unsigned int                cq_level;         /* constrained quality level */

 };

@@ -67,6 +69,8 @@ static const struct extraconfig_map extracfg_map[] =
            0,                          /* arnr_max_frames */
            3,                          /* arnr_strength */
            3,                          /* arnr_type*/
+            0,                          /* tuning*/
+            10,                         /* cq_level */
        }
    }
 };
@@ -104,6 +108,7 @@ update_error_state(vpx_codec_alg_priv_t                 *ctx,
 }


+#undef ERROR
 #define ERROR(str) do {\
        ctx->base.err_detail = str;\
        return VPX_CODEC_INVALID_PARAM;\
@@ -132,8 +137,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
                                       const vpx_codec_enc_cfg_t *cfg,
                                       const struct vp8_extracfg *vp8_cfg)
 {
-    RANGE_CHECK(cfg, g_w,                   2, 16384);
-    RANGE_CHECK(cfg, g_h,                   2, 16384);
+    RANGE_CHECK(cfg, g_w,                   1, 16384);
+    RANGE_CHECK(cfg, g_h,                   1, 16384);
    RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
    RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
    RANGE_CHECK_HI(cfg, g_profile,          3);
@@ -145,7 +150,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
 #else
    RANGE_CHECK_HI(cfg, g_lag_in_frames,    0);
 #endif
-    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CBR);
+    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
    RANGE_CHECK_HI(cfg, rc_undershoot_pct,  100);
    RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
    RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
@@ -187,7 +192,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
    RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
    RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
    RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
+    RANGE_CHECK(vp8_cfg, cq_level, 0, 63);

+#if !(CONFIG_REALTIME_ONLY)
    if (cfg->g_pass == VPX_RC_LAST_PASS)
    {
        int              mb_r = (cfg->g_h + 15) / 16;
@@ -211,6 +218,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
        if ((int)(stats->count + 0.5) != n_packets - 1)
            ERROR("rc_twopass_stats_in missing EOS stats packet");
    }
+#endif

    return VPX_CODEC_OK;
 }
@@ -295,11 +303,16 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
    {
        oxcf->end_usage          = USAGE_STREAM_FROM_SERVER;
    }
+    else if (cfg.rc_end_usage == VPX_CQ)
+    {
+        oxcf->end_usage          = USAGE_CONSTRAINED_QUALITY;
+    }

    oxcf->target_bandwidth       = cfg.rc_target_bitrate;

    oxcf->best_allowed_q          = cfg.rc_min_quantizer;
    oxcf->worst_allowed_q         = cfg.rc_max_quantizer;
+    oxcf->cq_level                = vp8_cfg.cq_level;
    oxcf->fixed_q = -1;

    oxcf->under_shoot_pct         = cfg.rc_undershoot_pct;
@@ -335,6 +348,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
    oxcf->arnr_strength =  vp8_cfg.arnr_strength;
    oxcf->arnr_type =      vp8_cfg.arnr_type;

+    oxcf->tuning = vp8_cfg.tuning;

    /*
        printf("Current VP8 Settings: \n");
@@ -448,6 +462,8 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
        MAP(VP8E_SET_ARNR_MAXFRAMES,        xcfg.arnr_max_frames);
        MAP(VP8E_SET_ARNR_STRENGTH ,        xcfg.arnr_strength);
        MAP(VP8E_SET_ARNR_TYPE     ,        xcfg.arnr_type);
+        MAP(VP8E_SET_TUNING,                xcfg.tuning);
+        MAP(VP8E_SET_CQ_LEVEL,              xcfg.cq_level);

    }

@@ -860,8 +876,16 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
 {

    YV12_BUFFER_CONFIG sd;
+    vp8_ppflags_t flags = {0};

-    if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, ctx->preview_ppcfg.deblocking_level, ctx->preview_ppcfg.noise_level, ctx->preview_ppcfg.post_proc_flag))
+    if (ctx->preview_ppcfg.post_proc_flag)
+    {
+        flags.post_proc_flag        = ctx->preview_ppcfg.post_proc_flag;
+        flags.deblocking_level      = ctx->preview_ppcfg.deblocking_level;
+        flags.noise_level           = ctx->preview_ppcfg.noise_level;
+    }
+
+    if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, &flags))
    {

        /*
@@ -1020,6 +1044,8 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
    {VP8E_SET_ARNR_MAXFRAMES,           set_param},
    {VP8E_SET_ARNR_STRENGTH ,           set_param},
    {VP8E_SET_ARNR_TYPE     ,           set_param},
+    {VP8E_SET_TUNING,                   set_param},
+    {VP8E_SET_CQ_LEVEL,                 set_param},
    { -1, NULL},
 };

@@ -1055,7 +1081,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =

        4,                  /* rc_min_quantizer */
        63,                 /* rc_max_quantizer */
-
        95,                 /* rc_undershoot_pct */
        200,                /* rc_overshoot_pct */

--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -65,12 +65,19 @@ struct vpx_codec_alg_priv
    vpx_codec_priv_t        base;
    vpx_codec_mmap_t        mmaps[NELEMENTS(vp8_mem_req_segs)-1];
    vpx_codec_dec_cfg_t     cfg;
-    vp8_stream_info_t   si;
+    vp8_stream_info_t       si;
    int                     defer_alloc;
    int                     decoder_init;
    VP8D_PTR                pbi;
    int                     postproc_cfg_set;
    vp8_postproc_cfg_t      postproc_cfg;
+#if CONFIG_POSTPROC_VISUALIZER
+    unsigned int            dbg_postproc_flag;
+    int                     dbg_color_ref_frame_flag;
+    int                     dbg_color_mb_modes_flag;
+    int                     dbg_color_b_modes_flag;
+    int                     dbg_display_mv_flag;
+#endif
    vpx_image_t             img;
    int                     img_setup;
    int                     img_avail;
@@ -253,8 +260,11 @@ static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
                                   unsigned int           data_sz,
                                   vpx_codec_stream_info_t *si)
 {
-
    vpx_codec_err_t res = VPX_CODEC_OK;
+
+    if(data + data_sz <= data)
+        res = VPX_CODEC_INVALID_PARAM;
+    else
    {
        /* Parse uncompresssed part of key frame header.
         * 3 bytes:- including version, frame type and an offset
@@ -331,7 +341,10 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,

    ctx->img_avail = 0;

-    /* Determine the stream parameters */
+    /* Determine the stream parameters. Note that we rely on peek_si to
+     * validate that we have a buffer that does not wrap around the top
+     * of the heap.
+     */
    if (!ctx->si.h)
        res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);

@@ -410,15 +423,27 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
    {
        YV12_BUFFER_CONFIG sd;
        INT64 time_stamp = 0, time_end_stamp = 0;
-        int ppflag       = 0;
-        int ppdeblocking = 0;
-        int ppnoise      = 0;
+        vp8_ppflags_t flags = {0};

        if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
        {
-            ppflag      = ctx->postproc_cfg.post_proc_flag;
-            ppdeblocking = ctx->postproc_cfg.deblocking_level;
-            ppnoise     = ctx->postproc_cfg.noise_level;
+            flags.post_proc_flag= ctx->postproc_cfg.post_proc_flag
+#if CONFIG_POSTPROC_VISUALIZER
+
+                                | ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0)
+                                | ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+                                | ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+                                | ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0)
+#endif
+                                ;
+            flags.deblocking_level      = ctx->postproc_cfg.deblocking_level;
+            flags.noise_level           = ctx->postproc_cfg.noise_level;
+#if CONFIG_POSTPROC_VISUALIZER
+            flags.display_ref_frame_flag= ctx->dbg_color_ref_frame_flag;
+            flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
+            flags.display_b_modes_flag  = ctx->dbg_color_b_modes_flag;
+            flags.display_mv_flag       = ctx->dbg_display_mv_flag;
+#endif
        }

        if (vp8dx_receive_compressed_data(ctx->pbi, data_sz, data, deadline))
@@ -427,7 +452,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
            res = update_error_state(ctx, &pbi->common.error);
        }

-        if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, ppdeblocking, ppnoise, ppflag))
+        if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags))
        {
            /* Align width/height */
            unsigned int a_w = (sd.y_width + 15) & ~15;
@@ -441,6 +466,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
            vpx_img_set_rect(&ctx->img,
                             VP8BORDERINPIXELS, VP8BORDERINPIXELS,
                             sd.y_width, sd.y_height);
+            ctx->img.user_priv = user_priv;
            ctx->img_avail = 1;

        }
@@ -640,12 +666,79 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
 #endif
 }

+static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
+                                        int ctrl_id,
+                                        va_list args)
+{
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+    int data = va_arg(args, int);
+
+#define MAP(id, var) case id: var = data; break;
+
+    switch (ctrl_id)
+    {
+        MAP (VP8_SET_DBG_COLOR_REF_FRAME,   ctx->dbg_color_ref_frame_flag);
+        MAP (VP8_SET_DBG_COLOR_MB_MODES,    ctx->dbg_color_mb_modes_flag);
+        MAP (VP8_SET_DBG_COLOR_B_MODES,     ctx->dbg_color_b_modes_flag);
+        MAP (VP8_SET_DBG_DISPLAY_MV,        ctx->dbg_display_mv_flag);
+    }
+
+    return VPX_CODEC_OK;
+#else
+    return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+                                                int ctrl_id,
+                                                va_list args)
+{
+    int *update_info = va_arg(args, int *);
+    VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+
+    if (update_info)
+    {
+        *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
+            + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
+            + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
+
+        return VPX_CODEC_OK;
+    }
+    else
+        return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+                                               int ctrl_id,
+                                               va_list args)
+{
+
+    int *corrupted = va_arg(args, int *);
+
+    if (corrupted)
+    {
+        VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+        *corrupted = pbi->common.frame_to_show->corrupted;
+
+        return VPX_CODEC_OK;
+    }
+    else
+        return VPX_CODEC_INVALID_PARAM;
+
+}

 vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
 {
-    {VP8_SET_REFERENCE,  vp8_set_reference},
-    {VP8_COPY_REFERENCE, vp8_get_reference},
-    {VP8_SET_POSTPROC,   vp8_set_postproc},
+    {VP8_SET_REFERENCE,             vp8_set_reference},
+    {VP8_COPY_REFERENCE,            vp8_get_reference},
+    {VP8_SET_POSTPROC,              vp8_set_postproc},
+    {VP8_SET_DBG_COLOR_REF_FRAME,   vp8_set_dbg_options},
+    {VP8_SET_DBG_COLOR_MB_MODES,    vp8_set_dbg_options},
+    {VP8_SET_DBG_COLOR_B_MODES,     vp8_set_dbg_options},
+    {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_options},
+    {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
+    {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
    { -1, NULL},
 };

--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -94,6 +94,7 @@ VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/dct_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/variance_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
@@ -107,8 +108,11 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm

--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c
@@ -118,7 +118,9 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t    *ctx,
 {
    vpx_codec_err_t res;

-    if (!ctx || !data || !data_sz)
+    /* Sanity checks */
+    /* NULL data ptr allowed if data_sz is 0 too */
+    if (!ctx || (!data && data_sz))
        res = VPX_CODEC_INVALID_PARAM;
    else if (!ctx->iface || !ctx->priv)
        res = VPX_CODEC_ERROR;
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -38,9 +38,13 @@
 */
 enum vp8_dec_control_id
 {
-    VP8_SET_REFERENCE       = 1,    /**< pass in an external frame into decoder to be used as reference frame */
-    VP8_COPY_REFERENCE      = 2,    /**< get a copy of reference frame from the decoder */
-    VP8_SET_POSTPROC        = 3,    /**< set decoder's the post processing settings  */
+    VP8_SET_REFERENCE           = 1,    /**< pass in an external frame into decoder to be used as reference frame */
+    VP8_COPY_REFERENCE          = 2,    /**< get a copy of reference frame from the decoder */
+    VP8_SET_POSTPROC            = 3,    /**< set the decoder's post processing settings  */
+    VP8_SET_DBG_COLOR_REF_FRAME = 4,    /**< set the reference frames to color for each macroblock */
+    VP8_SET_DBG_COLOR_MB_MODES  = 5,    /**< set which macro block modes to color */
+    VP8_SET_DBG_COLOR_B_MODES   = 6,    /**< set which block modes to color */
+    VP8_SET_DBG_DISPLAY_MV      = 7,    /**< set which motion vector modes to draw */
    VP8_COMMON_CTRL_ID_MAX
 };

@@ -50,10 +54,14 @@ enum vp8_dec_control_id
 */
 enum vp8_postproc_level
 {
-    VP8_NOFILTERING    = 0,
-    VP8_DEBLOCK        = 1,
-    VP8_DEMACROBLOCK   = 2,
-    VP8_ADDNOISE       = 4
+    VP8_NOFILTERING             = 0,
+    VP8_DEBLOCK                 = 1<<0,
+    VP8_DEMACROBLOCK            = 1<<1,
+    VP8_ADDNOISE                = 1<<2,
+    VP8_DEBUG_TXT_FRAME_INFO    = 1<<3, /**< print frame information */
+    VP8_DEBUG_TXT_MBLK_MODES    = 1<<4, /**< print macro block modes over each macro block */
+    VP8_DEBUG_TXT_DC_DIFF       = 1<<5, /**< print dc diff for each macro block */
+    VP8_DEBUG_TXT_RATE_INFO     = 1<<6, /**< print video rate info (encoder only) */
 };

 /*!\brief post process flags
@@ -65,9 +73,9 @@ enum vp8_postproc_level

 typedef struct vp8_postproc_cfg
 {
-    int post_proc_flag;           /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */
-    int deblocking_level;        /**< the strength of deblocking, valid range [0, 16] */
-    int noise_level;             /**< the strength of additive noise, valid range [0, 16] */
+    int post_proc_flag;         /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */
+    int deblocking_level;       /**< the strength of deblocking, valid range [0, 16] */
+    int noise_level;            /**< the strength of additive noise, valid range [0, 16] */
 } vp8_postproc_cfg_t;

 /*!\brief reference frame type
@@ -95,12 +103,16 @@ typedef struct vpx_ref_frame

 /*!\brief vp8 decoder control funciton parameter type
 *
- * defines the data type for each of VP8 decoder control funciton requires
+ * defines the data type that each VP8 decoder control function requires
 */

 VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE,           vpx_ref_frame_t *)
 VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE,          vpx_ref_frame_t *)
 VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC,            vp8_postproc_cfg_t *)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES,  int)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES,   int)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV,      int)


 /*! @} - end defgroup vp8 */
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -140,7 +140,9 @@ enum vp8e_enc_control_id
    VP8E_SET_ARNR_MAXFRAMES,         /**< control function to set the max number of frames blurred creating arf*/
    VP8E_SET_ARNR_STRENGTH ,         /**< control function to set the filter strength for the arf */
    VP8E_SET_ARNR_TYPE     ,         /**< control function to set the type of filter to use for the arf*/
-} ;
+    VP8E_SET_TUNING,                 /**< control function to set visual tuning */
+    VP8E_SET_CQ_LEVEL,               /**< control function to set constrained quality level */
+};

 /*!\brief vpx 1-D scaling mode
 *
@@ -224,6 +226,18 @@ typedef enum
 } vp8e_token_partitions;


+/*!\brief VP8 model tuning parameters
+ *
+ * Changes the encoder to tune for certain types of input material.
+ *
+ */
+typedef enum
+{
+    VP8_TUNE_PSNR,
+    VP8_TUNE_SSIM
+} vp8e_tuning;
+
+
 /*!\brief VP8 encoder control function parameter type
 *
 * Defines the data types that VP8E control functions take. Note that
@@ -253,7 +267,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS,   vp8e_token_partitions)
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES,     unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH ,     unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_TYPE     ,     unsigned int)
-
+VPX_CTRL_USE_TYPE(VP8E_SET_TUNING,             vp8e_tuning)
+VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL     ,      unsigned int)

 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER,     int *)
 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64,  int *)
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -36,6 +36,32 @@ extern vpx_codec_iface_t* vpx_codec_vp8_dx(void);
 #include "vp8.h"


+/*!\brief VP8 decoder control functions
+ *
+ * This set of enumerators defines the control functions of the VP8 decoder interface
+ */
+enum vp8d_dec_control_id
+{
+    VP8_DECODER_CTRL_ID_START   = 256,
+    VP8D_GET_LAST_REF_UPDATES,              /**< control function to get info on which reference frames were updated
+                                            by the last decode */
+    VP8D_GET_FRAME_CORRUPTED,               /**< check if the indicated frame is corrupted */
+    VP8_DECODER_CTRL_ID_MAX
+} ;
+
+
+/*!\brief VP8 decoder control function parameter type
+ *
+ * Defines the data types that VP8D control functions take. Note that
+ * additional common controls are defined in vp8.h
+ *
+ */
+
+
+VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES,   int *)
+VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED,    int *)
+
+
 /*! @} - end defgroup vp8_decoder */


--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -179,7 +179,8 @@ extern "C" {
    enum vpx_rc_mode
    {
        VPX_VBR, /**< Variable Bit Rate (VBR) mode */
-        VPX_CBR  /**< Constant Bit Rate (CBR) mode */
+        VPX_CBR,  /**< Constant Bit Rate (CBR) mode */
+        VPX_CQ   /**< Constrained Quality (CQ) mode */
    };


--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -74,6 +74,7 @@ void __cpuid(int CPUInfo[4], int info_type);
 #define HAS_SSE2  0x04
 #define HAS_SSE3  0x08
 #define HAS_SSSE3 0x10
+#define HAS_SSE4_1 0x20
 #ifndef BIT
 #define BIT(n) (1<<n)
 #endif
@@ -117,6 +118,8 @@ x86_simd_caps(void)

    if (reg_ecx & BIT(9))  flags |= HAS_SSSE3;

+    if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
+
    return flags & mask;
 }

--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -81,6 +81,8 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int

        ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2  * ybf->uv_stride) + border / 2;
        ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2  * ybf->uv_stride) + border / 2;
+
+        ybf->corrupted = 0; /* assume not corrupted by errors */
    }
    else
    {
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -57,6 +57,8 @@ extern "C"
        int border;
        int frame_size;
        YUV_TYPE clrtype;
+
+        int corrupted;
    } YV12_BUFFER_CONFIG;

    int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border);
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -35,6 +35,7 @@
 #if CONFIG_MD5
 #include "md5_utils.h"
 #endif
+#include "tools_common.h"
 #include "nestegg/include/nestegg/nestegg.h"

 #ifndef PATH_MAX
@@ -107,11 +108,19 @@ static const arg_def_t demacroblock_level = ARG_DEF(NULL, "demacroblock-level",
        "Enable VP8 demacroblocking, w/ level");
 static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1,
                                       "Enable VP8 visible debug info");
-
+static const arg_def_t pp_disp_ref_frame = ARG_DEF(NULL, "pp-dbg-ref-frame", 1,
+                                       "Display only selected reference frame per macro block");
+static const arg_def_t pp_disp_mb_modes = ARG_DEF(NULL, "pp-dbg-mb-modes", 1,
+                                       "Display only selected macro block modes");
+static const arg_def_t pp_disp_b_modes = ARG_DEF(NULL, "pp-dbg-b-modes", 1,
+                                       "Display only selected block modes");
+static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1,
+                                       "Draw only selected motion vectors");

 static const arg_def_t *vp8_pp_args[] =
 {
    &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
+    &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs,
    NULL
 };
 #endif
@@ -314,7 +323,8 @@ void *out_open(const char *out_fn, int do_md5)
    }
    else
    {
-        FILE *outfile = out = strcmp("-", out_fn) ? fopen(out_fn, "wb") : stdout;
+        FILE *outfile = out = strcmp("-", out_fn) ? fopen(out_fn, "wb")
+                                                  : set_binary_mode(stdout);

        if (!outfile)
        {
@@ -432,6 +442,8 @@ unsigned int file_is_raw(FILE *infile,
    int is_raw = 0;
    vpx_codec_stream_info_t si;

+    si.sz = sizeof(si);
+
    if (fread(buf, 1, 32, infile) == 32)
    {
        int i;
@@ -540,6 +552,7 @@ webm_guess_framerate(struct input_ctx *input,
    *fps_den = tstamp / 1000;
    return 0;
 fail:
+    nestegg_destroy(input->nestegg_ctx);
    input->nestegg_ctx = NULL;
    rewind(input->infile);
    return 1;
@@ -702,6 +715,10 @@ int main(int argc, const char **argv_)
    vpx_codec_dec_cfg_t     cfg = {0};
 #if CONFIG_VP8_DECODER
    vp8_postproc_cfg_t      vp8_pp_cfg = {0};
+    int                     vp8_dbg_color_ref_frame = 0;
+    int                     vp8_dbg_color_mb_modes = 0;
+    int                     vp8_dbg_color_b_modes = 0;
+    int                     vp8_dbg_display_mv = 0;
 #endif
    struct input_ctx        input = {0};

@@ -787,6 +804,42 @@ int main(int argc, const char **argv_)
            if (level)
                vp8_pp_cfg.post_proc_flag |= level;
        }
+        else if (arg_match(&arg, &pp_disp_ref_frame, argi))
+        {
+            unsigned int flags = arg_parse_int(&arg);
+            if (flags)
+            {
+                postproc = 1;
+                vp8_dbg_color_ref_frame = flags;
+            }
+        }
+        else if (arg_match(&arg, &pp_disp_mb_modes, argi))
+        {
+            unsigned int flags = arg_parse_int(&arg);
+            if (flags)
+            {
+                postproc = 1;
+                vp8_dbg_color_mb_modes = flags;
+            }
+        }
+        else if (arg_match(&arg, &pp_disp_b_modes, argi))
+        {
+            unsigned int flags = arg_parse_int(&arg);
+            if (flags)
+            {
+                postproc = 1;
+                vp8_dbg_color_b_modes = flags;
+            }
+        }
+        else if (arg_match(&arg, &pp_disp_mvs, argi))
+        {
+            unsigned int flags = arg_parse_int(&arg);
+            if (flags)
+            {
+                postproc = 1;
+                vp8_dbg_display_mv = flags;
+            }
+        }

 #endif
        else
@@ -805,7 +858,7 @@ int main(int argc, const char **argv_)
        usage_exit();

    /* Open file */
-    infile = strcmp(fn, "-") ? fopen(fn, "rb") : stdin;
+    infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);

    if (!infile)
    {
@@ -876,7 +929,13 @@ int main(int argc, const char **argv_)
        }

        if(input.kind == WEBM_FILE)
-            webm_guess_framerate(&input, &fps_den, &fps_num);
+            if(webm_guess_framerate(&input, &fps_den, &fps_num))
+            {
+                fprintf(stderr, "Failed to guess framerate -- error parsing "
+                                "webm file?\n");
+                return EXIT_FAILURE;
+            }
+

        /*Note: We can't output an aspect ratio here because IVF doesn't
           store one, and neither does VP8.
@@ -920,6 +979,33 @@ int main(int argc, const char **argv_)
        return EXIT_FAILURE;
    }

+    if (vp8_dbg_color_ref_frame
+        && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, vp8_dbg_color_ref_frame))
+    {
+        fprintf(stderr, "Failed to configure reference block visualizer: %s\n", vpx_codec_error(&decoder));
+        return EXIT_FAILURE;
+    }
+
+    if (vp8_dbg_color_mb_modes
+        && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, vp8_dbg_color_mb_modes))
+    {
+        fprintf(stderr, "Failed to configure macro block visualizer: %s\n", vpx_codec_error(&decoder));
+        return EXIT_FAILURE;
+    }
+
+    if (vp8_dbg_color_b_modes
+        && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, vp8_dbg_color_b_modes))
+    {
+        fprintf(stderr, "Failed to configure block visualizer: %s\n", vpx_codec_error(&decoder));
+        return EXIT_FAILURE;
+    }
+
+    if (vp8_dbg_display_mv
+        && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv))
+    {
+        fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", vpx_codec_error(&decoder));
+        return EXIT_FAILURE;
+    }
 #endif

    /* Decode file */
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -35,6 +35,7 @@
 #include "vpx/vp8cx.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/vpx_timer.h"
+#include "tools_common.h"
 #include "y4minput.h"
 #include "libmkv/EbmlWriter.h"
 #include "libmkv/EbmlIDs.h"
@@ -185,11 +186,11 @@ int stats_open_mem(stats_io_t *stats, int pass)
 }


-void stats_close(stats_io_t *stats)
+void stats_close(stats_io_t *stats, int last_pass)
 {
    if (stats->file)
    {
-        if (stats->pass == 1)
+        if (stats->pass == last_pass)
        {
 #if 0
 #elif USE_POSIX_MMAP
@@ -204,7 +205,7 @@ void stats_close(stats_io_t *stats)
    }
    else
    {
-        if (stats->pass == 1)
+        if (stats->pass == last_pass)
            free(stats->buf.buf);
    }
 }
@@ -250,7 +251,8 @@ enum video_file_type

 struct detect_buffer {
    char buf[4];
-    int  valid;
+    size_t buf_read;
+    size_t position;
 };


@@ -304,14 +306,21 @@ static int read_frame(FILE *f, vpx_image_t *img, unsigned int file_type,

            for (r = 0; r < h; r++)
            {
-                if (detect->valid)
+                size_t needed = w;
+                size_t buf_position = 0;
+                const size_t left = detect->buf_read - detect->position;
+                if (left > 0)
                {
-                    memcpy(ptr, detect->buf, 4);
-                    shortread |= fread(ptr+4, 1, w-4, f) < w-4;
-                    detect->valid = 0;
+                    const size_t more = (left < needed) ? left : needed;
+                    memcpy(ptr, detect->buf + detect->position, more);
+                    buf_position = more;
+                    needed -= more;
+                    detect->position += more;
+                }
+                if (needed > 0)
+                {
+                    shortread |= (fread(ptr + buf_position, 1, needed, f) < needed);
                }
-                else
-                    shortread |= fread(ptr, 1, w, f) < w;

                ptr += img->stride[plane];
            }
@@ -338,12 +347,12 @@ unsigned int file_is_ivf(FILE *infile,
                         unsigned int *fourcc,
                         unsigned int *width,
                         unsigned int *height,
-                         char          detect[4])
+                         struct detect_buffer *detect)
 {
    char raw_hdr[IVF_FILE_HDR_SZ];
    int is_ivf = 0;

-    if(memcmp(detect, "DKIF", 4) != 0)
+    if(memcmp(detect->buf, "DKIF", 4) != 0)
        return 0;

    /* See write_ivf_file_header() for more documentation on the file header
@@ -367,6 +376,7 @@ unsigned int file_is_ivf(FILE *infile,
    {
        *width = mem_get_le16(raw_hdr + 12);
        *height = mem_get_le16(raw_hdr + 14);
+        detect->position = 4;
    }

    return is_ivf;
@@ -434,7 +444,7 @@ struct EbmlGlobal
    int debug;

    FILE    *stream;
-    uint64_t last_pts_ms;
+    int64_t last_pts_ms;
    vpx_rational_t  framerate;

    /* These pointers are to the start of an element */
@@ -647,7 +657,7 @@ write_webm_block(EbmlGlobal                *glob,
    unsigned char  track_number;
    unsigned short block_timecode = 0;
    unsigned char  flags;
-    uint64_t       pts_ms;
+    int64_t        pts_ms;
    int            start_cluster = 0, is_keyframe;

    /* Calculate the PTS of this frame in milliseconds */
@@ -907,7 +917,7 @@ static const arg_def_t resize_up_thresh   = ARG_DEF(NULL, "resize-up", 1,
 static const arg_def_t resize_down_thresh = ARG_DEF(NULL, "resize-down", 1,
        "Downscale threshold (buf %)");
 static const arg_def_t end_usage          = ARG_DEF(NULL, "end-usage", 1,
-        "VBR=0 | CBR=1");
+        "VBR=0 | CBR=1 | CQ=2");
 static const arg_def_t target_bitrate     = ARG_DEF(NULL, "target-bitrate", 1,
        "Bitrate (kbps)");
 static const arg_def_t min_quantizer      = ARG_DEF(NULL, "min-q", 1,
@@ -978,23 +988,34 @@ static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,
 static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
                                     "Enable automatic alt reference frames");
 static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
-                                        "alt_ref Max Frames");
+                                        "AltRef Max Frames");
 static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1,
-                                       "alt_ref Strength");
+                                       "AltRef Strength");
 static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1,
-                                   "alt_ref Type");
+                                   "AltRef Type");
+static const struct arg_enum_list tuning_enum[] = {
+    {"psnr", VP8_TUNE_PSNR},
+    {"ssim", VP8_TUNE_SSIM},
+    {NULL, 0}
+};
+static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1,
+                                   "Material to favor", tuning_enum);
+static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1,
+                                   "Constrained Quality Level");

 static const arg_def_t *vp8_args[] =
 {
    &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
-    &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type, NULL
+    &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
+    &tune_ssim, &cq_level, NULL
 };
 static const int vp8_arg_ctrl_map[] =
 {
    VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,
    VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
    VP8E_SET_TOKEN_PARTITIONS,
-    VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH , VP8E_SET_ARNR_TYPE, 0
+    VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH , VP8E_SET_ARNR_TYPE,
+    VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, 0
 };
 #endif

@@ -1073,6 +1094,7 @@ int main(int argc, const char **argv_)
    int                      psnr_count = 0;

    exec_name = argv_[0];
+    ebml.last_pts_ms = -1;

    if (argc < 3)
        usage_exit();
@@ -1189,6 +1211,12 @@ int main(int argc, const char **argv_)
     */
    cfg.g_timebase.den = 1000;

+    /* Never use the library's default resolution, require it be parsed
+     * from the file or set on the command line.
+     */
+    cfg.g_w = 0;
+    cfg.g_h = 0;
+
    /* Now parse the remainder of the parameters. */
    for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step)
    {
@@ -1300,7 +1328,7 @@ int main(int argc, const char **argv_)
                if (arg_ctrl_cnt < ARG_CTRL_CNT_MAX)
                {
                    arg_ctrls[arg_ctrl_cnt][0] = ctrl_args_map[i];
-                    arg_ctrls[arg_ctrl_cnt][1] = arg_parse_int(&arg);
+                    arg_ctrls[arg_ctrl_cnt][1] = arg_parse_enum_or_int(&arg);
                    arg_ctrl_cnt++;
                }
            }
@@ -1330,11 +1358,11 @@ int main(int argc, const char **argv_)
    {
        int frames_in = 0, frames_out = 0;
        unsigned long nbytes = 0;
-        size_t detect_bytes;
        struct detect_buffer detect;

        /* Parse certain options from the input file, if possible */
-        infile = strcmp(in_fn, "-") ? fopen(in_fn, "rb") : stdin;
+        infile = strcmp(in_fn, "-") ? fopen(in_fn, "rb")
+                                    : set_binary_mode(stdin);

        if (!infile)
        {
@@ -1344,13 +1372,11 @@ int main(int argc, const char **argv_)

        /* For RAW input sources, these bytes will applied on the first frame
         *  in read_frame().
-         * We can always read 4 bytes because the minimum supported frame size
-         *  is 2x2.
         */
-        detect_bytes = fread(detect.buf, 1, 4, infile);
-        detect.valid = 0;
+        detect.buf_read = fread(detect.buf, 1, 4, infile);
+        detect.position = 0;

-        if (detect_bytes == 4 && file_is_y4m(infile, &y4m, detect.buf))
+        if (detect.buf_read == 4 && file_is_y4m(infile, &y4m, detect.buf))
        {
            if (y4m_input_open(&y4m, infile, detect.buf, 4) >= 0)
            {
@@ -1375,8 +1401,8 @@ int main(int argc, const char **argv_)
                return EXIT_FAILURE;
            }
        }
-        else if (detect_bytes == 4 &&
-                 file_is_ivf(infile, &fourcc, &cfg.g_w, &cfg.g_h, detect.buf))
+        else if (detect.buf_read == 4 &&
+                 file_is_ivf(infile, &fourcc, &cfg.g_w, &cfg.g_h, &detect))
        {
            file_type = FILE_TYPE_IVF;
            switch (fourcc)
@@ -1395,8 +1421,15 @@ int main(int argc, const char **argv_)
        else
        {
            file_type = FILE_TYPE_RAW;
-            detect.valid = 1;
        }
+
+        if(!cfg.g_w || !cfg.g_h)
+        {
+            fprintf(stderr, "Specify stream dimensions with --width (-w) "
+                            " and --height (-h).\n");
+            return EXIT_FAILURE;
+        }
+
 #define SHOW(field) fprintf(stderr, "    %-28s = %d\n", #field, cfg.field)

        if (verbose && pass == 0)
@@ -1449,7 +1482,8 @@ int main(int argc, const char **argv_)
                              cfg.g_w, cfg.g_h, 1);
        }

-        outfile = strcmp(out_fn, "-") ? fopen(out_fn, "wb") : stdout;
+        outfile = strcmp(out_fn, "-") ? fopen(out_fn, "wb")
+                                      : set_binary_mode(stdout);

        if (!outfile)
        {
@@ -1527,7 +1561,7 @@ int main(int argc, const char **argv_)
            vpx_codec_iter_t iter = NULL;
            const vpx_codec_cx_pkt_t *pkt;
            struct vpx_usec_timer timer;
-            int64_t frame_start;
+            int64_t frame_start, next_frame_start;

            if (!arg_limit || frames_in < arg_limit)
            {
@@ -1548,9 +1582,11 @@ int main(int argc, const char **argv_)

            frame_start = (cfg.g_timebase.den * (int64_t)(frames_in - 1)
                          * arg_framerate.den) / cfg.g_timebase.num / arg_framerate.num;
+            next_frame_start = (cfg.g_timebase.den * (int64_t)(frames_in)
+                                * arg_framerate.den)
+                                / cfg.g_timebase.num / arg_framerate.num;
            vpx_codec_encode(&encoder, frame_avail ? &raw : NULL, frame_start,
-                             cfg.g_timebase.den * arg_framerate.den
-                             / cfg.g_timebase.num / arg_framerate.num,
+                             next_frame_start - frame_start,
                             0, arg_deadline);
            vpx_usec_timer_mark(&timer);
            cx_time += vpx_usec_timer_elapsed(&timer);
@@ -1658,7 +1694,7 @@ int main(int argc, const char **argv_)
        }

        fclose(outfile);
-        stats_close(&stats);
+        stats_close(&stats, arg_passes-1);
        fprintf(stderr, "\n");

        if (one_pass_only)