Compare commits

...

126 Commits

Author SHA1 Message Date
John Koleszar
dd39e386ee wip: add support for experimental knobs
Change-Id: I51a0a80877be9784a7b161b4a17604d3922f3aef
2011-01-06 12:49:06 -05:00
Scott LaVarnway
de4e8185e9 Fixed encoder crash when multi-threading is enabled.
Happens in real-time mode.  Will happen in good quality, speed 1.

Change-Id: I3e5b68827b1a5798d0431b088a709256d1ce2c95
2010-12-29 16:41:22 -05:00
Yunqing Wang
a864678cdb Always update last_frame_type
Scott pointed out that last_frame_type only gets updated when the
loopfilter exists. Since last_frame_type is also needed in
motion search now, it needs to be updated every frame.

Change-Id: I9203532fd67361588d4024628d9ddb8e391ad912
2010-12-29 10:28:35 -05:00
Scott LaVarnway
3fb4abf3d1 Merge "Use the fast quantizer for inter mode selection" 2010-12-28 11:56:11 -08:00
Scott LaVarnway
516ea8460b Use the fast quantizer for inter mode selection
Use the fast quantizer for inter mode selection and the
regular quantizer for the rest of the encode for good quality,
speed 1.  Both performance and quality were improved.  The
quality gains will make up for the quality loss mentioned in
I9dc089007ca08129fb6c11fe7692777ebb8647b0.

Change-Id: Ia90bc9cf326a7c65d60d31fa32f6465ab6984d21
2010-12-28 14:51:46 -05:00
Yunqing Wang
bf53ec492d Adjust MV borders for SPLITMV mode
Add limits to avoid MV going out of range.

Change-Id: I8a5deb40bf393488d29f694b5a56804d578e68b5
2010-12-28 13:23:07 -05:00
Yunqing Wang
e463b95b4e Merge "Modify motion estimation for SPLITMV mode" 2010-12-28 08:12:26 -08:00
Yunqing Wang
a5a8d92976 Modify motion estimation for SPLITMV mode
1. Search for block8x16/block16x8 uses block8x8's search results.
2. Check block4x4 only if block8x8 is chosen. (This hurts quality,
   which will be improved in another check-in.)
3. In block4x4 search, the previous block's result is used as
   MV predictor for next block.

This change improves performance.

Change-Id: I9dc089007ca08129fb6c11fe7692777ebb8647b0
2010-12-28 10:34:42 -05:00
Yaowu Xu
95dbe9ccfd Merge "adjusted sad_per_bit to correlate with quantizer" 2010-12-26 13:45:37 -08:00
Yaowu Xu
0f5264b584 adjusted sad_per_bit to correlate with quantizer
Re-calibrated the sad_per_bit16 and sad_per_bit4 tables to be linearly
correlated with quantizer values; these two variables are used in
motion search for costing motion vectors. This change has a small
positive effect on compression.

Change-Id: Ic9b5ea6fb8d5078ef663ba4899db019cc51f4166
2010-12-23 22:59:38 -08:00
James Berry
74e8446e58 vpxenc stats_close() memleak fix
stats_close() was not freeing memory for
single pass runs.  It now takes in arg_passes
to determine when it should free memory.

Change-Id: I6623b7e30b76f9bf2e16008490f9b20484d03f31
2010-12-23 14:47:56 -05:00
Johann
8c4552fb36 Merge "improve integer version of filter" 2010-12-23 06:14:28 -08:00
Johann
d3c7365b46 Merge "temporal filter naming changes" 2010-12-23 06:14:20 -08:00
Johann
e2de094c99 Merge "abstract apply_temporal_filter" 2010-12-23 06:14:07 -08:00
John Koleszar
bd9b383db2 Merge "make yasm generate cv8 debug data on win32" 2010-12-22 11:11:08 -08:00
John Koleszar
30830d5a7c make yasm generate cv8 debug data on win32
Native Windows targets should use CV8 format debugging symbols, not
DWARF.

Change-Id: I9489163fcd9d749b72f6c70ecbce67a6f0790802
2010-12-22 12:53:45 -05:00
Johann
20b855c33e improve integer version of filter
the lookup table is based on floating point calculations (see source)

by moving the *3 before the downshift and adding the rounding bit, the
delta (LUT - integer) goes from:
______________________________________
__ 1__ 1______________________________
__ 1__ 1______________________________
____ 1______ 1________________________
____ 1 2__ 2 1________________________
______ 1 1 2__ 2__ 2__ 2 1 1__________
________ 1 1 2 2__ 1 2 3 1 2__ 2__ 2__
to:
__-1__-1______________________________
______________________________________
____-1______-1________________________
______________________________________
________-1______________-1____________
______________________________________

it's important to be able to use the integer version because the LUT
more or less precludes SIMD optimizations

Change-Id: I45a81127dc7b72a06fba951649135d9d918386c0
2010-12-22 11:33:59 -05:00
Johann
4b6219cb33 temporal filter naming changes
be more consistent with the naming pattern, especially wrt rtcd

Change-Id: I3df50686a09f1dab0a9620b5adbb8a1577b40f2f
2010-12-22 11:32:15 -05:00
Johann
092b5bef37 abstract apply_temporal_filter
allow for optimized versions of apply_temporal_filter
(now vp8_apply_temporal_filter_c)

the function was previously declared as static and appears to have been
inlined. with this change, that's no longer possible. performance takes
a small hit.

the declaration for vp8_cx_temp_filter_c was moved to onyx_if.c because
of a circular dependency. for rtcd, temporal_filter.h holds the
definition for the rtcd table, so it needs to be included by onyx_int.h.
however, onyx_int.h holds the definition for VP8_COMP which is needed
for the function prototype. blah.

Change-Id: I499c055fdc652ac4659c21c5a55fe10ceb7e95e3
2010-12-22 11:31:54 -05:00
Jim Bankoski
6cb708d501 Merge "Add psnr/ssim tuning option" 2010-12-20 09:32:13 -08:00
John Koleszar
c49f49b113 propagate user private data on decode
The pointer passed in the user_priv argument to vpx_codec_decode()
should be propagated through to the corresponding output frame and
made available in the image's user_priv member. Fixes issue #252

Change-Id: I182746a6882c8549fb146b4a4fdb64f1789eb750
2010-12-17 11:34:02 -05:00
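
A minimal caller-side sketch of the behavior this change fixes (not code
from the patch; error handling omitted): the pointer handed to
vpx_codec_decode() should come back on the matching output image.

    #include "vpx/vpx_decoder.h"

    static void decode_one(vpx_codec_ctx_t *dec, const uint8_t *buf,
                           unsigned int len, void *my_frame_ctx)
    {
        vpx_codec_iter_t iter = NULL;
        vpx_image_t *img;

        /* Tag the compressed frame with application data... */
        vpx_codec_decode(dec, buf, len, my_frame_ctx, 0);

        /* ...and read it back from the decoded image's user_priv member. */
        while ((img = vpx_codec_get_frame(dec, &iter)) != NULL) {
            void *frame_ctx = img->user_priv; /* same pointer as my_frame_ctx */
            (void)frame_ctx;
        }
    }
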
John Koleszar
fc6ce744a6 Merge "Inform caller of decoder about updated references" 2010-12-17 07:08:21 -08:00
John Koleszar
b0da9b399d Add psnr/ssim tuning option
Add a new encoder control, VP8E_SET_TUNING, to allow the application
to inform the encoder that the material will benefit from certain
tuning. Expose this control as the --tune option to vpxenc. The args
helper is expanded to support enumerated arguments by name or value.

Two tunings are provided by this patch, PSNR (default) and SSIM.
Activity masking is made dependent on setting --tune=ssim, as the
current implementation hurts speed (10%) and PSNR (2.7% avg,
10% peak) too much for it to be a default yet.

Change-Id: I110d969381c4805347ff5a0ffaf1a14ca1965257
2010-12-17 10:01:05 -05:00
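
From the application side, a minimal sketch of using the new control
(assumes an already-initialized encoder context; equivalent to passing
--tune=ssim to vpxenc):

    #include "vpx/vp8cx.h"

    /* Ask the encoder to tune for SSIM instead of the default PSNR. */
    vpx_codec_err_t tune_for_ssim(vpx_codec_ctx_t *enc)
    {
        return vpx_codec_control(enc, VP8E_SET_TUNING, VP8_TUNE_SSIM);
    }
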
Henrik Lundin
2a87491fb0 Inform caller of decoder about updated references
Inform the caller of the decoder if a decoded frame updated the last,
golden, or altref frame, as required for the realtime communication
proposed in the VP8 RTP payload format document.

Added a new vpx_codec_control called VP8D_GET_LAST_REF_UPDATES, to be
called after vpx_codec_decode. The control will indicate which of the
reference frames were updated by setting the 3 LSBs in the input
int (pointer).

Change-Id: Iac9db60dac414356c7ffa0b0fede88cb91e11bd7
2010-12-17 14:43:13 +01:00
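
A minimal caller-side sketch of the control described above (assuming the
three LSBs map to the last/golden/altref flag values from vp8.h; not code
from the patch):

    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    static void report_ref_updates(vpx_codec_ctx_t *dec)
    {
        int updates = 0;

        /* Call after vpx_codec_decode() to learn which buffers were
         * refreshed by the frame just decoded. */
        if (vpx_codec_control(dec, VP8D_GET_LAST_REF_UPDATES, &updates))
            return;

        if (updates & VP8_LAST_FRAME) { /* last frame buffer refreshed   */ }
        if (updates & VP8_GOLD_FRAME) { /* golden frame buffer refreshed */ }
        if (updates & VP8_ALTR_FRAME) { /* altref frame buffer refreshed */ }
    }
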
Scott LaVarnway
64baa8df2e Changed segmentation check order
In SPLITMV, the 8x8 segment will be checked first.  If the 8x8 rd
is better than the best, we check the other segments.  Otherwise
bail.  Adjustments to the thresh_mult were necessary to make
up for the initial quality loss.
The performance improved by 20% (average) for good quality,
speed 0 and speed 1, while the overall quality remained the same.

Change-Id: I717aef401323c8a254fba3e9777d2a316c774cc3
2010-12-16 17:01:27 -05:00
Scott LaVarnway
81cdeb7117 Adjusted breakout RD for SPLITMV
vp8_rd_pick_best_mbsegmentation looks at y only.  The new
breakout does not include the frame cost, the prob_skip_false
cost, or the uv rate.  Performance improved by a few percent
and the quality remained the same.

Change-Id: I94ff013998ac51e8ecce7130870f7b6600758e15
2010-12-16 09:38:02 -05:00
Yunqing Wang
4fbd0227f5 Merge "Fix a bug in motion search code(2)" 2010-12-15 08:10:34 -08:00
Yunqing Wang
08706a3ea7 Fix a bug in motion search code(2)
This fix added MV range checks for NEWMV mode as suggested by Jim.
To reduce unnecessary MV range checks, I tried Yaowu's suggestion:
update the UMV borders in NEWMV mode so that they also cover the MV
range check. This way, every valid MV gets checked in the diamond
search function.

Change-Id: I95a89ce0daf6f178c454448f13d4249f19b30f3a
2010-12-14 17:39:25 -05:00
Yaowu Xu
3ac73173a4 Merge "fix a bug that "optimize" flag is not set for sub-threads" 2010-12-14 13:32:04 -08:00
Yunqing Wang
23aa13d92c Merge "Fix a bug in motion search code" 2010-12-14 13:25:34 -08:00
Yunqing Wang
7fb0f86863 Fix a bug in motion search code
The MV's range is 256. Since the new motion search uses a different
starting MV than the center ref MV, an MV range check needs to
be done to avoid corruption.

Change-Id: I8ae0721d1bd203639e13891e2e54a2e87276f306
2010-12-14 13:59:38 -05:00
Yaowu Xu
64f3d91579 fix a bug that "optimize" flag is not set for sub-threads
The flag for quantization optimization was not properly propagated to
mb row encoding threads.

Change-Id: Ic561599c35acd94cd5698c9b314bccd596ac2deb
2010-12-14 10:12:21 -08:00
Johann
825adc464f shrink TOKENEXTRA and vp8_extra_bit_struct
Per John's previous change, shrink TOKENEXTRA from 20 to 8 bytes
original: b7b1e6fb
reverted: 41f4458a

Also drop unused field from vp8_extra_bit_struct

Update ARM ASM to deal with this change. In particular, Extra is signed
and needs to be sign-extended when loaded.

Change-Id: Ibd0ddc058432bc7bb09222d6ce4ef77e93a30b41
2010-12-14 10:32:50 -05:00
John Koleszar
41f4458a03 Revert "Reduce size of TOKENEXTRA struct"
This reverts commit b7b1e6fb55. Previous
fix is incomplete, breaks ARM. Itchy submit finger.

Change-Id: I939dc0d3bf4173cf951c1d152338ab6ea2184bb9
2010-12-13 17:12:51 -05:00
John Koleszar
3809d7bbd9 Merge "remove unused temporal preproc code" 2010-12-13 13:57:59 -08:00
John Koleszar
398aa81849 Merge "Reduce size of TOKENEXTRA struct" 2010-12-13 13:57:55 -08:00
John Koleszar
b1aa54ab26 remove unused temporal preproc code
This code is unused, as the current preproc implementation uses the
same spatial filter that postproc uses.

Change-Id: Ia06d5664917d67283f279e2480016bebed602ea7
2010-12-13 16:47:59 -05:00
John Koleszar
b7b1e6fb55 Reduce size of TOKENEXTRA struct
Change the size of structure elements to reduce memory utilization.
Removed the 'section' member entirely, as it is set but never read.

Change-Id: Iad043830392fb4168cb3cd6075fb0eb70c7f691c
2010-12-13 16:37:37 -05:00
James Berry
136bd2455e fixed vpxenc bug where ivf files would be read incorrectly
read_frame would incorrectly insert detect->buf into img
for ivf files.  detect->position is now set to 4 if the input file is
detected to be ivf in file_is_ivf, to keep this from occurring.

Change-Id: I5e235dd3033985bc62707a35c13af5984620208e
2010-12-13 14:40:18 -05:00
Yaowu Xu
97a86c5b13 fix a bug in multithreaded encoding with active_map enabled
Added the initialization of the pointer to active map. Also added the
same logic for cyclic refresh in mbrow encoding threads.

Change-Id: Ic48d0849dc706b27fba72d07dcc498075725663d
2010-12-10 10:48:30 -08:00
Fritz Koenig
0ced701487 Merge "vp8 fast quantizer sse2 optimizations for eob." 2010-12-10 09:25:04 -08:00
Fritz Koenig
e0cf330cde vp8 fast quantizer sse2 optimizations for eob.
Changed the end of block computation to use pmaxw.  Removed
additional pushing and popping of registers that was not needed.

Change-Id: I08cb9b424513cd8a2c7ad8cea53b4e2adc66ef98
2010-12-09 15:00:30 -08:00
John Koleszar
cb9698951c fix uninitialized read in encode breakout
Change I3430820 performed an uninitialized read when
encode_breakout == 0, since the sum and sse wouldn't be set:

   if(x->encode_breakout)
       VARIANCE_INVOKE(..., get16x16var)(..., &sum, &sse);
   if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
       ...
   } else if (sse < x->encode_breakout)

Change-Id: I915eb76d1227b4b6d1137a0dedf2c143860098a2
2010-12-09 16:05:26 -05:00
Paul Wilkins
c63fc881e1 Correct q_low and q_high limits for the recode loop
Corrected the initial Q range limits for the recode loop
to reflect the current allowed range for the frame.

In experimental work on constrained quality this bug was
causing unnecessary recodes.

Change-Id: I7e256fbfa681293b0223fe21ec329933d76c229f
2010-12-09 15:02:04 +00:00
Yaowu Xu
160f3c7e9e Merge "vp8e - static threshold play" 2010-12-08 13:08:04 -08:00
Yaowu Xu
d88da98614 Merge "vp8e - remove unnecessary variance calc" 2010-12-08 09:19:22 -08:00
Jim Bankoski
718c19711a vp8e - static threshold play
Realized there is no need for new assembly code; the sum is already
calculated.

Change-Id: Ie2d94feb4b7c1f77c5359bca29b66228e41638c9
2010-12-07 16:07:23 -05:00
Scott LaVarnway
f661fa1f24 Merge "vp8_rd_pick_best_mbsegmentation code restructure" 2010-12-07 07:53:12 -08:00
Yaowu Xu
062980cc48 Merge "adjust RDMULT for UV plane in quantization RDO" 2010-12-06 22:04:45 -08:00
Yaowu Xu
7c03a1c308 adjust RDMULT for UV plane in quantization RDO
This patch adds a weighting factor on RDMULT for UV blocks. The change
has an overall gain of about 0.5% based on ssim, and between 0.1 and
0.2% by psnr numbers.

Change-Id: I97781b077ce3bb7e34241b03268491917e8d1d72
2010-12-06 20:53:59 -08:00
Yunqing Wang
9520f4b3cc Fix a memory leak problem in encoder
Deallocating the buffers before re-allocating them.

The fix passed James Berry's test program for memory
leak check.

Change-Id: I18c3cf665412c0e313a523e3d435106c03ca438d
2010-12-06 17:21:37 -05:00
Scott LaVarnway
2fa5d5a26d vp8_rd_pick_best_mbsegmentation code restructure
Moved the code from the segmentation loop into a function
which is now called for each segment. This will allow us
to change the segment order checking more easily.

Change-Id: I9510d26f0acae5a73043fcca8f1984b121d3e052
2010-12-06 16:42:52 -05:00
Scott LaVarnway
d283d9bb30 Merge "Improve MV prediction accuracy to achieve performance gain" 2010-12-06 09:41:09 -08:00
Patrik Westin
8534071de0 Fix for manual Golden frame frequency
When auto_golden wasn't set, it forced all frames to be golden
frames. Now the manually configured frequency is adhered to.

Change-Id: I360acac9bc487db0d9c4d4da6ee41f70c227c539
2010-12-06 09:53:41 -05:00
Paul Wilkins
ccb0348473 Merge "Change to inter_minq table." 2010-12-04 02:06:33 -08:00
Paul Wilkins
cec6a596b5 Change to inter_minq table.
The inter_minq table controls the range of quantizers available
for a particular frame in two pass relative to a max Q value.

The change reduces the range somewhat. The effect of this
was a small increase (0.3% average) in psnr for the test set
but it should also help encode speed somewhat for higher
quality modes as it will reduce the number of iterations in the
recode loop.

The change damps the range of quantizers available locally
within a section of a clip and should therefore help keep quality
more uniform. If there is systematic overshoot or undershoot the
range can shift gradually to accommodate. However, there is
some increased risk of overshoot or undershoot against the target
bit rate in VBR mode and this risk will be more pronounced for short
clips.

Change-Id: I84465567d49ae767c6c73ff2a2aac30c895adb52
2010-12-04 10:04:12 +00:00
Yunqing Wang
c3bbb29164 Improve MV prediction accuracy to achieve performance gain
Add vp8_mv_pred() to better predict starting MV for NEWMV
mode in vp8_rd_pick_inter_mode(). Set different search
ranges according to MV prediction accuracy, which improves
encoder performance without hurting the quality. Also,
as Yaowu suggested, using the diamond search result as the full
search starting point and therefore adjusting (reducing) the
full search range helps the performance.

Change-Id: Ie4a3c8df87e697c1f4f6e2ddb693766bba1b77b6
2010-12-03 15:23:35 -05:00
John Koleszar
5e76dfcc70 Merge 'Add simple version of activity masking.'
Merge commit 'refs/changes/79/779/2' of
    https://review.webmproject.org/p/libvpx

Conflicts:
	vp8/encoder/encodeintra.c
	vp8/encoder/encodemb.c

Change-Id: Id607063fabe92d99eeb3c380e8ca670b01bfb3ef
2010-12-03 13:30:50 -05:00
Fritz Koenig
9c8ad79fdc Set refresh_alt_ref_frame on keyframe encode.
On a keyframe alt ref and golden are refreshed.  The flag was
not being set and so on the frame after a keyframe, motion
search would occur on the alt ref frame.  This is not necessary
because the alt ref frame is identical to the last frame in this
scenario.

Handle corner case where a forward alt-ref frame is put
directly after a keyframe.

Change-Id: I9be4cf290d694f8cf2f9a31852014b5ccf1504d3
2010-12-01 12:48:22 -08:00
Jim Bankoski
3430820bbe vp8e - remove unnecessary variance calc
Only do the variance calculation if necessary
(e.g. needed for the breakout test).
2010-11-27 14:02:59 -05:00
Pascal Massimino
fd9f9dc054 allow dimensions as low as 1 pixel
Remove the warning comment in vpxenc.c: in the case of a 1x1 picture,
detect_bytes will be equal to '3' and we'll fall back to
RAW_TYPE.
Fix read_frame() by tracking the pre-read buffer length
in the struct detect.

Change-Id: If1ed86ee5260dcdbc8f9d10da6cbb84a4cc2f151
2010-11-24 16:44:33 -08:00
John Koleszar
19e32ac7c7 Merge "vpxdec: fix use of uninitialized memory for raw files" 2010-11-23 12:39:03 -08:00
John Koleszar
78cbe51bc3 Merge changes I3aed713e,I9ef7f56e,Ic18c60df
* changes:
  vp8_set_maps: remove hard-coded width/height
  vp8mt_alloc_temp_buffers: make prototype return void
  Disable compile warning for ERROR macro
2010-11-23 12:38:20 -08:00
John Koleszar
19255b8fe0 vpxdec: fix use of uninitialized memory for raw files
The sz member of the vpx_codec_stream_info_t structure must be
initialized when passed to vpx_codec_peek_stream_info().

Change-Id: I2d13d287d9639262b932cf44671a595fdf3c38ef
2010-11-23 13:49:40 -05:00
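
A minimal sketch of the calling pattern the fix depends on (illustrative,
not the vpxdec code): sz must be filled in before the call.

    #include <string.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    static int peek_dimensions(const uint8_t *buf, unsigned int len,
                               unsigned int *w, unsigned int *h)
    {
        vpx_codec_stream_info_t si;

        memset(&si, 0, sizeof(si));
        si.sz = sizeof(si);          /* the member this change initializes */

        if (vpx_codec_peek_stream_info(&vpx_codec_vp8_dx_algo, buf, len, &si))
            return -1;

        *w = si.w;
        *h = si.h;
        return 0;
    }
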
Paul Wilkins
ad6150f769 Recalibration of bits per MB tables
The baseline bits per MB prediction tables have been
re-calibrated based on the assumption that bits per mb
is inversely proportional to the quantizer level.

Change-Id: Ibd355c7acac4b8053dda1baf1032fe35f11da7f7
2010-11-22 13:17:35 +00:00
Paul Wilkins
1753f0d208 Merge "Added extra two pass stats gathering." 2010-11-22 04:11:20 -08:00
Paul Wilkins
70b885a0e8 Added extra two pass stats gathering.
Added code to record spend so far against the planned budget.

Change-Id: I5a3335346fa1771b2b1219df9f6127f9993d2594
2010-11-19 14:12:33 -05:00
Pascal Massimino
ed5ab7fa49 remove warning
was having: "vp8/encoder/onyx_if.c:5365: warning: comparison of unsigned expression >= 0 is always true"
2010-11-17 16:50:02 -08:00
Scott LaVarnway
9a6740af80 Merge "Removed unnecessary checks." 2010-11-17 11:28:22 -08:00
Scott LaVarnway
f7670acc68 Removed unnecessary checks.
macro_block_yrd and vp8_rdcost_mby are not called for SPLITMV.

Change-Id: I2224d3c8725df526d48426447482768d543752f1
2010-11-17 14:25:48 -05:00
Paul Wilkins
f874391e02 Replaced recode loop test with a function call
Replaced existing code to decide if a frame recode is required
with a function call. This is to simplify addition of extra clauses
that may be needed for the planned constrained quality mode.

Also fixed a bug whereby the alt ref was not considered in the test.

Change-Id: I3d40bb21abe3e19f8456761e6849deb171738b60
2010-11-17 15:12:04 +00:00
John Koleszar
7ee516d2b3 vp8_set_maps: remove hard-coded width/height
The example for disabling the active map used a hard-coded 320x240
resolution, rather than using what was passed on the command line.

Fixes #218

Change-Id: I3aed713e8aa7fcbf18dfbffd57f142b5cd9ee492
2010-11-17 09:24:05 -05:00
John Koleszar
8d94796cad vp8mt_alloc_temp_buffers: make prototype return void
This function was never called in a context expecting a return value,
the return value was always a constant, and the !CONFIG_MULTITHREAD
path didn't have a return statement, which caused a compiler warning.
This patch changes the function to return void instead.

Fixes issue #231

Change-Id: I9ef7f56e54418b7265026c54fc4ed5660c1418d1
2010-11-17 09:13:57 -05:00
John Koleszar
79e2b1f39b Disable compile warning for ERROR macro
The ERROR macro collides with the MS SDK on Windows. Since we're not
making any win32 calls in this function, just #undef it first to take
ownership.

Change-Id: Ic18c60dfa3a33c52e6c49d3f4f8d3e7e3ac3341d
2010-11-17 09:08:51 -05:00
Fritz Koenig
99d02c0f9f Merge "Comments for alt ref flags." 2010-11-16 16:11:39 -08:00
Fritz Koenig
69ee697fef Comments for alt ref flags.
Clarify what the alt ref flags do when encoding.

Change-Id: I71f78e0f42edae633fb91840f29dfbe64362c44c
2010-11-16 15:16:24 -08:00
Yaowu Xu
4fedfa75f8 Merge "correct errors in token alphabet descriptions" 2010-11-16 14:06:44 -08:00
tomfinegan
faaa57b945 Add x86_64-darwin10-gcc target.
Adds native build configuration for Snow Leopard.  Useful when
users configure without arguments on OSX 10.6.

Change-Id: I0bd63912a25bbfb9d4c8d58a781d0f390792429c
2010-11-16 14:52:05 -05:00
Yaowu Xu
d49da085c0 correct errors in token alphabet descriptions
There were a few errors in the comment section that describes the
VP8 token alphabet table.

Change-Id: Ie6728a0e08bc3798893221b60408d5b201064bdc
2010-11-16 10:51:43 -08:00
Fritz Koenig
e180255375 Remove stack shadowing for x86-x64 for SAD functions.
x86-64 passes arguments in registers.  There is no need to push
them to the stack before using them.

This fixes 15acc84f10 where ebx
was not getting preserved on x86.

Change-Id: I1214b5f818a0201f75ab6ad7d5c6f448e09b16c2
2010-11-15 10:56:02 -08:00
Paul Wilkins
f4709d2895 Merge "Bad cost tables used in ARNR filtering." 2010-11-15 09:55:35 -08:00
Paul Wilkins
373f5c3144 Bad cost tables used in ARNR filtering.
The use of incorrect mv costing tables in the ARNR sub-pel
filtering code led to corruption of the altref buffer in some cases,
particularly at low data rates.

The average gain from this fix is about 0.3% but there are a few
extreme cases where nasty and visible artifacts manifested and
for these few data points the improvement is > 10%.

PGW and AWG

Change-Id: I95cc02b196a433e71d0d2bd2b933fe68ed31e796
2010-11-15 17:47:12 +00:00
Yaowu Xu
73189f21b3 Merge "make rdmult adaptive for intra in quantizer RDO" 2010-11-15 09:22:45 -08:00
Frank Galligan
8c2dfde3ed Fixed bug first cluster timecode of webm file is wrong.
When the first pts equaled 0 ivfenc was incorrectly increasing the
pts by 1. I changed the pts and last pts to be signed. I also set
the default value of last pts to -1.

Change-Id: I30bcec5af9b16d93fa9e3abbea7764b133e9cd73
2010-11-12 11:48:17 -05:00
Yaowu Xu
ef2f27f10e make rdmult adaptive for intra in quantizer RDO
This intends to correct the tendency that VP8 aggressively favors rate
on intra coded frames. Experiments tested different numbers in [0, 1]
and found 9/16 overall provided about 2-4% gains for all-intra coded
clips based on vpx-ssim metric. The impact on regular encoded clips
is much smaller but positive overall. Overall impact on psnr is also
positive even though very small.

Change-Id: If808553aaaa87fdd44691f9787820ac9856d9f8a
2010-11-11 11:33:35 -08:00
John Koleszar
0a49747b01 quantizer: fix assertion in fast quantizer path
The fast quantizer assembly code has not been updated to match the new
exact quantizer, which was made the default in commit 6adbe09.
Specifically, they are not aware of the potential for the coefficient
to be scaled, which results in the quantized result exceeding the range
of the DCT. This patch restores the previous behavior of using the
non-shifted coefficients when in the fast quantizer code path, but
unfortunately requires rebuilding the tables when switching between the
two.

Change-Id: I0a33f5b3850335011a06906f49fafed54dda9546
2010-11-11 13:05:20 -05:00
Fritz Koenig
58083cb34d Revert "Remove stack shadowing for x86-64"
This reverts commit 15acc84f10.

Change-Id: Ia640be8cbc134432914849c1750f62575ea084e6
2010-11-11 08:20:02 -08:00
Paul Wilkins
213f7b0907 Merge "Relax rate control for last few frames" 2010-11-11 02:39:20 -08:00
Fritz Koenig
692b10858d configure : Incorrect syntax in configure
The check to see if postproc was enabled when enabling the
postproc visualizer was wrong.

Fix for bug introduced in Change Ia74f357d

Change-Id: I4bee9ad2caee3cfe3bac6972047f6af7c54cad4e
2010-11-10 14:54:59 -08:00
Fritz Koenig
9b1ece2cca Merge "Remove stack shadowing for x86-64" 2010-11-10 14:36:10 -08:00
Fritz Koenig
5f0e0617ba FDCT optimizations.
Fixed up the fdct for mmx and 8x4 sse2 to match the
most recent changes.

Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719
2010-11-10 14:34:02 -08:00
Fritz Koenig
647df00f30 postproc : Re-work postproc calling to allow more flags.
Debugging in postproc needs more flags to allow for specific
block types to be turned on or off in the visualizations.

Must be enabled with --enable-postproc-visualizer during
configuration time.

Change-Id: Ia74f357ddc3ad4fb8082afd3a64f62384e4fcb2d
2010-11-10 14:14:46 -08:00
Paul Wilkins
513f8e6814 Relax rate control for last few frames
VBR rate control can become very noisy for the last few frames.
If there are a few bits to spare or a small overshoot then the
target rate and hence quantizer may start to fluctuate wildly.

This patch prevents further adjustment of the active Q limits for
the last few frames.

Patch also removes some redundant variables and makes one small bug fix.

Change-Id: Ic167831bec79acc9f0d7e4698bcc4bb188840c45
2010-11-10 10:09:45 +00:00
Paul Wilkins
6adbe09058 Tuning for the more exact quantizer.
Small changes to the default zero bin and rounding tables.
Though the tables are currently the same for the Y1 and Y2 cases
I have left them as separate tables in case we want to tune this later.

There is now some adjustment of the zbin based on the prediction mode.
Previously this was restricted to an adjustment for gf/arf 0,0 MV.

The exact quantizer now marginally outperforms and is the default.

The overall average gain is about 0.5%

Change-Id: I5e4353f3d5326dde4e86823684b236a1e9ea7f47
2010-11-10 09:52:58 +00:00
John Koleszar
458f4fedd2 Merge "improve average framerate calculation" 2010-11-09 08:52:16 -08:00
John Koleszar
4d1b0d2a2d Merge commit 'fix integer promotion bug in partition size check'
Change-Id: I4081917b46013fa8f4218cade8bd12cb2d013aee
2010-11-05 16:49:32 -04:00
John Koleszar
9fb80f7170 fix integer promotion bug in partition size check
The check '(user_data_end - partition < partition_size)' must be
evaluated as a signed comparison, but because partition_size was
unsigned, the LHS was promoted to unsigned, causing an incorrect
result on 32-bit. Instead, check the upper and lower bounds of
the segment separately.

Change-Id: I6266aba7fd7de084268712a3d2a81424ead7aa06
2010-11-05 14:52:53 -04:00
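
An illustrative sketch of the pitfall and the fix (not the decoder's actual
code): the pointer difference is signed, but comparing it against an
unsigned partition_size promotes the left-hand side to unsigned on 32-bit
targets, so a negative difference wraps around and the check passes.

    #include <stddef.h>

    /* Wrong on 32-bit: if partition has run past user_data_end, the pointer
     * difference wraps to a huge unsigned value and the check passes. */
    static int fits_buggy(const unsigned char *partition,
                          const unsigned char *user_data_end,
                          unsigned int partition_size)
    {
        return !(user_data_end - partition < partition_size);
    }

    /* Fixed: check the lower and upper bounds of the segment separately. */
    static int fits_fixed(const unsigned char *partition,
                          const unsigned char *user_data_end,
                          unsigned int partition_size)
    {
        return partition <= user_data_end &&
               (size_t)(user_data_end - partition) >= partition_size;
    }
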
John Koleszar
f7e187d362 improve average framerate calculation
Change Ice204e86 identified a problem with bitrate undershoot due to
low precision in the timestamps passed to the library. This patch
takes a different approach by calculating the duration of this frame
and passing it to the library, rather than using a fixed duration
and letting the library average it out with higher precision
timestamps. This part of the fix only applies to vpxenc.

This patch also attempts to fix the problem for generic applications
that may have made the same mistake vpxenc did. Instead of
calculating this frame's duration by the difference of this frame's
and the last frame's start time, we use the end times instead. This
allows the framerate calculation to scavenge "unclaimed" time from
the last frame. For instance:

  start |  end  | calculated duration
  ======+=======+====================
    0ms    33ms   33ms
   33ms    66ms   33ms
   66ms    99ms   33ms
  100ms   133ms   34ms

Change-Id: I92be4b3518e0bd530e97f90e69e75330a4c413fc
2010-11-05 08:42:46 -04:00
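
A minimal sketch of the end-time rule described above (hypothetical helper,
not the vpxenc code; timestamps are in timebase units and the encoder
context is assumed initialized):

    #include "vpx/vpx_encoder.h"

    static vpx_codec_err_t encode_frame(vpx_codec_ctx_t *enc,
                                        const vpx_image_t *img,
                                        vpx_codec_pts_t start,
                                        vpx_codec_pts_t end,
                                        vpx_codec_pts_t *last_end)
    {
        /* The difference of consecutive end times lets this frame scavenge
         * time the previous frame left unclaimed (133 - 99 = 34 above). */
        unsigned long duration = (unsigned long)(end - *last_end);
        *last_end = end;

        return vpx_codec_encode(enc, img, start, duration, 0,
                                VPX_DL_GOOD_QUALITY);
    }
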
John Koleszar
5551ef0ef4 Merge "vpxdec: report parse errors from webm_guess_framerate()" 2010-11-04 19:18:53 -07:00
John Koleszar
bd05d9e480 vpxdec: report parse errors from webm_guess_framerate()
If this function fails silently, the nestegg context is destroyed and
future nestegg calls will segfault.

Change-Id: Ie6a0ea284ab9ddfa97b1843ef8030a953937c8cd
2010-11-04 14:56:48 -04:00
Fritz Koenig
507eb4b577 Merge "postproc : Update visualizations." 2010-11-04 11:28:18 -07:00
Fritz Koenig
0e7b60617f postproc : Update visualizations.
Change color reference frame to blend the macro block edge.
This helps with layering of visualizations.

Add block coloring for intra prediction modes.

Change-Id: Icefe0e189e26719cd6937cebd6727efac0b4d278
2010-11-04 10:35:02 -07:00
Yaowu Xu
a5397dbaf1 Increase the resolution of default timebase
The old value 1000 was too low, which caused the effective duration and
frame rate calculation to have an 1% error for typical 30 frame/second
inputs. Symptom of the issue has been that most 2 pass encodings were
undershooting target bit rate by 1% or so for 30 fps input.

Change-Id: Ice204e86f844ceb9ce973456f2b995cc095283cf
2010-11-04 09:26:47 +00:00
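
A back-of-the-envelope check of the 1% figure (a standalone sketch, not
library code): with a 1/1000 timebase a 30 fps frame lasts 33.33 ticks but
is passed as 33, so the encoder sees roughly 30.3 fps and budgets about 1%
fewer bits per frame.

    #include <stdio.h>

    int main(void)
    {
        double true_fps = 30.0;
        double apparent_fps = 1000.0 / 33.0;          /* ~30.30 fps */
        double error = apparent_fps / true_fps - 1.0; /* ~0.0101, i.e. ~1% */
        printf("apparent fps %.2f, error %.1f%%\n",
               apparent_fps, 100.0 * error);
        return 0;
    }
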
John Koleszar
77e6b4504b vpxenc: require width and height for raw streams
Defaulting to 320x240 for raw streams is arbitrary and error-prone.
Instead, require that the width and height be set manually if they
can't be parsed from the input file.

Change-Id: Ic61979857e372eed0779c2677247e894f9fd6160
2010-11-03 13:58:44 -04:00
John Koleszar
4b9dc57260 Merge "fix pipe support on windows" 2010-11-02 17:01:54 -07:00
Fritz Koenig
0a29bd9793 postproc : Fix display of motion vectors.
Split motion vectors were all being treated as 4x4
blocks.  Now correctly handle 16x8, 8x16, 8x8, 4x4
blocks.

Change-Id: Icf345c5e69b5e374e12456877ed7c41213ad88cc
2010-11-02 13:29:13 -07:00
Scott LaVarnway
b8f43aec66 Merge "SSSE3 version of fast quantizer" 2010-11-02 06:27:29 -07:00
John Koleszar
c377bf0eec fix pipe support on windows
STDIO streams are opened in text mode by default on Windows. This patch
changes the stdin/stdout streams to be in binary mode if they are being
used for I/O from the vpxenc or vpxdec tools.

Fixes issue #216. Thanks to mw AT hesotech.de for the fix.

Change-Id: I34525b3ce2a4a031d5a48d36df4667589372225b
2010-11-02 09:14:24 -04:00
Fritz Koenig
90c505f218 Merge "postproc : Added SPLITMV visualization, fix line constrain." 2010-11-01 14:41:41 -07:00
Fritz Koenig
9f61a83bf9 postproc : Added SPLITMV visualization, fix line constrain.
Now draw 16 vectors for SPLITMV mode.

Fixed constrain_line to prevent divide-by-zero issues.

Blend block was not centering the shaded area correctly.

Change-Id: I1edabd8b4e553aac8d980f7b45c80159e9202434
2010-11-01 13:27:13 -07:00
Scott LaVarnway
ff4a71f4c2 SSSE3 version of fast quantizer
(test clip: tulip)
For good quality mode with speed=1, this gave the encoder
a small (2 - 3%) performance boost.

Change-Id: I8a1d4269465944ac0819986c2f0be4b0a2ee0b35
2010-11-01 16:24:15 -04:00
Scott LaVarnway
20745f8442 Merge "Finding first label" 2010-11-01 08:42:37 -07:00
John Koleszar
0684c647ef cosmetic: remove alt_ref from vpxenc usage message
Undo an automatic transform.

Change-Id: Ie730a6a31b4680b34e54b61691d67c4b3ed2f2aa
2010-10-29 11:07:31 -04:00
Scott LaVarnway
dcee88ea37 Finding first label
Using tables for the label count and label offset.

Change-Id: Iac3d5b292c37341a881be0af282f5cac3b3e01eb
2010-10-29 10:01:04 -04:00
Yunqing Wang
6614563b8f Save XMM registers in asm functions
XMM6/7 are used in these functions, and need to be saved.

Change-Id: I3dfaddaf2a69cd4bf8e8735c7064b17bac5a14e5
2010-10-28 16:59:03 -04:00
Yunqing Wang
f57fc7bcc6 Merge "Fix full-search SAD function crash in Visual Studio" 2010-10-28 13:46:35 -07:00
John Koleszar
9d93dabee0 Merge branch 'aylesbury' 2010-10-28 16:01:03 -04:00
Yunqing Wang
7e3a1e7361 Fix full-search SAD function crash in Visual Studio
Unlike GCC, the Visual Studio compiler doesn't allocate the SAD output
array 16-byte aligned, which causes a crash in Visual Studio.

Change-Id: Ia755cf5a807f12929bda8db94032bb3c9d0c2362
2010-10-28 15:26:58 -04:00
Timothy B. Terriberry
c4d7e5e67e Eliminate more warnings.
This eliminates a large set of warnings exposed by the Mozilla build
 system (Use of C++ comments in ISO C90 source, commas at the end of
 enum lists, a couple incomplete initializers, and signed/unsigned
 comparisons).
It also eliminates many (but not all) of the warnings exposed by newer
 GCC versions and _FORTIFY_SOURCE (e.g., calling fread and fwrite
 without checking the return values).
There are a few spurious warnings left on my system:

../vp8/encoder/encodemb.c:274:9: warning: 'sz' may be used
 uninitialized in this function
gcc seems to be unable to figure out that the value shortcut doesn't
 change between the two if blocks that test it here.

../vp8/encoder/onyx_if.c:5314:5: warning: comparison of unsigned
 expression >= 0 is always true
../vp8/encoder/onyx_if.c:5319:5: warning: comparison of unsigned
 expression >= 0 is always true
This is true, so far as it goes, but it's comparing against an enum, and the C
 standard does not mandate that enums be unsigned, so the checks can't be
 removed.

Change-Id: Iaf689ae3e3d0ddc5ade00faa474debe73b8d3395
2010-10-27 18:08:04 -07:00
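
A tiny sketch of that last class of warning (hypothetical enum, not libvpx
code): whether a ">= 0"-style check on an enum is "always true" depends on
the underlying type the compiler picks, which the C standard leaves to the
implementation, so the range check stays.

    enum example_mode { MODE_FIRST, MODE_SECOND, MODE_LAST };

    int mode_in_range(enum example_mode m)
    {
        /* gcc may warn "comparison of unsigned expression >= 0 is always
         * true" if it chose an unsigned type for the enum, but another
         * compiler may choose a signed one, so the check is kept. */
        return m >= MODE_FIRST && m <= MODE_LAST;
    }
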
Fritz Koenig
2b4913eb0d Merge "postproc: Tweaks to line drawing and blending." 2010-10-27 13:20:56 -07:00
Fritz Koenig
a097e18964 postproc: Tweaks to line drawing and blending.
Turned down the blending level to make colored blocks obscure
the video less.
Not blending the entire block to give distinction to macro
block edges.
Added configuration so that macro block blending function can
be optimized.
Changed constrain_line with respect to when dx and dy are computed.
Now draw two lines to form an arrow.

Change-Id: Id3ef0fdeeab2949a6664b2c63e2a3e1a89503f6c
2010-10-27 13:20:03 -07:00
John Koleszar
f26fe7d93b Merge "Output the PSNR for the entire file." 2010-10-27 12:06:23 -07:00
Frank Galligan
3d84da6b8d Output the PSNR for the entire file.
If --psnr option is enabled vpxenc will output PSNR values for the
entire file. Added a \n before final output to make sure the output
is on its own line. Overall and Avg psnr matches the values written
to opsnr.stt file.

Change-Id: I869268b704fe8b0c8389d318cceb6072fea102f8
2010-10-27 14:31:07 -04:00
Yunqing Wang
71ecb5d7d9 Full search SAD function optimization in SSE4.1
Use mpsadbw, and calculate 8 sad at once. Function list:
vp8_sad16x16x8_sse4
vp8_sad16x8x8_sse4
vp8_sad8x16x8_sse4
vp8_sad8x8x8_sse4
vp8_sad4x4x8_sse4

(test clip: tulip)
For best quality mode, this gave encoder a 5% performance boost.
For good quality mode with speed=1, this gave encoder a 3%
performance boost.

Change-Id: I083b5a39d39144f88dcbccbef95da6498e490134
2010-10-27 13:36:31 -04:00
Fritz Koenig
15acc84f10 Remove stack shadowing for x86-64
x86-64 passes most arguments in registers.  There is no need to
push them to the stack before using them.

Change-Id: I13c683f1358782682ecafaf1df3fb0af23b978ea
2010-10-21 10:28:08 -07:00
Timothy B. Terriberry
8d0f7a01e6 Add simple version of activity masking.
This uses MB variance to change the RDO weight for mode decision
 and quantization.
Activity is normalized against the average for the frame, which is
 currently tracked using feed-forward statistics.
This could also be used to adjust the quantizer for the entire
 frame, but that requires more extensive rate control changes.
This does not yet attempt to adapt the quantizer within the frame,
 but the signaling cost means that will likely only be useful at
 very high rates.

Change-Id: I26cd7c755cac3ff33cfe0688b1da50b2b87b9c93
2010-10-12 08:41:03 -04:00
75 changed files with 4324 additions and 2856 deletions

45
args.c

@@ -135,6 +135,17 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs)
def->long_name, long_val);
fprintf(fp, " %-37s\t%s\n", option_text, def->desc);
if(def->enums)
{
const struct arg_enum_list *listptr;
fprintf(fp, " %-37s\t ", "");
for(listptr = def->enums; listptr->name; listptr++)
fprintf(fp, "%s%s", listptr->name,
listptr[1].name ? ", " : "\n");
}
}
}
@@ -218,3 +229,37 @@ struct vpx_rational arg_parse_rational(const struct arg *arg)
return rat;
}
int arg_parse_enum(const struct arg *arg)
{
const struct arg_enum_list *listptr;
long int rawval;
char *endptr;
/* First see if the value can be parsed as a raw value */
rawval = strtol(arg->val, &endptr, 10);
if (arg->val[0] != '\0' && endptr[0] == '\0')
{
/* Got a raw value, make sure it's valid */
for(listptr = arg->def->enums; listptr->name; listptr++)
if(listptr->val == rawval)
return rawval;
}
/* Next see if it can be parsed as a string */
for(listptr = arg->def->enums; listptr->name; listptr++)
if(!strcmp(arg->val, listptr->name))
return listptr->val;
die("Option %s: Invalid value '%s'\n", arg->name, arg->val);
return 0;
}
int arg_parse_enum_or_int(const struct arg *arg)
{
if(arg->def->enums)
return arg_parse_enum(arg);
return arg_parse_int(arg);
}

12
args.h

@@ -22,14 +22,23 @@ struct arg
const struct arg_def *def;
};
struct arg_enum_list
{
const char *name;
int val;
};
#define ARG_ENUM_LIST_END {0}
typedef struct arg_def
{
const char *short_name;
const char *long_name;
int has_val;
const char *desc;
const struct arg_enum_list *enums;
} arg_def_t;
#define ARG_DEF(s,l,v,d) {s,l,v,d}
#define ARG_DEF(s,l,v,d) {s,l,v,d, NULL}
#define ARG_DEF_ENUM(s,l,v,d,e) {s,l,v,d,e}
#define ARG_DEF_LIST_END {0}
struct arg arg_init(char **argv);
@@ -41,4 +50,5 @@ char **argv_dup(int argc, const char **argv);
unsigned int arg_parse_uint(const struct arg *arg);
int arg_parse_int(const struct arg *arg);
struct vpx_rational arg_parse_rational(const struct arg *arg);
int arg_parse_enum_or_int(const struct arg *arg);
#endif

@@ -547,6 +547,10 @@ process_common_toolchain() {
tgt_isa=universal
tgt_os=darwin9
;;
*darwin10*)
tgt_isa=x86_64
tgt_os=darwin10
;;
*mingw32*|*cygwin*)
[ -z "$tgt_isa" ] && tgt_isa=x86
tgt_os=win32
@@ -606,6 +610,12 @@ process_common_toolchain() {
add_ldflags "-isysroot /Developer/SDKs/MacOSX10.5.sdk"
add_ldflags "-mmacosx-version-min=10.5"
;;
*-darwin10-*)
add_cflags "-isysroot /Developer/SDKs/MacOSX10.6.sdk"
add_cflags "-mmacosx-version-min=10.6"
add_ldflags "-isysroot /Developer/SDKs/MacOSX10.6.sdk"
add_ldflags "-mmacosx-version-min=10.6"
;;
esac
# Handle Solaris variants. Solaris 10 needs -lposix4
@@ -824,6 +834,7 @@ process_common_toolchain() {
soft_enable sse2
soft_enable sse3
soft_enable ssse3
soft_enable sse4_1
case ${tgt_os} in
win*)
@@ -879,7 +890,7 @@ process_common_toolchain() {
case ${tgt_os} in
win*)
add_asflags -f win${bits}
enabled debug && add_asflags -g dwarf2
enabled debug && add_asflags -g cv8
;;
linux*|solaris*)
add_asflags -f elf${bits}

11
configure vendored

@@ -41,6 +41,7 @@ Advanced options:
${toggle_shared} shared library support
${toggle_small} favor smaller size over speed
${toggle_arm_asm_detok} assembly version of the detokenizer (ARM platforms only)
${toggle_postproc_visualizer} macro block / block level visualizers
Codecs:
Codecs can be selectively enabled or disabled individually, or by family:
@@ -114,6 +115,7 @@ all_platforms="${all_platforms} x86-win32-vs7"
all_platforms="${all_platforms} x86-win32-vs8"
all_platforms="${all_platforms} x86-win32-vs9"
all_platforms="${all_platforms} x86_64-darwin9-gcc"
all_platforms="${all_platforms} x86_64-darwin10-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc"
all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -199,6 +201,7 @@ ARCH_EXT_LIST="
sse2
sse3
ssse3
sse4_1
altivec
"
@@ -249,6 +252,7 @@ CONFIG_LIST="
shared
small
arm_asm_detok
postproc_visualizer
"
CMDLINE_SELECT="
extra_warnings
@@ -288,6 +292,7 @@ CMDLINE_SELECT="
shared
small
arm_asm_detok
postproc_visualizer
"
process_cmdline() {
@@ -324,8 +329,6 @@ post_process_cmdline() {
for c in ${CODECS}; do
enabled ${c} && enable ${c##*_}s
done
}
@@ -535,6 +538,10 @@ process_toolchain() {
# Other toolchain specific defaults
case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
if enabled postproc_visualizer; then
enabled postproc || die "postproc_visualizer requires postproc to be enabled"
fi
}

@@ -17,6 +17,7 @@ vpxdec.SRCS += md5_utils.c md5_utils.h
vpxdec.SRCS += vpx_ports/vpx_timer.h
vpxdec.SRCS += vpx/vpx_integer.h
vpxdec.SRCS += args.c args.h vpx_ports/config.h
vpxdec.SRCS += tools_common.c tools_common.h
vpxdec.SRCS += nestegg/halloc/halloc.h
vpxdec.SRCS += nestegg/halloc/src/align.h
vpxdec.SRCS += nestegg/halloc/src/halloc.c
@@ -28,11 +29,13 @@ vpxdec.GUID = BA5FE66F-38DD-E034-F542-B1578C5FB950
vpxdec.DESCRIPTION = Full featured decoder
UTILS-$(CONFIG_ENCODERS) += vpxenc.c
vpxenc.SRCS += args.c args.h y4minput.c y4minput.h
vpxenc.SRCS += tools_common.c tools_common.h
vpxenc.SRCS += vpx_ports/config.h vpx_ports/mem_ops.h
vpxenc.SRCS += vpx_ports/mem_ops_aligned.h
vpxenc.SRCS += libmkv/EbmlIDs.h
vpxenc.SRCS += libmkv/EbmlWriter.c
vpxenc.SRCS += libmkv/EbmlWriter.h
vpxenc.SRCS += experimental.c
vpxenc.GUID = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
vpxenc.DESCRIPTION = Full featured encoder

@@ -78,8 +78,8 @@ if(frame_cnt + 1 == 22) {
} else if(frame_cnt + 1 == 44) {
vpx_active_map_t active;
active.rows = 240/16;
active.cols = 320/16;
active.rows = cfg.g_h/16;
active.cols = cfg.g_w/16;
/* pass in null map to disable active_map*/
active.active_map = NULL;

29
experimental.c Normal file

@@ -0,0 +1,29 @@
#define EXPERIMENTAL_C
#include <stdio.h>
#include "args.h"
/* Get argument definitions */
#include "experimental.h"
/* Build argument definition list */
static const arg_def_t *xxx_def_list[] = {
#include "experimental.h"
NULL
};
void xxx_show_usage(FILE *fp)
{
arg_show_usage(fp, xxx_def_list);
}
int xxx_parse_arg(char **argi)
{
struct arg arg;
arg = arg_init(argi);
if(0);
#include "experimental.h"
else return 0;
return 1;
}

56
experimental.h Normal file

@@ -0,0 +1,56 @@
#if defined(EXPERIMENTAL_C)
/* The experimental.c file includes this file multiple times to build up the
* required state.
*/
#if !defined(XXX_ARG_DEF)
#define XXX_ARG_DEF(sym, value) \
static const arg_def_t xxx_arg_def_##sym = \
ARG_DEF(NULL, #sym, 1, "Experimental");
#define XXX_DEFINE_INT(sym, value) \
XXX_ARG_DEF(sym, value); int xxx_##sym = value;
#define XXX_DEFINE_UINT(sym, value) \
XXX_ARG_DEF(sym, value); unsigned int xxx_##sym = value;
#elif !defined(XXX_ARG_DEF_LIST)
#define XXX_ARG_DEF_LIST(sym) &xxx_arg_def_##sym,
#undef XXX_DEFINE_INT
#define XXX_DEFINE_INT(sym, value) XXX_ARG_DEF_LIST(sym)
#undef XXX_DEFINE_UINT
#define XXX_DEFINE_UINT(sym, value) XXX_ARG_DEF_LIST(sym)
#elif !defined(XXX_ARG_MATCH)
#define XXX_ARG_MATCH
#undef XXX_DEFINE_INT
#define XXX_DEFINE_INT(sym, value)\
else if (arg_match(&arg, &xxx_arg_def_##sym, argi)) \
xxx_##sym = arg_parse_int(&arg);
#undef XXX_DEFINE_UINT
#define XXX_DEFINE_UINT(sym, value)\
else if (arg_match(&arg, &xxx_arg_def_##sym, argi)) \
xxx_##sym = arg_parse_uint(&arg);
#endif
#else
/* All other files just get the extern references to these symbols. */
#define XXX_DEFINE_INT(sym, value) extern int xxx_##sym;
#define XXX_DEFINE_UINT(sym, value) extern unsigned int xxx_##sym;
#include <stdio.h>
void xxx_show_usage(FILE *fp);
int xxx_parse_arg(char **argi);
#endif
/*
* BEGIN EXPERIMENTS BELOW
*
* XXX_DEFINE_INT(knob, 0)
*/
XXX_DEFINE_INT(foo, 0)
XXX_DEFINE_INT(bar, 0)

24
tools_common.c Normal file

@@ -0,0 +1,24 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include "tools_common.h"
#ifdef _WIN32
#include <io.h>
#include <fcntl.h>
#endif
FILE* set_binary_mode(FILE *stream)
{
(void)stream;
#ifdef _WIN32
_setmode(_fileno(stream), _O_BINARY);
#endif
return stream;
}

16
tools_common.h Normal file

@@ -0,0 +1,16 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef TOOLS_COMMON_H
#define TOOLS_COMMON_H
/* Sets a stdio stream into binary mode */
FILE* set_binary_mode(FILE *stream);
#endif

@@ -36,6 +36,14 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
7, 11, 14, 15,
};
DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
{
1, 2, 6, 7,
3, 5, 8, 13,
4, 9, 12, 14,
10, 11, 15, 16
};
DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]);
const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
@@ -106,23 +114,20 @@ static void init_bit_trees()
init_bit_tree(cat6, 11);
}
static vp8bc_index_t bcc1[1], bcc2[2], bcc3[3], bcc4[4], bcc5[5], bcc6[11];
vp8_extra_bit_struct vp8_extra_bits[12] =
{
{ 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 1},
{ 0, 0, 0, 0, 2},
{ 0, 0, 0, 0, 3},
{ 0, 0, 0, 0, 4},
{ cat1, Pcat1, bcc1, 1, 5},
{ cat2, Pcat2, bcc2, 2, 7},
{ cat3, Pcat3, bcc3, 3, 11},
{ cat4, Pcat4, bcc4, 4, 19},
{ cat5, Pcat5, bcc5, 5, 35},
{ cat6, Pcat6, bcc6, 11, 67},
{ 0, 0, 0, 0, 0}
{ 0, 0, 0, 0},
{ 0, 0, 0, 1},
{ 0, 0, 0, 2},
{ 0, 0, 0, 3},
{ 0, 0, 0, 4},
{ cat1, Pcat1, 1, 5},
{ cat2, Pcat2, 2, 7},
{ cat3, Pcat3, 3, 11},
{ cat4, Pcat4, 4, 19},
{ cat5, Pcat5, 5, 35},
{ cat6, Pcat6, 11, 67},
{ 0, 0, 0, 0}
};
#include "defaultcoefcounts.h"

@@ -24,10 +24,10 @@
#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
#define DCT_VAL_CATEGORY3 7 /* 11-26 Extra Bits 4+1 */
#define DCT_VAL_CATEGORY4 8 /* 11-26 Extra Bits 5+1 */
#define DCT_VAL_CATEGORY5 9 /* 27-58 Extra Bits 5+1 */
#define DCT_VAL_CATEGORY6 10 /* 59+ Extra Bits 11+1 */
#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */
#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */
#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */
#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */
#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
#define vp8_coef_tokens 12
@@ -42,7 +42,6 @@ typedef struct
{
vp8_tree_p tree;
const vp8_prob *prob;
vp8bc_index_t *prob_bc;
int Len;
int base_val;
} vp8_extra_bit_struct;
@@ -95,6 +94,7 @@ struct VP8Common;
void vp8_default_coef_probs(struct VP8Common *);
extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
extern short vp8_default_zig_zag_mask[16];
extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];

@@ -65,11 +65,13 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c;
#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
rtcd->postproc.down = vp8_mbpost_proc_down_c;
rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
rtcd->postproc.downacross = vp8_post_proc_down_and_across_c;
rtcd->postproc.addnoise = vp8_plane_add_noise_c;
rtcd->postproc.blend_mb = vp8_blend_mb_c;
rtcd->postproc.down = vp8_mbpost_proc_down_c;
rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
rtcd->postproc.downacross = vp8_post_proc_down_and_across_c;
rtcd->postproc.addnoise = vp8_plane_add_noise_c;
rtcd->postproc.blend_mb_inner = vp8_blend_mb_inner_c;
rtcd->postproc.blend_mb_outer = vp8_blend_mb_outer_c;
rtcd->postproc.blend_b = vp8_blend_b_c;
#endif
#endif

@@ -18,6 +18,7 @@ extern "C"
#endif
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8cx.h"
#include "vpx_scale/yv12config.h"
#include "type_aliases.h"
#include "ppflags.h"
@@ -189,6 +190,8 @@ extern "C"
struct vpx_fixed_buf two_pass_stats_in;
struct vpx_codec_pkt_list *output_pkt_list;
vp8e_tuning tuning;
} VP8_CONFIG;
@@ -204,7 +207,7 @@ extern "C"
// and not just a copy of the pointer..
int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp);
int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush);
int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags);
int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);
int vp8_update_reference(VP8_PTR comp, int ref_frame_flags);

@@ -105,7 +105,7 @@ typedef struct VP8Common
YV12_BUFFER_CONFIG post_proc_buffer;
YV12_BUFFER_CONFIG temp_scale_frame;
FRAME_TYPE last_frame_type; /* Add to check if vp8_frame_init_loop_filter() can be skipped. */
FRAME_TYPE last_frame_type; /* Save last frame's frame type for loopfilter init checking and motion search. */
FRAME_TYPE frame_type;
int show_frame;

@@ -51,7 +51,7 @@ extern "C"
int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);
int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp);
int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level, int noise_level, int flags);
int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags);
int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);

@@ -26,7 +26,7 @@
( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
/* global constants */
#if CONFIG_POSTPROC_VISUALIZER
static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
{
{ RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
@@ -41,13 +41,32 @@ static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
{ RGB_TO_YUV(0xFF0000) } /* Red */
};
static const unsigned char MV_REFERENCE_FRAME_colors[MB_MODE_COUNT][3] =
static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] =
{
{ RGB_TO_YUV(0x6633ff) }, /* Purple */
{ RGB_TO_YUV(0xcc33ff) }, /* Magenta */
{ RGB_TO_YUV(0xff33cc) }, /* Pink */
{ RGB_TO_YUV(0xff3366) }, /* Coral */
{ RGB_TO_YUV(0x3366ff) }, /* Blue */
{ RGB_TO_YUV(0xed00f5) }, /* Dark Blue */
{ RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */
{ RGB_TO_YUV(0xff6633) }, /* Orange */
{ RGB_TO_YUV(0x33ccff) }, /* Light Blue */
{ RGB_TO_YUV(0x8ab800) }, /* Green */
{ RGB_TO_YUV(0xffcc33) }, /* Light Orange */
{ RGB_TO_YUV(0x33ffcc) }, /* Aqua */
{ RGB_TO_YUV(0x66ff33) }, /* Light Green */
{ RGB_TO_YUV(0xccff33) }, /* Yellow */
};
static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
{
{ RGB_TO_YUV(0x00ff00) }, /* Blue */
{ RGB_TO_YUV(0x0000ff) }, /* Green */
{ RGB_TO_YUV(0xffff00) }, /* Yellow */
{ RGB_TO_YUV(0xff0000) }, /* Red */
};
#endif
static const short kernel5[] =
{
@@ -476,7 +495,7 @@ void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
* edges unblended to give distinction to macro blocks in areas
* filled with the same color block.
*/
void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v,
void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
int y1, int u1, int v1, int alpha, int stride)
{
int i, j;
@@ -484,10 +503,10 @@ void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v,
int u1_const = u1*((1<<16)-alpha);
int v1_const = v1*((1<<16)-alpha);
y += stride + 2;
for (i = 0; i < 14; i++)
y += 2*stride + 2;
for (i = 0; i < 12; i++)
{
for (j = 0; j < 14; j++)
for (j = 0; j < 12; j++)
{
y[j] = (y[j]*alpha + y1_const)>>16;
}
@@ -511,6 +530,104 @@ void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v,
}
}
/* Blend only the edge of the macro block. Leave center
* unblended to allow for other visualizations to be layered.
*/
void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v,
int y1, int u1, int v1, int alpha, int stride)
{
int i, j;
int y1_const = y1*((1<<16)-alpha);
int u1_const = u1*((1<<16)-alpha);
int v1_const = v1*((1<<16)-alpha);
for (i = 0; i < 2; i++)
{
for (j = 0; j < 16; j++)
{
y[j] = (y[j]*alpha + y1_const)>>16;
}
y += stride;
}
for (i = 0; i < 12; i++)
{
y[0] = (y[0]*alpha + y1_const)>>16;
y[1] = (y[1]*alpha + y1_const)>>16;
y[14] = (y[14]*alpha + y1_const)>>16;
y[15] = (y[15]*alpha + y1_const)>>16;
y += stride;
}
for (i = 0; i < 2; i++)
{
for (j = 0; j < 16; j++)
{
y[j] = (y[j]*alpha + y1_const)>>16;
}
y += stride;
}
stride >>= 1;
for (j = 0; j < 8; j++)
{
u[j] = (u[j]*alpha + u1_const)>>16;
v[j] = (v[j]*alpha + v1_const)>>16;
}
u += stride;
v += stride;
for (i = 0; i < 6; i++)
{
u[0] = (u[0]*alpha + u1_const)>>16;
v[0] = (v[0]*alpha + v1_const)>>16;
u[7] = (u[7]*alpha + u1_const)>>16;
v[7] = (v[7]*alpha + v1_const)>>16;
u += stride;
v += stride;
}
for (j = 0; j < 8; j++)
{
u[j] = (u[j]*alpha + u1_const)>>16;
v[j] = (v[j]*alpha + v1_const)>>16;
}
}
void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
int y1, int u1, int v1, int alpha, int stride)
{
int i, j;
int y1_const = y1*((1<<16)-alpha);
int u1_const = u1*((1<<16)-alpha);
int v1_const = v1*((1<<16)-alpha);
for (i = 0; i < 4; i++)
{
for (j = 0; j < 4; j++)
{
y[j] = (y[j]*alpha + y1_const)>>16;
}
y += stride;
}
stride >>= 1;
for (i = 0; i < 2; i++)
{
for (j = 0; j < 2; j++)
{
u[j] = (u[j]*alpha + u1_const)>>16;
v[j] = (v[j]*alpha + v1_const)>>16;
}
u += stride;
v += stride;
}
}
static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int height)
{
int dx;
@@ -522,7 +639,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
dy = *y1 - y0;
*x1 = width;
if (dy)
if (dx)
*y1 = ((width-x0)*dy)/dx + y0;
}
if (*x1 < 0)
@@ -531,7 +648,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
dy = *y1 - y0;
*x1 = 0;
if (dy)
if (dx)
*y1 = ((0-x0)*dy)/dx + y0;
}
if (*y1 > height)
@@ -540,7 +657,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
dy = *y1 - y0;
*y1 = height;
if (dx)
if (dy)
*x1 = ((height-y0)*dx)/dy + x0;
}
if (*y1 < 0)
@@ -549,7 +666,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
dy = *y1 - y0;
*y1 = 0;
if (dx)
if (dy)
*x1 = ((0-y0)*dx)/dy + x0;
}
}
@@ -561,10 +678,13 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
#define RTCD_VTABLE(oci) NULL
#endif
int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
{
char message[512];
int q = oci->filter_level * 10 / 6;
int flags = ppflags->post_proc_flag;
int deblock_level = ppflags->deblocking_level;
int noise_level = ppflags->noise_level;
if (!oci->frame_to_show)
return -1;
@@ -621,7 +741,8 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
oci->post_proc_buffer.y_stride);
}
if (flags & VP8D_DEBUG_LEVEL1)
#if CONFIG_POSTPROC_VISUALIZER
if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
{
sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
(oci->frame_type == KEY_FRAME),
@@ -633,7 +754,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
}
if (flags & VP8D_DEBUG_LEVEL2)
if (flags & VP8D_DEBUG_TXT_MBLK_MODES)
{
int i, j;
unsigned char *y_ptr;
@@ -665,7 +786,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
}
}
if (flags & VP8D_DEBUG_LEVEL3)
if (flags & VP8D_DEBUG_TXT_DC_DIFF)
{
int i, j;
unsigned char *y_ptr;
@@ -700,45 +821,14 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
}
}
if (flags & VP8D_DEBUG_LEVEL4)
if (flags & VP8D_DEBUG_TXT_RATE_INFO)
{
sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
#if 0
int i, j;
unsigned char *y_ptr;
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int mb_rows = post->y_height >> 4;
int mb_cols = post->y_width >> 4;
int mb_index = 0;
MODE_INFO *mi = oci->mi;
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
/* vp8_filter each macro block */
for (i = 0; i < mb_rows; i++)
{
for (j = 0; j < mb_cols; j++)
{
char zz[4];
sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
vp8_blit_text(zz, y_ptr, post->y_stride);
mb_index ++;
y_ptr += 16;
}
mb_index ++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
}
#endif
}
/* Draw motion vectors */
if (flags & VP8D_DEBUG_LEVEL5)
if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag)
{
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
@@ -749,29 +839,144 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
MODE_INFO *mi = oci->mi;
int x0, y0;
for (y0 = 8; y0 < (height + 8); y0 += 16)
for (y0 = 0; y0 < height; y0 += 16)
{
for (x0 = 8; x0 < (width + 8); x0 += 16)
for (x0 = 0; x0 < width; x0 += 16)
{
int x1, y1;
if (mi->mbmi.mode >= NEARESTMV)
int x1, y1;
if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode)))
{
mi++;
continue;
}
if (mi->mbmi.mode == SPLITMV)
{
switch (mi->mbmi.partitioning)
{
case 0 : /* mv_top_bottom */
{
B_MODE_INFO *bmi = &mi->bmi[0];
MV *mv = &bmi->mv.as_mv;
x1 = x0 + 8 + (mv->col >> 3);
y1 = y0 + 4 + (mv->row >> 3);
constrain_line (x0+8, &x1, y0+4, &y1, width, height);
vp8_blit_line (x0+8, x1, y0+4, y1, y_buffer, y_stride);
bmi = &mi->bmi[8];
mv = &bmi->mv.as_mv; /* use the bottom half's motion vector */
x1 = x0 + 8 + (mv->col >> 3);
y1 = y0 +12 + (mv->row >> 3);
constrain_line (x0+8, &x1, y0+12, &y1, width, height);
vp8_blit_line (x0+8, x1, y0+12, y1, y_buffer, y_stride);
break;
}
case 1 : /* mv_left_right */
{
B_MODE_INFO *bmi = &mi->bmi[0];
MV *mv = &bmi->mv.as_mv;
x1 = x0 + 4 + (mv->col >> 3);
y1 = y0 + 8 + (mv->row >> 3);
constrain_line (x0+4, &x1, y0+8, &y1, width, height);
vp8_blit_line (x0+4, x1, y0+8, y1, y_buffer, y_stride);
bmi = &mi->bmi[2];
mv = &bmi->mv.as_mv; /* use the right half's motion vector */
x1 = x0 +12 + (mv->col >> 3);
y1 = y0 + 8 + (mv->row >> 3);
constrain_line (x0+12, &x1, y0+8, &y1, width, height);
vp8_blit_line (x0+12, x1, y0+8, y1, y_buffer, y_stride);
break;
}
case 2 : /* mv_quarters */
{
B_MODE_INFO *bmi = &mi->bmi[0];
MV *mv = &bmi->mv.as_mv;
x1 = x0 + 4 + (mv->col >> 3);
y1 = y0 + 4 + (mv->row >> 3);
constrain_line (x0+4, &x1, y0+4, &y1, width, height);
vp8_blit_line (x0+4, x1, y0+4, y1, y_buffer, y_stride);
bmi = &mi->bmi[2];
mv = &bmi->mv.as_mv; /* top-right quarter */
x1 = x0 +12 + (mv->col >> 3);
y1 = y0 + 4 + (mv->row >> 3);
constrain_line (x0+12, &x1, y0+4, &y1, width, height);
vp8_blit_line (x0+12, x1, y0+4, y1, y_buffer, y_stride);
bmi = &mi->bmi[8];
mv = &bmi->mv.as_mv; /* bottom-left quarter */
x1 = x0 + 4 + (mv->col >> 3);
y1 = y0 +12 + (mv->row >> 3);
constrain_line (x0+4, &x1, y0+12, &y1, width, height);
vp8_blit_line (x0+4, x1, y0+12, y1, y_buffer, y_stride);
bmi = &mi->bmi[10];
mv = &bmi->mv.as_mv; /* bottom-right quarter */
x1 = x0 +12 + (mv->col >> 3);
y1 = y0 +12 + (mv->row >> 3);
constrain_line (x0+12, &x1, y0+12, &y1, width, height);
vp8_blit_line (x0+12, x1, y0+12, y1, y_buffer, y_stride);
break;
}
default :
{
B_MODE_INFO *bmi = mi->bmi;
int bx0, by0;
for (by0 = y0; by0 < (y0+16); by0 += 4)
{
for (bx0 = x0; bx0 < (x0+16); bx0 += 4)
{
MV *mv = &bmi->mv.as_mv;
x1 = bx0 + 2 + (mv->col >> 3);
y1 = by0 + 2 + (mv->row >> 3);
constrain_line (bx0+2, &x1, by0+2, &y1, width, height);
vp8_blit_line (bx0+2, x1, by0+2, y1, y_buffer, y_stride);
bmi++;
}
}
}
}
}
else if (mi->mbmi.mode >= NEARESTMV)
{
MV *mv = &mi->mbmi.mv.as_mv;
const int lx0 = x0 + 8;
const int ly0 = y0 + 8;
x1 = x0 + (mv->col >> 3);
y1 = y0 + (mv->row >> 3);
x1 = lx0 + (mv->col >> 3);
y1 = ly0 + (mv->row >> 3);
if (x1 != x0 && y1 != y0)
if (x1 != lx0 && y1 != ly0)
{
constrain_line (x0, &x1, y0-1, &y1, width, height);
vp8_blit_line (x0, x1, y0-1, y1, y_buffer, y_stride);
constrain_line (lx0, &x1, ly0-1, &y1, width, height);
vp8_blit_line (lx0, x1, ly0-1, y1, y_buffer, y_stride);
constrain_line (x0, &x1, y0+1, &y1, width, height);
vp8_blit_line (x0, x1, y0+1, y1, y_buffer, y_stride);
constrain_line (lx0, &x1, ly0+1, &y1, width, height);
vp8_blit_line (lx0, x1, ly0+1, y1, y_buffer, y_stride);
}
else
vp8_blit_line (x0, x1, y0, y1, y_buffer, y_stride);
vp8_blit_line (lx0, x1, ly0, y1, y_buffer, y_stride);
}
mi++;
}
mi++;
@@ -779,9 +984,10 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
}
/* Color in block modes */
if (flags & VP8D_DEBUG_LEVEL6)
if ((flags & VP8D_DEBUG_CLR_BLK_MODES)
&& (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag))
{
int i, j;
int y, x;
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
@@ -791,18 +997,54 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
int y_stride = oci->post_proc_buffer.y_stride;
MODE_INFO *mi = oci->mi;
for (i = 0; i < height; i += 16)
for (y = 0; y < height; y += 16)
{
for (j = 0; j < width; j += 16)
for (x = 0; x < width; x += 16)
{
int Y = 0, U = 0, V = 0;
Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
if (mi->mbmi.mode == B_PRED &&
((ppflags->display_mb_modes_flag & B_PRED) || ppflags->display_b_modes_flag))
{
int by, bx;
unsigned char *yl, *ul, *vl;
B_MODE_INFO *bmi = mi->bmi;
POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)
(&y_ptr[j], &u_ptr[j>>1], &v_ptr[j>>1], Y, U, V, 0xc000, y_stride);
yl = y_ptr + x;
ul = u_ptr + (x>>1);
vl = v_ptr + (x>>1);
for (by = 0; by < 16; by += 4)
{
for (bx = 0; bx < 16; bx += 4)
{
if ((ppflags->display_b_modes_flag & (1<<mi->mbmi.mode))
|| (ppflags->display_mb_modes_flag & B_PRED))
{
Y = B_PREDICTION_MODE_colors[bmi->mode][0];
U = B_PREDICTION_MODE_colors[bmi->mode][1];
V = B_PREDICTION_MODE_colors[bmi->mode][2];
POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
(yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
}
bmi++;
}
yl += y_stride*4;
ul += y_stride*1;
vl += y_stride*1;
}
}
else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode))
{
Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)
(y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
}
mi++;
}
@@ -815,9 +1057,9 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
}
/* Color in frame reference blocks */
if (flags & VP8D_DEBUG_LEVEL7)
if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag)
{
int i, j;
int y, x;
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
@@ -827,18 +1069,21 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
int y_stride = oci->post_proc_buffer.y_stride;
MODE_INFO *mi = oci->mi;
for (i = 0; i < height; i += 16)
for (y = 0; y < height; y += 16)
{
for (j = 0; j < width; j +=16)
for (x = 0; x < width; x +=16)
{
int Y = 0, U = 0, V = 0;
Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
if (ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame))
{
Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)
(&y_ptr[j], &u_ptr[j>>1], &v_ptr[j>>1], Y, U, V, 0xc000, y_stride);
POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)
(y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
}
mi++;
}
@@ -849,6 +1094,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
mi++;
}
}
#endif
*dest = oci->post_proc_buffer;


@@ -24,7 +24,15 @@
char whiteclamp[16], char bothclamp[16],\
unsigned int w, unsigned int h, int pitch)
#define prototype_postproc_blend_mb(sym)\
#define prototype_postproc_blend_mb_inner(sym)\
void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
int y1, int u1, int v1, int alpha, int stride)
#define prototype_postproc_blend_mb_outer(sym)\
void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
int y1, int u1, int v1, int alpha, int stride)
#define prototype_postproc_blend_b(sym)\
void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
int y1, int u1, int v1, int alpha, int stride)
@@ -52,22 +60,36 @@ extern prototype_postproc(vp8_postproc_downacross);
#endif
extern prototype_postproc_addnoise(vp8_postproc_addnoise);
#ifndef vp8_postproc_blend_mb
#define vp8_postproc_blend_mb vp8_blend_mb_c
#ifndef vp8_postproc_blend_mb_inner
#define vp8_postproc_blend_mb_inner vp8_blend_mb_inner_c
#endif
extern prototype_postproc_blend_mb(vp8_postproc_blend_mb);
extern prototype_postproc_blend_mb_inner(vp8_postproc_blend_mb_inner);
#ifndef vp8_postproc_blend_mb_outer
#define vp8_postproc_blend_mb_outer vp8_blend_mb_outer_c
#endif
extern prototype_postproc_blend_mb_outer(vp8_postproc_blend_mb_outer);
#ifndef vp8_postproc_blend_b
#define vp8_postproc_blend_b vp8_blend_b_c
#endif
extern prototype_postproc_blend_b(vp8_postproc_blend_b);
typedef prototype_postproc((*vp8_postproc_fn_t));
typedef prototype_postproc_inplace((*vp8_postproc_inplace_fn_t));
typedef prototype_postproc_addnoise((*vp8_postproc_addnoise_fn_t));
typedef prototype_postproc_blend_mb((*vp8_postproc_blend_mb_fn_t));
typedef prototype_postproc_blend_mb_inner((*vp8_postproc_blend_mb_inner_fn_t));
typedef prototype_postproc_blend_mb_outer((*vp8_postproc_blend_mb_outer_fn_t));
typedef prototype_postproc_blend_b((*vp8_postproc_blend_b_fn_t));
typedef struct
{
vp8_postproc_inplace_fn_t down;
vp8_postproc_inplace_fn_t across;
vp8_postproc_fn_t downacross;
vp8_postproc_addnoise_fn_t addnoise;
vp8_postproc_blend_mb_fn_t blend_mb;
vp8_postproc_inplace_fn_t down;
vp8_postproc_inplace_fn_t across;
vp8_postproc_fn_t downacross;
vp8_postproc_addnoise_fn_t addnoise;
vp8_postproc_blend_mb_inner_fn_t blend_mb_inner;
vp8_postproc_blend_mb_outer_fn_t blend_mb_outer;
vp8_postproc_blend_b_fn_t blend_b;
} vp8_postproc_rtcd_vtable_t;
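
These blend routines are reached through the runtime-CPU-detect (RTCD) vtable above rather than called directly, which is why the diff threads the new blend_mb_inner / blend_mb_outer / blend_b entries through both the prototypes and the vtable struct. The POSTPROC_INVOKE / RTCD_VTABLE macro bodies are defined elsewhere in the tree; the sketch below only illustrates the usual shape of that dispatch (the CONFIG_RUNTIME_CPU_DETECT branch is an assumption, the NULL fallback matches the #define visible earlier in this diff):

/* Sketch of the dispatch convention, not the actual libvpx macro definitions. */
#if CONFIG_RUNTIME_CPU_DETECT
#define POSTPROC_INVOKE(ctx, fn) (ctx)->fn              /* vtable lookup     */
#define RTCD_VTABLE(oci)         (&(oci)->rtcd.postproc)
#else
#define POSTPROC_INVOKE(ctx, fn) vp8_postproc_##fn      /* default C symbol  */
#define RTCD_VTABLE(oci)         NULL
#endif

/* Used as in the frame code above:
 * POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)(y, u, v, Y, U, V, 0xc000, stride);
 */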
#if CONFIG_RUNTIME_CPU_DETECT
@@ -89,7 +111,7 @@ struct postproc_state
#include "onyxc_int.h"
#include "ppflags.h"
int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
int deblock_level, int noise_level, int flags);
vp8_ppflags_t *flags);
void vp8_de_noise(YV12_BUFFER_CONFIG *source,


@@ -13,17 +13,28 @@
#define __INC_PPFLAGS_H
enum
{
VP8D_NOFILTERING = 0,
VP8D_DEBLOCK = 1<<0,
VP8D_DEMACROBLOCK = 1<<1,
VP8D_ADDNOISE = 1<<2,
VP8D_DEBUG_LEVEL1 = 1<<3,
VP8D_DEBUG_LEVEL2 = 1<<4,
VP8D_DEBUG_LEVEL3 = 1<<5,
VP8D_DEBUG_LEVEL4 = 1<<6,
VP8D_DEBUG_LEVEL5 = 1<<7,
VP8D_DEBUG_LEVEL6 = 1<<8,
VP8D_DEBUG_LEVEL7 = 1<<9
VP8D_NOFILTERING = 0,
VP8D_DEBLOCK = 1<<0,
VP8D_DEMACROBLOCK = 1<<1,
VP8D_ADDNOISE = 1<<2,
VP8D_DEBUG_TXT_FRAME_INFO = 1<<3,
VP8D_DEBUG_TXT_MBLK_MODES = 1<<4,
VP8D_DEBUG_TXT_DC_DIFF = 1<<5,
VP8D_DEBUG_TXT_RATE_INFO = 1<<6,
VP8D_DEBUG_DRAW_MV = 1<<7,
VP8D_DEBUG_CLR_BLK_MODES = 1<<8,
VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
};
typedef struct
{
int post_proc_flag;
int deblocking_level;
int noise_level;
int display_ref_frame_flag;
int display_mb_modes_flag;
int display_b_modes_flag;
int display_mv_flag;
} vp8_ppflags_t;
#endif
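
With the numbered debug levels replaced by named flags, a decoder caller now fills in a vp8_ppflags_t and passes it down (the new vp8_post_proc_frame() and vp8dx_get_raw_frame() signatures in this diff take a vp8_ppflags_t * instead of separate deblock/noise/flag arguments). A hedged sketch of typical usage; the level and mask values are purely illustrative, and the prediction-mode enums come from the codec's common headers:

#include <string.h>
#include "ppflags.h"

/* Ask for deblocking plus a motion-vector overlay on the inter modes. */
static void setup_postproc_flags(vp8_ppflags_t *ppflags)
{
    memset(ppflags, 0, sizeof(*ppflags));
    ppflags->post_proc_flag   = VP8D_DEBLOCK | VP8D_DEBUG_DRAW_MV;
    ppflags->deblocking_level = 5;
    ppflags->noise_level      = 0;
    /* display_mv_flag is a bitmask over prediction modes, as tested
     * with (1 << mi->mbmi.mode) in the drawing code above.          */
    ppflags->display_mv_flag  = (1 << NEARESTMV) | (1 << NEARMV) |
                                (1 << NEWMV)     | (1 << SPLITMV);
}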


@@ -1,46 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/****************************************************************************
*
* Module Title : preproc.h
*
* Description : simple preprocessor
*
****************************************************************************/
#ifndef __INC_PREPROC_H
#define __INC_PREPROC_H
/****************************************************************************
* Types
****************************************************************************/
typedef struct
{
unsigned char *frame_buffer;
int frame;
unsigned int *fixed_divide;
unsigned char *frame_buffer_alloc;
unsigned int *fixed_divide_alloc;
} pre_proc_instance;
/****************************************************************************
* Functions.
****************************************************************************/
void pre_proc_machine_specific_config(void);
void delete_pre_proc(pre_proc_instance *ppi);
int init_pre_proc(pre_proc_instance *ppi, int frame_size);
extern void spatial_filter_c(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int width, int height, int pitch, int strength);
extern void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
#endif


@@ -1,76 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/****************************************************************************
*
* Module Title : preproc_if.h
*
* Description : Pre-processor interface header file.
*
****************************************************************************/
#ifndef __PREPROC_IF_H
#define __PREPROC_IF_H
/****************************************************************************
* Header Files
****************************************************************************/
#include "type_aliases.h"
/****************************************************************************
* Types
****************************************************************************/
typedef struct
{
UINT8 *Yuv0ptr;
UINT8 *Yuv1ptr;
UINT8 *frag_info; // blocks coded : passed in
UINT32 frag_info_element_size; // size of each element
UINT32 frag_info_coded_mask; // mask to get at whether fragment is coded
UINT32 *region_index; // Gives pixel index for top left of each block
UINT32 video_frame_height;
UINT32 video_frame_width;
UINT8 hfrag_pixels;
UINT8 vfrag_pixels;
} SCAN_CONFIG_DATA;
typedef enum
{
SCP_FILTER_ON_OFF,
SCP_SET_SRF_OFFSET,
SCP_SET_EBO_ON_OFF,
SCP_SET_VCAP_LEVEL_OFFSET,
SCP_SET_SHOW_LOCAL
} SCP_SETTINGS;
typedef struct PP_INSTANCE *x_pp_inst;
/****************************************************************************
* Module statics
****************************************************************************/
/* Controls whether Early break out is on or off in default case */
#define EARLY_BREAKOUT_DEFAULT TRUE
/****************************************************************************
* Functions
****************************************************************************/
extern void set_scan_param(x_pp_inst ppi, UINT32 param_id, INT32 param_value);
extern UINT32 yuvanalyse_frame(x_pp_inst ppi, UINT32 *KFIndicator);
extern x_pp_inst create_pp_instance(void);
extern void delete_pp_instance(x_pp_inst *);
extern BOOL scan_yuvinit(x_pp_inst, SCAN_CONFIG_DATA *scan_config_ptr);
#endif


@@ -19,7 +19,7 @@
extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
extern int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
extern void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
#endif


@@ -461,7 +461,8 @@ static void setup_token_decoder(VP8D_COMP *pbi,
partition_size = user_data_end - partition;
}
if (user_data_end - partition < partition_size)
if (partition + partition_size > user_data_end
|| partition + partition_size < partition)
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt partition "
"%d length", i + 1);
@@ -580,7 +581,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
(data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
data += 3;
if (data_end - data < first_partition_length_in_bytes)
if (data + first_partition_length_in_bytes > data_end
|| data + first_partition_length_in_bytes < data)
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt partition 0 length");
vp8_setup_version(pc);
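
Both bounds checks above follow the same pattern: instead of subtracting pointers and comparing against the (attacker-controlled) size, they compare the computed end pointer against the buffer end and additionally reject the case where adding the size wraps the pointer. Isolated into a hypothetical helper, the pattern looks roughly like this:

#include <stddef.h>

/* Nonzero if [p, p + len) stays inside the buffer ending at `end`;
 * the second comparison catches wrap-around of p + len.            */
static int range_ok(const unsigned char *p, size_t len, const unsigned char *end)
{
    return !(p + len > end || p + len < p);
}

Strictly speaking, forming a pointer past the end of the buffer is undefined behaviour in C; the second comparison is the pragmatic guard the decoder relies on when a corrupt partition length is read from the bitstream.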


@@ -506,7 +506,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
pbi->common.error.setjmp = 0;
return retcode;
}
int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level, int noise_level, int flags)
int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags)
{
int ret = -1;
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -524,7 +524,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp,
sd->clrtype = pbi->common.clr_type;
#if CONFIG_POSTPROC
ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
ret = vp8_post_proc_frame(&pbi->common, sd, flags);
#else
if (pbi->common.frame_to_show)


@@ -596,7 +596,7 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
}
int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
{
#if CONFIG_MULTITHREAD
VP8_COMMON *const pc = & pbi->common;
@@ -647,7 +647,6 @@ int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
for (i=0; i< pc->mb_rows; i++)
CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
}
return 0;
#else
(void) pbi;
(void) width;


@@ -29,10 +29,9 @@
push {r4-r11, lr}
; Add xcount * sizeof (TOKENEXTRA) to get stop
; sizeof (TOKENEXTRA) is 20
add r2, r2, r2, lsl #2 ; xcount
; sizeof (TOKENEXTRA) is 8
sub sp, sp, #12
add r2, r1, r2, lsl #2 ; stop = p + xcount
add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
str r2, [sp, #0]
str r3, [sp, #8] ; save vp8_coef_encodings
ldr r2, [r0, #vp8_writer_lowvalue]
@@ -41,13 +40,13 @@
b check_p_lt_stop
while_p_lt_stop
ldr r6, [r1, #tokenextra_token] ; t
ldrb r6, [r1, #tokenextra_token] ; t
ldr r4, [sp, #8] ; vp8_coef_encodings
mov lr, #0
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
ldr r9, [r1, #tokenextra_context_tree] ; pp
ldr r7, [r1, #tokenextra_skip_eob_node]
ldrb r7, [r1, #tokenextra_skip_eob_node]
ldr r6, [r4, #vp8_token_value] ; v
ldr r8, [r4, #vp8_token_len] ; n
@@ -142,12 +141,11 @@ token_count_lt_zero
subs r8, r8, #1 ; --n
bne token_loop
ldr r6, [r1, #tokenextra_token] ; t
ldrb r6, [r1, #tokenextra_token] ; t
ldr r7, [sp, #48] ; vp8_extra_bits
; Add t * sizeof (vp8_extra_bit_struct) to get the desired
; element. Here vp8_extra_bit_struct == 20
add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t
add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t
; element. Here vp8_extra_bit_struct == 16
add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
ldr r4, [r12, #vp8_extra_bit_struct_base_val]
cmp r4, #0
@@ -155,7 +153,7 @@ token_count_lt_zero
; if( b->base_val)
ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
ldr lr, [r1, #tokenextra_extra] ; e = p->Extra
ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
cmp r8, #0 ; if( L)
beq no_extra_bits


@@ -62,13 +62,13 @@ mb_row_loop
; actual work gets done here!
while_p_lt_stop
ldr r6, [r1, #tokenextra_token] ; t
ldrb r6, [r1, #tokenextra_token] ; t
ldr r4, [sp, #20] ; vp8_coef_encodings
mov lr, #0
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
ldr r9, [r1, #tokenextra_context_tree] ; pp
ldr r7, [r1, #tokenextra_skip_eob_node]
ldrb r7, [r1, #tokenextra_skip_eob_node]
ldr r6, [r4, #vp8_token_value] ; v
ldr r8, [r4, #vp8_token_len] ; n
@@ -163,12 +163,11 @@ token_count_lt_zero
subs r8, r8, #1 ; --n
bne token_loop
ldr r6, [r1, #tokenextra_token] ; t
ldrb r6, [r1, #tokenextra_token] ; t
ldr r7, [sp, #8] ; vp8_extra_bits
; Add t * sizeof (vp8_extra_bit_struct) to get the desired
; element. Here vp8_extra_bit_struct == 20
add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t
add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t
; element. Here vp8_extra_bit_struct == 16
add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
ldr r4, [r12, #vp8_extra_bit_struct_base_val]
cmp r4, #0
@@ -176,7 +175,7 @@ token_count_lt_zero
; if( b->base_val)
ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
ldr lr, [r1, #tokenextra_extra] ; e = p->Extra
ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
cmp r8, #0 ; if( L)
beq no_extra_bits


@@ -90,13 +90,13 @@ mb_row_loop
; actual work gets done here!
while_p_lt_stop
ldr r6, [r1, #tokenextra_token] ; t
ldrb r6, [r1, #tokenextra_token] ; t
ldr r4, [sp, #80] ; vp8_coef_encodings
mov lr, #0
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
ldr r9, [r1, #tokenextra_context_tree] ; pp
ldr r7, [r1, #tokenextra_skip_eob_node]
ldrb r7, [r1, #tokenextra_skip_eob_node]
ldr r6, [r4, #vp8_token_value] ; v
ldr r8, [r4, #vp8_token_len] ; n
@@ -191,12 +191,11 @@ token_count_lt_zero
subs r8, r8, #1 ; --n
bne token_loop
ldr r6, [r1, #tokenextra_token] ; t
ldrb r6, [r1, #tokenextra_token] ; t
ldr r7, [sp, #84] ; vp8_extra_bits
; Add t * sizeof (vp8_extra_bit_struct) to get the desired
; element. Here vp8_extra_bit_struct == 20
add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t
add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t
; element. Here vp8_extra_bit_struct == 16
add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
ldr r4, [r12, #vp8_extra_bit_struct_base_val]
cmp r4, #0
@@ -204,7 +203,7 @@ token_count_lt_zero
; if( b->base_val)
ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
ldr lr, [r1, #tokenextra_extra] ; e = p->Extra
ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
cmp r8, #0 ; if( L)
beq no_extra_bits


@@ -29,7 +29,7 @@ extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, shor
void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
{
d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant);
d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast);
}
/*


@@ -51,7 +51,6 @@ DEFINE(vp8_token_len, offsetof(vp8_token, Len));
DEFINE(vp8_extra_bit_struct_tree, offsetof(vp8_extra_bit_struct, tree));
DEFINE(vp8_extra_bit_struct_prob, offsetof(vp8_extra_bit_struct, prob));
DEFINE(vp8_extra_bit_struct_prob_bc, offsetof(vp8_extra_bit_struct, prob_bc));
DEFINE(vp8_extra_bit_struct_len, offsetof(vp8_extra_bit_struct, Len));
DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, base_val));
@@ -67,8 +66,8 @@ DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows));
// These two sizes are used in vp7cx_pack_tokens. They are hard coded
// so if the size changes this will have to be adjusted.
ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 20)
ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 20)
ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
//add asserts for any offset that is not supported by assembly code
//add asserts for any size that is not supported by assembly code
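
The ct_assert lines exist because the ARM token-packing assembly changed above hard-codes sizeof(TOKENEXTRA) == 8 and sizeof(vp8_extra_bit_struct) == 16 in its address arithmetic (the lsl #3 / lsl #4 scaling); if either struct ever changes size, the build should fail instead of silently mis-indexing. ct_assert's definition is not part of this diff, but a compile-time size check of this kind can be written with the classic negative-array-size trick, for example:

/* Illustrative compile-time assertion, not the ct_assert macro libvpx uses. */
#define COMPILE_TIME_ASSERT(name, cond) \
    typedef char assert_##name[(cond) ? 1 : -1]

COMPILE_TIME_ASSERT(tokenextra_is_8_bytes, sizeof(TOKENEXTRA) == 8);
COMPILE_TIME_ASSERT(extra_bit_struct_is_16_bytes, sizeof(vp8_extra_bit_struct) == 16);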


@@ -33,6 +33,7 @@ typedef struct
// 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
short *quant;
short *quant_fast;
short *quant_shift;
short *zbin;
short *zrun_zbin_boost;
@@ -81,6 +82,7 @@ typedef struct
int errthresh;
int rddiv;
int rdmult;
INT64 activity_sum;
int mvcosts[2][MVvals+1];
int *mvcost[2];


@@ -62,7 +62,6 @@ unsigned int b_modes[14] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const int qrounding_factors[129] =
{
56, 56, 56, 56, 48, 48, 56, 56,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
@@ -78,12 +77,18 @@ static const int qrounding_factors[129] =
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
48,
48, 48, 48, 48, 48, 48, 48, 48,
48
};
static const int qzbin_factors[129] =
{
72, 72, 72, 72, 80, 80, 72, 72,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
@@ -94,17 +99,11 @@ static const int qzbin_factors[129] =
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80,
80
};
static const int qrounding_factors_y2[129] =
{
56, 56, 56, 56, 48, 48, 56, 56,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
@@ -120,12 +119,18 @@ static const int qrounding_factors_y2[129] =
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
48,
48, 48, 48, 48, 48, 48, 48, 48,
48
};
static const int qzbin_factors_y2[129] =
{
72, 72, 72, 72, 80, 80, 72, 72,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
@@ -136,26 +141,30 @@ static const int qzbin_factors_y2[129] =
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80,
80
};
//#define EXACT_QUANT
#define EXACT_QUANT
#ifdef EXACT_QUANT
static void vp8cx_invert_quant(short *quant, short *shift, short d)
static void vp8cx_invert_quant(int improved_quant, short *quant,
short *shift, short d)
{
unsigned t;
int l;
t = d;
for(l = 0; t > 1; l++)
t>>=1;
t = 1 + (1<<(16+l))/d;
*quant = (short)(t - (1<<16));
*shift = l;
if(improved_quant)
{
unsigned t;
int l;
t = d;
for(l = 0; t > 1; l++)
t>>=1;
t = 1 + (1<<(16+l))/d;
*quant = (short)(t - (1<<16));
*shift = l;
}
else
{
*quant = (1 << 16) / d;
*shift = 0;
}
}
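
vp8cx_invert_quant() precomputes a (quant, shift) pair so the encoder can replace a per-coefficient divide by d with a multiply and shift: in the improved path, shift holds l = floor(log2(d)) and quant holds roughly 2^(16+l)/d - 2^16, which is usually negative. Assuming the quantizer applies the pair as y = (x + ((x * quant) >> 16)) >> shift (the call site is not part of this diff, and an arithmetic right shift of the negative product is assumed), the result approximates x / d. A small self-checking sketch:

#include <stdio.h>

/* Same derivation as the improved_quant branch of vp8cx_invert_quant() above. */
static void invert_quant(short *quant, short *shift, short d)
{
    unsigned t = d;
    int l;
    for (l = 0; t > 1; l++)
        t >>= 1;
    t = 1 + (1 << (16 + l)) / d;
    *quant = (short)(t - (1 << 16));
    *shift = (short)l;
}

int main(void)
{
    short quant, shift;
    int x = 1234, d = 53;
    invert_quant(&quant, &shift, (short)d);
    /* assumed application of the (quant, shift) pair in place of x / d */
    printf("approx %d, exact %d\n", (x + ((x * quant) >> 16)) >> shift, x / d);
    return 0;
}

The quant_fast tables initialized alongside it store the plain reciprocal (1 << 16) / d, which the fast quantizer uses with a simpler multiply-and-shift.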
void vp8cx_init_quantizer(VP8_COMP *cpi)
@@ -170,7 +179,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
{
// dc values
quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
vp8cx_invert_quant(cpi->Y1quant[Q] + 0,
cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,
cpi->Y1quant_shift[Q] + 0, quant_val);
cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -178,7 +188,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
vp8cx_invert_quant(cpi->Y2quant[Q] + 0,
cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0,
cpi->Y2quant_shift[Q] + 0, quant_val);
cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
@@ -186,7 +197,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
vp8cx_invert_quant(cpi->UVquant[Q] + 0,
cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0,
cpi->UVquant_shift[Q] + 0, quant_val);
cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -199,7 +211,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
int rc = vp8_default_zig_zag1d[i];
quant_val = vp8_ac_yquant(Q);
vp8cx_invert_quant(cpi->Y1quant[Q] + rc,
cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc,
cpi->Y1quant_shift[Q] + rc, quant_val);
cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -207,7 +220,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
vp8cx_invert_quant(cpi->Y2quant[Q] + rc,
cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
cpi->Y2quant_shift[Q] + rc, quant_val);
cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
@@ -215,7 +229,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
vp8cx_invert_quant(cpi->UVquant[Q] + rc,
cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
cpi->UVquant_shift[Q] + rc, quant_val);
cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -316,6 +331,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
for (i = 0; i < 16; i++)
{
x->block[i].quant = cpi->Y1quant[QIndex];
x->block[i].quant_fast = cpi->Y1quant_fast[QIndex];
x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
x->block[i].zbin = cpi->Y1zbin[QIndex];
x->block[i].round = cpi->Y1round[QIndex];
@@ -330,6 +346,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
for (i = 16; i < 24; i++)
{
x->block[i].quant = cpi->UVquant[QIndex];
x->block[i].quant_fast = cpi->UVquant_fast[QIndex];
x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
x->block[i].zbin = cpi->UVzbin[QIndex];
x->block[i].round = cpi->UVround[QIndex];
@@ -340,6 +357,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
// Y2
zbin_extra = (cpi->common.Y2dequant[QIndex][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
x->block[24].quant_fast = cpi->Y2quant_fast[QIndex];
x->block[24].quant = cpi->Y2quant[QIndex];
x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
x->block[24].zbin = cpi->Y2zbin[QIndex];
@@ -351,6 +369,9 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
{
// Clear Zbin mode boost for default case
cpi->zbin_mode_boost = 0;
// vp8cx_init_quantizer() is first called in vp8_create_compressor(). A check is added here so that vp8cx_init_quantizer() is only called
// when these values are not all zero.
if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q)
@@ -363,6 +384,62 @@ void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
}
/* activity_avg must be positive, or flat regions could get a zero weight
* (infinite lambda), which confounds analysis.
* This also avoids the need for divide by zero checks in
* vp8_activity_masking().
*/
#define VP8_ACTIVITY_AVG_MIN (64)
/* This is used as a reference when computing the source variance for the
* purposes of activity masking.
* Eventually this should be replaced by custom no-reference routines,
* which will be faster.
*/
static const unsigned char VP8_VAR_OFFS[16]=
{
128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
};
unsigned int vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
{
unsigned int act;
unsigned int sse;
int sum;
unsigned int a;
unsigned int b;
unsigned int d;
/* TODO: This could also be done over smaller areas (8x8), but that would
* require extensive changes elsewhere, as lambda is assumed to be fixed
* over an entire MB in most of the code.
* Another option is to compute four 8x8 variances, and pick a single
* lambda using a non-linear combination (e.g., the smallest, or second
* smallest, etc.).
*/
VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer,
x->src.y_stride, VP8_VAR_OFFS, 0, &sse, &sum);
/* This requires a full 32 bits of precision. */
act = (sse<<8) - sum*sum;
/* Drop 4 to give us some headroom to work with. */
act = (act + 8) >> 4;
/* If the region is flat, lower the activity some more. */
if (act < 8<<12)
act = act < 5<<12 ? act : 5<<12;
/* TODO: For non-flat regions, edge regions should receive less masking
* than textured regions, but identifying edge regions quickly and
* reliably enough is still a subject of experimentation.
* This will be most noticeable near edges with a complex shape (e.g.,
* text), but the 4x4 transform size should make this less of a problem
* than it would be for an 8x8 transform.
*/
/* Apply the masking to the RD multiplier. */
a = act + 4*cpi->activity_avg;
b = 4*act + cpi->activity_avg;
x->rdmult = (unsigned int)(((INT64)x->rdmult*b + (a>>1))/a);
return act;
}
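
Reading the scaling off the code above: the RD multiplier is adjusted by the factor b/a = (4*act + activity_avg) / (act + 4*activity_avg), so a very flat macroblock (act much smaller than the running average) has its rdmult cut to roughly a quarter, spending relatively more bits where artifacts are most visible, while a very busy macroblock saturates at roughly four times rdmult. activity_avg itself is fed forward from the previous frame and floored at VP8_ACTIVITY_AVG_MIN, as set up further down in this diff.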
static
void encode_mb_row(VP8_COMP *cpi,
@@ -374,6 +451,7 @@ void encode_mb_row(VP8_COMP *cpi,
int *segment_counts,
int *totalrate)
{
INT64 activity_sum = 0;
int i;
int recon_yoffset, recon_uvoffset;
int mb_col;
@@ -402,14 +480,14 @@ void encode_mb_row(VP8_COMP *cpi,
// Set up limit values for vertical motion vector components
// to prevent them extending beyond the UMV borders
x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
+ (VP8BORDERINPIXELS - 16);
// for each macroblock col in image
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
// Distance of Mb to the left & right edges, specified in
// 1/8th pel units as they are always compared to values
// Distance of Mb to the left & right edges, specified in
// 1/8th pel units as they are always compared to values
// that are in 1/8th pel units
xd->mb_to_left_edge = -((mb_col * 16) << 3);
xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
@@ -417,7 +495,7 @@ void encode_mb_row(VP8_COMP *cpi,
// Set up limit values for horizontal motion vector components
// to prevent them extending beyond the UMV borders
x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
+ (VP8BORDERINPIXELS - 16);
xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
@@ -425,6 +503,12 @@ void encode_mb_row(VP8_COMP *cpi,
xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
xd->left_available = (mb_col != 0);
x->rddiv = cpi->RDDIV;
x->rdmult = cpi->RDMULT;
if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
activity_sum += vp8_activity_masking(cpi, x);
// Is segmentation enabled
// MB level adjustment to quantizer
if (xd->segmentation_enabled)
@@ -531,6 +615,7 @@ void encode_mb_row(VP8_COMP *cpi,
// this is to account for the border
xd->mode_info_context++;
x->partition_info++;
x->activity_sum += activity_sum;
}
@@ -647,8 +732,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
vp8_setup_block_ptrs(x);
x->rddiv = cpi->RDDIV;
x->rdmult = cpi->RDMULT;
x->activity_sum = 0;
#if 0
// Experimental rd code
@@ -703,11 +787,12 @@ void vp8_encode_frame(VP8_COMP *cpi)
else
{
#if CONFIG_MULTITHREAD
int i;
vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1, cpi->encoding_thread_count);
for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
{
int i;
cpi->current_mb_col_main = -1;
for (i = 0; i < cpi->encoding_thread_count; i++)
@@ -785,6 +870,11 @@ void vp8_encode_frame(VP8_COMP *cpi)
totalrate += cpi->mb_row_ei[i].totalrate;
}
for (i = 0; i < cpi->encoding_thread_count; i++)
{
x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;
}
#endif
}
@@ -920,6 +1010,14 @@ void vp8_encode_frame(VP8_COMP *cpi)
cpi->last_frame_distortion = cpi->frame_distortion;
#endif
/* Update the average activity for the next frame.
* This is feed-forward for now; it could also be saved in two-pass, or
* done during lookahead when that is eventually added.
*/
cpi->activity_avg = (unsigned int )(x->activity_sum/cpi->common.MBs);
if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN)
cpi->activity_avg = VP8_ACTIVITY_AVG_MIN;
}
void vp8_setup_block_ptrs(MACROBLOCK *x)
{
@@ -1181,7 +1279,18 @@ int vp8cx_encode_inter_macroblock
if (cpi->sf.RD)
{
/* Are we using the fast quantizer for the mode selection? */
if(cpi->sf.use_fastquant_for_pick)
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
/* switch back to the regular quantizer for the encode */
if (cpi->sf.improved_quant)
{
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
}
}
else
#endif
@@ -1214,11 +1323,25 @@ int vp8cx_encode_inter_macroblock
// Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to suppress noise
if (cpi->zbin_mode_boost_enabled)
{
if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME))
cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
if ( xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME )
cpi->zbin_mode_boost = 0;
else
cpi->zbin_mode_boost = 0;
{
if (xd->mode_info_context->mbmi.mode == ZEROMV)
{
if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
else
cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
}
else if (xd->mode_info_context->mbmi.mode == SPLITMV)
cpi->zbin_mode_boost = 0;
else
cpi->zbin_mode_boost = MV_ZBIN_BOOST;
}
}
else
cpi->zbin_mode_boost = 0;
vp8cx_mb_init_quantizer(cpi, x);
}


@@ -105,7 +105,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
#if !(CONFIG_REALTIME_ONLY)
#if 1
if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
if (x->optimize)
vp8_optimize_mby(x, rtcd);
#endif


@@ -243,9 +243,9 @@ struct vp8_token_state{
};
// TODO: experiments to find optimal multiplier values
#define Y1_RD_MULT 1
#define UV_RD_MULT 1
#define Y2_RD_MULT 4
#define Y1_RD_MULT 4
#define UV_RD_MULT 2
#define Y2_RD_MULT 16
static const int plane_rd_mult[4]=
{
@@ -309,8 +309,10 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
eob = d->eob;
/* Now set up a Viterbi trellis to evaluate alternative roundings. */
/* TODO: These should vary with the block type, since the quantizer does. */
rdmult = (mb->rdmult << 2)*err_mult;
rdmult = mb->rdmult * err_mult;
if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
rdmult = (rdmult * 9)>>4;
rddiv = mb->rddiv;
best_mask[0] = best_mask[1] = 0;
/* Initialize the sentinel node of the trellis. */
@@ -633,7 +635,7 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_quantize_mb(x);
#if !(CONFIG_REALTIME_ONLY)
if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
if (x->optimize)
vp8_optimize_mb(x, rtcd);
#endif


@@ -61,6 +61,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
volatile int *last_row_current_mb_col;
INT64 activity_sum = 0;
if (ithread > 0)
last_row_current_mb_col = &cpi->mb_row_ei[ithread-1].current_mb_col;
@@ -111,6 +112,12 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
xd->left_available = (mb_col != 0);
x->rddiv = cpi->RDDIV;
x->rdmult = cpi->RDMULT;
if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
activity_sum += vp8_activity_masking(cpi, x);
// Is segmentation enabled
// MB level adjustment to quantizer
if (xd->segmentation_enabled)
@@ -126,6 +133,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
else
xd->mode_info_context->mbmi.segment_id = 0; // Set to Segment 0 by default
x->active_ptr = cpi->active_map + seg_map_index + mb_col;
if (cm->frame_type == KEY_FRAME)
{
@@ -157,8 +165,28 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
cpi->inter_zz_count ++;
}
// Special case code for cyclic refresh
// If cyclic update enabled then copy xd->mbmi.segment_id (which may have been updated based on mode
// during vp8cx_encode_inter_macroblock()) back into the global segmentation map
if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
{
cpi->segmentation_map[seg_map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;
// If the block has been refreshed mark it as clean (the magnitude of the negative value influences how long it will be before we consider another refresh):
// Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
// else mark it as dirty (1).
if (xd->mode_info_context->mbmi.segment_id)
cpi->cyclic_refresh_map[seg_map_index+mb_col] = -1;
else if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
{
if (cpi->cyclic_refresh_map[seg_map_index+mb_col] == 1)
cpi->cyclic_refresh_map[seg_map_index+mb_col] = 0;
}
else
cpi->cyclic_refresh_map[seg_map_index+mb_col] = 1;
}
}
cpi->tplist[mb_row].stop = *tp;
x->gf_active_ptr++; // Increment pointer into gf usage flags structure for next mb
@@ -197,6 +225,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
// this is to account for the border
xd->mode_info_context++;
x->partition_info++;
x->activity_sum += activity_sum;
x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
@@ -240,8 +269,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->sadperbit16 = x->sadperbit16;
z->sadperbit4 = x->sadperbit4;
z->errthresh = x->errthresh;
z->rddiv = x->rddiv;
z->rdmult = x->rdmult;
/*
z->mv_col_min = x->mv_col_min;
@@ -255,6 +282,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
z->short_walsh4x4 = x->short_walsh4x4;
z->quantize_b = x->quantize_b;
z->optimize = x->optimize;
/*
z->mvc = x->mvc;
@@ -282,6 +310,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
for (i = 0; i < 25; i++)
{
z->block[i].quant = x->block[i].quant;
z->block[i].quant_fast = x->block[i].quant_fast;
z->block[i].quant_shift = x->block[i].quant_shift;
z->block[i].zbin = x->block[i].zbin;
z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost;
@@ -392,8 +421,7 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
vp8_setup_block_ptrs(mb);
mb->rddiv = cpi->RDDIV;
mb->rdmult = cpi->RDMULT;
mb->activity_sum = 0;
mbd->left_context = &cm->left_context;
mb->mvc = cm->fc.mvc;


@@ -472,7 +472,7 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
// Initial step/diamond search centred on best mv
tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
if ( tmp_err < INT_MAX-new_mv_mode_penalty )
tmp_err += new_mv_mode_penalty;
@@ -495,7 +495,7 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
num00--;
else
{
tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
if ( tmp_err < INT_MAX-new_mv_mode_penalty )
tmp_err += new_mv_mode_penalty;
@@ -1145,6 +1145,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
cpi->output_frame_rate = cpi->oxcf.frame_rate;
cpi->bits_left = (long long)(cpi->total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
cpi->bits_left -= (long long)(cpi->total_stats->duration * two_pass_min_rate / 10000000.0);
cpi->clip_bits_total = cpi->bits_left;
vp8_avg_stats(cpi->total_stats);
@@ -1173,17 +1174,25 @@ void vp8_init_second_pass(VP8_COMP *cpi)
{
start_pos = cpi->stats_in; // Note starting "file" position
cpi->modified_total_error_left = 0.0;
cpi->modified_error_total = 0.0;
cpi->modified_error_used = 0.0;
while (vp8_input_stats(cpi, &this_frame) != EOF)
{
cpi->modified_total_error_left += calculate_modified_err(cpi, &this_frame);
cpi->modified_error_total += calculate_modified_err(cpi, &this_frame);
}
cpi->modified_error_left = cpi->modified_error_total;
reset_fpf_position(cpi, start_pos); // Reset file position
}
// Calculate the clip target modified bits per error
// The observed bpe starts as the same number.
cpi->clip_bpe = cpi->bits_left /
DOUBLE_DIVIDE_CHECK(cpi->modified_error_total);
cpi->observed_bpe = cpi->clip_bpe;
cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in;
}
@@ -1439,7 +1448,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// Boost for arf frame
Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
Boost += (cpi->baseline_gf_interval * 50);
Boost += (i * 50);
allocation_chunks = (i * 100) + Boost;
// Normalize Altboost and allocation chunks down to prevent overflow
@@ -1585,6 +1594,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// Reset the file position
reset_fpf_position(cpi, start_pos);
// Update the record of error used so far (only done once per gf group)
cpi->modified_error_used += gf_group_err;
// Assign bits to the arf or gf.
{
int Boost;
@@ -1738,16 +1750,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
vp8_avg_stats(&sectionstats);
if (sectionstats.pcnt_motion < .17)
cpi->section_is_low_motion = 1;
else
cpi->section_is_low_motion = 0;
if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
cpi->section_is_fast_motion = 1;
else
cpi->section_is_fast_motion = 0;
cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
@@ -1892,6 +1894,16 @@ void vp8_second_pass(VP8_COMP *cpi)
// Is this a GF / ARF (Note that a KF is always also a GF)
if (cpi->frames_till_gf_update_due == 0)
{
// Update monitor of the bits per error observed so far.
// Done once per gf group based on what has gone before
// so do nothing if this is the first frame.
if (cpi->common.current_video_frame > 0)
{
cpi->observed_bpe =
(double)(cpi->clip_bits_total - cpi->bits_left) /
cpi->modified_error_used;
}
// Define next gf group and assign bits to it
vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
define_gf_group(cpi, &this_frame_copy);
@@ -1980,7 +1992,14 @@ void vp8_second_pass(VP8_COMP *cpi)
cpi->ni_av_qi = cpi->worst_quality;
}
}
else
// The last few frames of a clip almost always have too few or too many
// bits, and for the sake of overly exact rate control we don't want to make
// radical adjustments to the allowed quantizer range just to use up a
// few surplus bits or get beneath the target rate.
else if ( (cpi->common.current_video_frame <
(((unsigned int)cpi->total_stats->count * 255)>>8)) &&
((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
(unsigned int)cpi->total_stats->count) )
{
if (frames_left < 1)
frames_left = 1;
@@ -2199,7 +2218,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
}
// Calculate the number of bits that should be assigned to the kf group.
if ((cpi->bits_left > 0) && ((int)cpi->modified_total_error_left > 0))
if ((cpi->bits_left > 0) && ((int)cpi->modified_error_left > 0))
{
// Max for a single normal frame (not key frame)
int max_bits = frame_max_bits(cpi);
@@ -2211,7 +2230,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// complexity of the section
cpi->kf_group_bits = (long long)( cpi->bits_left *
( kf_group_err /
cpi->modified_total_error_left ));
cpi->modified_error_left ));
// Clip based on maximum per frame rate defined by the user.
max_grp_bits = (long long)max_bits * (long long)cpi->frames_to_key;
@@ -2344,17 +2363,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
vp8_avg_stats(&sectionstats);
if (sectionstats.pcnt_motion < .17)
cpi->section_is_low_motion = 1;
else
cpi->section_is_low_motion = 0;
if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
cpi->section_is_fast_motion = 1;
else
cpi->section_is_fast_motion = 0;
cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
// if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
@@ -2474,7 +2483,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double alt_kf_grp_bits =
((double)cpi->bits_left *
(kf_mod_err * (double)cpi->frames_to_key) /
DOUBLE_DIVIDE_CHECK(cpi->modified_total_error_left));
DOUBLE_DIVIDE_CHECK(cpi->modified_error_left));
alt_kf_bits = (int)((double)kf_boost *
(alt_kf_grp_bits / (double)allocation_chunks));
@@ -2492,7 +2501,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
alt_kf_bits =
(int)((double)cpi->bits_left *
(kf_mod_err /
DOUBLE_DIVIDE_CHECK(cpi->modified_total_error_left)));
DOUBLE_DIVIDE_CHECK(cpi->modified_error_left)));
if (alt_kf_bits > cpi->kf_bits)
{
@@ -2512,7 +2521,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// Adjust the count of total modified error left.
// The count of bits left is adjusted elsewhere based on real coded frame sizes
cpi->modified_total_error_left -= kf_group_err;
cpi->modified_error_left -= kf_group_err;
if (cpi->oxcf.allow_spatial_resampling)
{


@@ -40,6 +40,12 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c;
cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c;
cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_c;
cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_c;
cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_c;
cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_c;
cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_c;
cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c;
cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c;
cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c;
@@ -88,6 +94,8 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.search.full_search = vp8_full_search_sad;
cpi->rtcd.search.diamond_search = vp8_diamond_search_sad;
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_c;
#endif
// Pure C:


@@ -913,7 +913,8 @@ int vp8_diamond_search_sad
int *num00,
vp8_variance_fn_ptr_t *fn_ptr,
int *mvsadcost[2],
int *mvcost[2]
int *mvcost[2],
MV *center_mv
)
{
int i, j, step;
@@ -940,6 +941,8 @@ int vp8_diamond_search_sad
unsigned char *check_here;
int thissad;
*num00 = 0;
// Work out the start point for the search
in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
best_address = in_what;
@@ -949,7 +952,7 @@ int vp8_diamond_search_sad
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Check the starting position
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// search_param determines the length of the initial step and hence the number of iterations
@@ -961,8 +964,6 @@ int vp8_diamond_search_sad
best_mv->row = ref_row;
best_mv->col = ref_col;
*num00 = 0;
for (step = 0; step < tot_steps ; step++)
{
for (j = 0 ; j < x->searches_per_step ; j++)
@@ -982,7 +983,7 @@ int vp8_diamond_search_sad
{
this_mv.row = this_row_offset << 3;
this_mv.col = this_col_offset << 3;
thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1013,7 +1014,7 @@ int vp8_diamond_search_sad
return INT_MAX;
return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+ vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+ vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
}
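
The new center_mv argument separates the point the diamond search starts from (ref_mv) from the vector the rate cost is measured against: every candidate, and the final returned cost, is now priced by vp8_mv_err_cost() relative to the prediction centre, so a search seeded from, say, a neighbouring block's result no longer skews the costing. The helper below is hypothetical and only illustrates the role the parameter plays; the real vp8_mv_err_cost() is table-driven through mvcost/mvsadcost and is not shown in this diff:

typedef struct { short row, col; } mv_sketch;

/* Charge roughly error_per_bit per unit of distance between the candidate
 * and the *predicted* vector, not the vector the search started from.   */
static int mv_cost_sketch(const mv_sketch *cand, const mv_sketch *center, int error_per_bit)
{
    int rd = cand->row - center->row;
    int cd = cand->col - center->col;
    if (rd < 0) rd = -rd;
    if (cd < 0) cd = -cd;
    return (rd + cd) * error_per_bit;
}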
int vp8_diamond_search_sadx4
@@ -1028,7 +1029,8 @@ int vp8_diamond_search_sadx4
int *num00,
vp8_variance_fn_ptr_t *fn_ptr,
int *mvsadcost[2],
int *mvcost[2]
int *mvcost[2],
MV *center_mv
)
{
int i, j, step;
@@ -1055,6 +1057,8 @@ int vp8_diamond_search_sadx4
unsigned char *check_here;
unsigned int thissad;
*num00 = 0;
// Work out the start point for the search
in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
best_address = in_what;
@@ -1064,7 +1068,7 @@ int vp8_diamond_search_sadx4
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Check the starting position
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// search_param determines the length of the initial step and hence the number of iterations
@@ -1076,8 +1080,6 @@ int vp8_diamond_search_sadx4
best_mv->row = ref_row;
best_mv->col = ref_col;
*num00 = 0;
for (step = 0; step < tot_steps ; step++)
{
int all_in = 1, t;
@@ -1108,7 +1110,7 @@ int vp8_diamond_search_sadx4
{
this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
sad_array[t] += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
sad_array[t] += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (sad_array[t] < bestsad)
{
@@ -1137,7 +1139,7 @@ int vp8_diamond_search_sadx4
{
this_mv.row = this_row_offset << 3;
this_mv.col = this_col_offset << 3;
thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1168,12 +1170,12 @@ int vp8_diamond_search_sadx4
return INT_MAX;
return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+ vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+ vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
}
#if !(CONFIG_REALTIME_ONLY)
int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
{
unsigned char *what = (*(b->base_src) + b->src);
int what_stride = b->src_stride;
@@ -1211,7 +1213,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
// Baseline value at the centre
//bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us using vectors that stretch beyond the UMV border
@@ -1239,7 +1241,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
this_mv.col = c << 3;
//thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
//thissad += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
if (thissad < bestsad)
{
@@ -1258,12 +1260,12 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+ vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
else
return INT_MAX;
}
int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
{
unsigned char *what = (*(b->base_src) + b->src);
int what_stride = b->src_stride;
@@ -1301,7 +1303,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us from searching using vectors that stretch beyond the UMV border
@@ -1323,7 +1325,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
check_here = r * mv_stride + in_what + col_min;
c = col_min;
while ((c + 3) < col_max)
while ((c + 2) < col_max)
{
int i;
@@ -1336,7 +1338,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1359,7 +1361,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1381,13 +1383,165 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+ vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
else
return INT_MAX;
}
#endif
int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
{
unsigned char *what = (*(b->base_src) + b->src);
int what_stride = b->src_stride;
unsigned char *in_what;
int in_what_stride = d->pre_stride;
int mv_stride = d->pre_stride;
unsigned char *bestaddress;
MV *best_mv = &d->bmi.mv.as_mv;
MV this_mv;
int bestsad = INT_MAX;
int r, c;
unsigned char *check_here;
unsigned int thissad;
int ref_row = ref_mv->row >> 3;
int ref_col = ref_mv->col >> 3;
int row_min = ref_row - distance;
int row_max = ref_row + distance;
int col_min = ref_col - distance;
int col_max = ref_col + distance;
unsigned short sad_array8[8];
unsigned int sad_array[3];
// Work out the mid point for the search
in_what = *(d->base_pre) + d->pre;
bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
best_mv->row = ref_row;
best_mv->col = ref_col;
// We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us from searching using vectors that stretch beyond the UMV border
if (col_min < x->mv_col_min)
col_min = x->mv_col_min;
if (col_max > x->mv_col_max)
col_max = x->mv_col_max;
if (row_min < x->mv_row_min)
row_min = x->mv_row_min;
if (row_max > x->mv_row_max)
row_max = x->mv_row_max;
for (r = row_min; r < row_max ; r++)
{
this_mv.row = r << 3;
check_here = r * mv_stride + in_what + col_min;
c = col_min;
while ((c + 7) < col_max)
{
int i;
fn_ptr->sdx8f(what, what_stride, check_here , in_what_stride, sad_array8);
for (i = 0; i < 8; i++)
{
thissad = (unsigned int)sad_array8[i];
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
bestsad = thissad;
best_mv->row = r;
best_mv->col = c;
bestaddress = check_here;
}
}
check_here++;
c++;
}
}
while ((c + 2) < col_max)
{
int i;
fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
for (i = 0; i < 3; i++)
{
thissad = sad_array[i];
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
bestsad = thissad;
best_mv->row = r;
best_mv->col = c;
bestaddress = check_here;
}
}
check_here++;
c++;
}
}
while (c < col_max)
{
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
bestsad = thissad;
best_mv->row = r;
best_mv->col = c;
bestaddress = check_here;
}
}
check_here ++;
c ++;
}
}
this_mv.row = best_mv->row << 3;
this_mv.col = best_mv->col << 3;
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
else
return INT_MAX;
}
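vp8_full_search_sadx8 covers each search row in three passes of decreasing batch width: eight candidates per sdx8f call while at least eight columns remain, then three per sdx3f call, then single sdf calls for the leftovers, so every column in [col_min, col_max) is scored exactly once. A minimal, self-contained illustration of the same 8/3/1 batching pattern (score8/score3/score1 are hypothetical stand-ins for the SAD calls):

#include <stdio.h>

static void score8(const int *v, int *out) { int i; for (i = 0; i < 8; i++) out[i] = v[i]; }
static void score3(const int *v, int *out) { int i; for (i = 0; i < 3; i++) out[i] = v[i]; }
static int  score1(const int *v)           { return v[0]; }

int main(void)
{
    int data[21] = { 9, 4, 7, 1, 8, 6, 5, 3, 2, 9, 9, 9, 0, 9, 9, 9, 9, 9, 9, 9, 9 };
    int col_min = 0, col_max = 21, c = col_min, best = 1 << 30;
    int tmp[8], i;

    while ((c + 7) < col_max) {                  /* 8 candidates per batch */
        score8(data + c, tmp);
        for (i = 0; i < 8; i++, c++) if (tmp[i] < best) best = tmp[i];
    }
    while ((c + 2) < col_max) {                  /* 3 candidates per batch */
        score3(data + c, tmp);
        for (i = 0; i < 3; i++, c++) if (tmp[i] < best) best = tmp[i];
    }
    while (c < col_max) {                        /* singles for the tail */
        int s = score1(data + c);
        if (s < best) best = s;
        c++;
    }
    printf("%d\n", best);                        /* prints 0 */
    return 0;
}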
#ifdef ENTROPY_STATS
void print_mode_context(void)
{


@@ -24,7 +24,7 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
#define MAX_MVSEARCH_STEPS 8 // The maximum number of steps in a step search given the largest allowed initial step
#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS+3)) - 8) // Max full pel mv specified in 1/8 pel units
#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) // Maximum size of the first step in full pel units
#define MAX_POSSIBLE_MV (1 << 11) // Maximum MV in 1/8 pel units
extern void print_mode_context(void);
extern int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight);
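Taking MAX_MVSEARCH_STEPS at its value of 8 as defined above, the derived search limits evaluate to:

/* With MAX_MVSEARCH_STEPS == 8:                                              */
/*   MAX_FULL_PEL_VAL = (1 << (8+3)) - 8 = 2040   1/8-pel units (255 full pels) */
/*   MAX_FIRST_STEP   = 1 << (8-1)       = 128    full-pel units                */
/*   MAX_POSSIBLE_MV  = 1 << 11          = 2048   1/8-pel units                 */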
@@ -67,7 +67,8 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
int distance, \
vp8_variance_fn_ptr_t *fn_ptr, \
int *mvcost[2], \
int *mvsadcost[2] \
int *mvsadcost[2], \
MV *center_mv \
)
#define prototype_diamond_search_sad(sym)\
@@ -83,7 +84,8 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
int *num00, \
vp8_variance_fn_ptr_t *fn_ptr, \
int *mvsadcost[2], \
int *mvcost[2] \
int *mvcost[2], \
MV *center_mv \
)
#if ARCH_X86 || ARCH_X86_64
@@ -93,6 +95,7 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
typedef prototype_full_search_sad(*vp8_full_search_fn_t);
extern prototype_full_search_sad(vp8_full_search_sad);
extern prototype_full_search_sad(vp8_full_search_sadx3);
extern prototype_full_search_sad(vp8_full_search_sadx8);
typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
extern prototype_diamond_search_sad(vp8_diamond_search_sad);


@@ -73,6 +73,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi);
int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi);
static void set_default_lf_deltas(VP8_COMP *cpi);
@@ -174,17 +175,6 @@ static const int kf_high_motion_minq[QINDEX_RANGE] =
27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,
35,35,36,36,37,38,39,40,41,42,43,44,45,46,47,48,
};
/*static const int kf_minq[QINDEX_RANGE] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6,
7, 7, 8, 8, 9, 9, 10,10,11,11,12,12,13,13,14,14,
15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,
23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,
31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38
};*/
static const int gf_low_motion_minq[QINDEX_RANGE] =
{
0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
@@ -218,27 +208,16 @@ static const int gf_high_motion_minq[QINDEX_RANGE] =
41,41,42,42,43,44,45,46,47,48,49,50,51,52,53,54,
55,56,57,58,59,60,62,64,66,68,70,72,74,76,78,80,
};
/*static const int gf_arf_minq[QINDEX_RANGE] =
{
0,0,0,0,1,1,1,1,1,1,2,2,3,3,3,4,
4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,
15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,
23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,
31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,39,
39,40,40,41,41,42,42,43,43,44,45,46,47,48,49,50,
51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66
};*/
static const int inter_minq[QINDEX_RANGE] =
{
0,0,0,0,1,1,2,3,3,4,4,5,6,6,7,7,
8,8,9,9,10,11,11,12,12,13,13,14,14,15,15,16,
16,17,17,17,18,18,19,19,20,20,21,21,22,22,22,23,
23,24,24,24,25,25,26,27,28,28,29,30,31,32,33,34,
35,35,36,37,38,39,39,40,41,42,43,43,44,45,46,47,
47,48,49,49,51,52,53,54,54,55,56,56,57,57,58,58,
59,59,60,61,61,62,62,63,64,64,65,66,67,67,68,69,
69,70,71,71,72,73,74,75,76,76,77,78,79,80,81,81,
0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
};
void vp8_initialize()
@@ -283,6 +262,21 @@ static void setup_features(VP8_COMP *cpi)
void vp8_dealloc_compressor_data(VP8_COMP *cpi)
{
// Delete last frame MV storage buffers
if (cpi->lfmv != 0)
vpx_free(cpi->lfmv);
cpi->lfmv = 0;
if (cpi->lf_ref_frame_sign_bias != 0)
vpx_free(cpi->lf_ref_frame_sign_bias);
cpi->lf_ref_frame_sign_bias = 0;
if (cpi->lf_ref_frame != 0)
vpx_free(cpi->lf_ref_frame);
cpi->lf_ref_frame = 0;
// Delete segmentation map
if (cpi->segmentation_map != 0)
@@ -331,8 +325,15 @@ void vp8_dealloc_compressor_data(VP8_COMP *cpi)
cpi->mb.pip = 0;
vpx_free(cpi->total_stats);
vpx_free(cpi->this_frame_stats);
if(cpi->total_stats)
vpx_free(cpi->total_stats);
cpi->total_stats = 0;
if(cpi->this_frame_stats)
vpx_free(cpi->this_frame_stats);
cpi->this_frame_stats = 0;
}
static void enable_segmentation(VP8_PTR ptr)
@@ -563,6 +564,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
int Speed = cpi->Speed;
int i;
VP8_COMMON *cm = &cpi->common;
int last_improved_quant = sf->improved_quant;
// Initialise default mode frequency sampling variables
for (i = 0; i < MAX_MODES; i ++)
@@ -589,6 +591,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
sf->max_fs_radius = 32;
sf->iterative_sub_pixel = 1;
sf->optimize_coefficients = 1;
sf->use_fastquant_for_pick = 0;
sf->first_step = 0;
sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -682,6 +685,32 @@ void vp8_set_speed_features(VP8_COMP *cpi)
sf->thresh_mult[THR_NEARG ] = 1000;
sf->thresh_mult[THR_NEARA ] = 1000;
#if 1
sf->thresh_mult[THR_ZEROMV ] = 0;
sf->thresh_mult[THR_ZEROG ] = 0;
sf->thresh_mult[THR_ZEROA ] = 0;
sf->thresh_mult[THR_NEARESTMV] = 0;
sf->thresh_mult[THR_NEARESTG ] = 0;
sf->thresh_mult[THR_NEARESTA ] = 0;
sf->thresh_mult[THR_NEARMV ] = 0;
sf->thresh_mult[THR_NEARG ] = 0;
sf->thresh_mult[THR_NEARA ] = 0;
// sf->thresh_mult[THR_DC ] = 0;
// sf->thresh_mult[THR_V_PRED ] = 1000;
// sf->thresh_mult[THR_H_PRED ] = 1000;
// sf->thresh_mult[THR_B_PRED ] = 2000;
// sf->thresh_mult[THR_TM ] = 1000;
sf->thresh_mult[THR_NEWMV ] = 1000;
sf->thresh_mult[THR_NEWG ] = 1000;
sf->thresh_mult[THR_NEWA ] = 1000;
sf->thresh_mult[THR_SPLITMV ] = 1700;
sf->thresh_mult[THR_SPLITG ] = 4500;
sf->thresh_mult[THR_SPLITA ] = 4500;
#else
sf->thresh_mult[THR_NEWMV ] = 1500;
sf->thresh_mult[THR_NEWG ] = 1500;
sf->thresh_mult[THR_NEWA ] = 1500;
@@ -689,7 +718,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
sf->thresh_mult[THR_SPLITMV ] = 5000;
sf->thresh_mult[THR_SPLITG ] = 10000;
sf->thresh_mult[THR_SPLITA ] = 10000;
#endif
sf->full_freq[0] = 15;
sf->full_freq[1] = 31;
@@ -761,8 +790,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
sf->thresh_mult[THR_SPLITA ] = 20000;
}
sf->improved_quant = 0;
sf->improved_dct = 0;
sf->use_fastquant_for_pick = 1;
sf->first_step = 1;
sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -770,6 +798,8 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (Speed > 1)
{
sf->use_fastquant_for_pick = 0;
cpi->mode_check_freq[THR_SPLITG] = 15;
cpi->mode_check_freq[THR_SPLITA] = 15;
cpi->mode_check_freq[THR_SPLITMV] = 7;
@@ -803,7 +833,13 @@ void vp8_set_speed_features(VP8_COMP *cpi)
sf->thresh_mult[THR_SPLITA ] = 50000;
}
// Only do recode loop on key frames and golden frames
sf->first_step = 1;
sf->improved_quant = 0;
sf->improved_dct = 0;
// Only do recode loop on key frames, golden frames and
// alt ref frames
sf->recode_loop = 2;
sf->full_freq[0] = 31;
@@ -1262,6 +1298,8 @@ void vp8_set_speed_features(VP8_COMP *cpi)
{
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
}
if (cpi->sf.improved_quant != last_improved_quant)
vp8cx_init_quantizer(cpi);
#if CONFIG_RUNTIME_CPU_DETECT
cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
@@ -1329,6 +1367,9 @@ static void alloc_raw_frame_buffers(VP8_COMP *cpi)
static int vp8_alloc_partition_data(VP8_COMP *cpi)
{
if(cpi->mb.pip)
vpx_free(cpi->mb.pip);
cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
(cpi->common.mb_rows + 1),
sizeof(PARTITION_INFO));
@@ -1396,8 +1437,16 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
if(cpi->total_stats)
vpx_free(cpi->total_stats);
cpi->total_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
if(cpi->this_frame_stats)
vpx_free(cpi->this_frame_stats);
cpi->this_frame_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
if(!cpi->total_stats || !cpi->this_frame_stats)
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate firstpass stats");
@@ -2145,7 +2194,10 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->alt_is_last = 0 ;
cpi->gold_is_alt = 0 ;
// allocate memory for storing last frame's MVs for MV prediction.
CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int_mv)));
CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int)));
CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int)));
// Create the encoder segmentation map and set all entries to 0
CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
@@ -2201,6 +2253,8 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
init_context_counters();
#endif
/*Initialize the feed-forward activity masking.*/
cpi->activity_avg = 90<<12;
cpi->frames_since_key = 8; // Give a sensible default for the first frame.
cpi->key_frame_frequency = cpi->oxcf.key_freq;
@@ -2341,6 +2395,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_v);
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_hv);
cpi->fn_ptr[BLOCK_16X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
cpi->fn_ptr[BLOCK_16X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x8);
cpi->fn_ptr[BLOCK_16X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
cpi->fn_ptr[BLOCK_16X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
@@ -2350,6 +2405,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_16X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
cpi->fn_ptr[BLOCK_16X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x8);
cpi->fn_ptr[BLOCK_16X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
cpi->fn_ptr[BLOCK_8X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
@@ -2359,6 +2415,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
cpi->fn_ptr[BLOCK_8X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x8);
cpi->fn_ptr[BLOCK_8X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
cpi->fn_ptr[BLOCK_8X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
@@ -2368,6 +2425,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
cpi->fn_ptr[BLOCK_8X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x8);
cpi->fn_ptr[BLOCK_8X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
cpi->fn_ptr[BLOCK_4X4].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
@@ -2377,6 +2435,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_4X4].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
cpi->fn_ptr[BLOCK_4X4].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8);
cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
#if !(CONFIG_REALTIME_ONLY)
@@ -3427,6 +3486,37 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
#endif
// return of 0 means drop frame
// Function to test for conditions that indicate we should loop
// back and recode a frame.
static BOOL recode_loop_test( VP8_COMP *cpi,
int high_limit, int low_limit,
int q, int maxq, int minq )
{
BOOL force_recode = FALSE;
VP8_COMMON *cm = &cpi->common;
// Is frame recode allowed at all?
// Yes if either recode mode 1 is selected, or mode 2 is selected
// and the frame is a key frame, golden frame or alt_ref_frame
if ( (cpi->sf.recode_loop == 1) ||
( (cpi->sf.recode_loop == 2) &&
( (cm->frame_type == KEY_FRAME) ||
cm->refresh_golden_frame ||
cm->refresh_alt_ref_frame ) ) )
{
// General over and under shoot tests
if ( ((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
((cpi->projected_frame_size < low_limit) && (q > minq)) )
{
force_recode = TRUE;
}
// Specific rate control mode related tests
// TBD
}
return force_recode;
}
static void encode_frame_to_data_rate
(
VP8_COMP *cpi,
@@ -3489,8 +3579,18 @@ static void encode_frame_to_data_rate
cpi->zbin_over_quant = 0;
cpi->zbin_mode_boost = 0;
// Enable mode based tweaking of the zbin
// Enable or disable mode based tweaking of the zbin
// For 2 Pass Only used where GF/ARF prediction quality
// is above a threshold
cpi->zbin_mode_boost = 0;
cpi->zbin_mode_boost_enabled = TRUE;
if (cpi->pass == 2)
{
if ( cpi->gfu_boost <= 400 )
{
cpi->zbin_mode_boost_enabled = FALSE;
}
}
// Current default encoder behaviour for the altref sign bias
if (cpi->source_alt_ref_active)
@@ -3771,17 +3871,16 @@ static void encode_frame_to_data_rate
vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
// Limit Q range for the adaptive loop (Values not clipped to range 20-60 as in VP8).
// Limit Q range for the adaptive loop.
bottom_index = cpi->active_best_quality;
top_index = cpi->active_worst_quality;
q_low = cpi->active_best_quality;
q_high = cpi->active_worst_quality;
vp8_save_coding_context(cpi);
loop_count = 0;
q_low = cpi->best_quality;
q_high = cpi->worst_quality;
scale_and_extend_source(cpi->un_scaled_source, cpi);
#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC
@@ -3817,7 +3916,6 @@ static void encode_frame_to_data_rate
if (cm->frame_type == KEY_FRAME)
{
vp8_de_noise(cpi->Source, cpi->Source, l , 1, 0, RTCD(postproc));
cpi->ppi.frame = 0;
}
else
{
@@ -3829,10 +3927,6 @@ static void encode_frame_to_data_rate
{
src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
}
//temp_filter(&cpi->ppi,src,src,
// cm->last_frame.y_width * cm->last_frame.y_height,
// cpi->oxcf.noise_sensitivity);
}
}
@@ -3963,15 +4057,13 @@ static void encode_frame_to_data_rate
Q = vp8_regulate_q(cpi, cpi->this_frame_target);
q_low = cpi->best_quality;
q_high = cpi->worst_quality;
vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
// Limit Q range for the adaptive loop (Values not clipped to range 20-60 as in VP8).
// Limit Q range for the adaptive loop.
bottom_index = cpi->active_best_quality;
top_index = cpi->active_worst_quality;
q_low = cpi->active_best_quality;
q_high = cpi->active_worst_quality;
loop_count++;
Loop = TRUE;
@@ -4011,19 +4103,18 @@ static void encode_frame_to_data_rate
#if !(CONFIG_REALTIME_ONLY)
// Is the projected frame size out of range and are we allowed to attempt to recode.
if (((cpi->sf.recode_loop == 1) ||
((cpi->sf.recode_loop == 2) && (cm->refresh_golden_frame || (cm->frame_type == KEY_FRAME)))) &&
(((cpi->projected_frame_size > frame_over_shoot_limit) && (Q < top_index)) ||
//((cpi->projected_frame_size > frame_over_shoot_limit ) && (Q == top_index) && (cpi->zbin_over_quant < ZBIN_OQ_MAX)) ||
((cpi->projected_frame_size < frame_under_shoot_limit) && (Q > bottom_index)))
)
if ( recode_loop_test( cpi,
frame_over_shoot_limit, frame_under_shoot_limit,
Q, top_index, bottom_index ) )
{
int last_q = Q;
int Retries = 0;
// Frame size out of permitted range:
// Update correction factor & compute new Q to try...
if (cpi->projected_frame_size > frame_over_shoot_limit)
// Frame is too large
if (cpi->projected_frame_size > cpi->this_frame_target)
{
//if ( cpi->zbin_over_quant == 0 )
q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value
@@ -4067,6 +4158,7 @@ static void encode_frame_to_data_rate
overshoot_seen = TRUE;
}
// Frame is too small
else
{
if (cpi->zbin_over_quant == 0)
@@ -4160,6 +4252,36 @@ static void encode_frame_to_data_rate
}
#endif
// Update the GF useage maps.
// This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
// This frame's MVs are saved and will be used in next frame's MV prediction.
if(cm->show_frame) //do not save for altref frame
{
int mb_row;
int mb_col;
MODE_INFO *tmp = cm->mip; //point to beginning of allocated MODE_INFO arrays.
//static int last_video_frame = 0;
if(cm->frame_type != KEY_FRAME)
{
for (mb_row = 0; mb_row < cm->mb_rows+1; mb_row ++)
{
for (mb_col = 0; mb_col < cm->mb_cols+1; mb_col ++)
{
if(tmp->mbmi.ref_frame != INTRA_FRAME)
cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride)].as_int = tmp->mbmi.mv.as_int;
cpi->lf_ref_frame_sign_bias[mb_col + mb_row*(cm->mode_info_stride)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
cpi->lf_ref_frame[mb_col + mb_row*(cm->mode_info_stride)] = tmp->mbmi.ref_frame;
tmp++;
}
}
}
}
// Update the GF useage maps.
// This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
@@ -4218,10 +4340,11 @@ static void encode_frame_to_data_rate
{
vp8cx_set_alt_lf_level(cpi, cm->filter_level);
vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
cm->last_frame_type = cm->frame_type;
cm->last_filter_type = cm->filter_type;
cm->last_sharpness_level = cm->sharpness_level;
}
/* Move storing frame_type out of the above loop since it is also needed in motion search besides loopfilter */
cm->last_frame_type = cm->frame_type;
vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
@@ -4533,7 +4656,7 @@ static void encode_frame_to_data_rate
}
else
{
if (cpi->oxcf.play_alternate && cpi->common.refresh_alt_ref_frame)
if (cpi->oxcf.play_alternate && cpi->common.refresh_alt_ref_frame && (cpi->common.frame_type != KEY_FRAME))
// Update the alternate reference frame and stats as appropriate.
update_alt_ref_frame_and_stats(cpi);
else
@@ -4856,7 +4979,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
{
int thiserr;
cpi->oxcf.arnr_strength = i;
vp8cx_temp_filter_c(cpi);
vp8_temporal_filter_prepare_c(cpi);
thiserr = vp8_calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer,
&cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
@@ -4871,7 +4994,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
if (besti != -1)
{
cpi->oxcf.arnr_strength = besti;
vp8cx_temp_filter_c(cpi);
vp8_temporal_filter_prepare_c(cpi);
s = &cpi->alt_ref_buffer;
// FWG not sure if I need to copy this data for the Alt Ref frame
@@ -4883,7 +5006,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
s = &cpi->src_buffer[cpi->last_alt_ref_sei];
#else
vp8cx_temp_filter_c(cpi);
vp8_temporal_filter_prepare_c(cpi);
s = &cpi->alt_ref_buffer;
// FWG not sure if I need to copy this data for the Alt Ref frame
@@ -4967,17 +5090,16 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
*frame_flags = cpi->source_frame_flags;
#if CONFIG_PSNR
if (cpi->source_time_stamp < cpi->first_time_stamp_ever)
{
cpi->first_time_stamp_ever = cpi->source_time_stamp;
#endif
cpi->last_end_time_stamp_seen = cpi->source_time_stamp;
}
// adjust frame rates based on timestamps given
if (!cm->refresh_alt_ref_frame)
{
if (cpi->last_time_stamp_seen == 0)
if (cpi->source_time_stamp == cpi->first_time_stamp_ever)
{
double this_fps = 10000000.000 / (cpi->source_end_time_stamp - cpi->source_time_stamp);
@@ -4985,7 +5107,8 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
}
else
{
long long nanosecs = cpi->source_time_stamp - cpi->last_time_stamp_seen;
long long nanosecs = cpi->source_end_time_stamp
- cpi->last_end_time_stamp_seen;
double this_fps = 10000000.000 / nanosecs;
vp8_new_frame_rate(cpi, (7 * cpi->oxcf.frame_rate + this_fps) / 8);
@@ -4993,6 +5116,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
}
cpi->last_time_stamp_seen = cpi->source_time_stamp;
cpi->last_end_time_stamp_seen = cpi->source_end_time_stamp;
}
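The timestamp arithmetic here treats the deltas as ticks of 1/10,000,000 of a second (despite the nanosecs name, 10000000.000 per second), and each per-frame rate estimate is blended into the running frame rate with a 7/8 : 1/8 moving average. A small worked example under that reading:

#include <stdio.h>

int main(void)
{
    long long nanosecs = 333333;                 /* ~1/30 s in 100 ns ticks   */
    double this_fps = 10000000.0 / nanosecs;     /* ~30.0 fps                 */
    double frame_rate = 25.0;                    /* previous running estimate */

    /* Same blend as the call site above: 7 parts old, 1 part new. */
    frame_rate = (7 * frame_rate + this_fps) / 8;
    printf("%.3f\n", frame_rate);                /* ~25.625 */
    return 0;
}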
if (cpi->compressor_speed == 2)
@@ -5208,7 +5332,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
return 0;
}
int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags)
{
VP8_COMP *cpi = (VP8_COMP *) comp;
@@ -5218,7 +5342,7 @@ int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int debloc
{
int ret;
#if CONFIG_POSTPROC
ret = vp8_post_proc_frame(&cpi->common, dest, deblock_level, noise_level, flags);
ret = vp8_post_proc_frame(&cpi->common, dest, flags);
#else
if (cpi->common.frame_to_show)
@@ -5311,12 +5435,12 @@ int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert
{
VP8_COMP *cpi = (VP8_COMP *) comp;
if (horiz_mode >= NORMAL && horiz_mode <= ONETWO)
if (horiz_mode <= ONETWO)
cpi->common.horiz_scale = horiz_mode;
else
return -1;
if (vert_mode >= NORMAL && vert_mode <= ONETWO)
if (vert_mode <= ONETWO)
cpi->common.vert_scale = vert_mode;
else
return -1;

View File

@@ -18,7 +18,6 @@
#include "treewriter.h"
#include "tokenize.h"
#include "onyxc_int.h"
#include "preproc.h"
#include "variance.h"
#include "dct.h"
#include "encodemb.h"
@@ -28,6 +27,7 @@
#include "vpx_ports/mem.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "mcomp.h"
#include "temporal_filter.h"
//#define SPEEDSTATS 1
#define MIN_GF_INTERVAL 4
@@ -46,6 +46,8 @@
#define MAX_THRESHMULT 512
#define GF_ZEROMV_ZBIN_BOOST 24
#define LF_ZEROMV_ZBIN_BOOST 12
#define MV_ZBIN_BOOST 4
#define ZBIN_OQ_MAX 192
#define VP8_TEMPORAL_ALT_REF 1
@@ -180,6 +182,8 @@ typedef struct
int first_step;
int optimize_coefficients;
int use_fastquant_for_pick;
} SPEED_FEATURES;
typedef struct
@@ -227,6 +231,7 @@ typedef struct VP8_ENCODER_RTCD
vp8_encodemb_rtcd_vtable_t encodemb;
vp8_quantize_rtcd_vtable_t quantize;
vp8_search_rtcd_vtable_t search;
vp8_temporal_rtcd_vtable_t temporal;
} VP8_ENCODER_RTCD;
enum
@@ -239,6 +244,12 @@ enum
BLOCK_MAX_SEGMENTS
};
typedef union
{
unsigned int as_int;
MV as_mv;
} int_mv; /* facilitates rapid equality tests */
typedef struct
{
@@ -260,6 +271,9 @@ typedef struct
DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]);
MACROBLOCK mb;
@@ -276,14 +290,14 @@ typedef struct
unsigned int source_frame_flags;
YV12_BUFFER_CONFIG scaled_source;
int source_buffer_count;
int source_encode_index;
int source_alt_ref_pending;
int source_alt_ref_active;
int source_buffer_count; // number of src_buffers in use for lagged encoding
int source_encode_index; // index of buffer in src_buffer to encode
int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
int source_alt_ref_active; // an alt ref frame has been encoded and is usable
int last_alt_ref_sei;
int is_src_frame_alt_ref;
int is_next_src_alt_ref;
int last_alt_ref_sei; // index into src_buffers of frame used as alt reference
int is_src_frame_alt_ref; // source of frame to encode is an exact copy of an alt ref frame
int is_next_src_alt_ref; // source of next frame to encode is an exact copy of an alt ref frame
int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
int alt_is_last; // Alt reference frame same as last ( short circuit altref search)
@@ -319,6 +333,7 @@ typedef struct
int mvcostmultiplier;
int subseqblockweight;
int errthresh;
unsigned int activity_avg;
int RDMULT;
int RDDIV ;
@@ -399,6 +414,7 @@ typedef struct
int inter_frame_target;
double output_frame_rate;
long long last_time_stamp_seen;
long long last_end_time_stamp_seen;
long long first_time_stamp_ever;
int ni_av_qi;
@@ -454,8 +470,6 @@ typedef struct
unsigned char *output_partition2;
size_t output_partition2size;
pre_proc_instance ppi;
int frames_to_key;
int gfu_boost;
int kf_boost;
@@ -466,11 +480,17 @@ typedef struct
double start_tot_err_left;
double min_error;
double modified_total_error_left;
double modified_error_total;
double modified_error_used;
double modified_error_left;
double clip_bpe;
double observed_bpe;
double avg_iiratio;
int target_bandwidth;
long long bits_left;
long long clip_bits_total;
FIRSTPASS_STATS *total_stats;
FIRSTPASS_STATS *this_frame_stats;
FIRSTPASS_STATS *stats_in, *stats_in_end;
@@ -611,9 +631,6 @@ typedef struct
unsigned int tempdata2;
int base_skip_false_prob[128];
unsigned int section_is_low_motion;
unsigned int section_benefits_from_aggresive_q;
unsigned int section_is_fast_motion;
unsigned int section_intra_rating;
double section_max_qfactor;
@@ -661,6 +678,10 @@ typedef struct
unsigned char *gf_active_flags; // Record of which MBs still refer to last golden frame either directly or through 0,0
int gf_active_count;
//Store last frame's MV info for next frame MV prediction
int_mv *lfmv;
int *lf_ref_frame_sign_bias;
int *lf_ref_frame;
} VP8_COMP;
@@ -670,6 +691,8 @@ void vp8_encode_frame(VP8_COMP *cpi);
void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size);
unsigned int vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
int rd_cost_intra_mb(MACROBLOCKD *x);
void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **);


@@ -685,7 +685,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
#if 0
// Initial step Search
bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost);
bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost, &best_ref_mv1);
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -698,7 +698,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
num00--;
else
{
thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost);
thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost, &best_ref_mv1);
if (thissme < bestsme)
{
@@ -724,7 +724,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
}
else
{
bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb < 9
bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb < 9
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -743,7 +743,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
num00--;
else
{
thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb = 9
thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb = 9
if (thissme < bestsme)
{


@@ -1,251 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/****************************************************************************
*
* Module Title : preproc.c
*
* Description : Simple pre-processor.
*
****************************************************************************/
/****************************************************************************
* Header Files
****************************************************************************/
#include "memory.h"
#include "preproc7.h"
#include "vpx_mem/vpx_mem.h"
/****************************************************************************
* Macros
****************************************************************************/
#define FRAMECOUNT 7
#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
/****************************************************************************
* Imports
****************************************************************************/
extern void vp8_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
/****************************************************************************
* Exported Global Variables
****************************************************************************/
void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
void temp_filter_mmx
(
pre_proc_instance *ppi,
unsigned char *s,
unsigned char *d,
int bytes,
int strength
);
void temp_filter_wmt
(
pre_proc_instance *ppi,
unsigned char *s,
unsigned char *d,
int bytes,
int strength
);
/****************************************************************************
*
* ROUTINE : temp_filter_c
*
* INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
* unsigned char *s : Pointer to source frame.
* unsigned char *d : Pointer to destination frame.
* int bytes : Number of bytes to filter.
* int strength : Strength of filter to apply.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Performs a closeness adjusted temporal blur
*
* SPECIAL NOTES : Destination frame can be same as source frame.
*
****************************************************************************/
void temp_filter_c
(
pre_proc_instance *ppi,
unsigned char *s,
unsigned char *d,
int bytes,
int strength
)
{
int byte = 0;
unsigned char *frameptr = ppi->frame_buffer;
if (ppi->frame == 0)
{
do
{
int frame = 0;
do
{
*frameptr = s[byte];
++frameptr;
++frame;
}
while (frame < FRAMECOUNT);
d[byte] = s[byte];
++byte;
}
while (byte < bytes);
}
else
{
int modifier;
int offset = (ppi->frame % FRAMECOUNT);
do
{
int accumulator = 0;
int count = 0;
int frame = 0;
frameptr[offset] = s[byte];
do
{
int pixel_value = *frameptr;
modifier = s[byte];
modifier -= pixel_value;
modifier *= modifier;
modifier >>= strength;
modifier *= 3;
if (modifier > 16)
modifier = 16;
modifier = 16 - modifier;
accumulator += modifier * pixel_value;
count += modifier;
frameptr++;
++frame;
}
while (frame < FRAMECOUNT);
accumulator += (count >> 1);
accumulator *= ppi->fixed_divide[count];
accumulator >>= 16;
d[byte] = accumulator;
++byte;
}
while (byte < bytes);
}
++ppi->frame;
}
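The per-pixel average at the end of temp_filter_c avoids a divide by using the reciprocal table built in init_pre_proc7: fixed_divide[count] holds 0x10000 / count, and adding count >> 1 first turns the shift into a rounded quotient. For example:

#include <stdio.h>

int main(void)
{
    unsigned int fixed_divide_48 = 0x10000 / 48;   /* 1365, as built in init_pre_proc7 */
    unsigned int accumulator = 1000, count = 48;

    accumulator += count >> 1;                     /* +24: rounding term               */
    accumulator *= fixed_divide_48;                /* 1024 * 1365 = 1397760            */
    accumulator >>= 16;                            /* 21, i.e. round(1000 / 48)        */
    printf("%u\n", accumulator);
    return 0;
}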
/****************************************************************************
*
* ROUTINE : delete_pre_proc
*
* INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Deletes a pre-processing instance.
*
* SPECIAL NOTES : None.
*
****************************************************************************/
void delete_pre_proc(pre_proc_instance *ppi)
{
if (ppi->frame_buffer_alloc)
vpx_free(ppi->frame_buffer_alloc);
ppi->frame_buffer_alloc = 0;
ppi->frame_buffer = 0;
if (ppi->fixed_divide_alloc)
vpx_free(ppi->fixed_divide_alloc);
ppi->fixed_divide_alloc = 0;
ppi->fixed_divide = 0;
}
/****************************************************************************
*
* ROUTINE : init_pre_proc
*
* INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
* int frame_size : Number of bytes in one frame.
*
* OUTPUTS : None.
*
* RETURNS : int: 1 if successful, 0 if failed.
*
* FUNCTION : Initializes preprocessor instance.
*
* SPECIAL NOTES : None.
*
****************************************************************************/
int init_pre_proc7(pre_proc_instance *ppi, int frame_size)
{
int i;
int mmx_enabled;
int xmm_enabled;
int wmt_enabled;
vp8_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
if (wmt_enabled)
temp_filter = temp_filter_wmt;
else if (mmx_enabled)
temp_filter = temp_filter_mmx;
else
temp_filter = temp_filter_c;
delete_pre_proc(ppi);
ppi->frame_buffer_alloc = vpx_malloc(32 + frame_size * FRAMECOUNT * sizeof(unsigned char));
if (!ppi->frame_buffer_alloc)
{
delete_pre_proc(ppi);
return 0;
}
ppi->frame_buffer = (unsigned char *) ROUNDUP32(ppi->frame_buffer_alloc);
ppi->fixed_divide_alloc = vpx_malloc(32 + 255 * sizeof(unsigned int));
if (!ppi->fixed_divide_alloc)
{
delete_pre_proc(ppi);
return 0;
}
ppi->fixed_divide = (unsigned int *) ROUNDUP32(ppi->fixed_divide_alloc);
for (i = 1; i < 255; i++)
ppi->fixed_divide[i] = 0x10000 / i;
return 1;
}


@@ -16,8 +16,9 @@
#include "entropy.h"
#include "predictdc.h"
//#define EXACT_QUANT
#ifdef EXACT_QUANT
#define EXACT_QUANT
#ifdef EXACT_FASTQUANT
void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
{
int i, rc, eob;
@@ -26,7 +27,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
short *coeff_ptr = b->coeff;
short *zbin_ptr = b->zbin;
short *round_ptr = b->round;
short *quant_ptr = b->quant;
short *quant_ptr = b->quant_fast;
short *quant_shift_ptr = b->quant_shift;
short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
@@ -64,6 +65,45 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
d->eob = eob + 1;
}
#else
void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
{
int i, rc, eob;
int zbin;
int x, y, z, sz;
short *coeff_ptr = b->coeff;
short *round_ptr = b->round;
short *quant_ptr = b->quant_fast;
short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
short *dequant_ptr = d->dequant;
eob = -1;
for (i = 0; i < 16; i++)
{
rc = vp8_default_zig_zag1d[i];
z = coeff_ptr[rc];
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz; // x = abs(z)
y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
x = (y ^ sz) - sz; // get the sign back
qcoeff_ptr[rc] = x; // write to destination
dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
if (y)
{
eob = i; // last nonzero coeffs
}
}
d->eob = eob + 1;
}
#endif
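The new fast path quantizes with a single 16-bit reciprocal multiply: quant_fast[rc] is assumed to hold roughly (1 << 16) / dequant[rc] (the Y1quant_fast/Y2quant_fast/UVquant_fast tables are presumably filled by vp8cx_init_quantizer), so the >> 16 performs the division. A worked example under that assumption, with hypothetical values:

#include <stdio.h>

int main(void)
{
    /* Hypothetical: dequant = 8, so quant_fast ~ 65536 / 8 = 8192. */
    short coeff = 100, round = 4, dequant = 8, quant_fast = 8192;

    int y  = ((coeff + round) * quant_fast) >> 16;  /* 104 * 8192 >> 16 = 13 */
    int dq = y * dequant;                           /* 13 * 8 = 104          */
    printf("q=%d dq=%d\n", y, dq);
    return 0;
}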
#ifdef EXACT_QUANT
void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
{
int i, rc, eob;
@@ -178,39 +218,6 @@ void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d)
}
#else
void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
{
int i, rc, eob;
int zbin;
int x, y, z, sz;
short *coeff_ptr = b->coeff;
short *round_ptr = b->round;
short *quant_ptr = b->quant;
short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
short *dequant_ptr = d->dequant;
eob = -1;
for (i = 0; i < 16; i++)
{
rc = vp8_default_zig_zag1d[i];
z = coeff_ptr[rc];
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz; // x = abs(z)
y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
x = (y ^ sz) - sz; // get the sign back
qcoeff_ptr[rc] = x; // write to destination
dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
if (y)
{
eob = i; // last nonzero coeffs
}
}
d->eob = eob + 1;
}
void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
{


@@ -45,46 +45,48 @@ extern int inter_b_modes[10];
// Bits Per MB at different Q (Multiplied by 512)
#define BPER_MB_NORMBITS 9
// Work in progress recalibration of baseline rate tables based on
// the assumption that bits per mb is inversely proportional to the
// quantizer value.
const int vp8_bits_per_mb[2][QINDEX_RANGE] =
{
// (Updated 19 March 08) Baseline estimate of INTRA-frame Bits Per MB at each Q:
// Intra case 450000/Qintra
{
674781, 606845, 553905, 524293, 500428, 452540, 435379, 414719,
390970, 371082, 359416, 341807, 336957, 317263, 303724, 298402,
285688, 275237, 268455, 262560, 256038, 248734, 241087, 237615,
229247, 225211, 219112, 213920, 211559, 202714, 198482, 193401,
187866, 183453, 179212, 175965, 171852, 167235, 163972, 160560,
156032, 154349, 151390, 148725, 145708, 142311, 139981, 137700,
134084, 131863, 129746, 128498, 126077, 123461, 121290, 117782,
114883, 112332, 108410, 105685, 103434, 101192, 98587, 95959,
94059, 92017, 89970, 87936, 86142, 84801, 82736, 81106,
79668, 78135, 76641, 75103, 73943, 72693, 71401, 70098,
69165, 67901, 67170, 65987, 64923, 63534, 62378, 61302,
59921, 58941, 57844, 56782, 55960, 54973, 54257, 53454,
52230, 50938, 49962, 49190, 48288, 47270, 46738, 46037,
45020, 44027, 43216, 42287, 41594, 40702, 40081, 39414,
38282, 37627, 36987, 36375, 35808, 35236, 34710, 34162,
33659, 33327, 32751, 32384, 31936, 31461, 30982, 30582,
1125000,900000, 750000, 642857, 562500, 500000, 450000, 450000,
409090, 375000, 346153, 321428, 300000, 281250, 264705, 264705,
250000, 236842, 225000, 225000, 214285, 214285, 204545, 204545,
195652, 195652, 187500, 180000, 180000, 173076, 166666, 160714,
155172, 150000, 145161, 140625, 136363, 132352, 128571, 125000,
121621, 121621, 118421, 115384, 112500, 109756, 107142, 104651,
102272, 100000, 97826, 97826, 95744, 93750, 91836, 90000,
88235, 86538, 84905, 83333, 81818, 80357, 78947, 77586,
76271, 75000, 73770, 72580, 71428, 70312, 69230, 68181,
67164, 66176, 65217, 64285, 63380, 62500, 61643, 60810,
60000, 59210, 59210, 58441, 57692, 56962, 56250, 55555,
54878, 54216, 53571, 52941, 52325, 51724, 51136, 50561,
49450, 48387, 47368, 46875, 45918, 45000, 44554, 44117,
43269, 42452, 41666, 40909, 40178, 39473, 38793, 38135,
36885, 36290, 35714, 35156, 34615, 34090, 33582, 33088,
32608, 32142, 31468, 31034, 30405, 29801, 29220, 28662,
},
// (Updated 19 March 08) Baseline estimate of INTER-frame Bits Per MB at each Q:
// Inter case 285000/Qinter
{
497401, 426316, 372064, 352732, 335763, 283921, 273848, 253321,
233181, 217727, 210030, 196685, 194836, 178396, 167753, 164116,
154119, 146929, 142254, 138488, 133591, 127741, 123166, 120226,
114188, 111756, 107882, 104749, 102522, 96451, 94424, 90905,
87286, 84931, 82111, 80534, 77610, 74700, 73037, 70715,
68006, 67235, 65374, 64009, 62134, 60180, 59105, 57691,
55509, 54512, 53318, 52693, 51194, 49840, 48944, 46980,
45668, 44177, 42348, 40994, 39859, 38889, 37717, 36391,
35482, 34622, 33795, 32756, 32002, 31492, 30573, 29737,
29152, 28514, 27941, 27356, 26859, 26329, 25874, 25364,
24957, 24510, 24290, 23689, 23380, 22845, 22481, 22066,
21587, 21219, 20880, 20452, 20260, 19926, 19661, 19334,
18915, 18391, 18046, 17833, 17441, 17105, 16888, 16729,
16383, 16023, 15706, 15442, 15222, 14938, 14673, 14452,
14005, 13807, 13611, 13447, 13223, 13102, 12963, 12801,
12627, 12534, 12356, 12228, 12056, 11907, 11746, 11643,
712500, 570000, 475000, 407142, 356250, 316666, 285000, 259090,
237500, 219230, 203571, 190000, 178125, 167647, 158333, 150000,
142500, 135714, 129545, 123913, 118750, 114000, 109615, 105555,
101785, 98275, 95000, 91935, 89062, 86363, 83823, 81428,
79166, 77027, 75000, 73076, 71250, 69512, 67857, 66279,
64772, 63333, 61956, 60638, 59375, 58163, 57000, 55882,
54807, 53773, 52777, 51818, 50892, 50000, 49137, 47500,
45967, 44531, 43181, 41911, 40714, 39583, 38513, 37500,
36538, 35625, 34756, 33928, 33139, 32386, 31666, 30978,
30319, 29687, 29081, 28500, 27941, 27403, 26886, 26388,
25909, 25446, 25000, 24568, 23949, 23360, 22800, 22265,
21755, 21268, 20802, 20357, 19930, 19520, 19127, 18750,
18387, 18037, 17701, 17378, 17065, 16764, 16473, 16101,
15745, 15405, 15079, 14766, 14467, 14179, 13902, 13636,
13380, 13133, 12895, 12666, 12445, 12179, 11924, 11632,
11445, 11220, 11003, 10795, 10594, 10401, 10215, 10035,
}
};
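As the table comments state, each new entry is the model constant divided by the effective quantizer for that index: 450000/Q for intra and 285000/Q for inter (the index-to-quantizer mapping lives elsewhere in the encoder, which is presumably why some entries repeat). For instance, taking Q = 1.1 as an example value:

#include <stdio.h>

int main(void)
{
    double q = 1.1;   /* example effective quantizer for some index */
    printf("intra %d, inter %d\n", (int)(450000.0 / q), (int)(285000.0 / q));
    /* prints "intra 409090, inter 259090", matching the tables above */
    return 0;
}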
@@ -324,6 +326,7 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
cpi->frames_till_gf_update_due = cpi->goldfreq;
cpi->common.refresh_golden_frame = TRUE;
cpi->common.refresh_alt_ref_frame = TRUE;
}
void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi)
@@ -1034,9 +1037,7 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
gf_frame_useage = pct_gf_active;
// Is a fixed manual GF frequency being used
if (!cpi->auto_gold)
cpi->common.refresh_golden_frame = TRUE;
else
if (cpi->auto_gold)
{
// For one pass throw a GF if recent frame intra useage is low or the GF useage is high
if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))

File diff suppressed because it is too large


@@ -126,6 +126,24 @@ void vp8_sad16x16x3_c(
sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
void vp8_sad16x16x8_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
)
{
sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
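Each of these *x8 helpers scores eight horizontally adjacent candidates (ref_ptr + 0 through ref_ptr + 7) in one call, which is what the 8-wide batch in vp8_full_search_sadx8 consumes. The unrolled body above is equivalent to this loop form (illustrative only; the C reference code stays unrolled):

/* Loop-form equivalent of vp8_sad16x16x8_c above. */
void vp8_sad16x16x8_loop_c(const unsigned char *src_ptr, int src_stride,
                           const unsigned char *ref_ptr, int ref_stride,
                           unsigned short *sad_array)
{
    int i;
    for (i = 0; i < 8; i++)
        sad_array[i] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride,
                                                      ref_ptr + i, ref_stride,
                                                      0x7fffffff);
}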
void vp8_sad16x8x3_c(
const unsigned char *src_ptr,
int src_stride,
@@ -139,6 +157,24 @@ void vp8_sad16x8x3_c(
sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
void vp8_sad16x8x8_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
)
{
sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
void vp8_sad8x8x3_c(
const unsigned char *src_ptr,
int src_stride,
@@ -152,6 +188,24 @@ void vp8_sad8x8x3_c(
sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
void vp8_sad8x8x8_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
)
{
sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
void vp8_sad8x16x3_c(
const unsigned char *src_ptr,
int src_stride,
@@ -165,6 +219,24 @@ void vp8_sad8x16x3_c(
sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
void vp8_sad8x16x8_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
)
{
sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
void vp8_sad4x4x3_c(
const unsigned char *src_ptr,
int src_stride,
@@ -178,6 +250,24 @@ void vp8_sad4x4x3_c(
sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
void vp8_sad4x4x8_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
)
{
sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
void vp8_sad16x16x4d_c(
const unsigned char *src_ptr,
int src_stride,


@@ -36,30 +36,37 @@
#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering
#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
#define USE_FILTER_LUT 0 // use lookup table to improve filter
#define USE_FILTER_LUT 1
#if VP8_TEMPORAL_ALT_REF
#if USE_FILTER_LUT
// for (strength = 0; strength <= 6; strength++) {
// for (delta = 0; delta <= 18; delta++) {
// float coeff = (3.0 * delta * delta) / pow(2, strength);
// printf("%3d", (int)roundf(coeff > 16 ? 0 : 16-coeff));
// }
// printf("\n");
// }
static int modifier_lut[7][19] =
{
// Strength=0
{16, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{16, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
// Strength=1
{16, 15, 10, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{16, 15, 10, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
// Strength=2
{16, 15, 13, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{16, 15, 13, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
// Strength=3
{16, 16, 15, 13, 10, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{16, 16, 15, 13, 10, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
// Strength=4
{16, 16, 15, 14, 13, 11, 9, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{16, 16, 15, 14, 13, 11, 9, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
// Strength=5
{16, 16, 16, 15, 15, 14, 13, 11, 10, 8, 7, 5, 3, 0, 0, 0, 0, 0, 0},
{16, 16, 16, 15, 15, 14, 13, 11, 10, 8, 7, 5, 3, 0, 0, 0, 0, 0, 0},
// Strength=6
{16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 2, 1}
{16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 2, 1}
};
#endif
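The commented generator above can be compiled as-is to reproduce modifier_lut; the integer path below reaches nearly the same values by adding the rounding bit 1 << (strength - 1) before the shift.

#include <math.h>
#include <stdio.h>

/* The generator from the comment above, made compilable (build with -lm);
   it prints the rows of modifier_lut. */
int main(void)
{
    int strength, delta;

    for (strength = 0; strength <= 6; strength++)
    {
        for (delta = 0; delta <= 18; delta++)
        {
            float coeff = (3.0 * delta * delta) / pow(2, strength);
            printf("%3d", (int)roundf(coeff > 16 ? 0 : 16 - coeff));
        }
        printf("\n");
    }
    return 0;
}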
static void build_predictors_mb
static void vp8_temporal_filter_predictors_mb_c
(
MACROBLOCKD *x,
unsigned char *y_mb_ptr,
@@ -111,7 +118,7 @@ static void build_predictors_mb
RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, stride, &pred[320], 8);
}
}
static void apply_temporal_filter
void vp8_temporal_filter_apply_c
(
unsigned char *frame1,
unsigned int stride,
@@ -140,16 +147,14 @@ static void apply_temporal_filter
int pixel_value = *frame2++;
#if USE_FILTER_LUT
// LUT implementation --
// improves precision of filter
modifier = abs(src_byte-pixel_value);
modifier = modifier>18 ? 0 : lut[modifier];
#else
modifier = src_byte;
modifier -= pixel_value;
modifier = src_byte - pixel_value;
modifier *= modifier;
modifier >>= strength;
modifier *= 3;
modifier += 1 << (strength - 1);
modifier >>= strength;
if (modifier > 16)
modifier = 16;
@@ -171,7 +176,7 @@ static void apply_temporal_filter
#if ALT_REF_MC_ENABLED
static int dummy_cost[2*mv_max+1];
static int find_matching_mb
static int vp8_temporal_filter_find_matching_mb_c
(
VP8_COMP *cpi,
YV12_BUFFER_CONFIG *arf_frame,
@@ -246,7 +251,7 @@ static int find_matching_mb
step_param,
sadpb / 2/*x->errorperbit*/,
&num00, &cpi->fn_ptr[BLOCK_16X16],
mvsadcost, mvcost); //sadpb < 9
mvsadcost, mvcost, &best_ref_mv1); //sadpb < 9
// Further step/diamond searches as necessary
n = 0;
@@ -268,7 +273,7 @@ static int find_matching_mb
step_param + n,
sadpb / 4/*x->errorperbit*/,
&num00, &cpi->fn_ptr[BLOCK_16X16],
mvsadcost, mvcost); //sadpb = 9
mvsadcost, mvcost, &best_ref_mv1); //sadpb = 9
if (thissme < bestsme)
{
@@ -292,7 +297,7 @@ static int find_matching_mb
bestsme = cpi->find_fractional_mv_step(x, b, d,
&d->bmi.mv.as_mv, &best_ref_mv1,
x->errorperbit, &cpi->fn_ptr[BLOCK_16X16],
cpi->mb.mvcost);
mvcost);
}
#endif
@@ -308,7 +313,7 @@ static int find_matching_mb
}
#endif
static void vp8cx_temp_blur1_c
static void vp8_temporal_filter_iterate_c
(
VP8_COMP *cpi,
int frame_count,
@@ -412,11 +417,12 @@ static void vp8cx_temp_blur1_c
#define THRESH_HIGH 20000
// Correlation has been lost try MC
err = find_matching_mb ( cpi,
cpi->frames[alt_ref_index],
cpi->frames[frame],
mb_y_offset,
THRESH_LOW );
err = vp8_temporal_filter_find_matching_mb_c
(cpi,
cpi->frames[alt_ref_index],
cpi->frames[frame],
mb_y_offset,
THRESH_LOW);
if (filter_weight[frame] < 2)
{
@@ -429,43 +435,46 @@ static void vp8cx_temp_blur1_c
if (filter_weight[frame] != 0)
{
// Construct the predictors
build_predictors_mb (
mbd,
cpi->frames[frame]->y_buffer + mb_y_offset,
cpi->frames[frame]->u_buffer + mb_uv_offset,
cpi->frames[frame]->v_buffer + mb_uv_offset,
cpi->frames[frame]->y_stride,
mbd->block[0].bmi.mv.as_mv.row,
mbd->block[0].bmi.mv.as_mv.col,
predictor );
vp8_temporal_filter_predictors_mb_c
(mbd,
cpi->frames[frame]->y_buffer + mb_y_offset,
cpi->frames[frame]->u_buffer + mb_uv_offset,
cpi->frames[frame]->v_buffer + mb_uv_offset,
cpi->frames[frame]->y_stride,
mbd->block[0].bmi.mv.as_mv.row,
mbd->block[0].bmi.mv.as_mv.col,
predictor);
// Apply the filter (YUV)
apply_temporal_filter ( f->y_buffer + mb_y_offset,
f->y_stride,
predictor,
16,
strength,
filter_weight[frame],
accumulator,
count );
TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
(f->y_buffer + mb_y_offset,
f->y_stride,
predictor,
16,
strength,
filter_weight[frame],
accumulator,
count);
apply_temporal_filter ( f->u_buffer + mb_uv_offset,
f->uv_stride,
predictor + 256,
8,
strength,
filter_weight[frame],
accumulator + 256,
count + 256 );
TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
(f->u_buffer + mb_uv_offset,
f->uv_stride,
predictor + 256,
8,
strength,
filter_weight[frame],
accumulator + 256,
count + 256);
apply_temporal_filter ( f->v_buffer + mb_uv_offset,
f->uv_stride,
predictor + 320,
8,
strength,
filter_weight[frame],
accumulator + 320,
count + 320 );
TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
(f->v_buffer + mb_uv_offset,
f->uv_stride,
predictor + 320,
8,
strength,
filter_weight[frame],
accumulator + 320,
count + 320);
}
}
@@ -534,7 +543,7 @@ static void vp8cx_temp_blur1_c
mbd->pre.v_buffer = v_buffer;
}
void vp8cx_temp_filter_c
void vp8_temporal_filter_prepare_c
(
VP8_COMP *cpi
)
@@ -642,7 +651,7 @@ void vp8cx_temp_filter_c
= &cpi->src_buffer[which_buffer].source_buffer;
}
vp8cx_temp_blur1_c (
vp8_temporal_filter_iterate_c (
cpi,
frames_to_blur,
frames_to_blur_backward,


@@ -12,8 +12,33 @@
#ifndef __INC_VP8_TEMPORAL_FILTER_H
#define __INC_VP8_TEMPORAL_FILTER_H
#include "onyx_int.h"
#define prototype_apply(sym)\
void (sym) \
( \
unsigned char *frame1, \
unsigned int stride, \
unsigned char *frame2, \
unsigned int block_size, \
int strength, \
int filter_weight, \
unsigned int *accumulator, \
unsigned int *count \
)
void vp8cx_temp_filter_c(VP8_COMP *cpi);
#ifndef vp8_temporal_filter_apply
#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
#endif
extern prototype_apply(vp8_temporal_filter_apply);
typedef struct
{
prototype_apply(*apply);
} vp8_temporal_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
#define TEMPORAL_INVOKE(ctx,fn) (ctx)->fn
#else
#define TEMPORAL_INVOKE(ctx,fn) vp8_temporal_filter_##fn
#endif
#endif // __INC_VP8_TEMPORAL_FILTER_H


@@ -132,8 +132,6 @@ static void tokenize2nd_order_b
t->Token = x;
t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0);
t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
++cpi->coef_counts [type] [band] [pt] [x];
@@ -185,7 +183,6 @@ static void tokenize1st_order_b
t->Token = x;
t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0);
t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
++cpi->coef_counts [type] [band] [pt] [x];
@@ -434,7 +431,6 @@ static __inline void stuff2nd_order_b
t->Token = DCT_EOB_TOKEN;
t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
t->section = 11;
t->skip_eob_node = 0;
++cpi->coef_counts [1] [0] [pt] [DCT_EOB_TOKEN];
++t;
@@ -465,7 +461,6 @@ static __inline void stuff1st_order_b
t->Token = DCT_EOB_TOKEN;
t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt];
t->section = 8;
t->skip_eob_node = 0;
++cpi->coef_counts [0] [1] [pt] [DCT_EOB_TOKEN];
++t;
@@ -495,7 +490,6 @@ void stuff1st_order_buv
t->Token = DCT_EOB_TOKEN;
t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
t->section = 13;
t->skip_eob_node = 0;
++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
++t;


@@ -25,11 +25,10 @@ typedef struct
typedef struct
{
int Token;
int Extra;
const vp8_prob *context_tree;
int skip_eob_node;
int section;
short Extra;
unsigned char Token;
unsigned char skip_eob_node;
} TOKENEXTRA;
int rd_cost_mby(MACROBLOCKD *);
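The effect of the TOKENEXTRA change above is easiest to see by comparing the two layouts directly; a small self-contained sketch (the _OLD/_NEW names are for illustration only, and vp8_prob is stood in for by its underlying unsigned char type):

#include <stdio.h>

typedef unsigned char vp8_prob;   /* stand-in for the real typedef */

/* previous layout: four ints plus a pointer per token */
typedef struct
{
    int Token;
    int Extra;
    const vp8_prob *context_tree;
    int skip_eob_node;
    int section;
} TOKENEXTRA_OLD;

/* new layout: 'section' is dropped and the remaining fields are narrowed,
 * so each token occupies noticeably less memory */
typedef struct
{
    const vp8_prob *context_tree;
    short Extra;
    unsigned char Token;
    unsigned char skip_eob_node;
} TOKENEXTRA_NEW;

int main(void)
{
    printf("per-token size: %zu -> %zu bytes\n",
           sizeof(TOKENEXTRA_OLD), sizeof(TOKENEXTRA_NEW));
    return 0;
}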


@@ -32,6 +32,16 @@
unsigned int *sad_array\
)
#define prototype_sad_multi_same_address_1(sym)\
void (sym)\
(\
const unsigned char *src_ptr, \
int source_stride, \
const unsigned char *ref_ptr, \
int ref_stride, \
unsigned short *sad_array\
)
#define prototype_sad_multi_dif_address(sym)\
void (sym)\
(\
@@ -138,6 +148,31 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
#endif
extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
#ifndef vp8_variance_sad16x16x8
#define vp8_variance_sad16x16x8 vp8_sad16x16x8_c
#endif
extern prototype_sad_multi_same_address_1(vp8_variance_sad16x16x8);
#ifndef vp8_variance_sad16x8x8
#define vp8_variance_sad16x8x8 vp8_sad16x8x8_c
#endif
extern prototype_sad_multi_same_address_1(vp8_variance_sad16x8x8);
#ifndef vp8_variance_sad8x8x8
#define vp8_variance_sad8x8x8 vp8_sad8x8x8_c
#endif
extern prototype_sad_multi_same_address_1(vp8_variance_sad8x8x8);
#ifndef vp8_variance_sad8x16x8
#define vp8_variance_sad8x16x8 vp8_sad8x16x8_c
#endif
extern prototype_sad_multi_same_address_1(vp8_variance_sad8x16x8);
#ifndef vp8_variance_sad4x4x8
#define vp8_variance_sad4x4x8 vp8_sad4x4x8_c
#endif
extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8);
//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
#ifndef vp8_variance_sad16x16x4d
@@ -274,6 +309,7 @@ extern prototype_sad(vp8_variance_get4x4sse_cs);
typedef prototype_sad(*vp8_sad_fn_t);
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t);
typedef prototype_variance(*vp8_variance_fn_t);
typedef prototype_variance2(*vp8_variance2_fn_t);
@@ -317,6 +353,12 @@ typedef struct
vp8_sad_multi_fn_t sad8x8x3;
vp8_sad_multi_fn_t sad4x4x3;
vp8_sad_multi1_fn_t sad16x16x8;
vp8_sad_multi1_fn_t sad16x8x8;
vp8_sad_multi1_fn_t sad8x16x8;
vp8_sad_multi1_fn_t sad8x8x8;
vp8_sad_multi1_fn_t sad4x4x8;
vp8_sad_multi_d_fn_t sad16x16x4d;
vp8_sad_multi_d_fn_t sad16x8x4d;
vp8_sad_multi_d_fn_t sad8x16x4d;
@@ -334,6 +376,7 @@ typedef struct
vp8_variance_fn_t svf_halfpix_v;
vp8_variance_fn_t svf_halfpix_hv;
vp8_sad_multi_fn_t sdx3f;
vp8_sad_multi1_fn_t sdx8f;
vp8_sad_multi_d_fn_t sdx4df;
} vp8_variance_fn_ptr_t;


@@ -11,511 +11,231 @@
%include "vpx_ports/x86_abi_support.asm"
section .text
global sym(vp8_short_fdct4x4_mmx)
global sym(vp8_short_fdct8x4_wmt)
%define DCTCONSTANTSBITS (16)
%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
%define x_c1 (60547) ; cos(pi /8) * (1<<15)
%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
global sym(vp8_short_fdct4x4_mmx)
sym(vp8_short_fdct4x4_mmx):
push rbp
mov rbp, rsp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
GET_GOT rbx
push rsi
push rdi
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;input
mov rdi, arg(1) ;output
lea rdx, [GLOBAL(dct_const_mmx)]
movsxd rax, dword ptr arg(2) ;pitch
mov rsi, arg(0) ; input
mov rdi, arg(1) ; output
lea rcx, [rsi + rax*2]
movsxd rax, dword ptr arg(2) ;pitch
lea rcx, [rsi + rax*2]
; read the input data
movq mm0, [rsi]
movq mm1, [rsi + rax ]
movq mm0, [rsi]
movq mm1, [rsi + rax]
movq mm2, [rcx]
movq mm3, [rcx + rax]
; get the constants
;shift to left by 1 for precision
psllw mm0, 3
psllw mm1, 3
movq mm2, [rcx]
movq mm4, [rcx + rax]
psllw mm2, 3
psllw mm3, 3
; transpose for the first stage
movq mm3, mm0 ; 00 01 02 03
movq mm5, mm2 ; 20 21 22 23
; transpose for the second stage
movq mm4, mm0 ; 00 01 02 03
movq mm5, mm2 ; 10 11 12 03
punpcklwd mm0, mm1 ; 00 10 01 11
punpckhwd mm3, mm1 ; 02 12 03 13
punpcklwd mm0, mm1 ; 00 10 01 11
punpckhwd mm4, mm1 ; 02 12 03 13
punpcklwd mm2, mm4 ; 20 30 21 31
punpckhwd mm5, mm4 ; 22 32 23 33
punpcklwd mm2, mm3 ; 20 30 21 31
punpckhwd mm5, mm3 ; 22 32 23 33
movq mm1, mm0 ; 00 10 01 11
punpckldq mm0, mm2 ; 00 10 20 30
punpckhdq mm1, mm2 ; 01 11 21 31
movq mm1, mm0 ; 00 10 01 11
punpckldq mm0, mm2 ; 00 10 20 30
movq mm2, mm3 ; 02 12 03 13
punpckldq mm2, mm5 ; 02 12 22 32
punpckhdq mm1, mm2 ; 01 11 21 31
movq mm2, mm4 ; 02 12 03 13
punpckldq mm2, mm5 ; 02 12 22 32
punpckhdq mm4, mm5 ; 03 13 23 33
movq mm3, mm4
punpckhdq mm3, mm5 ; 03 13 23 33
; mm0 0
; mm1 1
; mm2 2
; mm3 3
; first stage
movq mm5, mm0
movq mm4, mm1
movq mm5, mm0
movq mm4, mm1
paddw mm0, mm3 ; a = 0 + 3
paddw mm1, mm2 ; b = 1 + 2
paddw mm0, mm3 ; a1 = 0 + 3
paddw mm1, mm2 ; b1 = 1 + 2
psubw mm4, mm2 ; c = 1 - 2
psubw mm5, mm3 ; d = 0 - 3
psubw mm4, mm2 ; c1 = 1 - 2
psubw mm5, mm3 ; d1 = 0 - 3
psllw mm5, 3
psllw mm4, 3
psllw mm0, 3
psllw mm1, 3
; output 0 and 2
movq mm6, [rdx + 16] ; c2
movq mm2, mm0 ; a
movq mm2, mm0 ; a1
paddw mm0, mm1 ; a + b
psubw mm2, mm1 ; a - b
movq mm1, mm0 ; a + b
pmulhw mm0, mm6 ; 00 01 02 03
paddw mm0, mm1 ; output 00 01 02 03
pmulhw mm6, mm2 ; 20 21 22 23
paddw mm2, mm6 ; output 20 21 22 23
paddw mm0, mm1 ; op[0] = a1 + b1
psubw mm2, mm1 ; op[2] = a1 - b1
; output 1 and 3
movq mm6, [rdx + 8] ; c1
movq mm7, [rdx + 24] ; c3
; interleave c1, d1
movq mm1, mm5 ; d1
punpcklwd mm1, mm4 ; c1 d1
punpckhwd mm5, mm4 ; c1 d1
movq mm1, mm4 ; c
movq mm3, mm5 ; d
movq mm3, mm1
movq mm4, mm5
pmulhw mm1, mm7 ; c * c3
pmulhw mm3, mm6 ; d * c1
pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
paddw mm3, mm5 ; d * c1 rounded
paddw mm1, mm3 ; output 10 11 12 13
pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
movq mm3, mm4 ; c
pmulhw mm5, mm7 ; d * c3
paddd mm1, MMWORD PTR[GLOBAL(_14500)]
paddd mm4, MMWORD PTR[GLOBAL(_14500)]
paddd mm3, MMWORD PTR[GLOBAL(_7500)]
paddd mm5, MMWORD PTR[GLOBAL(_7500)]
pmulhw mm4, mm6 ; c * c1
paddw mm3, mm4 ; round c* c1
psubw mm5, mm3 ; output 30 31 32 33
movq mm3, mm5
psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
packssdw mm1, mm4 ; op[1]
packssdw mm3, mm5 ; op[3]
; done with vertical
; transpose for the second stage
movq mm4, mm0 ; 00 01 02 03
movq mm5, mm2 ; 10 11 12 03
movq mm4, mm0 ; 00 10 20 30
movq mm5, mm2 ; 02 12 22 32
punpcklwd mm0, mm1 ; 00 10 01 11
punpckhwd mm4, mm1 ; 02 12 03 13
punpcklwd mm0, mm1 ; 00 01 10 11
punpckhwd mm4, mm1 ; 20 21 30 31
punpcklwd mm2, mm3 ; 20 30 21 31
punpckhwd mm5, mm3 ; 22 32 23 33
punpcklwd mm2, mm3 ; 02 03 12 13
punpckhwd mm5, mm3 ; 22 23 32 33
movq mm1, mm0 ; 00 01 10 11
punpckldq mm0, mm2 ; 00 01 02 03
movq mm1, mm0 ; 00 10 01 11
punpckldq mm0, mm2 ; 00 10 20 30
punpckhdq mm1, mm2 ; 01 22 12 13
punpckhdq mm1, mm2 ; 01 11 21 31
movq mm2, mm4 ; 20 31 30 31
punpckldq mm2, mm5 ; 20 21 22 23
movq mm2, mm4 ; 02 12 03 13
punpckldq mm2, mm5 ; 02 12 22 32
punpckhdq mm4, mm5 ; 30 31 32 33
punpckhdq mm4, mm5 ; 03 13 23 33
movq mm3, mm4
; mm0 0
; mm1 1
; mm2 2
; mm3 4
movq mm5, mm0
movq mm3, mm1
; first stage
movq mm5, mm0
movq mm4, mm1
paddw mm0, mm4 ; a1 = 0 + 3
paddw mm1, mm2 ; b1 = 1 + 2
paddw mm0, mm3 ; a = 0 + 3
paddw mm1, mm2 ; b = 1 + 2
psubw mm3, mm2 ; c1 = 1 - 2
psubw mm5, mm4 ; d1 = 0 - 3
psubw mm4, mm2 ; c = 1 - 2
psubw mm5, mm3 ; d = 0 - 3
pxor mm6, mm6 ; zero out for compare
pcmpeqw mm6, mm5 ; d1 != 0
pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,
; and keep bit 0 of lower
; output 0 and 2
movq mm6, [rdx + 16] ; c2
movq mm2, mm0 ; a
paddw mm0, mm1 ; a + b
movq mm2, mm0 ; a1
psubw mm2, mm1 ; a - b
paddw mm0, mm1 ; a1 + b1
psubw mm2, mm1 ; a1 - b1
movq mm1, mm0 ; a + b
pmulhw mm0, mm6 ; 00 01 02 03
paddw mm0, MMWORD PTR[GLOBAL(_7w)]
paddw mm2, MMWORD PTR[GLOBAL(_7w)]
paddw mm0, mm1 ; output 00 01 02 03
pmulhw mm6, mm2 ; 20 21 22 23
paddw mm2, mm6 ; output 20 21 22 23
psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
movq MMWORD PTR[rdi + 0 ], mm0
movq MMWORD PTR[rdi + 16], mm2
; output 1 and 3
movq mm6, [rdx + 8] ; c1
movq mm7, [rdx + 24] ; c3
; interleave c1, d1
movq mm1, mm5 ; d1
punpcklwd mm1, mm3 ; c1 d1
punpckhwd mm5, mm3 ; c1 d1
movq mm1, mm4 ; c
movq mm3, mm5 ; d
movq mm3, mm1
movq mm4, mm5
pmulhw mm1, mm7 ; c * c3
pmulhw mm3, mm6 ; d * c1
pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
paddw mm3, mm5 ; d * c1 rounded
paddw mm1, mm3 ; output 10 11 12 13
pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
movq mm3, mm4 ; c
pmulhw mm5, mm7 ; d * c3
paddd mm1, MMWORD PTR[GLOBAL(_12000)]
paddd mm4, MMWORD PTR[GLOBAL(_12000)]
paddd mm3, MMWORD PTR[GLOBAL(_51000)]
paddd mm5, MMWORD PTR[GLOBAL(_51000)]
pmulhw mm4, mm6 ; c * c1
paddw mm3, mm4 ; round c* c1
psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
psubw mm5, mm3 ; output 30 31 32 33
movq mm3, mm5
; done with vertical
packssdw mm1, mm4 ; op[4]
packssdw mm3, mm5 ; op[12]
pcmpeqw mm4, mm4
pcmpeqw mm5, mm5
psrlw mm4, 15
psrlw mm5, 15
paddw mm1, mm6 ; op[4] += (d1!=0)
psllw mm4, 2
psllw mm5, 2
movq MMWORD PTR[rdi + 8 ], mm1
movq MMWORD PTR[rdi + 24], mm3
paddw mm0, mm4
paddw mm1, mm5
paddw mm2, mm4
paddw mm3, mm5
psraw mm0, 3
psraw mm1, 3
psraw mm2, 3
psraw mm3, 3
movq [rdi ], mm0
movq [rdi+ 8], mm1
movq [rdi+16], mm2
movq [rdi+24], mm3
; begin epilog
pop rdi
pop rsi
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
sym(vp8_short_fdct8x4_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;input
mov rdi, arg(1) ;output
lea rdx, [GLOBAL(dct_const_xmm)]
movsxd rax, dword ptr arg(2) ;pitch
lea rcx, [rsi + rax*2]
; read the input data
movdqa xmm0, [rsi]
movdqa xmm2, [rsi + rax]
movdqa xmm4, [rcx]
movdqa xmm3, [rcx + rax]
; get the constants
;shift to left by 1 for precision
psllw xmm0, 3
psllw xmm2, 3
psllw xmm4, 3
psllw xmm3, 3
; transpose for the second stage
movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35
; xmm0 0
; xmm1 1
; xmm2 2
; xmm3 3
; first stage
movdqa xmm5, xmm0
movdqa xmm4, xmm1
paddw xmm0, xmm3 ; a = 0 + 3
paddw xmm1, xmm2 ; b = 1 + 2
psubw xmm4, xmm2 ; c = 1 - 2
psubw xmm5, xmm3 ; d = 0 - 3
; output 0 and 2
movdqa xmm6, [rdx + 32] ; c2
movdqa xmm2, xmm0 ; a
paddw xmm0, xmm1 ; a + b
psubw xmm2, xmm1 ; a - b
movdqa xmm1, xmm0 ; a + b
pmulhw xmm0, xmm6 ; 00 01 02 03
paddw xmm0, xmm1 ; output 00 01 02 03
pmulhw xmm6, xmm2 ; 20 21 22 23
paddw xmm2, xmm6 ; output 20 21 22 23
; output 1 and 3
movdqa xmm6, [rdx + 16] ; c1
movdqa xmm7, [rdx + 48] ; c3
movdqa xmm1, xmm4 ; c
movdqa xmm3, xmm5 ; d
pmulhw xmm1, xmm7 ; c * c3
pmulhw xmm3, xmm6 ; d * c1
paddw xmm3, xmm5 ; d * c1 rounded
paddw xmm1, xmm3 ; output 10 11 12 13
movdqa xmm3, xmm4 ; c
pmulhw xmm5, xmm7 ; d * c3
pmulhw xmm4, xmm6 ; c * c1
paddw xmm3, xmm4 ; round c* c1
psubw xmm5, xmm3 ; output 30 31 32 33
movdqa xmm3, xmm5
; done with vertical
; transpose for the second stage
movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36
movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35
movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34
movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36
punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31
punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35
punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33
punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31
punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13
punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33
movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35
punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17
punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37
movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33
punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37
punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27
movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13
punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07
punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17
; first stage
movdqa xmm5, xmm0
movdqa xmm4, xmm1
paddw xmm0, xmm3 ; a = 0 + 3
paddw xmm1, xmm2 ; b = 1 + 2
psubw xmm4, xmm2 ; c = 1 - 2
psubw xmm5, xmm3 ; d = 0 - 3
; output 0 and 2
movdqa xmm6, [rdx + 32] ; c2
movdqa xmm2, xmm0 ; a
paddw xmm0, xmm1 ; a + b
psubw xmm2, xmm1 ; a - b
movdqa xmm1, xmm0 ; a + b
pmulhw xmm0, xmm6 ; 00 01 02 03
paddw xmm0, xmm1 ; output 00 01 02 03
pmulhw xmm6, xmm2 ; 20 21 22 23
paddw xmm2, xmm6 ; output 20 21 22 23
; output 1 and 3
movdqa xmm6, [rdx + 16] ; c1
movdqa xmm7, [rdx + 48] ; c3
movdqa xmm1, xmm4 ; c
movdqa xmm3, xmm5 ; d
pmulhw xmm1, xmm7 ; c * c3
pmulhw xmm3, xmm6 ; d * c1
paddw xmm3, xmm5 ; d * c1 rounded
paddw xmm1, xmm3 ; output 10 11 12 13
movdqa xmm3, xmm4 ; c
pmulhw xmm5, xmm7 ; d * c3
pmulhw xmm4, xmm6 ; c * c1
paddw xmm3, xmm4 ; round c* c1
psubw xmm5, xmm3 ; output 30 31 32 33
movdqa xmm3, xmm5
; done with vertical
pcmpeqw xmm4, xmm4
pcmpeqw xmm5, xmm5;
psrlw xmm4, 15
psrlw xmm5, 15
psllw xmm4, 2
psllw xmm5, 2
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm4
paddw xmm3, xmm5
psraw xmm0, 3
psraw xmm1, 3
psraw xmm2, 3
psraw xmm3, 3
movq QWORD PTR[rdi ], xmm0
movq QWORD PTR[rdi+ 8], xmm1
movq QWORD PTR[rdi+16], xmm2
movq QWORD PTR[rdi+24], xmm3
psrldq xmm0, 8
psrldq xmm1, 8
psrldq xmm2, 8
psrldq xmm3, 8
movq QWORD PTR[rdi+32], xmm0
movq QWORD PTR[rdi+40], xmm1
movq QWORD PTR[rdi+48], xmm2
movq QWORD PTR[rdi+56], xmm3
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
;static const unsigned int dct1st_stage_rounding_mmx[2] =
align 16
dct1st_stage_rounding_mmx:
times 2 dd 8192
;static const unsigned int dct2nd_stage_rounding_mmx[2] =
align 16
dct2nd_stage_rounding_mmx:
times 2 dd 32768
;static const short dct_matrix[4][4]=
align 16
dct_matrix:
times 4 dw 23170
dw 30274
dw 12540
dw -12540
dw -30274
dw 23170
times 2 dw -23170
dw 23170
dw 12540
dw -30274
dw 30274
dw -12540
;static const unsigned short dct_const_mmx[4 * 4]=
align 16
dct_const_mmx:
times 4 dw 0
times 4 dw 60547
times 4 dw 46341
times 4 dw 25080
;static const unsigned short dct_const_xmm[8 * 4]=
align 16
dct_const_xmm:
times 8 dw 0
times 8 dw 60547
times 8 dw 46341
times 8 dw 25080
align 8
_5352_2217:
dw 5352
dw 2217
dw 5352
dw 2217
align 8
_2217_neg5352:
dw 2217
dw -5352
dw 2217
dw -5352
align 8
_cmp_mask:
times 4 dw 1
align 8
_7w:
times 4 dw 7
align 8
_14500:
times 2 dd 14500
align 8
_7500:
times 2 dd 7500
align 8
_12000:
times 2 dd 12000
align 8
_51000:
times 2 dd 51000


@@ -11,32 +11,68 @@
%include "vpx_ports/x86_abi_support.asm"
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
global sym(vp8_short_fdct4x4_sse2)
sym(vp8_short_fdct4x4_sse2):
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
%define input rsi
%define output rdi
%define pitch rax
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
;; SAVE_XMM
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0)
movsxd rax, DWORD PTR arg(2)
lea rdi, [rsi + rax*2]
mov rdi, arg(1)
movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00
movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10
movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20
movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30
movsxd rax, dword ptr arg(2)
lea rcx, [rsi + rax*2]
%else
%ifidn __OUTPUT_FORMAT__,x64
%define input rcx
%define output rdx
%define pitch r8
%else
%define input rdi
%define output rsi
%define pitch rdx
%endif
%endif
%endmacro
%macro STACK_FRAME_DESTROY 0
%define input
%define output
%define pitch
%if ABI_IS_32BIT
pop rdi
pop rsi
RESTORE_GOT
pop rbp
%else
%ifidn __OUTPUT_FORMAT__,x64
%endif
%endif
ret
%endmacro
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
global sym(vp8_short_fdct4x4_sse2)
sym(vp8_short_fdct4x4_sse2):
STACK_FRAME_CREATE
movq xmm0, MMWORD PTR[input ] ;03 02 01 00
movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10
lea input, [input+2*pitch]
movq xmm1, MMWORD PTR[input ] ;23 22 21 20
movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30
punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
mov rdi, arg(1)
movdqa xmm2, xmm0
punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
@@ -51,6 +87,7 @@ sym(vp8_short_fdct4x4_sse2):
psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
movdqa xmm1, xmm0
pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
@@ -121,17 +158,216 @@ sym(vp8_short_fdct4x4_sse2):
punpcklqdq xmm0, xmm3 ;op[4] op[0]
punpckhqdq xmm1, xmm3 ;op[12] op[8]
movdqa XMMWORD PTR[rdi + 0], xmm0
movdqa XMMWORD PTR[rdi + 16], xmm1
movdqa XMMWORD PTR[output + 0], xmm0
movdqa XMMWORD PTR[output + 16], xmm1
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
;; RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
STACK_FRAME_DESTROY
;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
global sym(vp8_short_fdct8x4_sse2)
sym(vp8_short_fdct8x4_sse2):
STACK_FRAME_CREATE
; read the input data
movdqa xmm0, [input ]
movdqa xmm2, [input+ pitch]
lea input, [input+2*pitch]
movdqa xmm4, [input ]
movdqa xmm3, [input+ pitch]
; transpose for the first stage
movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35
; xmm0 0
; xmm1 1
; xmm2 2
; xmm3 3
; first stage
movdqa xmm5, xmm0
movdqa xmm4, xmm1
paddw xmm0, xmm3 ; a1 = 0 + 3
paddw xmm1, xmm2 ; b1 = 1 + 2
psubw xmm4, xmm2 ; c1 = 1 - 2
psubw xmm5, xmm3 ; d1 = 0 - 3
psllw xmm5, 3
psllw xmm4, 3
psllw xmm0, 3
psllw xmm1, 3
; output 0 and 2
movdqa xmm2, xmm0 ; a1
paddw xmm0, xmm1 ; op[0] = a1 + b1
psubw xmm2, xmm1 ; op[2] = a1 - b1
; output 1 and 3
; interleave c1, d1
movdqa xmm1, xmm5 ; d1
punpcklwd xmm1, xmm4 ; c1 d1
punpckhwd xmm5, xmm4 ; c1 d1
movdqa xmm3, xmm1
movdqa xmm4, xmm5
pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
paddd xmm1, XMMWORD PTR[GLOBAL(_14500)]
paddd xmm4, XMMWORD PTR[GLOBAL(_14500)]
paddd xmm3, XMMWORD PTR[GLOBAL(_7500)]
paddd xmm5, XMMWORD PTR[GLOBAL(_7500)]
psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
packssdw xmm1, xmm4 ; op[1]
packssdw xmm3, xmm5 ; op[3]
; done with vertical
; transpose for the second stage
movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34
movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36
punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31
punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35
punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33
punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31
punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13
punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33
movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35
punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17
punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37
movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33
punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37
punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27
movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13
punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07
punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17
; xmm0 0
; xmm1 4
; xmm2 1
; xmm3 3
movdqa xmm5, xmm0
movdqa xmm2, xmm1
paddw xmm0, xmm3 ; a1 = 0 + 3
paddw xmm1, xmm4 ; b1 = 1 + 2
psubw xmm4, xmm2 ; c1 = 1 - 2
psubw xmm5, xmm3 ; d1 = 0 - 3
pxor xmm6, xmm6 ; zero out for compare
pcmpeqw xmm6, xmm5 ; d1 != 0
pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper,
; and keep bit 0 of lower
; output 0 and 2
movdqa xmm2, xmm0 ; a1
paddw xmm0, xmm1 ; a1 + b1
psubw xmm2, xmm1 ; a1 - b1
paddw xmm0, XMMWORD PTR[GLOBAL(_7w)]
paddw xmm2, XMMWORD PTR[GLOBAL(_7w)]
psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4
psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4
; output 1 and 3
; interleave c1, d1
movdqa xmm1, xmm5 ; d1
punpcklwd xmm1, xmm4 ; c1 d1
punpckhwd xmm5, xmm4 ; c1 d1
movdqa xmm3, xmm1
movdqa xmm4, xmm5
pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
paddd xmm1, XMMWORD PTR[GLOBAL(_12000)]
paddd xmm4, XMMWORD PTR[GLOBAL(_12000)]
paddd xmm3, XMMWORD PTR[GLOBAL(_51000)]
paddd xmm5, XMMWORD PTR[GLOBAL(_51000)]
psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
packssdw xmm1, xmm4 ; op[4]
packssdw xmm3, xmm5 ; op[12]
paddw xmm1, xmm6 ; op[4] += (d1!=0)
movdqa xmm4, xmm0
movdqa xmm5, xmm2
punpcklqdq xmm0, xmm1
punpckhqdq xmm4, xmm1
punpcklqdq xmm2, xmm3
punpckhqdq xmm5, xmm3
movdqa XMMWORD PTR[output + 0 ], xmm0
movdqa XMMWORD PTR[output + 16], xmm2
movdqa XMMWORD PTR[output + 32], xmm4
movdqa XMMWORD PTR[output + 48], xmm5
STACK_FRAME_DESTROY
SECTION_RODATA
align 16
@@ -161,7 +397,9 @@ align 16
_cmp_mask:
times 4 dw 1
times 4 dw 0
align 16
_cmp_mask8x4:
times 8 dw 1
align 16
_mult_sub:
dw 1
@@ -176,6 +414,9 @@ align 16
_7:
times 4 dd 7
align 16
_7w:
times 8 dw 7
align 16
_14500:
times 4 dd 14500
align 16


@@ -24,33 +24,31 @@ extern prototype_fdct(vp8_short_fdct4x4_mmx);
extern prototype_fdct(vp8_short_fdct8x4_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#if 0
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
#endif
#endif
#endif
#if HAVE_SSE2
extern prototype_fdct(vp8_short_fdct8x4_wmt);
extern prototype_fdct(vp8_short_fdct8x4_sse2);
extern prototype_fdct(vp8_short_walsh4x4_sse2);
extern prototype_fdct(vp8_short_fdct4x4_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#if 1
/* short SSE2 DCT currently disabled, does not match the MMX version */
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
#endif
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
@@ -58,7 +56,7 @@ extern prototype_fdct(vp8_short_fdct4x4_sse2);
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2
#undef vp8_fdct_walsh_short4x4
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
#endif


@@ -17,6 +17,7 @@ sym(vp8_short_walsh4x4_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -143,6 +144,7 @@ sym(vp8_short_walsh4x4_sse2):
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

View File

@@ -24,5 +24,14 @@
#endif
#endif
#if HAVE_SSE4_1
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_search_full_search
#define vp8_search_full_search vp8_full_search_sadx8
#endif
#endif
#endif


@@ -1,298 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "memory.h"
#include "preproc.h"
#include "pragmas.h"
/****************************************************************************
* Macros
****************************************************************************/
#define FRAMECOUNT 7
#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
/****************************************************************************
* Imports
****************************************************************************/
extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
/****************************************************************************
* Exported Global Variables
****************************************************************************/
void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
/****************************************************************************
*
* ROUTINE : temp_filter_wmt
*
* INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
* unsigned char *s : Pointer to source frame.
* unsigned char *d : Pointer to destination frame.
* int bytes : Number of bytes to filter.
* int strength : Strength of filter to apply.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Performs a closeness adjusted temporal blur
*
* SPECIAL NOTES : Destination frame can be same as source frame.
*
****************************************************************************/
void temp_filter_wmt
(
pre_proc_instance *ppi,
unsigned char *s,
unsigned char *d,
int bytes,
int strength
)
{
int byte = 0;
unsigned char *frameptr = ppi->frame_buffer;
__declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3};
__declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
if (ppi->frame == 0)
{
do
{
int i;
int frame = 0;
do
{
for (i = 0; i < 8; i++)
{
*frameptr = s[byte+i];
++frameptr;
}
++frame;
}
while (frame < FRAMECOUNT);
for (i = 0; i < 8; i++)
d[byte+i] = s[byte+i];
byte += 8;
}
while (byte < bytes);
}
else
{
int i;
int offset2 = (ppi->frame % FRAMECOUNT);
do
{
__declspec(align(16)) unsigned short counts[8];
__declspec(align(16)) unsigned short sums[8];
__asm
{
mov eax, offset2
mov edi, s // source pixels
pxor xmm1, xmm1 // accumulator
pxor xmm7, xmm7
mov esi, frameptr // accumulator
pxor xmm2, xmm2 // count
movq xmm3, QWORD PTR [edi]
movq QWORD PTR [esi+8*eax], xmm3
punpcklbw xmm3, xmm2 // xmm3 source pixels
mov ecx, FRAMECOUNT
next_frame:
movq xmm4, QWORD PTR [esi] // get frame buffer values
punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels
movdqa xmm6, xmm4 // save the pixel values
psubsw xmm4, xmm3 // subtracted pixel values
pmullw xmm4, xmm4 // square xmm4
movd xmm5, strength
psrlw xmm4, xmm5 // should be strength
pmullw xmm4, threes // 3 * modifier
movdqa xmm5, sixteens // 16s
psubusw xmm5, xmm4 // 16 - modifiers
movdqa xmm4, xmm5 // save the modifiers
pmullw xmm4, xmm6 // multiplier values
paddusw xmm1, xmm4 // accumulator
paddusw xmm2, xmm5 // count
add esi, 8 // next frame
dec ecx // next set of eight pixels
jnz next_frame
movdqa counts, xmm2
psrlw xmm2, 1 // divide count by 2 for rounding
paddusw xmm1, xmm2 // rounding added in
mov frameptr, esi
movdqa sums, xmm1
}
for (i = 0; i < 8; i++)
{
int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
blurvalue >>= 16;
d[i] = blurvalue;
}
s += 8;
d += 8;
byte += 8;
}
while (byte < bytes);
}
++ppi->frame;
__asm emms
}
/****************************************************************************
*
* ROUTINE : temp_filter_mmx
*
* INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
* unsigned char *s : Pointer to source frame.
* unsigned char *d : Pointer to destination frame.
* int bytes : Number of bytes to filter.
* int strength : Strength of filter to apply.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Performs a closeness adjusted temporal blur
*
* SPECIAL NOTES : Destination frame can be same as source frame.
*
****************************************************************************/
void temp_filter_mmx
(
pre_proc_instance *ppi,
unsigned char *s,
unsigned char *d,
int bytes,
int strength
)
{
int byte = 0;
unsigned char *frameptr = ppi->frame_buffer;
__declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3};
__declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
if (ppi->frame == 0)
{
do
{
int i;
int frame = 0;
do
{
for (i = 0; i < 4; i++)
{
*frameptr = s[byte+i];
++frameptr;
}
++frame;
}
while (frame < FRAMECOUNT);
for (i = 0; i < 4; i++)
d[byte+i] = s[byte+i];
byte += 4;
}
while (byte < bytes);
}
else
{
int i;
int offset2 = (ppi->frame % FRAMECOUNT);
do
{
__declspec(align(16)) unsigned short counts[8];
__declspec(align(16)) unsigned short sums[8];
__asm
{
mov eax, offset2
mov edi, s // source pixels
pxor mm1, mm1 // accumulator
pxor mm7, mm7
mov esi, frameptr // accumulator
pxor mm2, mm2 // count
movd mm3, DWORD PTR [edi]
movd DWORD PTR [esi+4*eax], mm3
punpcklbw mm3, mm2 // mm3 source pixels
mov ecx, FRAMECOUNT
next_frame:
movd mm4, DWORD PTR [esi] // get frame buffer values
punpcklbw mm4, mm7 // mm4 frame buffer pixels
movq mm6, mm4 // save the pixel values
psubsw mm4, mm3 // subtracted pixel values
pmullw mm4, mm4 // square mm4
movd mm5, strength
psrlw mm4, mm5 // should be strength
pmullw mm4, threes // 3 * modifier
movq mm5, sixteens // 16s
psubusw mm5, mm4 // 16 - modifiers
movq mm4, mm5 // save the modifiers
pmullw mm4, mm6 // multiplier values
paddusw mm1, mm4 // accumulator
paddusw mm2, mm5 // count
add esi, 4 // next frame
dec ecx // next set of eight pixels
jnz next_frame
movq counts, mm2
psrlw mm2, 1 // divide count by 2 for rounding
paddusw mm1, mm2 // rounding added in
mov frameptr, esi
movq sums, mm1
}
for (i = 0; i < 4; i++)
{
int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
blurvalue >>= 16;
d[i] = blurvalue;
}
s += 4;
d += 4;
byte += 4;
}
while (byte < bytes);
}
++ppi->frame;
__asm emms
}


@@ -253,10 +253,9 @@ rq_zigzag_1c:
pop rbp
ret
;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
; short *qcoeff_ptr,short *dequant_ptr,
; short *scan_mask, short *round_ptr,
; short *inv_scan_order, short *round_ptr,
; short *quant_ptr, short *dqcoeff_ptr);
global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
@@ -265,32 +264,18 @@ sym(vp8_fast_quantize_b_impl_sse2):
SHADOW_ARGS_TO_STACK 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
%define save_xmm6 0
%define save_xmm7 16
%define vp8_fastquantizeb_stack_size save_xmm7 + 16
sub rsp, vp8_fastquantizeb_stack_size
movdqa XMMWORD PTR[rsp + save_xmm6], xmm6
movdqa XMMWORD PTR[rsp + save_xmm7], xmm7
mov rdx, arg(0) ;coeff_ptr
mov rcx, arg(2) ;dequant_ptr
mov rax, arg(3) ;scan_mask
mov rdi, arg(4) ;round_ptr
mov rsi, arg(5) ;quant_ptr
movdqa xmm0, XMMWORD PTR[rdx]
movdqa xmm4, XMMWORD PTR[rdx + 16]
movdqa xmm6, XMMWORD PTR[rdi] ;round lo
movdqa xmm7, XMMWORD PTR[rdi + 16] ;round hi
movdqa xmm2, XMMWORD PTR[rdi] ;round lo
movdqa xmm3, XMMWORD PTR[rdi + 16] ;round hi
movdqa xmm1, xmm0
movdqa xmm5, xmm4
@@ -303,8 +288,8 @@ sym(vp8_fast_quantize_b_impl_sse2):
psubw xmm1, xmm0 ;x = abs(z)
psubw xmm5, xmm4 ;x = abs(z)
paddw xmm1, xmm6
paddw xmm5, xmm7
paddw xmm1, xmm2
paddw xmm5, xmm3
pmulhw xmm1, XMMWORD PTR[rsi]
pmulhw xmm5, XMMWORD PTR[rsi + 16]
@@ -312,8 +297,8 @@ sym(vp8_fast_quantize_b_impl_sse2):
mov rdi, arg(1) ;qcoeff_ptr
mov rsi, arg(6) ;dqcoeff_ptr
movdqa xmm6, XMMWORD PTR[rcx]
movdqa xmm7, XMMWORD PTR[rcx + 16]
movdqa xmm2, XMMWORD PTR[rcx]
movdqa xmm3, XMMWORD PTR[rcx + 16]
pxor xmm1, xmm0
pxor xmm5, xmm4
@@ -323,64 +308,47 @@ sym(vp8_fast_quantize_b_impl_sse2):
movdqa XMMWORD PTR[rdi], xmm1
movdqa XMMWORD PTR[rdi + 16], xmm5
pmullw xmm6, xmm1
pmullw xmm7, xmm5
pmullw xmm2, xmm1
pmullw xmm3, xmm5
movdqa xmm2, XMMWORD PTR[rax]
movdqa xmm3, XMMWORD PTR[rax+16];
mov rdi, arg(3) ;inv_scan_order
pxor xmm4, xmm4 ;clear all bits
; Start with 16
pxor xmm4, xmm4 ;clear all bits
pcmpeqw xmm1, xmm4
pcmpeqw xmm5, xmm4
pcmpeqw xmm4, xmm4 ;set all bits
pcmpeqw xmm4, xmm4 ;set all bits
pxor xmm1, xmm4
pxor xmm5, xmm4
psrlw xmm1, 15
psrlw xmm5, 15
pand xmm1, XMMWORD PTR[rdi]
pand xmm5, XMMWORD PTR[rdi+16]
pmaddwd xmm1, xmm2
pmaddwd xmm5, xmm3
pmaxsw xmm1, xmm5
movq xmm2, xmm1
movq xmm3, xmm5
; now down to 8
pshufd xmm5, xmm1, 00001110b
psrldq xmm1, 8
psrldq xmm5, 8
pmaxsw xmm1, xmm5
paddd xmm1, xmm5
paddd xmm2, xmm3
; only 4 left
pshuflw xmm5, xmm1, 00001110b
paddd xmm1, xmm2
movq xmm5, xmm1
pmaxsw xmm1, xmm5
psrldq xmm1, 4
paddd xmm5, xmm1
; okay, just 2!
pshuflw xmm5, xmm1, 00000001b
movq rcx, xmm5
and rcx, 0xffff
pmaxsw xmm1, xmm5
xor rdx, rdx
sub rdx, rcx
movd rax, xmm1
and rax, 0xff
bsr rax, rcx
inc rax
sar rdx, 31
and rax, rdx
movdqa XMMWORD PTR[rsi], xmm6 ;store dqcoeff
movdqa XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff
movdqa xmm6, XMMWORD PTR[rsp + save_xmm6]
movdqa xmm7, XMMWORD PTR[rsp + save_xmm7]
add rsp, vp8_fastquantizeb_stack_size
pop rsp
movdqa XMMWORD PTR[rsi], xmm2 ;store dqcoeff
movdqa XMMWORD PTR[rsi + 16], xmm3 ;store dqcoeff
; begin epilog
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS


@@ -0,0 +1,114 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr
; short *qcoeff_ptr,short *dequant_ptr,
; short *round_ptr,
; short *quant_ptr, short *dqcoeff_ptr);
;
global sym(vp8_fast_quantize_b_impl_ssse3)
sym(vp8_fast_quantize_b_impl_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rdx, arg(0) ;coeff_ptr
mov rdi, arg(3) ;round_ptr
mov rsi, arg(4) ;quant_ptr
movdqa xmm0, [rdx]
movdqa xmm4, [rdx + 16]
movdqa xmm2, [rdi] ;round lo
movdqa xmm3, [rdi + 16] ;round hi
movdqa xmm1, xmm0
movdqa xmm5, xmm4
psraw xmm0, 15 ;sign of z (aka sz)
psraw xmm4, 15 ;sign of z (aka sz)
pabsw xmm1, xmm1
pabsw xmm5, xmm5
paddw xmm1, xmm2
paddw xmm5, xmm3
pmulhw xmm1, [rsi]
pmulhw xmm5, [rsi + 16]
mov rdi, arg(1) ;qcoeff_ptr
mov rcx, arg(2) ;dequant_ptr
mov rsi, arg(5) ;dqcoeff_ptr
pxor xmm1, xmm0
pxor xmm5, xmm4
psubw xmm1, xmm0
psubw xmm5, xmm4
movdqa [rdi], xmm1
movdqa [rdi + 16], xmm5
movdqa xmm2, [rcx]
movdqa xmm3, [rcx + 16]
pxor xmm4, xmm4
pmullw xmm2, xmm1
pmullw xmm3, xmm5
pcmpeqw xmm1, xmm4 ;non zero mask
pcmpeqw xmm5, xmm4 ;non zero mask
packsswb xmm1, xmm5
pshufb xmm1, [ GLOBAL(zz_shuf)]
pmovmskb edx, xmm1
; xor ecx, ecx
; mov eax, -1
;find_eob_loop:
; shr edx, 1
; jc fq_skip
; mov eax, ecx
;fq_skip:
; inc ecx
; cmp ecx, 16
; jne find_eob_loop
xor rdi, rdi
mov eax, -1
xor dx, ax ;flip the bits for bsr
bsr eax, edx
movdqa [rsi], xmm2 ;store dqcoeff
movdqa [rsi + 16], xmm3 ;store dqcoeff
sub edi, edx ;check for all zeros in bit mask
sar edi, 31 ;0 or -1
add eax, 1
and eax, edi ;if the bit mask was all zero,
;then eob = 0
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
zz_shuf:
db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
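The tail of the routine above replaces the scalar end-of-block scan (left commented out as find_eob_loop) with a bit-mask computation; a hedged C equivalent of that final step, with illustrative names:

/* 'mask' has bit i set when zig-zag coefficient i is non-zero: the asm
 * builds the zero/non-zero mask with pcmpeqw, reorders it into zig-zag
 * order with pshufb and zz_shuf, extracts it with pmovmskb, and inverts
 * it before the bsr. */
static int eob_from_nonzero_mask(unsigned int mask)
{
    int eob = 0;

    while (mask)          /* index of the highest set bit, plus one */
    {
        mask >>= 1;
        eob++;
    }

    return eob;           /* 0 when every coefficient quantized to zero */
}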

File diff suppressed because it is too large


@@ -0,0 +1,353 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%macro PROCESS_16X2X8 1
%if %1
movdqa xmm0, XMMWORD PTR [rsi]
movq xmm1, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
movq xmm2, MMWORD PTR [rdi+16]
punpcklqdq xmm1, xmm3
punpcklqdq xmm3, xmm2
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
psrldq xmm0, 8
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 0x0
mpsadbw xmm4, xmm0, 0x5
paddw xmm1, xmm2
paddw xmm1, xmm3
paddw xmm1, xmm4
%else
movdqa xmm0, XMMWORD PTR [rsi]
movq xmm5, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
movq xmm2, MMWORD PTR [rdi+16]
punpcklqdq xmm5, xmm3
punpcklqdq xmm3, xmm2
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
psrldq xmm0, 8
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 0x0
mpsadbw xmm4, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm5, xmm3
paddw xmm5, xmm4
paddw xmm1, xmm5
%endif
movdqa xmm0, XMMWORD PTR [rsi + rax]
movq xmm5, MMWORD PTR [rdi+ rdx]
movq xmm3, MMWORD PTR [rdi+ rdx+8]
movq xmm2, MMWORD PTR [rdi+ rdx+16]
punpcklqdq xmm5, xmm3
punpcklqdq xmm3, xmm2
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
psrldq xmm0, 8
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 0x0
mpsadbw xmm4, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm5, xmm3
paddw xmm5, xmm4
paddw xmm1, xmm5
%endmacro
%macro PROCESS_8X2X8 1
%if %1
movq xmm0, MMWORD PTR [rsi]
movq xmm1, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm1, xmm3
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
paddw xmm1, xmm2
%else
movq xmm0, MMWORD PTR [rsi]
movq xmm5, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm5, xmm3
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm1, xmm5
%endif
movq xmm0, MMWORD PTR [rsi + rax]
movq xmm5, MMWORD PTR [rdi+ rdx]
movq xmm3, MMWORD PTR [rdi+ rdx+8]
punpcklqdq xmm5, xmm3
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm1, xmm5
%endmacro
%macro PROCESS_4X2X8 1
%if %1
movd xmm0, [rsi]
movq xmm1, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm1, xmm3
mpsadbw xmm1, xmm0, 0x0
%else
movd xmm0, [rsi]
movq xmm5, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm5, xmm3
mpsadbw xmm5, xmm0, 0x0
paddw xmm1, xmm5
%endif
movd xmm0, [rsi + rax]
movq xmm5, MMWORD PTR [rdi+ rdx]
movq xmm3, MMWORD PTR [rdi+ rdx+8]
punpcklqdq xmm5, xmm3
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
mpsadbw xmm5, xmm0, 0x0
paddw xmm1, xmm5
%endmacro
;void vp8_sad16x16x8_sse4(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array);
global sym(vp8_sad16x16x8_sse4)
sym(vp8_sad16x16x8_sse4):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
movdqu XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad16x8x8_sse4(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp8_sad16x8x8_sse4)
sym(vp8_sad16x8x8_sse4):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
movdqu XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad8x8x8_sse4(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp8_sad8x8x8_sse4)
sym(vp8_sad8x8x8_sse4):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
movdqu XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad8x16x8_sse4(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp8_sad8x16x8_sse4)
sym(vp8_sad8x16x8_sse4):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
movdqu XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad4x4x8_sse4(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp8_sad4x4x8_sse4)
sym(vp8_sad4x4x8_sse4):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_4X2X8 1
PROCESS_4X2X8 0
mov rdi, arg(4) ;Results
movdqu XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
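As a plain-C reference for what the new ...x8 routines compute (this assumes the usual libvpx xN convention of SADs at consecutive one-pixel horizontal offsets; the function name is illustrative):

#include <stdlib.h>

/* SAD of one 16x16 source block against the reference block starting at
 * each of ref_ptr+0 .. ref_ptr+7, written to sad_array[0..7].  The SSE4
 * version above gets the same result from mpsadbw, which produces eight
 * 4-byte SADs per instruction. */
static void sad16x16x8_ref(const unsigned char *src_ptr, int src_stride,
                           const unsigned char *ref_ptr, int ref_stride,
                           unsigned short *sad_array)
{
    int off, r, c;

    for (off = 0; off < 8; off++)
    {
        unsigned int sad = 0;

        for (r = 0; r < 16; r++)
            for (c = 0; c < 16; c++)
                sad += abs(src_ptr[r * src_stride + c] -
                           ref_ptr[r * ref_stride + c + off]);

        sad_array[off] = (unsigned short)sad;
    }
}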


@@ -77,6 +77,7 @@ sym(vp8_subtract_mby_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -138,6 +139,7 @@ submby_loop:
pop rsi
; begin epilog
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret


@@ -297,4 +297,31 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
#endif
#endif
#if HAVE_SSE4_1
extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4);
extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4);
extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4);
extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4);
extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_sad16x16x8
#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4
#undef vp8_variance_sad16x8x8
#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4
#undef vp8_variance_sad8x16x8
#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4
#undef vp8_variance_sad8x8x8
#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4
#undef vp8_variance_sad4x4x8
#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4
#endif
#endif
#endif


@@ -18,11 +18,10 @@
#if HAVE_MMX
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
vp8_short_fdct4x4_c(input, output, pitch);
vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
vp8_short_fdct4x4_mmx(input, output, pitch);
vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
}
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *scan_mask, short *round_ptr,
@@ -33,7 +32,7 @@ void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
short *coeff_ptr = b->coeff;
short *zbin_ptr = b->zbin;
short *round_ptr = b->round;
short *quant_ptr = b->quant;
short *quant_ptr = b->quant_fast;
short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
short *dequant_ptr = d->dequant;
@@ -82,22 +81,16 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
#endif
#if HAVE_SSE2
void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
{
vp8_short_fdct4x4_sse2(input, output, pitch);
vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *scan_mask, short *round_ptr,
const short *inv_scan_order, short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
short *coeff_ptr = b->coeff;
short *round_ptr = b->round;
short *quant_ptr = b->quant;
short *quant_ptr = b->quant_fast;
short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
short *dequant_ptr = d->dequant;
@@ -106,8 +99,7 @@ void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
coeff_ptr,
qcoeff_ptr,
dequant_ptr,
scan_mask,
vp8_default_inv_zig_zag,
round_ptr,
quant_ptr,
dqcoeff_ptr
@@ -179,6 +171,25 @@ void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
#endif
#if HAVE_SSSE3
int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
{
d->eob = vp8_fast_quantize_b_impl_ssse3(
b->coeff,
d->qcoeff,
d->dequant,
b->round,
b->quant_fast,
d->dqcoeff
);
}
#endif
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -188,6 +199,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
int wmt_enabled = flags & HAS_SSE2;
int SSE3Enabled = flags & HAS_SSE3;
int SSSE3Enabled = flags & HAS_SSSE3;
int SSE4_1Enabled = flags & HAS_SSE4_1;
/* Note:
*
@@ -198,7 +210,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/* Override default functions with fastest ones for this CPU. */
#if HAVE_MMX
if (mmx_enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
@@ -230,18 +241,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
#if 0 // new fdct
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;
#else
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
#endif
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
@@ -254,10 +258,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/
}
#endif
#if HAVE_SSE2
#if HAVE_SSE2
if (wmt_enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
@@ -307,10 +310,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
}
#endif
#if HAVE_SSE3
#if HAVE_SSE3
if (SSE3Enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
@@ -328,16 +330,30 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
}
#endif
#if HAVE_SSSE3
#if HAVE_SSSE3
if (SSSE3Enabled)
{
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
}
#endif
#if HAVE_SSE4_1
if (SSE4_1Enabled)
{
cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4;
cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4;
cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4;
cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4;
cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4;
cpi->rtcd.search.full_search = vp8_full_search_sadx8;
}
#endif
#endif
#endif
}


@@ -17,7 +17,6 @@ VP8_COMMON_SRCS-yes += common/type_aliases.h
VP8_COMMON_SRCS-yes += common/pragmas.h
CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)common
VP8_COMMON_SRCS-yes += common/preproc.h
VP8_COMMON_SRCS-yes += common/vpxerrors.h
CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)common


@@ -37,6 +37,7 @@ struct vp8_extracfg
unsigned int arnr_max_frames; /* alt_ref Noise Reduction Max Frame Count */
unsigned int arnr_strength; /* alt_ref Noise Reduction Strength */
unsigned int arnr_type; /* alt_ref filter type */
vp8e_tuning tuning;
};
@@ -67,6 +68,7 @@ static const struct extraconfig_map extracfg_map[] =
0, /* arnr_max_frames */
3, /* arnr_strength */
3, /* arnr_type*/
0, /* tuning*/
}
}
};
@@ -104,6 +106,7 @@ update_error_state(vpx_codec_alg_priv_t *ctx,
}
#undef ERROR
#define ERROR(str) do {\
ctx->base.err_detail = str;\
return VPX_CODEC_INVALID_PARAM;\
@@ -132,8 +135,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
const vpx_codec_enc_cfg_t *cfg,
const struct vp8_extracfg *vp8_cfg)
{
RANGE_CHECK(cfg, g_w, 2, 16384);
RANGE_CHECK(cfg, g_h, 2, 16384);
RANGE_CHECK(cfg, g_w, 1, 16384);
RANGE_CHECK(cfg, g_h, 1, 16384);
RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
RANGE_CHECK_HI(cfg, g_profile, 3);
@@ -335,6 +338,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
oxcf->arnr_strength = vp8_cfg.arnr_strength;
oxcf->arnr_type = vp8_cfg.arnr_type;
oxcf->tuning = vp8_cfg.tuning;
/*
printf("Current VP8 Settings: \n");
@@ -448,6 +452,7 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames);
MAP(VP8E_SET_ARNR_STRENGTH , xcfg.arnr_strength);
MAP(VP8E_SET_ARNR_TYPE , xcfg.arnr_type);
MAP(VP8E_SET_TUNING, xcfg.tuning);
}
@@ -860,8 +865,16 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
{
YV12_BUFFER_CONFIG sd;
vp8_ppflags_t flags = {0};
if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, ctx->preview_ppcfg.deblocking_level, ctx->preview_ppcfg.noise_level, ctx->preview_ppcfg.post_proc_flag))
if (ctx->preview_ppcfg.post_proc_flag)
{
flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag;
flags.deblocking_level = ctx->preview_ppcfg.deblocking_level;
flags.noise_level = ctx->preview_ppcfg.noise_level;
}
if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, &flags))
{
/*
@@ -1020,6 +1033,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
{VP8E_SET_ARNR_MAXFRAMES, set_param},
{VP8E_SET_ARNR_STRENGTH , set_param},
{VP8E_SET_ARNR_TYPE , set_param},
{VP8E_SET_TUNING, set_param},
{ -1, NULL},
};
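A hedged usage sketch for the control mapping added above (VP8_TUNE_SSIM is assumed to be one of the vp8e_tuning values; return codes are ignored for brevity):

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static void apply_arnr_and_tuning(vpx_codec_ctx_t *encoder)
{
    /* alt-ref noise reduction knobs already mapped above */
    vpx_codec_control(encoder, VP8E_SET_ARNR_MAXFRAMES, 5);
    vpx_codec_control(encoder, VP8E_SET_ARNR_STRENGTH, 3);

    /* the newly added tuning control */
    vpx_codec_control(encoder, VP8E_SET_TUNING, VP8_TUNE_SSIM);
}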


@@ -65,12 +65,19 @@ struct vpx_codec_alg_priv
vpx_codec_priv_t base;
vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs)-1];
vpx_codec_dec_cfg_t cfg;
vp8_stream_info_t si;
vp8_stream_info_t si;
int defer_alloc;
int decoder_init;
VP8D_PTR pbi;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
#if CONFIG_POSTPROC_VISUALIZER
unsigned int dbg_postproc_flag;
int dbg_color_ref_frame_flag;
int dbg_color_mb_modes_flag;
int dbg_color_b_modes_flag;
int dbg_display_mv_flag;
#endif
vpx_image_t img;
int img_setup;
int img_avail;
@@ -253,8 +260,11 @@ static vpx_codec_err_t vp8_peek_si(const uint8_t *data,
unsigned int data_sz,
vpx_codec_stream_info_t *si)
{
vpx_codec_err_t res = VPX_CODEC_OK;
if(data + data_sz <= data)
res = VPX_CODEC_INVALID_PARAM;
else
{
/* Parse uncompressed part of key frame header.
* 3 bytes:- including version, frame type and an offset
@@ -331,7 +341,10 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
ctx->img_avail = 0;
/* Determine the stream parameters */
/* Determine the stream parameters. Note that we rely on peek_si to
* validate that we have a buffer that does not wrap around the top
* of the heap.
*/
if (!ctx->si.h)
res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
@@ -410,15 +423,27 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
{
YV12_BUFFER_CONFIG sd;
INT64 time_stamp = 0, time_end_stamp = 0;
int ppflag = 0;
int ppdeblocking = 0;
int ppnoise = 0;
vp8_ppflags_t flags = {0};
if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
{
ppflag = ctx->postproc_cfg.post_proc_flag;
ppdeblocking = ctx->postproc_cfg.deblocking_level;
ppnoise = ctx->postproc_cfg.noise_level;
flags.post_proc_flag= ctx->postproc_cfg.post_proc_flag
#if CONFIG_POSTPROC_VISUALIZER
| ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0)
| ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
| ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
| ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0)
#endif
;
flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
flags.noise_level = ctx->postproc_cfg.noise_level;
#if CONFIG_POSTPROC_VISUALIZER
flags.display_ref_frame_flag= ctx->dbg_color_ref_frame_flag;
flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
flags.display_mv_flag = ctx->dbg_display_mv_flag;
#endif
}
if (vp8dx_receive_compressed_data(ctx->pbi, data_sz, data, deadline))
@@ -427,7 +452,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
res = update_error_state(ctx, &pbi->common.error);
}
if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, ppdeblocking, ppnoise, ppflag))
if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags))
{
/* Align width/height */
unsigned int a_w = (sd.y_width + 15) & ~15;
@@ -441,6 +466,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
vpx_img_set_rect(&ctx->img,
VP8BORDERINPIXELS, VP8BORDERINPIXELS,
sd.y_width, sd.y_height);
ctx->img.user_priv = user_priv;
ctx->img_avail = 1;
}
@@ -640,12 +666,59 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
#endif
}
static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
int ctrl_id,
va_list args)
{
#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
int data = va_arg(args, int);
#define MAP(id, var) case id: var = data; break;
switch (ctrl_id)
{
MAP (VP8_SET_DBG_COLOR_REF_FRAME, ctx->dbg_color_ref_frame_flag);
MAP (VP8_SET_DBG_COLOR_MB_MODES, ctx->dbg_color_mb_modes_flag);
MAP (VP8_SET_DBG_COLOR_B_MODES, ctx->dbg_color_b_modes_flag);
MAP (VP8_SET_DBG_DISPLAY_MV, ctx->dbg_display_mv_flag);
}
return VPX_CODEC_OK;
#else
return VPX_CODEC_INCAPABLE;
#endif
}
static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
int ctrl_id,
va_list args)
{
int *update_info = va_arg(args, int *);
VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
if (update_info)
{
*update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
+ pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
+ pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
return VPX_CODEC_OK;
}
else
return VPX_CODEC_INVALID_PARAM;
}
vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
{
{VP8_SET_REFERENCE, vp8_set_reference},
{VP8_COPY_REFERENCE, vp8_get_reference},
{VP8_SET_POSTPROC, vp8_set_postproc},
{VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options},
{VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options},
{VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options},
{VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options},
{VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates},
{ -1, NULL},
};

View File

@@ -109,6 +109,8 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm

View File

@@ -38,9 +38,13 @@
*/
enum vp8_dec_control_id
{
VP8_SET_REFERENCE = 1, /**< pass in an external frame into decoder to be used as reference frame */
VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
VP8_SET_POSTPROC = 3, /**< set decoder's the post processing settings */
VP8_SET_REFERENCE = 1, /**< pass in an external frame into decoder to be used as reference frame */
VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */
VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< set the reference frames to color for each macroblock */
VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */
VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which block modes to color */
VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */
VP8_COMMON_CTRL_ID_MAX
};
@@ -50,10 +54,14 @@ enum vp8_dec_control_id
*/
enum vp8_postproc_level
{
VP8_NOFILTERING = 0,
VP8_DEBLOCK = 1,
VP8_DEMACROBLOCK = 2,
VP8_ADDNOISE = 4
VP8_NOFILTERING = 0,
VP8_DEBLOCK = 1<<0,
VP8_DEMACROBLOCK = 1<<1,
VP8_ADDNOISE = 1<<2,
VP8_DEBUG_TXT_FRAME_INFO = 1<<3, /**< print frame information */
VP8_DEBUG_TXT_MBLK_MODES = 1<<4, /**< print macro block modes over each macro block */
VP8_DEBUG_TXT_DC_DIFF = 1<<5, /**< print dc diff for each macro block */
VP8_DEBUG_TXT_RATE_INFO = 1<<6, /**< print video rate info (encoder only) */
};
/*!\brief post process flags
@@ -65,9 +73,9 @@ enum vp8_postproc_level
typedef struct vp8_postproc_cfg
{
int post_proc_flag; /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */
int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */
int noise_level; /**< the strength of additive noise, valid range [0, 16] */
} vp8_postproc_cfg_t;
/*!\brief reference frame type
@@ -95,12 +103,16 @@ typedef struct vpx_ref_frame
/*!\brief vp8 decoder control function parameter type
*
* defines the data type for each of VP8 decoder control funciton requires
* defines the data type that each VP8 decoder control function requires
*/
VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE, vpx_ref_frame_t *)
VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *)
VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *)
VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int)
VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int)
VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int)
VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int)
/*! @} - end defgroup vp8 */
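Taken together, the bit-flag postproc levels and the new VP8_SET_DBG_* controls are driven through vpx_codec_control() on a decoder initialized with the VPX_CODEC_USE_POSTPROC flag, matching the handling in vp8_decode() above. A usage sketch; the flag combination and strength values are arbitrary examples, and the debug controls only take effect in builds with the postproc visualizer compiled in:

#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

static int enable_postproc(vpx_codec_ctx_t *decoder)
{
    vp8_postproc_cfg_t ppcfg;

    /* Combine vp8_postproc_level bits; the strengths use the documented
     * [0, 16] ranges. */
    ppcfg.post_proc_flag   = VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_ADDNOISE;
    ppcfg.deblocking_level = 5;
    ppcfg.noise_level      = 1;

    if (vpx_codec_control(decoder, VP8_SET_POSTPROC, &ppcfg))
        return -1;

    /* Debug overlay example (a no-op unless the visualizer was built in). */
    vpx_codec_control(decoder, VP8_SET_DBG_DISPLAY_MV, 1);
    return 0;
}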

View File

@@ -140,7 +140,8 @@ enum vp8e_enc_control_id
VP8E_SET_ARNR_MAXFRAMES, /**< control function to set the max number of frames blurred creating arf*/
VP8E_SET_ARNR_STRENGTH , /**< control function to set the filter strength for the arf */
VP8E_SET_ARNR_TYPE , /**< control function to set the type of filter to use for the arf*/
} ;
VP8E_SET_TUNING, /**< control function to set visual tuning */
};
/*!\brief vpx 1-D scaling mode
*
@@ -224,6 +225,18 @@ typedef enum
} vp8e_token_partitions;
/*!\brief VP8 model tuning parameters
*
* Changes the encoder to tune for certain types of input material.
*
*/
typedef enum
{
VP8_TUNE_PSNR,
VP8_TUNE_SSIM
} vp8e_tuning;
/*!\brief VP8 encoder control function parameter type
*
* Defines the data types that VP8E control functions take. Note that
@@ -253,7 +266,7 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, vp8e_token_partitions)
VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int)
VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH , unsigned int)
VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_TYPE , unsigned int)
VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, vp8e_tuning)
VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *)
VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
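The new VP8E_SET_TUNING control takes a vp8e_tuning value; vpxenc wires it to the --tune option further down in this change. Setting it directly looks like the sketch below, which assumes an encoder context already initialized with vpx_codec_enc_init():

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Ask the encoder to tune its decisions for SSIM rather than PSNR. */
static int tune_for_ssim(vpx_codec_ctx_t *encoder)
{
    return vpx_codec_control(encoder, VP8E_SET_TUNING, VP8_TUNE_SSIM) ? -1 : 0;
}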

View File

@@ -36,6 +36,30 @@ extern vpx_codec_iface_t* vpx_codec_vp8_dx(void);
#include "vp8.h"
/*!\brief VP8 decoder control functions
*
* This set of macros defines the control functions of the VP8 decoder interface
*/
enum vp8d_dec_control_id
{
VP8_DECODER_CTRL_ID_START = 256,
VP8D_GET_LAST_REF_UPDATES, /**< control function to get info on which reference frames were updated
by the last decode */
VP8_DECODER_CTRL_ID_MAX
} ;
/*!\brief VP8 decoder control function parameter type
*
* Defines the data types that VP8D control functions take. Note that
* additional common controls are defined in vp8.h
*
*/
VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *)
/*! @} - end defgroup vp8_decoder */
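As assembled in vp8_get_last_ref_updates() earlier in this change, the control writes a bitmask of vpx_ref_frame_type values into the caller's int. A sketch of querying it after a successful vpx_codec_decode() call:

#include <stdio.h>
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

/* Print which reference buffers the most recent decode refreshed. */
static void print_ref_updates(vpx_codec_ctx_t *decoder)
{
    int updates = 0;

    if (vpx_codec_control(decoder, VP8D_GET_LAST_REF_UPDATES, &updates))
        return;

    if (updates & VP8_LAST_FRAME) printf("last ");
    if (updates & VP8_GOLD_FRAME) printf("golden ");
    if (updates & VP8_ALTR_FRAME) printf("altref ");
    printf("\n");
}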

View File

@@ -74,6 +74,7 @@ void __cpuid(int CPUInfo[4], int info_type);
#define HAS_SSE2 0x04
#define HAS_SSE3 0x08
#define HAS_SSSE3 0x10
#define HAS_SSE4_1 0x20
#ifndef BIT
#define BIT(n) (1<<n)
#endif
@@ -117,6 +118,8 @@ x86_simd_caps(void)
if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
return flags & mask;
}
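With the new HAS_SSE4_1 bit reported from CPUID ECX bit 19, callers can gate SSE4.1 code paths, such as the sad_sse4.asm routines added above, on the runtime capability mask. A dispatch sketch; the sad16x16_* prototypes are placeholders, not actual libvpx symbols:

#include "vpx_ports/x86.h"

/* Placeholder prototypes standing in for a C and an SSE4.1 implementation. */
unsigned int sad16x16_c(const unsigned char *src, const unsigned char *ref);
unsigned int sad16x16_sse4(const unsigned char *src, const unsigned char *ref);

unsigned int sad16x16_dispatch(const unsigned char *src,
                               const unsigned char *ref)
{
    /* Use the SSE4.1 path only when x86_simd_caps() reports support. */
    if (x86_simd_caps() & HAS_SSE4_1)
        return sad16x16_sse4(src, ref);

    return sad16x16_c(src, ref);
}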

View File

@@ -35,6 +35,7 @@
#if CONFIG_MD5
#include "md5_utils.h"
#endif
#include "tools_common.h"
#include "nestegg/include/nestegg/nestegg.h"
#ifndef PATH_MAX
@@ -107,11 +108,19 @@ static const arg_def_t demacroblock_level = ARG_DEF(NULL, "demacroblock-level",
"Enable VP8 demacroblocking, w/ level");
static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1,
"Enable VP8 visible debug info");
static const arg_def_t pp_disp_ref_frame = ARG_DEF(NULL, "pp-dbg-ref-frame", 1,
"Display only selected reference frame per macro block");
static const arg_def_t pp_disp_mb_modes = ARG_DEF(NULL, "pp-dbg-mb-modes", 1,
"Display only selected macro block modes");
static const arg_def_t pp_disp_b_modes = ARG_DEF(NULL, "pp-dbg-b-modes", 1,
"Display only selected block modes");
static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1,
"Draw only selected motion vectors");
static const arg_def_t *vp8_pp_args[] =
{
&addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
&pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs,
NULL
};
#endif
@@ -314,7 +323,8 @@ void *out_open(const char *out_fn, int do_md5)
}
else
{
FILE *outfile = out = strcmp("-", out_fn) ? fopen(out_fn, "wb") : stdout;
FILE *outfile = out = strcmp("-", out_fn) ? fopen(out_fn, "wb")
: set_binary_mode(stdout);
if (!outfile)
{
@@ -432,6 +442,8 @@ unsigned int file_is_raw(FILE *infile,
int is_raw = 0;
vpx_codec_stream_info_t si;
si.sz = sizeof(si);
if (fread(buf, 1, 32, infile) == 32)
{
int i;
@@ -540,6 +552,7 @@ webm_guess_framerate(struct input_ctx *input,
*fps_den = tstamp / 1000;
return 0;
fail:
nestegg_destroy(input->nestegg_ctx);
input->nestegg_ctx = NULL;
rewind(input->infile);
return 1;
@@ -702,6 +715,10 @@ int main(int argc, const char **argv_)
vpx_codec_dec_cfg_t cfg = {0};
#if CONFIG_VP8_DECODER
vp8_postproc_cfg_t vp8_pp_cfg = {0};
int vp8_dbg_color_ref_frame = 0;
int vp8_dbg_color_mb_modes = 0;
int vp8_dbg_color_b_modes = 0;
int vp8_dbg_display_mv = 0;
#endif
struct input_ctx input = {0};
@@ -787,6 +804,42 @@ int main(int argc, const char **argv_)
if (level)
vp8_pp_cfg.post_proc_flag |= level;
}
else if (arg_match(&arg, &pp_disp_ref_frame, argi))
{
unsigned int flags = arg_parse_int(&arg);
if (flags)
{
postproc = 1;
vp8_dbg_color_ref_frame = flags;
}
}
else if (arg_match(&arg, &pp_disp_mb_modes, argi))
{
unsigned int flags = arg_parse_int(&arg);
if (flags)
{
postproc = 1;
vp8_dbg_color_mb_modes = flags;
}
}
else if (arg_match(&arg, &pp_disp_b_modes, argi))
{
unsigned int flags = arg_parse_int(&arg);
if (flags)
{
postproc = 1;
vp8_dbg_color_b_modes = flags;
}
}
else if (arg_match(&arg, &pp_disp_mvs, argi))
{
unsigned int flags = arg_parse_int(&arg);
if (flags)
{
postproc = 1;
vp8_dbg_display_mv = flags;
}
}
#endif
else
@@ -805,7 +858,7 @@ int main(int argc, const char **argv_)
usage_exit();
/* Open file */
infile = strcmp(fn, "-") ? fopen(fn, "rb") : stdin;
infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
if (!infile)
{
@@ -876,7 +929,13 @@ int main(int argc, const char **argv_)
}
if(input.kind == WEBM_FILE)
webm_guess_framerate(&input, &fps_den, &fps_num);
if(webm_guess_framerate(&input, &fps_den, &fps_num))
{
fprintf(stderr, "Failed to guess framerate -- error parsing "
"webm file?\n");
return EXIT_FAILURE;
}
/*Note: We can't output an aspect ratio here because IVF doesn't
store one, and neither does VP8.
@@ -920,6 +979,33 @@ int main(int argc, const char **argv_)
return EXIT_FAILURE;
}
if (vp8_dbg_color_ref_frame
&& vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, vp8_dbg_color_ref_frame))
{
fprintf(stderr, "Failed to configure reference block visualizer: %s\n", vpx_codec_error(&decoder));
return EXIT_FAILURE;
}
if (vp8_dbg_color_mb_modes
&& vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, vp8_dbg_color_mb_modes))
{
fprintf(stderr, "Failed to configure macro block visualizer: %s\n", vpx_codec_error(&decoder));
return EXIT_FAILURE;
}
if (vp8_dbg_color_b_modes
&& vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, vp8_dbg_color_b_modes))
{
fprintf(stderr, "Failed to configure block visualizer: %s\n", vpx_codec_error(&decoder));
return EXIT_FAILURE;
}
if (vp8_dbg_display_mv
&& vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv))
{
fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", vpx_codec_error(&decoder));
return EXIT_FAILURE;
}
#endif
/* Decode file */

vpxenc.c Normal file → Executable file
View File

@@ -35,9 +35,11 @@
#include "vpx/vp8cx.h"
#include "vpx_ports/mem_ops.h"
#include "vpx_ports/vpx_timer.h"
#include "tools_common.h"
#include "y4minput.h"
#include "libmkv/EbmlWriter.h"
#include "libmkv/EbmlIDs.h"
#include "experimental.h"
/* Need special handling of these functions on Windows */
#if defined(_MSC_VER)
@@ -185,11 +187,11 @@ int stats_open_mem(stats_io_t *stats, int pass)
}
void stats_close(stats_io_t *stats)
void stats_close(stats_io_t *stats, int last_pass)
{
if (stats->file)
{
if (stats->pass == 1)
if (stats->pass == last_pass)
{
#if 0
#elif USE_POSIX_MMAP
@@ -204,7 +206,7 @@ void stats_close(stats_io_t *stats)
}
else
{
if (stats->pass == 1)
if (stats->pass == last_pass)
free(stats->buf.buf);
}
}
@@ -250,7 +252,8 @@ enum video_file_type
struct detect_buffer {
char buf[4];
int valid;
size_t buf_read;
size_t position;
};
@@ -304,14 +307,21 @@ static int read_frame(FILE *f, vpx_image_t *img, unsigned int file_type,
for (r = 0; r < h; r++)
{
if (detect->valid)
size_t needed = w;
size_t buf_position = 0;
const size_t left = detect->buf_read - detect->position;
if (left > 0)
{
memcpy(ptr, detect->buf, 4);
shortread |= fread(ptr+4, 1, w-4, f) < w-4;
detect->valid = 0;
const size_t more = (left < needed) ? left : needed;
memcpy(ptr, detect->buf + detect->position, more);
buf_position = more;
needed -= more;
detect->position += more;
}
if (needed > 0)
{
shortread |= (fread(ptr + buf_position, 1, needed, f) < needed);
}
else
shortread |= fread(ptr, 1, w, f) < w;
ptr += img->stride[plane];
}
@@ -338,12 +348,12 @@ unsigned int file_is_ivf(FILE *infile,
unsigned int *fourcc,
unsigned int *width,
unsigned int *height,
char detect[4])
struct detect_buffer *detect)
{
char raw_hdr[IVF_FILE_HDR_SZ];
int is_ivf = 0;
if(memcmp(detect, "DKIF", 4) != 0)
if(memcmp(detect->buf, "DKIF", 4) != 0)
return 0;
/* See write_ivf_file_header() for more documentation on the file header
@@ -367,6 +377,7 @@ unsigned int file_is_ivf(FILE *infile,
{
*width = mem_get_le16(raw_hdr + 12);
*height = mem_get_le16(raw_hdr + 14);
detect->position = 4;
}
return is_ivf;
@@ -434,7 +445,7 @@ struct EbmlGlobal
int debug;
FILE *stream;
uint64_t last_pts_ms;
int64_t last_pts_ms;
vpx_rational_t framerate;
/* These pointers are to the start of an element */
@@ -647,7 +658,7 @@ write_webm_block(EbmlGlobal *glob,
unsigned char track_number;
unsigned short block_timecode = 0;
unsigned char flags;
uint64_t pts_ms;
int64_t pts_ms;
int start_cluster = 0, is_keyframe;
/* Calculate the PTS of this frame in milliseconds */
@@ -978,23 +989,32 @@ static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,
static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
"Enable automatic alt reference frames");
static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
"alt_ref Max Frames");
"AltRef Max Frames");
static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1,
"alt_ref Strength");
"AltRef Strength");
static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1,
"alt_ref Type");
"AltRef Type");
static const struct arg_enum_list tuning_enum[] = {
{"psnr", VP8_TUNE_PSNR},
{"ssim", VP8_TUNE_SSIM},
{NULL, 0}
};
static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1,
"Material to favor", tuning_enum);
static const arg_def_t *vp8_args[] =
{
&cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
&token_parts, &arnr_maxframes, &arnr_strength, &arnr_type, NULL
&token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
&tune_ssim, NULL
};
static const int vp8_arg_ctrl_map[] =
{
VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,
VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
VP8E_SET_TOKEN_PARTITIONS,
VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH , VP8E_SET_ARNR_TYPE, 0
VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH , VP8E_SET_ARNR_TYPE,
VP8E_SET_TUNING, 0
};
#endif
@@ -1020,6 +1040,7 @@ static void usage_exit()
#if CONFIG_VP8_ENCODER
fprintf(stderr, "\nVP8 Specific Options:\n");
arg_show_usage(stdout, vp8_args);
xxx_show_usage(stdout);
#endif
fprintf(stderr, "\n"
"Included encoders:\n"
@@ -1073,6 +1094,7 @@ int main(int argc, const char **argv_)
int psnr_count = 0;
exec_name = argv_[0];
ebml.last_pts_ms = -1;
if (argc < 3)
usage_exit();
@@ -1153,6 +1175,7 @@ int main(int argc, const char **argv_)
out_fn = arg.val;
else if (arg_match(&arg, &debugmode, argi))
ebml.debug = 1;
else if (xxx_parse_arg(argi));
else
argj++;
}
@@ -1189,6 +1212,12 @@ int main(int argc, const char **argv_)
*/
cfg.g_timebase.den = 1000;
/* Never use the library's default resolution, require it be parsed
* from the file or set on the command line.
*/
cfg.g_w = 0;
cfg.g_h = 0;
/* Now parse the remainder of the parameters. */
for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step)
{
@@ -1300,7 +1329,7 @@ int main(int argc, const char **argv_)
if (arg_ctrl_cnt < ARG_CTRL_CNT_MAX)
{
arg_ctrls[arg_ctrl_cnt][0] = ctrl_args_map[i];
arg_ctrls[arg_ctrl_cnt][1] = arg_parse_int(&arg);
arg_ctrls[arg_ctrl_cnt][1] = arg_parse_enum_or_int(&arg);
arg_ctrl_cnt++;
}
}
@@ -1330,11 +1359,11 @@ int main(int argc, const char **argv_)
{
int frames_in = 0, frames_out = 0;
unsigned long nbytes = 0;
size_t detect_bytes;
struct detect_buffer detect;
/* Parse certain options from the input file, if possible */
infile = strcmp(in_fn, "-") ? fopen(in_fn, "rb") : stdin;
infile = strcmp(in_fn, "-") ? fopen(in_fn, "rb")
: set_binary_mode(stdin);
if (!infile)
{
@@ -1344,13 +1373,11 @@ int main(int argc, const char **argv_)
/* For RAW input sources, these bytes will be applied on the first frame
* in read_frame().
* We can always read 4 bytes because the minimum supported frame size
* is 2x2.
*/
detect_bytes = fread(detect.buf, 1, 4, infile);
detect.valid = 0;
detect.buf_read = fread(detect.buf, 1, 4, infile);
detect.position = 0;
if (detect_bytes == 4 && file_is_y4m(infile, &y4m, detect.buf))
if (detect.buf_read == 4 && file_is_y4m(infile, &y4m, detect.buf))
{
if (y4m_input_open(&y4m, infile, detect.buf, 4) >= 0)
{
@@ -1375,8 +1402,8 @@ int main(int argc, const char **argv_)
return EXIT_FAILURE;
}
}
else if (detect_bytes == 4 &&
file_is_ivf(infile, &fourcc, &cfg.g_w, &cfg.g_h, detect.buf))
else if (detect.buf_read == 4 &&
file_is_ivf(infile, &fourcc, &cfg.g_w, &cfg.g_h, &detect))
{
file_type = FILE_TYPE_IVF;
switch (fourcc)
@@ -1395,8 +1422,15 @@ int main(int argc, const char **argv_)
else
{
file_type = FILE_TYPE_RAW;
detect.valid = 1;
}
if(!cfg.g_w || !cfg.g_h)
{
fprintf(stderr, "Specify stream dimensions with --width (-w) "
" and --height (-h).\n");
return EXIT_FAILURE;
}
#define SHOW(field) fprintf(stderr, " %-28s = %d\n", #field, cfg.field)
if (verbose && pass == 0)
@@ -1449,7 +1483,8 @@ int main(int argc, const char **argv_)
cfg.g_w, cfg.g_h, 1);
}
outfile = strcmp(out_fn, "-") ? fopen(out_fn, "wb") : stdout;
outfile = strcmp(out_fn, "-") ? fopen(out_fn, "wb")
: set_binary_mode(stdout);
if (!outfile)
{
@@ -1527,7 +1562,7 @@ int main(int argc, const char **argv_)
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt;
struct vpx_usec_timer timer;
int64_t frame_start;
int64_t frame_start, next_frame_start;
if (!arg_limit || frames_in < arg_limit)
{
@@ -1548,9 +1583,11 @@ int main(int argc, const char **argv_)
frame_start = (cfg.g_timebase.den * (int64_t)(frames_in - 1)
* arg_framerate.den) / cfg.g_timebase.num / arg_framerate.num;
next_frame_start = (cfg.g_timebase.den * (int64_t)(frames_in)
* arg_framerate.den)
/ cfg.g_timebase.num / arg_framerate.num;
vpx_codec_encode(&encoder, frame_avail ? &raw : NULL, frame_start,
cfg.g_timebase.den * arg_framerate.den
/ cfg.g_timebase.num / arg_framerate.num,
next_frame_start - frame_start,
0, arg_deadline);
vpx_usec_timer_mark(&timer);
cx_time += vpx_usec_timer_elapsed(&timer);
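Deriving the frame duration as next_frame_start minus frame_start, rather than recomputing a fixed per-frame constant, keeps the truncated integer timestamps from drifting. A standalone check of the same arithmetic; the timebase and framerate below are just an example, using g_timebase = 1/1000 and a 30000/1001 fps source:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    const int64_t tb_num = 1, tb_den = 1000;     /* g_timebase = 1/1000 */
    const int64_t fr_num = 30000, fr_den = 1001; /* 29.97 fps           */
    int frames_in;

    for (frames_in = 1; frames_in <= 4; frames_in++)
    {
        int64_t start = tb_den * (frames_in - 1) * fr_den / tb_num / fr_num;
        int64_t next  = tb_den * frames_in * fr_den / tb_num / fr_num;

        /* Durations come out as 33, 33, 34, 33, ... ms; a fixed duration of
         * tb_den * fr_den / tb_num / fr_num = 33 ms would lose roughly
         * 0.37 ms per frame and accumulate drift. */
        printf("frame %d: start %lld, duration %lld\n", frames_in,
               (long long)start, (long long)(next - start));
    }
    return 0;
}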
@@ -1658,7 +1695,7 @@ int main(int argc, const char **argv_)
}
fclose(outfile);
stats_close(&stats);
stats_close(&stats, arg_passes-1);
fprintf(stderr, "\n");
if (one_pass_only)