fix integer promotion bug in partition size check

The check '(user_data_end - partition < partition_size)' must be evaluated as a signed comparison, but because partition_size is unsigned, the left-hand side was implicitly converted to unsigned, producing an incorrect result on 32-bit targets. Instead, check the upper and lower bounds of the segment separately. Change-Id: Ia01708be8492e64abb16b8157e816bd59e2472cf
Save XMM registers in asm functions
2010-11-08 16:56:11 -05:00 · 2010-11-08 16:55:44 -05:00
72 changed files with 2842 additions and 4298 deletions
--- a/args.c
+++ b/args.c
@@ -135,17 +135,6 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs)
                     def->long_name, long_val);

        fprintf(fp, "  %-37s\t%s\n", option_text, def->desc);
-
-        if(def->enums)
-        {
-            const struct arg_enum_list *listptr;
-
-            fprintf(fp, "  %-37s\t  ", "");
-
-            for(listptr = def->enums; listptr->name; listptr++)
-                fprintf(fp, "%s%s", listptr->name,
-                        listptr[1].name ? ", " : "\n");
-        }
    }
 }

@@ -229,37 +218,3 @@ struct vpx_rational arg_parse_rational(const struct arg *arg)

    return rat;
 }
-
-
-int arg_parse_enum(const struct arg *arg)
-{
-    const struct arg_enum_list *listptr;
-    long int                    rawval;
-    char                       *endptr;
-
-    /* First see if the value can be parsed as a raw value */
-    rawval = strtol(arg->val, &endptr, 10);
-    if (arg->val[0] != '\0' && endptr[0] == '\0')
-    {
-        /* Got a raw value, make sure it's valid */
-        for(listptr = arg->def->enums; listptr->name; listptr++)
-            if(listptr->val == rawval)
-                return rawval;
-    }
-
-    /* Next see if it can be parsed as a string */
-    for(listptr = arg->def->enums; listptr->name; listptr++)
-        if(!strcmp(arg->val, listptr->name))
-            return listptr->val;
-
-    die("Option %s: Invalid value '%s'\n", arg->name, arg->val);
-    return 0;
-}
-
-
-int arg_parse_enum_or_int(const struct arg *arg)
-{
-    if(arg->def->enums)
-        return arg_parse_enum(arg);
-    return arg_parse_int(arg);
-}
--- a/args.h
+++ b/args.h
@@ -22,23 +22,14 @@ struct arg
    const struct arg_def  *def;
 };

-struct arg_enum_list
-{
-    const char *name;
-    int         val;
-};
-#define ARG_ENUM_LIST_END {0}
-
 typedef struct arg_def
 {
    const char *short_name;
    const char *long_name;
    int         has_val;
    const char *desc;
-    const struct arg_enum_list *enums;
 } arg_def_t;
-#define ARG_DEF(s,l,v,d) {s,l,v,d, NULL}
-#define ARG_DEF_ENUM(s,l,v,d,e) {s,l,v,d,e}
+#define ARG_DEF(s,l,v,d) {s,l,v,d}
 #define ARG_DEF_LIST_END {0}

 struct arg arg_init(char **argv);
@@ -50,5 +41,4 @@ char **argv_dup(int argc, const char **argv);
 unsigned int arg_parse_uint(const struct arg *arg);
 int arg_parse_int(const struct arg *arg);
 struct vpx_rational arg_parse_rational(const struct arg *arg);
-int arg_parse_enum_or_int(const struct arg *arg);
 #endif
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -547,10 +547,6 @@ process_common_toolchain() {
                tgt_isa=universal
                tgt_os=darwin9
                ;;
-            *darwin10*)
-                tgt_isa=x86_64
-                tgt_os=darwin10
-                ;;
            *mingw32*|*cygwin*)
                [ -z "$tgt_isa" ] && tgt_isa=x86
                tgt_os=win32
@@ -610,12 +606,6 @@ process_common_toolchain() {
            add_ldflags "-isysroot /Developer/SDKs/MacOSX10.5.sdk"
            add_ldflags "-mmacosx-version-min=10.5"
            ;;
-        *-darwin10-*)
-            add_cflags  "-isysroot /Developer/SDKs/MacOSX10.6.sdk"
-            add_cflags  "-mmacosx-version-min=10.6"
-            add_ldflags "-isysroot /Developer/SDKs/MacOSX10.6.sdk"
-            add_ldflags "-mmacosx-version-min=10.6"
-            ;;
    esac

    # Handle Solaris variants. Solaris 10 needs -lposix4
@@ -834,7 +824,6 @@ process_common_toolchain() {
        soft_enable sse2
        soft_enable sse3
        soft_enable ssse3
-        soft_enable sse4_1

        case  ${tgt_os} in
            win*)
@@ -890,7 +879,7 @@ process_common_toolchain() {
        case  ${tgt_os} in
            win*)
                add_asflags -f win${bits}
-                enabled debug && add_asflags -g cv8
+                enabled debug && add_asflags -g dwarf2
            ;;
            linux*|solaris*)
                add_asflags -f elf${bits}
--- a/11
+++ b/11
@@ -41,7 +41,6 @@ Advanced options:
  ${toggle_shared}                shared library support
  ${toggle_small}                 favor smaller size over speed
  ${toggle_arm_asm_detok}         assembly version of the detokenizer (ARM platforms only)
-  ${toggle_postproc_visualizer}   macro block / block level visualizers

 Codecs:
  Codecs can be selectively enabled or disabled individually, or by family:
@@ -115,7 +114,6 @@ all_platforms="${all_platforms} x86-win32-vs7"
 all_platforms="${all_platforms} x86-win32-vs8"
 all_platforms="${all_platforms} x86-win32-vs9"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
-all_platforms="${all_platforms} x86_64-darwin10-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -201,7 +199,6 @@ ARCH_EXT_LIST="
    sse2
    sse3
    ssse3
-    sse4_1

    altivec
 "
@@ -252,7 +249,6 @@ CONFIG_LIST="
    shared
    small
    arm_asm_detok
-    postproc_visualizer
 "
 CMDLINE_SELECT="
    extra_warnings
@@ -292,7 +288,6 @@ CMDLINE_SELECT="
    shared
    small
    arm_asm_detok
-    postproc_visualizer
 "

 process_cmdline() {
@@ -329,6 +324,8 @@ post_process_cmdline() {
    for c in ${CODECS}; do
        enabled ${c} && enable ${c##*_}s
    done
+
+
 }


@@ -538,10 +535,6 @@ process_toolchain() {

    # Other toolchain specific defaults
    case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
-
-    if enabled postproc_visualizer; then
-        enabled postproc || die "postproc_visualizer requires postproc to be enabled"
-    fi
 }


--- a/examples.mk
+++ b/examples.mk
@@ -17,7 +17,6 @@ vpxdec.SRCS                 += md5_utils.c md5_utils.h
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h vpx_ports/config.h
-vpxdec.SRCS                 += tools_common.c tools_common.h
 vpxdec.SRCS                 += nestegg/halloc/halloc.h
 vpxdec.SRCS                 += nestegg/halloc/src/align.h
 vpxdec.SRCS                 += nestegg/halloc/src/halloc.c
@@ -29,13 +28,11 @@ vpxdec.GUID                  = BA5FE66F-38DD-E034-F542-B1578C5FB950
 vpxdec.DESCRIPTION           = Full featured decoder
 UTILS-$(CONFIG_ENCODERS)    += vpxenc.c
 vpxenc.SRCS                 += args.c args.h y4minput.c y4minput.h
-vpxenc.SRCS                 += tools_common.c tools_common.h
 vpxenc.SRCS                 += vpx_ports/config.h vpx_ports/mem_ops.h
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
 vpxenc.SRCS                 += libmkv/EbmlIDs.h
 vpxenc.SRCS                 += libmkv/EbmlWriter.c
 vpxenc.SRCS                 += libmkv/EbmlWriter.h
-vpxenc.SRCS                 += experimental.c
 vpxenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
 vpxenc.DESCRIPTION           = Full featured encoder

--- a/examples/vp8_set_maps.txt
+++ b/examples/vp8_set_maps.txt
@@ -78,8 +78,8 @@ if(frame_cnt + 1 == 22) {
 } else if(frame_cnt + 1 == 44) {
    vpx_active_map_t  active;

-    active.rows = cfg.g_h/16;
-    active.cols = cfg.g_w/16;
+    active.rows = 240/16;
+    active.cols = 320/16;

    /* pass in null map to disable active_map*/
    active.active_map = NULL;
--- a/experimental.c
+++ b/experimental.c
@@ -1,29 +0,0 @@
-#define EXPERIMENTAL_C
-#include <stdio.h>
-
-#include "args.h"
-
-/* Get argument definitions */
-#include "experimental.h"
-
-/* Build argument definition list */
-static const arg_def_t *xxx_def_list[] = {
-#include "experimental.h"
-NULL
-};
-
-void xxx_show_usage(FILE *fp)
-{
-    arg_show_usage(fp, xxx_def_list);
-}
-
-int xxx_parse_arg(char **argi)
-{
-    struct arg arg;
-
-    arg = arg_init(argi);
-    if(0);
-#include "experimental.h"
-    else return 0;
-    return 1;
-}
--- a/experimental.h
+++ b/experimental.h
@@ -1,56 +0,0 @@
-#if defined(EXPERIMENTAL_C)
-/* The experimental.c file includes this file multiple times to build up the
- * required state.
- */
-#if !defined(XXX_ARG_DEF)
-#define XXX_ARG_DEF(sym, value) \
-    static const arg_def_t xxx_arg_def_##sym = \
-        ARG_DEF(NULL, #sym, 1, "Experimental");
-
-#define XXX_DEFINE_INT(sym, value) \
-    XXX_ARG_DEF(sym, value); int xxx_##sym = value;
-#define XXX_DEFINE_UINT(sym, value) \
-    XXX_ARG_DEF(sym, value); unsigned int xxx_##sym = value;
-
-#elif !defined(XXX_ARG_DEF_LIST)
-#define XXX_ARG_DEF_LIST(sym) &xxx_arg_def_##sym,
-
-#undef  XXX_DEFINE_INT
-#define XXX_DEFINE_INT(sym, value) XXX_ARG_DEF_LIST(sym)
-
-#undef  XXX_DEFINE_UINT
-#define XXX_DEFINE_UINT(sym, value) XXX_ARG_DEF_LIST(sym)
-
-#elif !defined(XXX_ARG_MATCH)
-#define XXX_ARG_MATCH
-
-#undef  XXX_DEFINE_INT
-#define XXX_DEFINE_INT(sym, value)\
-    else if (arg_match(&arg, &xxx_arg_def_##sym, argi)) \
-        xxx_##sym = arg_parse_int(&arg);
-
-#undef  XXX_DEFINE_UINT
-#define XXX_DEFINE_UINT(sym, value)\
-    else if (arg_match(&arg, &xxx_arg_def_##sym, argi)) \
-        xxx_##sym = arg_parse_uint(&arg);
-
-#endif
-#else
-/* All other files just get the extern references to these symbols. */
-
-#define XXX_DEFINE_INT(sym, value) extern int xxx_##sym;
-#define XXX_DEFINE_UINT(sym, value) extern unsigned int xxx_##sym;
-
-
-#include <stdio.h>
-void xxx_show_usage(FILE *fp);
-int xxx_parse_arg(char **argi);
-#endif
-
-/*
- * BEGIN EXPERIMENTS BELOW
- *
- * XXX_DEFINE_INT(knob, 0)
- */
-XXX_DEFINE_INT(foo, 0)
-XXX_DEFINE_INT(bar, 0)
--- a/tools_common.c
+++ b/tools_common.c
@@ -1,24 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include <stdio.h>
-#include "tools_common.h"
-#ifdef _WIN32
-#include <io.h>
-#include <fcntl.h>
-#endif
-
-FILE* set_binary_mode(FILE *stream)
-{
-    (void)stream;
-#ifdef _WIN32
-    _setmode(_fileno(stream), _O_BINARY);
-#endif
-    return stream;
-}
--- a/tools_common.h
+++ b/tools_common.h
@@ -1,16 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef TOOLS_COMMON_H
-#define TOOLS_COMMON_H
-
-/* Sets a stdio stream into binary mode */
-FILE* set_binary_mode(FILE *stream);
-
-#endif
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -36,14 +36,6 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
    7, 11, 14, 15,
 };

-DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
-{
-    1,  2,  6,  7,
-    3,  5,  8, 13,
-    4,  9, 12, 14,
-   10, 11, 15, 16
-};
-
 DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]);

 const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
@@ -114,20 +106,23 @@ static void init_bit_trees()
    init_bit_tree(cat6, 11);
 }

+
+static vp8bc_index_t bcc1[1], bcc2[2], bcc3[3], bcc4[4], bcc5[5], bcc6[11];
+
 vp8_extra_bit_struct vp8_extra_bits[12] =
 {
-    { 0, 0, 0, 0},
-    { 0, 0, 0, 1},
-    { 0, 0, 0, 2},
-    { 0, 0, 0, 3},
-    { 0, 0, 0, 4},
-    { cat1, Pcat1, 1, 5},
-    { cat2, Pcat2, 2, 7},
-    { cat3, Pcat3, 3, 11},
-    { cat4, Pcat4, 4, 19},
-    { cat5, Pcat5, 5, 35},
-    { cat6, Pcat6, 11, 67},
-    { 0, 0, 0, 0}
+    { 0, 0, 0, 0, 0},
+    { 0, 0, 0, 0, 1},
+    { 0, 0, 0, 0, 2},
+    { 0, 0, 0, 0, 3},
+    { 0, 0, 0, 0, 4},
+    { cat1, Pcat1, bcc1, 1, 5},
+    { cat2, Pcat2, bcc2, 2, 7},
+    { cat3, Pcat3, bcc3, 3, 11},
+    { cat4, Pcat4, bcc4, 4, 19},
+    { cat5, Pcat5, bcc5, 5, 35},
+    { cat6, Pcat6, bcc6, 11, 67},
+    { 0, 0, 0, 0, 0}
 };
 #include "defaultcoefcounts.h"

--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@@ -24,10 +24,10 @@
 #define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
 #define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
 #define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
-#define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
-#define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
-#define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 11+1 */
+#define DCT_VAL_CATEGORY3       7       /* 11-26     Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY4       8       /* 11-26     Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY5       9       /* 27-58     Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6       10      /* 59+       Extra Bits 11+1 */
 #define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */

 #define vp8_coef_tokens 12
@@ -42,6 +42,7 @@ typedef struct
 {
    vp8_tree_p tree;
    const vp8_prob *prob;
+    vp8bc_index_t *prob_bc;
    int Len;
    int base_val;
 } vp8_extra_bit_struct;
@@ -94,7 +95,6 @@ struct VP8Common;
 void vp8_default_coef_probs(struct VP8Common *);

 extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
-extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
 extern short vp8_default_zig_zag_mask[16];
 extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];

--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -65,13 +65,11 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_c;

 #if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
-    rtcd->postproc.down             = vp8_mbpost_proc_down_c;
-    rtcd->postproc.across           = vp8_mbpost_proc_across_ip_c;
-    rtcd->postproc.downacross       = vp8_post_proc_down_and_across_c;
-    rtcd->postproc.addnoise         = vp8_plane_add_noise_c;
-    rtcd->postproc.blend_mb_inner   = vp8_blend_mb_inner_c;
-    rtcd->postproc.blend_mb_outer   = vp8_blend_mb_outer_c;
-    rtcd->postproc.blend_b          = vp8_blend_b_c;
+    rtcd->postproc.down        = vp8_mbpost_proc_down_c;
+    rtcd->postproc.across      = vp8_mbpost_proc_across_ip_c;
+    rtcd->postproc.downacross  = vp8_post_proc_down_and_across_c;
+    rtcd->postproc.addnoise    = vp8_plane_add_noise_c;
+    rtcd->postproc.blend_mb    = vp8_blend_mb_c;
 #endif

 #endif
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -18,7 +18,6 @@ extern "C"
 #endif

 #include "vpx/internal/vpx_codec_internal.h"
-#include "vpx/vp8cx.h"
 #include "vpx_scale/yv12config.h"
 #include "type_aliases.h"
 #include "ppflags.h"
@@ -190,8 +189,6 @@ extern "C"

        struct vpx_fixed_buf         two_pass_stats_in;
        struct vpx_codec_pkt_list  *output_pkt_list;
-
-        vp8e_tuning tuning;
    } VP8_CONFIG;


@@ -207,7 +204,7 @@ extern "C"
 // and not just a copy of the pointer..
    int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp);
    int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush);
-    int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
+    int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags);

    int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);
    int vp8_update_reference(VP8_PTR comp, int ref_frame_flags);
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -105,7 +105,7 @@ typedef struct VP8Common
    YV12_BUFFER_CONFIG post_proc_buffer;
    YV12_BUFFER_CONFIG temp_scale_frame;

-    FRAME_TYPE last_frame_type;  /* Save last frame's frame type for loopfilter init checking and motion search. */
+    FRAME_TYPE last_frame_type;  /* Add to check if vp8_frame_init_loop_filter() can be skipped. */
    FRAME_TYPE frame_type;

    int show_frame;
--- a/vp8/common/onyxd.h
+++ b/vp8/common/onyxd.h
@@ -51,7 +51,7 @@ extern "C"
    int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);

    int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp);
-    int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags);
+    int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags);

    int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
    int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -26,7 +26,7 @@
    ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)

 /* global constants */
-#if CONFIG_POSTPROC_VISUALIZER
+
 static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
 {
    { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
@@ -41,32 +41,13 @@ static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
    { RGB_TO_YUV(0xFF0000) }    /* Red */
 };

-static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] =
-{
-    { RGB_TO_YUV(0x6633ff) },   /* Purple */
-    { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
-    { RGB_TO_YUV(0xff33cc) },   /* Pink */
-    { RGB_TO_YUV(0xff3366) },   /* Coral */
-    { RGB_TO_YUV(0x3366ff) },   /* Blue */
-    { RGB_TO_YUV(0xed00f5) },   /* Dark Blue */
-    { RGB_TO_YUV(0x2e00b8) },   /* Dark Purple */
-    { RGB_TO_YUV(0xff6633) },   /* Orange */
-    { RGB_TO_YUV(0x33ccff) },   /* Light Blue */
-    { RGB_TO_YUV(0x8ab800) },   /* Green */
-    { RGB_TO_YUV(0xffcc33) },   /* Light Orange */
-    { RGB_TO_YUV(0x33ffcc) },   /* Aqua */
-    { RGB_TO_YUV(0x66ff33) },   /* Light Green */
-    { RGB_TO_YUV(0xccff33) },   /* Yellow */
-};
-
-static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
+static const unsigned char MV_REFERENCE_FRAME_colors[MB_MODE_COUNT][3] =
 {
    { RGB_TO_YUV(0x00ff00) },   /* Blue */
    { RGB_TO_YUV(0x0000ff) },   /* Green */
    { RGB_TO_YUV(0xffff00) },   /* Yellow */
    { RGB_TO_YUV(0xff0000) },   /* Red */
 };
-#endif

 static const short kernel5[] =
 {
@@ -495,7 +476,7 @@ void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
 * edges unblended to give distinction to macro blocks in areas
 * filled with the same color block.
 */
-void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
+void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v,
                        int y1, int u1, int v1, int alpha, int stride)
 {
    int i, j;
@@ -503,10 +484,10 @@ void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
    int u1_const = u1*((1<<16)-alpha);
    int v1_const = v1*((1<<16)-alpha);

-    y += 2*stride + 2;
-    for (i = 0; i < 12; i++)
+    y += stride + 2;
+    for (i = 0; i < 14; i++)
    {
-        for (j = 0; j < 12; j++)
+        for (j = 0; j < 14; j++)
        {
            y[j] = (y[j]*alpha + y1_const)>>16;
        }
@@ -530,104 +511,6 @@ void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
    }
 }

-/* Blend only the edge of the macro block.  Leave center
- * unblended to allow for other visualizations to be layered.
- */
-void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v,
-                        int y1, int u1, int v1, int alpha, int stride)
-{
-    int i, j;
-    int y1_const = y1*((1<<16)-alpha);
-    int u1_const = u1*((1<<16)-alpha);
-    int v1_const = v1*((1<<16)-alpha);
-
-    for (i = 0; i < 2; i++)
-    {
-        for (j = 0; j < 16; j++)
-        {
-            y[j] = (y[j]*alpha + y1_const)>>16;
-        }
-        y += stride;
-    }
-
-    for (i = 0; i < 12; i++)
-    {
-        y[0]  = (y[0]*alpha  + y1_const)>>16;
-        y[1]  = (y[1]*alpha  + y1_const)>>16;
-        y[14] = (y[14]*alpha + y1_const)>>16;
-        y[15] = (y[15]*alpha + y1_const)>>16;
-        y += stride;
-    }
-
-    for (i = 0; i < 2; i++)
-    {
-        for (j = 0; j < 16; j++)
-        {
-            y[j] = (y[j]*alpha + y1_const)>>16;
-        }
-        y += stride;
-    }
-
-    stride >>= 1;
-
-    for (j = 0; j < 8; j++)
-    {
-        u[j] = (u[j]*alpha + u1_const)>>16;
-        v[j] = (v[j]*alpha + v1_const)>>16;
-    }
-    u += stride;
-    v += stride;
-
-    for (i = 0; i < 6; i++)
-    {
-        u[0] = (u[0]*alpha + u1_const)>>16;
-        v[0] = (v[0]*alpha + v1_const)>>16;
-
-        u[7] = (u[7]*alpha + u1_const)>>16;
-        v[7] = (v[7]*alpha + v1_const)>>16;
-
-        u += stride;
-        v += stride;
-    }
-
-    for (j = 0; j < 8; j++)
-    {
-        u[j] = (u[j]*alpha + u1_const)>>16;
-        v[j] = (v[j]*alpha + v1_const)>>16;
-    }
-}
-
-void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
-                        int y1, int u1, int v1, int alpha, int stride)
-{
-    int i, j;
-    int y1_const = y1*((1<<16)-alpha);
-    int u1_const = u1*((1<<16)-alpha);
-    int v1_const = v1*((1<<16)-alpha);
-
-    for (i = 0; i < 4; i++)
-    {
-        for (j = 0; j < 4; j++)
-        {
-            y[j] = (y[j]*alpha + y1_const)>>16;
-        }
-        y += stride;
-    }
-
-    stride >>= 1;
-
-    for (i = 0; i < 2; i++)
-    {
-        for (j = 0; j < 2; j++)
-        {
-            u[j] = (u[j]*alpha + u1_const)>>16;
-            v[j] = (v[j]*alpha + v1_const)>>16;
-        }
-        u += stride;
-        v += stride;
-    }
-}
-
 static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int height)
 {
    int dx;
@@ -639,7 +522,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
        dy = *y1 - y0;

        *x1 = width;
-        if (dx)
+        if (dy)
            *y1 = ((width-x0)*dy)/dx + y0;
    }
    if (*x1 < 0)
@@ -648,7 +531,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
        dy = *y1 - y0;

        *x1 = 0;
-        if (dx)
+        if (dy)
            *y1 = ((0-x0)*dy)/dx + y0;
    }
    if (*y1 > height)
@@ -657,7 +540,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
        dy = *y1 - y0;

        *y1 = height;
-        if (dy)
+        if (dx)
            *x1 = ((height-y0)*dx)/dy + x0;
    }
    if (*y1 < 0)
@@ -666,7 +549,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
        dy = *y1 - y0;

        *y1 = 0;
-        if (dy)
+        if (dx)
            *x1 = ((0-y0)*dx)/dy + x0;
    }
 }
@@ -678,13 +561,10 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
 #define RTCD_VTABLE(oci) NULL
 #endif

-int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
+int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
 {
    char message[512];
    int q = oci->filter_level * 10 / 6;
-    int flags = ppflags->post_proc_flag;
-    int deblock_level = ppflags->deblocking_level;
-    int noise_level = ppflags->noise_level;

    if (!oci->frame_to_show)
        return -1;
@@ -741,8 +621,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
         oci->post_proc_buffer.y_stride);
    }

-#if CONFIG_POSTPROC_VISUALIZER
-    if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
+    if (flags & VP8D_DEBUG_LEVEL1)
    {
        sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
                (oci->frame_type == KEY_FRAME),
@@ -754,7 +633,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
    }

-    if (flags & VP8D_DEBUG_TXT_MBLK_MODES)
+    if (flags & VP8D_DEBUG_LEVEL2)
    {
        int i, j;
        unsigned char *y_ptr;
@@ -786,7 +665,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
        }
    }

-    if (flags & VP8D_DEBUG_TXT_DC_DIFF)
+    if (flags & VP8D_DEBUG_LEVEL3)
    {
        int i, j;
        unsigned char *y_ptr;
@@ -821,14 +700,45 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
        }
    }

-    if (flags & VP8D_DEBUG_TXT_RATE_INFO)
+    if (flags & VP8D_DEBUG_LEVEL4)
    {
        sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
+#if 0
+        int i, j;
+        unsigned char *y_ptr;
+        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+        int mb_rows = post->y_height >> 4;
+        int mb_cols = post->y_width  >> 4;
+        int mb_index = 0;
+        MODE_INFO *mi = oci->mi;
+
+        y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+        /* vp8_filter each macro block */
+        for (i = 0; i < mb_rows; i++)
+        {
+            for (j = 0; j < mb_cols; j++)
+            {
+                char zz[4];
+
+                sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
+                vp8_blit_text(zz, y_ptr, post->y_stride);
+                mb_index ++;
+                y_ptr += 16;
+            }
+
+            mb_index ++; /* border */
+            y_ptr += post->y_stride  * 16 - post->y_width;
+
+        }
+
+#endif
+
    }

    /* Draw motion vectors */
-    if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag)
+    if (flags & VP8D_DEBUG_LEVEL5)
    {
        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
        int width  = post->y_width;
@@ -839,144 +749,29 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
        MODE_INFO *mi = oci->mi;
        int x0, y0;

-        for (y0 = 0; y0 < height; y0 += 16)
+        for (y0 = 8; y0 < (height + 8); y0 += 16)
        {
-            for (x0 = 0; x0 < width; x0 += 16)
+            for (x0 = 8; x0 < (width + 8); x0 += 16)
            {
-                int x1, y1;
-
-                if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode)))
-                {
-                    mi++;
-                    continue;
-                }
-
-                if (mi->mbmi.mode == SPLITMV)
-                {
-                    switch (mi->mbmi.partitioning)
-                    {
-                        case 0 :    /* mv_top_bottom */
-                        {
-                            B_MODE_INFO *bmi = &mi->bmi[0];
-                            MV *mv = &bmi->mv.as_mv;
-
-                            x1 = x0 + 8 + (mv->col >> 3);
-                            y1 = y0 + 4 + (mv->row >> 3);
-
-                            constrain_line (x0+8, &x1, y0+4, &y1, width, height);
-                            vp8_blit_line  (x0+8,  x1, y0+4,  y1, y_buffer, y_stride);
-
-                            bmi = &mi->bmi[8];
-
-                            x1 = x0 + 8 + (mv->col >> 3);
-                            y1 = y0 +12 + (mv->row >> 3);
-
-                            constrain_line (x0+8, &x1, y0+12, &y1, width, height);
-                            vp8_blit_line  (x0+8,  x1, y0+12,  y1, y_buffer, y_stride);
-
-                            break;
-                        }
-                        case 1 :    /* mv_left_right */
-                        {
-                            B_MODE_INFO *bmi = &mi->bmi[0];
-                            MV *mv = &bmi->mv.as_mv;
-
-                            x1 = x0 + 4 + (mv->col >> 3);
-                            y1 = y0 + 8 + (mv->row >> 3);
-
-                            constrain_line (x0+4, &x1, y0+8, &y1, width, height);
-                            vp8_blit_line  (x0+4,  x1, y0+8,  y1, y_buffer, y_stride);
-
-                            bmi = &mi->bmi[2];
-
-                            x1 = x0 +12 + (mv->col >> 3);
-                            y1 = y0 + 8 + (mv->row >> 3);
-
-                            constrain_line (x0+12, &x1, y0+8, &y1, width, height);
-                            vp8_blit_line  (x0+12,  x1, y0+8,  y1, y_buffer, y_stride);
-
-                            break;
-                        }
-                        case 2 :    /* mv_quarters   */
-                        {
-                            B_MODE_INFO *bmi = &mi->bmi[0];
-                            MV *mv = &bmi->mv.as_mv;
-
-                            x1 = x0 + 4 + (mv->col >> 3);
-                            y1 = y0 + 4 + (mv->row >> 3);
-
-                            constrain_line (x0+4, &x1, y0+4, &y1, width, height);
-                            vp8_blit_line  (x0+4,  x1, y0+4,  y1, y_buffer, y_stride);
-
-                            bmi = &mi->bmi[2];
-
-                            x1 = x0 +12 + (mv->col >> 3);
-                            y1 = y0 + 4 + (mv->row >> 3);
-
-                            constrain_line (x0+12, &x1, y0+4, &y1, width, height);
-                            vp8_blit_line  (x0+12,  x1, y0+4,  y1, y_buffer, y_stride);
-
-                            bmi = &mi->bmi[8];
-
-                            x1 = x0 + 4 + (mv->col >> 3);
-                            y1 = y0 +12 + (mv->row >> 3);
-
-                            constrain_line (x0+4, &x1, y0+12, &y1, width, height);
-                            vp8_blit_line  (x0+4,  x1, y0+12,  y1, y_buffer, y_stride);
-
-                            bmi = &mi->bmi[10];
-
-                            x1 = x0 +12 + (mv->col >> 3);
-                            y1 = y0 +12 + (mv->row >> 3);
-
-                            constrain_line (x0+12, &x1, y0+12, &y1, width, height);
-                            vp8_blit_line  (x0+12,  x1, y0+12,  y1, y_buffer, y_stride);
-                            break;
-                        }
-                        default :
-                        {
-                            B_MODE_INFO *bmi = mi->bmi;
-                            int bx0, by0;
-
-                            for (by0 = y0; by0 < (y0+16); by0 += 4)
-                            {
-                                for (bx0 = x0; bx0 < (x0+16); bx0 += 4)
-                                {
-                                    MV *mv = &bmi->mv.as_mv;
-
-                                    x1 = bx0 + 2 + (mv->col >> 3);
-                                    y1 = by0 + 2 + (mv->row >> 3);
-
-                                    constrain_line (bx0+2, &x1, by0+2, &y1, width, height);
-                                    vp8_blit_line  (bx0+2,  x1, by0+2,  y1, y_buffer, y_stride);
-
-                                    bmi++;
-                                }
-                            }
-                        }
-                    }
-                }
-                else if (mi->mbmi.mode >= NEARESTMV)
+               int x1, y1;
+               if (mi->mbmi.mode >= NEARESTMV)
                {
                    MV *mv = &mi->mbmi.mv.as_mv;
-                    const int lx0 = x0 + 8;
-                    const int ly0 = y0 + 8;

-                    x1 = lx0 + (mv->col >> 3);
-                    y1 = ly0 + (mv->row >> 3);
+                    x1 = x0 + (mv->col >> 3);
+                    y1 = y0 + (mv->row >> 3);

-                    if (x1 != lx0 && y1 != ly0)
+                    if (x1 != x0 && y1 != y0)
                    {
-                        constrain_line (lx0, &x1, ly0-1, &y1, width, height);
-                        vp8_blit_line  (lx0,  x1, ly0-1,  y1, y_buffer, y_stride);
+                        constrain_line (x0, &x1, y0-1, &y1, width, height);
+                        vp8_blit_line  (x0,  x1, y0-1,  y1, y_buffer, y_stride);

-                        constrain_line (lx0, &x1, ly0+1, &y1, width, height);
-                        vp8_blit_line  (lx0,  x1, ly0+1,  y1, y_buffer, y_stride);
+                        constrain_line (x0, &x1, y0+1, &y1, width, height);
+                        vp8_blit_line  (x0,  x1, y0+1,  y1, y_buffer, y_stride);
                    }
                    else
-                        vp8_blit_line  (lx0,  x1, ly0,  y1, y_buffer, y_stride);
+                        vp8_blit_line  (x0,  x1, y0,  y1, y_buffer, y_stride);
                }
-
                mi++;
            }
            mi++;
@@ -984,10 +779,9 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
    }

    /* Color in block modes */
-    if ((flags & VP8D_DEBUG_CLR_BLK_MODES)
-        && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag))
+    if (flags & VP8D_DEBUG_LEVEL6)
    {
-        int y, x;
+        int i, j;
        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
        int width  = post->y_width;
        int height = post->y_height;
@@ -997,54 +791,18 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
        int y_stride = oci->post_proc_buffer.y_stride;
        MODE_INFO *mi = oci->mi;

-        for (y = 0; y < height; y += 16)
+        for (i = 0; i < height; i += 16)
        {
-            for (x = 0; x < width; x += 16)
+            for (j = 0; j < width; j += 16)
            {
                int Y = 0, U = 0, V = 0;

-                if (mi->mbmi.mode == B_PRED &&
-                    ((ppflags->display_mb_modes_flag & B_PRED) || ppflags->display_b_modes_flag))
-                {
-                    int by, bx;
-                    unsigned char *yl, *ul, *vl;
-                    B_MODE_INFO *bmi = mi->bmi;
+                Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
+                U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
+                V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];

-                    yl = y_ptr + x;
-                    ul = u_ptr + (x>>1);
-                    vl = v_ptr + (x>>1);
-
-                    for (by = 0; by < 16; by += 4)
-                    {
-                        for (bx = 0; bx < 16; bx += 4)
-                        {
-                            if ((ppflags->display_b_modes_flag & (1<<mi->mbmi.mode))
-                                || (ppflags->display_mb_modes_flag & B_PRED))
-                            {
-                                Y = B_PREDICTION_MODE_colors[bmi->mode][0];
-                                U = B_PREDICTION_MODE_colors[bmi->mode][1];
-                                V = B_PREDICTION_MODE_colors[bmi->mode][2];
-
-                                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
-                                    (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
-                            }
-                            bmi++;
-                        }
-
-                        yl += y_stride*4;
-                        ul += y_stride*1;
-                        vl += y_stride*1;
-                    }
-                }
-                else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode))
-                {
-                    Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
-                    U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
-                    V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
-
-                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)
-                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
-                }
+                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)
+                    (&y_ptr[j], &u_ptr[j>>1], &v_ptr[j>>1], Y, U, V, 0xc000, y_stride);

                mi++;
            }
@@ -1057,9 +815,9 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
    }

    /* Color in frame reference blocks */
-    if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag)
+    if (flags & VP8D_DEBUG_LEVEL7)
    {
-        int y, x;
+        int i, j;
        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
        int width  = post->y_width;
        int height = post->y_height;
@@ -1069,21 +827,18 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
        int y_stride = oci->post_proc_buffer.y_stride;
        MODE_INFO *mi = oci->mi;

-        for (y = 0; y < height; y += 16)
+        for (i = 0; i < height; i += 16)
        {
-            for (x = 0; x < width; x +=16)
+            for (j = 0; j < width; j +=16)
            {
                int Y = 0, U = 0, V = 0;

-                if (ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame))
-                {
-                    Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
-                    U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
-                    V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+                Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+                U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+                V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];

-                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)
-                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
-                }
+                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)
+                    (&y_ptr[j], &u_ptr[j>>1], &v_ptr[j>>1], Y, U, V, 0xc000, y_stride);

                mi++;
            }
@@ -1094,7 +849,6 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
            mi++;
        }
    }
-#endif

    *dest = oci->post_proc_buffer;

--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -24,15 +24,7 @@
              char whiteclamp[16], char bothclamp[16],\
              unsigned int w, unsigned int h, int pitch)

-#define prototype_postproc_blend_mb_inner(sym)\
-    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
-              int y1, int u1, int v1, int alpha, int stride)
-
-#define prototype_postproc_blend_mb_outer(sym)\
-    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
-              int y1, int u1, int v1, int alpha, int stride)
-
-#define prototype_postproc_blend_b(sym)\
+#define prototype_postproc_blend_mb(sym)\
    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
              int y1, int u1, int v1, int alpha, int stride)

@@ -60,36 +52,22 @@ extern prototype_postproc(vp8_postproc_downacross);
 #endif
 extern prototype_postproc_addnoise(vp8_postproc_addnoise);

-#ifndef vp8_postproc_blend_mb_inner
-#define vp8_postproc_blend_mb_inner vp8_blend_mb_inner_c
+#ifndef vp8_postproc_blend_mb
+#define vp8_postproc_blend_mb vp8_blend_mb_c
 #endif
-extern prototype_postproc_blend_mb_inner(vp8_postproc_blend_mb_inner);
-
-#ifndef vp8_postproc_blend_mb_outer
-#define vp8_postproc_blend_mb_outer vp8_blend_mb_outer_c
-#endif
-extern prototype_postproc_blend_mb_outer(vp8_postproc_blend_mb_outer);
-
-#ifndef vp8_postproc_blend_b
-#define vp8_postproc_blend_b vp8_blend_b_c
-#endif
-extern prototype_postproc_blend_b(vp8_postproc_blend_b);
+extern prototype_postproc_blend_mb(vp8_postproc_blend_mb);

 typedef prototype_postproc((*vp8_postproc_fn_t));
 typedef prototype_postproc_inplace((*vp8_postproc_inplace_fn_t));
 typedef prototype_postproc_addnoise((*vp8_postproc_addnoise_fn_t));
-typedef prototype_postproc_blend_mb_inner((*vp8_postproc_blend_mb_inner_fn_t));
-typedef prototype_postproc_blend_mb_outer((*vp8_postproc_blend_mb_outer_fn_t));
-typedef prototype_postproc_blend_b((*vp8_postproc_blend_b_fn_t));
+typedef prototype_postproc_blend_mb((*vp8_postproc_blend_mb_fn_t));
 typedef struct
 {
-    vp8_postproc_inplace_fn_t           down;
-    vp8_postproc_inplace_fn_t           across;
-    vp8_postproc_fn_t                   downacross;
-    vp8_postproc_addnoise_fn_t          addnoise;
-    vp8_postproc_blend_mb_inner_fn_t    blend_mb_inner;
-    vp8_postproc_blend_mb_outer_fn_t    blend_mb_outer;
-    vp8_postproc_blend_b_fn_t           blend_b;
+    vp8_postproc_inplace_fn_t   down;
+    vp8_postproc_inplace_fn_t   across;
+    vp8_postproc_fn_t           downacross;
+    vp8_postproc_addnoise_fn_t  addnoise;
+    vp8_postproc_blend_mb_fn_t  blend_mb;
 } vp8_postproc_rtcd_vtable_t;

 #if CONFIG_RUNTIME_CPU_DETECT
@@ -111,7 +89,7 @@ struct postproc_state
 #include "onyxc_int.h"
 #include "ppflags.h"
 int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
-                        vp8_ppflags_t *flags);
+                        int deblock_level, int noise_level, int flags);


 void vp8_de_noise(YV12_BUFFER_CONFIG         *source,
--- a/vp8/common/ppflags.h
+++ b/vp8/common/ppflags.h
@@ -13,28 +13,17 @@
 #define __INC_PPFLAGS_H
 enum
 {
-    VP8D_NOFILTERING            = 0,
-    VP8D_DEBLOCK                = 1<<0,
-    VP8D_DEMACROBLOCK           = 1<<1,
-    VP8D_ADDNOISE               = 1<<2,
-    VP8D_DEBUG_TXT_FRAME_INFO   = 1<<3,
-    VP8D_DEBUG_TXT_MBLK_MODES   = 1<<4,
-    VP8D_DEBUG_TXT_DC_DIFF      = 1<<5,
-    VP8D_DEBUG_TXT_RATE_INFO    = 1<<6,
-    VP8D_DEBUG_DRAW_MV          = 1<<7,
-    VP8D_DEBUG_CLR_BLK_MODES    = 1<<8,
-    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
+    VP8D_NOFILTERING    = 0,
+    VP8D_DEBLOCK        = 1<<0,
+    VP8D_DEMACROBLOCK   = 1<<1,
+    VP8D_ADDNOISE       = 1<<2,
+    VP8D_DEBUG_LEVEL1   = 1<<3,
+    VP8D_DEBUG_LEVEL2   = 1<<4,
+    VP8D_DEBUG_LEVEL3   = 1<<5,
+    VP8D_DEBUG_LEVEL4   = 1<<6,
+    VP8D_DEBUG_LEVEL5   = 1<<7,
+    VP8D_DEBUG_LEVEL6   = 1<<8,
+    VP8D_DEBUG_LEVEL7   = 1<<9
 };

-typedef struct
-{
-    int post_proc_flag;
-    int deblocking_level;
-    int noise_level;
-    int display_ref_frame_flag;
-    int display_mb_modes_flag;
-    int display_b_modes_flag;
-    int display_mv_flag;
-} vp8_ppflags_t;
-
 #endif
--- a/vp8/common/preproc.h
+++ b/vp8/common/preproc.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     preproc.h
+*
+*   Description  :     simple preprocessor
+*
+****************************************************************************/
+
+#ifndef __INC_PREPROC_H
+#define __INC_PREPROC_H
+
+/****************************************************************************
+*  Types
+****************************************************************************/
+
+typedef struct
+{
+    unsigned char *frame_buffer;
+    int frame;
+    unsigned int *fixed_divide;
+
+    unsigned char *frame_buffer_alloc;
+    unsigned int *fixed_divide_alloc;
+} pre_proc_instance;
+
+/****************************************************************************
+*  Functions.
+****************************************************************************/
+void pre_proc_machine_specific_config(void);
+void delete_pre_proc(pre_proc_instance *ppi);
+int init_pre_proc(pre_proc_instance *ppi, int frame_size);
+extern void spatial_filter_c(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int width, int height, int pitch, int strength);
+extern void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
+
+#endif
--- a/vp8/common/preprocif.h
+++ b/vp8/common/preprocif.h
@@ -0,0 +1,76 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     preprocif.h
+*
+*   Description  :     Pre-processor interface header file.
+*
+****************************************************************************/
+
+#ifndef __PREPROC_IF_H
+#define __PREPROC_IF_H
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "type_aliases.h"
+
+/****************************************************************************
+*  Types
+****************************************************************************/
+
+typedef struct
+{
+    UINT8 *Yuv0ptr;
+    UINT8 *Yuv1ptr;
+
+    UINT8   *frag_info;              // blocks coded : passed in
+    UINT32   frag_info_element_size;   // size of each element
+    UINT32   frag_info_coded_mask;     // mask to get at whether fragment is coded
+
+    UINT32 *region_index;            // Gives pixel index for top left of each block
+    UINT32 video_frame_height;
+    UINT32 video_frame_width;
+    UINT8 hfrag_pixels;
+    UINT8 vfrag_pixels;
+
+} SCAN_CONFIG_DATA;
+
+typedef enum
+{
+    SCP_FILTER_ON_OFF,
+    SCP_SET_SRF_OFFSET,
+    SCP_SET_EBO_ON_OFF,
+    SCP_SET_VCAP_LEVEL_OFFSET,
+    SCP_SET_SHOW_LOCAL
+
+} SCP_SETTINGS;
+
+typedef struct PP_INSTANCE *x_pp_inst;
+
+/****************************************************************************
+*  Module statics
+****************************************************************************/
+/* Controls whether Early break out is on or off in default case */
+#define EARLY_BREAKOUT_DEFAULT  TRUE
+
+/****************************************************************************
+*  Functions
+****************************************************************************/
+extern  void set_scan_param(x_pp_inst ppi, UINT32 param_id, INT32 param_value);
+extern  UINT32 yuvanalyse_frame(x_pp_inst ppi, UINT32 *KFIndicator);
+extern  x_pp_inst create_pp_instance(void);
+extern  void delete_pp_instance(x_pp_inst *);
+extern  BOOL scan_yuvinit(x_pp_inst,  SCAN_CONFIG_DATA *scan_config_ptr);
+
+#endif
--- a/vp8/decoder/decoderthreading.h
+++ b/vp8/decoder/decoderthreading.h
@@ -19,7 +19,7 @@
 extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
 extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
 extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
-extern void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+extern int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
 extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
 #endif

--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -506,7 +506,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
    pbi->common.error.setjmp = 0;
    return retcode;
 }
-int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags)
+int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags)
 {
    int ret = -1;
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -524,7 +524,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp,

    sd->clrtype = pbi->common.clr_type;
 #if CONFIG_POSTPROC
-    ret = vp8_post_proc_frame(&pbi->common, sd, flags);
+    ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
 #else

    if (pbi->common.frame_to_show)
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -596,7 +596,7 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
 }


-void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
+int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
 {
 #if CONFIG_MULTITHREAD
    VP8_COMMON *const pc = & pbi->common;
@@ -647,6 +647,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
        for (i=0; i< pc->mb_rows; i++)
            CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
    }
+    return 0;
 #else
    (void) pbi;
    (void) width;
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -29,9 +29,10 @@
    push    {r4-r11, lr}

    ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
-    ;  sizeof (TOKENEXTRA) is 8
+    ;  sizeof (TOKENEXTRA) is 20
+    add     r2, r2, r2, lsl #2          ; xcount * 5
    sub     sp, sp, #12
-    add     r2, r1, r2, lsl #3          ; stop = p + xcount*sizeof(TOKENEXTRA)
+    add     r2, r1, r2, lsl #2          ; stop = p + xcount*sizeof(TOKENEXTRA)
    str     r2, [sp, #0]
    str     r3, [sp, #8]                ; save vp8_coef_encodings
    ldr     r2, [r0, #vp8_writer_lowvalue]
@@ -40,13 +41,13 @@
    b       check_p_lt_stop

 while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r6, [r1, #tokenextra_token] ; t
    ldr     r4, [sp, #8]                ; vp8_coef_encodings
    mov     lr, #0
    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
    ldr     r9, [r1, #tokenextra_context_tree]   ; pp

-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+    ldr     r7, [r1, #tokenextra_skip_eob_node]

    ldr     r6, [r4, #vp8_token_value]  ; v
    ldr     r8, [r4, #vp8_token_len]    ; n
@@ -141,11 +142,12 @@ token_count_lt_zero
    subs    r8, r8, #1                  ; --n
    bne     token_loop

-    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r6, [r1, #tokenextra_token] ; t
    ldr     r7, [sp, #48]               ; vp8_extra_bits
    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here vp8_extra_bit_struct == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+    ;  element.  Here sizeof (vp8_extra_bit_struct) == 20
+    add     r6, r6, r6, lsl #2          ; t * 5
+    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t

    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
    cmp     r4, #0
@@ -153,7 +155,7 @@ token_count_lt_zero

 ;   if( b->base_val)
    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
    cmp     r8, #0                      ; if( L)
    beq     no_extra_bits

--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -62,13 +62,13 @@ mb_row_loop
    ; actuall work gets done here!

 while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r6, [r1, #tokenextra_token] ; t
    ldr     r4, [sp, #20]               ; vp8_coef_encodings
    mov     lr, #0
    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
    ldr     r9, [r1, #tokenextra_context_tree]   ; pp

-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+    ldr     r7, [r1, #tokenextra_skip_eob_node]

    ldr     r6, [r4, #vp8_token_value]  ; v
    ldr     r8, [r4, #vp8_token_len]    ; n
@@ -163,11 +163,12 @@ token_count_lt_zero
    subs    r8, r8, #1                  ; --n
    bne     token_loop

-    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r6, [r1, #tokenextra_token] ; t
    ldr     r7, [sp, #8]                ; vp8_extra_bits
    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here vp8_extra_bit_struct == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+    ;  element.  Here sizeof (vp8_extra_bit_struct) == 20
+    add     r6, r6, r6, lsl #2          ; t * 5
+    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t

    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
    cmp     r4, #0
@@ -175,7 +176,7 @@ token_count_lt_zero

 ;   if( b->base_val)
    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
    cmp     r8, #0                      ; if( L)
    beq     no_extra_bits

--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -90,13 +90,13 @@ mb_row_loop
    ; actual work gets done here!

 while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r6, [r1, #tokenextra_token] ; t
    ldr     r4, [sp, #80]               ; vp8_coef_encodings
    mov     lr, #0
    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
    ldr     r9, [r1, #tokenextra_context_tree]   ; pp

-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+    ldr     r7, [r1, #tokenextra_skip_eob_node]

    ldr     r6, [r4, #vp8_token_value]  ; v
    ldr     r8, [r4, #vp8_token_len]    ; n
@@ -191,11 +191,12 @@ token_count_lt_zero
    subs    r8, r8, #1                  ; --n
    bne     token_loop

-    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r6, [r1, #tokenextra_token] ; t
    ldr     r7, [sp, #84]                ; vp8_extra_bits
    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here vp8_extra_bit_struct == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+    ;  element.  Here sizeof (vp8_extra_bit_struct) == 20
+    add     r6, r6, r6, lsl #2          ; t * 5
+    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t

    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
    cmp     r4, #0
@@ -203,7 +204,7 @@ token_count_lt_zero

 ;   if( b->base_val)
    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
    cmp     r8, #0                      ; if( L)
    beq     no_extra_bits

--- a/vp8/encoder/arm/quantize_arm.c
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -29,7 +29,7 @@ extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, shor

 void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
 {
-    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast);
+    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant);
 }

 /*
--- a/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
+++ b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
@@ -51,6 +51,7 @@ DEFINE(vp8_token_len,                           offsetof(vp8_token, Len));

 DEFINE(vp8_extra_bit_struct_tree,                 offsetof(vp8_extra_bit_struct, tree));
 DEFINE(vp8_extra_bit_struct_prob,                 offsetof(vp8_extra_bit_struct, prob));
+DEFINE(vp8_extra_bit_struct_prob_bc,               offsetof(vp8_extra_bit_struct, prob_bc));
 DEFINE(vp8_extra_bit_struct_len,                  offsetof(vp8_extra_bit_struct, Len));
 DEFINE(vp8_extra_bit_struct_base_val,              offsetof(vp8_extra_bit_struct, base_val));

@@ -66,8 +67,8 @@ DEFINE(vp8_common_mb_rows,                       offsetof(VP8_COMMON, mb_rows));

 // These two sizes are used in vp7cx_pack_tokens.  They are hard coded
 //  so if the size changes this will have to be adjusted.
-ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
-ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 20)
+ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 20)

 //add asserts for any offset that is not supported by assembly code
 //add asserts for any size that is not supported by assembly code
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -33,7 +33,6 @@ typedef struct

    // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
    short *quant;
-    short *quant_fast;
    short *quant_shift;
    short *zbin;
    short *zrun_zbin_boost;
@@ -82,7 +81,6 @@ typedef struct
    int errthresh;
    int rddiv;
    int rdmult;
-    INT64 activity_sum;

    int mvcosts[2][MVvals+1];
    int *mvcost[2];
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -62,6 +62,7 @@ unsigned int b_modes[14]  = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

 static const int qrounding_factors[129] =
 {
+    56, 56, 56, 56, 48, 48, 56, 56,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
@@ -77,18 +78,12 @@ static const int qrounding_factors[129] =
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48,
-    48
+    48,
 };

 static const int qzbin_factors[129] =
 {
-    84, 84, 84, 84, 84, 84, 84, 84,
-    84, 84, 84, 84, 84, 84, 84, 84,
-    84, 84, 84, 84, 84, 84, 84, 84,
-    84, 84, 84, 84, 84, 84, 84, 84,
-    84, 84, 84, 84, 84, 84, 84, 84,
-    84, 84, 84, 84, 84, 84, 84, 84,
+    72, 72, 72, 72, 80, 80, 72, 72,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
@@ -99,11 +94,17 @@ static const int qzbin_factors[129] =
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
-    80
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80,
 };

 static const int qrounding_factors_y2[129] =
 {
+    56, 56, 56, 56, 48, 48, 56, 56,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
@@ -119,18 +120,12 @@ static const int qrounding_factors_y2[129] =
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
    48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48,
-    48
+    48,
 };

 static const int qzbin_factors_y2[129] =
 {
-    84, 84, 84, 84, 84, 84, 84, 84,
-    84, 84, 84, 84, 84, 84, 84, 84,
-    84, 84, 84, 84, 84, 84, 84, 84,
-    84, 84, 84, 84, 84, 84, 84, 84,
-    84, 84, 84, 84, 84, 84, 84, 84,
-    84, 84, 84, 84, 84, 84, 84, 84,
+    72, 72, 72, 72, 80, 80, 72, 72,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
@@ -141,30 +136,26 @@ static const int qzbin_factors_y2[129] =
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
    80, 80, 80, 80, 80, 80, 80, 80,
-    80
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80,
 };

-#define EXACT_QUANT
+//#define EXACT_QUANT
 #ifdef EXACT_QUANT
-static void vp8cx_invert_quant(int improved_quant, short *quant,
-                               short *shift, short d)
+static void vp8cx_invert_quant(short *quant, short *shift, short d)
 {
-    if(improved_quant)
-    {
-        unsigned t;
-        int l;
-        t = d;
-        for(l = 0; t > 1; l++)
-            t>>=1;
-        t = 1 + (1<<(16+l))/d;
-        *quant = (short)(t - (1<<16));
-        *shift = l;
-    }
-    else
-    {
-        *quant = (1 << 16) / d;
-        *shift = 0;
-    }
+    unsigned t;
+    int l;
+    t = d;
+    for(l = 0; t > 1; l++)
+        t>>=1;
+    t = 1 + (1<<(16+l))/d;
+    *quant = (short)(t - (1<<16));
+    *shift = l;
 }

 void vp8cx_init_quantizer(VP8_COMP *cpi)
@@ -179,8 +170,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
    {
        // dc values
        quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
-        cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val;
-        vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,
+        vp8cx_invert_quant(cpi->Y1quant[Q] + 0,
                           cpi->Y1quant_shift[Q] + 0, quant_val);
        cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
        cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -188,8 +178,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
        cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;

        quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
-        cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val;
-        vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0,
+        vp8cx_invert_quant(cpi->Y2quant[Q] + 0,
                           cpi->Y2quant_shift[Q] + 0, quant_val);
        cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
        cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
@@ -197,8 +186,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
        cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;

        quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
-        cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val;
-        vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0,
+        vp8cx_invert_quant(cpi->UVquant[Q] + 0,
                           cpi->UVquant_shift[Q] + 0, quant_val);
        cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
        cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -211,8 +199,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
            int rc = vp8_default_zig_zag1d[i];

            quant_val = vp8_ac_yquant(Q);
-            cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val;
-            vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc,
+            vp8cx_invert_quant(cpi->Y1quant[Q] + rc,
                               cpi->Y1quant_shift[Q] + rc, quant_val);
            cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
            cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -220,8 +207,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
            cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;

            quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
-            cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;
-            vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
+            vp8cx_invert_quant(cpi->Y2quant[Q] + rc,
                               cpi->Y2quant_shift[Q] + rc, quant_val);
            cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
            cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
@@ -229,8 +215,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
            cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;

            quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-            cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;
-            vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
+            vp8cx_invert_quant(cpi->UVquant[Q] + rc,
                               cpi->UVquant_shift[Q] + rc, quant_val);
            cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
            cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -331,7 +316,6 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
    for (i = 0; i < 16; i++)
    {
        x->block[i].quant = cpi->Y1quant[QIndex];
-        x->block[i].quant_fast = cpi->Y1quant_fast[QIndex];
        x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
        x->block[i].zbin = cpi->Y1zbin[QIndex];
        x->block[i].round = cpi->Y1round[QIndex];
@@ -346,7 +330,6 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
    for (i = 16; i < 24; i++)
    {
        x->block[i].quant = cpi->UVquant[QIndex];
-        x->block[i].quant_fast = cpi->UVquant_fast[QIndex];
        x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
        x->block[i].zbin = cpi->UVzbin[QIndex];
        x->block[i].round = cpi->UVround[QIndex];
@@ -357,7 +340,6 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)

    // Y2
    zbin_extra = (cpi->common.Y2dequant[QIndex][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
-    x->block[24].quant_fast = cpi->Y2quant_fast[QIndex];
    x->block[24].quant = cpi->Y2quant[QIndex];
    x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
    x->block[24].zbin = cpi->Y2zbin[QIndex];
@@ -369,9 +351,6 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)

 void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
 {
-    // Clear Zbin mode boost for default case
-    cpi->zbin_mode_boost = 0;
-
    // vp8cx_init_quantizer() is first called in vp8_create_compressor(). A check is added here so that vp8cx_init_quantizer() is only called
    // when these values are not all zero.
    if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q)
@@ -384,62 +363,6 @@ void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
 }


-/* activity_avg must be positive, or flat regions could get a zero weight
- *  (infinite lambda), which confounds analysis.
- * This also avoids the need for divide by zero checks in
- *  vp8_activity_masking().
- */
-#define VP8_ACTIVITY_AVG_MIN (64)
-
-/* This is used as a reference when computing the source variance for the
- *  purposes of activity masking.
- * Eventually this should be replaced by custom no-reference routines,
- *  which will be faster.
- */
-static const unsigned char VP8_VAR_OFFS[16]=
-{
-    128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-};
-
-unsigned int vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
-{
-    unsigned int act;
-    unsigned int sse;
-    int sum;
-    unsigned int a;
-    unsigned int b;
-    unsigned int d;
-    /* TODO: This could also be done over smaller areas (8x8), but that would
-     *  require extensive changes elsewhere, as lambda is assumed to be fixed
-     *  over an entire MB in most of the code.
-     * Another option is to compute four 8x8 variances, and pick a single
-     *  lambda using a non-linear combination (e.g., the smallest, or second
-     *  smallest, etc.).
-     */
-    VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer,
-     x->src.y_stride, VP8_VAR_OFFS, 0, &sse, &sum);
-    /* This requires a full 32 bits of precision. */
-    act = (sse<<8) - sum*sum;
-    /* Drop 4 to give us some headroom to work with. */
-    act = (act + 8) >> 4;
-    /* If the region is flat, lower the activity some more. */
-    if (act < 8<<12)
-        act = act < 5<<12 ? act : 5<<12;
-    /* TODO: For non-flat regions, edge regions should receive less masking
-     *  than textured regions, but identifying edge regions quickly and
-     *  reliably enough is still a subject of experimentation.
-     * This will be most noticable near edges with a complex shape (e.g.,
-     *  text), but the 4x4 transform size should make this less of a problem
-     *  than it would be for an 8x8 transform.
-     */
-    /* Apply the masking to the RD multiplier. */
-    a = act + 4*cpi->activity_avg;
-    b = 4*act + cpi->activity_avg;
-    x->rdmult = (unsigned int)(((INT64)x->rdmult*b + (a>>1))/a);
-    return act;
-}
-
-

 static
 void encode_mb_row(VP8_COMP *cpi,
@@ -451,7 +374,6 @@ void encode_mb_row(VP8_COMP *cpi,
                   int *segment_counts,
                   int *totalrate)
 {
-    INT64 activity_sum = 0;
    int i;
    int recon_yoffset, recon_uvoffset;
    int mb_col;
@@ -480,14 +402,14 @@ void encode_mb_row(VP8_COMP *cpi,
    // Set up limit values for vertical motion vector components
    // to prevent them extending beyond the UMV borders
    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
+    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) 
                        + (VP8BORDERINPIXELS - 16);

    // for each macroblock col in image
    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
    {
-        // Distance of Mb to the left & right edges, specified in
-        // 1/8th pel units as they are always compared to values
+        // Distance of Mb to the left & right edges, specified in 
+        // 1/8th pel units as they are always compared to values 
        // that are in 1/8th pel units
        xd->mb_to_left_edge = -((mb_col * 16) << 3);
        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
@@ -495,7 +417,7 @@ void encode_mb_row(VP8_COMP *cpi,
        // Set up limit values for horizontal motion vector components
        // to prevent them extending beyond the UMV borders
        x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
-        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
+        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) 
                            + (VP8BORDERINPIXELS - 16);

        xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
@@ -503,12 +425,6 @@ void encode_mb_row(VP8_COMP *cpi,
        xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
        xd->left_available = (mb_col != 0);

-        x->rddiv = cpi->RDDIV;
-        x->rdmult = cpi->RDMULT;
-
-        if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
-            activity_sum += vp8_activity_masking(cpi, x);
-
        // Is segmentation enabled
        // MB level adjutment to quantizer
        if (xd->segmentation_enabled)
@@ -615,7 +531,6 @@ void encode_mb_row(VP8_COMP *cpi,
    // this is to account for the border
    xd->mode_info_context++;
    x->partition_info++;
-    x->activity_sum += activity_sum;
 }


@@ -732,7 +647,8 @@ void vp8_encode_frame(VP8_COMP *cpi)

    vp8_setup_block_ptrs(x);

-    x->activity_sum = 0;
+    x->rddiv = cpi->RDDIV;
+    x->rdmult = cpi->RDMULT;

 #if 0
    // Experimental rd code
@@ -787,12 +703,11 @@ void vp8_encode_frame(VP8_COMP *cpi)
        else
        {
 #if CONFIG_MULTITHREAD
-            int i;
-
            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);

            for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
            {
+                int i;
                cpi->current_mb_col_main = -1;

                for (i = 0; i < cpi->encoding_thread_count; i++)
@@ -870,11 +785,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
                totalrate += cpi->mb_row_ei[i].totalrate;
            }

-            for (i = 0; i < cpi->encoding_thread_count; i++)
-            {
-                x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;
-            }
-
 #endif

        }
@@ -1010,14 +920,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
    cpi->last_frame_distortion = cpi->frame_distortion;
 #endif

-    /* Update the average activity for the next frame.
-     * This is feed-forward for now; it could also be saved in two-pass, or
-     *  done during lookahead when that is eventually added.
-     */
-    cpi->activity_avg = (unsigned int )(x->activity_sum/cpi->common.MBs);
-    if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN)
-        cpi->activity_avg = VP8_ACTIVITY_AVG_MIN;
-
 }
 void vp8_setup_block_ptrs(MACROBLOCK *x)
 {
@@ -1279,18 +1181,7 @@ int vp8cx_encode_inter_macroblock

    if (cpi->sf.RD)
    {
-        /* Are we using the fast quantizer for the mode selection? */
-        if(cpi->sf.use_fastquant_for_pick)
-            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
-
        inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
-
-        /* switch back to the regular quantizer for the encode */
-        if (cpi->sf.improved_quant)
-        {
-            cpi->mb.quantize_b    = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
-        }
-
    }
    else
 #endif
@@ -1323,25 +1214,11 @@ int vp8cx_encode_inter_macroblock
        // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to supress noise
        if (cpi->zbin_mode_boost_enabled)
        {
-            if ( xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME )
-                 cpi->zbin_mode_boost = 0;
+            if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME))
+                cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
            else
-            {
-                if (xd->mode_info_context->mbmi.mode == ZEROMV)
-                {
-                    if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
-                        cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-                    else
-                        cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-                }
-                else if (xd->mode_info_context->mbmi.mode == SPLITMV)
-                    cpi->zbin_mode_boost = 0;
-                else
-                    cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-            }
+                cpi->zbin_mode_boost = 0;
        }
-        else
-            cpi->zbin_mode_boost = 0;

        vp8cx_mb_init_quantizer(cpi,  x);
    }
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -105,7 +105,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

 #if !(CONFIG_REALTIME_ONLY)
 #if 1
-    if (x->optimize)
+    if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
        vp8_optimize_mby(x, rtcd);

 #endif
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -243,9 +243,9 @@ struct vp8_token_state{
 };

 // TODO: experiments to find optimal multiple numbers
-#define Y1_RD_MULT 4
-#define UV_RD_MULT 2
-#define Y2_RD_MULT 16
+#define Y1_RD_MULT 1
+#define UV_RD_MULT 1
+#define Y2_RD_MULT 4

 static const int plane_rd_mult[4]=
 {
@@ -309,10 +309,8 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
    eob = d->eob;

    /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-    rdmult = mb->rdmult * err_mult;
-    if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
-        rdmult = (rdmult * 9)>>4;
-
+    /* TODO: These should vary with the block type, since the quantizer does. */
+    rdmult = (mb->rdmult << 2)*err_mult;
    rddiv = mb->rddiv;
    best_mask[0] = best_mask[1] = 0;
    /* Initialize the sentinel node of the trellis. */
@@ -635,7 +633,7 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    vp8_quantize_mb(x);

 #if !(CONFIG_REALTIME_ONLY)
-    if (x->optimize)
+    if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
        vp8_optimize_mb(x, rtcd);
 #endif

--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -61,7 +61,6 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                    int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
                    int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
                    volatile int *last_row_current_mb_col;
-                    INT64 activity_sum = 0;

                    if (ithread > 0)
                        last_row_current_mb_col = &cpi->mb_row_ei[ithread-1].current_mb_col;
@@ -112,12 +111,6 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                        xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
                        xd->left_available = (mb_col != 0);

-                        x->rddiv = cpi->RDDIV;
-                        x->rdmult = cpi->RDMULT;
-
-                        if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
-                            activity_sum += vp8_activity_masking(cpi, x);
-
                        // Is segmentation enabled
                        // MB level adjutment to quantizer
                        if (xd->segmentation_enabled)
@@ -133,7 +126,6 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                        else
                            xd->mode_info_context->mbmi.segment_id = 0;         // Set to Segment 0 by default

-                        x->active_ptr = cpi->active_map + seg_map_index + mb_col;

                        if (cm->frame_type == KEY_FRAME)
                        {
@@ -165,28 +157,8 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                            if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
                                cpi->inter_zz_count ++;

-                            // Special case code for cyclic refresh
-                            // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
-                            // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map
-                            if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
-                            {
-                                cpi->segmentation_map[seg_map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;
-
-                                // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
-                                // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
-                                // else mark it as dirty (1).
-                                if (xd->mode_info_context->mbmi.segment_id)
-                                    cpi->cyclic_refresh_map[seg_map_index+mb_col] = -1;
-                                else if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
-                                {
-                                    if (cpi->cyclic_refresh_map[seg_map_index+mb_col] == 1)
-                                        cpi->cyclic_refresh_map[seg_map_index+mb_col] = 0;
-                                }
-                                else
-                                    cpi->cyclic_refresh_map[seg_map_index+mb_col] = 1;
-
-                            }
                        }
+
                        cpi->tplist[mb_row].stop = *tp;

                        x->gf_active_ptr++;      // Increment pointer into gf useage flags structure for next mb
@@ -225,7 +197,6 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                    // this is to account for the border
                    xd->mode_info_context++;
                    x->partition_info++;
-                    x->activity_sum += activity_sum;

                    x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
                    x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
@@ -269,6 +240,8 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    z->sadperbit16      = x->sadperbit16;
    z->sadperbit4       = x->sadperbit4;
    z->errthresh        = x->errthresh;
+    z->rddiv            = x->rddiv;
+    z->rdmult           = x->rdmult;

    /*
    z->mv_col_min    = x->mv_col_min;
@@ -282,7 +255,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    z->vp8_short_fdct8x4     = x->vp8_short_fdct8x4;
    z->short_walsh4x4    = x->short_walsh4x4;
    z->quantize_b        = x->quantize_b;
-    z->optimize          = x->optimize;

    /*
    z->mvc              = x->mvc;
@@ -310,7 +282,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    for (i = 0; i < 25; i++)
    {
        z->block[i].quant           = x->block[i].quant;
-        z->block[i].quant_fast      = x->block[i].quant_fast;
        z->block[i].quant_shift     = x->block[i].quant_shift;
        z->block[i].zbin            = x->block[i].zbin;
        z->block[i].zrun_zbin_boost   = x->block[i].zrun_zbin_boost;
@@ -421,7 +392,8 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,

        vp8_setup_block_ptrs(mb);

-        mb->activity_sum = 0;
+        mb->rddiv = cpi->RDDIV;
+        mb->rdmult = cpi->RDMULT;

        mbd->left_context = &cm->left_context;
        mb->mvc = cm->fc.mvc;
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -472,7 +472,7 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;

    // Initial step/diamond search centred on best mv
-    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
+    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
    if ( tmp_err < INT_MAX-new_mv_mode_penalty )
        tmp_err += new_mv_mode_penalty;

@@ -495,7 +495,7 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
            num00--;
        else
        {
-            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
+            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
            if ( tmp_err < INT_MAX-new_mv_mode_penalty )
                tmp_err += new_mv_mode_penalty;

@@ -1145,7 +1145,6 @@ void vp8_init_second_pass(VP8_COMP *cpi)
    cpi->output_frame_rate = cpi->oxcf.frame_rate;
    cpi->bits_left = (long long)(cpi->total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
    cpi->bits_left -= (long long)(cpi->total_stats->duration * two_pass_min_rate / 10000000.0);
-    cpi->clip_bits_total = cpi->bits_left;

    vp8_avg_stats(cpi->total_stats);

@@ -1174,25 +1173,17 @@ void vp8_init_second_pass(VP8_COMP *cpi)
    {
        start_pos = cpi->stats_in;               // Note starting "file" position

-        cpi->modified_error_total = 0.0;
-        cpi->modified_error_used = 0.0;
+        cpi->modified_total_error_left = 0.0;

        while (vp8_input_stats(cpi, &this_frame) != EOF)
        {
-            cpi->modified_error_total += calculate_modified_err(cpi, &this_frame);
+            cpi->modified_total_error_left += calculate_modified_err(cpi, &this_frame);
        }
-        cpi->modified_error_left = cpi->modified_error_total;

        reset_fpf_position(cpi, start_pos);            // Reset file position

    }

-    // Calculate the clip target modified bits per error
-    // The observed bpe starts as the same number.
-    cpi->clip_bpe =  cpi->bits_left /
-                     DOUBLE_DIVIDE_CHECK(cpi->modified_error_total);
-    cpi->observed_bpe = cpi->clip_bpe;
-
    cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in;
 }

@@ -1448,7 +1439,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

        // Boost for arf frame
        Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
-        Boost += (i * 50);
+        Boost += (cpi->baseline_gf_interval * 50);
        allocation_chunks = (i * 100) + Boost;

        // Normalize Altboost and allocations chunck down to prevent overflow
@@ -1594,9 +1585,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    // Reset the file position
    reset_fpf_position(cpi, start_pos);

-    // Update the record of error used so far (only done once per gf group)
-    cpi->modified_error_used += gf_group_err;
-
    // Assign  bits to the arf or gf.
    {
        int Boost;
@@ -1750,6 +1738,16 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

        vp8_avg_stats(&sectionstats);

+        if (sectionstats.pcnt_motion < .17)
+            cpi->section_is_low_motion = 1;
+        else
+            cpi->section_is_low_motion = 0;
+
+        if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
+            cpi->section_is_fast_motion = 1;
+        else
+            cpi->section_is_fast_motion = 0;
+
        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);

        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
@@ -1894,16 +1892,6 @@ void vp8_second_pass(VP8_COMP *cpi)
    // Is this a GF / ARF (Note that a KF is always also a GF)
    if (cpi->frames_till_gf_update_due == 0)
    {
-        // Update monitor of the bits per error observed so far.
-        // Done once per gf group based on what has gone before
-        // so do nothing if this is the first frame.
-        if (cpi->common.current_video_frame > 0)
-        {
-            cpi->observed_bpe =
-                (double)(cpi->clip_bits_total - cpi->bits_left) /
-                cpi->modified_error_used;
-        }
-
        // Define next gf group and assign bits to it
        vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
        define_gf_group(cpi, &this_frame_copy);
@@ -1992,14 +1980,7 @@ void vp8_second_pass(VP8_COMP *cpi)
            cpi->ni_av_qi                     = cpi->worst_quality;
        }
    }
-    // The last few frames of a clip almost always have to few or too many
-    // bits and for the sake of over exact rate control we dont want to make
-    // radical adjustments to the allowed quantizer range just to use up a
-    // few surplus bits or get beneath the target rate.
-    else if ( (cpi->common.current_video_frame <
-                  (((unsigned int)cpi->total_stats->count * 255)>>8)) &&
-              ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-                  (unsigned int)cpi->total_stats->count) )
+    else
    {
        if (frames_left < 1)
            frames_left = 1;
@@ -2218,7 +2199,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    }

    // Calculate the number of bits that should be assigned to the kf group.
-    if ((cpi->bits_left > 0) && ((int)cpi->modified_error_left > 0))
+    if ((cpi->bits_left > 0) && ((int)cpi->modified_total_error_left > 0))
    {
        // Max for a single normal frame (not key frame)
        int max_bits = frame_max_bits(cpi);
@@ -2230,7 +2211,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        // complexity of the section
        cpi->kf_group_bits = (long long)( cpi->bits_left *
                                          ( kf_group_err /
-                                            cpi->modified_error_left ));
+                                            cpi->modified_total_error_left ));

        // Clip based on maximum per frame rate defined by the user.
        max_grp_bits = (long long)max_bits * (long long)cpi->frames_to_key;
@@ -2363,7 +2344,17 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

        vp8_avg_stats(&sectionstats);

-         cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+        if (sectionstats.pcnt_motion < .17)
+            cpi->section_is_low_motion = 1;
+        else
+            cpi->section_is_low_motion = 0;
+
+        if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
+            cpi->section_is_fast_motion = 1;
+        else
+            cpi->section_is_fast_motion = 0;
+
+        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);

        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
        // if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
@@ -2483,7 +2474,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            double  alt_kf_grp_bits =
                        ((double)cpi->bits_left *
                         (kf_mod_err * (double)cpi->frames_to_key) /
-                         DOUBLE_DIVIDE_CHECK(cpi->modified_error_left));
+                         DOUBLE_DIVIDE_CHECK(cpi->modified_total_error_left));

            alt_kf_bits = (int)((double)kf_boost *
                                (alt_kf_grp_bits / (double)allocation_chunks));
@@ -2501,7 +2492,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            alt_kf_bits =
                (int)((double)cpi->bits_left *
                      (kf_mod_err /
-                       DOUBLE_DIVIDE_CHECK(cpi->modified_error_left)));
+                       DOUBLE_DIVIDE_CHECK(cpi->modified_total_error_left)));

            if (alt_kf_bits > cpi->kf_bits)
            {
@@ -2521,7 +2512,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

    // Adjust the count of total modified error left.
    // The count of bits left is adjusted elsewhere based on real coded frame sizes
-    cpi->modified_error_left -= kf_group_err;
+    cpi->modified_total_error_left -= kf_group_err;

    if (cpi->oxcf.allow_spatial_resampling)
    {
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -40,12 +40,6 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
    cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_c;
    cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_c;

-    cpi->rtcd.variance.sad16x16x8            = vp8_sad16x16x8_c;
-    cpi->rtcd.variance.sad16x8x8             = vp8_sad16x8x8_c;
-    cpi->rtcd.variance.sad8x16x8             = vp8_sad8x16x8_c;
-    cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_c;
-    cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_c;
-
    cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_c;
    cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_c;
    cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_c;
@@ -94,8 +88,6 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)

    cpi->rtcd.search.full_search             = vp8_full_search_sad;
    cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;
-
-    cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_c;
 #endif

    // Pure C:
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -913,8 +913,7 @@ int vp8_diamond_search_sad
    int *num00,
    vp8_variance_fn_ptr_t *fn_ptr,
    int *mvsadcost[2],
-    int *mvcost[2],
-    MV *center_mv
+    int *mvcost[2]
 )
 {
    int i, j, step;
@@ -941,8 +940,6 @@ int vp8_diamond_search_sad
    unsigned char *check_here;
    int thissad;

-    *num00 = 0;
-
    // Work out the start point for the search
    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
    best_address = in_what;
@@ -952,7 +949,7 @@ int vp8_diamond_search_sad
    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
    {
        // Check the starting position
-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
    }

    // search_param determines the length of the initial step and hence the number of iterations
@@ -964,6 +961,8 @@ int vp8_diamond_search_sad
    best_mv->row = ref_row;
    best_mv->col = ref_col;

+    *num00 = 0;
+
    for (step = 0; step < tot_steps ; step++)
    {
        for (j = 0 ; j < x->searches_per_step ; j++)
@@ -983,7 +982,7 @@ int vp8_diamond_search_sad
                {
                    this_mv.row = this_row_offset << 3;
                    this_mv.col = this_col_offset << 3;
-                    thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                    thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);

                    if (thissad < bestsad)
                    {
@@ -1014,7 +1013,7 @@ int vp8_diamond_search_sad
        return INT_MAX;

    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-    + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+    + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 }

 int vp8_diamond_search_sadx4
@@ -1029,8 +1028,7 @@ int vp8_diamond_search_sadx4
    int *num00,
    vp8_variance_fn_ptr_t *fn_ptr,
    int *mvsadcost[2],
-    int *mvcost[2],
-    MV *center_mv
+    int *mvcost[2]
 )
 {
    int i, j, step;
@@ -1057,8 +1055,6 @@ int vp8_diamond_search_sadx4
    unsigned char *check_here;
    unsigned int thissad;

-    *num00 = 0;
-
    // Work out the start point for the search
    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
    best_address = in_what;
@@ -1068,7 +1064,7 @@ int vp8_diamond_search_sadx4
    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
    {
        // Check the starting position
-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
    }

    // search_param determines the length of the initial step and hence the number of iterations
@@ -1080,6 +1076,8 @@ int vp8_diamond_search_sadx4
    best_mv->row = ref_row;
    best_mv->col = ref_col;

+    *num00 = 0;
+
    for (step = 0; step < tot_steps ; step++)
    {
        int all_in = 1, t;
@@ -1110,7 +1108,7 @@ int vp8_diamond_search_sadx4
                    {
                        this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
                        this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
-                        sad_array[t] += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                        sad_array[t] += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);

                        if (sad_array[t] < bestsad)
                        {
@@ -1139,7 +1137,7 @@ int vp8_diamond_search_sadx4
                    {
                        this_mv.row = this_row_offset << 3;
                        this_mv.col = this_col_offset << 3;
-                        thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                        thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);

                        if (thissad < bestsad)
                        {
@@ -1170,12 +1168,12 @@ int vp8_diamond_search_sadx4
        return INT_MAX;

    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-    + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+    + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 }


 #if !(CONFIG_REALTIME_ONLY)
-int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
 {
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
@@ -1213,7 +1211,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
        // Baseline value at the centre

        //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
    }

    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1241,7 +1239,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
            this_mv.col = c << 3;
            //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
            //thissad  += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
-            thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+            thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);

            if (thissad < bestsad)
            {
@@ -1260,12 +1258,12 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro

    if (bestsad < INT_MAX)
        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
    else
        return INT_MAX;
 }

-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
 {
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
@@ -1303,7 +1301,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
    {
        // Baseline value at the centre
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
    }

    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1325,7 +1323,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
        check_here = r * mv_stride + in_what + col_min;
        c = col_min;

-        while ((c + 2) < col_max)
+        while ((c + 3) < col_max)
        {
            int i;

@@ -1338,7 +1336,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
                if (thissad < bestsad)
                {
                    this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                    thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);

                    if (thissad < bestsad)
                    {
@@ -1361,7 +1359,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
            if (thissad < bestsad)
            {
                this_mv.col = c << 3;
-                thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);

                if (thissad < bestsad)
                {
@@ -1383,165 +1381,13 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er

    if (bestsad < INT_MAX)
        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
    else
        return INT_MAX;
 }
 #endif


-int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
-{
-    unsigned char *what = (*(b->base_src) + b->src);
-    int what_stride = b->src_stride;
-    unsigned char *in_what;
-    int in_what_stride = d->pre_stride;
-    int mv_stride = d->pre_stride;
-    unsigned char *bestaddress;
-    MV *best_mv = &d->bmi.mv.as_mv;
-    MV this_mv;
-    int bestsad = INT_MAX;
-    int r, c;
-
-    unsigned char *check_here;
-    unsigned int thissad;
-
-    int ref_row = ref_mv->row >> 3;
-    int ref_col = ref_mv->col >> 3;
-
-    int row_min = ref_row - distance;
-    int row_max = ref_row + distance;
-    int col_min = ref_col - distance;
-    int col_max = ref_col + distance;
-
-    unsigned short sad_array8[8];
-    unsigned int sad_array[3];
-
-    // Work out the mid point for the search
-    in_what = *(d->base_pre) + d->pre;
-    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
-    best_mv->row = ref_row;
-    best_mv->col = ref_col;
-
-    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
-    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
-    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
-    {
-        // Baseline value at the centre
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
-    }
-
-    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
-    if (col_min < x->mv_col_min)
-        col_min = x->mv_col_min;
-
-    if (col_max > x->mv_col_max)
-        col_max = x->mv_col_max;
-
-    if (row_min < x->mv_row_min)
-        row_min = x->mv_row_min;
-
-    if (row_max > x->mv_row_max)
-        row_max = x->mv_row_max;
-
-    for (r = row_min; r < row_max ; r++)
-    {
-        this_mv.row = r << 3;
-        check_here = r * mv_stride + in_what + col_min;
-        c = col_min;
-
-        while ((c + 7) < col_max)
-        {
-            int i;
-
-            fn_ptr->sdx8f(what, what_stride, check_here , in_what_stride, sad_array8);
-
-            for (i = 0; i < 8; i++)
-            {
-                thissad = (unsigned int)sad_array8[i];
-
-                if (thissad < bestsad)
-                {
-                    this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
-
-                    if (thissad < bestsad)
-                    {
-                        bestsad = thissad;
-                        best_mv->row = r;
-                        best_mv->col = c;
-                        bestaddress = check_here;
-                    }
-                }
-
-                check_here++;
-                c++;
-            }
-        }
-
-        while ((c + 2) < col_max)
-        {
-            int i;
-
-            fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
-
-            for (i = 0; i < 3; i++)
-            {
-                thissad = sad_array[i];
-
-                if (thissad < bestsad)
-                {
-                    this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
-
-                    if (thissad < bestsad)
-                    {
-                        bestsad = thissad;
-                        best_mv->row = r;
-                        best_mv->col = c;
-                        bestaddress = check_here;
-                    }
-                }
-
-                check_here++;
-                c++;
-            }
-        }
-
-        while (c < col_max)
-        {
-            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
-
-            if (thissad < bestsad)
-            {
-                this_mv.col = c << 3;
-                thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
-
-                if (thissad < bestsad)
-                {
-                    bestsad = thissad;
-                    best_mv->row = r;
-                    best_mv->col = c;
-                    bestaddress = check_here;
-                }
-            }
-
-            check_here ++;
-            c ++;
-        }
-    }
-
-    this_mv.row = best_mv->row << 3;
-    this_mv.col = best_mv->col << 3;
-
-    if (bestsad < INT_MAX)
-        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
-    else
-        return INT_MAX;
-}
-
 #ifdef ENTROPY_STATS
 void print_mode_context(void)
 {
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -24,7 +24,7 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
 #define MAX_MVSEARCH_STEPS 8                                    // The maximum number of steps in a step search given the largest allowed initial step
 #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS+3)) - 8)    // Max full pel mv specified in 1/8 pel units
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units
-#define MAX_POSSIBLE_MV (1 << 11)                               // Maximum MV in 1/8 pel units
+

 extern void print_mode_context(void);
 extern int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight);
@@ -67,8 +67,7 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
     int distance, \
     vp8_variance_fn_ptr_t *fn_ptr, \
     int *mvcost[2], \
-     int *mvsadcost[2], \
-     MV *center_mv \
+     int *mvsadcost[2] \
    )

 #define prototype_diamond_search_sad(sym)\
@@ -84,8 +83,7 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
     int *num00, \
     vp8_variance_fn_ptr_t *fn_ptr, \
     int *mvsadcost[2], \
-     int *mvcost[2], \
-     MV *center_mv \
+     int *mvcost[2] \
    )

 #if ARCH_X86 || ARCH_X86_64
@@ -95,7 +93,6 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
 typedef prototype_full_search_sad(*vp8_full_search_fn_t);
 extern prototype_full_search_sad(vp8_full_search_sad);
 extern prototype_full_search_sad(vp8_full_search_sadx3);
-extern prototype_full_search_sad(vp8_full_search_sadx8);

 typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
 extern prototype_diamond_search_sad(vp8_diamond_search_sad);
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -73,7 +73,6 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi);
 int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
 int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);

-extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi);

 static void set_default_lf_deltas(VP8_COMP *cpi);

@@ -175,6 +174,17 @@ static const int kf_high_motion_minq[QINDEX_RANGE] =
    27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,
    35,35,36,36,37,38,39,40,41,42,43,44,45,46,47,48,
 };
+/*static const int kf_minq[QINDEX_RANGE] =
+{
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6,
+    7, 7, 8, 8, 9, 9, 10,10,11,11,12,12,13,13,14,14,
+    15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,
+    23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,
+    31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38
+};*/
 static const int gf_low_motion_minq[QINDEX_RANGE] =
 {
    0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
@@ -208,16 +218,27 @@ static const int gf_high_motion_minq[QINDEX_RANGE] =
    41,41,42,42,43,44,45,46,47,48,49,50,51,52,53,54,
    55,56,57,58,59,60,62,64,66,68,70,72,74,76,78,80,
 };
+/*static const int gf_arf_minq[QINDEX_RANGE] =
+{
+    0,0,0,0,1,1,1,1,1,1,2,2,3,3,3,4,
+    4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
+    9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,
+    15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,
+    23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,
+    31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,39,
+    39,40,40,41,41,42,42,43,43,44,45,46,47,48,49,50,
+    51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66
+};*/
 static const int inter_minq[QINDEX_RANGE] =
 {
-    0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
-    9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
-    20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
-    32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
-    44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
-    57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
-    71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
-    86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
+    0,0,0,0,1,1,2,3,3,4,4,5,6,6,7,7,
+    8,8,9,9,10,11,11,12,12,13,13,14,14,15,15,16,
+    16,17,17,17,18,18,19,19,20,20,21,21,22,22,22,23,
+    23,24,24,24,25,25,26,27,28,28,29,30,31,32,33,34,
+    35,35,36,37,38,39,39,40,41,42,43,43,44,45,46,47,
+    47,48,49,49,51,52,53,54,54,55,56,56,57,57,58,58,
+    59,59,60,61,61,62,62,63,64,64,65,66,67,67,68,69,
+    69,70,71,71,72,73,74,75,76,76,77,78,79,80,81,81,
 };

 void vp8_initialize()
@@ -262,21 +283,6 @@ static void setup_features(VP8_COMP *cpi)

 void vp8_dealloc_compressor_data(VP8_COMP *cpi)
 {
-    // Delete last frame MV storage buffers
-    if (cpi->lfmv != 0)
-        vpx_free(cpi->lfmv);
-
-    cpi->lfmv = 0;
-
-    if (cpi->lf_ref_frame_sign_bias != 0)
-        vpx_free(cpi->lf_ref_frame_sign_bias);
-
-    cpi->lf_ref_frame_sign_bias = 0;
-
-    if (cpi->lf_ref_frame != 0)
-        vpx_free(cpi->lf_ref_frame);
-
-    cpi->lf_ref_frame = 0;

    // Delete sementation map
    if (cpi->segmentation_map != 0)
@@ -325,15 +331,8 @@ void vp8_dealloc_compressor_data(VP8_COMP *cpi)

    cpi->mb.pip = 0;

-    if(cpi->total_stats)
-        vpx_free(cpi->total_stats);
-
-    cpi->total_stats = 0;
-
-    if(cpi->this_frame_stats)
-        vpx_free(cpi->this_frame_stats);
-
-    cpi->this_frame_stats = 0;
+    vpx_free(cpi->total_stats);
+    vpx_free(cpi->this_frame_stats);
 }

 static void enable_segmentation(VP8_PTR ptr)
@@ -564,7 +563,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
    int Speed = cpi->Speed;
    int i;
    VP8_COMMON *cm = &cpi->common;
-    int last_improved_quant = sf->improved_quant;

    // Initialise default mode frequency sampling variables
    for (i = 0; i < MAX_MODES; i ++)
@@ -591,7 +589,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
    sf->max_fs_radius = 32;
    sf->iterative_sub_pixel = 1;
    sf->optimize_coefficients = 1;
-    sf->use_fastquant_for_pick = 0;

    sf->first_step = 0;
    sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -685,32 +682,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
        sf->thresh_mult[THR_NEARG    ] = 1000;
        sf->thresh_mult[THR_NEARA    ] = 1000;

-#if 1
-        sf->thresh_mult[THR_ZEROMV   ] = 0;
-        sf->thresh_mult[THR_ZEROG    ] = 0;
-        sf->thresh_mult[THR_ZEROA    ] = 0;
-        sf->thresh_mult[THR_NEARESTMV] = 0;
-        sf->thresh_mult[THR_NEARESTG ] = 0;
-        sf->thresh_mult[THR_NEARESTA ] = 0;
-        sf->thresh_mult[THR_NEARMV   ] = 0;
-        sf->thresh_mult[THR_NEARG    ] = 0;
-        sf->thresh_mult[THR_NEARA    ] = 0;
-
-//        sf->thresh_mult[THR_DC       ] = 0;
-
-//        sf->thresh_mult[THR_V_PRED   ] = 1000;
-//        sf->thresh_mult[THR_H_PRED   ] = 1000;
-//        sf->thresh_mult[THR_B_PRED   ] = 2000;
-//        sf->thresh_mult[THR_TM       ] = 1000;
-
-        sf->thresh_mult[THR_NEWMV    ] = 1000;
-        sf->thresh_mult[THR_NEWG     ] = 1000;
-        sf->thresh_mult[THR_NEWA     ] = 1000;
-
-        sf->thresh_mult[THR_SPLITMV  ] = 1700;
-        sf->thresh_mult[THR_SPLITG   ] = 4500;
-        sf->thresh_mult[THR_SPLITA   ] = 4500;
-#else
        sf->thresh_mult[THR_NEWMV    ] = 1500;
        sf->thresh_mult[THR_NEWG     ] = 1500;
        sf->thresh_mult[THR_NEWA     ] = 1500;
@@ -718,7 +689,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
        sf->thresh_mult[THR_SPLITMV  ] = 5000;
        sf->thresh_mult[THR_SPLITG   ] = 10000;
        sf->thresh_mult[THR_SPLITA   ] = 10000;
-#endif
+
        sf->full_freq[0] = 15;
        sf->full_freq[1] = 31;

@@ -790,7 +761,8 @@ void vp8_set_speed_features(VP8_COMP *cpi)
                sf->thresh_mult[THR_SPLITA   ] = 20000;
            }

-            sf->use_fastquant_for_pick = 1;
+            sf->improved_quant = 0;
+            sf->improved_dct = 0;

            sf->first_step = 1;
            sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -798,8 +770,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)

        if (Speed > 1)
        {
-            sf->use_fastquant_for_pick = 0;
-
            cpi->mode_check_freq[THR_SPLITG] = 15;
            cpi->mode_check_freq[THR_SPLITA] = 15;
            cpi->mode_check_freq[THR_SPLITMV] = 7;
@@ -833,13 +803,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
                sf->thresh_mult[THR_SPLITA   ] = 50000;
            }

-            sf->first_step = 1;
-
-            sf->improved_quant = 0;
-            sf->improved_dct = 0;
-
-            // Only do recode loop on key frames, golden frames and
-            // alt ref frames
+            // Only do recode loop on key frames and golden frames
            sf->recode_loop = 2;

            sf->full_freq[0] = 31;
@@ -1298,8 +1262,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
    {
        cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
    }
-    if (cpi->sf.improved_quant != last_improved_quant)
-        vp8cx_init_quantizer(cpi);

 #if CONFIG_RUNTIME_CPU_DETECT
    cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
@@ -1367,9 +1329,6 @@ static void alloc_raw_frame_buffers(VP8_COMP *cpi)

 static int vp8_alloc_partition_data(VP8_COMP *cpi)
 {
-    if(cpi->mb.pip)
-        vpx_free(cpi->mb.pip);
-
    cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
                                (cpi->common.mb_rows + 1),
                                sizeof(PARTITION_INFO));
@@ -1437,16 +1396,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)

    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;

-    if(cpi->total_stats)
-        vpx_free(cpi->total_stats);
-
    cpi->total_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
-
-    if(cpi->this_frame_stats)
-        vpx_free(cpi->this_frame_stats);
-
    cpi->this_frame_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
-
    if(!cpi->total_stats || !cpi->this_frame_stats)
        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                           "Failed to allocate firstpass stats");
@@ -2194,10 +2145,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->alt_is_last  = 0 ;
    cpi->gold_is_alt  = 0 ;

-    // allocate memory for storing last frame's MVs for MV prediction.
-    CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int_mv)));
-    CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int)));
-    CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int)));
+

    // Create the encoder segmentation map and set all entries to 0
    CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
@@ -2253,8 +2201,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
    init_context_counters();
 #endif

-    /*Initialize the feed-forward activity masking.*/
-    cpi->activity_avg = 90<<12;

    cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
    cpi->key_frame_frequency = cpi->oxcf.key_freq;
@@ -2395,7 +2341,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_v);
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_hv);
    cpi->fn_ptr[BLOCK_16X16].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
-    cpi->fn_ptr[BLOCK_16X16].sdx8f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x8);
    cpi->fn_ptr[BLOCK_16X16].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);

    cpi->fn_ptr[BLOCK_16X8].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
@@ -2405,7 +2350,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v  = NULL;
    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
    cpi->fn_ptr[BLOCK_16X8].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
-    cpi->fn_ptr[BLOCK_16X8].sdx8f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x8);
    cpi->fn_ptr[BLOCK_16X8].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);

    cpi->fn_ptr[BLOCK_8X16].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
@@ -2415,7 +2359,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v  = NULL;
    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
    cpi->fn_ptr[BLOCK_8X16].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
-    cpi->fn_ptr[BLOCK_8X16].sdx8f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x8);
    cpi->fn_ptr[BLOCK_8X16].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);

    cpi->fn_ptr[BLOCK_8X8].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
@@ -2425,7 +2368,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v  = NULL;
    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
    cpi->fn_ptr[BLOCK_8X8].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
-    cpi->fn_ptr[BLOCK_8X8].sdx8f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x8);
    cpi->fn_ptr[BLOCK_8X8].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);

    cpi->fn_ptr[BLOCK_4X4].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
@@ -2435,7 +2377,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v  = NULL;
    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
    cpi->fn_ptr[BLOCK_4X4].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
-    cpi->fn_ptr[BLOCK_4X4].sdx8f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8);
    cpi->fn_ptr[BLOCK_4X4].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);

 #if !(CONFIG_REALTIME_ONLY)
@@ -3486,37 +3427,6 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
 #endif
 // return of 0 means drop frame

-// Function to test for conditions that indeicate we should loop
-// back and recode a frame.
-static BOOL recode_loop_test( VP8_COMP *cpi,
-                              int high_limit, int low_limit,
-                              int q, int maxq, int minq )
-{
-    BOOL    force_recode = FALSE;
-    VP8_COMMON *cm = &cpi->common;
-
-    // Is frame recode allowed at all
-    // Yes if either recode mode 1 is selected or mode two is selcted
-    // and the frame is a key frame. golden frame or alt_ref_frame
-    if ( (cpi->sf.recode_loop == 1) ||
-         ( (cpi->sf.recode_loop == 2) &&
-           ( (cm->frame_type == KEY_FRAME) ||
-             cm->refresh_golden_frame ||
-             cm->refresh_alt_ref_frame ) ) )
-    {
-        // General over and under shoot tests
-        if ( ((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
-             ((cpi->projected_frame_size < low_limit) && (q > minq)) )
-        {
-            force_recode = TRUE;
-        }
-        // Specific rate control mode related tests
-        // TBD
-    }
-
-    return force_recode;
-}
-
 static void encode_frame_to_data_rate
 (
    VP8_COMP *cpi,
@@ -3579,18 +3489,8 @@ static void encode_frame_to_data_rate
    cpi->zbin_over_quant = 0;
    cpi->zbin_mode_boost = 0;

-    // Enable or disable mode based tweaking of the zbin
-    // For 2 Pass Only used where GF/ARF prediction quality
-    // is above a threshold
-    cpi->zbin_mode_boost = 0;
+    // Enable mode based tweaking of the zbin
    cpi->zbin_mode_boost_enabled = TRUE;
-    if (cpi->pass == 2)
-    {
-        if ( cpi->gfu_boost <= 400 )
-        {
-            cpi->zbin_mode_boost_enabled = FALSE;
-        }
-    }

    // Current default encoder behaviour for the altref sign bias
    if (cpi->source_alt_ref_active)
@@ -3871,16 +3771,17 @@ static void encode_frame_to_data_rate

    vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);

-    // Limit Q range for the adaptive loop.
+    // Limit Q range for the adaptive loop (Values not clipped to range 20-60 as in VP8).
    bottom_index = cpi->active_best_quality;
    top_index    = cpi->active_worst_quality;
-    q_low  = cpi->active_best_quality;
-    q_high = cpi->active_worst_quality;

    vp8_save_coding_context(cpi);

    loop_count = 0;

+    q_low  = cpi->best_quality;
+    q_high = cpi->worst_quality;
+

    scale_and_extend_source(cpi->un_scaled_source, cpi);
 #if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC
@@ -3916,6 +3817,7 @@ static void encode_frame_to_data_rate
        if (cm->frame_type == KEY_FRAME)
        {
            vp8_de_noise(cpi->Source, cpi->Source, l , 1,  0, RTCD(postproc));
+            cpi->ppi.frame = 0;
        }
        else
        {
@@ -3927,6 +3829,10 @@ static void encode_frame_to_data_rate
            {
                src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
            }
+
+            //temp_filter(&cpi->ppi,src,src,
+            //  cm->last_frame.y_width * cm->last_frame.y_height,
+            //  cpi->oxcf.noise_sensitivity);
        }
    }

@@ -4057,13 +3963,15 @@ static void encode_frame_to_data_rate

                Q = vp8_regulate_q(cpi, cpi->this_frame_target);

+                q_low  = cpi->best_quality;
+                q_high = cpi->worst_quality;
+
                vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);

-                // Limit Q range for the adaptive loop.
+                // Limit Q range for the adaptive loop (Values not clipped to range 20-60 as in VP8).
                bottom_index = cpi->active_best_quality;
                top_index    = cpi->active_worst_quality;
-                q_low  = cpi->active_best_quality;
-                q_high = cpi->active_worst_quality;
+

                loop_count++;
                Loop = TRUE;
@@ -4103,18 +4011,19 @@ static void encode_frame_to_data_rate
 #if !(CONFIG_REALTIME_ONLY)

        // Is the projected frame size out of range and are we allowed to attempt to recode.
-        if ( recode_loop_test( cpi,
-                               frame_over_shoot_limit, frame_under_shoot_limit,
-                               Q, top_index, bottom_index ) )
+        if (((cpi->sf.recode_loop == 1) ||
+             ((cpi->sf.recode_loop == 2) && (cm->refresh_golden_frame || (cm->frame_type == KEY_FRAME)))) &&
+            (((cpi->projected_frame_size > frame_over_shoot_limit) && (Q < top_index)) ||
+             //((cpi->projected_frame_size > frame_over_shoot_limit ) && (Q == top_index) && (cpi->zbin_over_quant < ZBIN_OQ_MAX)) ||
+             ((cpi->projected_frame_size < frame_under_shoot_limit) && (Q > bottom_index)))
+           )
        {
            int last_q = Q;
            int Retries = 0;

            // Frame size out of permitted range:
            // Update correction factor & compute new Q to try...
-
-            // Frame is too large
-            if (cpi->projected_frame_size > cpi->this_frame_target)
+            if (cpi->projected_frame_size > frame_over_shoot_limit)
            {
                //if ( cpi->zbin_over_quant == 0 )
                q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value
@@ -4158,7 +4067,6 @@ static void encode_frame_to_data_rate

                overshoot_seen = TRUE;
            }
-            // Frame is too small
            else
            {
                if (cpi->zbin_over_quant == 0)
@@ -4252,36 +4160,6 @@ static void encode_frame_to_data_rate
    }
 #endif

-    // Update the GF useage maps.
-    // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
-    vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
-
-    // This frame's MVs are saved and will be used in next frame's MV prediction.
-    if(cm->show_frame)   //do not save for altref frame
-    {
-      int mb_row;
-      int mb_col;
-      MODE_INFO *tmp = cm->mip; //point to beginning of allocated MODE_INFO arrays.
-      //static int last_video_frame = 0;
-
-      if(cm->frame_type != KEY_FRAME)
-      {
-        for (mb_row = 0; mb_row < cm->mb_rows+1; mb_row ++)
-        {
-          for (mb_col = 0; mb_col < cm->mb_cols+1; mb_col ++)
-          {
-              if(tmp->mbmi.ref_frame != INTRA_FRAME)
-                cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride)].as_int = tmp->mbmi.mv.as_int;
-
-              cpi->lf_ref_frame_sign_bias[mb_col + mb_row*(cm->mode_info_stride)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
-              cpi->lf_ref_frame[mb_col + mb_row*(cm->mode_info_stride)] = tmp->mbmi.ref_frame;
-              tmp++;
-          }
-        }
-      }
-    }
-
-
    // Update the GF useage maps.
    // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
    vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
@@ -4340,11 +4218,10 @@ static void encode_frame_to_data_rate
            {
                vp8cx_set_alt_lf_level(cpi, cm->filter_level);
                vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
+                cm->last_frame_type = cm->frame_type;
                cm->last_filter_type = cm->filter_type;
                cm->last_sharpness_level = cm->sharpness_level;
            }
-            /* Move storing frame_type out of the above loop since it is also needed in motion search besides loopfilter */
-            cm->last_frame_type = cm->frame_type;

            vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);

@@ -4656,7 +4533,7 @@ static void encode_frame_to_data_rate
    }
    else
    {
-        if (cpi->oxcf.play_alternate && cpi->common.refresh_alt_ref_frame && (cpi->common.frame_type != KEY_FRAME))
+        if (cpi->oxcf.play_alternate && cpi->common.refresh_alt_ref_frame)
            // Update the alternate reference frame and stats as appropriate.
            update_alt_ref_frame_and_stats(cpi);
        else
@@ -4979,7 +4856,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
                {
                    int thiserr;
                    cpi->oxcf.arnr_strength = i;
-                    vp8_temporal_filter_prepare_c(cpi);
+                    vp8cx_temp_filter_c(cpi);

                    thiserr = vp8_calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer,
                                                  &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
@@ -4994,7 +4871,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
                if (besti != -1)
                {
                    cpi->oxcf.arnr_strength = besti;
-                    vp8_temporal_filter_prepare_c(cpi);
+                    vp8cx_temp_filter_c(cpi);
                    s = &cpi->alt_ref_buffer;

                    // FWG not sure if I need to copy this data for the Alt Ref frame
@@ -5006,7 +4883,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
                    s = &cpi->src_buffer[cpi->last_alt_ref_sei];

 #else
-                vp8_temporal_filter_prepare_c(cpi);
+                vp8cx_temp_filter_c(cpi);
                s = &cpi->alt_ref_buffer;

                // FWG not sure if I need to copy this data for the Alt Ref frame
@@ -5090,16 +4967,17 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon

    *frame_flags = cpi->source_frame_flags;

+#if CONFIG_PSNR
+
    if (cpi->source_time_stamp < cpi->first_time_stamp_ever)
-    {
        cpi->first_time_stamp_ever = cpi->source_time_stamp;
-        cpi->last_end_time_stamp_seen = cpi->source_time_stamp;
-    }
+
+#endif

    // adjust frame rates based on timestamps given
    if (!cm->refresh_alt_ref_frame)
    {
-        if (cpi->source_time_stamp == cpi->first_time_stamp_ever)
+        if (cpi->last_time_stamp_seen == 0)
        {
            double this_fps = 10000000.000 / (cpi->source_end_time_stamp - cpi->source_time_stamp);

@@ -5107,8 +4985,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
        }
        else
        {
-            long long nanosecs = cpi->source_end_time_stamp
-                - cpi->last_end_time_stamp_seen;
+            long long nanosecs = cpi->source_time_stamp - cpi->last_time_stamp_seen;
            double this_fps = 10000000.000 / nanosecs;

            vp8_new_frame_rate(cpi, (7 * cpi->oxcf.frame_rate + this_fps) / 8);
@@ -5116,7 +4993,6 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
        }

        cpi->last_time_stamp_seen = cpi->source_time_stamp;
-        cpi->last_end_time_stamp_seen = cpi->source_end_time_stamp;
    }

    if (cpi->compressor_speed == 2)
@@ -5332,7 +5208,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
    return 0;
 }

-int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags)
+int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
 {
    VP8_COMP *cpi = (VP8_COMP *) comp;

@@ -5342,7 +5218,7 @@ int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflag
    {
        int ret;
 #if CONFIG_POSTPROC
-        ret = vp8_post_proc_frame(&cpi->common, dest, flags);
+        ret = vp8_post_proc_frame(&cpi->common, dest, deblock_level, noise_level, flags);
 #else

        if (cpi->common.frame_to_show)
@@ -5435,12 +5311,12 @@ int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert
 {
    VP8_COMP *cpi = (VP8_COMP *) comp;

-    if (horiz_mode <= ONETWO)
+    if (horiz_mode >= NORMAL && horiz_mode <= ONETWO)
        cpi->common.horiz_scale = horiz_mode;
    else
        return -1;

-    if (vert_mode <= ONETWO)
+    if (vert_mode >= NORMAL && vert_mode <= ONETWO)
        cpi->common.vert_scale  = vert_mode;
    else
        return -1;
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -18,6 +18,7 @@
 #include "treewriter.h"
 #include "tokenize.h"
 #include "onyxc_int.h"
+#include "preproc.h"
 #include "variance.h"
 #include "dct.h"
 #include "encodemb.h"
@@ -27,7 +28,6 @@
 #include "vpx_ports/mem.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "mcomp.h"
-#include "temporal_filter.h"

 //#define SPEEDSTATS 1
 #define MIN_GF_INTERVAL             4
@@ -46,8 +46,6 @@
 #define MAX_THRESHMULT  512

 #define GF_ZEROMV_ZBIN_BOOST 24
-#define LF_ZEROMV_ZBIN_BOOST 12
-#define MV_ZBIN_BOOST        4
 #define ZBIN_OQ_MAX 192

 #define VP8_TEMPORAL_ALT_REF 1
@@ -182,8 +180,6 @@ typedef struct
    int first_step;
    int optimize_coefficients;

-    int use_fastquant_for_pick;
-
 } SPEED_FEATURES;

 typedef struct
@@ -231,7 +227,6 @@ typedef struct VP8_ENCODER_RTCD
    vp8_encodemb_rtcd_vtable_t  encodemb;
    vp8_quantize_rtcd_vtable_t  quantize;
    vp8_search_rtcd_vtable_t    search;
-    vp8_temporal_rtcd_vtable_t  temporal;
 } VP8_ENCODER_RTCD;

 enum
@@ -244,12 +239,6 @@ enum
    BLOCK_MAX_SEGMENTS
 };

-typedef union
-{
-    unsigned int as_int;
-    MV           as_mv;
-} int_mv;        /* facilitates rapid equality tests */
-
 typedef struct
 {

@@ -271,9 +260,6 @@ typedef struct
    DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]);


    MACROBLOCK mb;
@@ -290,14 +276,14 @@ typedef struct
    unsigned int source_frame_flags;
    YV12_BUFFER_CONFIG scaled_source;

-    int source_buffer_count;    // number of src_buffers in use for lagged encoding
-    int source_encode_index;    // index of buffer in src_buffer to encode
-    int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
-    int source_alt_ref_active;  // an alt ref frame has been encoded and is usable
+    int source_buffer_count;
+    int source_encode_index;
+    int source_alt_ref_pending;
+    int source_alt_ref_active;

-    int last_alt_ref_sei;       // index into src_buffers of frame used as alt reference
-    int is_src_frame_alt_ref;   // source of frame to encode is an exact copy of an alt ref frame
-    int is_next_src_alt_ref;    // source of next frame to encode is an exact copy of an alt ref frame
+    int last_alt_ref_sei;
+    int is_src_frame_alt_ref;
+    int is_next_src_alt_ref;

    int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
    int alt_is_last;  // Alt reference frame same as last ( short circuit altref search)
@@ -333,7 +319,6 @@ typedef struct
    int mvcostmultiplier;
    int subseqblockweight;
    int errthresh;
-    unsigned int activity_avg;

    int RDMULT;
    int RDDIV ;
@@ -414,7 +399,6 @@ typedef struct
    int inter_frame_target;
    double output_frame_rate;
    long long last_time_stamp_seen;
-    long long last_end_time_stamp_seen;
    long long first_time_stamp_ever;

    int ni_av_qi;
@@ -470,6 +454,8 @@ typedef struct
    unsigned char *output_partition2;
    size_t output_partition2size;

+    pre_proc_instance ppi;
+
    int frames_to_key;
    int gfu_boost;
    int kf_boost;
@@ -480,17 +466,11 @@ typedef struct
    double start_tot_err_left;
    double min_error;

-    double modified_error_total;
-    double modified_error_used;
-    double modified_error_left;
-    double clip_bpe;
-    double observed_bpe;
-
+    double modified_total_error_left;
    double avg_iiratio;

    int target_bandwidth;
    long long bits_left;
-    long long clip_bits_total;
    FIRSTPASS_STATS *total_stats;
    FIRSTPASS_STATS *this_frame_stats;
    FIRSTPASS_STATS *stats_in, *stats_in_end;
@@ -631,6 +611,9 @@ typedef struct
    unsigned int tempdata2;

    int base_skip_false_prob[128];
+    unsigned int section_is_low_motion;
+    unsigned int section_benefits_from_aggresive_q;
+    unsigned int section_is_fast_motion;
    unsigned int section_intra_rating;

    double section_max_qfactor;
@@ -678,10 +661,6 @@ typedef struct
    unsigned char *gf_active_flags;   // Record of which MBs still refer to last golden frame either directly or through 0,0
    int gf_active_count;

-    //Store last frame's MV info for next frame MV prediction
-    int_mv *lfmv;
-    int *lf_ref_frame_sign_bias;
-    int *lf_ref_frame;

 } VP8_COMP;

@@ -691,8 +670,6 @@ void vp8_encode_frame(VP8_COMP *cpi);

 void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size);

-unsigned int vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
-
 int rd_cost_intra_mb(MACROBLOCKD *x);

 void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **);
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -685,7 +685,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
 #if 0

            // Initial step Search
-            bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost, &best_ref_mv1);
+            bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost);
            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;

@@ -698,7 +698,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                    num00--;
                else
                {
-                    thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost, &best_ref_mv1);
+                    thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost);

                    if (thissme < bestsme)
                    {
@@ -724,7 +724,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
            }
            else
            {
-                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb < 9
+                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb < 9
                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;

@@ -743,7 +743,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                        num00--;
                    else
                    {
-                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb = 9
+                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb = 9

                        if (thissme < bestsme)
                        {
--- a/vp8/encoder/preproc.c
+++ b/vp8/encoder/preproc.c
@@ -0,0 +1,251 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     preproc.c
+*
+*   Description  :     Simple pre-processor.
+*
+****************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+
+#include "memory.h"
+#include "preproc7.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+*  Macros
+****************************************************************************/
+#define FRAMECOUNT 7 /* history samples kept per input byte in ppi->frame_buffer */
+#define ROUNDUP32(X) ( ( ( (size_t) (X) ) + 31 ) & ~( (size_t) 31 ) ) /* round an address up to a multiple of 32; the mask is size_t-wide so the upper bits of a 64-bit address are kept */
+
+/****************************************************************************
+*  Imports
+****************************************************************************/
+extern void vp8_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+/****************************************************************************
+*  Exported Global Variables
+****************************************************************************/
+void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength); /* filter implementation in use; assigned by init_pre_proc7() */
+void temp_filter_mmx
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+); /* prototype only (no definition in this file); chosen when wmt_enabled is 0 and mmx_enabled is nonzero */
+void temp_filter_wmt
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+); /* prototype only (no definition in this file); chosen when wmt_enabled is nonzero */
+
+/****************************************************************************
+ *
+ *  ROUTINE       : temp_filter_c
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  unsigned char *s     : Pointer to source frame.
+ *                  unsigned char *d     : Pointer to destination frame.
+ *                  int bytes            : Number of bytes to filter.
+ *                  int strength         : Right shift applied to the squared difference.
+ *
+ *  OUTPUTS       : d[] receives the filtered bytes; ppi history and ppi->frame are updated.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur.
+ *
+ *  SPECIAL NOTES : Destination frame can be same as source frame.
+ *                  Loops are do/while, so one byte is processed even if bytes <= 0.
+ ****************************************************************************/
+void temp_filter_c
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+)
+{
+    int byte = 0;
+    unsigned char *frameptr = ppi->frame_buffer; // walks the history: FRAMECOUNT consecutive samples per input byte
+
+    if (ppi->frame == 0) // first call: there is no history yet
+    {
+        do
+        {
+            int frame = 0;
+
+            do
+            {
+                *frameptr = s[byte]; // seed every history slot of this byte with the current sample
+                ++frameptr;
+                ++frame;
+            }
+            while (frame < FRAMECOUNT);
+
+            d[byte] = s[byte]; // first frame is passed through unfiltered
+
+            ++byte;
+        }
+        while (byte < bytes);
+    }
+    else
+    {
+        int modifier;
+        int offset = (ppi->frame % FRAMECOUNT); // history slot that this frame overwrites
+
+        do
+        {
+            int accumulator = 0; // sum of weight * sample over the history
+            int count = 0;       // sum of weights
+            int frame = 0;
+
+            frameptr[offset] = s[byte]; // store the current sample first so it takes part in the average
+
+            do
+            {
+                int pixel_value = *frameptr;
+
+                modifier   = s[byte];
+                modifier  -= pixel_value;
+                modifier  *= modifier; // squared difference between current and stored sample
+                modifier >>= strength;
+                modifier  *= 3;
+
+                if (modifier > 16)
+                    modifier = 16;
+
+                modifier = 16 - modifier; // weight: 16 for an identical sample, 0 once the penalty reaches 16
+
+                accumulator += modifier * pixel_value;
+
+                count += modifier;
+
+                frameptr++;
+
+                ++frame;
+            }
+            while (frame < FRAMECOUNT);
+
+            accumulator += (count >> 1); // add half the divisor so the result is rounded
+            accumulator *= ppi->fixed_divide[count]; // table holds 0x10000 / count; count >= 16 because the slot just stored matches exactly
+            accumulator >>= 16; // drop the 16 fractional bits of the table entry
+
+            d[byte] = accumulator;
+
+            ++byte;
+        }
+        while (byte < bytes);
+    }
+
+    ++ppi->frame; // the next call overwrites the following history slot
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : delete_pre_proc
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Instance whose buffers are released.
+ *
+ *  OUTPUTS       : All four buffer pointers in *ppi are reset to 0.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Frees the frame-history and fixed-divide allocations.
+ *
+ *  SPECIAL NOTES : Pointers that are already 0 are skipped, so a repeat call is harmless.
+ *
+ ****************************************************************************/
+void delete_pre_proc(pre_proc_instance *ppi)
+{
+    /* frame history: free the raw block, then clear it and the pointer derived from it */
+    if (ppi->frame_buffer_alloc != 0)
+        vpx_free(ppi->frame_buffer_alloc);
+    ppi->frame_buffer       = 0;
+    ppi->frame_buffer_alloc = 0;
+
+    /* fixed-divide table: same treatment */
+    if (ppi->fixed_divide_alloc != 0)
+        vpx_free(ppi->fixed_divide_alloc);
+    ppi->fixed_divide       = 0;
+    ppi->fixed_divide_alloc = 0;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : init_pre_proc7
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  int frame_size        : Number of bytes in one frame.
+ *
+ *  OUTPUTS       : ppi buffers allocated; global temp_filter pointer selected.
+ *
+ *  RETURNS       : int: 1 if successful, 0 if failed.
+ *
+ *  FUNCTION      : Initializes preprocessor instance.
+ *
+ *  SPECIAL NOTES : Buffer pointers in *ppi must be 0 or valid on entry (they are freed first).
+ *                  Returns 0 for a negative frame_size or one whose history size overflows.
+ ****************************************************************************/
+int init_pre_proc7(pre_proc_instance *ppi, int frame_size)
+{
+    int i;
+    int mmx_enabled;
+    int xmm_enabled;
+    int wmt_enabled;
+
+    vp8_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+
+    if (wmt_enabled)
+        temp_filter = temp_filter_wmt;
+    else if (mmx_enabled)
+        temp_filter = temp_filter_mmx;
+    else
+        temp_filter = temp_filter_c;
+
+    delete_pre_proc(ppi); /* release buffers left over from an earlier init */
+
+    /* a negative size, or one whose FRAMECOUNT-frame history plus 32 bytes of slack wraps size_t, would under-allocate */
+    if (frame_size < 0 || (size_t) frame_size > ((size_t) -1 - 32) / FRAMECOUNT)
+        return 0;
+
+    ppi->frame_buffer_alloc = vpx_malloc(32 + (size_t) frame_size * FRAMECOUNT * sizeof(unsigned char));
+
+    if (!ppi->frame_buffer_alloc)
+        return 0; /* nothing is allocated at this point, so there is nothing to undo */
+
+    ppi->frame_buffer = (unsigned char *) ROUNDUP32(ppi->frame_buffer_alloc);
+
+    ppi->fixed_divide_alloc = vpx_malloc(32 + 255 * sizeof(unsigned int));
+
+    if (!ppi->fixed_divide_alloc)
+    {
+        delete_pre_proc(ppi); /* frees the frame buffer allocated above */
+        return 0;
+    }
+
+    ppi->fixed_divide = (unsigned int *) ROUNDUP32(ppi->fixed_divide_alloc);
+
+    for (i = 1; i < 255; i++) /* entry 0 is left unset */
+        ppi->fixed_divide[i] = 0x10000 / i;
+
+    return 1;
+}
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -16,9 +16,8 @@
 #include "entropy.h"
 #include "predictdc.h"

-#define EXACT_QUANT
-
-#ifdef EXACT_FASTQUANT
+//#define EXACT_QUANT
+#ifdef EXACT_QUANT
 void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 {
    int i, rc, eob;
@@ -27,7 +26,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
    short *coeff_ptr       = b->coeff;
    short *zbin_ptr        = b->zbin;
    short *round_ptr       = b->round;
-    short *quant_ptr       = b->quant_fast;
+    short *quant_ptr       = b->quant;
    short *quant_shift_ptr = b->quant_shift;
    short *qcoeff_ptr      = d->qcoeff;
    short *dqcoeff_ptr     = d->dqcoeff;
@@ -65,45 +64,6 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
    d->eob = eob + 1;
 }

-#else
-
-void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-{
-    int i, rc, eob;
-    int zbin;
-    int x, y, z, sz;
-    short *coeff_ptr   = b->coeff;
-    short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant_fast;
-    short *qcoeff_ptr  = d->qcoeff;
-    short *dqcoeff_ptr = d->dqcoeff;
-    short *dequant_ptr = d->dequant;
-
-    eob = -1;
-    for (i = 0; i < 16; i++)
-    {
-        rc   = vp8_default_zig_zag1d[i];
-        z    = coeff_ptr[rc];
-
-        sz = (z >> 31);                                 // sign of z
-        x  = (z ^ sz) - sz;                             // x = abs(z)
-
-        y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc] = x;                          // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
-
-        if (y)
-        {
-            eob = i;                                // last nonzero coeffs
-        }
-    }
-    d->eob = eob + 1;
-}
-
-#endif
-
-#ifdef EXACT_QUANT
 void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
 {
    int i, rc, eob;
@@ -218,6 +178,39 @@ void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d)
 }

 #else
+// Fast-path quantizer for one block of 16 coefficients: no zero-bin test,
+// just add b->round[], multiply by b->quant[] and shift right by 16.
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+    short *src      = b->coeff;
+    short *rounding = b->round;
+    short *scale    = b->quant;
+    short *q_out    = d->qcoeff;
+    short *dq_out   = d->dqcoeff;
+    short *dq_step  = d->dequant;
+    int last_nz = -1;                                  // scan index of the last nonzero level
+    int n;
+
+    // visit the coefficients in the order given by vp8_default_zig_zag1d
+    for (n = 0; n < 16; n++)
+    {
+        int pos  = vp8_default_zig_zag1d[n];
+        int coef = src[pos];
+        int sign = (coef >> 31);                       // sign of coef
+        int mag  = (coef ^ sign) - sign;               // mag = abs(coef)
+        int lvl  = ((mag + rounding[pos]) * scale[pos]) >> 16; // quantize (mag)
+        int out  = (lvl ^ sign) - sign;                // get the sign back
+
+        q_out[pos]  = out;                             // write to destination
+        dq_out[pos] = out * dq_step[pos];              // dequantized value
+
+        if (lvl)
+            last_nz = n;
+    }
+
+    // eob is one past the last nonzero level in scan order (0 when every level is zero)
+    d->eob = last_nz + 1;
+}

 void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
 {
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -45,48 +45,46 @@ extern int inter_b_modes[10];
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS    9

-// Work in progress recalibration of baseline rate tables based on
-// the assumption that bits per mb is inversely proportional to the
-// quantizer value.
 const int vp8_bits_per_mb[2][QINDEX_RANGE] =
 {
-    // Intra case 450000/Qintra
+    // (Updated 19 March 08) Baseline estimate of INTRA-frame Bits Per MB at each Q:
    {
-        1125000,900000, 750000, 642857, 562500, 500000, 450000, 450000,
-        409090, 375000, 346153, 321428, 300000, 281250, 264705, 264705,
-        250000, 236842, 225000, 225000, 214285, 214285, 204545, 204545,
-        195652, 195652, 187500, 180000, 180000, 173076, 166666, 160714,
-        155172, 150000, 145161, 140625, 136363, 132352, 128571, 125000,
-        121621, 121621, 118421, 115384, 112500, 109756, 107142, 104651,
-        102272, 100000, 97826,  97826,  95744,  93750,  91836,  90000,
-        88235,  86538,  84905,  83333,  81818,  80357,  78947,  77586,
-        76271,  75000,  73770,  72580,  71428,  70312,  69230,  68181,
-        67164,  66176,  65217,  64285,  63380,  62500,  61643,  60810,
-        60000,  59210,  59210,  58441,  57692,  56962,  56250,  55555,
-        54878,  54216,  53571,  52941,  52325,  51724,  51136,  50561,
-        49450,  48387,  47368,  46875,  45918,  45000,  44554,  44117,
-        43269,  42452,  41666,  40909,  40178,  39473,  38793,  38135,
-        36885,  36290,  35714,  35156,  34615,  34090,  33582,  33088,
-        32608,  32142,  31468,  31034,  30405,  29801,  29220,  28662,
+        674781, 606845, 553905, 524293, 500428, 452540, 435379, 414719,
+        390970, 371082, 359416, 341807, 336957, 317263, 303724, 298402,
+        285688, 275237, 268455, 262560, 256038, 248734, 241087, 237615,
+        229247, 225211, 219112, 213920, 211559, 202714, 198482, 193401,
+        187866, 183453, 179212, 175965, 171852, 167235, 163972, 160560,
+        156032, 154349, 151390, 148725, 145708, 142311, 139981, 137700,
+        134084, 131863, 129746, 128498, 126077, 123461, 121290, 117782,
+        114883, 112332, 108410, 105685, 103434, 101192,  98587,  95959,
+        94059,  92017,  89970,  87936,  86142,  84801,  82736,  81106,
+        79668,  78135,  76641,  75103,  73943,  72693,  71401,  70098,
+        69165,  67901,  67170,  65987,  64923,  63534,  62378,  61302,
+        59921,  58941,  57844,  56782,  55960,  54973,  54257,  53454,
+        52230,  50938,  49962,  49190,  48288,  47270,  46738,  46037,
+        45020,  44027,  43216,  42287,  41594,  40702,  40081,  39414,
+        38282,  37627,  36987,  36375,  35808,  35236,  34710,  34162,
+        33659,  33327,  32751,  32384,  31936,  31461,  30982,  30582,
    },
-    // Inter case 285000/Qinter
+
+    // (Updated 19 March 08) Baseline estimate of INTER-frame Bits Per MB at each Q:
    {
-        712500, 570000, 475000, 407142, 356250, 316666, 285000, 259090,
-        237500, 219230, 203571, 190000, 178125, 167647, 158333, 150000,
-        142500, 135714, 129545, 123913, 118750, 114000, 109615, 105555,
-        101785, 98275,  95000,  91935,  89062,  86363,  83823,  81428,
-        79166,  77027,  75000,  73076,  71250,  69512,  67857,  66279,
-        64772,  63333,  61956,  60638,  59375,  58163,  57000,  55882,
-        54807,  53773,  52777,  51818,  50892,  50000,  49137,  47500,
-        45967,  44531,  43181,  41911,  40714,  39583,  38513,  37500,
-        36538,  35625,  34756,  33928,  33139,  32386,  31666,  30978,
-        30319,  29687,  29081,  28500,  27941,  27403,  26886,  26388,
-        25909,  25446,  25000,  24568,  23949,  23360,  22800,  22265,
-        21755,  21268,  20802,  20357,  19930,  19520,  19127,  18750,
-        18387,  18037,  17701,  17378,  17065,  16764,  16473,  16101,
-        15745,  15405,  15079,  14766,  14467,  14179,  13902,  13636,
-        13380,  13133,  12895,  12666,  12445,  12179,  11924,  11632,
-        11445,  11220,  11003,  10795,  10594,  10401,  10215,  10035,
+        497401, 426316, 372064, 352732, 335763, 283921, 273848, 253321,
+        233181, 217727, 210030, 196685, 194836, 178396, 167753, 164116,
+        154119, 146929, 142254, 138488, 133591, 127741, 123166, 120226,
+        114188, 111756, 107882, 104749, 102522,  96451,  94424,  90905,
+        87286,  84931,  82111,  80534,  77610,  74700,  73037,  70715,
+        68006,  67235,  65374,  64009,  62134,  60180,  59105,  57691,
+        55509,  54512,  53318,  52693,  51194,  49840,  48944,  46980,
+        45668,  44177,  42348,  40994,  39859,  38889,  37717,  36391,
+        35482,  34622,  33795,  32756,  32002,  31492,  30573,  29737,
+        29152,  28514,  27941,  27356,  26859,  26329,  25874,  25364,
+        24957,  24510,  24290,  23689,  23380,  22845,  22481,  22066,
+        21587,  21219,  20880,  20452,  20260,  19926,  19661,  19334,
+        18915,  18391,  18046,  17833,  17441,  17105,  16888,  16729,
+        16383,  16023,  15706,  15442,  15222,  14938,  14673,  14452,
+        14005,  13807,  13611,  13447,  13223,  13102,  12963,  12801,
+        12627,  12534,  12356,  12228,  12056,  11907,  11746,  11643,
    }
 };

@@ -326,7 +324,6 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
        cpi->frames_till_gf_update_due = cpi->goldfreq;

    cpi->common.refresh_golden_frame = TRUE;
-    cpi->common.refresh_alt_ref_frame = TRUE;
 }

 void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi)
@@ -1037,7 +1034,9 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi)
            gf_frame_useage = pct_gf_active;

        // Is a fixed manual GF frequency being used
-        if (cpi->auto_gold)
+        if (!cpi->auto_gold)
+            cpi->common.refresh_golden_frame = TRUE;
+        else
        {
            // For one pass throw a GF if recent frame intra useage is low or the GF useage is high
            if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
--- a/vp8/encoder/sad_c.c
+++ b/vp8/encoder/sad_c.c
@@ -126,24 +126,6 @@ void vp8_sad16x16x3_c(
    sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
 }

-void vp8_sad16x16x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned short *sad_array
-)
-{
-    sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
-    sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
-    sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
-    sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
-    sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
-    sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
-}
-
 void vp8_sad16x8x3_c(
    const unsigned char *src_ptr,
    int  src_stride,
@@ -157,24 +139,6 @@ void vp8_sad16x8x3_c(
    sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
 }

-void vp8_sad16x8x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned short *sad_array
-)
-{
-    sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
-    sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
-    sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
-    sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
-    sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
-    sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
-}
-
 void vp8_sad8x8x3_c(
    const unsigned char *src_ptr,
    int  src_stride,
@@ -188,24 +152,6 @@ void vp8_sad8x8x3_c(
    sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
 }

-void vp8_sad8x8x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned short *sad_array
-)
-{
-    sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
-    sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
-    sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
-    sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
-    sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
-    sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
-}
-
 void vp8_sad8x16x3_c(
    const unsigned char *src_ptr,
    int  src_stride,
@@ -219,24 +165,6 @@ void vp8_sad8x16x3_c(
    sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
 }

-void vp8_sad8x16x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned short *sad_array
-)
-{
-    sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
-    sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
-    sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
-    sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
-    sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
-    sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
-}
-
 void vp8_sad4x4x3_c(
    const unsigned char *src_ptr,
    int  src_stride,
@@ -250,24 +178,6 @@ void vp8_sad4x4x3_c(
    sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
 }

-void vp8_sad4x4x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned short *sad_array
-)
-{
-    sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
-    sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
-    sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
-    sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
-    sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
-    sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
-}
-
 void vp8_sad16x16x4d_c(
    const unsigned char *src_ptr,
    int  src_stride,
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -36,37 +36,30 @@

 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
 #define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
-#define USE_FILTER_LUT 0         // use lookup table to improve filter

+#define USE_FILTER_LUT 1
 #if VP8_TEMPORAL_ALT_REF

 #if USE_FILTER_LUT
-// for (strength = 0; strength <= 6; strength++) {
-//   for (delta = 0; delta <= 18; delta++) {
-//     float coeff = (3.0 * delta * delta) / pow(2, strength);
-//     printf("%3d", (int)roundf(coeff > 16 ? 0 : 16-coeff));
-//   }
-//   printf("\n");
-// }
 static int modifier_lut[7][19] =
 {
    // Strength=0
-    {16, 13,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {16, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    // Strength=1
-    {16, 15, 10,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {16, 15, 10, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    // Strength=2
-    {16, 15, 13,  9,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {16, 15, 13, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    // Strength=3
-    {16, 16, 15, 13, 10,  7,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {16, 16, 15, 13, 10, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    // Strength=4
-    {16, 16, 15, 14, 13, 11,  9,  7,  4,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {16, 16, 15, 14, 13, 11, 9, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    // Strength=5
-    {16, 16, 16, 15, 15, 14, 13, 11, 10,  8,  7,  5,  3,  0,  0,  0,  0,  0,  0},
+    {16, 16, 16, 15, 15, 14, 13, 11, 10, 8, 7, 5, 3, 0, 0, 0, 0, 0, 0},
    // Strength=6
-    {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10,  9,  8,  7,  5,  4,  2,  1}
+    {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 2, 1}
 };
 #endif
-static void vp8_temporal_filter_predictors_mb_c
+static void build_predictors_mb
 (
    MACROBLOCKD *x,
    unsigned char *y_mb_ptr,
@@ -118,7 +111,7 @@ static void vp8_temporal_filter_predictors_mb_c
        RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, stride, &pred[320], 8);
    }
 }
-void vp8_temporal_filter_apply_c
+static void apply_temporal_filter
 (
    unsigned char *frame1,
    unsigned int stride,
@@ -147,14 +140,16 @@ void vp8_temporal_filter_apply_c
            int pixel_value = *frame2++;

 #if USE_FILTER_LUT
+            // LUT implementation --
+            // improves precision of filter
            modifier = abs(src_byte-pixel_value);
            modifier = modifier>18 ? 0 : lut[modifier];
 #else
-            modifier   = src_byte - pixel_value;
+            modifier   = src_byte;
+            modifier  -= pixel_value;
            modifier  *= modifier;
-            modifier  *= 3;
-            modifier  += 1 << (strength - 1);
            modifier >>= strength;
+            modifier  *= 3;

            if (modifier > 16)
                modifier = 16;
@@ -176,7 +171,7 @@ void vp8_temporal_filter_apply_c
 #if ALT_REF_MC_ENABLED
 static int dummy_cost[2*mv_max+1];

-static int vp8_temporal_filter_find_matching_mb_c
+static int find_matching_mb
 (
    VP8_COMP *cpi,
    YV12_BUFFER_CONFIG *arf_frame,
@@ -251,7 +246,7 @@ static int vp8_temporal_filter_find_matching_mb_c
            step_param,
            sadpb / 2/*x->errorperbit*/,
            &num00, &cpi->fn_ptr[BLOCK_16X16],
-            mvsadcost, mvcost, &best_ref_mv1); //sadpb < 9
+            mvsadcost, mvcost); //sadpb < 9

        // Further step/diamond searches as necessary
        n = 0;
@@ -273,7 +268,7 @@ static int vp8_temporal_filter_find_matching_mb_c
                    step_param + n,
                    sadpb / 4/*x->errorperbit*/,
                    &num00, &cpi->fn_ptr[BLOCK_16X16],
-                    mvsadcost, mvcost, &best_ref_mv1); //sadpb = 9
+                    mvsadcost, mvcost); //sadpb = 9

                if (thissme < bestsme)
                {
@@ -297,7 +292,7 @@ static int vp8_temporal_filter_find_matching_mb_c
        bestsme = cpi->find_fractional_mv_step(x, b, d,
                    &d->bmi.mv.as_mv, &best_ref_mv1,
                    x->errorperbit, &cpi->fn_ptr[BLOCK_16X16],
-                    mvcost);
+                    cpi->mb.mvcost);
    }
 #endif

@@ -313,7 +308,7 @@ static int vp8_temporal_filter_find_matching_mb_c
 }
 #endif

-static void vp8_temporal_filter_iterate_c
+static void vp8cx_temp_blur1_c
 (
    VP8_COMP *cpi,
    int frame_count,
@@ -417,12 +412,11 @@ static void vp8_temporal_filter_iterate_c
 #define THRESH_HIGH  20000

                    // Correlation has been lost try MC
-                    err = vp8_temporal_filter_find_matching_mb_c
-                        (cpi,
-                         cpi->frames[alt_ref_index],
-                         cpi->frames[frame],
-                         mb_y_offset,
-                         THRESH_LOW);
+                    err = find_matching_mb ( cpi,
+                                             cpi->frames[alt_ref_index],
+                                             cpi->frames[frame],
+                                             mb_y_offset,
+                                             THRESH_LOW );

                    if (filter_weight[frame] < 2)
                    {
@@ -435,46 +429,43 @@ static void vp8_temporal_filter_iterate_c
                if (filter_weight[frame] != 0)
                {
                    // Construct the predictors
-                    vp8_temporal_filter_predictors_mb_c
-                        (mbd,
-                         cpi->frames[frame]->y_buffer + mb_y_offset,
-                         cpi->frames[frame]->u_buffer + mb_uv_offset,
-                         cpi->frames[frame]->v_buffer + mb_uv_offset,
-                         cpi->frames[frame]->y_stride,
-                         mbd->block[0].bmi.mv.as_mv.row,
-                         mbd->block[0].bmi.mv.as_mv.col,
-                         predictor);
+                    build_predictors_mb (
+                              mbd,
+                              cpi->frames[frame]->y_buffer + mb_y_offset,
+                              cpi->frames[frame]->u_buffer + mb_uv_offset,
+                              cpi->frames[frame]->v_buffer + mb_uv_offset,
+                              cpi->frames[frame]->y_stride,
+                              mbd->block[0].bmi.mv.as_mv.row,
+                              mbd->block[0].bmi.mv.as_mv.col,
+                              predictor );

                    // Apply the filter (YUV)
-                    TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
-                        (f->y_buffer + mb_y_offset,
-                         f->y_stride,
-                         predictor,
-                         16,
-                         strength,
-                         filter_weight[frame],
-                         accumulator,
-                         count);
+                    apply_temporal_filter ( f->y_buffer + mb_y_offset,
+                                            f->y_stride,
+                                            predictor,
+                                            16,
+                                            strength,
+                                            filter_weight[frame],
+                                            accumulator,
+                                            count );

-                    TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
-                        (f->u_buffer + mb_uv_offset,
-                         f->uv_stride,
-                         predictor + 256,
-                         8,
-                         strength,
-                         filter_weight[frame],
-                         accumulator + 256,
-                         count + 256);
+                    apply_temporal_filter ( f->u_buffer + mb_uv_offset,
+                                            f->uv_stride,
+                                            predictor + 256,
+                                            8,
+                                            strength,
+                                            filter_weight[frame],
+                                            accumulator + 256,
+                                            count + 256 );

-                    TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
-                        (f->v_buffer + mb_uv_offset,
-                         f->uv_stride,
-                         predictor + 320,
-                         8,
-                         strength,
-                         filter_weight[frame],
-                         accumulator + 320,
-                         count + 320);
+                    apply_temporal_filter ( f->v_buffer + mb_uv_offset,
+                                            f->uv_stride,
+                                            predictor + 320,
+                                            8,
+                                            strength,
+                                            filter_weight[frame],
+                                            accumulator + 320,
+                                            count + 320 );
                }
            }

@@ -543,7 +534,7 @@ static void vp8_temporal_filter_iterate_c
    mbd->pre.v_buffer = v_buffer;
 }

-void vp8_temporal_filter_prepare_c
+void vp8cx_temp_filter_c
 (
    VP8_COMP *cpi
 )
@@ -651,7 +642,7 @@ void vp8_temporal_filter_prepare_c
                = &cpi->src_buffer[which_buffer].source_buffer;
    }

-    vp8_temporal_filter_iterate_c (
+    vp8cx_temp_blur1_c (
        cpi,
        frames_to_blur,
        frames_to_blur_backward,
--- a/vp8/encoder/temporal_filter.h
+++ b/vp8/encoder/temporal_filter.h
@@ -12,33 +12,8 @@
 #ifndef __INC_VP8_TEMPORAL_FILTER_H
 #define __INC_VP8_TEMPORAL_FILTER_H

-#define prototype_apply(sym)\
-    void (sym) \
-    ( \
-     unsigned char *frame1, \
-     unsigned int stride, \
-     unsigned char *frame2, \
-     unsigned int block_size, \
-     int strength, \
-     int filter_weight, \
-     unsigned int *accumulator, \
-     unsigned int *count \
-    )
+#include "onyx_int.h"

-#ifndef vp8_temporal_filter_apply
-#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
-#endif
-extern prototype_apply(vp8_temporal_filter_apply);
-
-typedef struct
-{
-    prototype_apply(*apply);
-} vp8_temporal_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define TEMPORAL_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define TEMPORAL_INVOKE(ctx,fn) vp8_temporal_filter_##fn
-#endif
+void vp8cx_temp_filter_c(VP8_COMP *cpi);

 #endif // __INC_VP8_TEMPORAL_FILTER_H
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -132,6 +132,8 @@ static void tokenize2nd_order_b
        t->Token = x;
        t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];

+        t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0);
+
        t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));

        ++cpi->coef_counts       [type] [band] [pt] [x];
@@ -183,6 +185,7 @@ static void tokenize1st_order_b
        t->Token = x;
        t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];

+        t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0);
        t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));

        ++cpi->coef_counts       [type] [band] [pt] [x];
@@ -431,6 +434,7 @@ static __inline void stuff2nd_order_b

    t->Token = DCT_EOB_TOKEN;
    t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+    t->section = 11;
    t->skip_eob_node = 0;
    ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
    ++t;
@@ -461,6 +465,7 @@ static __inline void stuff1st_order_b

    t->Token = DCT_EOB_TOKEN;
    t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt];
+    t->section = 8;
    t->skip_eob_node = 0;
    ++cpi->coef_counts       [0] [1] [pt] [DCT_EOB_TOKEN];
    ++t;
@@ -490,6 +495,7 @@ void stuff1st_order_buv

    t->Token = DCT_EOB_TOKEN;
    t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+    t->section = 13;
    t->skip_eob_node = 0;
    ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
    ++t;
--- a/vp8/encoder/tokenize.h
+++ b/vp8/encoder/tokenize.h
@@ -25,10 +25,11 @@ typedef struct

 typedef struct
 {
+    int Token;
+    int Extra;
    const vp8_prob *context_tree;
-    short           Extra;
-    unsigned char   Token;
-    unsigned char   skip_eob_node;
+    int skip_eob_node;
+    int section;
 } TOKENEXTRA;

 int rd_cost_mby(MACROBLOCKD *);
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -32,16 +32,6 @@
     unsigned int *sad_array\
    )

-#define prototype_sad_multi_same_address_1(sym)\
-    void (sym)\
-    (\
-     const unsigned char *src_ptr, \
-     int source_stride, \
-     const unsigned char *ref_ptr, \
-     int  ref_stride, \
-     unsigned short *sad_array\
-    )
-
 #define prototype_sad_multi_dif_address(sym)\
    void (sym)\
    (\
@@ -148,31 +138,6 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
 #endif
 extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);

-#ifndef vp8_variance_sad16x16x8
-#define vp8_variance_sad16x16x8 vp8_sad16x16x8_c
-#endif
-extern prototype_sad_multi_same_address_1(vp8_variance_sad16x16x8);
-
-#ifndef vp8_variance_sad16x8x8
-#define vp8_variance_sad16x8x8 vp8_sad16x8x8_c
-#endif
-extern prototype_sad_multi_same_address_1(vp8_variance_sad16x8x8);
-
-#ifndef vp8_variance_sad8x8x8
-#define vp8_variance_sad8x8x8 vp8_sad8x8x8_c
-#endif
-extern prototype_sad_multi_same_address_1(vp8_variance_sad8x8x8);
-
-#ifndef vp8_variance_sad8x16x8
-#define vp8_variance_sad8x16x8 vp8_sad8x16x8_c
-#endif
-extern prototype_sad_multi_same_address_1(vp8_variance_sad8x16x8);
-
-#ifndef vp8_variance_sad4x4x8
-#define vp8_variance_sad4x4x8 vp8_sad4x4x8_c
-#endif
-extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8);
-
 //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

 #ifndef vp8_variance_sad16x16x4d
@@ -309,7 +274,6 @@ extern prototype_sad(vp8_variance_get4x4sse_cs);

 typedef prototype_sad(*vp8_sad_fn_t);
 typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
-typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
 typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t);
 typedef prototype_variance(*vp8_variance_fn_t);
 typedef prototype_variance2(*vp8_variance2_fn_t);
@@ -353,12 +317,6 @@ typedef struct
    vp8_sad_multi_fn_t       sad8x8x3;
    vp8_sad_multi_fn_t       sad4x4x3;

-    vp8_sad_multi1_fn_t      sad16x16x8;
-    vp8_sad_multi1_fn_t      sad16x8x8;
-    vp8_sad_multi1_fn_t      sad8x16x8;
-    vp8_sad_multi1_fn_t      sad8x8x8;
-    vp8_sad_multi1_fn_t      sad4x4x8;
-
    vp8_sad_multi_d_fn_t     sad16x16x4d;
    vp8_sad_multi_d_fn_t     sad16x8x4d;
    vp8_sad_multi_d_fn_t     sad8x16x4d;
@@ -376,7 +334,6 @@ typedef struct
    vp8_variance_fn_t       svf_halfpix_v;
    vp8_variance_fn_t       svf_halfpix_hv;
    vp8_sad_multi_fn_t      sdx3f;
-    vp8_sad_multi1_fn_t     sdx8f;
    vp8_sad_multi_d_fn_t    sdx4df;
 } vp8_variance_fn_ptr_t;

--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -11,231 +11,511 @@

 %include "vpx_ports/x86_abi_support.asm"

+section .text
+    global sym(vp8_short_fdct4x4_mmx)
+    global sym(vp8_short_fdct8x4_wmt)
+
+
+%define         DCTCONSTANTSBITS         (16)
+%define         DCTROUNDINGVALUE         (1<< (DCTCONSTANTSBITS-1))
+%define         x_c1                      (60547)          ; cos(pi  /8) * (1<<16)
+%define         x_c2                      (46341)          ; cos(pi*2/8) * (1<<16)
+%define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<16)
+
+
 ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
-global sym(vp8_short_fdct4x4_mmx)
 sym(vp8_short_fdct4x4_mmx):
    push        rbp
-    mov         rbp,        rsp
+    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
-    push        rsi
-    push        rdi
+    push rsi
+    push rdi
    ; end prolog
+        mov     rsi,    arg(0) ;input
+        mov     rdi,    arg(1) ;output

-        mov         rsi,        arg(0)      ; input
-        mov         rdi,        arg(1)      ; output
+        lea     rdx,    [GLOBAL(dct_const_mmx)]
+        movsxd  rax,    dword ptr arg(2) ;pitch

-        movsxd      rax,        dword ptr arg(2) ;pitch
-
-        lea         rcx,        [rsi + rax*2]
+        lea     rcx,    [rsi + rax*2]
        ; read the input data
-        movq        mm0,        [rsi]
-        movq        mm1,        [rsi + rax]
+        movq    mm0,    [rsi]
+        movq    mm1,    [rsi + rax    ]

-        movq        mm2,        [rcx]
-        movq        mm4,        [rcx + rax]
+        movq    mm2,    [rcx]
+        movq    mm3,    [rcx + rax]
+        ; get the constants
+        ;shift left by 3 for precision
+        psllw   mm0,    3
+        psllw   mm1,    3

-        ; transpose for the first stage
-        movq        mm3,        mm0         ; 00 01 02 03
-        movq        mm5,        mm2         ; 20 21 22 23
+        psllw   mm2,    3
+        psllw   mm3,    3

-        punpcklwd   mm0,        mm1         ; 00 10 01 11
-        punpckhwd   mm3,        mm1         ; 02 12 03 13
+        ; transpose for the first stage
+        movq    mm4,    mm0         ; 00 01 02 03
+        movq    mm5,    mm2         ; 20 21 22 23

-        punpcklwd   mm2,        mm4         ; 20 30 21 31
-        punpckhwd   mm5,        mm4         ; 22 32 23 33
+        punpcklwd   mm0,    mm1     ; 00 10 01 11
+        punpckhwd   mm4,    mm1     ; 02 12 03 13

-        movq        mm1,        mm0         ; 00 10 01 11
-        punpckldq   mm0,        mm2         ; 00 10 20 30
+        punpcklwd   mm2,    mm3     ; 20 30 21 31
+        punpckhwd   mm5,    mm3     ; 22 32 23 33

-        punpckhdq   mm1,        mm2         ; 01 11 21 31

-        movq        mm2,        mm3         ; 02 12 03 13
-        punpckldq   mm2,        mm5         ; 02 12 22 32
+        movq        mm1,    mm0     ; 00 10 01 11
+        punpckldq   mm0,    mm2     ; 00 10 20 30

-        punpckhdq   mm3,        mm5         ; 03 13 23 33
+        punpckhdq   mm1,    mm2     ; 01 11 21 31
+
+        movq        mm2,    mm4     ; 02 12 03 13
+        punpckldq   mm2,    mm5     ; 02 12 22 32
+
+        punpckhdq   mm4,    mm5     ; 03 13 23 33
+        movq        mm3,    mm4

-        ; mm0 0
-        ; mm1 1
-        ; mm2 2
-        ; mm3 3

        ; first stage
-        movq        mm5,        mm0
-        movq        mm4,        mm1
+        movq    mm5,    mm0
+        movq    mm4,    mm1

-        paddw       mm0,        mm3         ; a1 = 0 + 3
-        paddw       mm1,        mm2         ; b1 = 1 + 2
+        paddw   mm0,    mm3         ; a = 0 + 3
+        paddw   mm1,    mm2         ; b = 1 + 2

-        psubw       mm4,        mm2         ; c1 = 1 - 2
-        psubw       mm5,        mm3         ; d1 = 0 - 3
+        psubw   mm4,    mm2         ; c = 1 - 2
+        psubw   mm5,    mm3         ; d = 0 - 3

-        psllw       mm5,        3
-        psllw       mm4,        3
-
-        psllw       mm0,        3
-        psllw       mm1,        3

        ; output 0 and 2
-        movq        mm2,        mm0         ; a1
+        movq    mm6,    [rdx +  16] ; c2
+        movq    mm2,    mm0         ; a

-        paddw       mm0,        mm1         ; op[0] = a1 + b1
-        psubw       mm2,        mm1         ; op[2] = a1 - b1
+        paddw   mm0,    mm1         ; a + b
+        psubw   mm2,    mm1         ; a - b
+
+        movq    mm1,    mm0         ; a + b
+        pmulhw  mm0,    mm6         ; 00 01 02 03
+
+        paddw   mm0,    mm1         ; output 00 01 02 03
+        pmulhw  mm6,    mm2         ; 20 21 22 23
+
+        paddw   mm2,    mm6         ; output 20 21 22 23

        ; output 1 and 3
-        ; interleave c1, d1
-        movq        mm1,        mm5         ; d1
-        punpcklwd   mm1,        mm4         ; c1 d1
-        punpckhwd   mm5,        mm4         ; c1 d1
+        movq    mm6,    [rdx +  8]  ; c1
+        movq    mm7,    [rdx + 24]  ; c3

-        movq        mm3,        mm1
-        movq        mm4,        mm5
+        movq    mm1,    mm4         ; c
+        movq    mm3,    mm5         ; d

-        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmulhw  mm1,    mm7         ; c * c3
+        pmulhw  mm3,    mm6         ; d * c1

-        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        paddw   mm3,    mm5         ; d * c1 rounded
+        paddw   mm1,    mm3         ; output 10 11 12 13

-        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
-        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
-        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
-        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]
+        movq    mm3,    mm4         ; c
+        pmulhw  mm5,    mm7         ; d * c3

-        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        pmulhw  mm4,    mm6         ; c * c1
+        paddw   mm3,    mm4         ; round c* c1
+
+        psubw   mm5,    mm3         ; output 30 31 32 33
+        movq    mm3,    mm5

-        packssdw    mm1,        mm4         ; op[1]
-        packssdw    mm3,        mm5         ; op[3]

        ; done with vertical
        ; transpose for the second stage
-        movq        mm4,        mm0         ; 00 10 20 30
-        movq        mm5,        mm2         ; 02 12 22 32
+        movq    mm4,    mm0         ; 00 01 02 03
+        movq    mm5,    mm2         ; 20 21 22 23

-        punpcklwd   mm0,        mm1         ; 00 01 10 11
-        punpckhwd   mm4,        mm1         ; 20 21 30 31
+        punpcklwd   mm0,    mm1     ; 00 10 01 11
+        punpckhwd   mm4,    mm1     ; 02 12 03 13

-        punpcklwd   mm2,        mm3         ; 02 03 12 13
-        punpckhwd   mm5,        mm3         ; 22 23 32 33
+        punpcklwd   mm2,    mm3     ; 20 30 21 31
+        punpckhwd   mm5,    mm3     ; 22 32 23 33

-        movq        mm1,        mm0         ; 00 01 10 11
-        punpckldq   mm0,        mm2         ; 00 01 02 03

-        punpckhdq   mm1,        mm2         ; 01 22 12 13
+        movq        mm1,    mm0     ; 00 10 01 11
+        punpckldq   mm0,    mm2     ; 00 10 20 30

-        movq        mm2,        mm4         ; 20 31 30 31
-        punpckldq   mm2,        mm5         ; 20 21 22 23
+        punpckhdq   mm1,    mm2     ; 01 11 21 31

-        punpckhdq   mm4,        mm5         ; 30 31 32 33
+        movq        mm2,    mm4     ; 02 12 03 13
+        punpckldq   mm2,    mm5     ; 02 12 22 32

-        ; mm0 0
-        ; mm1 1
-        ; mm2 2
-        ; mm3 4
+        punpckhdq   mm4,    mm5     ; 03 13 23 33
+        movq        mm3,    mm4

-        movq        mm5,        mm0
-        movq        mm3,        mm1

-        paddw       mm0,        mm4         ; a1 = 0 + 3
-        paddw       mm1,        mm2         ; b1 = 1 + 2
+        ; second stage
+        movq    mm5,    mm0
+        movq    mm4,    mm1

-        psubw       mm3,        mm2         ; c1 = 1 - 2
-        psubw       mm5,        mm4         ; d1 = 0 - 3
+        paddw   mm0,    mm3         ; a = 0 + 3
+        paddw   mm1,    mm2         ; b = 1 + 2

-        pxor        mm6,        mm6         ; zero out for compare
+        psubw   mm4,    mm2         ; c = 1 - 2
+        psubw   mm5,    mm3         ; d = 0 - 3

-        pcmpeqw     mm6,        mm5         ; d1 != 0
-
-        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; clear upper,
-                                                                ; and keep bit 0 of lower

        ; output 0 and 2
-        movq        mm2,        mm0         ; a1
+        movq    mm6,    [rdx +  16] ; c2
+        movq    mm2,    mm0         ; a
+        paddw   mm0,    mm1         ; a + b

-        paddw       mm0,        mm1         ; a1 + b1
-        psubw       mm2,        mm1         ; a1 - b1
+        psubw   mm2,    mm1         ; a - b

-        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
-        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]
+        movq    mm1,    mm0         ; a + b
+        pmulhw  mm0,    mm6         ; 00 01 02 03

-        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
-        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
+        paddw   mm0,    mm1         ; output 00 01 02 03
+        pmulhw  mm6,    mm2         ; 20 21 22 23
+
+        paddw   mm2,    mm6         ; output 20 21 22 23

-        movq        MMWORD PTR[rdi + 0 ],  mm0
-        movq        MMWORD PTR[rdi + 16],  mm2

        ; output 1 and 3
-        ; interleave c1, d1
-        movq        mm1,        mm5         ; d1
-        punpcklwd   mm1,        mm3         ; c1 d1
-        punpckhwd   mm5,        mm3         ; c1 d1
+        movq    mm6,    [rdx +  8]  ; c1
+        movq    mm7,    [rdx + 24]  ; c3

-        movq        mm3,        mm1
-        movq        mm4,        mm5
+        movq    mm1,    mm4         ; c
+        movq    mm3,    mm5         ; d

-        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmulhw  mm1,    mm7         ; c * c3
+        pmulhw  mm3,    mm6         ; d * c1

-        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        paddw   mm3,    mm5         ; d * c1 rounded
+        paddw   mm1,    mm3         ; output 10 11 12 13

-        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
-        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
-        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
-        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]
+        movq    mm3,    mm4         ; c
+        pmulhw  mm5,    mm7         ; d * c3

-        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  14500)>>16
-        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  14500)>>16
-        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +   7500)>>16
-        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +   7500)>>16
+        pmulhw  mm4,    mm6         ; c * c1
+        paddw   mm3,    mm4         ; round c* c1

-        packssdw    mm1,        mm4         ; op[4]
-        packssdw    mm3,        mm5         ; op[12]
+        psubw   mm5,    mm3         ; output 30 31 32 33
+        movq    mm3,    mm5
+        ; done with vertical

-        paddw       mm1,        mm6         ; op[4] += (d1!=0)
+        pcmpeqw mm4,    mm4
+        pcmpeqw mm5,    mm5
+        psrlw   mm4,    15
+        psrlw   mm5,    15

-        movq        MMWORD PTR[rdi + 8 ],  mm1
-        movq        MMWORD PTR[rdi + 24],  mm3
+        psllw   mm4,    2
+        psllw   mm5,    2

-     ; begin epilog
-    pop         rdi
-    pop         rsi
+        paddw   mm0,    mm4
+        paddw   mm1,    mm5
+        paddw   mm2,    mm4
+        paddw   mm3,    mm5
+
+        psraw   mm0, 3
+        psraw   mm1, 3
+        psraw   mm2, 3
+        psraw   mm3, 3
+
+        movq        [rdi   ],   mm0
+        movq        [rdi+ 8],   mm1
+        movq        [rdi+16],   mm2
+        movq        [rdi+24],   mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

+
+;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_short_fdct8x4_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+        mov         rsi,    arg(0) ;input
+        mov         rdi,    arg(1) ;output
+
+        lea         rdx,    [GLOBAL(dct_const_xmm)]
+        movsxd      rax,    dword ptr arg(2) ;pitch
+
+        lea         rcx,    [rsi + rax*2]
+        ; read the input data
+        movdqa      xmm0,       [rsi]
+        movdqa      xmm2,       [rsi + rax]
+
+        movdqa      xmm4,       [rcx]
+        movdqa      xmm3,       [rcx + rax]
+        ; get the constants
+        ;shift left by 3 for precision
+        psllw       xmm0,        3
+        psllw       xmm2,        3
+
+        psllw       xmm4,        3
+        psllw       xmm3,        3
+
+        ; transpose for the first stage
+        movdqa      xmm1,       xmm0         ; 00 01 02 03 04 05 06 07
+        movdqa      xmm5,       xmm4         ; 20 21 22 23 24 25 26 27
+
+        punpcklwd   xmm0,       xmm2         ; 00 10 01 11 02 12 03 13
+        punpckhwd   xmm1,       xmm2         ; 04 14 05 15 06 16 07 17
+
+        punpcklwd   xmm4,       xmm3         ; 20 30 21 31 22 32 23 33
+        punpckhwd   xmm5,       xmm3         ; 24 34 25 35 26 36 27 37
+
+        movdqa      xmm2,       xmm0         ; 00 10 01 11 02 12 03 13
+        punpckldq   xmm0,       xmm4         ; 00 10 20 30 01 11 21 31
+
+        punpckhdq   xmm2,       xmm4         ; 02 12 22 32 03 13 23 33
+
+
+        movdqa      xmm4,       xmm1         ; 04 14 05 15 06 16 07 17
+        punpckldq   xmm4,       xmm5         ; 04 14 24 34 05 15 25 35
+
+        punpckhdq   xmm1,       xmm5         ; 06 16 26 36 07 17 27 37
+        movdqa      xmm3,       xmm2         ; 02 12 22 32 03 13 23 33
+
+        punpckhqdq  xmm3,       xmm1         ; 03 13 23 33 07 17 27 37
+        punpcklqdq  xmm2,       xmm1         ; 02 12 22 32 06 16 26 36
+
+        movdqa      xmm1,       xmm0         ; 00 10 20 30 01 11 21 31
+        punpcklqdq  xmm0,       xmm4         ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1,       xmm4         ; 01 11 21 31 05 15 25 35
+
+        ; xmm0 0
+        ; xmm1 1
+        ; xmm2 2
+        ; xmm3 3
+
+        ; first stage
+        movdqa      xmm5,       xmm0
+        movdqa      xmm4,       xmm1
+
+        paddw       xmm0,       xmm3         ; a = 0 + 3
+        paddw       xmm1,       xmm2         ; b = 1 + 2
+
+        psubw       xmm4,       xmm2         ; c = 1 - 2
+        psubw       xmm5,       xmm3         ; d = 0 - 3
+
+
+        ; output 0 and 2
+        movdqa      xmm6,       [rdx +  32] ; c2
+        movdqa      xmm2,       xmm0         ; a
+
+        paddw       xmm0,       xmm1         ; a + b
+        psubw       xmm2,       xmm1         ; a - b
+
+        movdqa      xmm1,       xmm0         ; a + b
+        pmulhw      xmm0,       xmm6         ; 00 01 02 03
+
+        paddw       xmm0,       xmm1         ; output 00 01 02 03
+        pmulhw      xmm6,       xmm2         ; 20 21 22 23
+
+        paddw       xmm2,       xmm6         ; output 20 21 22 23
+
+        ; output 1 and 3
+        movdqa      xmm6,       [rdx + 16]  ; c1
+        movdqa      xmm7,       [rdx + 48]  ; c3
+
+        movdqa      xmm1,       xmm4         ; c
+        movdqa      xmm3,       xmm5         ; d
+
+        pmulhw      xmm1,       xmm7         ; c * c3
+        pmulhw      xmm3,       xmm6         ; d * c1
+
+        paddw       xmm3,       xmm5         ; d * c1 rounded
+        paddw       xmm1,       xmm3         ; output 10 11 12 13
+
+        movdqa      xmm3,       xmm4         ; c
+        pmulhw      xmm5,       xmm7         ; d * c3
+
+        pmulhw      xmm4,       xmm6         ; c * c1
+        paddw       xmm3,       xmm4         ; round c* c1
+
+        psubw       xmm5,       xmm3         ; output 30 31 32 33
+        movdqa      xmm3,       xmm5
+
+
+        ; done with vertical
+        ; transpose for the second stage
+        movdqa      xmm4,       xmm2         ; 02 12 22 32 06 16 26 36
+        movdqa      xmm2,       xmm1         ; 01 11 21 31 05 15 25 35
+
+        movdqa      xmm1,       xmm0         ; 00 10 20 30 04 14 24 34
+        movdqa      xmm5,       xmm4         ; 02 12 22 32 06 16 26 36
+
+        punpcklwd   xmm0,       xmm2         ; 00 01 10 11 20 21 30 31
+        punpckhwd   xmm1,       xmm2         ; 04 05 14 15 24 25 34 35
+
+        punpcklwd   xmm4,       xmm3         ; 02 03 12 13 22 23 32 33
+        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
+
+        movdqa      xmm2,       xmm0         ; 00 01 10 11 20 21 30 31
+        punpckldq   xmm0,       xmm4         ; 00 01 02 03 10 11 12 13
+
+        punpckhdq   xmm2,       xmm4         ; 20 21 22 23 30 31 32 33
+
+
+        movdqa      xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
+        punpckldq   xmm4,       xmm5         ; 04 05 06 07 14 15 16 17
+
+        punpckhdq   xmm1,       xmm5         ; 24 25 26 27 34 35 36 37
+        movdqa      xmm3,       xmm2         ; 20 21 22 23 30 31 32 33
+
+        punpckhqdq  xmm3,       xmm1         ; 30 31 32 33 34 35 36 37
+        punpcklqdq  xmm2,       xmm1         ; 20 21 22 23 24 25 26 27
+
+        movdqa      xmm1,       xmm0         ; 00 01 02 03 10 11 12 13
+        punpcklqdq  xmm0,       xmm4         ; 00 01 02 03 04 05 06 07
+
+        punpckhqdq  xmm1,       xmm4         ; 10 11 12 13 14 15 16 17
+
+        ; second stage
+        movdqa      xmm5,       xmm0
+        movdqa      xmm4,       xmm1
+
+        paddw       xmm0,       xmm3         ; a = 0 + 3
+        paddw       xmm1,       xmm2         ; b = 1 + 2
+
+        psubw       xmm4,       xmm2         ; c = 1 - 2
+        psubw       xmm5,       xmm3         ; d = 0 - 3
+
+
+        ; output 0 and 2
+        movdqa      xmm6,       [rdx +  32] ; c2
+        movdqa      xmm2,       xmm0         ; a
+
+        paddw       xmm0,       xmm1         ; a + b
+        psubw       xmm2,       xmm1         ; a - b
+
+        movdqa      xmm1,       xmm0         ; a + b
+        pmulhw      xmm0,       xmm6         ; 00 01 02 03
+
+        paddw       xmm0,       xmm1         ; output 00 01 02 03
+        pmulhw      xmm6,       xmm2         ; 20 21 22 23
+
+        paddw       xmm2,       xmm6         ; output 20 21 22 23
+
+        ; output 1 and 3
+        movdqa      xmm6,       [rdx + 16]  ; c1
+        movdqa      xmm7,       [rdx + 48]  ; c3
+
+        movdqa      xmm1,       xmm4         ; c
+        movdqa      xmm3,       xmm5         ; d
+
+        pmulhw      xmm1,       xmm7         ; c * c3
+        pmulhw      xmm3,       xmm6         ; d * c1
+
+        paddw       xmm3,       xmm5         ; d * c1 rounded
+        paddw       xmm1,       xmm3         ; output 10 11 12 13
+
+        movdqa      xmm3,       xmm4         ; c
+        pmulhw      xmm5,       xmm7         ; d * c3
+
+        pmulhw      xmm4,       xmm6         ; c * c1
+        paddw       xmm3,       xmm4         ; round c* c1
+
+        psubw       xmm5,       xmm3         ; output 30 31 32 33
+        movdqa      xmm3,       xmm5
+        ; done with vertical
+
+
+        pcmpeqw     xmm4,       xmm4
+        pcmpeqw     xmm5,       xmm5;
+        psrlw       xmm4,       15
+        psrlw       xmm5,       15
+
+        psllw       xmm4,       2
+        psllw       xmm5,       2
+
+        paddw       xmm0,       xmm4
+        paddw       xmm1,       xmm5
+        paddw       xmm2,       xmm4
+        paddw       xmm3,       xmm5
+
+        psraw       xmm0,       3
+        psraw       xmm1,       3
+        psraw       xmm2,       3
+        psraw       xmm3,       3
+
+        movq        QWORD PTR[rdi   ],   xmm0
+        movq        QWORD PTR[rdi+ 8],   xmm1
+        movq        QWORD PTR[rdi+16],   xmm2
+        movq        QWORD PTR[rdi+24],   xmm3
+
+        psrldq      xmm0,       8
+        psrldq      xmm1,       8
+        psrldq      xmm2,       8
+        psrldq      xmm3,       8
+
+        movq        QWORD PTR[rdi+32],   xmm0
+        movq        QWORD PTR[rdi+40],   xmm1
+        movq        QWORD PTR[rdi+48],   xmm2
+        movq        QWORD PTR[rdi+56],   xmm3
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
 SECTION_RODATA
-align 8
-_5352_2217:
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-align 8
-_2217_neg5352:
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-align 8
-_cmp_mask:
-    times 4 dw 1
-align 8
-_7w:
-    times 4 dw 7
-align 8
-_14500:
-    times 2 dd 14500
-align 8
-_7500:
-    times 2 dd 7500
-align 8
-_12000:
-    times 2 dd 12000
-align 8
-_51000:
-    times 2 dd 51000
+;static const unsigned int dct1st_stage_rounding_mmx[2] =
+align 16
+dct1st_stage_rounding_mmx:
+    times 2 dd 8192
+
+
+;static const unsigned int dct2nd_stage_rounding_mmx[2] =
+align 16
+dct2nd_stage_rounding_mmx:
+    times 2 dd 32768
+
+
+;static const short dct_matrix[4][4]=
+align 16
+dct_matrix:
+    times 4 dw 23170
+
+    dw  30274
+    dw  12540
+    dw -12540
+    dw -30274
+
+    dw 23170
+    times 2 dw -23170
+    dw 23170
+
+    dw  12540
+    dw -30274
+    dw  30274
+    dw -12540
+
+
+;static const unsigned short dct_const_mmx[4 * 4]=
+align 16
+dct_const_mmx:
+    times 4 dw 0
+    times 4 dw 60547
+    times 4 dw 46341
+    times 4 dw 25080
+
+
+;static const unsigned short dct_const_xmm[8 * 4]=
+align 16
+dct_const_xmm:
+    times 8 dw 0
+    times 8 dw 60547
+    times 8 dw 46341
+    times 8 dw 25080
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -11,68 +11,32 @@

 %include "vpx_ports/x86_abi_support.asm"

-%macro STACK_FRAME_CREATE 0
-%if ABI_IS_32BIT
-  %define       input       rsi
-  %define       output      rdi
-  %define       pitch       rax
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
    push        rbp
    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+;;    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
-    mov         rdi, arg(1)
+    movsxd      rax, DWORD PTR arg(2)
+    lea         rdi, [rsi + rax*2]

-    movsxd      rax, dword ptr arg(2)
-    lea         rcx, [rsi + rax*2]
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    %define     input       rcx
-    %define     output      rdx
-    %define     pitch       r8
-  %else
-    %define     input       rdi
-    %define     output      rsi
-    %define     pitch       rdx
-  %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY 0
-  %define     input
-  %define     output
-  %define     pitch
-
-%if ABI_IS_32BIT
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-  %endif
-%endif
-    ret
-%endmacro
-
-;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct4x4_sse2)
-sym(vp8_short_fdct4x4_sse2):
-
-    STACK_FRAME_CREATE
-
-    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
-    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
-    lea         input,          [input+2*pitch]
-    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
-    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
+    movq        xmm0, MMWORD PTR[rsi   ]        ;03 02 01 00
+    movq        xmm2, MMWORD PTR[rsi + rax]     ;13 12 11 10
+    movq        xmm1, MMWORD PTR[rsi + rax*2]   ;23 22 21 20
+    movq        xmm3, MMWORD PTR[rdi + rax]     ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

+    mov         rdi, arg(1)
+
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
@@ -87,7 +51,6 @@ sym(vp8_short_fdct4x4_sse2):
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
-
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
@@ -158,216 +121,17 @@ sym(vp8_short_fdct4x4_sse2):
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

-    movdqa      XMMWORD PTR[output +  0], xmm0
-    movdqa      XMMWORD PTR[output + 16], xmm1
+    movdqa      XMMWORD PTR[rdi + 0], xmm0
+    movdqa      XMMWORD PTR[rdi + 16], xmm1

-    STACK_FRAME_DESTROY
-
-;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct8x4_sse2)
-sym(vp8_short_fdct8x4_sse2):
-
-    STACK_FRAME_CREATE
-
-        ; read the input data
-        movdqa      xmm0,       [input        ]
-        movdqa      xmm2,       [input+  pitch]
-        lea         input,      [input+2*pitch]
-        movdqa      xmm4,       [input        ]
-        movdqa      xmm3,       [input+  pitch]
-
-        ; transpose for the first stage
-        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
-        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
-
-        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
-        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
-
-        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
-        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
-
-        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
-        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
-
-        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
-
-        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
-        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
-
-        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
-        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
-
-        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
-        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
-
-        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
-        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
-
-        punpckhqdq  xmm1,       xmm4        ; 01 11 21 32 05 15 25 35
-
-        ; xmm0 0
-        ; xmm1 1
-        ; xmm2 2
-        ; xmm3 3
-
-        ; first stage
-        movdqa      xmm5,       xmm0
-        movdqa      xmm4,       xmm1
-
-        paddw       xmm0,       xmm3        ; a1 = 0 + 3
-        paddw       xmm1,       xmm2        ; b1 = 1 + 2
-
-        psubw       xmm4,       xmm2        ; c1 = 1 - 2
-        psubw       xmm5,       xmm3        ; d1 = 0 - 3
-
-        psllw       xmm5,        3
-        psllw       xmm4,        3
-
-        psllw       xmm0,        3
-        psllw       xmm1,        3
-
-        ; output 0 and 2
-        movdqa      xmm2,       xmm0        ; a1
-
-        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
-        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movdqa      xmm1,       xmm5        ; d1
-        punpcklwd   xmm1,       xmm4        ; c1 d1
-        punpckhwd   xmm5,       xmm4        ; c1 d1
-
-        movdqa      xmm3,       xmm1
-        movdqa      xmm4,       xmm5
-
-        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
-        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
-        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
-        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
-
-        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-
-        packssdw    xmm1,       xmm4        ; op[1]
-        packssdw    xmm3,       xmm5        ; op[3]
-
-        ; done with vertical
-        ; transpose for the second stage
-        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
-        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
-
-        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
-        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
-
-        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
-        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
-
-        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
-        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
-
-        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
-
-        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
-        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
-
-        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
-        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
-
-        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
-        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
-
-        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
-        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
-
-        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
-
-        ; xmm0 0
-        ; xmm1 4
-        ; xmm2 1
-        ; xmm3 3
-
-        movdqa      xmm5,       xmm0
-        movdqa      xmm2,       xmm1
-
-        paddw       xmm0,       xmm3        ; a1 = 0 + 3
-        paddw       xmm1,       xmm4        ; b1 = 1 + 2
-
-        psubw       xmm4,       xmm2        ; c1 = 1 - 2
-        psubw       xmm5,       xmm3        ; d1 = 0 - 3
-
-        pxor        xmm6,       xmm6        ; zero out for compare
-
-        pcmpeqw     xmm6,       xmm5        ; d1 != 0
-
-        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
-                                                                    ; and keep bit 0 of lower
-
-        ; output 0 and 2
-        movdqa      xmm2,       xmm0        ; a1
-
-        paddw       xmm0,       xmm1        ; a1 + b1
-        psubw       xmm2,       xmm1        ; a1 - b1
-
-        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
-        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
-
-        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
-        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movdqa      xmm1,       xmm5        ; d1
-        punpcklwd   xmm1,       xmm4        ; c1 d1
-        punpckhwd   xmm5,       xmm4        ; c1 d1
-
-        movdqa      xmm3,       xmm1
-        movdqa      xmm4,       xmm5
-
-        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
-        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
-        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
-        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
-
-        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  14500)>>16
-        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  14500)>>16
-        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +   7500)>>16
-        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +   7500)>>16
-
-        packssdw    xmm1,       xmm4        ; op[4]
-        packssdw    xmm3,       xmm5        ; op[12]
-
-        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
-
-        movdqa      xmm4,       xmm0
-        movdqa      xmm5,       xmm2
-
-        punpcklqdq  xmm0,       xmm1
-        punpckhqdq  xmm4,       xmm1
-
-        punpcklqdq  xmm2,       xmm3
-        punpckhqdq  xmm5,       xmm3
-
-        movdqa      XMMWORD PTR[output + 0 ],  xmm0
-        movdqa      XMMWORD PTR[output + 16],  xmm2
-        movdqa      XMMWORD PTR[output + 32],  xmm4
-        movdqa      XMMWORD PTR[output + 48],  xmm5
-
-    STACK_FRAME_DESTROY
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+;;    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret

 SECTION_RODATA
 align 16
@@ -397,9 +161,7 @@ align 16
 _cmp_mask:
    times 4 dw 1
    times 4 dw 0
-align 16
-_cmp_mask8x4:
-    times 8 dw 1
+
 align 16
 _mult_sub:
    dw 1
@@ -414,9 +176,6 @@ align 16
 _7:
    times 4 dd 7
 align 16
-_7w:
-    times 8 dw 7
-align 16
 _14500:
    times 4 dd 14500
 align 16
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -24,31 +24,33 @@ extern prototype_fdct(vp8_short_fdct4x4_mmx);
 extern prototype_fdct(vp8_short_fdct8x4_mmx);

 #if !CONFIG_RUNTIME_CPU_DETECT
-
+#if 0
 #undef  vp8_fdct_short4x4
 #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx

 #undef  vp8_fdct_short8x4
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
-
 #endif

 #endif
+#endif


 #if HAVE_SSE2
-extern prototype_fdct(vp8_short_fdct8x4_sse2);
+extern prototype_fdct(vp8_short_fdct8x4_wmt);
 extern prototype_fdct(vp8_short_walsh4x4_sse2);

 extern prototype_fdct(vp8_short_fdct4x4_sse2);

 #if !CONFIG_RUNTIME_CPU_DETECT
-
+#if 1
+/* short SSE2 DCT is enabled by the "#if 1" above; note it does not match the MMX version */
 #undef  vp8_fdct_short4x4
 #define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2

 #undef  vp8_fdct_short8x4
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
+#endif

 #undef  vp8_fdct_fast4x4
 #define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
@@ -56,7 +58,7 @@ extern prototype_fdct(vp8_short_fdct4x4_sse2);
 #undef  vp8_fdct_fast8x4
 #define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2

-#undef  vp8_fdct_walsh_short4x4
+#undef vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4  vp8_short_walsh4x4_sse2

 #endif
--- a/vp8/encoder/x86/mcomp_x86.h
+++ b/vp8/encoder/x86/mcomp_x86.h
@@ -24,14 +24,5 @@
 #endif
 #endif

-#if HAVE_SSE4_1
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp8_search_full_search
-#define vp8_search_full_search vp8_full_search_sadx8
-
-#endif
-#endif
-
 #endif

--- a/vp8/encoder/x86/preproc_mmx.c
+++ b/vp8/encoder/x86/preproc_mmx.c
@@ -0,0 +1,298 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "memory.h"
+#include "preproc.h"
+#include "pragmas.h"
+
+/****************************************************************************
+*  Macros
+****************************************************************************/
+#define FRAMECOUNT 7
+#define ROUNDUP32(X) ( ( ( (unsigned long) (X) ) + 31 )&( ~31UL ) ) /* round X up to a multiple of 32; argument parenthesized, mask as wide as unsigned long */
+
+/****************************************************************************
+*  Imports
+****************************************************************************/
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+/****************************************************************************
+*  Exported Global Variables
+****************************************************************************/
+void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : temp_filter_wmt
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  unsigned char *s     : Pointer to source frame.
+ *                  unsigned char *d     : Pointer to destination frame.
+ *                  int bytes            : Number of bytes to filter (handled 8 at a time).
+ *                  int strength         : Right-shift applied to squared pixel differences.
+ *
+ *  OUTPUTS       : Filtered pixels in d; ppi->frame_buffer and ppi->frame are updated.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur over FRAMECOUNT frames.
+ *                  Frame 0 only primes the frame buffer and copies s to d unchanged.
+ *  SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_wmt
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+)
+{
+    int byte = 0;
+    unsigned char *frameptr = ppi->frame_buffer;
+
+    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3, 3, 3, 3, 3};
+    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
+
+    if (ppi->frame == 0)
+    {
+        do
+        {
+            int i;
+            int frame = 0;
+
+            do
+            {
+                for (i = 0; i < 8; i++)
+                {
+                    *frameptr = s[byte+i];  // fill every frame slot with the same 8 source pixels
+                    ++frameptr;
+                }
+
+                ++frame;
+            }
+            while (frame < FRAMECOUNT);
+
+            for (i = 0; i < 8; i++)
+                d[byte+i] = s[byte+i];      // first frame is passed through unfiltered
+
+            byte += 8;
+
+        }
+        while (byte < bytes);               // do/while: at least one group of 8 is always processed
+    }
+    else
+    {
+        int i;
+        int offset2 = (ppi->frame % FRAMECOUNT);   // frame slot overwritten by this call
+
+        do
+        {
+            __declspec(align(16)) unsigned short counts[8];
+            __declspec(align(16)) unsigned short sums[8];
+            __asm
+            {
+                mov         eax, offset2            // frame slot index
+                mov         edi, s                  // source pixels
+                pxor        xmm1, xmm1              // weighted-sum accumulator
+
+                pxor        xmm7, xmm7              // zero, used to widen bytes to words
+
+                mov         esi, frameptr           // frame buffer for this group of 8 pixels
+                pxor        xmm2, xmm2              // weight-total accumulator (count)
+
+                movq        xmm3, QWORD PTR [edi]   // load 8 source pixels
+
+                movq        QWORD PTR [esi+8*eax], xmm3 // store them into slot offset2 of the frame buffer
+
+                punpcklbw   xmm3, xmm2              // xmm3 source pixels as 16-bit words
+                mov         ecx,  FRAMECOUNT        // loop counter: frames remaining
+
+                next_frame:
+                movq        xmm4, QWORD PTR [esi]   // get frame buffer values
+                punpcklbw   xmm4, xmm7              // xmm4 frame buffer pixels
+                movdqa      xmm6, xmm4              // save the pixel values
+                psubsw      xmm4, xmm3              // frame pixel - source pixel
+                pmullw      xmm4, xmm4              // square the difference
+                movd        xmm5, strength
+                psrlw       xmm4, xmm5              // modifier = squared difference >> strength
+                pmullw      xmm4, threes            // 3 * modifier
+                movdqa      xmm5, sixteens          // 16s
+                psubusw     xmm5, xmm4              // weight = 16 - 3 * modifier, saturating at 0
+                movdqa      xmm4, xmm5              // save the weights
+                pmullw      xmm4, xmm6              // weight * frame buffer pixel
+                paddusw     xmm1, xmm4              // accumulate weighted pixels
+                paddusw     xmm2, xmm5              // accumulate weights (count)
+                add         esi, 8                  // next frame
+                dec         ecx                     // one fewer frame left to process
+                jnz         next_frame
+
+                movdqa      counts, xmm2            // weight totals out to C
+                psrlw       xmm2, 1                 // divide count by 2 for rounding
+                paddusw     xmm1, xmm2              // rounding added in
+
+                mov         frameptr, esi           // now points at the next pixel group's frames
+
+                movdqa      sums, xmm1              // rounded weighted sums out to C
+            }
+
+            for (i = 0; i < 8; i++)
+            {
+                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];  // presumably fixed_divide[c] ~ 65536 / c, so this is sums / counts -- TODO confirm
+                blurvalue >>= 16;
+                d[i] = blurvalue;
+            }
+
+            s += 8;
+            d += 8;
+            byte += 8;
+        }
+        while (byte < bytes);
+    }
+
+    ++ppi->frame;
+    __asm emms
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : temp_filter_mmx
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  unsigned char *s     : Pointer to source frame.
+ *                  unsigned char *d     : Pointer to destination frame.
+ *                  int bytes            : Number of bytes to filter (handled 4 at a time).
+ *                  int strength         : Right-shift applied to squared pixel differences.
+ *
+ *  OUTPUTS       : Filtered pixels in d; ppi->frame_buffer and ppi->frame are updated.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur over FRAMECOUNT frames.
+ *                  Frame 0 only primes the frame buffer and copies s to d unchanged.
+ *  SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_mmx
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+)
+{
+    int byte = 0;
+    unsigned char *frameptr = ppi->frame_buffer;
+
+    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3};
+    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
+
+    if (ppi->frame == 0)
+    {
+        do
+        {
+            int i;
+            int frame = 0;
+
+            do
+            {
+                for (i = 0; i < 4; i++)
+                {
+                    *frameptr = s[byte+i];  // fill every frame slot with the same 4 source pixels
+                    ++frameptr;
+                }
+
+                ++frame;
+            }
+            while (frame < FRAMECOUNT);
+
+            for (i = 0; i < 4; i++)
+                d[byte+i] = s[byte+i];      // first frame is passed through unfiltered
+
+            byte += 4;
+
+        }
+        while (byte < bytes);               // do/while: at least one group of 4 is always processed
+    }
+    else
+    {
+        int i;
+        int offset2 = (ppi->frame % FRAMECOUNT);   // frame slot overwritten by this call
+
+        do
+        {
+            __declspec(align(16)) unsigned short counts[8];   // only elements 0..3 are written and read here
+            __declspec(align(16)) unsigned short sums[8];     // only elements 0..3 are written and read here
+            __asm
+            {
+
+                mov         eax, offset2            // frame slot index
+                mov         edi, s                  // source pixels
+                pxor        mm1, mm1                // weighted-sum accumulator
+                pxor        mm7, mm7                // zero, used to widen bytes to words
+
+                mov         esi, frameptr           // frame buffer for this group of 4 pixels
+                pxor        mm2, mm2                // weight-total accumulator (count)
+
+                movd        mm3, DWORD PTR [edi]    // load 4 source pixels
+                movd        DWORD PTR [esi+4*eax], mm3  // store them into slot offset2 of the frame buffer
+
+                punpcklbw   mm3, mm2                // mm3 source pixels as 16-bit words
+                mov         ecx,  FRAMECOUNT        // loop counter: frames remaining
+
+                next_frame:
+                movd        mm4, DWORD PTR [esi]    // get frame buffer values
+                punpcklbw   mm4, mm7                // mm4 frame buffer pixels
+                movq        mm6, mm4                // save the pixel values
+                psubsw      mm4, mm3                // frame pixel - source pixel
+                pmullw      mm4, mm4                // square the difference
+                movd        mm5, strength
+                psrlw       mm4, mm5                // modifier = squared difference >> strength
+                pmullw      mm4, threes             // 3 * modifier
+                movq        mm5, sixteens           // 16s
+                psubusw     mm5, mm4                // weight = 16 - 3 * modifier, saturating at 0
+                movq        mm4, mm5                // save the weights
+                pmullw      mm4, mm6                // weight * frame buffer pixel
+                paddusw     mm1, mm4                // accumulate weighted pixels
+                paddusw     mm2, mm5                // accumulate weights (count)
+                add         esi, 4                  // next frame
+                dec         ecx                     // one fewer frame left to process
+                jnz         next_frame
+
+                movq        counts, mm2             // weight totals out to C
+                psrlw       mm2, 1                  // divide count by 2 for rounding
+                paddusw     mm1, mm2                // rounding added in
+
+                mov         frameptr, esi           // now points at the next pixel group's frames
+
+                movq        sums, mm1               // rounded weighted sums out to C
+
+            }
+
+            for (i = 0; i < 4; i++)
+            {
+                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];  // presumably fixed_divide[c] ~ 65536 / c, so this is sums / counts -- TODO confirm
+                blurvalue >>= 16;
+                d[i] = blurvalue;
+            }
+
+            s += 4;
+            d += 4;
+            byte += 4;
+        }
+        while (byte < bytes);
+    }
+
+    ++ppi->frame;
+    __asm emms
+}
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -253,9 +253,10 @@ rq_zigzag_1c:
    pop         rbp
    ret

+
 ;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
 ;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *inv_scan_order, short *round_ptr,
+;                           short *scan_mask, short *round_ptr,
 ;                           short *quant_ptr, short *dqcoeff_ptr);
 global sym(vp8_fast_quantize_b_impl_sse2)
 sym(vp8_fast_quantize_b_impl_sse2):
@@ -264,18 +265,32 @@ sym(vp8_fast_quantize_b_impl_sse2):
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
+    push        rbx
    ; end prolog

+    ALIGN_STACK 16, rax
+
+    %define save_xmm6  0
+    %define save_xmm7 16
+
+    %define vp8_fastquantizeb_stack_size save_xmm7 + 16
+
+    sub         rsp, vp8_fastquantizeb_stack_size
+
+    movdqa      XMMWORD PTR[rsp + save_xmm6], xmm6
+    movdqa      XMMWORD PTR[rsp + save_xmm7], xmm7
+
    mov         rdx, arg(0)                 ;coeff_ptr
    mov         rcx, arg(2)                 ;dequant_ptr
+    mov         rax, arg(3)                 ;scan_mask
    mov         rdi, arg(4)                 ;round_ptr
    mov         rsi, arg(5)                 ;quant_ptr

    movdqa      xmm0, XMMWORD PTR[rdx]
    movdqa      xmm4, XMMWORD PTR[rdx + 16]

-    movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo
-    movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi
+    movdqa      xmm6, XMMWORD PTR[rdi]      ;round lo
+    movdqa      xmm7, XMMWORD PTR[rdi + 16] ;round hi

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4
@@ -288,8 +303,8 @@ sym(vp8_fast_quantize_b_impl_sse2):
    psubw       xmm1, xmm0                  ;x = abs(z)
    psubw       xmm5, xmm4                  ;x = abs(z)

-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
+    paddw       xmm1, xmm6
+    paddw       xmm5, xmm7

    pmulhw      xmm1, XMMWORD PTR[rsi]
    pmulhw      xmm5, XMMWORD PTR[rsi + 16]
@@ -297,8 +312,8 @@ sym(vp8_fast_quantize_b_impl_sse2):
    mov         rdi, arg(1)                 ;qcoeff_ptr
    mov         rsi, arg(6)                 ;dqcoeff_ptr

-    movdqa      xmm2, XMMWORD PTR[rcx]
-    movdqa      xmm3, XMMWORD PTR[rcx + 16]
+    movdqa      xmm6, XMMWORD PTR[rcx]
+    movdqa      xmm7, XMMWORD PTR[rcx + 16]

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
@@ -308,47 +323,64 @@ sym(vp8_fast_quantize_b_impl_sse2):
    movdqa      XMMWORD PTR[rdi], xmm1
    movdqa      XMMWORD PTR[rdi + 16], xmm5

-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
+    pmullw      xmm6, xmm1
+    pmullw      xmm7, xmm5

-    mov         rdi, arg(3)                 ;inv_scan_order
+    movdqa      xmm2, XMMWORD PTR[rax]
+    movdqa      xmm3, XMMWORD PTR[rax+16];

-    ; Start with 16
-    pxor        xmm4, xmm4                  ;clear all bits
+    pxor        xmm4, xmm4            ;clear all bits
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm4

-    pcmpeqw     xmm4, xmm4                  ;set all bits
+    pcmpeqw     xmm4, xmm4            ;set all bits
    pxor        xmm1, xmm4
    pxor        xmm5, xmm4

-    pand        xmm1, XMMWORD PTR[rdi]
-    pand        xmm5, XMMWORD PTR[rdi+16]
+    psrlw       xmm1, 15
+    psrlw       xmm5, 15

-    pmaxsw      xmm1, xmm5
+    pmaddwd     xmm1, xmm2
+    pmaddwd     xmm5, xmm3

-    ; now down to 8
-    pshufd      xmm5, xmm1, 00001110b
+    movq        xmm2, xmm1
+    movq        xmm3, xmm5

-    pmaxsw      xmm1, xmm5
+    psrldq      xmm1, 8
+    psrldq      xmm5, 8

-    ; only 4 left
-    pshuflw     xmm5, xmm1, 00001110b
+    paddd       xmm1, xmm5
+    paddd       xmm2, xmm3

-    pmaxsw      xmm1, xmm5
+    paddd       xmm1, xmm2
+    movq        xmm5, xmm1

-    ; okay, just 2!
-    pshuflw     xmm5, xmm1, 00000001b
+    psrldq      xmm1, 4
+    paddd       xmm5, xmm1

-    pmaxsw      xmm1, xmm5
+    movq        rcx,  xmm5
+    and         rcx,  0xffff

-    movd        rax, xmm1
-    and         rax, 0xff
+    xor         rdx,  rdx
+    sub         rdx,  rcx

-    movdqa      XMMWORD PTR[rsi], xmm2        ;store dqcoeff
-    movdqa      XMMWORD PTR[rsi + 16], xmm3   ;store dqcoeff
+    bsr         rax,  rcx
+    inc         rax
+
+    sar         rdx,  31
+    and         rax,  rdx
+
+    movdqa      XMMWORD PTR[rsi], xmm6        ;store dqcoeff
+    movdqa      XMMWORD PTR[rsi + 16], xmm7   ;store dqcoeff
+
+    movdqa      xmm6, XMMWORD PTR[rsp + save_xmm6]
+    movdqa      xmm7, XMMWORD PTR[rsp + save_xmm7]
+
+    add         rsp, vp8_fastquantizeb_stack_size
+    pop         rsp

    ; begin epilog
+    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -1,114 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr
-;               short *qcoeff_ptr,short *dequant_ptr,
-;               short *round_ptr,
-;               short *quant_ptr, short *dqcoeff_ptr);
-;
-global sym(vp8_fast_quantize_b_impl_ssse3)
-sym(vp8_fast_quantize_b_impl_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rdx, arg(0)                 ;coeff_ptr
-    mov         rdi, arg(3)                 ;round_ptr
-    mov         rsi, arg(4)                 ;quant_ptr
-
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-
-    movdqa      xmm2, [rdi]                 ;round lo
-    movdqa      xmm3, [rdi + 16]            ;round hi
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    psraw       xmm0, 15                    ;sign of z (aka sz)
-    psraw       xmm4, 15                    ;sign of z (aka sz)
-
-    pabsw       xmm1, xmm1
-    pabsw       xmm5, xmm5
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    pmulhw      xmm1, [rsi]
-    pmulhw      xmm5, [rsi + 16]
-
-    mov         rdi, arg(1)                 ;qcoeff_ptr
-    mov         rcx, arg(2)                 ;dequant_ptr
-    mov         rsi, arg(5)                 ;dqcoeff_ptr
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      [rdi], xmm1
-    movdqa      [rdi + 16], xmm5
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-
-    pxor        xmm4, xmm4
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
-
-    pcmpeqw     xmm1, xmm4                  ;non zero mask
-    pcmpeqw     xmm5, xmm4                  ;non zero mask
-    packsswb    xmm1, xmm5
-    pshufb      xmm1, [ GLOBAL(zz_shuf)]
-
-    pmovmskb    edx, xmm1
-
-;    xor         ecx, ecx
-;    mov         eax, -1
-;find_eob_loop:
-;    shr         edx, 1
-;    jc          fq_skip
-;    mov         eax, ecx
-;fq_skip:
-;    inc         ecx
-;    cmp         ecx, 16
-;    jne         find_eob_loop
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax                      ;flip the bits for bsr
-    bsr         eax, edx
-
-    movdqa      [rsi], xmm2                 ;store dqcoeff
-    movdqa      [rsi + 16], xmm3            ;store dqcoeff
-
-    sub         edi, edx                    ;check for all zeros in bit mask
-    sar         edi, 31                     ;0 or -1
-    add         eax, 1
-    and         eax, edi                    ;if the bit mask was all zero,
-                                            ;then eob = 0
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
--- a/vp8/encoder/x86/sad_sse4.asm
+++ b/vp8/encoder/x86/sad_sse4.asm
@@ -1,353 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm1,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm1,       xmm2
-        paddw           xmm1,       xmm3
-        paddw           xmm1,       xmm4
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm1,       xmm2
-%else
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endif
-        movq            xmm0,       MMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
-        movd            xmm0,       [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        mpsadbw         xmm1,       xmm0,  0x0
-%else
-        movd            xmm0,       [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endif
-        movd            xmm0,       [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-
-;void vp8_sad16x16x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array);
-global sym(vp8_sad16x16x8_sse4)
-sym(vp8_sad16x16x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_sad16x8x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp8_sad16x8x8_sse4)
-sym(vp8_sad16x8x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_sad8x8x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp8_sad8x8x8_sse4)
-sym(vp8_sad8x8x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_sad8x16x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp8_sad8x16x8_sse4)
-sym(vp8_sad8x16x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_sad4x4x8_c(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp8_sad4x4x8_sse4)
-sym(vp8_sad4x4x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_4X2X8 1
-        PROCESS_4X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -297,31 +297,4 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
 #endif
 #endif

-
-#if HAVE_SSE4_1
-extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4);
-extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4);
-extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4);
-extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4);
-extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_variance_sad16x16x8
-#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4
-
-#undef  vp8_variance_sad16x8x8
-#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4
-
-#undef  vp8_variance_sad8x16x8
-#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4
-
-#undef  vp8_variance_sad8x8x8
-#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4
-
-#undef  vp8_variance_sad4x4x8
-#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4
-
-#endif
-#endif
-
 #endif
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -18,10 +18,11 @@
 #if HAVE_MMX
 void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
 {
-    vp8_short_fdct4x4_mmx(input,   output,    pitch);
-    vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+    vp8_short_fdct4x4_c(input,   output,    pitch);
+    vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
 }

+
 int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
                                 short *scan_mask, short *round_ptr,
@@ -32,7 +33,7 @@ void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
    short *coeff_ptr   = b->coeff;
    short *zbin_ptr    = b->zbin;
    short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant_fast;
+    short *quant_ptr   = b->quant;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;
@@ -81,16 +82,22 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
 #endif

 #if HAVE_SSE2
+void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
+{
+    vp8_short_fdct4x4_sse2(input,   output,    pitch);
+    vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
+}
+
 int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
-                                 const short *inv_scan_order, short *round_ptr,
+                                 short *scan_mask, short *round_ptr,
                                 short *quant_ptr, short *dqcoeff_ptr);
 void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
    short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
    short *coeff_ptr   = b->coeff;
    short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant_fast;
+    short *quant_ptr   = b->quant;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;
@@ -99,7 +106,8 @@ void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
                 coeff_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
-                 vp8_default_inv_zig_zag,
+                 scan_mask,
+
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
@@ -171,25 +179,6 @@ void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)

 #endif

-#if HAVE_SSSE3
-int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
-                                 short *qcoeff_ptr, short *dequant_ptr,
-                                 short *round_ptr,
-                                 short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
-{
-    d->eob = vp8_fast_quantize_b_impl_ssse3(
-                    b->coeff,
-                    d->qcoeff,
-                    d->dequant,
-                    b->round,
-                    b->quant_fast,
-                    d->dqcoeff
-               );
-}
-#endif
-
-
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -199,7 +188,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
    int wmt_enabled = flags & HAS_SSE2;
    int SSE3Enabled = flags & HAS_SSE3;
    int SSSE3Enabled = flags & HAS_SSSE3;
-    int SSE4_1Enabled = flags & HAS_SSE4_1;

    /* Note:
     *
@@ -210,6 +198,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)

    /* Override default functions with fastest ones for this CPU. */
 #if HAVE_MMX
+
    if (mmx_enabled)
    {
        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_mmx;
@@ -241,11 +230,18 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_mmx;
        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_mmx;
        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_mmx;
-
+#if 0 // new fdct
        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_mmx;
        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_mmx;
+#else
+        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
+        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_c;
+        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_c;
+
+#endif

        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;

@@ -258,9 +254,10 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)

        /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_mmx;*/
    }
-#endif

+#endif
 #if HAVE_SSE2
+
    if (wmt_enabled)
    {
        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_wmt;
@@ -310,9 +307,10 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
        /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;
    }
-#endif

+#endif
 #if HAVE_SSE3
+
    if (SSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_sse3;
@@ -330,30 +328,16 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
        cpi->rtcd.variance.sad4x4x4d             = vp8_sad4x4x4d_sse3;
        cpi->rtcd.search.diamond_search          = vp8_diamond_search_sadx4;
    }
-#endif

+#endif
 #if HAVE_SSSE3
+
    if (SSSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_ssse3;
        cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_ssse3;
-
-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;
-
    }
-#endif

-#if HAVE_SSE4_1
-    if (SSE4_1Enabled)
-    {
-        cpi->rtcd.variance.sad16x16x8            = vp8_sad16x16x8_sse4;
-        cpi->rtcd.variance.sad16x8x8             = vp8_sad16x8x8_sse4;
-        cpi->rtcd.variance.sad8x16x8             = vp8_sad8x16x8_sse4;
-        cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_sse4;
-        cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_sse4;
-        cpi->rtcd.search.full_search             = vp8_full_search_sadx8;
-    }
 #endif
-
 #endif
 }
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -17,6 +17,7 @@ VP8_COMMON_SRCS-yes += common/type_aliases.h
 VP8_COMMON_SRCS-yes += common/pragmas.h

 CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)common
+VP8_COMMON_SRCS-yes += common/preproc.h
 VP8_COMMON_SRCS-yes += common/vpxerrors.h

 CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)common
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -37,7 +37,6 @@ struct vp8_extracfg
    unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */
    unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
    unsigned int                arnr_type;        /* alt_ref filter type */
-    vp8e_tuning                 tuning;

 };

@@ -68,7 +67,6 @@ static const struct extraconfig_map extracfg_map[] =
            0,                          /* arnr_max_frames */
            3,                          /* arnr_strength */
            3,                          /* arnr_type*/
-            0,                          /* tuning*/
        }
    }
 };
@@ -106,7 +104,6 @@ update_error_state(vpx_codec_alg_priv_t                 *ctx,
 }


-#undef ERROR
 #define ERROR(str) do {\
        ctx->base.err_detail = str;\
        return VPX_CODEC_INVALID_PARAM;\
@@ -135,8 +132,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
                                       const vpx_codec_enc_cfg_t *cfg,
                                       const struct vp8_extracfg *vp8_cfg)
 {
-    RANGE_CHECK(cfg, g_w,                   1, 16384);
-    RANGE_CHECK(cfg, g_h,                   1, 16384);
+    RANGE_CHECK(cfg, g_w,                   2, 16384);
+    RANGE_CHECK(cfg, g_h,                   2, 16384);
    RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
    RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
    RANGE_CHECK_HI(cfg, g_profile,          3);
@@ -338,7 +335,6 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
    oxcf->arnr_strength =  vp8_cfg.arnr_strength;
    oxcf->arnr_type =      vp8_cfg.arnr_type;

-    oxcf->tuning = vp8_cfg.tuning;

    /*
        printf("Current VP8 Settings: \n");
@@ -452,7 +448,6 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
        MAP(VP8E_SET_ARNR_MAXFRAMES,        xcfg.arnr_max_frames);
        MAP(VP8E_SET_ARNR_STRENGTH ,        xcfg.arnr_strength);
        MAP(VP8E_SET_ARNR_TYPE     ,        xcfg.arnr_type);
-        MAP(VP8E_SET_TUNING,                xcfg.tuning);

    }

@@ -865,16 +860,8 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
 {

    YV12_BUFFER_CONFIG sd;
-    vp8_ppflags_t flags = {0};

-    if (ctx->preview_ppcfg.post_proc_flag)
-    {
-        flags.post_proc_flag        = ctx->preview_ppcfg.post_proc_flag;
-        flags.deblocking_level      = ctx->preview_ppcfg.deblocking_level;
-        flags.noise_level           = ctx->preview_ppcfg.noise_level;
-    }
-
-    if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, &flags))
+    if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, ctx->preview_ppcfg.deblocking_level, ctx->preview_ppcfg.noise_level, ctx->preview_ppcfg.post_proc_flag))
    {

        /*
@@ -1033,7 +1020,6 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
    {VP8E_SET_ARNR_MAXFRAMES,           set_param},
    {VP8E_SET_ARNR_STRENGTH ,           set_param},
    {VP8E_SET_ARNR_TYPE     ,           set_param},
-    {VP8E_SET_TUNING,                   set_param},
    { -1, NULL},
 };

--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -65,19 +65,12 @@ struct vpx_codec_alg_priv
    vpx_codec_priv_t        base;
    vpx_codec_mmap_t        mmaps[NELEMENTS(vp8_mem_req_segs)-1];
    vpx_codec_dec_cfg_t     cfg;
-    vp8_stream_info_t       si;
+    vp8_stream_info_t   si;
    int                     defer_alloc;
    int                     decoder_init;
    VP8D_PTR                pbi;
    int                     postproc_cfg_set;
    vp8_postproc_cfg_t      postproc_cfg;
-#if CONFIG_POSTPROC_VISUALIZER
-    unsigned int            dbg_postproc_flag;
-    int                     dbg_color_ref_frame_flag;
-    int                     dbg_color_mb_modes_flag;
-    int                     dbg_color_b_modes_flag;
-    int                     dbg_display_mv_flag;
-#endif
    vpx_image_t             img;
    int                     img_setup;
    int                     img_avail;
@@ -423,27 +416,15 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
    {
        YV12_BUFFER_CONFIG sd;
        INT64 time_stamp = 0, time_end_stamp = 0;
-        vp8_ppflags_t flags = {0};
+        int ppflag       = 0;
+        int ppdeblocking = 0;
+        int ppnoise      = 0;

        if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
        {
-            flags.post_proc_flag= ctx->postproc_cfg.post_proc_flag
-#if CONFIG_POSTPROC_VISUALIZER
-
-                                | ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0)
-                                | ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
-                                | ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
-                                | ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0)
-#endif
-                                ;
-            flags.deblocking_level      = ctx->postproc_cfg.deblocking_level;
-            flags.noise_level           = ctx->postproc_cfg.noise_level;
-#if CONFIG_POSTPROC_VISUALIZER
-            flags.display_ref_frame_flag= ctx->dbg_color_ref_frame_flag;
-            flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
-            flags.display_b_modes_flag  = ctx->dbg_color_b_modes_flag;
-            flags.display_mv_flag       = ctx->dbg_display_mv_flag;
-#endif
+            ppflag      = ctx->postproc_cfg.post_proc_flag;
+            ppdeblocking = ctx->postproc_cfg.deblocking_level;
+            ppnoise     = ctx->postproc_cfg.noise_level;
        }

        if (vp8dx_receive_compressed_data(ctx->pbi, data_sz, data, deadline))
@@ -452,7 +433,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
            res = update_error_state(ctx, &pbi->common.error);
        }

-        if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags))
+        if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, ppdeblocking, ppnoise, ppflag))
        {
            /* Align width/height */
            unsigned int a_w = (sd.y_width + 15) & ~15;
@@ -466,7 +447,6 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
            vpx_img_set_rect(&ctx->img,
                             VP8BORDERINPIXELS, VP8BORDERINPIXELS,
                             sd.y_width, sd.y_height);
-            ctx->img.user_priv = user_priv;
            ctx->img_avail = 1;

        }
@@ -666,59 +646,12 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
 #endif
 }

-static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
-                                        int ctrl_id,
-                                        va_list args)
-{
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
-    int data = va_arg(args, int);
-
-#define MAP(id, var) case id: var = data; break;
-
-    switch (ctrl_id)
-    {
-        MAP (VP8_SET_DBG_COLOR_REF_FRAME,   ctx->dbg_color_ref_frame_flag);
-        MAP (VP8_SET_DBG_COLOR_MB_MODES,    ctx->dbg_color_mb_modes_flag);
-        MAP (VP8_SET_DBG_COLOR_B_MODES,     ctx->dbg_color_b_modes_flag);
-        MAP (VP8_SET_DBG_DISPLAY_MV,        ctx->dbg_display_mv_flag);
-    }
-
-    return VPX_CODEC_OK;
-#else
-    return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
-                                                int ctrl_id,
-                                                va_list args)
-{
-    int *update_info = va_arg(args, int *);
-    VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
-
-    if (update_info)
-    {
-        *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
-            + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
-            + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
-
-        return VPX_CODEC_OK;
-    }
-    else
-        return VPX_CODEC_INVALID_PARAM;
-}
-

 vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
 {
-    {VP8_SET_REFERENCE,             vp8_set_reference},
-    {VP8_COPY_REFERENCE,            vp8_get_reference},
-    {VP8_SET_POSTPROC,              vp8_set_postproc},
-    {VP8_SET_DBG_COLOR_REF_FRAME,   vp8_set_dbg_options},
-    {VP8_SET_DBG_COLOR_MB_MODES,    vp8_set_dbg_options},
-    {VP8_SET_DBG_COLOR_B_MODES,     vp8_set_dbg_options},
-    {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_options},
-    {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
+    {VP8_SET_REFERENCE,  vp8_set_reference},
+    {VP8_COPY_REFERENCE, vp8_get_reference},
+    {VP8_SET_POSTPROC,   vp8_set_postproc},
    { -1, NULL},
 };

--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -109,8 +109,6 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm

--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -38,13 +38,9 @@
 */
 enum vp8_dec_control_id
 {
-    VP8_SET_REFERENCE           = 1,    /**< pass in an external frame into decoder to be used as reference frame */
-    VP8_COPY_REFERENCE          = 2,    /**< get a copy of reference frame from the decoder */
-    VP8_SET_POSTPROC            = 3,    /**< set the decoder's post processing settings  */
-    VP8_SET_DBG_COLOR_REF_FRAME = 4,    /**< set the reference frames to color for each macroblock */
-    VP8_SET_DBG_COLOR_MB_MODES  = 5,    /**< set which macro block modes to color */
-    VP8_SET_DBG_COLOR_B_MODES   = 6,    /**< set which blocks modes to color */
-    VP8_SET_DBG_DISPLAY_MV      = 7,    /**< set which motion vector modes to draw */
+    VP8_SET_REFERENCE       = 1,    /**< pass in an external frame into decoder to be used as reference frame */
+    VP8_COPY_REFERENCE      = 2,    /**< get a copy of reference frame from the decoder */
+    VP8_SET_POSTPROC        = 3,    /**< set decoder's the post processing settings  */
    VP8_COMMON_CTRL_ID_MAX
 };

@@ -54,14 +50,10 @@ enum vp8_dec_control_id
 */
 enum vp8_postproc_level
 {
-    VP8_NOFILTERING             = 0,
-    VP8_DEBLOCK                 = 1<<0,
-    VP8_DEMACROBLOCK            = 1<<1,
-    VP8_ADDNOISE                = 1<<2,
-    VP8_DEBUG_TXT_FRAME_INFO    = 1<<3, /**< print frame information */
-    VP8_DEBUG_TXT_MBLK_MODES    = 1<<4, /**< print macro block modes over each macro block */
-    VP8_DEBUG_TXT_DC_DIFF       = 1<<5, /**< print dc diff for each macro block */
-    VP8_DEBUG_TXT_RATE_INFO     = 1<<6, /**< print video rate info (encoder only) */
+    VP8_NOFILTERING    = 0,
+    VP8_DEBLOCK        = 1,
+    VP8_DEMACROBLOCK   = 2,
+    VP8_ADDNOISE       = 4
 };

 /*!\brief post process flags
@@ -73,9 +65,9 @@ enum vp8_postproc_level

 typedef struct vp8_postproc_cfg
 {
-    int post_proc_flag;         /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */
-    int deblocking_level;       /**< the strength of deblocking, valid range [0, 16] */
-    int noise_level;            /**< the strength of additive noise, valid range [0, 16] */
+    int post_proc_flag;           /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */
+    int deblocking_level;        /**< the strength of deblocking, valid range [0, 16] */
+    int noise_level;             /**< the strength of additive noise, valid range [0, 16] */
 } vp8_postproc_cfg_t;

 /*!\brief reference frame type
@@ -103,16 +95,12 @@ typedef struct vpx_ref_frame

 /*!\brief vp8 decoder control funciton parameter type
 *
- * defines the data type for each of VP8 decoder control function requires
+ * defines the data type for each of VP8 decoder control funciton requires
 */

 VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE,           vpx_ref_frame_t *)
 VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE,          vpx_ref_frame_t *)
 VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC,            vp8_postproc_cfg_t *)
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int)
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES,  int)
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES,   int)
-VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV,      int)


 /*! @} - end defgroup vp8 */
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -140,8 +140,7 @@ enum vp8e_enc_control_id
    VP8E_SET_ARNR_MAXFRAMES,         /**< control function to set the max number of frames blurred creating arf*/
    VP8E_SET_ARNR_STRENGTH ,         /**< control function to set the filter strength for the arf */
    VP8E_SET_ARNR_TYPE     ,         /**< control function to set the type of filter to use for the arf*/
-    VP8E_SET_TUNING,                 /**< control function to set visual tuning */
-};
+} ;

 /*!\brief vpx 1-D scaling mode
 *
@@ -225,18 +224,6 @@ typedef enum
 } vp8e_token_partitions;


-/*!\brief VP8 model tuning parameters
- *
- * Changes the encoder to tune for certain types of input material.
- *
- */
-typedef enum
-{
-    VP8_TUNE_PSNR,
-    VP8_TUNE_SSIM
-} vp8e_tuning;
-
-
 /*!\brief VP8 encoder control function parameter type
 *
 * Defines the data types that VP8E control functions take. Note that
@@ -266,7 +253,7 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS,   vp8e_token_partitions)
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES,     unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH ,     unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_TYPE     ,     unsigned int)
-VPX_CTRL_USE_TYPE(VP8E_SET_TUNING,             vp8e_tuning)
+

 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER,     int *)
 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64,  int *)
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -36,30 +36,6 @@ extern vpx_codec_iface_t* vpx_codec_vp8_dx(void);
 #include "vp8.h"


-/*!\brief VP8 decoder control functions
- *
- * The set of macros define the control functions of VP8 decoder interface
- */
-enum vp8d_dec_control_id
-{
-    VP8_DECODER_CTRL_ID_START   = 256,
-    VP8D_GET_LAST_REF_UPDATES,              /**< control function to get info on which reference frames were updated
-                                            by the last decode */
-    VP8_DECODER_CTRL_ID_MAX
-} ;
-
-
-/*!\brief VP8 encoder control function parameter type
- *
- * Defines the data types that VP8E control functions take. Note that
- * additional common controls are defined in vp8.h
- *
- */
-
-
-VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES,   int *)
-
-
 /*! @} - end defgroup vp8_decoder */


--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -74,7 +74,6 @@ void __cpuid(int CPUInfo[4], int info_type);
 #define HAS_SSE2  0x04
 #define HAS_SSE3  0x08
 #define HAS_SSSE3 0x10
-#define HAS_SSE4_1 0x20
 #ifndef BIT
 #define BIT(n) (1<<n)
 #endif
@@ -118,8 +117,6 @@ x86_simd_caps(void)

    if (reg_ecx & BIT(9))  flags |= HAS_SSSE3;

-    if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
-
    return flags & mask;
 }

--- a/vpxdec.c
+++ b/vpxdec.c
@@ -35,7 +35,6 @@
 #if CONFIG_MD5
 #include "md5_utils.h"
 #endif
-#include "tools_common.h"
 #include "nestegg/include/nestegg/nestegg.h"

 #ifndef PATH_MAX
@@ -108,19 +107,11 @@ static const arg_def_t demacroblock_level = ARG_DEF(NULL, "demacroblock-level",
        "Enable VP8 demacroblocking, w/ level");
 static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1,
                                       "Enable VP8 visible debug info");
-static const arg_def_t pp_disp_ref_frame = ARG_DEF(NULL, "pp-dbg-ref-frame", 1,
-                                       "Display only selected reference frame per macro block");
-static const arg_def_t pp_disp_mb_modes = ARG_DEF(NULL, "pp-dbg-mb-modes", 1,
-                                       "Display only selected macro block modes");
-static const arg_def_t pp_disp_b_modes = ARG_DEF(NULL, "pp-dbg-b-modes", 1,
-                                       "Display only selected block modes");
-static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1,
-                                       "Draw only selected motion vectors");
+

 static const arg_def_t *vp8_pp_args[] =
 {
    &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
-    &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs,
    NULL
 };
 #endif
@@ -323,8 +314,7 @@ void *out_open(const char *out_fn, int do_md5)
    }
    else
    {
-        FILE *outfile = out = strcmp("-", out_fn) ? fopen(out_fn, "wb")
-                                                  : set_binary_mode(stdout);
+        FILE *outfile = out = strcmp("-", out_fn) ? fopen(out_fn, "wb") : stdout;

        if (!outfile)
        {
@@ -442,8 +432,6 @@ unsigned int file_is_raw(FILE *infile,
    int is_raw = 0;
    vpx_codec_stream_info_t si;

-    si.sz = sizeof(si);
-
    if (fread(buf, 1, 32, infile) == 32)
    {
        int i;
@@ -552,7 +540,6 @@ webm_guess_framerate(struct input_ctx *input,
    *fps_den = tstamp / 1000;
    return 0;
 fail:
-    nestegg_destroy(input->nestegg_ctx);
    input->nestegg_ctx = NULL;
    rewind(input->infile);
    return 1;
@@ -715,10 +702,6 @@ int main(int argc, const char **argv_)
    vpx_codec_dec_cfg_t     cfg = {0};
 #if CONFIG_VP8_DECODER
    vp8_postproc_cfg_t      vp8_pp_cfg = {0};
-    int                     vp8_dbg_color_ref_frame = 0;
-    int                     vp8_dbg_color_mb_modes = 0;
-    int                     vp8_dbg_color_b_modes = 0;
-    int                     vp8_dbg_display_mv = 0;
 #endif
    struct input_ctx        input = {0};

@@ -804,42 +787,6 @@ int main(int argc, const char **argv_)
            if (level)
                vp8_pp_cfg.post_proc_flag |= level;
        }
-        else if (arg_match(&arg, &pp_disp_ref_frame, argi))
-        {
-            unsigned int flags = arg_parse_int(&arg);
-            if (flags)
-            {
-                postproc = 1;
-                vp8_dbg_color_ref_frame = flags;
-            }
-        }
-        else if (arg_match(&arg, &pp_disp_mb_modes, argi))
-        {
-            unsigned int flags = arg_parse_int(&arg);
-            if (flags)
-            {
-                postproc = 1;
-                vp8_dbg_color_mb_modes = flags;
-            }
-        }
-        else if (arg_match(&arg, &pp_disp_b_modes, argi))
-        {
-            unsigned int flags = arg_parse_int(&arg);
-            if (flags)
-            {
-                postproc = 1;
-                vp8_dbg_color_b_modes = flags;
-            }
-        }
-        else if (arg_match(&arg, &pp_disp_mvs, argi))
-        {
-            unsigned int flags = arg_parse_int(&arg);
-            if (flags)
-            {
-                postproc = 1;
-                vp8_dbg_display_mv = flags;
-            }
-        }

 #endif
        else
@@ -858,7 +805,7 @@ int main(int argc, const char **argv_)
        usage_exit();

    /* Open file */
-    infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
+    infile = strcmp(fn, "-") ? fopen(fn, "rb") : stdin;

    if (!infile)
    {
@@ -929,13 +876,7 @@ int main(int argc, const char **argv_)
        }

        if(input.kind == WEBM_FILE)
-            if(webm_guess_framerate(&input, &fps_den, &fps_num))
-            {
-                fprintf(stderr, "Failed to guess framerate -- error parsing "
-                                "webm file?\n");
-                return EXIT_FAILURE;
-            }
-
+            webm_guess_framerate(&input, &fps_den, &fps_num);

        /*Note: We can't output an aspect ratio here because IVF doesn't
           store one, and neither does VP8.
@@ -979,33 +920,6 @@ int main(int argc, const char **argv_)
        return EXIT_FAILURE;
    }

-    if (vp8_dbg_color_ref_frame
-        && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, vp8_dbg_color_ref_frame))
-    {
-        fprintf(stderr, "Failed to configure reference block visualizer: %s\n", vpx_codec_error(&decoder));
-        return EXIT_FAILURE;
-    }
-
-    if (vp8_dbg_color_mb_modes
-        && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, vp8_dbg_color_mb_modes))
-    {
-        fprintf(stderr, "Failed to configure macro block visualizer: %s\n", vpx_codec_error(&decoder));
-        return EXIT_FAILURE;
-    }
-
-    if (vp8_dbg_color_b_modes
-        && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, vp8_dbg_color_b_modes))
-    {
-        fprintf(stderr, "Failed to configure block visualizer: %s\n", vpx_codec_error(&decoder));
-        return EXIT_FAILURE;
-    }
-
-    if (vp8_dbg_display_mv
-        && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv))
-    {
-        fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", vpx_codec_error(&decoder));
-        return EXIT_FAILURE;
-    }
 #endif

    /* Decode file */
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -35,11 +35,9 @@
 #include "vpx/vp8cx.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/vpx_timer.h"
-#include "tools_common.h"
 #include "y4minput.h"
 #include "libmkv/EbmlWriter.h"
 #include "libmkv/EbmlIDs.h"
-#include "experimental.h"

 /* Need special handling of these functions on Windows */
 #if defined(_MSC_VER)
@@ -187,11 +185,11 @@ int stats_open_mem(stats_io_t *stats, int pass)
 }


-void stats_close(stats_io_t *stats, int last_pass)
+void stats_close(stats_io_t *stats)
 {
    if (stats->file)
    {
-        if (stats->pass == last_pass)
+        if (stats->pass == 1)
        {
 #if 0
 #elif USE_POSIX_MMAP
@@ -206,7 +204,7 @@ void stats_close(stats_io_t *stats, int last_pass)
    }
    else
    {
-        if (stats->pass == last_pass)
+        if (stats->pass == 1)
            free(stats->buf.buf);
    }
 }
@@ -252,8 +250,7 @@ enum video_file_type

 struct detect_buffer {
    char buf[4];
-    size_t buf_read;
-    size_t position;
+    int  valid;
 };


@@ -307,21 +304,14 @@ static int read_frame(FILE *f, vpx_image_t *img, unsigned int file_type,

            for (r = 0; r < h; r++)
            {
-                size_t needed = w;
-                size_t buf_position = 0;
-                const size_t left = detect->buf_read - detect->position;
-                if (left > 0)
+                if (detect->valid)
                {
-                    const size_t more = (left < needed) ? left : needed;
-                    memcpy(ptr, detect->buf + detect->position, more);
-                    buf_position = more;
-                    needed -= more;
-                    detect->position += more;
-                }
-                if (needed > 0)
-                {
-                    shortread |= (fread(ptr + buf_position, 1, needed, f) < needed);
+                    memcpy(ptr, detect->buf, 4);
+                    shortread |= fread(ptr+4, 1, w-4, f) < w-4;
+                    detect->valid = 0;
                }
+                else
+                    shortread |= fread(ptr, 1, w, f) < w;

                ptr += img->stride[plane];
            }
@@ -348,12 +338,12 @@ unsigned int file_is_ivf(FILE *infile,
                         unsigned int *fourcc,
                         unsigned int *width,
                         unsigned int *height,
-                         struct detect_buffer *detect)
+                         char          detect[4])
 {
    char raw_hdr[IVF_FILE_HDR_SZ];
    int is_ivf = 0;

-    if(memcmp(detect->buf, "DKIF", 4) != 0)
+    if(memcmp(detect, "DKIF", 4) != 0)
        return 0;

    /* See write_ivf_file_header() for more documentation on the file header
@@ -377,7 +367,6 @@ unsigned int file_is_ivf(FILE *infile,
    {
        *width = mem_get_le16(raw_hdr + 12);
        *height = mem_get_le16(raw_hdr + 14);
-        detect->position = 4;
    }

    return is_ivf;
@@ -445,7 +434,7 @@ struct EbmlGlobal
    int debug;

    FILE    *stream;
-    int64_t last_pts_ms;
+    uint64_t last_pts_ms;
    vpx_rational_t  framerate;

    /* These pointers are to the start of an element */
@@ -658,7 +647,7 @@ write_webm_block(EbmlGlobal                *glob,
    unsigned char  track_number;
    unsigned short block_timecode = 0;
    unsigned char  flags;
-    int64_t        pts_ms;
+    uint64_t       pts_ms;
    int            start_cluster = 0, is_keyframe;

    /* Calculate the PTS of this frame in milliseconds */
@@ -989,32 +978,23 @@ static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,
 static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
                                     "Enable automatic alt reference frames");
 static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
-                                        "AltRef Max Frames");
+                                        "alt_ref Max Frames");
 static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1,
-                                       "AltRef Strength");
+                                       "alt_ref Strength");
 static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1,
-                                   "AltRef Type");
-static const struct arg_enum_list tuning_enum[] = {
-    {"psnr", VP8_TUNE_PSNR},
-    {"ssim", VP8_TUNE_SSIM},
-    {NULL, 0}
-};
-static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1,
-                                   "Material to favor", tuning_enum);
+                                   "alt_ref Type");

 static const arg_def_t *vp8_args[] =
 {
    &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
-    &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
-    &tune_ssim, NULL
+    &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type, NULL
 };
 static const int vp8_arg_ctrl_map[] =
 {
    VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,
    VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
    VP8E_SET_TOKEN_PARTITIONS,
-    VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH , VP8E_SET_ARNR_TYPE,
-    VP8E_SET_TUNING, 0
+    VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH , VP8E_SET_ARNR_TYPE, 0
 };
 #endif

@@ -1040,7 +1020,6 @@ static void usage_exit()
 #if CONFIG_VP8_ENCODER
    fprintf(stderr, "\nVP8 Specific Options:\n");
    arg_show_usage(stdout, vp8_args);
-    xxx_show_usage(stdout);
 #endif
    fprintf(stderr, "\n"
           "Included encoders:\n"
@@ -1094,7 +1073,6 @@ int main(int argc, const char **argv_)
    int                      psnr_count = 0;

    exec_name = argv_[0];
-    ebml.last_pts_ms = -1;

    if (argc < 3)
        usage_exit();
@@ -1175,7 +1153,6 @@ int main(int argc, const char **argv_)
            out_fn = arg.val;
        else if (arg_match(&arg, &debugmode, argi))
            ebml.debug = 1;
-        else if (xxx_parse_arg(argi));
        else
            argj++;
    }
@@ -1212,12 +1189,6 @@ int main(int argc, const char **argv_)
     */
    cfg.g_timebase.den = 1000;

-    /* Never use the library's default resolution, require it be parsed
-     * from the file or set on the command line.
-     */
-    cfg.g_w = 0;
-    cfg.g_h = 0;
-
    /* Now parse the remainder of the parameters. */
    for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step)
    {
@@ -1329,7 +1300,7 @@ int main(int argc, const char **argv_)
                if (arg_ctrl_cnt < ARG_CTRL_CNT_MAX)
                {
                    arg_ctrls[arg_ctrl_cnt][0] = ctrl_args_map[i];
-                    arg_ctrls[arg_ctrl_cnt][1] = arg_parse_enum_or_int(&arg);
+                    arg_ctrls[arg_ctrl_cnt][1] = arg_parse_int(&arg);
                    arg_ctrl_cnt++;
                }
            }
@@ -1359,11 +1330,11 @@ int main(int argc, const char **argv_)
    {
        int frames_in = 0, frames_out = 0;
        unsigned long nbytes = 0;
+        size_t detect_bytes;
        struct detect_buffer detect;

        /* Parse certain options from the input file, if possible */
-        infile = strcmp(in_fn, "-") ? fopen(in_fn, "rb")
-                                    : set_binary_mode(stdin);
+        infile = strcmp(in_fn, "-") ? fopen(in_fn, "rb") : stdin;

        if (!infile)
        {
@@ -1373,11 +1344,13 @@ int main(int argc, const char **argv_)

        /* For RAW input sources, these bytes will applied on the first frame
         *  in read_frame().
+         * We can always read 4 bytes because the minimum supported frame size
+         *  is 2x2.
         */
-        detect.buf_read = fread(detect.buf, 1, 4, infile);
-        detect.position = 0;
+        detect_bytes = fread(detect.buf, 1, 4, infile);
+        detect.valid = 0;

-        if (detect.buf_read == 4 && file_is_y4m(infile, &y4m, detect.buf))
+        if (detect_bytes == 4 && file_is_y4m(infile, &y4m, detect.buf))
        {
            if (y4m_input_open(&y4m, infile, detect.buf, 4) >= 0)
            {
@@ -1402,8 +1375,8 @@ int main(int argc, const char **argv_)
                return EXIT_FAILURE;
            }
        }
-        else if (detect.buf_read == 4 &&
-                 file_is_ivf(infile, &fourcc, &cfg.g_w, &cfg.g_h, &detect))
+        else if (detect_bytes == 4 &&
+                 file_is_ivf(infile, &fourcc, &cfg.g_w, &cfg.g_h, detect.buf))
        {
            file_type = FILE_TYPE_IVF;
            switch (fourcc)
@@ -1422,15 +1395,8 @@ int main(int argc, const char **argv_)
        else
        {
            file_type = FILE_TYPE_RAW;
+            detect.valid = 1;
        }
-
-        if(!cfg.g_w || !cfg.g_h)
-        {
-            fprintf(stderr, "Specify stream dimensions with --width (-w) "
-                            " and --height (-h).\n");
-            return EXIT_FAILURE;
-        }
-
 #define SHOW(field) fprintf(stderr, "    %-28s = %d\n", #field, cfg.field)

        if (verbose && pass == 0)
@@ -1483,8 +1449,7 @@ int main(int argc, const char **argv_)
                              cfg.g_w, cfg.g_h, 1);
        }

-        outfile = strcmp(out_fn, "-") ? fopen(out_fn, "wb")
-                                      : set_binary_mode(stdout);
+        outfile = strcmp(out_fn, "-") ? fopen(out_fn, "wb") : stdout;

        if (!outfile)
        {
@@ -1562,7 +1527,7 @@ int main(int argc, const char **argv_)
            vpx_codec_iter_t iter = NULL;
            const vpx_codec_cx_pkt_t *pkt;
            struct vpx_usec_timer timer;
-            int64_t frame_start, next_frame_start;
+            int64_t frame_start;

            if (!arg_limit || frames_in < arg_limit)
            {
@@ -1583,11 +1548,9 @@ int main(int argc, const char **argv_)

            frame_start = (cfg.g_timebase.den * (int64_t)(frames_in - 1)
                          * arg_framerate.den) / cfg.g_timebase.num / arg_framerate.num;
-            next_frame_start = (cfg.g_timebase.den * (int64_t)(frames_in)
-                                * arg_framerate.den)
-                                / cfg.g_timebase.num / arg_framerate.num;
            vpx_codec_encode(&encoder, frame_avail ? &raw : NULL, frame_start,
-                             next_frame_start - frame_start,
+                             cfg.g_timebase.den * arg_framerate.den
+                             / cfg.g_timebase.num / arg_framerate.num,
                             0, arg_deadline);
            vpx_usec_timer_mark(&timer);
            cx_time += vpx_usec_timer_elapsed(&timer);
@@ -1695,7 +1658,7 @@ int main(int argc, const char **argv_)
        }

        fclose(outfile);
-        stats_close(&stats, arg_passes-1);
+        stats_close(&stats);
        fprintf(stderr, "\n");

        if (one_pass_only)