Compare commits

..

115 Commits

Author SHA1 Message Date
John Koleszar
2b0aee4b5d Update CHANGELOG for v1.0.0 (Duclair) release
Change-Id: I64472f717e5ef3672e1032b7ee24e73c4d0fff1f
2012-01-27 10:36:39 -08:00
John Koleszar
5e3f607e73 Update AUTHORS
Change-Id: I5b0578f126308b4ec65d72553cc247b8e29c21a9
2012-01-27 10:36:12 -08:00
John Koleszar
21071373e7 Update .mailmap
Change-Id: Ie7d94a884c81fab4d5dc48833a57ec2283e61fc2
2012-01-27 10:36:11 -08:00
John Koleszar
8be41bba80 Hook up VP8D_GET_LAST_REF_USED
Commit 892e23a5b introduced support for the VP8D_GET_LAST_REF_USED control,
but missed the mapping of the control id to the underlying function,
so it was unavailable to applications.

In addition, the underlying function vp8_references_buffer() is
moved from common/postproc.c to decoder/onyxd_if.c as postproc.c is
not built in all configurations.

Change-Id: I426dd254e7e6c4c061b70d729b69a6c384ebbe44
2012-01-27 10:13:20 -08:00
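
A minimal usage sketch of the control (assumes a decoder context initialized with vpx_codec_dec_init() and at least one decoded frame; the control writes an int whose bits indicate which reference frames were used, and the exact flag layout is an assumption here):

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    /* Ask the decoder which reference frames the most recently decoded
     * frame used. */
    static void print_last_ref_used(vpx_codec_ctx_t *codec) {
        int ref_used = 0;  /* bitmask of reference-frame flags */
        if (vpx_codec_control(codec, VP8D_GET_LAST_REF_USED, &ref_used))
            fprintf(stderr, "control failed: %s\n", vpx_codec_error(codec));
        else
            printf("reference frames used by last frame: 0x%x\n", ref_used);
    }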
John Koleszar
83cef816fd Correct clamping in use of vp8_find_near_mvs()
Commit e06c242ba introduced a change to call vp8_find_near_mvs() only
once instead of once per reference frame by observing that the only
effect that the frame had was on the bias applied to the motion
vector. By keeping track of the sign_bias value, the mv to use could
be flip-flopped by multiplying its components by -1.

This behavior was subtly wrong in the case when clamping was applied
to the motion vectors found by vp8_find_near_mvs(). A motion vector
could be in-bounds with one sign bias, but out of bounds after
inverting the sign, or vice versa. The clamping must match that done
by the decoder.

This change modifies vp8_find_near_mvs() to remove the clamping from
that function. The vp8_pick_inter_mode() and vp8_rd_pick_inter_mode()
functions instead track the correctly clamped values for both bias
values, switching between them by simple assignment. The common
clamping and inversion code is in vp8_find_near_mvs_bias().

Change-Id: I17e1a348d1643497eca0be232e2fbe2acf8478e1
2012-01-26 09:37:27 -08:00
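
A schematic sketch of the fix (simplified, hypothetical types; the real code operates on libvpx's int_mv and the per-macroblock search bounds): the near mv is found once, the version for the other sign bias is obtained by negating its components, and clamping happens after the inversion so it matches the decoder.

    typedef struct { int row, col; } mv_t;  /* stand-in for libvpx's MV */

    /* Clamp an mv to the current bounds, mirroring the decoder's clamping. */
    static void clamp_mv(mv_t *mv, int min_row, int max_row,
                         int min_col, int max_col) {
        if (mv->row < min_row) mv->row = min_row;
        if (mv->row > max_row) mv->row = max_row;
        if (mv->col < min_col) mv->col = min_col;
        if (mv->col > max_col) mv->col = max_col;
    }

    /* Near mv for a given sign bias: invert first, then clamp. Clamping
     * before the inversion (the old behaviour) could leave an mv that is
     * out of bounds after the sign flip, or vice versa. */
    static mv_t near_mv_for_bias(mv_t base, int invert_sign,
                                 int min_row, int max_row,
                                 int min_col, int max_col) {
        mv_t out = base;
        if (invert_sign) { out.row = -out.row; out.col = -out.col; }
        clamp_mv(&out, min_row, max_row, min_col, max_col);
        return out;
    }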
John Koleszar
630d3b95e2 Revert "Multithreaded encoder, late sync loopfilter"
This commit is incomplete, as it does not synchronize the loop filter
before returning a handle to the reconstructed frame in
vpx_codec_get_preview_frame(), which can cause (false?) failures
when running the test_reconstruct_buffer test.

This may be related to a bug that does cause visible artifacts, which
is also under investigation.

This reverts commit 380d64ecb1.

Change-Id: Iad710941e7731d44fc2bde63bc63d6763cc4629e
2012-01-24 15:41:59 -08:00
Deb Mukherjee
f357e5e2f7 Merge "Overhauling the thresholds and mixing proportions for mfqe postprocessor." 2012-01-20 08:45:42 -08:00
Deb Mukherjee
47dcd769c1 Overhauling the thresholds and mixing proportions for mfqe postprocessor.
Makes the thresholds for the multiframe quality enhancement module
depend on the difference between the base quantizers. Also modifies
the mixing function to weigh the current low quality frame less if
the difference in quantizer is large. With the above modifications
mfqe works well for both scalable patterns as well as low quality
key frames.

Change-Id: If24e94f63f3c292f939eea94f627e7ebfb27cb75
2012-01-20 05:11:00 -08:00
Jeff Faust
ac97b089d1 Merge "Simplify an assignment statement" 2012-01-18 21:14:51 -08:00
John Koleszar
185eb70011 Merge "Remove duplicate line in parameter parsing." 2012-01-18 16:04:51 -08:00
John Koleszar
6a4ff6f325 Merge "get_plane_pointers: use u/v planes consistently" 2012-01-18 14:22:55 -08:00
John Koleszar
4753ee4166 get_plane_pointers: use u/v planes consistently
The prior commit accidentally used the u plane where it should have
used the v plane.

Change-Id: Ib6c8443b99061536389f05ac25b8e0a307ace637
2012-01-18 12:50:06 -08:00
Jeff Faust
15c29afeca Simplify an assignment statement
Separated a double assignment that looked suspiciously like an
assignment and equality typo.

Change-Id: I7813979e9d7ea2539afb3c8ae6074f9df5ebdf52
2012-01-18 12:49:43 -08:00
Fritz Koenig
315b58ec38 Merge "Add makefile for building libvpx for Android." 2012-01-18 11:28:40 -08:00
Jim Bankoski
de368fd0b5 Merge "vp8d - valgrind warnings in mb post processor" 2012-01-18 11:12:37 -08:00
Fritz Koenig
d830573111 Add makefile for building libvpx for Android.
Android.mk file for using the Android NDK build
system to compile. Adds an option for the SDK path so that
the compiler that comes with Android can be used for testing
compiler compliance.

Change-Id: I5fd17cb76e3ed631758d3f392e62ae1a050d0d10
2012-01-18 09:46:03 -08:00
John Koleszar
0e06bc817a Merge changes I1ebe76aa,Ia079b52b
* changes:
  rdopt/pickinter: factor out some common setup
  rdopt: remove unused frame_lf_or_gf
2012-01-18 09:30:46 -08:00
Deb Mukherjee
d3879738bf Merge "Modifying the base q propagation in the mfqe post processing filter in a way such that when there is a single bad frame, the post-processing is applied not only to just that frame but a few subsequent frames as well." 2012-01-18 09:15:24 -08:00
Jim Bankoski
ed208f7d2f vp8d - valgrind warnings in mb post processor
Solved by extending the border in the postproc buffer as necessary

Change-Id: Ic3f61397fe5bc8e4db6fc78050b0b160bd0aee86
2012-01-17 17:27:39 -08:00
Fritz Koenig
c1c5932260 Remove duplicate line in parameter parsing.
resize_down_thresh was parsed and set twice.

Change-Id: I2685a6c3c825371f79ae94d305bcb50185a12dac
2012-01-17 17:11:32 -08:00
Deb Mukherjee
90b9f993c1 Modifying the base q propagation in the mfqe post processing
filter so that when there is a single bad frame, the
post-processing is applied not just to that frame but to a
few subsequent frames as well.

Change-Id: Iba5d9896eed77244eb76b4a74692a93f8ecff634
2012-01-17 09:28:03 -08:00
Adrian Grange
e479379abb Fixed bugs in multi-layer code related to changing params
When running multi-layer (ML) encodes and dynamically
changing coding parameters on the fly (e.g. frame
duration/rate, bandwidths allocated to each layer)
the encoder would not produce sensible output.

In certain cases the rate targeting would be
hideously inaccurate.

These fixes make it possible to change these coding
parameters correctly and to maintain accurate control
of the rate targeting.

I also added the specification of the input timebase
into the test program, vp8_scalable_patterns.c.

Patch 2: Moved declaration to appease MS compiler.

Change-Id: Ic8bb5a16daa924bb64974e740696e040d07ae363
2012-01-13 16:52:25 -08:00
John Koleszar
4ade079633 rdopt/pickinter: factor out some common setup
Add new get_predictor_pointers() and get_reference_search_order()
functions for code shared between the two implementations.

Change-Id: I1ebe76aa8f168b1f5cfabc00d05d8f19a0d4d207
2012-01-11 14:43:52 -08:00
John Koleszar
bd5bfd94b8 rdopt: remove unused frame_lf_or_gf
This flag was set but unused.

Change-Id: Ia079b52b88ffbe3b16fdbde4b84e2b87304eaa13
2012-01-11 13:02:19 -08:00
Deb Mukherjee
9c2ca8c1ca Allowing the mfqe post-processing filter to be used in conjunction
with deblock or demacroblock filters. When --mfqe is used together
with --demacroblock or --deblock, mfqe is applied first and then
demacroblock/deblock is applied to the mfqe result.

Change-Id: Id83ee01f1b4a33a116f071dcf26d59c7f3497c32
2012-01-10 14:14:41 -08:00
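
For illustration, the same combination can be requested through the usual VP8_SET_POSTPROC control; a sketch, assuming a decoder built with postproc support and an already-initialized context:

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8.h"

    static void enable_mfqe_with_deblock(vpx_codec_ctx_t *codec) {
        /* Fields: post_proc_flag, deblocking_level, noise_level.
         * mfqe runs first; deblock/demacroblock are applied to its output. */
        vp8_postproc_cfg_t pp = { VP8_MFQE | VP8_DEBLOCK | VP8_DEMACROBLOCK, 4, 0 };
        if (vpx_codec_control(codec, VP8_SET_POSTPROC, &pp))
            fprintf(stderr, "failed to set postproc: %s\n", vpx_codec_error(codec));
    }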
John Koleszar
e6c91b625e Merge "fix: roundoff initializer is not a constant" 2012-01-10 13:33:26 -08:00
John Koleszar
6c0d3d9522 Merge "Remove iwmmx target." 2012-01-10 12:32:35 -08:00
James Berry
6ce1f15dfb fix: roundoff initializer is not a constant
The precision used in the initialization of roundoff was not a constant;
updated to use #define MFQE_PRECISION 4

Change-Id: If2fc3d3d633d58a7f4ab34d258c232ec1e5f0a79
2012-01-10 14:19:30 -05:00
Fritz Koenig
55610053af Remove iwmmx target.
No optimized code present for target.

Change-Id: If99bb37491b15c1093e8851430c060cb2466898c
2012-01-10 11:16:37 -08:00
Jim Bankoski
892e23a5ba vp8d - function to check if a reference frame is used.
Change-Id: Id683b4d7f46ffa99145fc4b824c7232ab4182f21
2012-01-10 10:10:26 -08:00
Fritz Koenig
34168287ed Remove armv4 optimized scaler code.
Code was from a time when the compiler was
not good at optimizing.  Compilers are better
and instruction sets have increased to make
this code obsolete.

Change-Id: I8d261371685247465eb4aa00bdcecc9fe1784219
2012-01-09 11:42:30 -08:00
Deb Mukherjee
28aa08748e Merge "Multiframe quality enhancement postprocessing" 2012-01-09 10:23:24 -08:00
Johann
16a02e341c Merge "Remove symbian target and associated files." 2012-01-09 10:08:06 -08:00
Fritz Koenig
7ec139ccc8 Remove symbian target and associated files.
These targets are no longer maintained.

Change-Id: I923103006c439849fc015c1ac45ee7a5443ebc6d
2012-01-06 15:55:10 -08:00
James Zern
80528410fc Merge "Reduce the default kf_max_dist to 128." 2012-01-06 12:32:07 -08:00
John Koleszar
66da859e5e Merge "Reduced the size of Y1Dequant and friends to [128][2]" 2012-01-06 11:59:06 -08:00
Ralph Giles
2a0d7b1a55 Reduce the default kf_max_dist to 128.
The default maximum keyframe interval is 9999, or over five minutes
at normal video rates. When the encoder produces streams with such
a long interval seeking (with correct output) is more expensive,
and live streaming is impossible.

Of course the encoding application should set this parameter
based on its knowledge of the intended use of the stream, but
reducing the default gives better results for applications
which do not.

Change-Id: I900b15d74ce72ecc3ade4d43f758c5cf97a2098a
2012-01-06 11:34:45 -08:00
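
An application that knows its streams will be seeked or live-streamed can tighten the interval itself; a minimal sketch of the encoder configuration (error handling elided):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static void configure_keyframes(vpx_codec_enc_cfg_t *cfg) {
        vpx_codec_enc_config_default(vpx_codec_vp8_cx(), cfg, 0);
        cfg->kf_mode     = VPX_KF_AUTO; /* encoder picks keyframe placement... */
        cfg->kf_max_dist = 128;         /* ...but at most 128 frames apart */
    }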
Scott LaVarnway
5f25d4c175 Reduced the size of Y1Dequant and friends to [128][2]
This patch removes the local copies of the dequantize
constants and implements John's idea as described
in "Make a local copy of the dequantized data" commit.

Change-Id: Ic6b7d681f00bf63263f71ff1e39ab2f80729e8b2
2012-01-06 11:12:00 -08:00
Johann
70a5104ead Merge "Use number instead of string for eabi_attribute." 2012-01-05 12:32:23 -08:00
Deb Mukherjee
87aa846b47 Multiframe quality enhancement postprocessing
Adds a multiframe postprocessing module to enhance the quality of
certain frames that are coded at lower quality than preceding frames.
The module can be invoked from the commandline by use of the --mfqe
option, and will be most beneficial for enhancing the quality of
frames decoded using scalable patterns.

Uses the vp8_variance_var16x16 and vp8_variance_sad16x16 function
pointers to compute SAD and Variance of blocks.

Change-Id: Id73d2a6e3572d07f9f8e36bbce00a4fc5ffd8961
2012-01-05 10:21:48 -08:00
Johann
0780f258da Merge "Improve SSSE3 fast quantizer function" 2012-01-05 10:09:39 -08:00
Scott LaVarnway
b2c8dff727 Merge "Removed unused diff buffer" 2012-01-05 09:06:28 -08:00
Scott LaVarnway
89cdfdb231 Merge "SSE2 optimizations for vp8_build_intra_predictors_mby{,_s}()" 2012-01-05 09:05:19 -08:00
Scott LaVarnway
77119a5cd8 Merge "Improved sse2 version of simple loopfilter" 2012-01-04 13:26:13 -08:00
Fritz Koenig
b73e6c89cd Use number instead of string for eabi_attribute.
Current android ndk compiler does not recognize
strings for attributes.  Numerical equivalents
can be found in the "ARM IHI 0045C" document.

Change-Id: I72de85b8949dc0ae5212af604fff1d5a91a828ea
2012-01-04 11:28:41 -08:00
Scott LaVarnway
5bfa29b6c5 Merge "Make a local copy of the dequantized data" 2012-01-04 07:50:13 -08:00
Yunqing Wang
9f1083e9a0 Merge "Improve vp8cx_init_quantizer()" 2012-01-04 06:22:15 -08:00
Scott LaVarnway
33d9ea5471 Merge "Remove useless g_common.h" 2012-01-03 09:48:35 -08:00
Yunqing Wang
2b2c0c9bda Improve SSSE3 fast quantizer function
Simplified the EOB calculation in the function.

Change-Id: I7422f18be40ae270358f5cb0811d66e64436b56f
2011-12-29 12:05:50 -05:00
John Koleszar
3cb92b85b9 Remove unused MACROBLOCK member vector_range
Change-Id: Ie2dc0d72363ff38e0f71b59f6e2d1a2d70c5266b
2011-12-28 14:58:38 -08:00
John Koleszar
31e86192ba Remove unused BLOCK member force_empty
Change-Id: I72ed49ce14ca0124dd0d31bfcf4c7630a4681587
2011-12-28 13:57:51 -08:00
Yunqing Wang
b510863f8f Improve vp8cx_init_quantizer()
Except zrun_zbin_boost, 15 AC values are the same for all other
parameters. Removed unnecessary calculation.

Change-Id: I6101c0fe8080bd2b4387c3b04d7ddedbf6010409
2011-12-28 13:55:55 -05:00
John Koleszar
2d8d15a71c Merge "Add missing includes for multi-res" 2011-12-22 13:01:19 -08:00
John Koleszar
03fadc4b20 Merge "Remove unnecessary ternary constructs" 2011-12-22 13:01:05 -08:00
John Koleszar
d48ea5a2ab Merge "Remove legacy integer types" 2011-12-22 13:00:23 -08:00
John Koleszar
adb10c47a8 Merge "Use lookup tables for mode_check_freq" 2011-12-22 12:59:47 -08:00
John Koleszar
64c4be2669 Merge "Use lookup tables for thresh_mult" 2011-12-22 10:31:21 -08:00
John Koleszar
09080d4e5e Add missing includes for multi-res
Makes the distribution tree (built with 'make dist') buildable with
--enable-install-srcs --enable-multi-res-encoding

Change-Id: If2ea7632f7b26615196e9abcfaa34618cc50112a
2011-12-22 10:27:05 -08:00
John Koleszar
0c2b2c79ae Remove unnecessary ternary constructs
The code had a number of constructs like (condition)?1:0,
which is redundant with C's semantics. In the cases where a boolean
operator was used in the condition, simply remove the ternary part.
Otherwise adjust the surrounding expression to remove the condition
(eg, for rounding up. See pickinter.c and rdopt.c)

Change-Id: Icb2372defa3783cf31857d90b2630d06b2c7e1be
2011-12-22 10:09:46 -08:00
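
For example (an illustrative sketch, not the exact lines touched by the patch):

    static void example(int eob, int height) {
        /* Before: redundant ternaries. */
        int has_coeffs = (eob > 0) ? 1 : 0;
        int mb_rows    = height / 16 + ((height % 16) ? 1 : 0);

        /* After: rely on C's boolean semantics, and fold the round-up
         * into the arithmetic. */
        has_coeffs = (eob > 0);
        mb_rows    = (height + 15) / 16;

        (void)has_coeffs;
        (void)mb_rows;
    }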
John Koleszar
f56918ba9c Remove legacy integer types
Remove BOOL, INTn, UINTn, etc, in favor of C99-style fixed width
types.

Change-Id: I396636212fb5edd6b347d43cc940186d8cd1e7b5
2011-12-22 09:58:40 -08:00
John Koleszar
aa8650dd7f Use lookup tables for mode_check_freq
Mostly cosmetic. Trying for a more compact representation of speed
selection thresholds.

Change-Id: I339e7840049b91ad569aabbdc9c702a496110d3b
2011-12-22 09:43:44 -08:00
John Koleszar
efb4783d36 Use lookup tables for thresh_mult
Mostly cosmetic. Trying for a more compact representation of speed
selection thresholds.

Change-Id: Icaebea632c7bb71ca8e07b4def04a046d4515e27
2011-12-22 09:43:40 -08:00
John Koleszar
a2407935d2 Merge "Remove opaque pointer VP8D_PTR" 2011-12-22 09:36:11 -08:00
John Koleszar
0c2f8e77cc Remove useless g_common.h
This file declared a bunch of nonexistent, unreferenced global
function pointers.

Change-Id: Ic26bb8c7712deba754c49fc01f383b53afc9e728
2011-12-21 15:02:23 -08:00
John Koleszar
bf1a8073c3 Remove opaque pointer VP8D_PTR
Use an opaque struct rather than typecasting through VP8D_PTR, an int*.

Change-Id: Ia260b7d53d7e0950cfa1e00f4ecead1099bd3b87
2011-12-21 14:48:10 -08:00
James Zern
b651875e24 squash some signed/unsigned comparison warnings
Change-Id: Ifc64cf990ae04d77934da3324d0afb3993f061e7
2011-12-21 13:49:19 -08:00
Johann
db389cb804 Make a local copy of the dequantized data
Multithreaded encoding was breaking at low bitrates

Please review/comment. Not sure if this is the best fix.

Change-Id: I87468c765372593fd865bc82e25121ebb8ca6af2
2011-12-21 12:39:39 -08:00
John Koleszar
bb1915274f Remove unreferenced includes
These files are legacy and have no current references.

Change-Id: I38224961fafeb33bc3eb6150bb0c2249ccbb4f60
2011-12-21 11:01:11 -08:00
John Koleszar
16a8948c45 Merge "Remove opaque pointer VP8_PTR" 2011-12-21 09:59:22 -08:00
Scott LaVarnway
1d7d18c69c Improved sse2 version of simple loopfilter
Change-Id: Iae406d16fab5bace47fbcf5ef7ed021f08af159d
2011-12-21 12:52:18 -05:00
John Koleszar
63d9c4da5e Merge "tokenizer: use correct block type context in stuff1st_order_b" 2011-12-21 09:20:21 -08:00
John Koleszar
b0056c3b5e Remove opaque pointer VP8_PTR
Use an opaque struct rather than typecasting through VP8_PTR, an int*.

Change-Id: I5ed4d9238ba2e8d51bfa07a8da87a2eb4c8fa43a
2011-12-21 09:13:51 -08:00
John Koleszar
056bcc8771 remove armv6 files from armv5 build
Compile bilinearfilter_arm.c only when HAVE_ARMV6 is set, as its definitions
are v6 only. This is normally not a problem for static builds as the file
is elided at link time, but this was not being done properly for the
--enable-shared --enable-pic build.

Change-Id: Ic800a7cde751f74f22555c5b247f99f9df5e550d
2011-12-19 13:51:11 -08:00
Johann
080919b3c2 Merge "Avoid heap allocation of firstpass stats" 2011-12-19 10:11:23 -08:00
John Koleszar
c75f0ec379 Merge "fix: make sure ss_err is large enough" 2011-12-19 09:50:12 -08:00
Yunqing Wang
fd294c553a Merge "Merge mr_pick_inter_mode and pick_inter_mode" 2011-12-19 08:42:20 -08:00
Yunqing Wang
c647ec4462 Merge mr_pick_inter_mode and pick_inter_mode
Merged multi-resolution motion estimation with regular motion
estimation function in order to remove duplicated part. This
caused slight changes in multi-resolution encoder quality &
performance.

Change-Id: Ib4ecc7acfebfe5eea959b5b91febae6db7b95fd1
2011-12-16 18:02:29 -05:00
James Berry
24196dd987 fix: make sure ss_err is large enough
increase size of ss_err by one to make
sure there is room for 64 elements.

Change-Id: I355cb8c499aa7da3b9675f2326a8d25a74bb88d2
2011-12-16 17:43:55 -05:00
John Koleszar
26c6a44c66 Avoid heap allocation of firstpass stats
The total_stats, this_frame_stats, and total_left_stats structures
were previously created by a heap allocation, despite being of fixed
size. These structures were allocated and deallocated during
{de,}allocate_compressor_data, which is reinvoked whenever the frame
size changes. Unfortunately, this clobbers the total_stats and
total_left_stats data.

Historically, these were variable size at one time, due to the first
pass motion map, which necessitated their being created by a unique
heap allocation. However, this bug with the total_stats being
clobbered has probably been present since that initial implementation.

These structures are instead moved to be stored within the struct
twopass_rc directly, rather than being heap allocated separately.

Change-Id: I7f9e519e25c58b92969071f0e99fa80307e0682b
2011-12-16 11:40:23 -08:00
Scott LaVarnway
0ccefd2c8f Fixed mb_skip_coeff bug
When mb_skip_coeff is set, the idct is not necessary.  Prior
to this patch, the code would call idcts based on leftover
eob information.  This patch will now skip the idct for
SPLIT_MV and clear out the eobs for B_PRED, forcing the idct
to be skipped.

Change-Id: If5b0d2ed3ebd07789d30ec5160df927485fcaa17
2011-12-16 13:48:01 -05:00
Scott LaVarnway
a53d5a4c44 Moved dequant idct into common
These functions are now used by the encoder.
This is WIP with the goal of creating a common idct/add for
the encoder and decoder.  A boost of 1.8% was seen for
the HD rt test clip used.

[Tero] Added needed changes to ARM side.

Change-Id: Ibbb8000be09034203d7adffc457d3c3f8b06a5bf
2011-12-15 14:23:41 -05:00
Yunqing Wang
c8df1656bd Merge "Only call vp8_find_near_mvs() once for each macroblock" 2011-12-15 09:53:25 -08:00
Yunqing Wang
e06c242baa Only call vp8_find_near_mvs() once for each macroblock
While doing motion search on a macroblock, we usually call
vp8_find_near_mvs once per reference frame. Actually, for
different reference frames, the only difference in calculating
these near_mvs is they may have different sign_bias, which
causes a sign change in resulting near_mvs. In this change, we
only do find_near_mvs for the first reference frame. For other
reference frames, only need to adjust the near_mvs according to
that reference frame's sign_bias value.

Change-Id: I661394b49c6ad79fed7d0f2eb2be239b9c56f149
2011-12-15 11:19:18 -05:00
Yunqing Wang
d7e09b6ada Merge "Force realtime version 1 streams to only use simple loopfilter" 2011-12-15 05:38:57 -08:00
John Koleszar
72f459c77f Merge "Avoid multiple test for same lvl in auto filter lvl pick" 2011-12-14 16:28:13 -08:00
John Koleszar
4f8f360098 Merge "add check to ensure that cq_level falls within min and max q" 2011-12-14 12:10:06 -08:00
Johann
7d54dfc6e4 Merge "Use xcode 4.2" 2011-12-14 11:54:22 -08:00
Johann
24beb58dd5 Merge "Fix iOS conversion script" 2011-12-14 11:44:21 -08:00
John Koleszar
e542627b0c Merge "fix: active_worst_quality could be set above 127" 2011-12-14 11:22:00 -08:00
Johann
247e7343d5 Use xcode 4.2
Allow targeting darwin11 / 10.7

Update arm paths for iPhoneOS 5.0

Change-Id: I057156349311ec66a163c4c1cea60dc5aeaaa492
2011-12-14 11:16:38 -08:00
James Berry
c1c47e83b0 add check to ensure that cq_level falls within min and max q
Add the notion of deferred validation of parameters. We don't want to
validate the cq_level at initialization time, because it won't have
been set via set_param() yet.

Change-Id: Ia1308395e8c10e0b1dc4e9af3a09b2bd6744cc30
2011-12-14 10:39:06 -08:00
Attila Nagy
51c4f9e6b1 Avoid multiple test for same lvl in auto filter lvl pick
Sometimes the same level is tested 2-3 times; store and reuse the
calculated error value.

Change-Id: Ia1c04a2568232edf9a5a62c4e2d8e8a50d85e00e
2011-12-14 15:56:29 +02:00
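
A schematic sketch of the caching (hypothetical names; the real code stores the filtering error for each level it has already tried):

    #include <stdint.h>
    #include <string.h>

    #define MAX_LOOP_FILTER 63

    int64_t try_filter_level(int level);  /* hypothetical: runs the test filter */

    static int64_t level_err[MAX_LOOP_FILTER + 1];
    static int     level_done[MAX_LOOP_FILTER + 1];

    /* Call once per frame before the level search. */
    static void reset_level_cache(void) {
        memset(level_done, 0, sizeof(level_done));
    }

    /* Return the error for a level, computing it only the first time. */
    static int64_t filter_err_cached(int level) {
        if (!level_done[level]) {
            level_err[level] = try_filter_level(level);
            level_done[level] = 1;
        }
        return level_err[level];
    }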
Attila Nagy
55fbdd58ac Force realtime version 1 streams to only use simple loopfilter
...regardless of the speed settings.

Change-Id: I4b91ac7a7208efd690dfc69e175f8eb769b6ce03
2011-12-14 12:57:49 +02:00
John Koleszar
c4aeff94b1 Merge "vpx_integer.h: fix incorrect type emulation" 2011-12-13 12:55:46 -08:00
James Berry
f8b431c334 fix: active_worst_quality could be set above 127
add check to set active_worst_quality to 127 if it
is set above 127

Change-Id: I7db353d5c1b1c8516a116542b6ed21c0110bb512
2011-12-13 14:58:59 -05:00
John Koleszar
d6020f9d52 tokenizer: use correct block type context in stuff1st_order_b
The fast-path for skipped MBs was not correctly respecting the
block type during update of the coefficient counts. Extracted
this from part of change I365cfb6ac636f19c545f682e3aeac185253abaef

Change-Id: I53d8cf0a00a98034b97b0ed3414b703bae74a739
2011-12-13 11:58:56 -08:00
Johann
20573f0640 Merge "Fix incorrect PROC/ENDP match" 2011-12-13 10:42:02 -08:00
Johann
41e3da3a8c Fix incorrect PROC/ENDP match
The conversion script was incorrectly matching
CONFIG_POSTPROC[_VISUALIZER] and generating an
incorrect vpx_config.asm

Match both PROC and ENDP on word boundaries

Change-Id: Ic2788c3b522d4ee0afc5223b72e1b09fb52645be
2011-12-13 10:40:34 -08:00
Yunqing Wang
72af533f79 Merge "Align image buffer in multiple-resolution encoder" 2011-12-13 10:39:44 -08:00
Yunqing Wang
153eec46e0 Align image buffer in multiple-resolution encoder
Aligned the image buffer and stride to 32 bytes. This enables
calling the optimized scaler function in libyuv, and improves
performance.

Tested libyuv scaler(x86 optimization) on Linux and Windows,
including: Linux 32/64bit, visual studio 32/64bit, Cygwin, and
MinGW32.

Also, fixed a wrong pointer in vpx_codec_encode().

Change-Id: Ibe97d7a0a745f82c43852fa4ed719be5a4db6abc
2011-12-13 09:25:30 -05:00
Johann
8bc60e4749 Fix iOS conversion script
Mach-O prefixes function calls with _

Change-Id: I778c2ab91266887731a6a0316b42af7641826da4
2011-12-12 17:14:03 -08:00
Jim Bankoski
6b2792b0e0 Merge "vp8e - entropy stats per frame type" 2011-12-12 09:08:34 -08:00
Scott LaVarnway
c4aa1d508e Removed unused diff buffer
Change-Id: I9211358cca89b1c4f84b53a202a63ecf9e79ae4c
2011-12-12 11:06:55 -05:00
Scott LaVarnway
afa1b66108 Merge "Improved mmx/sse2 versions of iwalsh" 2011-12-12 06:40:28 -08:00
Jim Bankoski
6de67cd6e8 vp8e - entropy stats per frame type
Change-Id: I4168eb6ea22ae541471738a7a3453e7d52059275
2011-12-09 16:56:18 -08:00
John Koleszar
a4e410a3ca vpx_integer.h: fix incorrect type emulation
The previous definition of uintptr_t was incorrect on x64 with MSVC.
Also, the ARMV6 version of int_fast16_t was defined as unsigned for
some reason. Since the fast types are currently unused, just remove
them.

Change-Id: Idd73f77a989c77feedcb4a6802ae6bd37324ed40
2011-12-09 11:54:40 -08:00
Scott LaVarnway
9fa6132fc5 Improved mmx/sse2 versions of iwalsh
Removed unnecessary transposes.

Change-Id: I029fbaf8afafee34d54a4f3333c22023c15003c3
2011-12-08 14:37:59 -05:00
Johann
a69810b893 Merge "Reduce mem copies in encoder loopfilter level picking" 2011-12-07 10:41:00 -08:00
Attila Nagy
e570b0406d Reduce mem copies in encoder loopfilter level picking
Do the test filtering in the existing backup frame buffer instead of
the original. Copy the original data into an extra buffer before doing
the filtering. This way there is no need to restore the original
unfiltered frame at the end of the level picking process.

This came up in some discussions with Johann. Thanks!

Change-Id: I495f4301d983854673276c34ec0ddf9a9d622122
2011-12-07 09:59:50 +02:00
Yunqing Wang
254889cdfc Merge "Allow aligning the raw image buffer" 2011-12-06 10:21:28 -08:00
Yunqing Wang
be5bbc96e8 Allow aligning the raw image buffer
Added code to allocate aligned image buffer in vpx_img_alloc(). The
alignment of the buffer and stride is determined by the parameter
align.

Change-Id: Idc866978484be3558551c56df39130ab7f74ddd4
2011-12-06 13:19:59 -05:00
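
A usage sketch: requesting a 32-byte-aligned raw frame buffer so optimized scaling routines can operate on it directly:

    #include <stdio.h>
    #include "vpx/vpx_image.h"

    int main(void) {
        vpx_image_t raw;
        /* The last argument is the alignment of the buffer and stride, in bytes. */
        if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, 1280, 720, 32)) {
            fprintf(stderr, "failed to allocate frame buffer\n");
            return 1;
        }
        vpx_img_free(&raw);
        return 0;
    }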
Yunqing Wang
aa7335e610 Multiple-resolution encoder
The example encoder down-samples the input video frames a number of
times with a down-sampling factor, and then encodes and outputs
bitstreams with different resolutions.

Supports an arbitrary down-sampling factor, and the down-sampling factor
can be different for each encoding level.

For example, the encoder can be tested as follows.
1. Configure with multi-resolution encoding enabled:
../libvpx/configure --target=x86-linux-gcc --disable-codecs
--enable-vp8 --enable-runtime_cpu_detect --enable-debug
--disable-install-docs --enable-error-concealment
--enable-multi-res-encoding
2. Run make
3. Encode:
If input video is 1280x720, run:
./vp8_multi_resolution_encoder 1280 720 input.yuv 1.ivf 2.ivf 3.ivf 1
(output: 1.ivf(1280x720); 2.ivf(640x360); 3.ivf(320x180).
The last parameter is set to 1/0 to show/not show PSNR.)
4. Decode:
./simple_decoder 1.ivf 1.yuv
./simple_decoder 2.ivf 2.yuv
./simple_decoder 3.ivf 3.yuv
5. View video:
mplayer 1.yuv -demuxer rawvideo -rawvideo w=1280:h=720 -loop 0 -fps 30
mplayer 2.yuv -demuxer rawvideo -rawvideo w=640:h=360 -loop 0 -fps 30
mplayer 3.yuv -demuxer rawvideo -rawvideo w=320:h=180 -loop 0 -fps 30

The encoding parameters can be modified in vp8_multi_resolution_encoder.c,
for example, target bitrate, frame rate...

Modified API. John helped a lot with that. Thanks!

Change-Id: I03be9a51167eddf94399f92d269599fb3f3d54f5
2011-12-05 17:59:42 -05:00
John Koleszar
6127af60c1 Merge "Speed selection support for disabled reference frames" 2011-12-05 14:36:54 -08:00
John Koleszar
e55974bf86 Speed selection support for disabled reference frames
There was an implicit reference frame test order (typically LAST,
GOLD, ARF) in the mode selection logic, but this doesn't provide the
expected results when some reference frames are disabled. For
instance, in real-time mode, the speed selection logic often disables
the ARF modes. So if the user disables the LAST and GOLD frames, the
encoder was always choosing INTRA, when in reality searching the ARF
in this case has the same speed penalty as searching LAST would have
had.

Instead, introduce the notion of a reference frame search order. This
patch preserves the former priorities, so if a frame is disabled, the
other frames bump up a slot to take its place. This patch lays the
groundwork for doing something smarter in the frame test order, for
example considering temporal distance or looking at the frames used by
nearby blocks.

Change-Id: I1199149f8662a408537c653d2c021c7f1d29a700
2011-11-18 13:53:21 -08:00
Scott LaVarnway
df49c7c58d SSE2 optimizations for vp8_build_intra_predictors_mby{,_s}()
Ronald recently sent me this patch that he did in April.
> From: Ronald S. Bultje <rbultje@google.com>
> Date: Thu, 28 Apr 2011 17:30:15 -0700
> Subject: [PATCH] SSE2 optimizations for
> vp8_build_intra_predictors_mby{,_s}().
HD decode tests have shown a performance boost up to 1.5%,
depending on material.
Patch set 3: Fixed encoder crash.

Change-Id: Ie1fd1fa3dc750eec1a7a20bfa2decc079dcf48c8
2011-11-09 15:30:35 -05:00
136 changed files with 9054 additions and 4206 deletions


@@ -3,3 +3,5 @@ Johann Koenig <johannkoenig@google.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Tom Finegan <tomfinegan@google.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>


@@ -6,10 +6,12 @@ Adrian Grange <agrange@google.com>
Alex Converse <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Deb Mukherjee <debargha@google.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
Fredrik Söderquist <fs@opera.com>
@@ -21,6 +23,7 @@ Henrik Lundin <hlundin@google.com>
James Berry <jamesberry@google.com>
James Zern <jzern@google.com>
Jan Kratochvil <jan.kratochvil@redhat.com>
Jeff Faust <jfaust@google.com>
Jeff Muizelaar <jmuizelaar@mozilla.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
@@ -41,6 +44,7 @@ Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Philip Jägenstedt <philipj@opera.com>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Rafaël Carré <funman@videolan.org>
Ralph Giles <giles@xiph.org>
Ronald S. Bultje <rbultje@google.com>
Scott LaVarnway <slavarnway@google.com>


@@ -1,3 +1,51 @@
2012-01-27 v1.0.0 "Duclair"
Our fourth named release, focused on performance and features related to
real-time encoding. It also fixes a decoder crash bug introduced in
v0.9.7, so all users of that release are encouraged to upgrade.
- Upgrading:
This release is ABI incompatible with prior releases of libvpx, so the
"major" version number has been bumped to 1. You must recompile your
applications against the latest version of the libvpx headers. The
API remains compatible, and this should not require code changes in most
applications.
- Enhancements:
This release introduces several substantial new features to the encoder,
of particular interest to real time streaming applications.
Temporal scalability allows the encoder to produce a stream that can
be decimated to different frame rates, with independent rate targetting
for each substream.
Multiframe quality enhancement postprocessing can make visual quality
more consistent in the presence of frames that are substantially
different quality than the surrounding frames, as in the temporal
scalability case and in some forced keyframe scenarios.
Multiple-resolution encoding support allows the encoding of the
same content at different resolutions faster than encoding them
separately.
- Speed:
Optimization targets for this release included the decoder and the real-
time modes of the encoder. Decoder speed on x86 has improved 10.5% with
this release. Encoder improvements followed a curve where speeds 1-3
improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved
1.5% to 10.5%, respectively. "Best" mode speed is consistent with the
Cayuga release.
- Quality:
Encoder quality in the single stream case is consistent with the Cayuga
release.
- Bug Fixes:
This release fixes an OOB read decoder crash bug present in v0.9.7
related to the clamping of motion vectors in SPLITMV blocks. This
behavior could be triggered by corrupt input or by starting
decoding from a P-frame.
2011-08-15 v0.9.7-p1 "Cayuga" patch 1
This is an incremental bugfix release against Cayuga. All users of that
release are strongly encouraged to upgrade.

README

@@ -42,17 +42,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:
--help output of the configure script. As of this writing, the list of
available targets is:
armv5te-android-gcc
armv5te-linux-rvct
armv5te-linux-gcc
armv5te-symbian-gcc
armv6-darwin-gcc
armv6-linux-rvct
armv6-linux-gcc
armv6-symbian-gcc
iwmmxt-linux-rvct
iwmmxt-linux-gcc
iwmmxt2-linux-rvct
iwmmxt2-linux-gcc
armv7-android-gcc
armv7-linux-rvct
armv7-linux-gcc
mips32-linux-gcc

args.c

@@ -57,7 +57,7 @@ int arg_match(struct arg *arg_, const struct arg_def *def, char **argv)
}
else if (def->long_name)
{
int name_len = strlen(def->long_name);
const size_t name_len = strlen(def->long_name);
if (strlen(arg.argv[0]) >= name_len + 2
&& arg.argv[0][1] == '-'

build/make/Android.mk

@@ -0,0 +1,193 @@
##
## Copyright (c) 2012 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
#
# This file is to be used for compiling libvpx for Android using the NDK.
# In an Android project place a libvpx checkout in the jni directory.
# Run the configure script from the jni directory. Base libvpx
# encoder/decoder configuration will look similar to:
# ./libvpx/configure --target=armv7-android-gcc --disable-examples \
# --sdk-path=/opt/android-ndk-r6b/
#
# When targeting Android, realtime-only is enabled by default. This can
# be overridden by adding the command line flag:
# --disable-realtime-only
#
# This will create .mk files that contain variables that contain the
# source files to compile.
#
# Place an Android.mk file in the jni directory that references the
# Android.mk file in the libvpx directory:
# LOCAL_PATH := $(call my-dir)
# include $(CLEAR_VARS)
# include libvpx/build/make/Android.mk
#
# There are currently two TARGET_ARCH_ABI targets for ARM.
# armeabi and armeabi-v7a. armeabi-v7a is selected by creating an
# Application.mk in the jni directory that contains:
# APP_ABI := armeabi-v7a
#
# To change to building armeabi, run ./libvpx/configure again, but with
# --target=arm5te-android-gcc and and modify the Application.mk file to
# set APP_ABI := armeabi
#
# Running ndk-build will build libvpx and include it in your project.
#
CONFIG_DIR := $(LOCAL_PATH)
LIBVPX_PATH := $(LOCAL_PATH)/libvpx
ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas
ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL)
# Makefiles created by the libvpx configure process
# This will need to be fixed to handle x86.
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
include $(CONFIG_DIR)/libs-armv7-android-gcc.mk
else
include $(CONFIG_DIR)/libs-armv5te-android-gcc.mk
endif
# Rule that is normally in Makefile created by libvpx
# configure. Used to filter out source files based on configuration.
enabled=$(filter-out $($(1)-no),$($(1)-yes))
# Override the relative path that is defined by the libvpx
# configure process
SRC_PATH_BARE := $(LIBVPX_PATH)
# Include the list of files to be built
include $(LIBVPX_PATH)/libs.mk
# Want arm, not thumb, optimized
LOCAL_ARM_MODE := arm
LOCAL_CFLAGS := -O3
# -----------------------------------------------------------------------------
# Template : asm_offsets_template
# Arguments : 1: assembly offsets file to be created
# 2: c file to base assembly offsets on
# Returns : None
# Usage : $(eval $(call asm_offsets_template,<asmfile>, <srcfile>
# Rationale : Create offsets at compile time using for structures that are
# defined in c, but used in assembly functions.
# -----------------------------------------------------------------------------
define asm_offsets_template
_SRC:=$(2)
_OBJ:=$(ASM_CNV_PATH)/$$(notdir $(2)).S
_FLAGS = $$($$(my)CFLAGS) \
$$(call get-src-file-target-cflags,$(2)) \
$$(call host-c-includes,$$(LOCAL_C_INCLUDES) $$(CONFIG_DIR)) \
$$(LOCAL_CFLAGS) \
$$(NDK_APP_CFLAGS) \
$$(call host-c-includes,$$($(my)C_INCLUDES)) \
-DINLINE_ASM \
-S \
_TEXT = "Compile $$(call get-src-file-text,$(2))"
_CC = $$(TARGET_CC)
$$(eval $$(call ev-build-file))
$(1) : $$(_OBJ) $(2)
@mkdir -p $$(dir $$@)
@grep -w EQU $$< | tr -d '\#' | $(CONFIG_DIR)/$(ASM_CONVERSION) > $$@
endef
# Use ads2gas script to convert from RVCT format to GAS format. This passes
# puts the processed file under $(ASM_CNV_PATH). Local clean rule
# to handle removing these
ASM_CNV_OFFSETS_DEPEND = $(ASM_CNV_PATH)/asm_com_offsets.asm
ifeq ($(CONFIG_VP8_DECODER), yes)
ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/asm_dec_offsets.asm
endif
ifeq ($(CONFIG_VP8_ENCODER), yes)
ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/asm_enc_offsets.asm
endif
.PRECIOUS: %.asm.s
$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm $(ASM_CNV_OFFSETS_DEPEND)
@mkdir -p $(dir $@)
@$(CONFIG_DIR)/$(ASM_CONVERSION) <$< > $@
LOCAL_SRC_FILES += vpx_config.c
# Remove duplicate entries
CODEC_SRCS_UNIQUE = $(sort $(CODEC_SRCS))
# Pull out C files. vpx_config.c is in the immediate directory and
# so it does not need libvpx/ prefixed like the rest of the source files.
CODEC_SRCS_C = $(filter %.c, $(CODEC_SRCS_UNIQUE))
LOCAL_CODEC_SRCS_C = $(filter-out vpx_config.c, $(CODEC_SRCS_C))
LOCAL_SRC_FILES += $(foreach file, $(LOCAL_CODEC_SRCS_C), libvpx/$(file))
# Pull out assembly files, splitting NEON from the rest. This is
# done to specify that the NEON assembly files use NEON assembler flags.
CODEC_SRCS_ASM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
CODEC_SRCS_ASM = $(foreach v, \
$(CODEC_SRCS_ASM_ALL), \
$(if $(findstring neon,$(v)),,$(v)))
CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \
$(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
$(CODEC_SRCS_ASM))
LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
CODEC_SRCS_ASM_NEON = $(foreach v, \
$(CODEC_SRCS_ASM_ALL),\
$(if $(findstring neon,$(v)),$(v),))
CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \
$(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
$(CODEC_SRCS_ASM_NEON))
LOCAL_SRC_FILES += $(patsubst %.s, \
%.s.neon, \
$(CODEC_SRCS_ASM_NEON_ADS2GAS))
endif
LOCAL_CFLAGS += \
-DHAVE_CONFIG_H=vpx_config.h \
-I$(LIBVPX_PATH) \
-I$(ASM_CNV_PATH)
LOCAL_MODULE := libvpx
LOCAL_LDLIBS := -llog
LOCAL_STATIC_LIBRARIES := cpufeatures
.PHONY: clean
clean:
@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
@$(RM) $(patsubst %.asm, %.*, $(ASM_CNV_OFFSETS_DEPEND))
@$(RM) -r $(ASM_CNV_PATH)
include $(BUILD_SHARED_LIBRARY)
$(eval $(call asm_offsets_template,\
$(ASM_CNV_PATH)/asm_com_offsets.asm, \
$(LIBVPX_PATH)/vp8/common/asm_com_offsets.c))
ifeq ($(CONFIG_VP8_DECODER), yes)
$(eval $(call asm_offsets_template,\
$(ASM_CNV_PATH)/asm_dec_offsets.asm, \
$(LIBVPX_PATH)/vp8/decoder/asm_dec_offsets.c))
endif
ifeq ($(CONFIG_VP8_ENCODER), yes)
$(eval $(call asm_offsets_template,\
$(ASM_CNV_PATH)/asm_enc_offsets.asm, \
$(LIBVPX_PATH)/vp8/encoder/asm_enc_offsets.c))
endif
$(call import-module,cpufeatures)


@@ -129,11 +129,14 @@ while (<STDIN>)
# ARM code
s/\sARM/.arm/g;
# eabi_attributes numerical equivalents can be found in the
# "ARM IHI 0045C" document.
# REQUIRE8 Stack is required to be 8-byte aligned
s/\sREQUIRE8/.eabi_attribute Tag_ABI_align_needed, 1/g;
s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
# PRESERVE8 Stack 8-byte align is preserved
s/\sPRESERVE8/.eabi_attribute Tag_ABI_align_preserved, 1/g;
s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
# Use PROC and ENDP to give the symbols a .size directive.
# This makes them show up properly in debugging tools like gdb and valgrind.


@@ -30,6 +30,8 @@ my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8
my @incoming_array;
my @imported_functions;
# Perl trim function to remove whitespace from the start and end of the string
sub trim($)
{
@@ -132,7 +134,18 @@ while (<STDIN>)
# Make function visible to linker, and make additional symbol with
# prepended underscore
s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
s/IMPORT\s+\|([\$\w]*)\|/.globl $1/;
# Prepend imported functions with _
if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/)
{
$function = trim($1);
push(@imported_functions, $function);
}
foreach $function (@imported_functions)
{
s/$function/_$function/;
}
# No vertical bars required; make additional symbol with prepended
# underscore
@@ -157,8 +170,8 @@ while (<STDIN>)
s/\sPRESERVE8/@ PRESERVE8/g;
# Strip PROC and ENDPROC
s/PROC/@/g;
s/ENDP/@/g;
s/\bPROC\b/@/g;
s/\bENDP\b/@/g;
# EQU directive
s/(.*)EQU(.*)/.set $1, $2/;


@@ -477,7 +477,11 @@ process_common_cmdline() {
--libdir=*)
libdir="${optval}"
;;
--libc|--as|--prefix|--libdir)
--sdk-path=*)
[ -d "${optval}" ] || die "Not a directory: ${optval}"
sdk_path="${optval}"
;;
--libc|--as|--prefix|--libdir|--sdk-path)
die "Option ${opt} requires argument"
;;
--help|-h) show_help
@@ -561,6 +565,10 @@ process_common_toolchain() {
tgt_isa=x86_64
tgt_os=darwin10
;;
*darwin11*)
tgt_isa=x86_64
tgt_os=darwin11
;;
*mingw32*|*cygwin*)
[ -z "$tgt_isa" ] && tgt_isa=x86
tgt_os=win32
@@ -599,8 +607,8 @@ process_common_toolchain() {
# Enable the architecture family
case ${tgt_isa} in
arm*|iwmmxt*) enable arm;;
mips*) enable mips;;
arm*) enable arm;;
mips*) enable mips;;
esac
# PIC is probably what we want when building shared libs
@@ -617,6 +625,9 @@ process_common_toolchain() {
if [ -d "/Developer/SDKs/MacOSX10.6.sdk" ]; then
osx_sdk_dir="/Developer/SDKs/MacOSX10.6.sdk"
fi
if [ -d "/Developer/SDKs/MacOSX10.7.sdk" ]; then
osx_sdk_dir="/Developer/SDKs/MacOSX10.7.sdk"
fi
case ${toolchain} in
*-darwin8-*)
@@ -637,6 +648,12 @@ process_common_toolchain() {
add_ldflags "-isysroot ${osx_sdk_dir}"
add_ldflags "-mmacosx-version-min=10.6"
;;
*-darwin11-*)
add_cflags "-isysroot ${osx_sdk_dir}"
add_cflags "-mmacosx-version-min=10.7"
add_ldflags "-isysroot ${osx_sdk_dir}"
add_ldflags "-mmacosx-version-min=10.7"
;;
esac
# Handle Solaris variants. Solaris 10 needs -lposix4
@@ -652,37 +669,25 @@ process_common_toolchain() {
# Process ARM architecture variants
case ${toolchain} in
arm*|iwmmxt*)
# on arm, isa versions are supersets
enabled armv7a && soft_enable armv7 ### DEBUG
enabled armv7 && soft_enable armv6
enabled armv7 || enabled armv6 && soft_enable armv5te
enabled armv7 || enabled armv6 && soft_enable fast_unaligned
enabled iwmmxt2 && soft_enable iwmmxt
enabled iwmmxt && soft_enable armv5te
arm*)
# on arm, isa versions are supersets
enabled armv7a && soft_enable armv7 ### DEBUG
enabled armv7 && soft_enable armv6
enabled armv7 || enabled armv6 && soft_enable armv5te
enabled armv7 || enabled armv6 && soft_enable fast_unaligned
asm_conversion_cmd="cat"
asm_conversion_cmd="cat"
case ${tgt_cc} in
gcc)
if enabled iwmmxt || enabled iwmmxt2
then
CROSS=${CROSS:-arm-iwmmxt-linux-gnueabi-}
elif enabled symbian; then
CROSS=${CROSS:-arm-none-symbianelf-}
else
CROSS=${CROSS:-arm-none-linux-gnueabi-}
fi
CROSS=${CROSS:-arm-none-linux-gnueabi-}
link_with_cc=gcc
setup_gnu_toolchain
arch_int=${tgt_isa##armv}
arch_int=${arch_int%%te}
check_add_asflags --defsym ARCHITECTURE=${arch_int}
tune_cflags="-mtune="
if enabled iwmmxt || enabled iwmmxt2
then
check_add_asflags -mcpu=${tgt_isa}
elif enabled armv7
if enabled armv7
then
check_add_cflags -march=armv7-a -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp #-ftree-vectorize
check_add_asflags -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp #-march=armv7-a
@@ -727,12 +732,49 @@ process_common_toolchain() {
disable multithread
disable os_support
;;
android*)
SDK_PATH=${sdk_path}
COMPILER_LOCATION=`find "${SDK_PATH}" \
-name "arm-linux-androideabi-gcc*" -print -quit`
TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
CC=${TOOLCHAIN_PATH}gcc
AR=${TOOLCHAIN_PATH}ar
LD=${TOOLCHAIN_PATH}gcc
AS=${TOOLCHAIN_PATH}as
STRIP=${TOOLCHAIN_PATH}strip
NM=${TOOLCHAIN_PATH}nm
if [ -z "${alt_libc}" ]; then
alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
awk '{n = split($0,a,"/"); \
split(a[n-1],b,"-"); \
print $0 " " b[2]}' | \
sort -g -k 2 | \
awk '{ print $1 }' | tail -1`
fi
add_cflags "--sysroot=${alt_libc}"
add_ldflags "--sysroot=${alt_libc}"
enable pic
soft_enable realtime_only
if enabled armv7
then
enable runtime_cpu_detect
fi
;;
darwin*)
SDK_PATH=/Developer/Platforms/iPhoneOS.platform/Developer
if [ -z "${sdk_path}" ]; then
SDK_PATH=/Developer/Platforms/iPhoneOS.platform/Developer
else
SDK_PATH=${sdk_path}
fi
TOOLCHAIN_PATH=${SDK_PATH}/usr/bin
CC=${TOOLCHAIN_PATH}/gcc
AR=${TOOLCHAIN_PATH}/ar
LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-gcc-4.2.1
LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-llvm-gcc-4.2
AS=${TOOLCHAIN_PATH}/as
STRIP=${TOOLCHAIN_PATH}/strip
NM=${TOOLCHAIN_PATH}/nm
@@ -746,13 +788,14 @@ process_common_toolchain() {
add_cflags -arch ${tgt_isa}
add_ldflags -arch_only ${tgt_isa}
add_cflags "-isysroot ${SDK_PATH}/SDKs/iPhoneOS4.3.sdk"
if [ -z "${alt_libc}" ]; then
alt_libc=${SDK_PATH}/SDKs/iPhoneOS5.0.sdk
fi
# This should be overridable
alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.3.sdk
add_cflags "-isysroot ${alt_libc}"
# Add the paths for the alternate libc
for d in usr/include usr/include/gcc/darwin/4.2/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do
for d in usr/include; do
try_dir="${alt_libc}/${d}"
[ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
done
@@ -789,19 +832,6 @@ process_common_toolchain() {
fi
;;
symbian*)
enable symbian
# Add the paths for the alternate libc
for d in include/libc; do
try_dir="${alt_libc}/${d}"
[ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
done
for d in release/armv5/urel; do
try_dir="${alt_libc}/${d}"
[ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
done
add_cflags -DIMPORT_C=
esac
;;
mips*)
@@ -988,6 +1018,7 @@ EOF
if enabled multithread; then
case ${toolchain} in
*-win*);;
*-android-gcc);;
*) check_header pthread.h && add_extralibs -lpthread
esac
fi

configure

@@ -25,6 +25,7 @@ Advanced options:
${toggle_unit_tests} build unit tests
--libc=PATH path to alternate libc
--as={yasm|nasm|auto} use specified assembler [auto, yasm preferred]
--sdk-path=PATH path to root of sdk (iOS, android builds only)
${toggle_fast_unaligned} don't use unaligned accesses, even when
supported by hardware [auto]
${toggle_codec_srcs} in/exclude codec library source code
@@ -35,7 +36,7 @@ Advanced options:
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
${toggle_mem_tracker} track memory usage
${toggle_postproc} postprocessing
${toggle_multithread} multithreaded encoding and decoding.
${toggle_multithread} multithreaded encoding and decoding
${toggle_spatial_resampling} spatial sampling (scaling) support
${toggle_realtime_only} enable this option while building for real-time encoding
${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses
@@ -44,6 +45,7 @@ Advanced options:
${toggle_static} static library support
${toggle_small} favor smaller size over speed
${toggle_postproc_visualizer} macro block / block level visualizers
${toggle_multi_res_encoding} enable multiple-resolution encoding
Codecs:
Codecs can be selectively enabled or disabled individually, or by family:
@@ -79,19 +81,15 @@ EOF
# all_platforms is a list of all supported target platforms. Maintain
# alphabetically by architecture, generic-gnu last.
all_platforms="${all_platforms} armv5te-android-gcc"
all_platforms="${all_platforms} armv5te-linux-rvct"
all_platforms="${all_platforms} armv5te-linux-gcc"
all_platforms="${all_platforms} armv5te-none-rvct"
all_platforms="${all_platforms} armv5te-symbian-gcc"
all_platforms="${all_platforms} armv6-darwin-gcc"
all_platforms="${all_platforms} armv6-linux-rvct"
all_platforms="${all_platforms} armv6-linux-gcc"
all_platforms="${all_platforms} armv6-none-rvct"
all_platforms="${all_platforms} armv6-symbian-gcc"
all_platforms="${all_platforms} iwmmxt-linux-rvct"
all_platforms="${all_platforms} iwmmxt-linux-gcc"
all_platforms="${all_platforms} iwmmxt2-linux-rvct"
all_platforms="${all_platforms} iwmmxt2-linux-gcc"
all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8
all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8
@@ -118,6 +116,7 @@ all_platforms="${all_platforms} x86-win32-vs8"
all_platforms="${all_platforms} x86-win32-vs9"
all_platforms="${all_platforms} x86_64-darwin9-gcc"
all_platforms="${all_platforms} x86_64-darwin10-gcc"
all_platforms="${all_platforms} x86_64-darwin11-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc"
all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -196,8 +195,6 @@ ARCH_EXT_LIST="
armv5te
armv6
armv7
iwmmxt
iwmmxt2
mips32
@@ -262,6 +259,7 @@ CONFIG_LIST="
postproc_visualizer
os_support
unit_tests
multi_res_encoding
"
CMDLINE_SELECT="
extra_warnings
@@ -304,6 +302,7 @@ CMDLINE_SELECT="
small
postproc_visualizer
unit_tests
multi_res_encoding
"
process_cmdline() {


@@ -96,6 +96,17 @@ GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c
vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame
# C file is provided, not generated automatically.
GEN_EXAMPLES-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c
vp8_multi_resolution_encoder.SRCS \
+= third_party/libyuv/include/libyuv/basic_types.h \
third_party/libyuv/include/libyuv/cpu_id.h \
third_party/libyuv/include/libyuv/scale.h \
third_party/libyuv/source/row.h \
third_party/libyuv/source/scale.c \
third_party/libyuv/source/cpu_id.c
vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de
vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
# Handle extra library flags depending on codec configuration


@@ -58,7 +58,7 @@ if(frame_cnt%30 == 1) {
if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
die_codec(&codec, "Failed to turn off postproc");
} else if(frame_cnt%30 == 16) {
vp8_postproc_cfg_t pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK, 4, 0};
vp8_postproc_cfg_t pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4, 0};
if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
die_codec(&codec, "Failed to turn on postproc");

third_party/libyuv/README.webm

@@ -0,0 +1,17 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 102
License: BSD
License File: LICENSE
Description:
libyuv is an open source project that includes YUV conversion and scaling
functionality.
The optimized scaler in libyuv is used in multiple resolution encoder example,
which down-samples the original input video (f.g. 1280x720) a number of times
in order to encode multiple resolution bit streams.
Local Modifications:
Modified the original scaler code from C++ to C to fit in our current build
system. This is a temporal solution, and will be improved later.


@@ -0,0 +1,73 @@
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
#define INCLUDE_LIBYUV_BASIC_TYPES_H_
#include <stddef.h> // for NULL, size_t
#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
#include <stdint.h> // for uintptr_t
#endif
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
#ifdef COMPILER_MSVC
typedef unsigned __int64 uint64;
typedef __int64 int64;
#ifndef INT64_C
#define INT64_C(x) x ## I64
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UI64
#endif
#define INT64_F "I64"
#else // COMPILER_MSVC
#ifdef __LP64__
typedef unsigned long uint64;
typedef long int64;
#ifndef INT64_C
#define INT64_C(x) x ## L
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UL
#endif
#define INT64_F "l"
#else // __LP64__
typedef unsigned long long uint64;
typedef long long int64;
#ifndef INT64_C
#define INT64_C(x) x ## LL
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## ULL
#endif
#define INT64_F "ll"
#endif // __LP64__
#endif // COMPILER_MSVC
typedef unsigned int uint32;
typedef int int32;
typedef unsigned short uint16;
typedef short int16;
typedef unsigned char uint8;
typedef char int8;
#endif // INT_TYPES_DEFINED
// Detect compiler is for x86 or x64.
#if defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86)
#define CPU_X86 1
#endif
#define ALIGNP(p, t) \
((uint8*)((((uintptr_t)(p) + \
((t)-1)) & ~((t)-1))))
#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_


@@ -0,0 +1,49 @@
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_CPU_ID_H_
#define INCLUDE_LIBYUV_CPU_ID_H_
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// These flags are only valid on x86 processors
static const int kCpuHasSSE2 = 1;
static const int kCpuHasSSSE3 = 2;
// These flags are only valid on ARM processors
static const int kCpuHasNEON = 4;
// Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 8;
// Detect CPU has SSE2 etc.
// test_flag parameter should be one of kCpuHas constants above
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
extern int cpu_info_;
extern int InitCpuFlags();
return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
}
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// -1 to enable all cpu specific optimizations.
// 0 to disable all cpu specific optimizations.
void MaskCpuFlags(int enable_flags);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_CPU_ID_H_


@@ -0,0 +1,70 @@
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_SCALE_H_
#define INCLUDE_LIBYUV_SCALE_H_
#include "third_party/libyuv/include/libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Supported filtering
typedef enum {
kFilterNone = 0, // Point sample; Fastest
kFilterBilinear = 1, // Faster than box, but lower quality scaling down.
kFilterBox = 2 // Highest quality
}FilterMode;
// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
// used. This produces basic (blocky) quality at the fastest speed.
// If filtering is kFilterBilinear, interpolation is used to produce a better
// quality image, at the expense of speed.
// If filtering is kFilterBox, averaging is used to produce ever better
// quality image, at further expense of speed.
// Returns 0 if successful.
int I420Scale(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
int src_width, int src_height,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int dst_width, int dst_height,
FilterMode filtering);
// Legacy API. Deprecated
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
uint8* dst_y, uint8* dst_u, uint8* dst_v,
int dst_stride_y, int dst_stride_u, int dst_stride_v,
int dst_width, int dst_height,
int interpolate);
// Legacy API. Deprecated
int ScaleOffset(const uint8* src, int src_width, int src_height,
uint8* dst, int dst_width, int dst_height, int dst_yoffset,
int interpolate);
// For testing, allow disabling of optimizations.
void SetUseReferenceImpl(int use);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_SCALE_H_

third_party/libyuv/source/cpu_id.c

@@ -0,0 +1,81 @@
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/libyuv/include/libyuv/cpu_id.h"
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifdef __ANDROID__
#include <cpu-features.h>
#endif
#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type)
);
}
#elif defined(__i386__) || defined(__x86_64__)
static inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
"cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type)
);
}
#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// CPU detect function for SIMD instruction sets.
int cpu_info_ = 0;
int InitCpuFlags() {
#ifdef CPU_X86
int cpu_info[4];
__cpuid(cpu_info, 1);
cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
(cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
kCpuInitialized;
#elif defined(__ANDROID__) && defined(__ARM_NEON__)
uint64_t features = android_getCpuFeatures();
cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
kCpuInitialized;
#elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
// to disable Neon on devices that do not have it.
cpu_info_ = kCpuHasNEON | kCpuInitialized;
#else
cpu_info_ = kCpuInitialized;
#endif
return cpu_info_;
}
void MaskCpuFlags(int enable_flags) {
InitCpuFlags();
cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
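
The two magic masks in InitCpuFlags() correspond to CPUID leaf 1 feature bits: EDX bit 26 signals SSE2 and ECX bit 9 signals SSSE3. A trivial, purely illustrative check of that arithmetic:

#include <assert.h>

int main(void) {
  assert((1 << 26) == 0x04000000);  /* CPUID.1:EDX bit 26 -> kCpuHasSSE2  */
  assert((1 << 9)  == 0x00000200);  /* CPUID.1:ECX bit 9  -> kCpuHasSSSE3 */
  return 0;
}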

third_party/libyuv/source/row.h (vendored, 264 lines)
View File

@@ -0,0 +1,264 @@
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_SOURCE_ROW_H_
#define LIBYUV_SOURCE_ROW_H_
#include "third_party/libyuv/include/libyuv/basic_types.h"
#define kMaxStride (2048 * 4)
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
#define YUV_DISABLE_ASM
#endif
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_NEON
void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#define HAS_FASTCONVERTYUVTOBGRAROW_NEON
void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#define HAS_FASTCONVERTYUVTOABGRROW_NEON
void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#endif
// The following are available on all x86 platforms
#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_BGRATOARGBROW_SSSE3
#define HAS_BG24TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOUVROW_SSSE3
#define HAS_RAWTOUVROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#define HAS_FASTCONVERTYTOARGBROW_SSE2
#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
#define HAS_REVERSE_ROW_SSSE3
#endif
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_REVERSE_ROW_NEON
#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
#define HASRGB24TOYROW_SSSE3
#endif
#ifdef HASRGB24TOYROW_SSSE3
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
#ifdef HAS_REVERSE_ROW_SSSE3
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
#endif
#ifdef HAS_REVERSE_ROW_NEON
void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
#endif
void ReverseRow_C(const uint8* src, uint8* dst, int width);
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#ifdef HAS_BG24TOARGBROW_SSSE3
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
#endif
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
#endif
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
typedef __declspec(align(16)) signed char vec8[16];
typedef __declspec(align(16)) unsigned char uvec8[16];
typedef __declspec(align(16)) signed short vec16[8];
#else // __GNUC__
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
typedef signed char __attribute__((vector_size(16))) vec8;
typedef unsigned char __attribute__((vector_size(16))) uvec8;
typedef signed short __attribute__((vector_size(16))) vec16;
#endif
//extern "C"
SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
//extern "C"
SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
//extern "C"
SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
void FastConvertYUVToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,
int width);
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // LIBYUV_SOURCE_ROW_H_
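
A small sketch of how the helpers in this header are typically combined: SIMD_ALIGNED declares a 16-byte-aligned buffer, and IS_ALIGNED gates the SIMD row functions on runtime pointer/width alignment. The dispatcher itself is a hypothetical pattern, not code from row.h, and the 16-byte requirement is an assumption:

#include "third_party/libyuv/source/row.h"

/* Hypothetical dispatcher: use the SSSE3 row reverser only when the
 * pointers and width look 16-byte aligned, otherwise fall back to C. */
void ReverseYRow(const uint8* src, uint8* dst, int width) {
  SIMD_ALIGNED(uint8 scratch[kMaxStride]);   /* 16-byte aligned scratch row */
  (void)scratch;                             /* present only to show the macro */
#if defined(HAS_REVERSE_ROW_SSSE3)
  if (IS_ALIGNED(src, 16) && IS_ALIGNED(dst, 16) && IS_ALIGNED(width, 16)) {
    ReverseRow_SSSE3(src, dst, width);
    return;
  }
#endif
  ReverseRow_C(src, dst, width);
}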

third_party/libyuv/source/scale.c (vendored, 3884 lines)

File diff suppressed because it is too large.

View File

@@ -82,6 +82,7 @@
The available initialization methods are:
\if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif
\if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif
\if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif
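
For the decoder case, initialization typically looks like the following minimal sketch against the public libvpx decoder API; error handling is reduced to a return code:

#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

int init_vp8_decoder(vpx_codec_ctx_t *codec) {
  /* vpx_codec_dec_init() is a wrapper that forwards to
   * vpx_codec_dec_init_ver() with the ABI version baked in at compile time. */
  vpx_codec_err_t res = vpx_codec_dec_init(codec, vpx_codec_vp8_dx(), NULL, 0);
  return res == VPX_CODEC_OK ? 0 : -1;
}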

View File

@@ -1,6 +1,6 @@
/*! \page usage_encode Encode
The vpx_codec_encode() function is at the core of the decode loop. It
The vpx_codec_encode() function is at the core of the encode loop. It
processes raw images passed by the application, producing packets of
compressed data. The <code>deadline</code> parameter controls the amount
of time in microseconds the encoder should spend working on the frame. For
@@ -10,5 +10,4 @@
\ref samples
*/
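
A hedged sketch of a single call, showing where the deadline argument goes; pts and duration are in timebase units, frame flags are left at zero, and the codec context is assumed to have been initialized already:

#include "vpx/vpx_encoder.h"

/* Encode one raw frame; `codec` must already be set up with
 * vpx_codec_enc_init() and `img` filled with the raw picture. */
vpx_codec_err_t encode_one_frame(vpx_codec_ctx_t *codec,
                                 const vpx_image_t *img,
                                 vpx_codec_pts_t pts) {
  return vpx_codec_encode(codec, img, pts, 1 /* duration */,
                          0 /* flags */, VPX_DL_GOOD_QUALITY);
}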

View File

@@ -43,6 +43,8 @@ void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
if (oci->post_proc_buffer_int_used)
vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
vpx_free(oci->above_context);
vpx_free(oci->mip);
@@ -101,6 +103,8 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
return 1;
}
oci->post_proc_buffer_int_used = 0;
oci->mb_rows = height >> 4;
oci->mb_cols = width >> 4;
oci->MBs = oci->mb_rows * oci->mb_cols;

View File

@@ -11,7 +11,6 @@
#include "vpx_config.h"
#include "vpx_ports/arm.h"
#include "vp8/common/g_common.h"
#include "vp8/common/pragmas.h"
#include "vp8/common/subpixel.h"
#include "vp8/common/loopfilter.h"
@@ -63,6 +62,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6;
rtcd->recon.intra4x4_predict = vp8_intra4x4_predict_armv6;
rtcd->dequant.block = vp8_dequantize_b_v6;
rtcd->dequant.idct_add = vp8_dequant_idct_add_v6;
rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
}
#endif
@@ -97,6 +102,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
vp8_build_intra_predictors_mby_neon;
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s_neon;
rtcd->dequant.block = vp8_dequantize_b_neon;
rtcd->dequant.idct_add = vp8_dequant_idct_add_neon;
rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
}
#endif

View File

@@ -10,7 +10,7 @@
#include "vpx_config.h"
#include "vp8/common/idct.h"
#include "vp8/decoder/dequantize.h"
#include "vp8/common/dequantize.h"
void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,

View File

@@ -10,9 +10,8 @@
#include "vpx_config.h"
#include "vp8/decoder/dequantize.h"
#include "vp8/common/dequantize.h"
#include "vp8/common/idct.h"
#include "vpx_mem/vpx_mem.h"
#if HAVE_ARMV7
extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
@@ -24,22 +23,20 @@ extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
#if HAVE_ARMV7
void vp8_dequantize_b_neon(BLOCKD *d)
void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
{
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
short *DQC = d->dequant;
vp8_dequantize_b_loop_neon(Q, DQC, DQ);
}
#endif
#if HAVE_ARMV6
void vp8_dequantize_b_v6(BLOCKD *d)
void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
{
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
short *DQC = d->dequant;
vp8_dequantize_b_loop_v6(Q, DQC, DQ);
}

View File

@@ -22,13 +22,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_v6
#undef vp8_dequant_idct_add
#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
#undef vp8_dequant_idct_add_y_block
#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
#undef vp8_dequant_idct_add_uv_block
#undef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
#endif
#endif
@@ -44,13 +44,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_neon
#undef vp8_dequant_idct_add
#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
#undef vp8_dequant_idct_add_y_block
#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
#undef vp8_dequant_idct_add_uv_block
#undef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
#endif

View File

@@ -10,7 +10,7 @@
#include "vpx_config.h"
#include "vp8/common/idct.h"
#include "vp8/decoder/dequantize.h"
#include "vp8/common/dequantize.h"
/* place these declarations here because we don't want to maintain them
* outside of this scope

View File

@@ -1,32 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef _bigend_h
#define _bigend_h
#if defined(__cplusplus)
extern "C" {
#endif
#define invert2(x) ( (((x)>>8)&0x00ff) | (((x)<<8)&0xff00) )
#define invert4(x) ( ((invert2(x)&0x0000ffff)<<16) | (invert2((x>>16))&0x0000ffff) )
#define high_byte(x) (unsigned char)x
#define mid2Byte(x) (unsigned char)(x >> 8)
#define mid1Byte(x) (unsigned char)(x >> 16)
#define low_byte(x) (unsigned char)(x >> 24)
#define SWAPENDS 1
#if defined(__cplusplus)
}
#endif
#endif

View File

@@ -21,9 +21,6 @@ void vpx_log(const char *format, ...);
#include "subpixel.h"
#include "vpx_ports/mem.h"
#define TRUE 1
#define FALSE 0
/*#define DCPRED 1*/
#define DCPREDSIMTHRESH 0
#define DCPREDCNTTHRESH 3
@@ -170,12 +167,23 @@ typedef struct
union b_mode_info bmi[16];
} MODE_INFO;
#if CONFIG_MULTI_RES_ENCODING
/* The information needed to be stored for higher-resolution encoder */
typedef struct
{
MB_PREDICTION_MODE mode;
MV_REFERENCE_FRAME ref_frame;
int_mv mv;
//union b_mode_info bmi[16];
int dissim; // dissimilarity level of the macroblock
} LOWER_RES_INFO;
#endif
typedef struct
{
short *qcoeff;
short *dqcoeff;
unsigned char *predictor;
short *diff;
short *dequant;
/* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
@@ -194,12 +202,16 @@ typedef struct
typedef struct MacroBlockD
{
DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
DECLARE_ALIGNED(16, unsigned char, predictor[384]);
DECLARE_ALIGNED(16, short, qcoeff[400]);
DECLARE_ALIGNED(16, short, dqcoeff[400]);
DECLARE_ALIGNED(16, char, eobs[25]);
DECLARE_ALIGNED(16, short, dequant_y1[16]);
DECLARE_ALIGNED(16, short, dequant_y1_dc[16]);
DECLARE_ALIGNED(16, short, dequant_y2[16]);
DECLARE_ALIGNED(16, short, dequant_uv[16]);
/* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
BLOCKD block[25];
int fullpixel_mask;

View File

@@ -18,8 +18,6 @@
#include "vpx_mem/vpx_mem.h"
#include "common_types.h"
/* Only need this for fixed-size arrays, for structs just assign. */
#define vp8_copy( Dest, Src) { \

View File

@@ -14,12 +14,11 @@
#include "vp8/common/idct.h"
#include "vpx_mem/vpx_mem.h"
void vp8_dequantize_b_c(BLOCKD *d)
void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
{
int i;
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
short *DQC = d->dequant;
for (i = 0; i < 16; i++)
{

View File

@@ -14,7 +14,7 @@
#include "vp8/common/blockd.h"
#define prototype_dequant_block(sym) \
void sym(BLOCKD *x)
void sym(BLOCKD *x, short *DQC)
#define prototype_dequant_idct_add(sym) \
void sym(short *input, short *dq, \

View File

@@ -1,125 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef _dma_desc_h
#define _dma_desc_h
#if defined(__cplusplus)
extern "C" {
#endif
#define NDSIZE_LG 0x00000900 // Next Descriptor Size
#define NDSIZE_SM 0x00000800 // Next Descriptor Size
#define NDSIZE_7 0x00000700 // Next Descriptor Size
#define NDSIZE_6 0x00000600 // Next Descriptor Size
#define NDSIZE_5 0x00000500 // Next Descriptor Size
#define NDSIZE_4 0x00000400 // Next Descriptor Size
#define NDSIZE_3 0x00000300 // Next Descriptor Size
#define NDSIZE_2 0x00000200 // Next Descriptor Size
#define NDSIZE_1 0x00000100 // Next Descriptor Size
#define FLOW_STOP 0x0000
#define FLOW_AUTO 0x1000
#define FLOW_DESC_AR 0x4000
#define FLOW_DESC_SM 0x6000
#define FLOW_DESC_LG 0x7000
typedef struct
{
unsigned int ndp;
//unsigned short ndpl;
//unsigned short ndph;
unsigned int sa;
//unsigned short sal;
//unsigned short sah;
unsigned short dmacfg;
unsigned short xcnt;
unsigned short xmod;
unsigned short ycnt;
unsigned short ymod;
} LARGE_DESC;
typedef struct
{
unsigned short ndpl;
unsigned short sal;
unsigned short sah;
unsigned short dmacfg;
unsigned short xcnt;
unsigned short xmod;
unsigned short ycnt;
unsigned short ymod;
} SMALL_DESC;
typedef struct
{
unsigned short sal;
unsigned short sah;
unsigned short dmacfg;
unsigned short xcnt;
unsigned short xmod;
unsigned short ycnt;
unsigned short ymod;
} ARRAY_DESC_7;
typedef struct
{
unsigned short sal;
unsigned short sah;
unsigned short dmacfg;
unsigned short xcnt;
unsigned short xmod;
unsigned short ycnt;
} ARRAY_DESC_6;
typedef struct
{
unsigned short sal;
unsigned short sah;
unsigned short dmacfg;
unsigned short xcnt;
unsigned short xmod;
} ARRAY_DESC_5;
typedef struct
{
unsigned short sal;
unsigned short sah;
unsigned short dmacfg;
unsigned short xcnt;
} ARRAY_DESC_4;
typedef struct
{
unsigned short sal;
unsigned short sah;
unsigned short dmacfg;
} ARRAY_DESC_3;
typedef struct
{
unsigned short sal;
unsigned short sah;
} ARRAY_DESC_2;
typedef struct
{
unsigned short sal;
} ARRAY_DESC_1;
#if defined(__cplusplus)
}
#endif
#endif //_dma_desc_h

View File

@@ -1,116 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef _duck_io_h
#define _duck_io_h
#if defined(__cplusplus)
extern "C" {
#endif
#if defined (_WIN32)
typedef __int64 int64_t;
#elif defined(__MWERKS__)
typedef long long int64_t;
#elif defined(__APPLE__) || defined(__POWERPC)
#include <ppc/types.h>
#else
typedef long long int64_t;
#endif
typedef struct
{
int64_t offset; // offset to start from
int blocking; // non-zero for blocking
} re_open_t;
typedef enum
{
SAL_ERR_MAX = -10,
SAL_ERROR = -11, // Default error
SAL_ERR_WSASTARTUP = -12,
SAL_ERR_SOCKET_CREATE = -13,
SAL_ERR_RESOLVING_HOSTNAME = -14,
SAL_ERR_SERVER_CONNECTION = -15,
SAL_ERR_SENDING_DATA = -16,
SAL_ERR_RECEIVING_DATA = -17,
SAL_ERR_404_FILE_NOT_FOUND = -18,
SAL_ERR_PARSING_HTTP_HEADER = -19,
SAL_ERR_PARSING_CONTENT_LEN = -20,
SAL_ERR_CONNECTION_TIMEOUT = -21,
SAL_ERR_FILE_OPEN_FAILED = -22,
SAL_ERR_MIN = -23
} SAL_ERR; /* EMH 1-15-03 */
typedef struct sal_err_map_temp
{
SAL_ERR code;
const char *decode;
} sal_err_map_t;
static char *sal_err_text(SAL_ERR e)
{
int t;
const sal_err_map_t g_sal_err_map[] =
{
{ SAL_ERR_WSASTARTUP, "Error with WSAStartup" },
{ SAL_ERR_SOCKET_CREATE, "Error creating socket" },
{ SAL_ERR_RESOLVING_HOSTNAME, "Error resolving hostname" },
{ SAL_ERR_SERVER_CONNECTION, "Error connecting to server" },
{ SAL_ERR_SENDING_DATA, "Error sending data" },
{ SAL_ERR_RECEIVING_DATA, "Error receiving data" },
{ SAL_ERR_404_FILE_NOT_FOUND, "Error file not found " },
{ SAL_ERR_PARSING_HTTP_HEADER, "Error parsing http header" },
{ SAL_ERR_PARSING_CONTENT_LEN, "Error parsing content length" },
{ SAL_ERR_CONNECTION_TIMEOUT, "Error Connection timed out" },
{ SAL_ERR_FILE_OPEN_FAILED, "Error opening file" }
};
for (t = 0; t < sizeof(g_sal_err_map) / sizeof(sal_err_map_t); t++)
{
if (e == g_sal_err_map[t].code)
return (char *) g_sal_err_map[t].decode;
}
return 0;
}
int duck_open(const char *fname, unsigned long user_data);
void duck_close(int ghndl);
int duck_read(int ghndl, unsigned char *buf, int nbytes);
int64_t duck_seek(int g_hndl, int64_t offs, int origin);
int duck_read_finished(int han, int flag); /* FWG 7-9-99 */
int duck_name(int handle, char name[], size_t max_len); /* EMH 9-23-03 */
int duck_read_blocking(int handle, unsigned char *buffer, int bytes); /* EMH 9-23-03 */
int64_t duck_available_data(int handle); /* EMH 10-23-03 */
#if defined(__cplusplus)
}
#endif
#endif

View File

@@ -134,13 +134,51 @@ void vp8_find_near_mvs
best_mv->as_int = near_mvs[0].as_int;
nearest->as_int = near_mvs[CNT_NEAREST].as_int;
nearby->as_int = near_mvs[CNT_NEAR].as_int;
//TODO: move clamp outside findnearmv
vp8_clamp_mv2(nearest, xd);
vp8_clamp_mv2(nearby, xd);
vp8_clamp_mv2(best_mv, xd);
}
static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd)
{
inv->as_mv.row = src->as_mv.row * -1;
inv->as_mv.col = src->as_mv.col * -1;
vp8_clamp_mv2(inv, xd);
vp8_clamp_mv2(src, xd);
}
int vp8_find_near_mvs_bias
(
MACROBLOCKD *xd,
const MODE_INFO *here,
int_mv mode_mv_sb[2][MB_MODE_COUNT],
int_mv best_mv_sb[2],
int cnt[4],
int refframe,
int *ref_frame_sign_bias
)
{
int sign_bias = ref_frame_sign_bias[refframe];
vp8_find_near_mvs(xd,
here,
&mode_mv_sb[sign_bias][NEARESTMV],
&mode_mv_sb[sign_bias][NEARMV],
&best_mv_sb[sign_bias],
cnt,
refframe,
ref_frame_sign_bias);
invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV],
&mode_mv_sb[sign_bias][NEARESTMV], xd);
invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV],
&mode_mv_sb[sign_bias][NEARMV], xd);
invert_and_clamp_mvs(&best_mv_sb[!sign_bias],
&best_mv_sb[sign_bias], xd);
return sign_bias;
}
vp8_prob *vp8_mv_ref_probs(
vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
)
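
The new helper returns the sign bias it applied, so a caller can index the pre-clamped mode_mv_sb / best_mv_sb arrays directly. A hypothetical wrapper sketching that use; the MACROBLOCK and VP8_COMMON member names are assumptions taken from the surrounding encoder code, not from this diff:

static void pick_clamped_near_mvs(MACROBLOCK *x, VP8_COMMON *cm, int ref_frame,
                                  int_mv *nearest_mv, int_mv *near_mv,
                                  int_mv *best_ref_mv, int cnt[4])
{
    int_mv mode_mv_sb[2][MB_MODE_COUNT];
    int_mv best_mv_sb[2];
    int sign_bias;

    sign_bias = vp8_find_near_mvs_bias(&x->e_mbd,
                                       x->e_mbd.mode_info_context,
                                       mode_mv_sb, best_mv_sb, cnt,
                                       ref_frame, cm->ref_frame_sign_bias);

    /* Every entry is already clamped for its own bias; read the set
     * selected by the returned sign bias. */
    *nearest_mv  = mode_mv_sb[sign_bias][NEARESTMV];
    *near_mv     = mode_mv_sb[sign_bias][NEARMV];
    *best_ref_mv = best_mv_sb[sign_bias];
}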

View File

@@ -60,10 +60,10 @@ static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
int mb_to_bottom_edge)
{
unsigned int need_to_clamp;
need_to_clamp = (mv->as_mv.col < mb_to_left_edge) ? 1 : 0;
need_to_clamp |= (mv->as_mv.col > mb_to_right_edge) ? 1 : 0;
need_to_clamp |= (mv->as_mv.row < mb_to_top_edge) ? 1 : 0;
need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge) ? 1 : 0;
need_to_clamp = (mv->as_mv.col < mb_to_left_edge);
need_to_clamp |= (mv->as_mv.col > mb_to_right_edge);
need_to_clamp |= (mv->as_mv.row < mb_to_top_edge);
need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge);
return need_to_clamp;
}
@@ -77,6 +77,19 @@ void vp8_find_near_mvs
int *ref_frame_sign_bias
);
int vp8_find_near_mvs_bias
(
MACROBLOCKD *xd,
const MODE_INFO *here,
int_mv mode_mv_sb[2][MB_MODE_COUNT],
int_mv best_mv_sb[2],
int cnt[4],
int refframe,
int *ref_frame_sign_bias
);
vp8_prob *vp8_mv_ref_probs(
vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
);

View File

@@ -1,21 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
extern void (*vp8_clear_system_state)(void);
extern void (*vp8_plane_add_noise)(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int DPitch, int q);
extern void (*de_interlace)
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int Width,
int Height,
int Stride
);

View File

@@ -10,7 +10,6 @@
#include "vpx_config.h"
#include "vp8/common/g_common.h"
#include "vp8/common/subpixel.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/recon.h"
@@ -70,6 +69,14 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
#if CONFIG_RUNTIME_CPU_DETECT
VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
rtcd->dequant.block = vp8_dequantize_b_c;
rtcd->dequant.idct_add = vp8_dequant_idct_add_c;
rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c;
rtcd->dequant.idct_add_uv_block =
vp8_dequant_idct_add_uv_block_c;
rtcd->idct.idct16 = vp8_short_idct4x4llm_c;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c;

View File

@@ -1,56 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "invtrans.h"
void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
int pitch)
{
if (*b->eob > 1)
{
IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch,
*(b->base_dst) + b->dst, b->dst_stride);
}
else
{
IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch,
*(b->base_dst) + b->dst, b->dst_stride);
}
}
void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
int i;
if(x->mode_info_context->mbmi.mode != SPLITMV)
{
/* do 2nd order transform on the dc block */
IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->dqcoeff);
}
for (i = 0; i < 16; i++)
{
vp8_inverse_transform_b(rtcd, &x->block[i], 16);
}
}
void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
int i;
for (i = 16; i < 24; i++)
{
vp8_inverse_transform_b(rtcd, &x->block[i], 8);
}
}

View File

@@ -15,9 +15,49 @@
#include "vpx_config.h"
#include "idct.h"
#include "blockd.h"
extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch);
extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
extern void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
#include "onyxc_int.h"
#if CONFIG_MULTITHREAD
#include "vpx_mem/vpx_mem.h"
#endif
static void eob_adjust(char *eobs, short *diff)
{
/* eob adjust.... the idct can only skip if both the dc and eob are zero */
int js;
for(js = 0; js < 16; js++)
{
if((eobs[js] == 0) && (diff[0] != 0))
eobs[js]++;
diff+=16;
}
}
static void vp8_inverse_transform_mby(MACROBLOCKD *xd,
const VP8_COMMON_RTCD *rtcd)
{
short *DQC = xd->dequant_y1;
if (xd->mode_info_context->mbmi.mode != SPLITMV)
{
/* do 2nd order transform on the dc block */
if (xd->eobs[24] > 1)
{
IDCT_INVOKE(&rtcd->idct, iwalsh16)
(&xd->block[24].dqcoeff[0], xd->qcoeff);
}
else
{
IDCT_INVOKE(&rtcd->idct, iwalsh1)
(&xd->block[24].dqcoeff[0], xd->qcoeff);
}
eob_adjust(xd->eobs, xd->qcoeff);
DQC = xd->dequant_y1_dc;
}
DEQUANT_INVOKE (&rtcd->dequant, idct_add_y_block)
(xd->qcoeff, DQC,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
#endif
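
To make the eob adjustment concrete: the second-order inverse walsh deposits each Y block's DC term into qcoeff at 16-coefficient strides, so a block whose eob was recorded as zero would otherwise be skipped by the idct-add stage even though it now carries a DC value. A tiny hypothetical driver (eob_adjust is static here, so this is illustration only):

static void eob_adjust_example(void)
{
    char  eobs[16] = {0};
    short qcoeff[16 * 16] = {0};

    qcoeff[3 * 16] = 5;       /* DC of Y block 3, as written by the iwalsh */
    eob_adjust(eobs, qcoeff);
    /* eobs[3] is now 1, so idct_add_y_block reconstructs block 3's DC
     * instead of skipping the block. */
}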

View File

@@ -1,33 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef _littlend_h
#define _littlend_h
#if defined(__cplusplus)
extern "C" {
#endif
#define invert2(x) (x)
#define invert4(x) (x)
#define low_byte(x) (unsigned char)x
#define mid1Byte(x) (unsigned char)(x >> 8)
#define mid2Byte(x) (unsigned char)(x >> 16)
#define high_byte(x) (unsigned char)(x >> 24)
#define SWAPENDS 0
#if defined(__cplusplus)
}
#endif
#endif

View File

@@ -87,7 +87,6 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
{
for (c = 0; c < 4; c++)
{
x->block[r*4+c].diff = &x->diff[r * 4 * 16 + c * 4];
x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
}
}
@@ -96,7 +95,6 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
{
for (c = 0; c < 2; c++)
{
x->block[16+r*2+c].diff = &x->diff[256 + r * 4 * 8 + c * 4];
x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
}
@@ -106,14 +104,11 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
{
for (c = 0; c < 2; c++)
{
x->block[20+r*2+c].diff = &x->diff[320+ r * 4 * 8 + c * 4];
x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
}
}
x->block[24].diff = &x->diff[384];
for (r = 0; r < 25; r++)
{
x->block[r].qcoeff = x->qcoeff + r * 16;

View File

@@ -17,13 +17,14 @@ extern "C"
{
#endif
#include "vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
#include "vpx_scale/yv12config.h"
#include "type_aliases.h"
#include "ppflags.h"
typedef int *VP8_PTR;
struct VP8_COMP;
/* Create/destroy static data structures. */
@@ -146,10 +147,14 @@ extern "C"
int over_shoot_pct;
// buffering parameters
int64_t starting_buffer_level; // in seconds
int64_t starting_buffer_level; // in bytes
int64_t optimal_buffer_level;
int64_t maximum_buffer_size;
int64_t starting_buffer_level_in_ms; // in milli-seconds
int64_t optimal_buffer_level_in_ms;
int64_t maximum_buffer_size_in_ms;
// controlling quality
int fixed_q;
int worst_allowed_q;
@@ -207,32 +212,45 @@ extern "C"
unsigned int periodicity;
unsigned int layer_id[MAX_PERIODICITY];
#if CONFIG_MULTI_RES_ENCODING
/* Number of total resolutions encoded */
unsigned int mr_total_resolutions;
/* Current encoder ID */
unsigned int mr_encoder_id;
/* Down-sampling factor */
vpx_rational_t mr_down_sampling_factor;
/* Memory location to store low-resolution encoder's mode info */
void* mr_low_res_mode_info;
#endif
} VP8_CONFIG;
void vp8_initialize();
VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf);
void vp8_remove_compressor(VP8_PTR *comp);
struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf);
void vp8_remove_compressor(struct VP8_COMP* *comp);
void vp8_init_config(VP8_PTR onyx, VP8_CONFIG *oxcf);
void vp8_change_config(VP8_PTR onyx, VP8_CONFIG *oxcf);
void vp8_init_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
void vp8_change_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
// receive a frames worth of data caller can assume that a copy of this frame is made
// and not just a copy of the pointer..
int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush);
int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
int vp8_receive_raw_frame(struct VP8_COMP* comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
int vp8_get_compressed_data(struct VP8_COMP* comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush);
int vp8_get_preview_raw_frame(struct VP8_COMP* comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);
int vp8_update_reference(VP8_PTR comp, int ref_frame_flags);
int vp8_get_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
int vp8_set_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
int vp8_update_entropy(VP8_PTR comp, int update);
int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols);
int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
int vp8_get_quantizer(VP8_PTR c);
int vp8_use_as_reference(struct VP8_COMP* comp, int ref_frame_flags);
int vp8_update_reference(struct VP8_COMP* comp, int ref_frame_flags);
int vp8_get_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
int vp8_set_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
int vp8_update_entropy(struct VP8_COMP* comp, int update);
int vp8_set_roimap(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
int vp8_set_active_map(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols);
int vp8_set_internal_size(struct VP8_COMP* comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
int vp8_get_quantizer(struct VP8_COMP* c);
#ifdef __cplusplus
}

View File

@@ -22,6 +22,7 @@
#if CONFIG_POSTPROC
#include "postproc.h"
#endif
#include "dequantize.h"
/*#ifdef PACKET_TESTING*/
#include "header.h"
@@ -73,6 +74,7 @@ typedef enum
typedef struct VP8_COMMON_RTCD
{
#if CONFIG_RUNTIME_CPU_DETECT
vp8_dequant_rtcd_vtable_t dequant;
vp8_idct_rtcd_vtable_t idct;
vp8_recon_rtcd_vtable_t recon;
vp8_subpix_rtcd_vtable_t subpix;
@@ -91,9 +93,9 @@ typedef struct VP8Common
{
struct vpx_internal_error_info error;
DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]);
DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]);
DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]);
int Width;
int Height;
@@ -112,6 +114,8 @@ typedef struct VP8Common
YV12_BUFFER_CONFIG post_proc_buffer;
YV12_BUFFER_CONFIG temp_scale_frame;
YV12_BUFFER_CONFIG post_proc_buffer_int;
int post_proc_buffer_int_used;
FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
FRAME_TYPE frame_type;

View File

@@ -18,13 +18,13 @@
extern "C"
{
#endif
#include "type_aliases.h"
#include "vpx_scale/yv12config.h"
#include "ppflags.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_codec.h"
typedef void *VP8D_PTR;
struct VP8D_COMP;
typedef struct
{
int Width;
@@ -49,19 +49,19 @@ extern "C"
void vp8dx_initialize(void);
void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x);
void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x);
int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);
int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst);
int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, int64_t time_stamp);
int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
int vp8dx_receive_compressed_data(struct VP8D_COMP* comp, unsigned long size, const unsigned char *dest, int64_t time_stamp);
int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
vpx_codec_err_t vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
vpx_codec_err_t vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf);
struct VP8D_COMP* vp8dx_create_decompressor(VP8D_CONFIG *oxcf);
void vp8dx_remove_decompressor(VP8D_PTR comp);
void vp8dx_remove_decompressor(struct VP8D_COMP* comp);
#ifdef __cplusplus
}

View File

@@ -12,9 +12,12 @@
#include "vpx_config.h"
#include "vpx_scale/yv12config.h"
#include "postproc.h"
#include "common.h"
#include "recon.h"
#include "vpx_scale/yv12extend.h"
#include "vpx_scale/vpxscale.h"
#include "systemdependent.h"
#include "../encoder/variance.h"
#include <math.h>
#include <stdlib.h>
@@ -26,6 +29,7 @@
( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
/* global constants */
#define MFQE_PRECISION 4
#if CONFIG_POSTPROC_VISUALIZER
static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
{
@@ -121,7 +125,6 @@ const short vp8_rv[] =
0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};
extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
/***********************************************************************************************************
@@ -175,6 +178,12 @@ void vp8_post_proc_down_and_across_c
p_src = dst_ptr;
p_dst = dst_ptr;
for (i = -8; i<0; i++)
p_src[i]=p_src[0];
for (i = cols; i<cols+8; i++)
p_src[i]=p_src[cols-1];
for (i = 0; i < 8; i++)
d[i] = p_src[i];
@@ -225,13 +234,19 @@ void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int co
unsigned char *s = src;
unsigned char d[16];
s += 8; cols -= 16;
for (r = 0; r < rows; r++)
{
int sumsq = 0;
int sum = 0;
for (i = -8; i<0; i++)
s[i]=s[0];
// 17 avoids valgrind warning - we buffer values in c in d
// and only write them when we've read 8 ahead...
for (i = cols; i<cols+17; i++)
s[i]=s[cols-1];
for (i = -8; i <= 6; i++)
{
sumsq += s[i] * s[i];
@@ -270,7 +285,7 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
int r, c, i;
const short *rv3 = &vp8_rv[63&rand()];
for (c = 0; c < cols; c++)
for (c = 0; c < cols; c++ )
{
unsigned char *s = &dst[c];
int sumsq = 0;
@@ -278,7 +293,14 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
unsigned char d[16];
const short *rv2 = rv3 + ((c * 17) & 127);
s += 8 * pitch; rows -= 8;
for (i = -8; i < 0; i++)
s[i*pitch]=s[0];
// 17 avoids valgrind warning - we buffer values in c in d
// and only write them when we've read 8 ahead...
for (i = rows; i < rows+17; i++)
s[i*pitch]=s[(rows-1)*pitch];
for (i = -8; i <= 6; i++)
{
sumsq += s[i*pitch] * s[i*pitch];
@@ -317,7 +339,8 @@ static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
//POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
@@ -325,11 +348,11 @@ static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
}
void vp8_deblock(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
int flag,
vp8_postproc_rtcd_vtable_t *rtcd)
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
int flag,
vp8_postproc_rtcd_vtable_t *rtcd)
{
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
@@ -674,12 +697,219 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
}
static void multiframe_quality_enhance_block
(
int blksize, /* Currently only values supported are 16, 8, 4 */
int qcurr,
int qprev,
unsigned char *y,
unsigned char *u,
unsigned char *v,
int y_stride,
int uv_stride,
unsigned char *yd,
unsigned char *ud,
unsigned char *vd,
int yd_stride,
int uvd_stride
)
{
static const unsigned char VP8_ZEROS[16]=
{
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
int blksizeby2 = blksize >> 1;
int qdiff = qcurr - qprev;
int i, j;
unsigned char *yp;
unsigned char *ydp;
unsigned char *up;
unsigned char *udp;
unsigned char *vp;
unsigned char *vdp;
unsigned int act, sse, sad, thr;
if (blksize == 16)
{
act = (vp8_variance_var16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
sad = (vp8_variance_sad16x16(y, y_stride, yd, yd_stride, 0)+128)>>8;
}
else if (blksize == 8)
{
act = (vp8_variance_var8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
sad = (vp8_variance_sad8x8(y, y_stride, yd, yd_stride, 0)+32)>>6;
}
else
{
act = (vp8_variance_var4x4(yd, yd_stride, VP8_ZEROS, 0, &sse)+8)>>4;
sad = (vp8_variance_sad4x4(y, y_stride, yd, yd_stride, 0)+8)>>4;
}
/* thr = qdiff/8 + log2(act) + log4(qprev) */
thr = (qdiff>>3);
while (act>>=1) thr++;
while (qprev>>=2) thr++;
if (sad < thr)
{
static const int roundoff = (1 << (MFQE_PRECISION - 1));
int ifactor = (sad << MFQE_PRECISION) / thr;
ifactor >>= (qdiff >> 5);
// TODO: SIMD optimize this section
if (ifactor)
{
int icfactor = (1 << MFQE_PRECISION) - ifactor;
for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride)
{
for (j = 0; j < blksize; ++j)
ydp[j] = (int)((yp[j] * ifactor + ydp[j] * icfactor + roundoff) >> MFQE_PRECISION);
}
for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
{
for (j = 0; j < blksizeby2; ++j)
udp[j] = (int)((up[j] * ifactor + udp[j] * icfactor + roundoff) >> MFQE_PRECISION);
}
for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
{
for (j = 0; j < blksizeby2; ++j)
vdp[j] = (int)((vp[j] * ifactor + vdp[j] * icfactor + roundoff) >> MFQE_PRECISION);
}
}
}
else
{
if (blksize == 16)
{
vp8_recon_copy16x16(y, y_stride, yd, yd_stride);
vp8_recon_copy8x8(u, uv_stride, ud, uvd_stride);
vp8_recon_copy8x8(v, uv_stride, vd, uvd_stride);
}
else if (blksize == 8)
{
vp8_recon_copy8x8(y, y_stride, yd, yd_stride);
for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
vpx_memcpy(udp, up, blksizeby2);
for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
vpx_memcpy(vdp, vp, blksizeby2);
}
else
{
for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride)
vpx_memcpy(ydp, yp, blksize);
for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
vpx_memcpy(udp, up, blksizeby2);
for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
vpx_memcpy(vdp, vp, blksizeby2);
}
}
}
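
A worked pass through the threshold and mixing logic above, with illustrative numbers: for qcurr = 60 and qprev = 40, qdiff = 20, so thr starts at 20 >> 3 = 2; a normalized activity of 16 adds 4 (four halvings until zero) and qprev = 40 adds 2 (two shifts by 2), giving thr = 8. A normalized SAD of 4 then passes the sad < thr test: ifactor = (4 << 4) / 8 = 8 (the qdiff >> 5 shift is 0 here), icfactor = 16 - 8 = 8, i.e. an equal blend of the current and previous-quality pixels. A SAD of 8 or more would instead fall through to the copy path below.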
#if CONFIG_RUNTIME_CPU_DETECT
#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
#else
#define RTCD_VTABLE(oci) NULL
#endif
void vp8_multiframe_quality_enhance
(
VP8_COMMON *cm
)
{
YV12_BUFFER_CONFIG *show = cm->frame_to_show;
YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
FRAME_TYPE frame_type = cm->frame_type;
/* Point at base of Mb MODE_INFO list has motion vectors etc */
const MODE_INFO *mode_info_context = cm->mi;
int mb_row;
int mb_col;
int qcurr = cm->base_qindex;
int qprev = cm->postproc_state.last_base_qindex;
unsigned char *y_ptr, *u_ptr, *v_ptr;
unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
/* Set up the buffer pointers */
y_ptr = show->y_buffer;
u_ptr = show->u_buffer;
v_ptr = show->v_buffer;
yd_ptr = dest->y_buffer;
ud_ptr = dest->u_buffer;
vd_ptr = dest->v_buffer;
/* postprocess each macro block */
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
/* if motion is high there will likely be no benefit */
if (((frame_type == INTER_FRAME &&
abs(mode_info_context->mbmi.mv.as_mv.row) <= 10 &&
abs(mode_info_context->mbmi.mv.as_mv.col) <= 10) ||
(frame_type == KEY_FRAME)))
{
if (mode_info_context->mbmi.mode == B_PRED || mode_info_context->mbmi.mode == SPLITMV)
{
int i, j;
for (i=0; i<2; ++i)
for (j=0; j<2; ++j)
multiframe_quality_enhance_block(8,
qcurr,
qprev,
y_ptr + 8*(i*show->y_stride+j),
u_ptr + 4*(i*show->uv_stride+j),
v_ptr + 4*(i*show->uv_stride+j),
show->y_stride,
show->uv_stride,
yd_ptr + 8*(i*dest->y_stride+j),
ud_ptr + 4*(i*dest->uv_stride+j),
vd_ptr + 4*(i*dest->uv_stride+j),
dest->y_stride,
dest->uv_stride);
}
else
{
multiframe_quality_enhance_block(16,
qcurr,
qprev,
y_ptr,
u_ptr,
v_ptr,
show->y_stride,
show->uv_stride,
yd_ptr,
ud_ptr,
vd_ptr,
dest->y_stride,
dest->uv_stride);
}
}
else
{
vp8_recon_copy16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
vp8_recon_copy8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
vp8_recon_copy8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
}
y_ptr += 16;
u_ptr += 8;
v_ptr += 8;
yd_ptr += 16;
ud_ptr += 8;
vd_ptr += 8;
mode_info_context++; /* step to next MB */
}
y_ptr += show->y_stride * 16 - 16 * cm->mb_cols;
u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols;
ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
mode_info_context++; /* Skip border mb */
}
}
int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
{
int q = oci->filter_level * 10 / 6;
@@ -701,27 +931,69 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
dest->y_width = oci->Width;
dest->y_height = oci->Height;
dest->uv_height = dest->y_height / 2;
oci->postproc_state.last_base_qindex = oci->base_qindex;
return 0;
}
/* Allocate post_proc_buffer_int if needed */
if ((flags & VP8D_MFQE) && !oci->post_proc_buffer_int_used)
{
if ((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK))
{
if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int, oci->Width, oci->Height, VP8BORDERINPIXELS) >= 0)
{
oci->post_proc_buffer_int_used = 1;
}
// insure that postproc is set to all 0's so that post proc
// doesn't pull random data in from edge
vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,126,(&oci->post_proc_buffer)->frame_size);
}
}
#if ARCH_X86||ARCH_X86_64
vpx_reset_mmx_state();
#endif
if (flags & VP8D_DEMACROBLOCK)
if ((flags & VP8D_MFQE) &&
oci->current_video_frame >= 2 &&
oci->base_qindex - oci->postproc_state.last_base_qindex >= 10)
{
vp8_multiframe_quality_enhance(oci);
if (((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) &&
oci->post_proc_buffer_int_used)
{
vp8_yv12_copy_frame_ptr(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
if (flags & VP8D_DEMACROBLOCK)
{
vp8_deblock_and_de_macro_block(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0, RTCD_VTABLE(oci));
}
else if (flags & VP8D_DEBLOCK)
{
vp8_deblock(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
q, 1, 0, RTCD_VTABLE(oci));
}
}
/* Move partially towards the base q of the previous frame */
oci->postproc_state.last_base_qindex = (3*oci->postproc_state.last_base_qindex + oci->base_qindex)>>2;
}
else if (flags & VP8D_DEMACROBLOCK)
{
vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0, RTCD_VTABLE(oci));
oci->postproc_state.last_base_qindex = oci->base_qindex;
}
else if (flags & VP8D_DEBLOCK)
{
vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
q, 1, 0, RTCD_VTABLE(oci));
oci->postproc_state.last_base_qindex = oci->base_qindex;
}
else
{
vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
oci->postproc_state.last_base_qindex = oci->base_qindex;
}
if (flags & VP8D_ADDNOISE)
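
A worked example of the gating and q-index smoothing above, with illustrative numbers: the MFQE branch runs only when the VP8D_MFQE flag is set, at least two frames have been decoded (current_video_frame >= 2), and the current base q-index exceeds the stored last_base_qindex by at least 10. With last_base_qindex = 40 and a new frame at base_qindex = 80, MFQE runs and the stored value moves a quarter of the way toward the new one: (3*40 + 80) >> 2 = 50. On the non-MFQE paths it simply snaps to the current base_qindex.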

View File

@@ -104,6 +104,7 @@ struct postproc_state
int last_q;
int last_noise;
char noise[3072];
int last_base_qindex;
DECLARE_ALIGNED(16, char, blackclamp[16]);
DECLARE_ALIGNED(16, char, whiteclamp[16]);
DECLARE_ALIGNED(16, char, bothclamp[16]);

View File

@@ -9,7 +9,6 @@
*/
#include "g_common.h"
#include "subpixel.h"
#include "loopfilter.h"
#include "recon.h"

View File

@@ -23,7 +23,8 @@ enum
VP8D_DEBUG_TXT_RATE_INFO = 1<<6,
VP8D_DEBUG_DRAW_MV = 1<<7,
VP8D_DEBUG_CLR_BLK_MODES = 1<<8,
VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9,
VP8D_MFQE = 1<<10
};
typedef struct

View File

@@ -334,11 +334,12 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
/*encoder only*/
void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
unsigned char *dst_y,
int dst_ystride)
{
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = x->predictor;
int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
int pre_stride = x->block[0].pre_stride;
@@ -348,11 +349,13 @@ void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
if ((mv_row | mv_col) & 7)
{
x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7,
dst_y, dst_ystride);
}
else
{
RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y,
dst_ystride);
}
}
@@ -596,69 +599,3 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
build_inter4x4_predictors_mb(xd);
}
}
/* encoder only*/
static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x)
{
int i;
if (x->mode_info_context->mbmi.partitioning < 3)
{
x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
x->block[10].bmi = x->mode_info_context->bmi[10];
build_inter_predictors4b(x, &x->block[ 0], x->block[ 0].predictor, 16);
build_inter_predictors4b(x, &x->block[ 2], x->block[ 2].predictor, 16);
build_inter_predictors4b(x, &x->block[ 8], x->block[ 8].predictor, 16);
build_inter_predictors4b(x, &x->block[10], x->block[10].predictor, 16);
}
else
{
for (i = 0; i < 16; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, d0->predictor, 16);
else
{
build_inter_predictors_b(d0, d0->predictor, 16, x->subpixel_predict);
build_inter_predictors_b(d1, d1->predictor, 16, x->subpixel_predict);
}
}
}
for (i = 16; i < 24; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, d0->predictor, 8);
else
{
build_inter_predictors_b(d0, d0->predictor, 8, x->subpixel_predict);
build_inter_predictors_b(d1, d1->predictor, 8, x->subpixel_predict);
}
}
}
void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd)
{
if (xd->mode_info_context->mbmi.mode != SPLITMV)
{
vp8_build_inter16x16_predictors_mb(xd, xd->predictor, &xd->predictor[256],
&xd->predictor[320], 16, 8);
}
else
{
build_4x4uvmvs(xd);
build_inter4x4_predictors_mb_e(xd);
}
}

View File

@@ -21,11 +21,13 @@ extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
int dst_uvstride);
extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf);
extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
unsigned char *dst_y,
int dst_ystride);
extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
vp8_subpix_fn_t sppf);
extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd);
#endif

View File

@@ -1,117 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/****************************************************************************
*
* Module Title : type_aliases.h
*
* Description : Standard type aliases
*
****************************************************************************/
#ifndef __INC_TYPE_ALIASES_H
#define __INC_TYPE_ALIASES_H
/****************************************************************************
* Macros
****************************************************************************/
#define EXPORT
#define IMPORT extern /* Used to declare imported data & routines */
#define PRIVATE static /* Used to declare & define module-local data */
#define LOCAL static /* Used to define all persistent routine-local data */
#define STD_IN_PATH 0 /* Standard input path */
#define STD_OUT_PATH 1 /* Standard output path */
#define STD_ERR_PATH 2 /* Standard error path */
#define STD_IN_FILE stdin /* Standard input file pointer */
#define STD_OUT_FILE stdout /* Standard output file pointer */
#define STD_ERR_FILE stderr /* Standard error file pointer */
#define max_int 0x7FFFFFFF
#define __export
#define _export
#define CCONV
#ifndef NULL
#ifdef __cplusplus
#define NULL 0
#else
#define NULL ((void *)0)
#endif
#endif
#ifndef FALSE
#define FALSE 0
#endif
#ifndef TRUE
#define TRUE 1
#endif
/****************************************************************************
* Typedefs
****************************************************************************/
#ifndef TYPE_INT8
#define TYPE_INT8
typedef signed char INT8;
#endif
#ifndef TYPE_INT16
/*#define TYPE_INT16*/
typedef signed short INT16;
#endif
#ifndef TYPE_INT32
/*#define TYPE_INT32*/
typedef signed int INT32;
#endif
#ifndef TYPE_UINT8
/*#define TYPE_UINT8*/
typedef unsigned char UINT8;
#endif
#ifndef TYPE_UINT32
/*#define TYPE_UINT32*/
typedef unsigned int UINT32;
#endif
#ifndef TYPE_UINT16
/*#define TYPE_UINT16*/
typedef unsigned short UINT16;
#endif
#ifndef TYPE_BOOL
/*#define TYPE_BOOL*/
typedef int BOOL;
#endif
typedef unsigned char BOOLEAN;
#ifdef _MSC_VER
typedef __int64 INT64;
#else
#ifndef TYPE_INT64
#ifdef _TMS320C6X
/* for now we only have 40bits */
typedef long INT64;
#else
typedef long long INT64;
#endif
#endif
#endif
/* Floating point */
typedef double FLOAT64;
typedef float FLOAT32;
#endif

View File

@@ -10,7 +10,17 @@
#include "vpx_config.h"
#include "vp8/common/idct.h"
#include "vp8/decoder/dequantize.h"
#include "vp8/common/dequantize.h"
extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC)
{
short *sq = (short *) d->qcoeff;
short *dq = (short *) d->dqcoeff;
vp8_dequantize_b_impl_mmx(sq, dq, DQC);
}
void vp8_dequant_idct_add_y_block_mmx
(short *q, short *dq,

View File

@@ -10,7 +10,7 @@
#include "vpx_config.h"
#include "vp8/common/idct.h"
#include "vp8/decoder/dequantize.h"
#include "vp8/common/dequantize.h"
void vp8_idct_dequant_0_2x_sse2
(short *q, short *dq ,

View File

@@ -17,160 +17,123 @@ sym(vp8_short_inv_walsh4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
push rsi
push rdi
; end prolog
mov rax, 3
mov rsi, arg(0)
mov rdi, arg(1)
shl rax, 16
mov rdx, arg(0)
mov rax, 30003h
movq mm0, [rsi + 0] ;ip[0]
movq mm1, [rsi + 8] ;ip[4]
or rax, 3 ;00030003h
movq mm0, [rdx + 0] ;ip[0]
movq mm1, [rdx + 8] ;ip[4]
movd mm7, rax
movq mm2, [rsi + 16] ;ip[8]
movq mm3, [rsi + 24] ;ip[12]
movq mm2, [rdx + 16] ;ip[8]
movq mm3, [rdx + 24] ;ip[12]
punpcklwd mm7, mm7 ;0003000300030003h
mov rdx, arg(1)
movq mm7, rax
movq mm4, mm0
movq mm4, mm0
movq mm5, mm1
punpcklwd mm7, mm7 ;0003000300030003h
movq mm5, mm1
paddw mm4, mm3 ;ip[0] + ip[12] aka al
paddw mm5, mm2 ;ip[4] + ip[8] aka bl
paddw mm4, mm3 ;ip[0] + ip[12] aka al
paddw mm5, mm2 ;ip[4] + ip[8] aka bl
movq mm6, mm4 ;temp al
paddw mm4, mm5 ;al + bl
psubw mm6, mm5 ;al - bl
movq mm6, mm4 ;temp al
psubw mm0, mm3 ;ip[0] - ip[12] aka d1
psubw mm1, mm2 ;ip[4] - ip[8] aka c1
paddw mm4, mm5 ;al + bl
psubw mm6, mm5 ;al - bl
psubw mm0, mm3 ;ip[0] - ip[12] aka d1
psubw mm1, mm2 ;ip[4] - ip[8] aka c1
movq mm5, mm0 ;temp dl
paddw mm0, mm1 ;dl + cl
psubw mm5, mm1 ;dl - cl
movq mm5, mm0 ;temp dl
paddw mm0, mm1 ;dl + cl
psubw mm5, mm1 ;dl - cl
; 03 02 01 00
; 13 12 11 10
; 23 22 21 20
; 33 32 31 30
movq mm3, mm4 ; 03 02 01 00
punpcklwd mm4, mm0 ; 11 01 10 00
punpckhwd mm3, mm0 ; 13 03 12 02
movq mm3, mm4 ; 03 02 01 00
punpcklwd mm4, mm0 ; 11 01 10 00
punpckhwd mm3, mm0 ; 13 03 12 02
movq mm1, mm6 ; 23 22 21 20
punpcklwd mm6, mm5 ; 31 21 30 20
punpckhwd mm1, mm5 ; 33 23 32 22
movq mm1, mm6 ; 23 22 21 20
punpcklwd mm6, mm5 ; 31 21 30 20
punpckhwd mm1, mm5 ; 33 23 32 22
movq mm0, mm4 ; 11 01 10 00
movq mm2, mm3 ; 13 03 12 02
movq mm0, mm4 ; 11 01 10 00
movq mm2, mm3 ; 13 03 12 02
punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
;~~~~~~~~~~~~~~~~~~~~~
movq mm1, mm0
movq mm5, mm4
movq mm1, mm0
movq mm5, mm4
paddw mm1, mm3 ;ip[0] + ip[12] aka al
paddw mm5, mm2 ;ip[4] + ip[8] aka bl
paddw mm1, mm3 ;ip[0] + ip[12] aka al
paddw mm5, mm2 ;ip[4] + ip[8] aka bl
movq mm6, mm1 ;temp al
paddw mm1, mm5 ;al + bl
psubw mm6, mm5 ;al - bl
paddw mm1, mm7
paddw mm6, mm7
psraw mm1, 3
psraw mm6, 3
movq mm6, mm1 ;temp al
psubw mm0, mm3 ;ip[0] - ip[12] aka d1
psubw mm4, mm2 ;ip[4] - ip[8] aka c1
paddw mm1, mm5 ;al + bl
psubw mm6, mm5 ;al - bl
psubw mm0, mm3 ;ip[0] - ip[12] aka d1
psubw mm4, mm2 ;ip[4] - ip[8] aka c1
movq mm5, mm0 ;temp dl
paddw mm0, mm4 ;dl + cl
psubw mm5, mm4 ;dl - cl
movq mm5, mm0 ;temp dl
paddw mm0, mm4 ;dl + cl
psubw mm5, mm4 ;dl - cl
paddw mm0, mm7
paddw mm5, mm7
psraw mm0, 3
psraw mm5, 3
;~~~~~~~~~~~~~~~~~~~~~
movq mm3, mm1 ; 03 02 01 00
punpcklwd mm1, mm0 ; 11 01 10 00
punpckhwd mm3, mm0 ; 13 03 12 02
movq mm4, mm6 ; 23 22 21 20
punpcklwd mm6, mm5 ; 31 21 30 20
punpckhwd mm4, mm5 ; 33 23 32 22
movd eax, mm1
movd ecx, mm0
psrlq mm0, 32
psrlq mm1, 32
mov word ptr[rdx+32*0], ax
mov word ptr[rdx+32*1], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*4], ax
mov word ptr[rdx+32*5], cx
movd eax, mm1
movd ecx, mm0
mov word ptr[rdx+32*8], ax
mov word ptr[rdx+32*9], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*12], ax
mov word ptr[rdx+32*13], cx
movq mm0, mm1 ; 11 01 10 00
movq mm2, mm3 ; 13 03 12 02
punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4]
punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8]
punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12]
paddw mm0, mm7
paddw mm1, mm7
paddw mm2, mm7
paddw mm3, mm7
psraw mm0, 3
psraw mm1, 3
psraw mm2, 3
psraw mm3, 3
; movq [rdi + 0], mm0
; movq [rdi + 8], mm1
; movq [rdi + 16], mm2
; movq [rdi + 24], mm3
movd eax, mm0
psrlq mm0, 32
mov word ptr[rdi+32*0], ax
shr eax, 16
mov word ptr[rdi+32*1], ax
movd eax, mm0
mov word ptr[rdi+32*2], ax
shr eax, 16
mov word ptr[rdi+32*3], ax
movd ecx, mm1
psrlq mm1, 32
mov word ptr[rdi+32*4], cx
shr ecx, 16
mov word ptr[rdi+32*5], cx
movd ecx, mm1
mov word ptr[rdi+32*6], cx
shr ecx, 16
mov word ptr[rdi+32*7], cx
movd eax, mm2
psrlq mm2, 32
mov word ptr[rdi+32*8], ax
shr eax, 16
mov word ptr[rdi+32*9], ax
movd eax, mm2
mov word ptr[rdi+32*10], ax
shr eax, 16
mov word ptr[rdi+32*11], ax
movd ecx, mm3
psrlq mm3, 32
mov word ptr[rdi+32*12], cx
shr ecx, 16
mov word ptr[rdi+32*13], cx
movd ecx, mm3
mov word ptr[rdi+32*14], cx
shr ecx, 16
mov word ptr[rdi+32*15], cx
movd eax, mm6
movd ecx, mm5
psrlq mm5, 32
psrlq mm6, 32
mov word ptr[rdx+32*2], ax
mov word ptr[rdx+32*3], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*6], ax
mov word ptr[rdx+32*7], cx
movd eax, mm6
movd ecx, mm5
mov word ptr[rdx+32*10], ax
mov word ptr[rdx+32*11], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*14], ax
mov word ptr[rdx+32*15], cx
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
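
For reference, the arithmetic this routine (and the SSE2 version below) implements is the 4x4 inverse Walsh-Hadamard transform with a final +3 round and >>3, with each result scattered into the DC slot of one of the macroblock's sixteen 4x4 coefficient blocks. A scalar sketch; the 16-short output stride is an assumption read off the 32-byte store offsets above:

/* Sketch of the 2-D inverse WHT with rounding, writing each output into the
 * DC position of the corresponding 4x4 block (stride of 16 shorts). */
static void inv_walsh4x4_sketch(const short *ip, short *mb_dqcoeff)
{
    short tmp[16];
    int i;

    for (i = 0; i < 4; i++)          /* columns */
    {
        int a1 = ip[i + 0] + ip[i + 12];
        int b1 = ip[i + 4] + ip[i + 8];
        int c1 = ip[i + 4] - ip[i + 8];
        int d1 = ip[i + 0] - ip[i + 12];

        tmp[i + 0]  = a1 + b1;
        tmp[i + 4]  = c1 + d1;
        tmp[i + 8]  = a1 - b1;
        tmp[i + 12] = d1 - c1;
    }

    for (i = 0; i < 4; i++)          /* rows, with rounding */
    {
        int a1 = tmp[i * 4 + 0] + tmp[i * 4 + 3];
        int b1 = tmp[i * 4 + 1] + tmp[i * 4 + 2];
        int c1 = tmp[i * 4 + 1] - tmp[i * 4 + 2];
        int d1 = tmp[i * 4 + 0] - tmp[i * 4 + 3];

        mb_dqcoeff[(i * 4 + 0) * 16] = (a1 + b1 + 3) >> 3;
        mb_dqcoeff[(i * 4 + 1) * 16] = (c1 + d1 + 3) >> 3;
        mb_dqcoeff[(i * 4 + 2) * 16] = (a1 - b1 + 3) >> 3;
        mb_dqcoeff[(i * 4 + 3) * 16] = (d1 - c1 + 3) >> 3;
    }
}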

View File

@@ -17,145 +17,105 @@ sym(vp8_short_inv_walsh4x4_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
SAVE_XMM 6
push rsi
push rdi
; end prolog
mov rsi, arg(0)
mov rdi, arg(1)
mov rax, 3
mov rcx, arg(0)
mov rdx, arg(1)
mov rax, 30003h
movdqa xmm0, [rsi + 0] ;ip[4] ip[0]
movdqa xmm1, [rsi + 16] ;ip[12] ip[8]
movdqa xmm0, [rcx + 0] ;ip[4] ip[0]
movdqa xmm1, [rcx + 16] ;ip[12] ip[8]
shl rax, 16
or rax, 3 ;00030003h
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
movdqa xmm3, xmm0 ;ip[4] ip[0]
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
movdqa xmm3, xmm0 ;ip[4] ip[0]
paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa xmm4, xmm0
movdqa xmm4, xmm0
punpcklqdq xmm0, xmm3 ;d1 a1
punpckhqdq xmm4, xmm3 ;c1 b1
movd xmm6, eax
movdqa xmm1, xmm4 ;c1 b1
paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
movdqa xmm1, xmm4 ;c1 b1
paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
;;;temp output
;; movdqu [rdi + 0], xmm4
;; movdqu [rdi + 16], xmm3
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
movdqa xmm3, xmm4 ;ip[4] ip[0]
movd xmm0, eax
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
movdqa xmm3, xmm4 ;ip[4] ip[0]
pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03
pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03
paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa xmm5, xmm4
movdqa xmm5, xmm4
punpcklqdq xmm4, xmm3 ;d1 a1
punpckhqdq xmm5, xmm3 ;c1 b1
movdqa xmm1, xmm5 ;c1 b1
paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
paddw xmm5, xmm6
paddw xmm1, xmm6
movdqa xmm1, xmm5 ;c1 b1
paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
psraw xmm5, 3
psraw xmm1, 3
paddw xmm5, xmm0
paddw xmm4, xmm0
psraw xmm5, 3
psraw xmm4, 3
;; movdqa [rdi + 0], xmm5
;; movdqa [rdi + 16], xmm1
movd eax, xmm5
movd ecx, xmm4
psrldq xmm5, 4
psrldq xmm4, 4
mov word ptr[rdx+32*0], ax
mov word ptr[rdx+32*2], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*4], ax
mov word ptr[rdx+32*6], cx
movd eax, xmm5
movd ecx, xmm4
psrldq xmm5, 4
psrldq xmm4, 4
mov word ptr[rdx+32*8], ax
mov word ptr[rdx+32*10], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*12], ax
mov word ptr[rdx+32*14], cx
movd eax, xmm5
psrldq xmm5, 4
mov word ptr[rdi+32*0], ax
shr eax, 16
mov word ptr[rdi+32*1], ax
movd eax, xmm5
psrldq xmm5, 4
mov word ptr[rdi+32*2], ax
shr eax, 16
mov word ptr[rdi+32*3], ax
movd eax, xmm5
psrldq xmm5, 4
mov word ptr[rdi+32*4], ax
shr eax, 16
mov word ptr[rdi+32*5], ax
movd eax, xmm5
mov word ptr[rdi+32*6], ax
shr eax, 16
mov word ptr[rdi+32*7], ax
movd eax, xmm1
psrldq xmm1, 4
mov word ptr[rdi+32*8], ax
shr eax, 16
mov word ptr[rdi+32*9], ax
movd eax, xmm1
psrldq xmm1, 4
mov word ptr[rdi+32*10], ax
shr eax, 16
mov word ptr[rdi+32*11], ax
movd eax, xmm1
psrldq xmm1, 4
mov word ptr[rdi+32*12], ax
shr eax, 16
mov word ptr[rdi+32*13], ax
movd eax, xmm1
mov word ptr[rdi+32*14], ax
shr eax, 16
mov word ptr[rdi+32*15], ax
movd eax, xmm5
movd ecx, xmm4
psrldq xmm5, 4
psrldq xmm4, 4
mov word ptr[rdx+32*1], ax
mov word ptr[rdx+32*3], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*5], ax
mov word ptr[rdx+32*7], cx
movd eax, xmm5
movd ecx, xmm4
mov word ptr[rdx+32*9], ax
mov word ptr[rdx+32*11], cx
shr eax, 16
shr ecx, 16
mov word ptr[rdx+32*13], ax
mov word ptr[rdx+32*15], cx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
x_s1sqr2:
times 4 dw 0x8A8C
align 16
x_c1sqr2less1:
times 4 dw 0x4E7B
align 16
fours:
times 4 dw 0x0004

View File

@@ -1385,52 +1385,54 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
SHADOW_ARGS_TO_STACK 3
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rcx, arg(0) ;src_ptr
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
mov rdx, arg(2) ;blimit
movdqa xmm3, XMMWORD PTR [rdx]
mov rdi, rsi ; rdi points to row +1 for indirect addressing
add rdi, rax
lea rdx, [rcx + rax]
neg rax
; calculate mask
movdqa xmm1, [rsi+2*rax] ; p1
movdqa xmm0, [rdi] ; q1
movdqa xmm0, [rdx] ; q1
mov rdx, arg(2) ;blimit
movdqa xmm1, [rcx+2*rax] ; p1
movdqa xmm2, xmm1
movdqa xmm7, xmm0
movdqa xmm4, xmm0
psubusb xmm0, xmm1 ; q1-=p1
psubusb xmm1, xmm4 ; p1-=q1
psubusb xmm1, xmm7 ; p1-=q1
por xmm1, xmm0 ; abs(p1-q1)
pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw xmm1, 1 ; abs(p1-q1)/2
movdqa xmm5, [rsi+rax] ; p0
movdqa xmm4, [rsi] ; q0
movdqa xmm3, XMMWORD PTR [rdx]
movdqa xmm5, [rcx+rax] ; p0
movdqa xmm4, [rcx] ; q0
movdqa xmm0, xmm4 ; q0
movdqa xmm6, xmm5 ; p0
psubusb xmm5, xmm4 ; p0-=q0
psubusb xmm4, xmm6 ; q0-=p0
por xmm5, xmm4 ; abs(p0 - q0)
movdqa xmm4, [GLOBAL(t80)]
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor xmm3, xmm3
pcmpeqb xmm5, xmm3
; start work on filters
pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
pxor xmm2, xmm4 ; p1 offset to convert to signed values
pxor xmm7, xmm4 ; q1 offset to convert to signed values
psubsb xmm2, xmm7 ; p1 - q1
pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
pxor xmm6, xmm4 ; offset to convert to signed values
pxor xmm0, xmm4 ; offset to convert to signed values
movdqa xmm3, xmm0 ; q0
psubsb xmm0, xmm6 ; q0 - p0
paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
@@ -1438,42 +1440,36 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
pand xmm5, xmm2 ; mask filter values we don't care about
; do + 4 side
paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
psraw xmm0, 3 ; arithmetic shift right 11
psrlw xmm0, 8
movdqa xmm1, xmm5 ; get a copy of filters
psraw xmm1, 11 ; arithmetic shift right 11
psllw xmm1, 8 ; shift left 8 to put it back
por xmm0, xmm1 ; put the two together to get result
psubsb xmm3, xmm0 ; q0-= q0 add
pxor xmm3, [GLOBAL(t80)] ; unoffset
movdqa [rsi], xmm3 ; write back
; now do +3 side
paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm0, xmm5
psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
psraw xmm0, 3 ; arithmetic shift right 11
psrlw xmm0, 8
psraw xmm5, 11 ; arithmetic shift right 11
psllw xmm5, 8 ; shift left 8 to put it back
por xmm0, xmm5 ; put the two together to get result
movdqa xmm1, [GLOBAL(te0)]
movdqa xmm2, [GLOBAL(t1f)]
pxor xmm7, xmm7
pcmpgtb xmm7, xmm0 ;save sign
pand xmm7, xmm1 ;preserve the upper 3 bits
psrlw xmm0, 3
pand xmm0, xmm2 ;clear out upper 3 bits
por xmm0, xmm7 ;add sign
psubsb xmm3, xmm0 ; q0-= q0sz add
paddsb xmm6, xmm0 ; p0+= p0 add
pxor xmm6, [GLOBAL(t80)] ; unoffset
movdqa [rsi+rax], xmm6 ; write back
pxor xmm7, xmm7
pcmpgtb xmm7, xmm5 ;save sign
pand xmm7, xmm1 ;preserve the upper 3 bits
psrlw xmm5, 3
pand xmm5, xmm2 ;clear out upper 3 bits
por xmm5, xmm7 ;add sign
paddsb xmm6, xmm5 ; p0+= p0 add
pxor xmm3, xmm4 ; unoffset
movdqa [rcx], xmm3 ; write back
pxor xmm6, xmm4 ; unoffset
movdqa [rcx+rax], xmm6 ; write back
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
@@ -1536,9 +1532,6 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
movdqa t0, xmm0 ; save to t0
movdqa t1, xmm2 ; save to t1
lea rsi, [rsi + rax*8]
lea rdi, [rsi + rax]
lea rdx, [rsi + rax*4]
@@ -1551,26 +1544,24 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0
movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0
movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0
movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0
movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
movdqa xmm1, xmm4
punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
movdqa xmm7, xmm4
punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
movdqa xmm6, xmm4
punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
movdqa xmm1, xmm0
movdqa xmm3, xmm2
@@ -1579,6 +1570,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
mov rdx, arg(2) ;blimit
; calculate mask
movdqa xmm6, xmm0 ; p1
movdqa xmm7, xmm3 ; q1
@@ -1588,6 +1581,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw xmm6, 1 ; abs(p1-q1)/2
movdqa xmm7, [rdx]
movdqa xmm5, xmm1 ; p0
movdqa xmm4, xmm2 ; q0
psubusb xmm5, xmm2 ; p0-=q0
@@ -1596,8 +1591,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
mov rdx, arg(2) ;blimit
movdqa xmm7, XMMWORD PTR [rdx]
movdqa xmm4, [GLOBAL(t80)]
psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor xmm7, xmm7
@@ -1607,59 +1601,48 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
movdqa t0, xmm0
movdqa t1, xmm3
pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
pxor xmm0, xmm4 ; p1 offset to convert to signed values
pxor xmm3, xmm4 ; q1 offset to convert to signed values
psubsb xmm0, xmm3 ; p1 - q1
movdqa xmm6, xmm1 ; p0
; movdqa xmm7, xmm2 ; q0
movdqa xmm7, xmm2 ; q0
pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values
movdqa xmm3, xmm7 ; offseted ; q0
psubsb xmm7, xmm6 ; q0 - p0
paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0)
paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0)
paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0)
pxor xmm6, xmm4 ; offset to convert to signed values
pxor xmm2, xmm4 ; offset to convert to signed values
movdqa xmm3, xmm2 ; offseted ; q0
psubsb xmm2, xmm6 ; q0 - p0
paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
pand xmm5, xmm0 ; mask filter values we don't care about
paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
psraw xmm0, 3 ; arithmetic shift right 11
psrlw xmm0, 8
movdqa xmm7, xmm5 ; get a copy of filters
psraw xmm7, 11 ; arithmetic shift right 11
psllw xmm7, 8 ; shift left 8 to put it back
por xmm0, xmm7 ; put the two together to get result
psubsb xmm3, xmm0 ; q0-= q0sz add
pxor xmm3, [GLOBAL(t80)] ; unoffset q0
; now do +3 side
movdqa xmm0, xmm5
psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
psraw xmm0, 3 ; arithmetic shift right 11
movdqa xmm1, [GLOBAL(te0)]
movdqa xmm2, [GLOBAL(t1f)]
psrlw xmm0, 8
psraw xmm5, 11 ; arithmetic shift right 11
pxor xmm7, xmm7
pcmpgtb xmm7, xmm0 ;save sign
pand xmm7, xmm1 ;preserve the upper 3 bits
psrlw xmm0, 3
pand xmm0, xmm2 ;clear out upper 3 bits
por xmm0, xmm7 ;add sign
psubsb xmm3, xmm0 ; q0-= q0sz add
psllw xmm5, 8 ; shift left 8 to put it back
por xmm0, xmm5 ; put the two together to get result
pxor xmm7, xmm7
pcmpgtb xmm7, xmm5 ;save sign
pand xmm7, xmm1 ;preserve the upper 3 bits
psrlw xmm5, 3
pand xmm5, xmm2 ;clear out upper 3 bits
por xmm5, xmm7 ;add sign
paddsb xmm6, xmm5 ; p0+= p0 add
paddsb xmm6, xmm0 ; p0+= p0 add
pxor xmm6, [GLOBAL(t80)] ; unoffset p0
pxor xmm3, xmm4 ; unoffset q0
pxor xmm6, xmm4 ; unoffset p0
movdqa xmm0, t0 ; p1
movdqa xmm4, t1 ; q1
@@ -1763,3 +1746,9 @@ s9:
align 16
s63:
times 8 dw 0x003f
align 16
te0:
times 16 db 0xe0
align 16
t1f:
times 16 db 0x1f
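
The routines above vectorize VP8's "simple" loop filter, sixteen pixels at a time. A per-pixel scalar sketch of the same arithmetic, mirroring the C reference path rather than this file's register allocation:

#include <stdlib.h>

/* Sketch: values are biased into signed range with ^0x80, a clamped filter
 * term 3*(q0-p0) + (p1-q1) is formed, and q0/p0 move by (f+4)>>3 and (f+3)>>3
 * when |p0-q0|*2 + |p1-q1|/2 <= blimit. */
static signed char sclamp(int v)
{
    return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

static void simple_filter_sketch(unsigned char *p1, unsigned char *p0,
                                 unsigned char *q0, unsigned char *q1,
                                 unsigned char blimit)
{
    int mask = (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2) <= blimit;
    signed char sp1 = (signed char)(*p1 ^ 0x80);
    signed char sp0 = (signed char)(*p0 ^ 0x80);
    signed char sq0 = (signed char)(*q0 ^ 0x80);
    signed char sq1 = (signed char)(*q1 ^ 0x80);
    signed char f, f1, f2;

    f = sclamp(sp1 - sq1);
    f = sclamp(f + 3 * (sq0 - sp0));
    f = mask ? f : 0;

    f1 = sclamp(f + 4) >> 3;          /* applied to q0 */
    f2 = sclamp(f + 3) >> 3;          /* applied to p0 */

    *q0 = (unsigned char)(sclamp(sq0 - f1) ^ 0x80);
    *p0 = (unsigned char)(sclamp(sp0 + f2) ^ 0x80);
}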

View File

@@ -151,6 +151,23 @@ sym(vp8_post_proc_down_and_across_mmx):
sub rsi, rdx
sub rdi, rdx
; dup the first byte into the left border 8 times
movq mm1, [rdi]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
mov rdx, -8
movq [rdi+rdx], mm1
; dup the last byte into the right border
movsxd rdx, dword arg(5)
movq mm1, [rdi + rdx + -1]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
movq [rdi+rdx], mm1
push rax
xor rdx, rdx
@@ -298,8 +315,36 @@ sym(vp8_mbpost_proc_down_mmx):
pxor mm0, mm0 ;
movsxd rax, dword ptr arg(1) ;pitch ;
; this copies the last row down into the border 8 rows
mov rdi, rsi
mov rdx, arg(2)
sub rdx, 9
imul rdx, rax
lea rdi, [rdi+rdx]
movq mm1, QWORD ptr[rdi] ; first row
mov rcx, 8
.init_borderd ; initialize borders
lea rdi, [rdi + rax]
movq [rdi], mm1
dec rcx
jne .init_borderd
neg rax ; rax = -pitch
; this copies the first row up into the border 8 rows
mov rdi, rsi
movq mm1, QWORD ptr[rdi] ; first row
mov rcx, 8
.init_border ; initialize borders
lea rdi, [rdi + rax]
movq [rdi], mm1
dec rcx
jne .init_border
lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
neg rax
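
The border setup added above replicates the first and last pixel of each row into an 8-byte margin, and the first/last rows into eight border rows, so the post-processing filters never read past the picture. A scalar sketch of the horizontal case, assuming the 8-pixel border used by the asm; the punpcklbw/punpcklwd/punpckldq sequence is simply an MMX byte splat:

#include <string.h>

/* Sketch: duplicate a row's edge pixels into an 8-byte border on each side. */
static void dup_row_borders(unsigned char *row, int cols)
{
    memset(row - 8, row[0], 8);           /* left border  */
    memset(row + cols, row[cols - 1], 8); /* right border */
}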

View File

@@ -139,6 +139,24 @@ sym(vp8_post_proc_down_and_across_xmm):
sub rsi, rdx
sub rdi, rdx
; dup the first byte into the left border 8 times
movq mm1, [rdi]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
mov rdx, -8
movq [rdi+rdx], mm1
; dup the last byte into the right border
movsxd rdx, dword arg(5)
movq mm1, [rdi + rdx + -1]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
movq [rdi+rdx], mm1
xor rdx, rdx
movq mm0, QWORD PTR [rdi-8];
@@ -287,12 +305,40 @@ sym(vp8_mbpost_proc_down_xmm):
pxor xmm0, xmm0 ;
movsxd rax, dword ptr arg(1) ;pitch ;
; this copies the last row down into the border 8 rows
mov rdi, rsi
mov rdx, arg(2)
sub rdx, 9
imul rdx, rax
lea rdi, [rdi+rdx]
movq xmm1, QWORD ptr[rdi] ; first row
mov rcx, 8
.init_borderd ; initialize borders
lea rdi, [rdi + rax]
movq [rdi], xmm1
dec rcx
jne .init_borderd
neg rax ; rax = -pitch
; this copies the first row up into the border 8 rows
mov rdi, rsi
movq xmm1, QWORD ptr[rdi] ; first row
mov rcx, 8
.init_border ; initialize borders
lea rdi, [rdi + rax]
movq [rdi], xmm1
dec rcx
jne .init_border
lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
neg rax
pxor xmm5, xmm5
pxor xmm6, xmm6 ;
@@ -480,7 +526,25 @@ sym(vp8_mbpost_proc_across_ip_xmm):
xor rdx, rdx ;sumsq=0;
xor rcx, rcx ;sum=0;
mov rsi, arg(0); s
; dup the first byte into the left border 8 times
movq mm1, [rsi]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
mov rdi, -8
movq [rsi+rdi], mm1
; dup the last byte into the right border
movsxd rdx, dword arg(3)
movq mm1, [rsi + rdx + -1]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
movq [rsi+rdx], mm1
.ip_var_loop:
;for(i=-8;i<=6;i++)
;{

View File

@@ -559,12 +559,492 @@ sym(vp8_intra_pred_uv_ho_%1):
vp8_intra_pred_uv_ho mmx2
vp8_intra_pred_uv_ho ssse3
;void vp8_intra_pred_y_dc_sse2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_y_dc_sse2)
sym(vp8_intra_pred_y_dc_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
; from top
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
sub rsi, rax
pxor xmm0, xmm0
movdqa xmm1, [rsi]
psadbw xmm1, xmm0
movq xmm2, xmm1
punpckhqdq xmm1, xmm1
paddw xmm1, xmm2
; from left
dec rsi
lea rdi, [rax*3]
movzx ecx, byte [rsi+rax]
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
movzx edx, byte [rsi+rax*4]
add ecx, edx
; add up
pextrw edx, xmm1, 0x0
lea edx, [edx+ecx+16]
sar edx, 5
movd xmm1, edx
; FIXME use pshufb for ssse3 version
pshuflw xmm1, xmm1, 0x0
punpcklqdq xmm1, xmm1
packuswb xmm1, xmm1
; write out
mov rsi, 2
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
lea rax, [rcx*3]
.label
movdqa [rdi ], xmm1
movdqa [rdi+rcx ], xmm1
movdqa [rdi+rcx*2], xmm1
movdqa [rdi+rax ], xmm1
lea rdi, [rdi+rcx*4]
movdqa [rdi ], xmm1
movdqa [rdi+rcx ], xmm1
movdqa [rdi+rcx*2], xmm1
movdqa [rdi+rax ], xmm1
lea rdi, [rdi+rcx*4]
dec rsi
jnz .label
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
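
In scalar form the DC predictor built above averages the sixteen pixels above and the sixteen to the left with rounding, then fills the 16x16 block; psadbw against zero is just a fast horizontal byte sum of the top row. A sketch, not the project's C code:

/* Sketch of the 16x16 DC predictor: dc = (sum_above + sum_left + 16) >> 5. */
static void intra_pred_y_dc_sketch(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride)
{
    const unsigned char *above = src - src_stride;
    const unsigned char *left  = src - 1;
    int sum = 0, r, c;

    for (c = 0; c < 16; c++)
        sum += above[c];
    for (r = 0; r < 16; r++)
        sum += left[r * src_stride];

    sum = (sum + 16) >> 5;
    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
            dst[r * dst_stride + c] = (unsigned char)sum;
}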
;void vp8_intra_pred_y_dctop_sse2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_y_dctop_sse2)
sym(vp8_intra_pred_y_dctop_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
GET_GOT rbx
; end prolog
; from top
mov rcx, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
sub rcx, rax
pxor xmm0, xmm0
movdqa xmm1, [rcx]
psadbw xmm1, xmm0
movdqa xmm2, xmm1
punpckhqdq xmm1, xmm1
paddw xmm1, xmm2
; add up
paddw xmm1, [GLOBAL(dc_8)]
psraw xmm1, 4
; FIXME use pshufb for ssse3 version
pshuflw xmm1, xmm1, 0x0
punpcklqdq xmm1, xmm1
packuswb xmm1, xmm1
; write out
mov rsi, 2
mov rdx, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
lea rax, [rcx*3]
.label
movdqa [rdx ], xmm1
movdqa [rdx+rcx ], xmm1
movdqa [rdx+rcx*2], xmm1
movdqa [rdx+rax ], xmm1
lea rdx, [rdx+rcx*4]
movdqa [rdx ], xmm1
movdqa [rdx+rcx ], xmm1
movdqa [rdx+rcx*2], xmm1
movdqa [rdx+rax ], xmm1
lea rdx, [rdx+rcx*4]
dec rsi
jnz .label
; begin epilog
RESTORE_GOT
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_intra_pred_y_dcleft_sse2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_y_dcleft_sse2)
sym(vp8_intra_pred_y_dcleft_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
; from left
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
dec rsi
lea rdi, [rax*3]
movzx ecx, byte [rsi]
movzx edx, byte [rsi+rax]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
add ecx, edx
lea rsi, [rsi+rax*4]
movzx edx, byte [rsi]
add ecx, edx
movzx edx, byte [rsi+rax]
add ecx, edx
movzx edx, byte [rsi+rax*2]
add ecx, edx
movzx edx, byte [rsi+rdi]
lea edx, [ecx+edx+8]
; add up
shr edx, 4
movd xmm1, edx
; FIXME use pshufb for ssse3 version
pshuflw xmm1, xmm1, 0x0
punpcklqdq xmm1, xmm1
packuswb xmm1, xmm1
; write out
mov rsi, 2
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
lea rax, [rcx*3]
.label
movdqa [rdi ], xmm1
movdqa [rdi+rcx ], xmm1
movdqa [rdi+rcx*2], xmm1
movdqa [rdi+rax ], xmm1
lea rdi, [rdi+rcx*4]
movdqa [rdi ], xmm1
movdqa [rdi+rcx ], xmm1
movdqa [rdi+rcx*2], xmm1
movdqa [rdi+rax ], xmm1
lea rdi, [rdi+rcx*4]
dec rsi
jnz .label
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_intra_pred_y_dc128_sse2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_y_dc128_sse2)
sym(vp8_intra_pred_y_dc128_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
GET_GOT rbx
; end prolog
; write out
mov rsi, 2
movdqa xmm1, [GLOBAL(dc_128)]
mov rax, arg(0) ;dst;
movsxd rdx, dword ptr arg(1) ;dst_stride
lea rcx, [rdx*3]
.label
movdqa [rax ], xmm1
movdqa [rax+rdx ], xmm1
movdqa [rax+rdx*2], xmm1
movdqa [rax+rcx ], xmm1
lea rax, [rax+rdx*4]
movdqa [rax ], xmm1
movdqa [rax+rdx ], xmm1
movdqa [rax+rdx*2], xmm1
movdqa [rax+rcx ], xmm1
lea rax, [rax+rdx*4]
dec rsi
jnz .label
; begin epilog
RESTORE_GOT
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_intra_pred_y_tm_sse2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
%macro vp8_intra_pred_y_tm 1
global sym(vp8_intra_pred_y_tm_%1)
sym(vp8_intra_pred_y_tm_%1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
GET_GOT rbx
; end prolog
; read top row
mov edx, 8
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
sub rsi, rax
pxor xmm0, xmm0
%ifidn %1, ssse3
movdqa xmm3, [GLOBAL(dc_1024)]
%endif
movdqa xmm1, [rsi]
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
; set up left ptrs and subtract topleft
movd xmm4, [rsi-1]
lea rsi, [rsi+rax-1]
%ifidn %1, sse2
punpcklbw xmm4, xmm0
pshuflw xmm4, xmm4, 0x0
punpcklqdq xmm4, xmm4
%else
pshufb xmm4, xmm3
%endif
psubw xmm1, xmm4
psubw xmm2, xmm4
; set up dest ptrs
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
vp8_intra_pred_y_tm_%1_loop:
movd xmm4, [rsi]
movd xmm5, [rsi+rax]
%ifidn %1, sse2
punpcklbw xmm4, xmm0
punpcklbw xmm5, xmm0
pshuflw xmm4, xmm4, 0x0
pshuflw xmm5, xmm5, 0x0
punpcklqdq xmm4, xmm4
punpcklqdq xmm5, xmm5
%else
pshufb xmm4, xmm3
pshufb xmm5, xmm3
%endif
movdqa xmm6, xmm4
movdqa xmm7, xmm5
paddw xmm4, xmm1
paddw xmm6, xmm2
paddw xmm5, xmm1
paddw xmm7, xmm2
packuswb xmm4, xmm6
packuswb xmm5, xmm7
movdqa [rdi ], xmm4
movdqa [rdi+rcx], xmm5
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
jnz vp8_intra_pred_y_tm_%1_loop
; begin epilog
RESTORE_GOT
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
%endmacro
vp8_intra_pred_y_tm sse2
vp8_intra_pred_y_tm ssse3
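
The TM (TrueMotion) macro above computes, for every pixel, left[r] + above[c] - topleft, with the saturation to 0..255 supplied by packuswb. A scalar sketch:

/* Sketch of the 16x16 TM predictor vectorized above. */
static unsigned char clip255(int v)
{
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void intra_pred_y_tm_sketch(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride)
{
    const unsigned char *above = src - src_stride;
    int topleft = above[-1];
    int r, c;

    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
            dst[r * dst_stride + c] =
                clip255(src[r * src_stride - 1] + above[c] - topleft);
}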
;void vp8_intra_pred_y_ve_sse2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_y_ve_sse2)
sym(vp8_intra_pred_y_ve_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
; end prolog
; read from top
mov rax, arg(2) ;src;
movsxd rdx, dword ptr arg(3) ;src_stride;
sub rax, rdx
movdqa xmm1, [rax]
; write out
mov rsi, 2
mov rax, arg(0) ;dst;
movsxd rdx, dword ptr arg(1) ;dst_stride
lea rcx, [rdx*3]
.label
movdqa [rax ], xmm1
movdqa [rax+rdx ], xmm1
movdqa [rax+rdx*2], xmm1
movdqa [rax+rcx ], xmm1
lea rax, [rax+rdx*4]
movdqa [rax ], xmm1
movdqa [rax+rdx ], xmm1
movdqa [rax+rdx*2], xmm1
movdqa [rax+rcx ], xmm1
lea rax, [rax+rdx*4]
dec rsi
jnz .label
; begin epilog
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_intra_pred_y_ho_sse2(
; unsigned char *dst,
; int dst_stride
; unsigned char *src,
; int src_stride,
; )
global sym(vp8_intra_pred_y_ho_sse2)
sym(vp8_intra_pred_y_ho_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
; read from left and write out
mov edx, 8
mov rsi, arg(2) ;src;
movsxd rax, dword ptr arg(3) ;src_stride;
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
dec rsi
vp8_intra_pred_y_ho_sse2_loop:
movd xmm0, [rsi]
movd xmm1, [rsi+rax]
; FIXME use pshufb for ssse3 version
punpcklbw xmm0, xmm0
punpcklbw xmm1, xmm1
pshuflw xmm0, xmm0, 0x0
pshuflw xmm1, xmm1, 0x0
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
movdqa [rdi ], xmm0
movdqa [rdi+rcx], xmm1
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
jnz vp8_intra_pred_y_ho_sse2_loop
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
dc_128:
times 8 db 128
times 16 db 128
dc_4:
times 4 dw 4
align 16
dc_8:
times 8 dw 8
align 16
dc_1024:
times 8 dw 0x400
align 16

View File

@@ -94,3 +94,69 @@ void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
vp8_intra_pred_uv_tm_ssse3,
vp8_intra_pred_uv_ho_ssse3);
}
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dctop_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dcleft_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc128_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ho_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ve_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_sse2);
extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_ssse3);
static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
unsigned char *dst_y,
int dst_stride,
build_intra_predictors_mbuv_fn_t tm_func)
{
int mode = x->mode_info_context->mbmi.mode;
build_intra_predictors_mbuv_fn_t fn;
int src_stride = x->dst.y_stride;
switch (mode) {
case V_PRED: fn = vp8_intra_pred_y_ve_sse2; break;
case H_PRED: fn = vp8_intra_pred_y_ho_sse2; break;
case TM_PRED: fn = tm_func; break;
case DC_PRED:
if (x->up_available) {
if (x->left_available) {
fn = vp8_intra_pred_y_dc_sse2; break;
} else {
fn = vp8_intra_pred_y_dctop_sse2; break;
}
} else if (x->left_available) {
fn = vp8_intra_pred_y_dcleft_sse2; break;
} else {
fn = vp8_intra_pred_y_dc128_sse2; break;
}
break;
default: return;
}
fn(dst_y, dst_stride, x->dst.y_buffer, src_stride);
return;
}
void vp8_build_intra_predictors_mby_sse2(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
vp8_intra_pred_y_tm_sse2);
}
void vp8_build_intra_predictors_mby_ssse3(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
vp8_intra_pred_y_tm_ssse3);
}
void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,
vp8_intra_pred_y_tm_sse2);
}
void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x)
{
vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,
vp8_intra_pred_y_tm_ssse3);
}

View File

@@ -42,6 +42,8 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx);
extern prototype_copy_block(vp8_copy_mem16x16_sse2);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_sse2);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_copy16x16
@@ -53,12 +55,20 @@ extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
#undef vp8_recon_build_intra_predictors_mbuv_s
#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2
#undef vp8_recon_build_intra_predictors_mby
#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_sse2
#undef vp8_recon_build_intra_predictors_mby_s
#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_sse2
#endif
#endif
#if HAVE_SSSE3
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_ssse3);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_ssse3);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_ssse3);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_build_intra_predictors_mbuv
@@ -67,6 +77,12 @@ extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3)
#undef vp8_recon_build_intra_predictors_mbuv_s
#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
#undef vp8_recon_build_intra_predictors_mby
#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_ssse3
#undef vp8_recon_build_intra_predictors_mby_s
#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_ssse3
#endif
#endif
#endif

View File

@@ -11,7 +11,6 @@
#include "vpx_config.h"
#include "vpx_ports/x86.h"
#include "vp8/common/g_common.h"
#include "vp8/common/subpixel.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/recon.h"
@@ -37,6 +36,11 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
if (flags & HAS_MMX)
{
rtcd->dequant.block = vp8_dequantize_b_mmx;
rtcd->dequant.idct_add = vp8_dequant_idct_add_mmx;
rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
@@ -81,6 +85,13 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
vp8_build_intra_predictors_mbuv_sse2;
rtcd->recon.build_intra_predictors_mbuv_s =
vp8_build_intra_predictors_mbuv_s_sse2;
rtcd->recon.build_intra_predictors_mby =
vp8_build_intra_predictors_mby_sse2;
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s_sse2;
rtcd->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
rtcd->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_sse2;
@@ -124,6 +135,10 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
vp8_build_intra_predictors_mbuv_ssse3;
rtcd->recon.build_intra_predictors_mbuv_s =
vp8_build_intra_predictors_mbuv_s_ssse3;
rtcd->recon.build_intra_predictors_mby =
vp8_build_intra_predictors_mby_ssse3;
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s_ssse3;
}
#endif

View File

@@ -11,9 +11,6 @@
#include "vpx_config.h"
#include "vpx_ports/arm.h"
#include "vp8/common/blockd.h"
#include "vp8/common/pragmas.h"
#include "vp8/decoder/dequantize.h"
#include "vp8/decoder/onyxd_int.h"
void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
@@ -30,20 +27,12 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
#if HAVE_ARMV6
if (flags & HAS_MEDIA)
{
pbi->dequant.block = vp8_dequantize_b_v6;
pbi->dequant.idct_add = vp8_dequant_idct_add_v6;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
}
#endif
#if HAVE_ARMV7
if (flags & HAS_NEON)
{
pbi->dequant.block = vp8_dequantize_b_neon;
pbi->dequant.idct_add = vp8_dequant_idct_add_neon;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
}
#endif
#endif

View File

@@ -15,7 +15,7 @@
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/recon.h"
#include "vp8/common/reconinter.h"
#include "dequantize.h"
#include "vp8/common/dequantize.h"
#include "detokenize.h"
#include "vp8/common/invtrans.h"
#include "vp8/common/alloccommon.h"
@@ -32,7 +32,7 @@
#endif
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/idct.h"
#include "dequantize.h"
#include "vp8/common/threading.h"
#include "decoderthreading.h"
#include "dboolhuff.h"
@@ -42,7 +42,6 @@
void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
{
int i;
int Q;
VP8_COMMON *const pc = & pbi->common;
@@ -52,15 +51,9 @@ void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
/* all the ac values = ; */
for (i = 1; i < 16; i++)
{
int rc = vp8_default_zig_zag1d[i];
pc->Y1dequant[Q][rc] = (short)vp8_ac_yquant(Q);
pc->Y2dequant[Q][rc] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
pc->UVdequant[Q][rc] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
}
pc->Y1dequant[Q][1] = (short)vp8_ac_yquant(Q);
pc->Y2dequant[Q][1] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
pc->UVdequant[Q][1] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
}
}
@@ -88,19 +81,19 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
else
QIndex = pc->base_qindex;
/* Set up the block level dequant pointers */
for (i = 0; i < 16; i++)
/* Set up the macroblock dequant constants */
xd->dequant_y1_dc[0] = 1;
xd->dequant_y1[0] = pc->Y1dequant[QIndex][0];
xd->dequant_y2[0] = pc->Y2dequant[QIndex][0];
xd->dequant_uv[0] = pc->UVdequant[QIndex][0];
for (i = 1; i < 16; i++)
{
xd->block[i].dequant = pc->Y1dequant[QIndex];
xd->dequant_y1_dc[i] =
xd->dequant_y1[i] = pc->Y1dequant[QIndex][1];
xd->dequant_y2[i] = pc->Y2dequant[QIndex][1];
xd->dequant_uv[i] = pc->UVdequant[QIndex][1];
}
for (i = 16; i < 24; i++)
{
xd->block[i].dequant = pc->UVdequant[QIndex];
}
xd->block[24].dequant = pc->Y2dequant[QIndex];
}
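
The change above replaces the per-BLOCKD dequant pointers with flat per-macroblock tables: slot 0 carries the DC constant, slots 1..15 the AC constant, and dequant_y1_dc pins its DC slot to 1 so that Y DC values already recovered by the second-order inverse WHT are not scaled a second time. A sketch of that layout, with names taken from the diff and the helper itself purely illustrative:

/* Sketch: fill one dequant table pair the way the new code does. */
static void fill_dequant_sketch(short *dq, short *dq_dc, short dc, short ac)
{
    int i;
    dq[0]    = dc;
    dq_dc[0] = 1;   /* DC already applied via the second-order transform */
    for (i = 1; i < 16; i++)
        dq[i] = dq_dc[i] = ac;
}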
#if CONFIG_RUNTIME_CPU_DETECT
@@ -109,32 +102,12 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
#define RTCD_VTABLE(x) NULL
#endif
/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
* to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
*/
static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
{
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
{
RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
RECON_INVOKE(&pbi->common.rtcd.recon,
build_intra_predictors_mby_s)(xd);
}
else
{
vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.y_stride, xd->dst.uv_stride);
}
}
static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
unsigned int mb_idx)
{
int eobtotal = 0;
int throw_residual = 0;
MB_PREDICTION_MODE mode;
int i;
int corruption_detected = 0;
if (xd->mode_info_context->mbmi.mb_skip_coeff)
{
@@ -142,28 +115,52 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
}
else if (!vp8dx_bool_error(xd->current_bc))
{
int eobtotal;
eobtotal = vp8_decode_mb_tokens(pbi, xd);
/* Special case: Force the loopfilter to skip when eobtotal is zero */
xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
}
mode = xd->mode_info_context->mbmi.mode;
if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV &&
!vp8dx_bool_error(xd->current_bc))
{
/* Special case: Force the loopfilter to skip when eobtotal and
* mb_skip_coeff are zero.
* */
xd->mode_info_context->mbmi.mb_skip_coeff = 1;
skip_recon_mb(pbi, xd);
return;
}
if (xd->segmentation_enabled)
mb_init_dequantizer(pbi, xd);
#if CONFIG_ERROR_CONCEALMENT
if(pbi->ec_active)
{
int throw_residual;
/* When we have independent partitions we can apply residual even
* though other partitions within the frame are corrupt.
*/
throw_residual = (!pbi->independent_partitions &&
pbi->frame_corrupt_residual);
throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
{
/* MB with corrupt residuals or corrupt mode/motion vectors.
* Better to use the predictor as reconstruction.
*/
pbi->frame_corrupt_residual = 1;
vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
vp8_conceal_corrupt_mb(xd);
corruption_detected = 1;
/* force idct to be skipped for B_PRED and use the
* prediction only for reconstruction
* */
vpx_memset(xd->eobs, 0, 25);
}
}
#endif
/* do prediction */
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
{
@@ -173,121 +170,114 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
{
RECON_INVOKE(&pbi->common.rtcd.recon,
build_intra_predictors_mby_s)(xd);
} else {
}
else
{
short *DQC = xd->dequant_y1;
/* clear out residual eob info */
if(xd->mode_info_context->mbmi.mb_skip_coeff)
vpx_memset(xd->eobs, 0, 25);
vp8_intra_prediction_down_copy(xd);
for (i = 0; i < 16; i++)
{
BLOCKD *b = &xd->block[i];
int b_mode = xd->mode_info_context->bmi[i].as_mode;
RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
( *(b->base_dst) + b->dst, b->dst_stride, b_mode,
*(b->base_dst) + b->dst, b->dst_stride );
if (xd->eobs[i])
{
if (xd->eobs[i] > 1)
{
DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
(b->qcoeff, DQC,
*(b->base_dst) + b->dst, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
(b->qcoeff[0] * DQC[0],
*(b->base_dst) + b->dst, b->dst_stride,
*(b->base_dst) + b->dst, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
}
}
}
}
else
{
vp8_build_inter_predictors_mb(xd);
}
/* When we have independent partitions we can apply residual even
* though other partitions within the frame are corrupt.
*/
throw_residual = (!pbi->independent_partitions &&
pbi->frame_corrupt_residual);
throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
#if CONFIG_ERROR_CONCEALMENT
if (pbi->ec_active &&
(mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
if (corruption_detected)
{
/* MB with corrupt residuals or corrupt mode/motion vectors.
* Better to use the predictor as reconstruction.
*/
pbi->frame_corrupt_residual = 1;
vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
vp8_conceal_corrupt_mb(xd);
return;
}
#endif
/* dequantization and idct */
if (mode == B_PRED)
if(!xd->mode_info_context->mbmi.mb_skip_coeff)
{
for (i = 0; i < 16; i++)
/* dequantization and idct */
if (mode != B_PRED)
{
BLOCKD *b = &xd->block[i];
int b_mode = xd->mode_info_context->bmi[i].as_mode;
short *DQC = xd->dequant_y1;
RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
( *(b->base_dst) + b->dst, b->dst_stride, b_mode,
*(b->base_dst) + b->dst, b->dst_stride );
if (xd->eobs[i] )
if (mode != SPLITMV)
{
if (xd->eobs[i] > 1)
BLOCKD *b = &xd->block[24];
/* do 2nd order transform on the dc block */
if (xd->eobs[24] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)
(b->qcoeff, b->dequant,
*(b->base_dst) + b->dst, b->dst_stride);
DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b,
xd->dequant_y2);
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
xd->qcoeff);
((int *)b->qcoeff)[0] = 0;
((int *)b->qcoeff)[1] = 0;
((int *)b->qcoeff)[2] = 0;
((int *)b->qcoeff)[3] = 0;
((int *)b->qcoeff)[4] = 0;
((int *)b->qcoeff)[5] = 0;
((int *)b->qcoeff)[6] = 0;
((int *)b->qcoeff)[7] = 0;
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
(b->qcoeff[0] * b->dequant[0],
*(b->base_dst) + b->dst, b->dst_stride,
*(b->base_dst) + b->dst, b->dst_stride);
b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],
xd->qcoeff);
((int *)b->qcoeff)[0] = 0;
}
}
}
}
else
{
short *DQC = xd->block[0].dequant;
/* save the dc dequant constant in case it is overridden */
short dc_dequant_temp = DQC[0];
if (mode != SPLITMV)
{
BLOCKD *b = &xd->block[24];
/* do 2nd order transform on the dc block */
if (xd->eobs[24] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, block)(b);
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
xd->qcoeff);
((int *)b->qcoeff)[0] = 0;
((int *)b->qcoeff)[1] = 0;
((int *)b->qcoeff)[2] = 0;
((int *)b->qcoeff)[3] = 0;
((int *)b->qcoeff)[4] = 0;
((int *)b->qcoeff)[5] = 0;
((int *)b->qcoeff)[6] = 0;
((int *)b->qcoeff)[7] = 0;
}
else
{
b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],
xd->qcoeff);
((int *)b->qcoeff)[0] = 0;
/* override the dc dequant constant in order to preserve the
* dc components
*/
DQC = xd->dequant_y1_dc;
}
/* override the dc dequant constant */
DQC[0] = 1;
DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
(xd->qcoeff, DQC,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
/* restore the dc dequant constant */
DQC[0] = dc_dequant_temp;
DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->dequant_uv,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->block[16].dequant,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}
static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
{
int ret_val = 0;
@@ -484,7 +474,8 @@ static void setup_token_decoder(VP8D_COMP *pbi,
const unsigned char* token_part_sizes)
{
vp8_reader *bool_decoder = &pbi->bc2;
int fragment_idx, partition_idx;
unsigned int partition_idx;
int fragment_idx;
int num_token_partitions;
const unsigned char *first_fragment_end = pbi->fragments[0] +
pbi->fragment_sizes[0];

View File

@@ -15,7 +15,7 @@
#include "vpx_ports/mem.h"
#include "detokenize.h"
#define BOOL_DATA UINT8
#define BOOL_DATA unsigned char
#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
DECLARE_ALIGNED(16, static const unsigned char, coef_bands_x[16]) =
@@ -157,10 +157,10 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
DECODE_AND_APPLYSIGN(val) \
Prob = coef_probs + (ENTROPY_NODES*2); \
if(c < 15){\
qcoeff_ptr [ scan[c] ] = (INT16) v; \
qcoeff_ptr [ scan[c] ] = (int16_t) v; \
++c; \
goto DO_WHILE; }\
qcoeff_ptr [ 15 ] = (INT16) v; \
qcoeff_ptr [ 15 ] = (int16_t) v; \
goto BLOCK_FINISHED;
@@ -172,7 +172,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
{\
range = range-split;\
value = value-bigsplit;\
val += ((UINT16)1<<bits_count);\
val += ((uint16_t)1<<bits_count);\
}\
else\
{\
@@ -340,12 +340,12 @@ ONE_CONTEXT_NODE_0_:
if (c < 15)
{
qcoeff_ptr [ scan[c] ] = (INT16) v;
qcoeff_ptr [ scan[c] ] = (int16_t) v;
++c;
goto DO_WHILE;
}
qcoeff_ptr [ 15 ] = (INT16) v;
qcoeff_ptr [ 15 ] = (int16_t) v;
BLOCK_FINISHED:
eobs[i] = c;
eobtotal += c;

View File

@@ -10,7 +10,7 @@
#include "vpx_config.h"
#include "vp8/decoder/dequantize.h"
#include "vp8/common/dequantize.h"
#include "vp8/decoder/onyxd_int.h"
extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
@@ -20,11 +20,7 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
{
/* Pure C: */
#if CONFIG_RUNTIME_CPU_DETECT
pbi->mb.rtcd = &pbi->common.rtcd;
pbi->dequant.block = vp8_dequantize_b_c;
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
pbi->mb.rtcd = &pbi->common.rtcd;
#endif
#if ARCH_X86 || ARCH_X86_64

View File

@@ -20,7 +20,6 @@
#include "vpx_scale/yv12extend.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/swapyv12buffer.h"
#include "vp8/common/g_common.h"
#include "vp8/common/threading.h"
#include "decoderthreading.h"
#include <stdio.h>
@@ -57,7 +56,7 @@ void vp8dx_initialize()
}
VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
{
VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
@@ -117,14 +116,12 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
*/
pbi->independent_partitions = 0;
return (VP8D_PTR) pbi;
return pbi;
}
void vp8dx_remove_decompressor(VP8D_PTR ptr)
void vp8dx_remove_decompressor(VP8D_COMP *pbi)
{
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
if (!pbi)
return;
@@ -142,9 +139,8 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
}
vpx_codec_err_t vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
{
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
VP8_COMMON *cm = &pbi->common;
int ref_fb_idx;
@@ -174,9 +170,8 @@ vpx_codec_err_t vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, Y
}
vpx_codec_err_t vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
{
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
VP8_COMMON *cm = &pbi->common;
int *ref_fb_ptr = NULL;
int free_fb;
@@ -301,19 +296,18 @@ static int swap_frame_buffers (VP8_COMMON *cm)
return err;
}
int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, int64_t time_stamp)
int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsigned char *source, int64_t time_stamp)
{
#if HAVE_ARMV7
int64_t dx_store_reg[8];
#endif
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
VP8_COMMON *cm = &pbi->common;
int retcode = 0;
/*if(pbi->ready_for_new_data == 0)
return -1;*/
if (ptr == 0)
if (pbi == 0)
{
return -1;
}
@@ -575,10 +569,9 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
pbi->common.error.setjmp = 0;
return retcode;
}
int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
{
int ret = -1;
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
if (pbi->ready_for_new_data == 1)
return ret;
@@ -613,3 +606,26 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, int64_t *time_stam
vp8_clear_system_state();
return ret;
}
/* This function as written isn't decoder specific, but the encoder has
* much faster ways of computing this, so it's ok for it to live in a
* decode specific file.
*/
int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame )
{
const MODE_INFO *mi = oci->mi;
int mb_row, mb_col;
for (mb_row = 0; mb_row < oci->mb_rows; mb_row++)
{
for (mb_col = 0; mb_col < oci->mb_cols; mb_col++,mi++)
{
if( mi->mbmi.ref_frame == ref_frame)
return 1;
}
mi++;
}
return 0;
}
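
A hypothetical caller sketch, only to show the intent: scan the current mode info to see whether any macroblock references a given frame buffer before deciding it is unused. The control-id plumbing lives elsewhere.

/* Hypothetical usage; VP8D_COMP and GOLDEN_FRAME as declared elsewhere
 * in the decoder. */
static int last_frame_used_golden(VP8D_COMP *pbi)
{
    return vp8dx_references_buffer(&pbi->common, GOLDEN_FRAME);
}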

View File

@@ -16,7 +16,8 @@
#include "treereader.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/threading.h"
#include "dequantize.h"
#if CONFIG_ERROR_CONCEALMENT
#include "ec_types.h"
#endif
@@ -43,7 +44,7 @@ typedef struct
} DATARATE;
typedef struct VP8Decompressor
typedef struct VP8D_COMP
{
DECLARE_ALIGNED(16, MACROBLOCKD, mb);
@@ -93,11 +94,6 @@ typedef struct VP8Decompressor
DATARATE dr[16];
#if CONFIG_RUNTIME_CPU_DETECT
vp8_dequant_rtcd_vtable_t dequant;
#endif
vp8_prob prob_intra;
vp8_prob prob_last;
vp8_prob prob_gf;

View File

@@ -37,7 +37,7 @@ extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
{
VP8_COMMON *const pc = & pbi->common;
int i, j;
int i;
for (i = 0; i < count; i++)
{
@@ -77,10 +77,10 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D
mbd->current_bc = &pbi->bc2;
for (j = 0; j < 25; j++)
{
mbd->block[j].dequant = xd->block[j].dequant;
}
vpx_memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
vpx_memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
vpx_memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
vpx_memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
mbd->fullpixel_mask = 0xffffffff;
if(pc->full_pixel)
@@ -177,6 +177,8 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
/* dequantization and idct */
if (xd->mode_info_context->mbmi.mode == B_PRED)
{
short *DQC = xd->dequant_y1;
for (i = 0; i < 16; i++)
{
BLOCKD *b = &xd->block[i];
@@ -189,14 +191,14 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
{
if (xd->eobs[i] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)
(b->qcoeff, b->dequant,
DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
(b->qcoeff, DQC,
*(b->base_dst) + b->dst, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
(b->qcoeff[0] * b->dequant[0],
(b->qcoeff[0] * DQC[0],
*(b->base_dst) + b->dst, b->dst_stride,
*(b->base_dst) + b->dst, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
@@ -206,9 +208,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
}
else
{
short *DQC = xd->block[0].dequant;
DECLARE_ALIGNED(16, short, local_dequant[16]);
short *DQC = xd->dequant_y1;
if (xd->mode_info_context->mbmi.mode != SPLITMV)
{
@@ -217,7 +217,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
/* do 2nd order transform on the dc block */
if (xd->eobs[24] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, block)(b);
DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b, xd->dequant_y2);
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
xd->qcoeff);
@@ -232,30 +232,23 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
}
else
{
b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff);
((int *)b->qcoeff)[0] = 0;
}
/* make a local copy of the dequant constants */
vpx_memcpy(local_dequant, xd->block[0].dequant,
sizeof(local_dequant));
/* override the dc dequant constant */
local_dequant[0] = 1;
/* use the new dequant constants */
DQC = local_dequant;
DQC = xd->dequant_y1_dc;
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
(xd->qcoeff, DQC,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->block[16].dequant,
DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->dequant_uv,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}

View File

@@ -13,47 +13,7 @@
#include "vpx_ports/x86.h"
#include "vp8/decoder/onyxd_int.h"
#if HAVE_MMX
void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
void vp8_dequantize_b_mmx(BLOCKD *d)
{
short *sq = (short *) d->qcoeff;
short *dq = (short *) d->dqcoeff;
short *q = (short *) d->dequant;
vp8_dequantize_b_impl_mmx(sq, dq, q);
}
#endif
void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
{
#if CONFIG_RUNTIME_CPU_DETECT
int flags = x86_simd_caps();
/* Note:
*
* This platform can be built without runtime CPU detection as well. If
* you modify any of the function mappings present in this file, be sure
* to also update them in static mapings (<arch>/filename_<arch>.h)
*/
/* Override default functions with fastest ones for this CPU. */
#if HAVE_MMX
if (flags & HAS_MMX)
{
pbi->dequant.block = vp8_dequantize_b_mmx;
pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
}
#endif
#if HAVE_SSE2
if (flags & HAS_SSE2)
{
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
}
#endif
#endif
}

View File

@@ -11,9 +11,9 @@
#include "vpx_config.h"
#include "vp8/encoder/variance.h"
#include "vp8/common/filter.h"
#include "vp8/common/arm/bilinearfilter_arm.h"
#if HAVE_ARMV6
#include "vp8/common/arm/bilinearfilter_arm.h"
unsigned int vp8_sub_pixel_variance8x8_armv6
(

View File

@@ -25,8 +25,6 @@
#include "defaultcoefcounts.h"
#define printf(...)
const int vp8cx_base_skip_false_prob[128] =
{
255, 255, 255, 255, 255, 255, 255, 255,
@@ -161,227 +159,6 @@ static void write_split(vp8_writer *bc, int x)
);
}
#define MAX_BOOL_CACHE_CONTEXTS ((2048+2*4*8*3*12)*128)
typedef unsigned long long bool_cache_entry_t;
static unsigned char bool_cache_validity1[MAX_BOOL_CACHE_CONTEXTS/128/8];
static unsigned char bool_cache_validity2[MAX_BOOL_CACHE_CONTEXTS/8];
#define BOOL_CACHE_BUCKETS (32*1024)
static bool_cache_entry_t bool_cache_table[BOOL_CACHE_BUCKETS];
static unsigned char bit_n(unsigned char *vector, int n)
{
int idx = n >> 3;
return (vector[idx] >> (n&7)) & 1;
}
static void set_bit_n(unsigned char *vector, int n)
{
int idx = n >> 3;
vector[idx] |= 1 << (n&7);
}
static int bool_cache_hit1;
static int bool_cache_hit2;
static bool_cache_entry_t bool_cache_hit(int context)
{
bool_cache_entry_t entry;
bool_cache_hit1++;
assert(context < MAX_BOOL_CACHE_CONTEXTS);
if(!bit_n(bool_cache_validity1, context>>7))
return 0;
if(!bit_n(bool_cache_validity2, context))
return 0;
entry = bool_cache_table[context & (BOOL_CACHE_BUCKETS-1)];
if((context >> 15) == (entry & 0x1F))
{
bool_cache_hit2++;
return entry;
}
return 0;
}
static void encode_bits_msb(vp8_writer *w,
int v,
const unsigned char *probs,
int n)
{
do
{
vp8_encode_bool(w, (v >> --n) & 1, *probs++);
}
while(n);
}
static void bool_cache(int context, vp8_writer *bc)
{
int shift_count;
bool_cache_entry_t entry;
if(!bit_n(bool_cache_validity1, context>>7))
{
/* This is the first time this top-level context has been hit.
* Invalidate the second level contexts
*/
memset(&bool_cache_validity2[context>>3], 0, 16);
}
set_bit_n(bool_cache_validity1, context>>7);
set_bit_n(bool_cache_validity2, context);
/* Convert the bc into a cache entry */
shift_count = bc->count + 24 + 8 * bc->pos;
entry = bc->lowvalue;
if (bc->pos)
{
assert(bc->pos < 2);
entry += (unsigned long long)bc->buffer[0] << shift_count;
}
entry <<= 6;
entry += shift_count;
entry <<= 7;
entry += bc->range - 128;
entry <<= 5;
entry += context >> 15;
bool_cache_table[context & (BOOL_CACHE_BUCKETS-1)] = entry;
}
static void splice_bits(BOOL_CODER *cx, bool_cache_entry_t to_splice)
{
unsigned int thismask;
int shift_count, count;
unsigned long long lowvalue = cx->lowvalue;
to_splice >>= 5;
cx->range = (to_splice & 0x7F) + 128;
to_splice >>= 7;
shift_count = to_splice & 0x3F;
to_splice >>= 6;
count = cx->count + shift_count;
lowvalue <<= shift_count;
lowvalue += to_splice;
if(count >=0)
{
unsigned long long tmp = lowvalue >> count;
thismask = 0xffffff << (count & 7) | 255;
if(tmp >> 32)
{
int x = cx->pos - 1;
while (x >= 0 && cx->buffer[x] == 0xff)
{
cx->buffer[x] = (unsigned char)0;
x--;
}
cx->buffer[x] += 1;
}
while(count >= 0)
{
int out = (tmp >> 24) & 0xff;
cx->buffer[cx->pos++] = out;
count -= 8;
tmp <<= 8;
}
lowvalue &= thismask;
}
cx->count = count;
cx->lowvalue = lowvalue;
}
void pack_tokens_lut(vp8_writer *w, const TOKENEXTRA *p, int xcount)
{
/* For now, we read from the token array */
const TOKENEXTRA *const stop = p + xcount;
while (p < stop)
{
const vp8_extra_bit_struct *b;
int context;
bool_cache_entry_t entry;
/* Encode token */
context = (p->context << 7) + (w->range - 128);
if (!(entry = bool_cache_hit(context)))
{
/* Build the probability vector */
int n, i, j, v;
unsigned char probs[12], buf[4];
vp8_writer tmpbc;
if (p->skip_eob_node)
{
n = vp8_coef_encodings[p->Token].Len - 1;
i = 2;
}
else
{
n = vp8_coef_encodings[p->Token].Len;
i = 0;
}
v = vp8_coef_encodings[p->Token].value;
j=0;
do
{
const int bb = (v >> --n) & 1;
probs[j++] = p->context_tree[i>>1];
i = vp8_coef_tree[i+bb];
} while(n);
/* Generate the appropriate bitstream and cache the result */
vp8_start_encode(&tmpbc, buf, buf+4);
tmpbc.range = w->range;
encode_bits_msb(&tmpbc, v, probs, j);
bool_cache(context, &tmpbc);
entry = bool_cache_hit(context);
}
splice_bits(w, entry);
/* Encode extra bits */
b = vp8_extra_bits + p->Token;
if (b->Len)
{
context = b->base_val + (p->Extra >> 1);
context = (context << 7) + (w->range - 128);
if (!(entry = bool_cache_hit(context)))
{
unsigned char buf[4];
vp8_writer tmpbc;
/* Generate the appropriate bitstream and cache the result */
vp8_start_encode(&tmpbc, buf, buf+4);
tmpbc.range = w->range;
encode_bits_msb(&tmpbc, p->Extra >> 1, b->prob, b->Len);
bool_cache(context, &tmpbc);
entry = bool_cache_hit(context);
}
splice_bits(w, entry);
}
/* Encode sign bit */
if (b->base_val)
vp8_write_bit(w, p->Extra & 1);
/* Next token */
printf("lv=%08x, pos=%ld\n",w->lowvalue,w->pos);
++p;
}
}
static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
{
const TOKENEXTRA *const stop = p + xcount;
@@ -410,7 +187,6 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
do
{
const int bb = (v >> --n) & 1;
//printf("E(%d)=%d (v=%d)\n",pp[i>>1],(v >> n) & 1,v);
split = 1 + (((range - 1) * pp[i>>1]) >> 8);
i = vp8_coef_tree[i+bb];
@@ -476,7 +252,6 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
do
{
const int bb = (v >> --n) & 1;
//printf("E(%d)=%d (v=%d)\n",pp[i>>1],(v >> n) & 1,v);
split = 1 + (((range - 1) * pp[i>>1]) >> 8);
i = b->tree[i+bb];
@@ -577,7 +352,6 @@ static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
}
printf("lv=%08x, pos=%ld\n",lowvalue,w->pos);
++p;
}
@@ -1104,7 +878,28 @@ static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACRO
}
}
}
void vp8_convert_rfct_to_prob(VP8_COMP *const cpi)
{
const int *const rfct = cpi->count_mb_ref_frame_usage;
const int rf_intra = rfct[INTRA_FRAME];
const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
// Calculate the probabilities used to code the ref frame based on usage
if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
cpi->prob_intra_coded = 1;
cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
if (!cpi->prob_last_coded)
cpi->prob_last_coded = 1;
cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
if (!cpi->prob_gf_coded)
cpi->prob_gf_coded = 1;
}
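A minimal standalone sketch of the same count-to-probability arithmetic factored out above (hypothetical usage counts; count_to_prob is an illustrative helper, not part of the codebase):

#include <stdio.h>

/* Map reference-frame usage counts to 8-bit probabilities, clamping to a
 * minimum of 1 so the bool coder is never handed a zero probability. */
static int count_to_prob(int num, int den)
{
    int p = den ? num * 255 / den : 128;   /* 128 when there is no information */
    return p ? p : 1;
}

int main(void)
{
    const int intra = 40, last = 120, golden = 30, altref = 10;
    const int inter = last + golden + altref;

    printf("prob_intra=%d prob_last=%d prob_gf=%d\n",
           count_to_prob(intra, intra + inter),     /* 40*255/200  = 51  */
           count_to_prob(last, inter),              /* 120*255/160 = 191 */
           count_to_prob(golden, golden + altref)); /* 30*255/40   = 191 */
    return 0;
}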
static void pack_inter_mode_mvs(VP8_COMP *const cpi)
{
@@ -1112,36 +907,17 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
vp8_writer *const w = cpi->bc;
const MV_CONTEXT *mvc = pc->fc.mvc;
const int *const rfct = cpi->count_mb_ref_frame_usage;
const int rf_intra = rfct[INTRA_FRAME];
const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
MODE_INFO *m = pc->mi, *ms;
const int mis = pc->mode_info_stride;
int mb_row = -1;
int prob_last_coded;
int prob_gf_coded;
int prob_skip_false = 0;
ms = pc->mi - 1;
cpi->mb.partition_info = cpi->mb.pi;
// Calculate the probabilities to be used to code the reference frame based on actual useage this frame
if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
cpi->prob_intra_coded = 1;
prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
if (!prob_last_coded)
prob_last_coded = 1;
prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
if (!prob_gf_coded)
prob_gf_coded = 1;
vp8_convert_rfct_to_prob(cpi);
#ifdef ENTROPY_STATS
active_section = 1;
@@ -1162,8 +938,8 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
}
vp8_write_literal(w, cpi->prob_intra_coded, 8);
vp8_write_literal(w, prob_last_coded, 8);
vp8_write_literal(w, prob_gf_coded, 8);
vp8_write_literal(w, cpi->prob_last_coded, 8);
vp8_write_literal(w, cpi->prob_gf_coded, 8);
update_mbintra_mode_probs(cpi);
@@ -1225,11 +1001,11 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
vp8_write(w, 1, cpi->prob_intra_coded);
if (rf == LAST_FRAME)
vp8_write(w, 0, prob_last_coded);
vp8_write(w, 0, cpi->prob_last_coded);
else
{
vp8_write(w, 1, prob_last_coded);
vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, prob_gf_coded);
vp8_write(w, 1, cpi->prob_last_coded);
vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, cpi->prob_gf_coded);
}
{
@@ -1237,6 +1013,8 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
int ct[4];
vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, cpi->common.ref_frame_sign_bias);
vp8_clamp_mv2(&best_mv, xd);
vp8_mv_ref_probs(mv_ref_p, ct);
#ifdef ENTROPY_STATS
@@ -1430,7 +1208,7 @@ static void sum_probs_over_prev_coef_context(
{
for (j=0; j < PREV_COEF_CONTEXTS; ++j)
{
const int tmp = out[i];
const unsigned int tmp = out[i];
out[i] += probs[j][i];
/* check for wrap */
if (out[i] < tmp)
@@ -1554,7 +1332,7 @@ static int default_coef_context_savings(VP8_COMP *cpi)
MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
cpi->frame_coef_probs [i][j][k],
cpi->frame_branch_ct [i][j][k],
default_coef_counts [i][j][k],
cpi->coef_counts [i][j][k],
256, 1
);
@@ -1581,6 +1359,24 @@ static int default_coef_context_savings(VP8_COMP *cpi)
return savings;
}
void vp8_calc_ref_frame_costs(int *ref_frame_cost,
int prob_intra,
int prob_last,
int prob_garf
)
{
ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(prob_intra);
ref_frame_cost[LAST_FRAME] = vp8_cost_one(prob_intra)
+ vp8_cost_zero(prob_last);
ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(prob_intra)
+ vp8_cost_one(prob_last)
+ vp8_cost_zero(prob_garf);
ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(prob_intra)
+ vp8_cost_one(prob_last)
+ vp8_cost_one(prob_garf);
}
int vp8_estimate_entropy_savings(VP8_COMP *cpi)
{
int savings = 0;
@@ -1588,7 +1384,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi)
const int *const rfct = cpi->count_mb_ref_frame_usage;
const int rf_intra = rfct[INTRA_FRAME];
const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
int new_intra, new_last, gf_last, oldtotal, newtotal;
int new_intra, new_last, new_garf, oldtotal, newtotal;
int ref_frame_cost[MAX_REF_FRAMES];
vp8_clear_system_state(); //__asm emms;
@@ -1600,19 +1396,11 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi)
new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
gf_last = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
new_garf = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
// new costs
ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(new_intra);
ref_frame_cost[LAST_FRAME] = vp8_cost_one(new_intra)
+ vp8_cost_zero(new_last);
ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(new_intra)
+ vp8_cost_one(new_last)
+ vp8_cost_zero(gf_last);
ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(new_intra)
+ vp8_cost_one(new_last)
+ vp8_cost_one(gf_last);
vp8_calc_ref_frame_costs(ref_frame_cost,new_intra,new_last,new_garf);
newtotal =
rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
@@ -1622,15 +1410,8 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi)
// old costs
ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded);
ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_zero(cpi->prob_last_coded);
ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_one(cpi->prob_last_coded)
+ vp8_cost_zero(cpi->prob_gf_coded);
ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_one(cpi->prob_last_coded)
+ vp8_cost_one(cpi->prob_gf_coded);
vp8_calc_ref_frame_costs(ref_frame_cost,cpi->prob_intra_coded,
cpi->prob_last_coded,cpi->prob_gf_coded);
oldtotal =
rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
@@ -1795,7 +1576,6 @@ static void put_delta_q(vp8_writer *bc, int delta_q)
vp8_write_bit(bc, 0);
}
char tmp[128*1024];
void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest_end, unsigned long *size)
{
int i, j;
@@ -1866,20 +1646,20 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
// Signal whether or not Segmentation is enabled
vp8_write_bit(bc, (xd->segmentation_enabled) ? 1 : 0);
vp8_write_bit(bc, xd->segmentation_enabled);
// Indicate which features are enabled
if (xd->segmentation_enabled)
{
// Signal whether or not the segmentation map is being updated.
vp8_write_bit(bc, (xd->update_mb_segmentation_map) ? 1 : 0);
vp8_write_bit(bc, (xd->update_mb_segmentation_data) ? 1 : 0);
vp8_write_bit(bc, xd->update_mb_segmentation_map);
vp8_write_bit(bc, xd->update_mb_segmentation_data);
if (xd->update_mb_segmentation_data)
{
signed char Data;
vp8_write_bit(bc, (xd->mb_segement_abs_delta) ? 1 : 0);
vp8_write_bit(bc, xd->mb_segement_abs_delta);
// For each segmentation feature (Quant and loop filter level)
for (i = 0; i < MB_LVL_MAX; i++)
@@ -1936,7 +1716,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
vp8_write_literal(bc, pc->sharpness_level, 3);
// Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
vp8_write_bit(bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
vp8_write_bit(bc, xd->mode_ref_lf_delta_enabled);
if (xd->mode_ref_lf_delta_enabled)
{
@@ -2158,22 +1938,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
pack_mb_row_tokens(cpi, &cpi->bc[1]);
else
#endif
{
vp8_writer tmpbc;
vp8_start_encode(&tmpbc, tmp, tmp+sizeof(tmp));
//pack_tokens_c(&tmpbc, cpi->tok, cpi->tok_count);
pack_tokens_lut(&cpi->bc[1], cpi->tok, cpi->tok_count);
//assert(cpi->bc[1].lowvalue == tmpbc.lowvalue);
//assert(cpi->bc[1].pos == tmpbc.pos);
}
//#undef printf
printf("cache hit ratio: %d/%d=%f\n",
bool_cache_hit2,
bool_cache_hit1,
bool_cache_hit2/(float)bool_cache_hit1);
pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
vp8_stop_encode(&cpi->bc[1]);

View File

@@ -45,10 +45,6 @@ typedef struct
unsigned char **base_src;
int src;
int src_stride;
// MV enc_mv;
int force_empty;
} BLOCK;
typedef struct
@@ -107,7 +103,6 @@ typedef struct
int mv_row_min;
int mv_row_max;
int vector_range; // Used to monitor limiting range of recent vectors to guide search.
int skip;
int encode_breakout;

View File

@@ -24,7 +24,7 @@
typedef struct
{
unsigned long long lowvalue;
unsigned int lowvalue;
unsigned int range;
unsigned int value;
int count;

View File

@@ -39,7 +39,12 @@
#define IF_RTCD(x) NULL
#endif
extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
int prob_intra,
int prob_last,
int prob_garf
);
extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi);
extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
extern void vp8_auto_select_speed(VP8_COMP *cpi);
extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
@@ -49,8 +54,8 @@ extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
int count);
void vp8_build_block_offsets(MACROBLOCK *x);
void vp8_setup_block_ptrs(MACROBLOCK *x);
int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);
int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset, int mb_row, int mb_col);
int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col);
static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x );
#ifdef MODE_STATS
@@ -475,14 +480,14 @@ void encode_mb_row(VP8_COMP *cpi,
if (cm->frame_type == KEY_FRAME)
{
*totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp);
*totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp, mb_row, mb_col);
#ifdef MODE_STATS
y_modes[xd->mbmi.mode] ++;
#endif
}
else
{
*totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset);
*totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col);
#ifdef MODE_STATS
inter_y_modes[xd->mbmi.mode] ++;
@@ -590,8 +595,6 @@ void init_encode_frame_mb_context(VP8_COMP *cpi)
// Activity map pointer
x->mb_activity_ptr = cpi->mb_activity_map;
x->vector_range = 32;
x->act_zbin_adj = 0;
x->partition_info = x->pi;
@@ -636,55 +639,23 @@ void init_encode_frame_mb_context(VP8_COMP *cpi)
vpx_memset(cm->above_context, 0,
sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
xd->ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded);
// Special case treatment when GF and ARF are not sensible options for reference
if (cpi->ref_frame_flags == VP8_LAST_FLAG)
{
xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_zero(255);
xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_one(255)
+ vp8_cost_zero(128);
xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_one(255)
+ vp8_cost_one(128);
}
vp8_calc_ref_frame_costs(xd->ref_frame_cost,
cpi->prob_intra_coded,255,128);
else if ((cpi->oxcf.number_of_layers > 1) &&
(cpi->ref_frame_flags == VP8_GOLD_FLAG))
{
xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_zero(1);
xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_one(1)
+ vp8_cost_zero(255);
xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_one(1)
+ vp8_cost_one(255);
}
vp8_calc_ref_frame_costs(xd->ref_frame_cost,
cpi->prob_intra_coded,1,255);
else if ((cpi->oxcf.number_of_layers > 1) &&
(cpi->ref_frame_flags == VP8_ALT_FLAG))
{
xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_zero(1);
xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_one(1)
+ vp8_cost_zero(1);
xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_one(1)
+ vp8_cost_one(1);
}
vp8_calc_ref_frame_costs(xd->ref_frame_cost,
cpi->prob_intra_coded,1,1);
else
{
xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_zero(cpi->prob_last_coded);
xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_one(cpi->prob_last_coded)
+ vp8_cost_zero(cpi->prob_gf_coded);
xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ vp8_cost_one(cpi->prob_last_coded)
+ vp8_cost_one(cpi->prob_gf_coded);
}
vp8_calc_ref_frame_costs(xd->ref_frame_cost,
cpi->prob_intra_coded,
cpi->prob_last_coded,
cpi->prob_gf_coded);
xd->fullpixel_mask = 0xffffffff;
if(cm->full_pixel)
@@ -966,31 +937,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
(!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)))
{
const int *const rfct = cpi->count_mb_ref_frame_usage;
const int rf_intra = rfct[INTRA_FRAME];
const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
if ((rf_intra + rf_inter) > 0)
{
cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter);
if (cpi->prob_intra_coded < 1)
cpi->prob_intra_coded = 1;
if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active)
{
cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
if (cpi->prob_last_coded < 1)
cpi->prob_last_coded = 1;
cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
if (cpi->prob_gf_coded < 1)
cpi->prob_gf_coded = 1;
}
}
vp8_convert_rfct_to_prob(cpi);
}
#if 0
@@ -1142,8 +1089,10 @@ static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
#endif
}
int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
int mb_row, int mb_col)
{
MACROBLOCKD *xd = &x->e_mbd;
int rate;
if (cpi->sf.RD && cpi->compressor_speed != 2)
@@ -1163,14 +1112,17 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
sum_intra_stats(cpi, x);
vp8_tokenize_mb(cpi, &x->e_mbd, t);
if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED)
vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
if (xd->mode_info_context->mbmi.mode != B_PRED)
vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));
DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->dequant_uv,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
return rate;
}
#ifdef SPEEDSTATS
@@ -1182,7 +1134,8 @@ extern void vp8_fix_contexts(MACROBLOCKD *x);
int vp8cx_encode_inter_macroblock
(
VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
int recon_yoffset, int recon_uvoffset
int recon_yoffset, int recon_uvoffset,
int mb_row, int mb_col
)
{
MACROBLOCKD *const xd = &x->e_mbd;
@@ -1230,8 +1183,10 @@ int vp8cx_encode_inter_macroblock
}
else
{
vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
&distortion, &intra_error);
&distortion, &intra_error, mb_row, mb_col);
}
cpi->prediction_error += distortion;
cpi->intra_error += intra_error;
@@ -1345,12 +1300,14 @@ int vp8cx_encode_inter_macroblock
if (!x->skip)
{
vp8_tokenize_mb(cpi, xd, t);
if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED)
{
vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct),
&x->e_mbd);
}
vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
if (xd->mode_info_context->mbmi.mode != B_PRED)
vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));
DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->dequant_uv,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}
else
{

View File

@@ -18,7 +18,6 @@
#include "vp8/common/invtrans.h"
#include "vp8/common/recon.h"
#include "dct.h"
#include "vp8/common/g_common.h"
#include "encodeintra.h"
@@ -45,7 +44,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
vp8_encode_intra16x16mby(rtcd, x);
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(&cpi->common.rtcd));
}
else
{
@@ -77,8 +76,17 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
x->quantize_b(be, b);
vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16);
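/* If anything beyond the DC coefficient survived quantization (eob > 1),
 * run the full 4x4 inverse transform; otherwise adding the single scaled
 * DC value to the predictor is enough. */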
if (*b->eob > 1)
{
IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct16)(b->dqcoeff,
b->predictor, 16, *(b->base_dst) + b->dst, b->dst_stride);
}
else
{
IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct1_scalar_add)
(b->dqcoeff[0], b->predictor, 16, *(b->base_dst) + b->dst,
b->dst_stride);
}
}
void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
@@ -96,11 +104,12 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
BLOCK *b = &x->block[0];
MACROBLOCKD *xd = &x->e_mbd;
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby_s)(&x->e_mbd);
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
b->src_stride, x->e_mbd.predictor, 16);
ENCODEMB_INVOKE(&rtcd->encodemb, submby) (x->src_diff, *(b->base_src),
b->src_stride, xd->dst.y_buffer, xd->dst.y_stride);
vp8_transform_intra_mby(x);
@@ -108,16 +117,17 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
if (x->optimize)
vp8_optimize_mby(x, rtcd);
}
void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd);
MACROBLOCKD *xd = &x->e_mbd;
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv_s)(&x->e_mbd);
ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],
&x->e_mbd.predictor[320], 8);
x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer,
xd->dst.v_buffer, xd->dst.uv_stride);
vp8_transform_mbuv(x);
@@ -125,5 +135,4 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
if (x->optimize)
vp8_optimize_mbuv(x, rtcd);
}

View File

@@ -105,10 +105,10 @@ static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
BLOCK *b = &x->block[0];
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
b->src_stride, x->e_mbd.predictor, 16);
b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],
&x->e_mbd.predictor[320], 8);
x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,
x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
}
static void build_dcblock(MACROBLOCK *x)
@@ -625,7 +625,7 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
vp8_build_inter_predictors_mb_e(&x->e_mbd);
vp8_build_inter_predictors_mb(&x->e_mbd);
vp8_subtract_mb(rtcd, x);
@@ -635,7 +635,6 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
if (x->optimize)
optimize_mb(x, rtcd);
}
/* this function is used by the first pass only */
@@ -643,15 +642,15 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
BLOCK *b = &x->block[0];
vp8_build_inter16x16_predictors_mby(&x->e_mbd);
vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
x->e_mbd.dst.y_stride);
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
b->src_stride, x->e_mbd.predictor, 16);
b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
transform_mby(x);
vp8_quantize_mby(x);
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(rtcd->common));
}

View File

@@ -12,6 +12,7 @@
#ifndef __INC_ENCODEMB_H
#define __INC_ENCODEMB_H
#include "vpx_config.h"
#include "block.h"

View File

@@ -38,7 +38,7 @@ static THREAD_FUNCTION loopfilter_thread(void *p_data)
if (sem_wait(&cpi->h_event_start_lpf) == 0)
{
if (cpi->b_multi_threaded == FALSE) // we're shutting down
if (cpi->b_multi_threaded == 0) // we're shutting down
break;
loopfilter_frame(cpi, cm);
@@ -78,7 +78,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
int *segment_counts = mbri->segment_counts;
int *totalrate = &mbri->totalrate;
if (cpi->b_multi_threaded == FALSE) // we're shutting down
if (cpi->b_multi_threaded == 0) // we're shutting down
break;
for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
@@ -302,7 +302,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->mv_col_max = x->mv_col_max;
z->mv_row_min = x->mv_row_min;
z->mv_row_max = x->mv_row_max;
z->vector_range = x->vector_range ;
*/
z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4;
@@ -350,8 +349,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->block[i].src = x->block[i].src;
*/
z->block[i].src_stride = x->block[i].src_stride;
z->block[i].force_empty = x->block[i].force_empty;
}
{
@@ -387,10 +384,22 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
zd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
for (i = 0; i < 25; i++)
{
zd->block[i].dequant = xd->block[i].dequant;
}
vpx_memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
vpx_memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
vpx_memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
vpx_memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
#if 1
/*TODO: Remove dequant from BLOCKD. This is a temporary solution until
* the quantizer code uses a passed in pointer to the dequant constants.
* This will also require modifications to the x86 and neon assembly.
* */
for (i = 0; i < 16; i++)
zd->block[i].dequant = zd->dequant_y1;
for (i = 16; i < 24; i++)
zd->block[i].dequant = zd->dequant_uv;
zd->block[24].dequant = zd->dequant_y2;
#endif
}
}
@@ -421,8 +430,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
#endif
mb->gf_active_ptr = x->gf_active_ptr;
mb->vector_range = 32;
vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
mbr_ei[i].totalrate = 0;

View File

@@ -267,8 +267,8 @@ static void avg_stats(FIRSTPASS_STATS *section)
// Calculate a modified Error used in distributing bits between easier and harder frames
static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
double av_err = ( cpi->twopass.total_stats->ssim_weighted_pred_err /
cpi->twopass.total_stats->count );
double av_err = ( cpi->twopass.total_stats.ssim_weighted_pred_err /
cpi->twopass.total_stats.count );
double this_err = this_frame->ssim_weighted_pred_err;
double modified_err;
@@ -373,7 +373,7 @@ static int frame_max_bits(VP8_COMP *cpi)
else
{
// For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
}
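/* Worked example with hypothetical numbers: bits_left = 8,000,000 with 100
 * frames remaining averages 80,000 bits/frame, and a two_pass_vbrmax_section
 * of 400 (%) caps any single frame at 80,000 * 4.0 = 320,000 bits. */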
// Trap case where we are out of bits
@@ -385,12 +385,12 @@ static int frame_max_bits(VP8_COMP *cpi)
void vp8_init_first_pass(VP8_COMP *cpi)
{
zero_stats(cpi->twopass.total_stats);
zero_stats(&cpi->twopass.total_stats);
}
void vp8_end_first_pass(VP8_COMP *cpi)
{
output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
}
static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
@@ -804,17 +804,17 @@ void vp8_first_pass(VP8_COMP *cpi)
- cpi->source->ts_start;
// don't want to do output stats with a stack variable!
memcpy(cpi->twopass.this_frame_stats,
memcpy(&cpi->twopass.this_frame_stats,
&fps,
sizeof(FIRSTPASS_STATS));
output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
accumulate_stats(cpi->twopass.total_stats, &fps);
output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
accumulate_stats(&cpi->twopass.total_stats, &fps);
}
// Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met
if ((cm->current_video_frame > 0) &&
(cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
((cpi->twopass.this_frame_stats->intra_error / cpi->twopass.this_frame_stats->coded_error) > 2.0))
(cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0))
{
vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
}
@@ -861,7 +861,7 @@ double bitcost( double prob )
{
return -(log( prob ) / log( 2.0 ));
}
static long long estimate_modemvcost(VP8_COMP *cpi,
static int64_t estimate_modemvcost(VP8_COMP *cpi,
FIRSTPASS_STATS * fpstats)
{
int mv_cost;
@@ -1019,7 +1019,7 @@ static int estimate_max_q(VP8_COMP *cpi,
// average q observed in clip for non kf/gf/arf frames
// Give average a chance to settle though.
if ( (cpi->ni_frames >
((unsigned int)cpi->twopass.total_stats->count >> 8)) &&
((unsigned int)cpi->twopass.total_stats.count >> 8)) &&
(cpi->ni_frames > 150) )
{
cpi->twopass.maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
@@ -1075,8 +1075,8 @@ static int estimate_cq( VP8_COMP *cpi,
}
// II ratio correction factor for clip as a whole
clip_iiratio = cpi->twopass.total_stats->intra_error /
DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
clip_iiratio = cpi->twopass.total_stats.intra_error /
DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
if (clip_iifactor < 0.80)
clip_iifactor = 0.80;
@@ -1260,25 +1260,25 @@ void vp8_init_second_pass(VP8_COMP *cpi)
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
zero_stats(cpi->twopass.total_stats);
zero_stats(cpi->twopass.total_left_stats);
zero_stats(&cpi->twopass.total_stats);
zero_stats(&cpi->twopass.total_left_stats);
if (!cpi->twopass.stats_in_end)
return;
*cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
*cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
cpi->twopass.total_left_stats = cpi->twopass.total_stats;
// each frame can have a different duration, as the frame rate in the source
// isn't guaranteed to be constant. The frame rate prior to the first frame
// encoded in the second pass is a guess. However the sum duration is not.
// It's calculated based on the actual durations of all frames from the first
// pass.
vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats->count / cpi->twopass.total_stats->duration);
vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
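/* (Worked example with hypothetical totals: count = 300 frames and
 * duration = 100000000 -- 10 seconds expressed in 1/10,000,000-second
 * units -- gives 10000000.0 * 300 / 100000000 = 30 fps.) */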
cpi->output_frame_rate = cpi->frame_rate;
cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration * two_pass_min_rate / 10000000.0);
cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);
// Calculate a minimum intra value to be used in determining the IIratio
// scores used in the second pass. We have this minimum to make sure
@@ -1301,7 +1301,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
sum_iiratio += IIRatio;
}
cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
// Reset file position
reset_fpf_position(cpi, start_pos);
@@ -1376,7 +1376,7 @@ static int detect_transition_to_still(
double loop_decay_rate,
double decay_accumulator )
{
BOOL trans_to_still = FALSE;
int trans_to_still = 0;
// Break clause to detect very still sections after motion
// For example a static image after a fade or other transition
@@ -1406,7 +1406,7 @@ static int detect_transition_to_still(
// Only if it does do we signal a transition to still
if ( j == still_interval )
trans_to_still = TRUE;
trans_to_still = 1;
}
return trans_to_still;
@@ -1415,14 +1415,14 @@ static int detect_transition_to_still(
// This function detects a flash through the high relative pcnt_second_ref
// score in the frame following a flash frame. The offset passed in should
// reflect this
static BOOL detect_flash( VP8_COMP *cpi, int offset )
static int detect_flash( VP8_COMP *cpi, int offset )
{
FIRSTPASS_STATS next_frame;
BOOL flash_detected = FALSE;
int flash_detected = 0;
// Read the frame data.
// The return is FALSE (no flash detected) if not a valid frame
// The return is 0 (no flash detected) if not a valid frame
if ( read_frame_stats(cpi, &next_frame, offset) != EOF )
{
// What we are looking for here is a situation where there is a
@@ -1433,7 +1433,7 @@ static BOOL detect_flash( VP8_COMP *cpi, int offset )
if ( (next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
(next_frame.pcnt_second_ref >= 0.5 ) )
{
flash_detected = TRUE;
flash_detected = 1;
/*if (1)
{
@@ -1548,7 +1548,7 @@ static int calc_arf_boost(
double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
double r;
BOOL flash_detected = FALSE;
int flash_detected = 0;
// Search forward from the proposed arf/next gf position
for ( i = 0; i < f_frames; i++ )
@@ -1677,7 +1677,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
int alt_boost = 0;
int f_boost = 0;
int b_boost = 0;
BOOL flash_detected;
int flash_detected;
cpi->twopass.gf_group_bits = 0;
cpi->twopass.gf_decay_rate = 0;
@@ -1751,7 +1751,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
loop_decay_rate,
decay_accumulator ) )
{
allow_alt_ref = FALSE;
allow_alt_ref = 0;
boost_score = old_boost_score;
break;
}
@@ -1923,7 +1923,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
cpi->source_alt_ref_pending = TRUE;
cpi->source_alt_ref_pending = 1;
// For alt ref frames the error score for the end frame of the
// group (the alt ref frame) should not contribute to the group
@@ -1949,7 +1949,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// Note: this_frame->frame has been updated in the loop
// so it now points at the ARF frame.
half_gf_int = cpi->baseline_gf_interval >> 1;
frames_after_arf = cpi->twopass.total_stats->count -
frames_after_arf = cpi->twopass.total_stats.count -
this_frame->frame - 1;
switch (cpi->oxcf.arnr_type)
@@ -1989,13 +1989,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
}
else
{
cpi->source_alt_ref_pending = FALSE;
cpi->source_alt_ref_pending = 0;
cpi->baseline_gf_interval = i;
}
}
else
{
cpi->source_alt_ref_pending = FALSE;
cpi->source_alt_ref_pending = 0;
cpi->baseline_gf_interval = i;
}
@@ -2005,7 +2005,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
// This is also important for short clips where there may only be one
// key frame.
if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
cpi->common.current_video_frame))
{
cpi->twopass.kf_group_bits =
@@ -2296,7 +2296,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
void vp8_second_pass(VP8_COMP *cpi)
{
int tmp_q;
int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
FIRSTPASS_STATS this_frame = {0};
FIRSTPASS_STATS this_frame_copy;
@@ -2341,7 +2341,7 @@ void vp8_second_pass(VP8_COMP *cpi)
cpi->twopass.gf_group_error_left = cpi->twopass.kf_group_error_left;
cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
cpi->source_alt_ref_pending = FALSE;
cpi->source_alt_ref_pending = 0;
}
}
@@ -2411,7 +2411,7 @@ void vp8_second_pass(VP8_COMP *cpi)
// Account for mv, mode and other overheads.
overhead_bits = estimate_modemvcost(
cpi, cpi->twopass.total_left_stats );
cpi, &cpi->twopass.total_left_stats );
// Special case code for first frame.
if (cpi->common.current_video_frame == 0)
@@ -2425,7 +2425,7 @@ void vp8_second_pass(VP8_COMP *cpi)
est_cq =
estimate_cq( cpi,
cpi->twopass.total_left_stats,
&cpi->twopass.total_left_stats,
(int)(cpi->twopass.bits_left / frames_left),
overhead_bits );
@@ -2440,7 +2440,7 @@ void vp8_second_pass(VP8_COMP *cpi)
tmp_q = estimate_max_q(
cpi,
cpi->twopass.total_left_stats,
&cpi->twopass.total_left_stats,
(int)(cpi->twopass.bits_left / frames_left),
overhead_bits );
@@ -2463,16 +2463,16 @@ void vp8_second_pass(VP8_COMP *cpi)
// radical adjustments to the allowed quantizer range just to use up a
// few surplus bits or get beneath the target rate.
else if ( (cpi->common.current_video_frame <
(((unsigned int)cpi->twopass.total_stats->count * 255)>>8)) &&
(((unsigned int)cpi->twopass.total_stats.count * 255)>>8)) &&
((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
(unsigned int)cpi->twopass.total_stats->count) )
(unsigned int)cpi->twopass.total_stats.count) )
{
if (frames_left < 1)
frames_left = 1;
tmp_q = estimate_max_q(
cpi,
cpi->twopass.total_left_stats,
&cpi->twopass.total_left_stats,
(int)(cpi->twopass.bits_left / frames_left),
overhead_bits );
@@ -2489,13 +2489,13 @@ void vp8_second_pass(VP8_COMP *cpi)
cpi->twopass.frames_to_key --;
// Update the total stats remaining structure
subtract_stats(cpi->twopass.total_left_stats, &this_frame );
subtract_stats(&cpi->twopass.total_left_stats, &this_frame );
}
static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame)
static int test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame)
{
BOOL is_viable_kf = FALSE;
int is_viable_kf = 0;
// Does the frame satisfy the primary criteria of a key frame
// If so, then examine how well it predicts subsequent frames
@@ -2569,13 +2569,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
// If there is tolerable prediction for at least the next 3 frames, then break out; else discard this potential key frame and move on
if (boost_score > 5.0 && (i > 3))
is_viable_kf = TRUE;
is_viable_kf = 1;
else
{
// Reset the file position
reset_fpf_position(cpi, start_pos);
is_viable_kf = FALSE;
is_viable_kf = 0;
}
}
@@ -2611,7 +2611,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
cpi->this_key_frame_forced = cpi->next_key_frame_forced;
// Clear the alt ref active flag as this can never be active on a key frame
cpi->source_alt_ref_active = FALSE;
cpi->source_alt_ref_active = 0;
// Kf is always a gf so clear frames till next gf counter
cpi->frames_till_gf_update_due = 0;
@@ -2727,10 +2727,10 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// Reset to the start of the group
reset_fpf_position(cpi, current_pos);
cpi->next_key_frame_forced = TRUE;
cpi->next_key_frame_forced = 1;
}
else
cpi->next_key_frame_forced = FALSE;
cpi->next_key_frame_forced = 0;
// Special case for the last frame of the file
if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
@@ -3034,8 +3034,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (cpi->oxcf.allow_spatial_resampling)
{
int resample_trigger = FALSE;
int last_kf_resampled = FALSE;
int resample_trigger = 0;
int last_kf_resampled = 0;
int kf_q;
int scale_val = 0;
int hr, hs, vr, vs;
@@ -3053,14 +3053,14 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double effective_size_ratio;
if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height))
last_kf_resampled = TRUE;
last_kf_resampled = 1;
// Set back to unscaled by defaults
cpi->common.horiz_scale = NORMAL;
cpi->common.vert_scale = NORMAL;
// Calculate Average bits per frame.
//av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
//av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);
//if ( av_bits_per_frame < 0.0 )
// av_bits_per_frame = 0.0
@@ -3117,21 +3117,21 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
(last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))))
//( ((cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))) &&
// ((projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))) ))
resample_trigger = TRUE;
resample_trigger = 1;
else
resample_trigger = FALSE;
resample_trigger = 0;
}
else
{
int64_t clip_bits = (int64_t)(cpi->twopass.total_stats->count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
if ((last_kf_resampled && (kf_q > cpi->worst_quality)) || // If triggered last time the threshold for triggering again is reduced
((kf_q > cpi->worst_quality) && // Projected Q higher than allowed and ...
(over_spend > clip_bits / 20))) // ... Overspend > 5% of total bits
resample_trigger = TRUE;
resample_trigger = 1;
else
resample_trigger = FALSE;
resample_trigger = 0;
}

View File

@@ -48,7 +48,7 @@ vp8_lookahead_destroy(struct lookahead_ctx *ctx)
{
if(ctx->buf)
{
int i;
unsigned int i;
for(i = 0; i < ctx->max_sz; i++)
vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
@@ -65,7 +65,7 @@ vp8_lookahead_init(unsigned int width,
unsigned int depth)
{
struct lookahead_ctx *ctx = NULL;
int i;
unsigned int i;
/* Clamp the lookahead queue depth */
if(depth < 1)
@@ -188,7 +188,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
struct lookahead_entry*
vp8_lookahead_peek(struct lookahead_ctx *ctx,
int index)
unsigned int index)
{
struct lookahead_entry* buf = NULL;

View File

@@ -92,7 +92,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
*/
struct lookahead_entry*
vp8_lookahead_peek(struct lookahead_ctx *ctx,
int index);
unsigned int index);
/**\brief Get the number of frames currently in the lookahead queue

View File

@@ -9,6 +9,7 @@
*/
#include "onyx_int.h"
#include "mcomp.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_config.h"
@@ -182,8 +183,6 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
#define MIN(x,y) (((x)<(y))?(x):(y))
#define MAX(x,y) (((x)>(y))?(x):(y))
int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int_mv *bestmv, int_mv *ref_mv,
@@ -331,8 +330,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
#undef IFMVCV
#undef ERR
#undef CHECK_BETTER
#undef MIN
#undef MAX
int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
@@ -854,6 +852,8 @@ int vp8_hex_search
int k = -1;
int all_in;
int best_site = -1;
int hex_range = 127;
int dia_range = 8;
int_mv fcenter_mv;
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
@@ -873,6 +873,18 @@ int vp8_hex_search
in_what_stride, 0x7fffffff)
+ mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
#if CONFIG_MULTI_RES_ENCODING
/* Lower search range based on prediction info */
if (search_param >= 6) goto cal_neighbors;
else if (search_param >= 5) hex_range = 4;
else if (search_param >= 4) hex_range = 6;
else if (search_param >= 3) hex_range = 15;
else if (search_param >= 2) hex_range = 31;
else if (search_param >= 1) hex_range = 63;
dia_range = 8;
#endif
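/* Net effect: the higher the search_param (i.e. the more the lower-resolution
 * result is trusted), the tighter the hex stage: 63, 31, 15, 6, then 4
 * iterations, and at search_param >= 6 the hex stage is skipped entirely,
 * leaving only the 1-away diamond refinement below. */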
// hex search
//j=0
CHECK_BOUNDS(2)
@@ -909,7 +921,7 @@ int vp8_hex_search
k = best_site;
}
for (j = 1; j < 127; j++)
for (j = 1; j < hex_range; j++)
{
best_site = -1;
CHECK_BOUNDS(2)
@@ -951,7 +963,7 @@ int vp8_hex_search
// check 4 1-away neighbors
cal_neighbors:
for (j = 0; j < 32; j++)
for (j = 0; j < dia_range; j++)
{
best_site = -1;
CHECK_BOUNDS(1)
@@ -1144,7 +1156,7 @@ int vp8_diamond_search_sadx4
int tot_steps;
int_mv this_mv;
int bestsad = INT_MAX;
unsigned int bestsad = UINT_MAX;
int best_site = 0;
int last_site = 0;
@@ -1385,7 +1397,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
unsigned char *bestaddress;
int_mv *best_mv = &d->bmi.mv;
int_mv this_mv;
int bestsad = INT_MAX;
unsigned int bestsad = UINT_MAX;
int r, c;
unsigned char *check_here;
@@ -1515,7 +1527,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
unsigned char *bestaddress;
int_mv *best_mv = &d->bmi.mv;
int_mv this_mv;
int bestsad = INT_MAX;
unsigned int bestsad = UINT_MAX;
int r, c;
unsigned char *check_here;

vp8/encoder/mr_dissim.c (new file, 201 lines)
View File

@@ -0,0 +1,201 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <limits.h>
#include "vpx_config.h"
#include "onyx_int.h"
#include "mr_dissim.h"
#include "vpx_mem/vpx_mem.h"
#include "rdopt.h"
void vp8_cal_low_res_mb_cols(VP8_COMP *cpi)
{
int low_res_w;
/* Support arbitrary down-sampling factor */
unsigned int iw = cpi->oxcf.Width*cpi->oxcf.mr_down_sampling_factor.den
+ cpi->oxcf.mr_down_sampling_factor.num - 1;
low_res_w = iw/cpi->oxcf.mr_down_sampling_factor.num;
cpi->mr_low_res_mb_cols = ((low_res_w + 15) >> 4);
}
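/* Worked example with a hypothetical 640-pixel-wide input and a 2:1
 * down-sampling factor (num = 2, den = 1):
 *   iw                 = 640 * 1 + 2 - 1   = 641
 *   low_res_w          = 641 / 2           = 320
 *   mr_low_res_mb_cols = (320 + 15) >> 4   = 20
 */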
#define GET_MV(x) \
if(x->mbmi.ref_frame !=INTRA_FRAME) \
{ \
mvx[cnt] = x->mbmi.mv.as_mv.row; \
mvy[cnt] = x->mbmi.mv.as_mv.col; \
cnt++; \
}
#define GET_MV_SIGN(x) \
if(x->mbmi.ref_frame !=INTRA_FRAME) \
{ \
mvx[cnt] = x->mbmi.mv.as_mv.row; \
mvy[cnt] = x->mbmi.mv.as_mv.col; \
if (cm->ref_frame_sign_bias[x->mbmi.ref_frame] \
!= cm->ref_frame_sign_bias[tmp->mbmi.ref_frame]) \
{ \
mvx[cnt] *= -1; \
mvy[cnt] *= -1; \
} \
cnt++; \
}
void vp8_cal_dissimilarity(VP8_COMP *cpi)
{
VP8_COMMON *cm = &cpi->common;
/* Note: The first row & first column in mip are outside the frame, which
* were initialized to all 0.(ref_frame, mode, mv...)
* Their ref_frame = 0 means they won't be counted in the following
* calculation.
*/
if (cpi->oxcf.mr_total_resolutions >1
&& cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1))
{
/* Store info for show/no-show frames for supporting alt_ref.
* If parent frame is alt_ref, child has one too.
*/
if(cm->frame_type != KEY_FRAME)
{
int mb_row;
int mb_col;
/* Point to beginning of allocated MODE_INFO arrays. */
MODE_INFO *tmp = cm->mip + cm->mode_info_stride;
LOWER_RES_INFO* store_mode_info
= (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info;
for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
{
tmp++;
for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
{
int dissim = INT_MAX;
if(tmp->mbmi.ref_frame !=INTRA_FRAME)
{
int mvx[8];
int mvy[8];
int mmvx;
int mmvy;
int cnt=0;
const MODE_INFO *here = tmp;
const MODE_INFO *above = here - cm->mode_info_stride;
const MODE_INFO *left = here - 1;
const MODE_INFO *aboveleft = above - 1;
const MODE_INFO *aboveright = NULL;
const MODE_INFO *right = NULL;
const MODE_INFO *belowleft = NULL;
const MODE_INFO *below = NULL;
const MODE_INFO *belowright = NULL;
/* If alternate reference frame is used, we have to
* check sign of MV. */
if(cpi->oxcf.play_alternate)
{
/* Gather mv of neighboring MBs */
GET_MV_SIGN(above)
GET_MV_SIGN(left)
GET_MV_SIGN(aboveleft)
if(mb_col < (cm->mb_cols-1))
{
right = here + 1;
aboveright = above + 1;
GET_MV_SIGN(right)
GET_MV_SIGN(aboveright)
}
if(mb_row < (cm->mb_rows-1))
{
below = here + cm->mode_info_stride;
belowleft = below - 1;
GET_MV_SIGN(below)
GET_MV_SIGN(belowleft)
}
if(mb_col < (cm->mb_cols-1)
&& mb_row < (cm->mb_rows-1))
{
belowright = below + 1;
GET_MV_SIGN(belowright)
}
}else
{
/* No alt_ref and gather mv of neighboring MBs */
GET_MV(above)
GET_MV(left)
GET_MV(aboveleft)
if(mb_col < (cm->mb_cols-1))
{
right = here + 1;
aboveright = above + 1;
GET_MV(right)
GET_MV(aboveright)
}
if(mb_row < (cm->mb_rows-1))
{
below = here + cm->mode_info_stride;
belowleft = below - 1;
GET_MV(below)
GET_MV(belowleft)
}
if(mb_col < (cm->mb_cols-1)
&& mb_row < (cm->mb_rows-1))
{
belowright = below + 1;
GET_MV(belowright)
}
}
if (cnt > 0)
{
int max_mvx = mvx[0];
int min_mvx = mvx[0];
int max_mvy = mvy[0];
int min_mvy = mvy[0];
int i;
if (cnt > 1)
{
for (i=1; i< cnt; i++)
{
if (mvx[i] > max_mvx) max_mvx = mvx[i];
else if (mvx[i] < min_mvx) min_mvx = mvx[i];
if (mvy[i] > max_mvy) max_mvy = mvy[i];
else if (mvy[i] < min_mvy) min_mvy = mvy[i];
}
}
mmvx = MAX(abs(min_mvx - here->mbmi.mv.as_mv.row),
abs(max_mvx - here->mbmi.mv.as_mv.row));
mmvy = MAX(abs(min_mvy - here->mbmi.mv.as_mv.col),
abs(max_mvy - here->mbmi.mv.as_mv.col));
dissim = MAX(mmvx, mmvy);
}
}
/* Store mode info for next resolution encoding */
store_mode_info->mode = tmp->mbmi.mode;
store_mode_info->ref_frame = tmp->mbmi.ref_frame;
store_mode_info->mv.as_int = tmp->mbmi.mv.as_int;
store_mode_info->dissim = dissim;
tmp++;
store_mode_info++;
}
}
}
}
}

View File

@@ -9,10 +9,11 @@
*/
#ifndef __INC_COMMON_TYPES
#define __INC_COMMON_TYPES
#ifndef __INC_MR_DISSIM_H
#define __INC_MR_DISSIM_H
#include "vpx_config.h"
#define TRUE 1
#define FALSE 0
extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi);
extern void vp8_cal_dissimilarity(VP8_COMP *cpi);
#endif

File diff suppressed because it is too large

View File

@@ -58,6 +58,9 @@
#define MAX_PERIODICITY 16
#define MAX(x,y) (((x)>(y))?(x):(y))
#define MIN(x,y) (((x)<(y))?(x):(y))
typedef struct
{
int kf_indicated;
@@ -133,32 +136,32 @@ typedef struct
typedef enum
{
THR_ZEROMV = 0,
THR_ZERO1 = 0,
THR_DC = 1,
THR_NEARESTMV = 2,
THR_NEARMV = 3,
THR_NEAREST1 = 2,
THR_NEAR1 = 3,
THR_ZEROG = 4,
THR_NEARESTG = 5,
THR_ZERO2 = 4,
THR_NEAREST2 = 5,
THR_ZEROA = 6,
THR_NEARESTA = 7,
THR_ZERO3 = 6,
THR_NEAREST3 = 7,
THR_NEARG = 8,
THR_NEARA = 9,
THR_NEAR2 = 8,
THR_NEAR3 = 9,
THR_V_PRED = 10,
THR_H_PRED = 11,
THR_TM = 12,
THR_NEWMV = 13,
THR_NEWG = 14,
THR_NEWA = 15,
THR_NEW1 = 13,
THR_NEW2 = 14,
THR_NEW3 = 15,
THR_SPLITMV = 16,
THR_SPLITG = 17,
THR_SPLITA = 18,
THR_SPLIT1 = 16,
THR_SPLIT2 = 17,
THR_SPLIT3 = 18,
THR_B_PRED = 19,
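/* The renamed entries index the reference slot rather than naming the frame:
 * 1 = LAST, 2 = GOLDEN, 3 = ALTREF (inferred from the old names above). */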
}
@@ -250,13 +253,16 @@ typedef struct
int starting_buffer_level;
int optimal_buffer_level;
int maximum_buffer_size;
int starting_buffer_level_in_ms;
int optimal_buffer_level_in_ms;
int maximum_buffer_size_in_ms;
int avg_frame_size_for_layer;
int buffer_level;
int bits_off_target;
long long total_actual_bits;
int64_t total_actual_bits;
int total_target_vs_actual;
int worst_quality;
@@ -276,7 +282,7 @@ typedef struct
int zbin_over_quant;
int inter_frame_target;
INT64 total_byte_count;
int64_t total_byte_count;
int filter_level;
@@ -336,7 +342,7 @@ typedef struct VP8_COMP
int gold_is_alt; // don't do both alt and gold search ( just do gold).
//int refresh_alt_ref_frame;
YV12_BUFFER_CONFIG last_frame_uf;
YV12_BUFFER_CONFIG pick_lf_lvl_frame;
TOKENEXTRA *tok;
unsigned int tok_count;
@@ -418,6 +424,7 @@ typedef struct VP8_COMP
int buffered_mode;
double frame_rate;
double ref_frame_rate;
int64_t buffer_level;
int bits_off_target;
@@ -565,16 +572,21 @@ typedef struct VP8_COMP
int base_skip_false_prob[128];
FRAME_CONTEXT lfc_n; /* last frame entropy */
FRAME_CONTEXT lfc_a; /* last alt ref entropy */
FRAME_CONTEXT lfc_g; /* last gold ref entropy */
struct twopass_rc
{
unsigned int section_intra_rating;
double section_max_qfactor;
unsigned int next_iiratio;
unsigned int this_iiratio;
FIRSTPASS_STATS *total_stats;
FIRSTPASS_STATS *this_frame_stats;
FIRSTPASS_STATS total_stats;
FIRSTPASS_STATS this_frame_stats;
FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
FIRSTPASS_STATS *total_left_stats;
FIRSTPASS_STATS total_left_stats;
int first_pass_done;
int64_t bits_left;
int64_t clip_bits_total;
@@ -665,8 +677,8 @@ typedef struct VP8_COMP
unsigned int current_layer;
LAYER_CONTEXT layer_context[MAX_LAYERS];
long long frames_in_layer[MAX_LAYERS];
long long bytes_in_layer[MAX_LAYERS];
int64_t frames_in_layer[MAX_LAYERS];
int64_t bytes_in_layer[MAX_LAYERS];
double sum_psnr[MAX_LAYERS];
double sum_psnr_p[MAX_LAYERS];
double total_error2[MAX_LAYERS];
@@ -679,6 +691,11 @@ typedef struct VP8_COMP
double total_ssimg_v_in_layer[MAX_LAYERS];
double total_ssimg_all_in_layer[MAX_LAYERS];
#if CONFIG_MULTI_RES_ENCODING
/* Number of MBs per row at lower-resolution level */
int mr_low_res_mb_cols;
#endif
} VP8_COMP;
void control_data_rate(VP8_COMP *cpi);

View File

@@ -21,7 +21,6 @@
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra.h"
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/g_common.h"
#include "variance.h"
#include "mcomp.h"
#include "rdopt.h"
@@ -39,7 +38,7 @@ extern int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd);
extern unsigned int cnt_pm;
#endif
extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];
extern const int vp8_ref_frame_order[MAX_MODES];
extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
@@ -402,17 +401,78 @@ static void update_mvcount(VP8_COMP *cpi, MACROBLOCKD *xd, int_mv *best_ref_mv)
}
}
#if CONFIG_MULTI_RES_ENCODING
static
void get_lower_res_motion_info(VP8_COMP *cpi, MACROBLOCKD *xd, int *dissim,
int *parent_ref_frame,
MB_PREDICTION_MODE *parent_mode,
int_mv *parent_ref_mv, int mb_row, int mb_col)
{
LOWER_RES_INFO* store_mode_info
= (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info;
unsigned int parent_mb_index;
//unsigned int parent_mb_index = map_640x480_to_320x240[mb_row][mb_col];
/* Consider different down_sampling_factor. */
{
/* TODO: Removed the loop that supports special down_sampling_factor
* such as 2, 4, 8. Will revisit it if needed.
* Should also try using a look-up table to see if it helps
* performance. */
int round = cpi->oxcf.mr_down_sampling_factor.num/2;
int parent_mb_row, parent_mb_col;
parent_mb_row = (mb_row*cpi->oxcf.mr_down_sampling_factor.den+round)
/cpi->oxcf.mr_down_sampling_factor.num;
parent_mb_col = (mb_col*cpi->oxcf.mr_down_sampling_factor.den+round)
/cpi->oxcf.mr_down_sampling_factor.num;
parent_mb_index = parent_mb_row*cpi->mr_low_res_mb_cols + parent_mb_col;
}
/* Read lower-resolution mode & motion result from memory.*/
*parent_ref_frame = store_mode_info[parent_mb_index].ref_frame;
*parent_mode = store_mode_info[parent_mb_index].mode;
*dissim = store_mode_info[parent_mb_index].dissim;
/* For the highest-resolution encoder, halve the dissim value: give up a
 * little quality in exchange for better performance. */
if (cpi->oxcf.mr_encoder_id == (cpi->oxcf.mr_total_resolutions - 1))
*dissim>>=1;
if(*parent_ref_frame != INTRA_FRAME)
{
/* Consider different down_sampling_factor.
* The result can be rounded to be more precise, but it takes more time.
*/
//int round = cpi->oxcf.mr_down_sampling_factor.den/2;
(*parent_ref_mv).as_mv.row = store_mode_info[parent_mb_index].mv.as_mv.row
*cpi->oxcf.mr_down_sampling_factor.num
/cpi->oxcf.mr_down_sampling_factor.den;
(*parent_ref_mv).as_mv.col = store_mode_info[parent_mb_index].mv.as_mv.col
*cpi->oxcf.mr_down_sampling_factor.num
/cpi->oxcf.mr_down_sampling_factor.den;
vp8_clamp_mv2(parent_ref_mv, xd);
}
}
#endif
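For reference, the parent-macroblock lookup above reduces to a rounded rescale of the MB coordinates. A minimal self-contained sketch follows; the helper name parent_mb_index_for and its standalone form are assumptions for illustration, but the arithmetic mirrors the block above.

/* Maps a macroblock position in the current (higher-resolution) layer to
 * the index of the co-located macroblock in the lower-resolution layer.
 * num/den is the down-sampling factor (e.g. 2/1 for half resolution) and
 * low_res_mb_cols is the lower layer's width in macroblocks. */
static unsigned int parent_mb_index_for(int mb_row, int mb_col,
                                        int num, int den,
                                        int low_res_mb_cols)
{
    int round = num / 2;   /* round to the nearest lower-layer MB */
    int parent_row = (mb_row * den + round) / num;
    int parent_col = (mb_col * den + round) / num;
    return (unsigned int)(parent_row * low_res_mb_cols + parent_col);
}

With a 2:1 factor and a 320x240 lower layer (20 MB columns), MB (5, 13) in the 640x480 layer maps to parent MB (3, 7), i.e. index 67.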
void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int recon_uvoffset, int *returnrate,
int *returndistortion, int *returnintra)
int *returndistortion, int *returnintra, int mb_row,
int mb_col)
{
BLOCK *b = &x->block[0];
BLOCKD *d = &x->e_mbd.block[0];
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO best_mbmode;
int_mv best_ref_mv_sb[2];
int_mv mode_mv_sb[2][MB_MODE_COUNT];
int_mv best_ref_mv;
int_mv mode_mv[MB_MODE_COUNT];
int_mv *mode_mv;
MB_PREDICTION_MODE this_mode;
int num00;
int mdcounts[4];
@@ -422,68 +482,62 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int rate;
int rate2;
int distortion2;
int bestsme;
//int all_rds[MAX_MODES]; // Experimental debug code.
int bestsme = INT_MAX;
int best_mode_index = 0;
unsigned int sse = INT_MAX, best_sse = INT_MAX;
int_mv mvp;
int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
int saddone=0;
int sr=0; // search range obtained from mv_pred(); it uses step_param levels (0-7)
int_mv nearest_mv[4];
int_mv near_mv[4];
int_mv frame_best_ref_mv[4];
int MDCounts[4][4];
unsigned char *y_buffer[4];
unsigned char *u_buffer[4];
unsigned char *v_buffer[4];
unsigned char *plane[4][3];
int ref_frame_map[4];
int sign_bias = 0;
int skip_mode[4] = {0, 0, 0, 0};
int found_near_mvs[4] = {0, 0, 0, 0};
int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode,
when Speed >= 15, no sub-pixel search. */
int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode, when Speed >= 15, no sub-pixel search. */
#if CONFIG_MULTI_RES_ENCODING
int dissim = INT_MAX;
int parent_ref_frame = 0;
int_mv parent_ref_mv;
MB_PREDICTION_MODE parent_mode = 0;
vpx_memset(mode_mv, 0, sizeof(mode_mv));
vpx_memset(nearest_mv, 0, sizeof(nearest_mv));
vpx_memset(near_mv, 0, sizeof(near_mv));
if (cpi->oxcf.mr_encoder_id)
get_lower_res_motion_info(cpi, xd, &dissim, &parent_ref_frame,
&parent_mode, &parent_ref_mv, mb_row, mb_col);
#endif
mode_mv = mode_mv_sb[sign_bias];
best_ref_mv.as_int = 0;
vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
/* Setup search priorities */
get_reference_search_order(cpi, ref_frame_map);
// set up all the refframe dependent pointers.
if (cpi->ref_frame_flags & VP8_LAST_FLAG)
/* Check to see if there is at least 1 valid reference frame that we need
* to calculate near_mvs.
*/
if (ref_frame_map[1] > 0)
{
YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset;
u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset;
}
else
skip_mode[LAST_FRAME] = 1;
sign_bias = vp8_find_near_mvs_bias(&x->e_mbd,
x->e_mbd.mode_info_context,
mode_mv_sb,
best_ref_mv_sb,
mdcounts,
ref_frame_map[1],
cpi->common.ref_frame_sign_bias);
if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
{
YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx];
y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset;
u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset;
mode_mv = mode_mv_sb[sign_bias];
best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
}
else
skip_mode[GOLDEN_FRAME] = 1;
if ((cpi->ref_frame_flags & VP8_ALT_FLAG) &&
(cpi->source_alt_ref_active || cpi->oxcf.number_of_layers > 1))
{
YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];
y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset;
}
else
skip_mode[ALTREF_FRAME] = 1;
get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame
cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame
*returnintra = INT_MAX;
x->skip = 0;
@@ -492,54 +546,96 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
// if we encode a new mv this is important
// find the best new motion vector
for (mode_index = 0; mode_index < 1; mode_index++)
for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
{
int frame_cost;
int this_rd = INT_MAX;
int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
if (best_rd <= cpi->rd_threshes[mode_index])
continue;
x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index];
if (skip_mode[x->e_mbd.mode_info_context->mbmi.ref_frame])
if (this_ref_frame < 0)
continue;
// Check to see if the testing frequency for this mode is at its max
// If so then prevent it from being tested and increase the threshold for its testing
if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
#if CONFIG_MULTI_RES_ENCODING
if (cpi->oxcf.mr_encoder_id)
{
//if ( (cpi->mbs_tested_so_far / cpi->mode_test_hit_counts[mode_index]) <= cpi->mode_check_freq[mode_index] )
if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index]))
/* If parent MB is intra, child MB is intra. */
if (!parent_ref_frame && this_ref_frame)
continue;
/* If parent MB is inter, and it is unlikely there are multiple
* objects in parent MB, we use parent ref frame as child MB's
* ref frame. */
if (parent_ref_frame && dissim < 8
&& parent_ref_frame != this_ref_frame)
continue;
}
#endif
// everything but intra
if (x->e_mbd.mode_info_context->mbmi.ref_frame)
{
x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
if (sign_bias != cpi->common.ref_frame_sign_bias[this_ref_frame])
{
// Increase the threshold for coding this mode to make it less likely to be chosen
sign_bias = cpi->common.ref_frame_sign_bias[this_ref_frame];
mode_mv = mode_mv_sb[sign_bias];
best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
}
#if CONFIG_MULTI_RES_ENCODING
if (cpi->oxcf.mr_encoder_id)
{
if (vp8_mode_order[mode_index] == NEARESTMV &&
mode_mv[NEARESTMV].as_int ==0)
continue;
if (vp8_mode_order[mode_index] == NEARMV &&
mode_mv[NEARMV].as_int ==0)
continue;
if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV
&& best_ref_mv.as_int==0) //&& dissim==0
continue;
else if(vp8_mode_order[mode_index] == NEWMV && dissim==0
&& best_ref_mv.as_int==parent_ref_mv.as_int)
continue;
}
#endif
}
/* Check to see if the testing frequency for this mode is at its max
* If so then prevent it from being tested and increase the threshold
* for its testing */
if (cpi->mode_test_hit_counts[mode_index] &&
(cpi->mode_check_freq[mode_index] > 1))
{
if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] *
cpi->mode_test_hit_counts[mode_index]))
{
/* Increase the threshold for coding this mode to make it less
* likely to be chosen */
cpi->rd_thresh_mult[mode_index] += 4;
if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
cpi->rd_threshes[mode_index] =
(cpi->rd_baseline_thresh[mode_index] >> 7) *
cpi->rd_thresh_mult[mode_index];
continue;
}
}
// If nearby MVs haven't been found for this reference frame then do it now.
if (x->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
!found_near_mvs[x->e_mbd.mode_info_context->mbmi.ref_frame])
{
int ref_frame = x->e_mbd.mode_info_context->mbmi.ref_frame;
vp8_find_near_mvs(&x->e_mbd,
x->e_mbd.mode_info_context,
&nearest_mv[ref_frame], &near_mv[ref_frame],
&frame_best_ref_mv[ref_frame],
MDCounts[ref_frame],
ref_frame,
cpi->common.ref_frame_sign_bias);
found_near_mvs[ref_frame] = 1;
}
// We have now reached the point where we are going to test the current mode so increment the counter for the number of times it has been tested
/* We have now reached the point where we are going to test the current
* mode so increment the counter for the number of times it has been
* tested */
cpi->mode_test_hit_counts[mode_index] ++;
rate2 = 0;
@@ -547,42 +643,28 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
this_mode = vp8_mode_order[mode_index];
// Experimental debug code.
//all_rds[mode_index] = -1;
x->e_mbd.mode_info_context->mbmi.mode = this_mode;
x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
// Work out the cost associated with selecting the reference frame
/* Work out the cost associated with selecting the reference frame */
frame_cost =
x->e_mbd.ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
rate2 += frame_cost;
// everything but intra
if (x->e_mbd.mode_info_context->mbmi.ref_frame)
{
x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
mode_mv[NEARMV] = near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
}
// Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
// unless ARNR filtering is enabled in which case we want
// an unfiltered alternative
/* Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
* unless ARNR filtering is enabled in which case we want
* an unfiltered alternative */
if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
{
if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
if (this_mode != ZEROMV ||
x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
continue;
}
switch (this_mode)
{
case B_PRED:
// Pass best so far to pick_intra4x4mby_modes to use as breakout
/* Pass best so far to pick_intra4x4mby_modes to use as breakout */
distortion2 = best_sse;
pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2);
@@ -641,10 +723,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int sadpb = x->sadperbit16;
int_mv mvp_full;
int col_min = (best_ref_mv.as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7)?1:0);
int row_min = (best_ref_mv.as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7)?1:0);
int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL;
int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL;
int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
int col_max = (best_ref_mv.as_mv.col>>3)
+ MAX_FULL_PEL_VAL;
int row_max = (best_ref_mv.as_mv.row>>3)
+ MAX_FULL_PEL_VAL;
int tmp_col_min = x->mv_col_min;
int tmp_col_max = x->mv_col_max;
@@ -656,110 +740,156 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
// Further step/diamond searches as necessary
step_param = cpi->sf.first_step + speed_adjust;
if(cpi->sf.improved_mv_pred)
#if CONFIG_MULTI_RES_ENCODING
if (cpi->oxcf.mr_encoder_id)
{
if(!saddone)
{
vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
saddone = 1;
}
vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
sr += speed_adjust;
//adjust search range according to sr from mv prediction
if(sr > step_param)
step_param = sr;
mvp_full.as_mv.col = mvp.as_mv.col>>3;
mvp_full.as_mv.row = mvp.as_mv.row>>3;
// Use parent MV as predictor. Adjust search range accordingly.
mvp.as_int = parent_ref_mv.as_int;
mvp_full.as_mv.col = parent_ref_mv.as_mv.col>>3;
mvp_full.as_mv.row = parent_ref_mv.as_mv.row>>3;
if(dissim <=32) step_param += 3;
else if(dissim <=128) step_param += 2;
else step_param += 1;
}else
#endif
{
mvp.as_int = best_ref_mv.as_int;
mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3;
mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3;
}
// Get intersection of UMV window and valid MV window to reduce # of checks in diamond search.
if (x->mv_col_min < col_min )
x->mv_col_min = col_min;
if (x->mv_col_max > col_max )
x->mv_col_max = col_max;
if (x->mv_row_min < row_min )
x->mv_row_min = row_min;
if (x->mv_row_max > row_max )
x->mv_row_max = row_max;
further_steps = (cpi->Speed >= 8)? 0: (cpi->sf.max_step_search_steps - 1 - step_param);
if (cpi->sf.search_method == HEX)
{
bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param,
sadpb, &cpi->fn_ptr[BLOCK_16X16],
x->mvsadcost, x->mvcost, &best_ref_mv);
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
}
else
{
bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.mv,
step_param, sadpb, &num00,
&cpi->fn_ptr[BLOCK_16X16],
x->mvcost, &best_ref_mv);
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
// Further step/diamond searches as necessary
n = 0;
//further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
n = num00;
num00 = 0;
while (n < further_steps)
if(cpi->sf.improved_mv_pred)
{
n++;
if (num00)
num00--;
else
if(!saddone)
{
thissme =
cpi->diamond_search_sad(x, b, d, &mvp_full,
&d->bmi.mv,
step_param + n,
sadpb, &num00,
&cpi->fn_ptr[BLOCK_16X16],
x->mvcost, &best_ref_mv);
if (thissme < bestsme)
{
bestsme = thissme;
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
}
else
{
d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
}
vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
saddone = 1;
}
vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context,
&mvp,x->e_mbd.mode_info_context->mbmi.ref_frame,
cpi->common.ref_frame_sign_bias, &sr,
&near_sadidx[0]);
sr += speed_adjust;
//adjust search range according to sr from mv prediction
if(sr > step_param)
step_param = sr;
mvp_full.as_mv.col = mvp.as_mv.col>>3;
mvp_full.as_mv.row = mvp.as_mv.row>>3;
}else
{
mvp.as_int = best_ref_mv.as_int;
mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3;
mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3;
}
}
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
x->mv_row_min = tmp_row_min;
x->mv_row_max = tmp_row_max;
#if CONFIG_MULTI_RES_ENCODING
if (cpi->oxcf.mr_encoder_id && dissim <= 2 &&
MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4)
{
d->bmi.mv.as_int = mvp_full.as_int;
mode_mv[NEWMV].as_int = mvp_full.as_int;
if (bestsme < INT_MAX)
cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv,
x->errorperbit,
&cpi->fn_ptr[BLOCK_16X16],
cpi->mb.mvcost,
&distortion2,&sse);
}else
#endif
{
/* Get intersection of UMV window and valid MV window to
* reduce # of checks in diamond search. */
if (x->mv_col_min < col_min )
x->mv_col_min = col_min;
if (x->mv_col_max > col_max )
x->mv_col_max = col_max;
if (x->mv_row_min < row_min )
x->mv_row_min = row_min;
if (x->mv_row_max > row_max )
x->mv_row_max = row_max;
further_steps = (cpi->Speed >= 8)?
0: (cpi->sf.max_step_search_steps - 1 - step_param);
if (cpi->sf.search_method == HEX)
{
#if CONFIG_MULTI_RES_ENCODING
/* TODO: In higher-res pick_inter_mode, step_param is used to
* modify the hex search range. Here, set step_param to 0 so as not to
* change the behavior of the lowest-resolution encoder.
* Will improve it later.
*/
if (!cpi->oxcf.mr_encoder_id)
step_param = 0;
#endif
bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv,
step_param, sadpb,
&cpi->fn_ptr[BLOCK_16X16],
x->mvsadcost, x->mvcost, &best_ref_mv);
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
}
else
{
bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full,
&d->bmi.mv, step_param, sadpb, &num00,
&cpi->fn_ptr[BLOCK_16X16],
x->mvcost, &best_ref_mv);
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
// Further step/diamond searches as necessary
n = 0;
//further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
n = num00;
num00 = 0;
while (n < further_steps)
{
n++;
if (num00)
num00--;
else
{
thissme =
cpi->diamond_search_sad(x, b, d, &mvp_full,
&d->bmi.mv,
step_param + n,
sadpb, &num00,
&cpi->fn_ptr[BLOCK_16X16],
x->mvcost, &best_ref_mv);
if (thissme < bestsme)
{
bestsme = thissme;
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
}
else
{
d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
}
}
}
}
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
x->mv_row_min = tmp_row_min;
x->mv_row_max = tmp_row_max;
if (bestsme < INT_MAX)
cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv,
&best_ref_mv, x->errorperbit,
&cpi->fn_ptr[BLOCK_16X16],
cpi->mb.mvcost,
&distortion2,&sse);
}
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
// mv cost;
rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128);
rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv,
cpi->mb.mvcost, 128);
}
case NEARESTMV:
@@ -770,18 +900,23 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
case ZEROMV:
// Trap vectors that reach beyond the UMV borders
// Note that ALL New MV, Nearest MV, Near MV and Zero MV code drops through to this point
// because of the lack of break statements in the previous two cases.
if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
/* Trap vectors that reach beyond the UMV borders
* Note that ALL New MV, Nearest MV, Near MV and Zero MV code drops
* through to this point because of the lack of break statements
* in the previous two cases.
*/
if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
continue;
rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
x->e_mbd.mode_info_context->mbmi.mv.as_int =
mode_mv[this_mode].as_int;
/* Exit early and don't compute the distortion if this macroblock is marked inactive. */
/* Exit early and don't compute the distortion if this macroblock
* is marked inactive. */
if (cpi->active_map_enabled && x->active_ptr[0] == 0)
{
sse = 0;
@@ -816,9 +951,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
break;
}
// Experimental debug code.
//all_rds[mode_index] = this_rd;
if (this_rd < best_rd || x->skip)
{
// Note index of best mode
@@ -828,14 +960,23 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
*returndistortion = distortion2;
best_sse = sse;
best_rd = this_rd;
vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
sizeof(MB_MODE_INFO));
// Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
/* Testing this mode gave rise to an improvement in best error
* score. Lower threshold a bit for next time
*/
cpi->rd_thresh_mult[mode_index] =
(cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
cpi->rd_threshes[mode_index] =
(cpi->rd_baseline_thresh[mode_index] >> 7) *
cpi->rd_thresh_mult[mode_index];
}
// If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
/* If the mode did not help improve the best error case then raise the
* threshold for testing that mode next time around.
*/
else
{
cpi->rd_thresh_mult[mode_index] += 4;
@@ -843,7 +984,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
cpi->rd_threshes[mode_index] =
(cpi->rd_baseline_thresh[mode_index] >> 7) *
cpi->rd_thresh_mult[mode_index];
}
if (x->skip)
@@ -855,8 +998,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
{
int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3);
cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
cpi->rd_thresh_mult[best_mode_index] =
(cpi->rd_thresh_mult[best_mode_index]
>= (MIN_THRESHMULT + best_adjustment)) ?
cpi->rd_thresh_mult[best_mode_index] - best_adjustment :
MIN_THRESHMULT;
cpi->rd_threshes[best_mode_index] =
(cpi->rd_baseline_thresh[best_mode_index] >> 7) *
cpi->rd_thresh_mult[best_mode_index];
}
@@ -879,15 +1028,17 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
(cpi->common.mb_no_coeff_skip) ? 1 : 0;
(cpi->common.mb_no_coeff_skip);
x->e_mbd.mode_info_context->mbmi.partitioning = 0;
return;
}
/* set to the best mb mode, this copy can be skipped if x->skip since it already has the right content */
/* set to the best mb mode, this copy can be skipped if x->skip since it
* already has the right content */
if (!x->skip)
vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
sizeof(MB_MODE_INFO));
if (best_mbmode.mode <= B_PRED)
{
@@ -895,7 +1046,11 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
pick_intra_mbuv_mode(x);
}
update_mvcount(cpi, &x->e_mbd, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);
if (sign_bias
!= cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
update_mvcount(cpi, &x->e_mbd, &best_ref_mv);
}
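The mode_mv_sb[2][MB_MODE_COUNT] / best_ref_mv_sb[2] pair above exists because a near MV that has been clamped to the macroblock's UMV window cannot simply be negated when a reference frame uses the other sign bias: clamping and sign inversion do not commute once the window is asymmetric. A small self-contained illustration; all numbers are made up and none of this is codec code.

#include <stdio.h>

static int clamp(int v, int lo, int hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
    /* Illustrative only: an MB near a frame border has an asymmetric
     * allowed range for one MV component. */
    int lo = -40, hi = 300;
    int mv_col = 250;                            /* near MV for one sign bias */

    int clamp_flipped = clamp(-mv_col, lo, hi);  /* clamp the flipped MV: -40  */
    int flip_clamped  = -clamp(mv_col, lo, hi);  /* flip the clamped MV: -250  */

    printf("clamp(-mv) = %d, -clamp(mv) = %d\n", clamp_flipped, flip_clamped);
    return 0;
}

Keeping a clamped copy per sign bias and switching between them by assignment, as the loop above does whenever sign_bias changes, avoids that mismatch.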


@@ -14,6 +14,10 @@
#include "vpx_config.h"
#include "vp8/common/onyxc_int.h"
extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
int recon_uvoffset, int *returnrate,
int *returndistortion, int *returnintra,
int mb_row, int mb_col);
extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
#endif
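Back in the vp8_pick_inter_mode hunk above, the lower search-window bounds were rewritten from (v >> 3) - MAX_FULL_PEL_VAL + ((v & 7) ? 1 : 0) to ((v + 7) >> 3) - MAX_FULL_PEL_VAL. Both forms take the ceiling of v/8, assuming the arithmetic right shift of negative values that the surrounding MV code already relies on; a throwaway check, not codec code:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    int v;
    for (v = -4096; v <= 4096; v++)      /* a generous range of 1/8-pel values */
    {
        int old_bound = (v >> 3) + ((v & 7) ? 1 : 0);
        int new_bound = (v + 7) >> 3;
        assert(old_bound == new_bound);
    }
    printf("both forms give the ceiling of v/8 over the tested range\n");
    return 0;
}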


@@ -152,9 +152,10 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
int filt_val;
int best_filt_val = cm->filter_level;
YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
// Make a copy of the unfiltered / processed recon buffer
vp8_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
/* Replace unfiltered frame buffer with a new one */
cm->frame_to_show = &cpi->pick_lf_lvl_frame;
if (cm->frame_type == KEY_FRAME)
cm->sharpness_level = 0;
@@ -176,28 +177,27 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
filt_val = cm->filter_level;
best_filt_val = filt_val;
goto skip;
// Get the err using the previous frame's filter value.
/* Copy the unfiltered / processed recon buffer to the new buffer */
vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
best_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
best_err = calc_partial_ssl_err(sd, cm->frame_to_show,
IF_RTCD(&cpi->rtcd.variance));
// Re-instate the unfiltered frame
vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show);
filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
filt_val -= 1 + (filt_val > 10);
// Search lower filter levels
while (filt_val >= min_filter_level)
{
// Apply the loop filter
vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
// Get the err for filtered frame
filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
// Re-instate the unfiltered frame
vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show);
filt_err = calc_partial_ssl_err(sd, cm->frame_to_show,
IF_RTCD(&cpi->rtcd.variance));
// Update the best case record or exit loop.
if (filt_err < best_err)
@@ -209,11 +209,11 @@ goto skip;
break;
// Adjust filter level
filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
filt_val -= 1 + (filt_val > 10);
}
// Search up (note that we have already done filt_val = cm->filter_level)
filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
filt_val = cm->filter_level + 1 + (filt_val > 10);
if (best_filt_val == cm->filter_level)
{
@@ -223,13 +223,13 @@ goto skip;
while (filt_val < max_filter_level)
{
// Apply the loop filter
vp8_yv12_copy_partial_frame_ptr(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
// Get the err for filtered frame
filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
// Re-instate the unfiltered frame
vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show);
filt_err = calc_partial_ssl_err(sd, cm->frame_to_show,
IF_RTCD(&cpi->rtcd.variance));
// Update the best case record or exit loop.
if (filt_err < best_err)
@@ -243,11 +243,10 @@ goto skip;
break;
// Adjust filter level
filt_val += (1 + ((filt_val > 10) ? 1 : 0));
filt_val += 1 + (filt_val > 10);
}
}
skip:
cm->filter_level = best_filt_val;
if (cm->filter_level < min_filter_level)
@@ -255,6 +254,9 @@ skip:
if (cm->filter_level > max_filter_level)
cm->filter_level = max_filter_level;
/* restore unfiltered frame pointer */
cm->frame_to_show = saved_frame;
}
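The step-size rewrite in the fast search above (filt_val -= 1 + (filt_val > 10) and its counterparts) relies on a C comparison already evaluating to 0 or 1, so it matches the older 1 + ((filt_val > 10) ? 1 : 0) form exactly. A throwaway check over the 0-63 loop-filter range, purely for illustration:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    int v;
    for (v = 0; v <= 63; v++)            /* VP8 loop-filter levels */
    {
        int old_step = 1 + ((v > 10) ? 1 : 0);
        int new_step = 1 + (v > 10);     /* relational operators yield 0 or 1 */
        assert(old_step == new_step);
    }
    printf("step expressions agree for all 64 filter levels\n");
    return 0;
}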
// Stub function for now Alt LF not used
@@ -285,10 +287,16 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
int filt_best;
int filt_direction = 0;
int Bias = 0; // Bias against raising loop filter and in favour of lowering it
int Bias = 0; // Bias against raising loop filter and in favor of lowering it
// Make a copy of the unfiltered / processed recon buffer
vp8_yv12_copy_y_ptr(cm->frame_to_show, &cpi->last_frame_uf);
int ss_err[MAX_LOOP_FILTER + 1];
YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
vpx_memset(ss_err, 0, sizeof(ss_err));
/* Replace unfiltered frame buffer with a new one */
cm->frame_to_show = &cpi->pick_lf_lvl_frame;
if (cm->frame_type == KEY_FRAME)
cm->sharpness_level = 0;
@@ -307,15 +315,19 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
// Get baseline error score
/* Copy the unfiltered / processed recon buffer to the new buffer */
vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
vp8cx_set_alt_lf_level(cpi, filt_mid);
vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
best_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
filt_best = filt_mid;
best_err = vp8_calc_ss_err(sd, cm->frame_to_show,
IF_RTCD(&cpi->rtcd.variance));
goto skip;
// Re-instate the unfiltered frame
vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show);
ss_err[filt_mid] = best_err;
filt_best = filt_mid;
while (filter_step > 0)
{
@@ -330,14 +342,19 @@ goto skip;
if ((filt_direction <= 0) && (filt_low != filt_mid))
{
// Get Low filter error score
vp8cx_set_alt_lf_level(cpi, filt_low);
vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
if(ss_err[filt_low] == 0)
{
// Get Low filter error score
vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
vp8cx_set_alt_lf_level(cpi, filt_low);
vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
// Re-instate the unfiltered frame
vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show);
filt_err = vp8_calc_ss_err(sd, cm->frame_to_show,
IF_RTCD(&cpi->rtcd.variance));
ss_err[filt_low] = filt_err;
}
else
filt_err = ss_err[filt_low];
// If value is close to the best so far then bias towards a lower loop filter value.
if ((filt_err - Bias) < best_err)
@@ -353,13 +370,18 @@ goto skip;
// Now look at filt_high
if ((filt_direction >= 0) && (filt_high != filt_mid))
{
vp8cx_set_alt_lf_level(cpi, filt_high);
vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
if(ss_err[filt_high] == 0)
{
vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
vp8cx_set_alt_lf_level(cpi, filt_high);
vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
// Re-instate the unfiltered frame
vp8_yv12_copy_y_ptr(&cpi->last_frame_uf, cm->frame_to_show);
filt_err = vp8_calc_ss_err(sd, cm->frame_to_show,
IF_RTCD(&cpi->rtcd.variance));
ss_err[filt_high] = filt_err;
}
else
filt_err = ss_err[filt_high];
// Was it better than the previous best?
if (filt_err < (best_err - Bias))
@@ -382,6 +404,8 @@ goto skip;
}
}
skip:
cm->filter_level = filt_best;
/* restore unfiltered frame pointer */
cm->frame_to_show = saved_frame;
}
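The ss_err[] table added above memoizes the error score per filter level, so a level that the halving search revisits is not filtered and measured a second time; a zero entry stands for "not computed yet", which works because the measured error of a filtered frame against the source is effectively never zero. A stripped-down sketch of the pattern; cached_err and toy_eval are stand-ins and not codec code:

#include <stdio.h>
#include <string.h>

#define MAX_LOOP_FILTER 63   /* mirrors the codec's 0..63 level range */

/* Returns the cached error for a level, computing it on first use. */
static int cached_err(int ss_err[], int level, int (*evaluate)(int))
{
    if (ss_err[level] == 0)              /* first visit: filter and measure */
        ss_err[level] = evaluate(level);
    return ss_err[level];                /* later visits: reuse the result  */
}

/* Toy stand-in for "apply the loop filter at this level, measure SSE". */
static int toy_eval(int level)
{
    printf("evaluating level %d\n", level);
    return (level - 23) * (level - 23) + 1;   /* arbitrary, always non-zero */
}

int main(void)
{
    int ss_err[MAX_LOOP_FILTER + 1];
    memset(ss_err, 0, sizeof(ss_err));

    cached_err(ss_err, 31, toy_eval);    /* computed */
    cached_err(ss_err, 27, toy_eval);    /* computed */
    cached_err(ss_err, 31, toy_eval);    /* served from the table, no print */
    return 0;
}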
