Update CHANGELOG for v1.0.0 (Duclair) release

Change-Id: I64472f717e5ef3672e1032b7ee24e73c4d0fff1f
Update AUTHORS
2012-01-27 10:36:39 -08:00 · 2012-01-27 10:36:12 -08:00 · 2012-01-27 10:36:11 -08:00 · 2012-01-27 10:13:20 -08:00 · 2012-01-26 09:37:27 -08:00 · 2012-01-24 15:41:59 -08:00
117 changed files with 3272 additions and 4143 deletions
--- a/.mailmap
+++ b/.mailmap
@@ -3,3 +3,5 @@ Johann Koenig <johannkoenig@google.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
 Tom Finegan <tomfinegan@google.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
--- a/4
+++ b/4
@@ -6,10 +6,12 @@ Adrian Grange <agrange@google.com>
 Alex Converse <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
+Alpha Lam <hclam@google.com>
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
+Deb Mukherjee <debargha@google.com>
 Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
@@ -21,6 +23,7 @@ Henrik Lundin <hlundin@google.com>
 James Berry <jamesberry@google.com>
 James Zern <jzern@google.com>
 Jan Kratochvil <jan.kratochvil@redhat.com>
+Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
@@ -41,6 +44,7 @@ Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Philip Jägenstedt <philipj@opera.com>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
+Rafaël Carré <funman@videolan.org>
 Ralph Giles <giles@xiph.org>
 Ronald S. Bultje <rbultje@google.com>
 Scott LaVarnway <slavarnway@google.com>
--- a/48
+++ b/48
@@ -1,3 +1,51 @@
+2012-01-27 v1.0.0 "Duclair"
+  Our fourth named release, focused on performance and features related to
+  real-time encoding. It also fixes a decoder crash bug introduced in
+  v0.9.7, so all users of that release are encouraged to upgrade.
+
+  - Upgrading:
+      This release is ABI incompatible with prior releases of libvpx, so the
+      "major" version number has been bumped to 1. You must recompile your
+      applications against the latest version of the libvpx headers. The
+      API remains compatible, and this should not require code changes in most
+      applications.
+
+  - Enhancements:
+      This release introduces several substantial new features to the encoder,
+      of particular interest to real time streaming applications.
+
+      Temporal scalability allows the encoder to produce a stream that can
+      be decimated to different frame rates, with independent rate targeting
+      for each substream.
+
+      Multiframe quality enhancement postprocessing can make visual quality
+      more consistent in the presence of frames that are substantially
+      different quality than the surrounding frames, as in the temporal
+      scalability case and in some forced keyframe scenarios.
+
+      Multiple-resolution encoding support allows the encoding of the
+      same content at different resolutions faster than encoding them
+      separately.
+
+  - Speed:
+      Optimization targets for this release included the decoder and the real-
+      time modes of the encoder. Decoder speed on x86 has improved 10.5% with
+      this release. Encoder improvements followed a curve where speeds 1-3
+      improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved
+      1.5% to 10.5%, respectively. "Best" mode speed is consistent with the
+      Cayuga release.
+
+  - Quality:
+      Encoder quality in the single stream case is consistent with the Cayuga
+      release.
+
+  - Bug Fixes:
+      This release fixes an OOB read decoder crash bug present in v0.9.7
+      related to the clamping of motion vectors in SPLITMV blocks. This
+      behavior could be triggered by corrupt input or by starting
+      decoding from a P-frame.
+
+
 2011-08-15 v0.9.7-p1 "Cayuga" patch 1
  This is an incremental bugfix release against Cayuga. All users of that
  release are strongly encouraged to upgrade.
--- a/8
+++ b/8
@@ -42,17 +42,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:
  --help output of the configure script. As of this writing, the list of
  available targets is:

+    armv5te-android-gcc
    armv5te-linux-rvct
    armv5te-linux-gcc
-    armv5te-symbian-gcc
    armv6-darwin-gcc
    armv6-linux-rvct
    armv6-linux-gcc
-    armv6-symbian-gcc
-    iwmmxt-linux-rvct
-    iwmmxt-linux-gcc
-    iwmmxt2-linux-rvct
-    iwmmxt2-linux-gcc
+    armv7-android-gcc
    armv7-linux-rvct
    armv7-linux-gcc
    mips32-linux-gcc
--- a/args.c
+++ b/args.c
@@ -57,7 +57,7 @@ int arg_match(struct arg *arg_, const struct arg_def *def, char **argv)
    }
    else if (def->long_name)
    {
-        int name_len = strlen(def->long_name);
+        const size_t name_len = strlen(def->long_name);

        if (strlen(arg.argv[0]) >= name_len + 2
            && arg.argv[0][1] == '-'
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -0,0 +1,193 @@
+##
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+#
+# This file is to be used for compiling libvpx for Android using the NDK.
+# In an Android project place a libvpx checkout in the jni directory.
+# Run the configure script from the jni directory.  Base libvpx
+# encoder/decoder configuration will look similar to:
+# ./libvpx/configure --target=armv7-android-gcc --disable-examples \
+#                    --sdk-path=/opt/android-ndk-r6b/
+#
+# When targeting Android, realtime-only is enabled by default.  This can
+# be overridden by adding the command line flag:
+#  --disable-realtime-only
+#
+# This will create .mk files that contain variables that contain the
+# source files to compile.
+#
+# Place an Android.mk file in the jni directory that references the
+# Android.mk file in the libvpx directory:
+# LOCAL_PATH := $(call my-dir)
+# include $(CLEAR_VARS)
+# include libvpx/build/make/Android.mk
+#
+# There are currently two TARGET_ARCH_ABI targets for ARM.
+# armeabi and armeabi-v7a.  armeabi-v7a is selected by creating an
+# Application.mk in the jni directory that contains:
+# APP_ABI := armeabi-v7a
+#
+# To change to building armeabi, run ./libvpx/configure again, but with
+# --target=armv5te-android-gcc and modify the Application.mk file to
+# set APP_ABI := armeabi
+#
+# Running ndk-build will build libvpx and include it in your project.
+#
+
+CONFIG_DIR := $(LOCAL_PATH)
+LIBVPX_PATH := $(LOCAL_PATH)/libvpx
+ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas
+ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL)
+
+# Makefiles created by the libvpx configure process
+# This will need to be fixed to handle x86.
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+  include $(CONFIG_DIR)/libs-armv7-android-gcc.mk
+else
+  include $(CONFIG_DIR)/libs-armv5te-android-gcc.mk
+endif
+
+# Rule that is normally in Makefile created by libvpx
+# configure.  Used to filter out source files based on configuration.
+enabled=$(filter-out $($(1)-no),$($(1)-yes))
+
+# Override the relative path that is defined by the libvpx
+# configure process
+SRC_PATH_BARE := $(LIBVPX_PATH)
+
+# Include the list of files to be built
+include $(LIBVPX_PATH)/libs.mk
+
+# Want arm, not thumb, optimized
+LOCAL_ARM_MODE := arm
+LOCAL_CFLAGS := -O3
+
+# -----------------------------------------------------------------------------
+# Template  : asm_offsets_template
+# Arguments : 1: assembly offsets file to be created
+#             2: c file to base assembly offsets on
+# Returns   : None
+# Usage     : $(eval $(call asm_offsets_template,<asmfile>, <srcfile>))
+# Rationale : Create offsets at compile time for structures that are
+#             defined in c, but used in assembly functions.
+# -----------------------------------------------------------------------------
+define asm_offsets_template
+
+_SRC:=$(2)
+_OBJ:=$(ASM_CNV_PATH)/$$(notdir $(2)).S
+
+_FLAGS = $$($$(my)CFLAGS) \
+          $$(call get-src-file-target-cflags,$(2)) \
+          $$(call host-c-includes,$$(LOCAL_C_INCLUDES) $$(CONFIG_DIR)) \
+          $$(LOCAL_CFLAGS) \
+          $$(NDK_APP_CFLAGS) \
+          $$(call host-c-includes,$$($(my)C_INCLUDES)) \
+          -DINLINE_ASM \
+          -S \
+
+_TEXT = "Compile $$(call get-src-file-text,$(2))"
+_CC   = $$(TARGET_CC)
+
+$$(eval $$(call ev-build-file))
+
+$(1) : $$(_OBJ) $(2)
+	@mkdir -p $$(dir $$@)
+	@grep -w EQU $$< | tr -d '\#' | $(CONFIG_DIR)/$(ASM_CONVERSION) > $$@
+endef
+
+# Use the ads2gas script to convert from RVCT format to GAS format.  This
+#  puts the processed files under $(ASM_CNV_PATH).  The local clean rule
+#  handles removing them.
+ASM_CNV_OFFSETS_DEPEND = $(ASM_CNV_PATH)/asm_com_offsets.asm
+ifeq ($(CONFIG_VP8_DECODER), yes)
+  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/asm_dec_offsets.asm
+endif
+ifeq ($(CONFIG_VP8_ENCODER), yes)
+  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/asm_enc_offsets.asm
+endif
+
+.PRECIOUS: %.asm.s
+$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm $(ASM_CNV_OFFSETS_DEPEND)
+	@mkdir -p $(dir $@)
+	@$(CONFIG_DIR)/$(ASM_CONVERSION) <$< > $@
+
+
+LOCAL_SRC_FILES += vpx_config.c
+
+# Remove duplicate entries
+CODEC_SRCS_UNIQUE = $(sort $(CODEC_SRCS))
+
+# Pull out C files.  vpx_config.c is in the immediate directory and
+# so it does not need libvpx/ prefixed like the rest of the source files.
+CODEC_SRCS_C = $(filter %.c, $(CODEC_SRCS_UNIQUE))
+LOCAL_CODEC_SRCS_C = $(filter-out vpx_config.c, $(CODEC_SRCS_C))
+
+LOCAL_SRC_FILES += $(foreach file, $(LOCAL_CODEC_SRCS_C), libvpx/$(file))
+
+# Pull out assembly files, splitting NEON from the rest.  This is
+# done to specify that the NEON assembly files use NEON assembler flags.
+CODEC_SRCS_ASM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
+CODEC_SRCS_ASM = $(foreach v, \
+                 $(CODEC_SRCS_ASM_ALL), \
+                 $(if $(findstring neon,$(v)),,$(v)))
+CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \
+                         $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+                         $(CODEC_SRCS_ASM))
+LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)
+
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+  CODEC_SRCS_ASM_NEON = $(foreach v, \
+                        $(CODEC_SRCS_ASM_ALL),\
+                        $(if $(findstring neon,$(v)),$(v),))
+  CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \
+                                $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+                                $(CODEC_SRCS_ASM_NEON))
+  LOCAL_SRC_FILES += $(patsubst %.s, \
+                     %.s.neon, \
+                     $(CODEC_SRCS_ASM_NEON_ADS2GAS))
+endif
+
+LOCAL_CFLAGS += \
+    -DHAVE_CONFIG_H=vpx_config.h \
+    -I$(LIBVPX_PATH) \
+    -I$(ASM_CNV_PATH)
+
+LOCAL_MODULE := libvpx
+
+LOCAL_LDLIBS := -llog
+
+LOCAL_STATIC_LIBRARIES := cpufeatures
+
+.PHONY: clean
+clean:
+	@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
+	@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
+	@$(RM) $(patsubst %.asm, %.*, $(ASM_CNV_OFFSETS_DEPEND))
+	@$(RM) -r $(ASM_CNV_PATH)
+
+include $(BUILD_SHARED_LIBRARY)
+
+$(eval $(call asm_offsets_template,\
+    $(ASM_CNV_PATH)/asm_com_offsets.asm, \
+    $(LIBVPX_PATH)/vp8/common/asm_com_offsets.c))
+
+ifeq ($(CONFIG_VP8_DECODER), yes)
+  $(eval $(call asm_offsets_template,\
+    $(ASM_CNV_PATH)/asm_dec_offsets.asm, \
+    $(LIBVPX_PATH)/vp8/decoder/asm_dec_offsets.c))
+endif
+
+ifeq ($(CONFIG_VP8_ENCODER), yes)
+  $(eval $(call asm_offsets_template,\
+    $(ASM_CNV_PATH)/asm_enc_offsets.asm, \
+    $(LIBVPX_PATH)/vp8/encoder/asm_enc_offsets.c))
+endif
+
+$(call import-module,cpufeatures)
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -129,11 +129,14 @@ while (<STDIN>)
    # ARM code
    s/\sARM/.arm/g;

+    # eabi_attributes numerical equivalents can be found in the
+    # "ARM IHI 0045C" document.
+
    # REQUIRE8 Stack is required to be 8-byte aligned
-    s/\sREQUIRE8/.eabi_attribute Tag_ABI_align_needed, 1/g;
+    s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;

    # PRESERVE8 Stack 8-byte align is preserved
-    s/\sPRESERVE8/.eabi_attribute Tag_ABI_align_preserved, 1/g;
+    s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;

    # Use PROC and ENDP to give the symbols a .size directive.
    # This makes them show up properly in debugging tools like gdb and valgrind.
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl
@@ -30,6 +30,8 @@ my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8

 my @incoming_array;

+my @imported_functions;
+
 # Perl trim function to remove whitespace from the start and end of the string
 sub trim($)
 {
@@ -132,7 +134,18 @@ while (<STDIN>)
    # Make function visible to linker, and make additional symbol with
    # prepended underscore
    s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
-    s/IMPORT\s+\|([\$\w]*)\|/.globl $1/;
+
+    # Prepend imported functions with _
+    if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/)
+    {
+        $function = trim($1);
+        push(@imported_functions, $function);
+    }
+
+    foreach $function (@imported_functions)
+    {
+        s/$function/_$function/;
+    }

    # No vertical bars required; make additional symbol with prepended
    # underscore
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -477,7 +477,11 @@ process_common_cmdline() {
        --libdir=*)
        libdir="${optval}"
        ;;
-        --libc|--as|--prefix|--libdir)
+        --sdk-path=*)
+        [ -d "${optval}" ] || die "Not a directory: ${optval}"
+        sdk_path="${optval}"
+        ;;
+        --libc|--as|--prefix|--libdir|--sdk-path)
        die "Option ${opt} requires argument"
        ;;
        --help|-h) show_help
@@ -561,6 +565,10 @@ process_common_toolchain() {
                tgt_isa=x86_64
                tgt_os=darwin10
                ;;
+            *darwin11*)
+                tgt_isa=x86_64
+                tgt_os=darwin11
+                ;;
            *mingw32*|*cygwin*)
                [ -z "$tgt_isa" ] && tgt_isa=x86
                tgt_os=win32
@@ -599,8 +607,8 @@ process_common_toolchain() {

    # Enable the architecture family
    case ${tgt_isa} in
-        arm*|iwmmxt*) enable arm;;
-    mips*)        enable mips;;
+        arm*) enable arm;;
+        mips*) enable mips;;
    esac

    # PIC is probably what we want when building shared libs
@@ -617,6 +625,9 @@ process_common_toolchain() {
    if [ -d "/Developer/SDKs/MacOSX10.6.sdk" ]; then
        osx_sdk_dir="/Developer/SDKs/MacOSX10.6.sdk"
    fi
+    if [ -d "/Developer/SDKs/MacOSX10.7.sdk" ]; then
+        osx_sdk_dir="/Developer/SDKs/MacOSX10.7.sdk"
+    fi

    case ${toolchain} in
        *-darwin8-*)
@@ -637,6 +648,12 @@ process_common_toolchain() {
            add_ldflags "-isysroot ${osx_sdk_dir}"
            add_ldflags "-mmacosx-version-min=10.6"
            ;;
+        *-darwin11-*)
+            add_cflags  "-isysroot ${osx_sdk_dir}"
+            add_cflags  "-mmacosx-version-min=10.7"
+            add_ldflags "-isysroot ${osx_sdk_dir}"
+            add_ldflags "-mmacosx-version-min=10.7"
+            ;;
    esac

    # Handle Solaris variants. Solaris 10 needs -lposix4
@@ -652,37 +669,25 @@ process_common_toolchain() {

    # Process ARM architecture variants
    case ${toolchain} in
-    arm*|iwmmxt*)
-    # on arm, isa versions are supersets
-    enabled armv7a && soft_enable armv7 ### DEBUG
-    enabled armv7 && soft_enable armv6
-    enabled armv7 || enabled armv6 && soft_enable armv5te
-    enabled armv7 || enabled armv6 && soft_enable fast_unaligned
-    enabled iwmmxt2 && soft_enable iwmmxt
-    enabled iwmmxt && soft_enable armv5te
+    arm*)
+        # on arm, isa versions are supersets
+        enabled armv7a && soft_enable armv7 ### DEBUG
+        enabled armv7 && soft_enable armv6
+        enabled armv7 || enabled armv6 && soft_enable armv5te
+        enabled armv7 || enabled armv6 && soft_enable fast_unaligned

-    asm_conversion_cmd="cat"
+        asm_conversion_cmd="cat"

        case ${tgt_cc} in
        gcc)
-        if enabled iwmmxt || enabled iwmmxt2
-            then
-                CROSS=${CROSS:-arm-iwmmxt-linux-gnueabi-}
-            elif enabled symbian; then
-                CROSS=${CROSS:-arm-none-symbianelf-}
-            else
-                CROSS=${CROSS:-arm-none-linux-gnueabi-}
-            fi
+            CROSS=${CROSS:-arm-none-linux-gnueabi-}
            link_with_cc=gcc
            setup_gnu_toolchain
            arch_int=${tgt_isa##armv}
            arch_int=${arch_int%%te}
            check_add_asflags --defsym ARCHITECTURE=${arch_int}
            tune_cflags="-mtune="
-        if enabled iwmmxt || enabled iwmmxt2
-            then
-                check_add_asflags -mcpu=${tgt_isa}
-            elif enabled armv7
+            if enabled armv7
            then
                check_add_cflags -march=armv7-a -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-ftree-vectorize
                check_add_asflags -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-march=armv7-a
@@ -727,12 +732,49 @@ process_common_toolchain() {
            disable multithread
            disable os_support
            ;;
+
+        android*)
+            SDK_PATH=${sdk_path}
+            COMPILER_LOCATION=`find "${SDK_PATH}" \
+                               -name "arm-linux-androideabi-gcc*" -print -quit`
+            TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
+            CC=${TOOLCHAIN_PATH}gcc
+            AR=${TOOLCHAIN_PATH}ar
+            LD=${TOOLCHAIN_PATH}gcc
+            AS=${TOOLCHAIN_PATH}as
+            STRIP=${TOOLCHAIN_PATH}strip
+            NM=${TOOLCHAIN_PATH}nm
+
+            if [ -z "${alt_libc}" ]; then
+                alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
+                          awk '{n = split($0,a,"/"); \
+                                split(a[n-1],b,"-"); \
+                                print $0 " " b[2]}' | \
+                          sort -g -k 2 | \
+                          awk '{ print $1 }' | tail -1`
+            fi
+
+            add_cflags "--sysroot=${alt_libc}"
+            add_ldflags "--sysroot=${alt_libc}"
+
+            enable pic
+            soft_enable realtime_only
+            if enabled armv7
+            then
+                enable runtime_cpu_detect
+            fi
+          ;;
+
        darwin*)
-            SDK_PATH=/Developer/Platforms/iPhoneOS.platform/Developer
+            if [ -z "${sdk_path}" ]; then
+                SDK_PATH=/Developer/Platforms/iPhoneOS.platform/Developer
+            else
+                SDK_PATH=${sdk_path}
+            fi
            TOOLCHAIN_PATH=${SDK_PATH}/usr/bin
            CC=${TOOLCHAIN_PATH}/gcc
            AR=${TOOLCHAIN_PATH}/ar
-            LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-gcc-4.2.1
+            LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-llvm-gcc-4.2
            AS=${TOOLCHAIN_PATH}/as
            STRIP=${TOOLCHAIN_PATH}/strip
            NM=${TOOLCHAIN_PATH}/nm
@@ -746,13 +788,14 @@ process_common_toolchain() {
            add_cflags -arch ${tgt_isa}
            add_ldflags -arch_only ${tgt_isa}

-            add_cflags  "-isysroot ${SDK_PATH}/SDKs/iPhoneOS4.3.sdk"
+            if [ -z "${alt_libc}" ]; then
+                alt_libc=${SDK_PATH}/SDKs/iPhoneOS5.0.sdk
+            fi

-            # This should be overridable
-            alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.3.sdk
+            add_cflags  "-isysroot ${alt_libc}"

            # Add the paths for the alternate libc
-            for d in usr/include usr/include/gcc/darwin/4.2/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do
+            for d in usr/include; do
                try_dir="${alt_libc}/${d}"
                [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
            done
@@ -789,19 +832,6 @@ process_common_toolchain() {
            fi
        ;;

-        symbian*)
-            enable symbian
-            # Add the paths for the alternate libc
-            for d in include/libc; do
-                try_dir="${alt_libc}/${d}"
-                [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
-            done
-            for d in release/armv5/urel; do
-                try_dir="${alt_libc}/${d}"
-                [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
-            done
-            add_cflags -DIMPORT_C=
-
        esac
    ;;
    mips*)
@@ -988,6 +1018,7 @@ EOF
    if enabled multithread; then
        case ${toolchain} in
            *-win*);;
+            *-android-gcc);;
            *) check_header pthread.h && add_extralibs -lpthread
        esac
    fi
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -1,291 +0,0 @@
-#!/bin/sh
-self=$0
-
-usage() {
-  cat <<EOF >&2
-Usage: $self [options] FILE
-
-Reads the Run Time CPU Detections definitions from FILE and generates a
-C header file on stdout.
-
-Options:
-  --arch=ARCH   Architecture to generate defs for (required)
-  --disable-EXT Disable support for EXT extensions
-  --sym=SYMBOL  Unique symbol to use for RTCD initialization function
-
-EOF
-  exit 1
-}
-
-die() {
-  echo "$@" >&2
-  exit 1
-}
-
-die_argument_required() {
-  die "Option $opt requires argument"
-}
-
-for opt; do
-  optval="${opt#*=}"
-  case "$opt" in
-    --arch) die_argument_required;;
-    --arch=*) arch=${optval};;
-    --disable-*) eval "disable_${opt#--disable-}=true";;
-    --sym) die_argument_required;;
-    --sym=*) symbol=${optval};;
-    --rtcd=*) CONFIG_RUNTIME_CPU_DETECT=${optval};;
-    -h|--help)
-      usage
-      ;;
-    -*)
-      die "Unrecognized option: ${opt%%=*}"
-      ;;
-    *)
-      defs_file="$defs_file $opt"
-      ;;
-  esac
-  shift
-done
-for f in $defs_file; do [ -f "$f" ] || usage; done
-[ -n "$arch" ] || usage
-
-#
-# Routines for the RTCD DSL to call
-#
-prototype() {
-  local rtyp="$1"
-  local fn="$2"
-  local args="$3"
-
-  eval "${2}_rtyp='$1'"
-  eval "${2}_args='$3'"
-  ALL_FUNCS="$ALL_FUNCS $fn"
-}
-
-specialize() {
-  local fn="$1"
-  shift
-  for opt in c "$@"; do
-    eval "${fn}_${opt}=${fn}_${opt}"
-  done
-}
-
-require() {
-  for fn in $ALL_FUNCS; do
-    for opt in "$@"; do
-      local ofn=$(eval "echo \$${fn}_${opt}")
-      [ -z "$ofn" ] && continue
-
-      # if we already have a default, then we can undefine it, as we know
-      # we can do better.
-      local best=$(eval "echo \$${fn}_default")
-      [ -n "$best" ] && eval "unset $best"
-      eval "${fn}_default=${fn}_${opt}"
-    done
-  done
-}
-
-forward_decls() {
-  ALL_FORWARD_DECLS="$ALL_FORWARD_DECLS $1"
-}
-
-#
-# Include the user's directives
-#
-for f in $defs_file; do
-  . $f
-done
-
-#
-# Process the directives according to the command line
-#
-process_forward_decls() {
-  for fn in $ALL_FORWARD_DECLS; do
-    eval $fn
-  done
-}
-
-determine_indirection() {
-  [ "$CONFIG_RUNTIME_CPU_DETECT" = "yes" ] || require $ALL_ARCHS
-  for fn in $ALL_FUNCS; do
-    local n=""
-    local rtyp=$(eval "echo \$${fn}_rtyp")
-    local args=$(eval "echo \$${fn}_args")
-    local dfn=$(eval "echo \$${fn}_default")
-    dfn=$(eval "echo \$${dfn}")
-    for opt in "$@"; do
-      local ofn=$(eval "echo \$${fn}_${opt}")
-      [ -z "$ofn" ] && continue
-      n="${n}x"
-    done
-    if [ "$n" = "x" ]; then
-      eval "${fn}_indirect=false"
-    else
-      eval "${fn}_indirect=true"
-    fi
-    echo
-  done
-}
-
-declare_function_pointers() {
-  for fn in $ALL_FUNCS; do
-    local n=""
-    local rtyp=$(eval "echo \$${fn}_rtyp")
-    local args=$(eval "echo \$${fn}_args")
-    local dfn=$(eval "echo \$${fn}_default")
-    dfn=$(eval "echo \$${dfn}")
-    for opt in "$@"; do
-      local ofn=$(eval "echo \$${fn}_${opt}")
-      [ -z "$ofn" ] && continue
-      n="${n}x"
-      echo "$rtyp ${ofn}($args);"
-    done
-    if [ "$n" = "x" ]; then
-      echo "#define ${fn} ${dfn}"
-    else
-      echo "RTCD_EXTERN $rtyp (*${fn})($args);"
-    fi
-    echo
-  done
-}
-
-set_function_pointers() {
-  for fn in $ALL_FUNCS; do
-    local n=""
-    local rtyp=$(eval "echo \$${fn}_rtyp")
-    local args=$(eval "echo \$${fn}_args")
-    local dfn=$(eval "echo \$${fn}_default")
-    dfn=$(eval "echo \$${dfn}")
-    if $(eval "echo \$${fn}_indirect"); then
-      echo "    $fn = $dfn;"
-      for opt in "$@"; do
-        local ofn=$(eval "echo \$${fn}_${opt}")
-        [ -z "$ofn" ] && continue
-        [ "$ofn" = "$dfn" ] && continue;
-        echo "    if (have_${opt}) $fn = $ofn;"
-      done
-      echo
-    fi
-  done
-}
-
-filter() {
-  local filtered
-  for opt in "$@"; do
-    [ -z $(eval "echo \$disable_${opt}") ] && filtered="$filtered $opt"
-  done
-  echo $filtered
-}
-
-#
-# Helper functions for generating the arch specific RTCD files
-#
-common_top() {
-  local outfile_basename=$(basename ${outfile:-rtcd.h})
-  local include_guard=$(echo -n $outfile_basename | tr [a-z] [A-Z] | tr -c [A-Z] _)
-  cat <<EOF
-#ifndef ${include_guard}
-#define ${include_guard}
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-$(process_forward_decls)
-
-$(declare_function_pointers c $ALL_ARCHS)
-EOF
-}
-
-common_bottom() {
-  cat <<EOF
-#endif
-EOF
-}
-
-x86() {
-  determine_indirection c $ALL_ARCHS
-  cat <<EOF
-$(common_top)
-void ${symbol:-rtcd}(void);
-
-#ifdef RTCD_C
-#include "vpx_ports/x86.h"
-void ${symbol:-rtcd}(void)
-{
-    int flags = x86_simd_caps();
-EOF
-
-  # Write out the helper variable for each enabled extension
-  for opt in $ALL_ARCHS; do
-    local uc=$(echo -n $opt | tr [a-z] [A-Z])
-    echo "    int have_${opt} = flags & HAS_${uc};"
-  done
-  cat <<EOF
-
-$(set_function_pointers c $ALL_ARCHS)
-}
-#endif
-$(common_bottom)
-EOF
-}
-
-arm() {
-  determine_indirection c $ALL_ARCHS
-  cat <<EOF
-$(common_top)
-#include "vpx_config.h"
-#include "vp8/decoder/onyxd_int.h"
-
-void ${symbol:-rtcd}(VP8D_COMP *pbi);
-
-#ifdef RTCD_C
-void ${symbol:-rtcd}(VP8D_COMP *pbi)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
-    int flags = pbi->common.rtcd.flags;
-
-    int have_v5te = flags & HAS_EDSP;
-    int have_v6 = flags & HAS_MEDIA;
-    int have_neon = flags & HAS_NEON;
-#endif
-
-$(set_function_pointers c $ALL_ARCHS)
-}
-#endif
-$(common_bottom)
-EOF
-}
-
-#
-# Main Driver
-#
-require c
-case $arch in
-  x86)
-    ALL_ARCHS=$(filter mmx sse sse2 sse3 sse4_1)
-    x86
-    ;;
-  x86_64)
-    ALL_ARCHS=$(filter mmx sse sse2 sse3 sse4_1)
-    require $(filter mmx sse sse2)
-    x86
-    ;;
-  armv5te)
-    ALL_ARCHS=$(filter v5te)
-    arm
-    ;;
-  armv6)
-    ALL_ARCHS=$(filter v5te v6)
-    arm
-    ;;
-  armv7)
-    ALL_ARCHS=$(filter v5te v6 neon)
-    arm
-    ;;
-  *)
-    die "Unrecognized architecture: $arch"
-esac
--- a/12
+++ b/12
@@ -25,6 +25,7 @@ Advanced options:
  ${toggle_unit_tests}            build unit tests
  --libc=PATH                     path to alternate libc
  --as={yasm|nasm|auto}           use specified assembler [auto, yasm preferred]
+  --sdk-path=PATH                 path to root of sdk (iOS, android builds only)
  ${toggle_fast_unaligned}        don't use unaligned accesses, even when
                                  supported by hardware [auto]
  ${toggle_codec_srcs}            in/exclude codec library source code
@@ -80,19 +81,15 @@ EOF

 # all_platforms is a list of all supported target platforms. Maintain
 # alphabetically by architecture, generic-gnu last.
+all_platforms="${all_platforms} armv5te-android-gcc"
 all_platforms="${all_platforms} armv5te-linux-rvct"
 all_platforms="${all_platforms} armv5te-linux-gcc"
 all_platforms="${all_platforms} armv5te-none-rvct"
-all_platforms="${all_platforms} armv5te-symbian-gcc"
 all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
-all_platforms="${all_platforms} armv6-symbian-gcc"
-all_platforms="${all_platforms} iwmmxt-linux-rvct"
-all_platforms="${all_platforms} iwmmxt-linux-gcc"
-all_platforms="${all_platforms} iwmmxt2-linux-rvct"
-all_platforms="${all_platforms} iwmmxt2-linux-gcc"
+all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
@@ -119,6 +116,7 @@ all_platforms="${all_platforms} x86-win32-vs8"
 all_platforms="${all_platforms} x86-win32-vs9"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-darwin10-gcc"
+all_platforms="${all_platforms} x86_64-darwin11-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -197,8 +195,6 @@ ARCH_EXT_LIST="
    armv5te
    armv6
    armv7
-    iwmmxt
-    iwmmxt2

    mips32

--- a/examples.mk
+++ b/examples.mk
@@ -102,6 +102,7 @@ vp8_multi_resolution_encoder.SRCS  \
                         += third_party/libyuv/include/libyuv/basic_types.h  \
                            third_party/libyuv/include/libyuv/cpu_id.h  \
                            third_party/libyuv/include/libyuv/scale.h  \
+                            third_party/libyuv/source/row.h \
                            third_party/libyuv/source/scale.c  \
                            third_party/libyuv/source/cpu_id.c
 vp8_multi_resolution_encoder.GUID         = 04f8738e-63c8-423b-90fa-7c2703a374de
--- a/examples/postproc.txt
+++ b/examples/postproc.txt
@@ -58,7 +58,7 @@ if(frame_cnt%30 == 1) {
    if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
        die_codec(&codec, "Failed to turn off postproc");
 } else if(frame_cnt%30 == 16) {
-    vp8_postproc_cfg_t  pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK, 4, 0};
+    vp8_postproc_cfg_t  pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4, 0};

    if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
        die_codec(&codec, "Failed to turn on postproc");
--- a/libs.mk
+++ b/libs.mk
@@ -322,15 +322,6 @@ endif
 $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
 CLEAN-OBJS += $(BUILD_PFX)vpx_version.h

-#
-# Rule to generate runtime cpu detection files
-#
-$(OBJS-yes:.o=.d): vpx_rtcd.h
-vpx_rtcd.h: $(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS)))
-	$(SRC_PATH_BARE)/build/make/rtcd.sh --arch=$(TGT_ISA) --sym=vpx_rtcd \
-      --rtcd=$(CONFIG_RUNTIME_CPU_DETECT) $^ > $@
-CLEAN-OBJS += $(BUILD_PFX)vpx_rtcd.h
-
 CODEC_DOC_SRCS += vpx/vpx_codec.h \
                  vpx/vpx_decoder.h \
                  vpx/vpx_encoder.h \
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -43,6 +43,8 @@ void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)

    vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
    vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+    if (oci->post_proc_buffer_int_used)
+        vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);

    vpx_free(oci->above_context);
    vpx_free(oci->mip);
@@ -101,6 +103,8 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
        return 1;
    }

+    oci->post_proc_buffer_int_used = 0;
+
    oci->mb_rows = height >> 4;
    oci->mb_cols = width >> 4;
    oci->MBs = oci->mb_rows * oci->mb_cols;
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -11,7 +11,6 @@

 #include "vpx_config.h"
 #include "vpx_ports/arm.h"
-#include "vp8/common/g_common.h"
 #include "vp8/common/pragmas.h"
 #include "vp8/common/subpixel.h"
 #include "vp8/common/loopfilter.h"
@@ -63,6 +62,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
        rtcd->recon.copy8x8     = vp8_copy_mem8x8_v6;
        rtcd->recon.copy8x4     = vp8_copy_mem8x4_v6;
        rtcd->recon.intra4x4_predict = vp8_intra4x4_predict_armv6;
+
+        rtcd->dequant.block               = vp8_dequantize_b_v6;
+        rtcd->dequant.idct_add            = vp8_dequant_idct_add_v6;
+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;
+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;
+
    }
 #endif

@@ -97,6 +102,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
            vp8_build_intra_predictors_mby_neon;
        rtcd->recon.build_intra_predictors_mby_s =
            vp8_build_intra_predictors_mby_s_neon;
+
+        rtcd->dequant.block               = vp8_dequantize_b_neon;
+        rtcd->dequant.idct_add            = vp8_dequant_idct_add_neon;
+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;
+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;
+
    }
 #endif

--- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm
+++ b/vp8/decoder/arm/armv6/dequant_idct_v6.asm
--- a/vp8/decoder/arm/armv6/dequantize_v6.asm
+++ b/vp8/decoder/arm/armv6/dequantize_v6.asm
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ b/vp8/decoder/arm/armv6/idct_blk_v6.c
@@ -10,6 +10,7 @@

 #include "vpx_config.h"
 #include "vp8/common/idct.h"
+#include "vp8/common/dequantize.h"


 void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
--- a/vp8/decoder/arm/dequantize_arm.c
+++ b/vp8/decoder/arm/dequantize_arm.c
@@ -10,7 +10,8 @@


 #include "vpx_config.h"
-#include "vp8/common/blockd.h"
+#include "vp8/common/dequantize.h"
+#include "vp8/common/idct.h"

 #if HAVE_ARMV7
 extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
@@ -22,22 +23,20 @@ extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);

 #if HAVE_ARMV7

-void vp8_dequantize_b_neon(BLOCKD *d)
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
 {
    short *DQ  = d->dqcoeff;
    short *Q   = d->qcoeff;
-    short *DQC = d->dequant;

    vp8_dequantize_b_loop_neon(Q, DQC, DQ);
 }
 #endif

 #if HAVE_ARMV6
-void vp8_dequantize_b_v6(BLOCKD *d)
+void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
 {
    short *DQ  = d->dqcoeff;
    short *Q   = d->qcoeff;
-    short *DQC = d->dequant;

    vp8_dequantize_b_loop_v6(Q, DQC, DQ);
 }
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -22,13 +22,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_v6

-#undef vp8_dequant_idct_add
+#undef  vp8_dequant_idct_add
 #define vp8_dequant_idct_add vp8_dequant_idct_add_v6

-#undef vp8_dequant_idct_add_y_block
+#undef  vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6

-#undef vp8_dequant_idct_add_uv_block
+#undef  vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
 #endif
 #endif
@@ -44,13 +44,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_neon

-#undef vp8_dequant_idct_add
+#undef  vp8_dequant_idct_add
 #define vp8_dequant_idct_add vp8_dequant_idct_add_neon

-#undef vp8_dequant_idct_add_y_block
+#undef  vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon

-#undef vp8_dequant_idct_add_uv_block
+#undef  vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
 #endif

--- a/vp8/decoder/arm/neon/dequant_idct_neon.asm
+++ b/vp8/decoder/arm/neon/dequant_idct_neon.asm
--- a/vp8/decoder/arm/neon/dequantizeb_neon.asm
+++ b/vp8/decoder/arm/neon/dequantizeb_neon.asm
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@@ -10,6 +10,7 @@

 #include "vpx_config.h"
 #include "vp8/common/idct.h"
+#include "vp8/common/dequantize.h"

 /* place these declarations here because we don't want to maintain them
 * outside of this scope
--- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
--- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
--- a/vp8/common/bigend.h
+++ b/vp8/common/bigend.h
@@ -1,32 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef _bigend_h
-#define _bigend_h
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#define invert2(x) ( (((x)>>8)&0x00ff) | (((x)<<8)&0xff00) )
-#define invert4(x) ( ((invert2(x)&0x0000ffff)<<16) | (invert2((x>>16))&0x0000ffff) )
-
-#define high_byte(x) (unsigned char)x
-#define mid2Byte(x) (unsigned char)(x >> 8)
-#define mid1Byte(x) (unsigned char)(x >> 16)
-#define low_byte(x) (unsigned char)(x >> 24)
-
-#define SWAPENDS 1
-
-#if defined(__cplusplus)
-}
-#endif
-#endif
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -21,9 +21,6 @@ void vpx_log(const char *format, ...);
 #include "subpixel.h"
 #include "vpx_ports/mem.h"

-#define TRUE    1
-#define FALSE   0
-
 /*#define DCPRED 1*/
 #define DCPREDSIMTHRESH 0
 #define DCPREDCNTTHRESH 3
@@ -182,12 +179,11 @@ typedef struct
 } LOWER_RES_INFO;
 #endif

-typedef struct blockd
+typedef struct
 {
    short *qcoeff;
    short *dqcoeff;
    unsigned char  *predictor;
-    short *diff;
    short *dequant;

    /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
@@ -206,12 +202,16 @@ typedef struct blockd

 typedef struct MacroBlockD
 {
-    DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
    DECLARE_ALIGNED(16, short, qcoeff[400]);
    DECLARE_ALIGNED(16, short, dqcoeff[400]);
    DECLARE_ALIGNED(16, char,  eobs[25]);

+    DECLARE_ALIGNED(16, short,  dequant_y1[16]);
+    DECLARE_ALIGNED(16, short,  dequant_y1_dc[16]);
+    DECLARE_ALIGNED(16, short,  dequant_y2[16]);
+    DECLARE_ALIGNED(16, short,  dequant_uv[16]);
+
    /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
    BLOCKD block[25];
    int fullpixel_mask;
--- a/vp8/common/common.h
+++ b/vp8/common/common.h
@@ -18,8 +18,6 @@

 #include "vpx_mem/vpx_mem.h"

-#include "common_types.h"
-
 /* Only need this for fixed-size arrays, for structs just assign. */

 #define vp8_copy( Dest, Src) { \
--- a/vp8/common/common_types.h
+++ b/vp8/common/common_types.h
@@ -1,18 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_COMMON_TYPES
-#define __INC_COMMON_TYPES
-
-#define TRUE    1
-#define FALSE   0
-
-#endif
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -10,17 +10,15 @@


 #include "vpx_config.h"
-#include "vpx_rtcd.h"
-#include "vp8/common/blockd.h"
+#include "dequantize.h"
 #include "vp8/common/idct.h"
 #include "vpx_mem/vpx_mem.h"

-void vp8_dequantize_b_c(BLOCKD *d)
+void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
 {
    int i;
    short *DQ  = d->dqcoeff;
    short *Q   = d->qcoeff;
-    short *DQC = d->dequant;

    for (i = 0; i < 16; i++)
    {
--- a/vp8/common/dequantize.h
+++ b/vp8/common/dequantize.h
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DEQUANTIZE_H
+#define DEQUANTIZE_H
+#include "vp8/common/blockd.h"
+/* prototype_*(sym): declare sym with the signature of one dequant operation. */
+#define prototype_dequant_block(sym) \
+    void sym(BLOCKD *x, short *DQC)
+
+#define prototype_dequant_idct_add(sym) \
+    void sym(short *input, short *dq, \
+             unsigned char *output, \
+             int stride)
+
+#define prototype_dequant_idct_add_y_block(sym) \
+    void sym(short *q, short *dq, \
+             unsigned char *dst, \
+             int stride, char *eobs)
+
+#define prototype_dequant_idct_add_uv_block(sym) \
+    void sym(short *q, short *dq, \
+             unsigned char *dst_u, \
+             unsigned char *dst_v, int stride, char *eobs)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/dequantize_x86.h"
+#endif
+#if ARCH_ARM
+#include "arm/dequantize_arm.h"
+#endif
+
+/* Any vp8_dequant_* name still undefined here is bound to its *_c function. */
+#ifndef vp8_dequant_block
+#define vp8_dequant_block vp8_dequantize_b_c
+#endif
+extern prototype_dequant_block(vp8_dequant_block);
+
+#ifndef vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
+#endif
+extern prototype_dequant_idct_add(vp8_dequant_idct_add);
+
+#ifndef vp8_dequant_idct_add_y_block
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
+#endif
+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block);
+
+#ifndef vp8_dequant_idct_add_uv_block
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
+#endif
+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block);
+
+/* Pointer-to-function types built from the prototype_* macros. */
+typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
+typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
+typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));
+typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
+
+/* One function pointer per dequant operation; DEQUANT_INVOKE reads these
+ * members when CONFIG_RUNTIME_CPU_DETECT is enabled. */
+typedef struct
+{
+    vp8_dequant_block_fn_t               block;
+    vp8_dequant_idct_add_fn_t            idct_add;
+    vp8_dequant_idct_add_y_block_fn_t    idct_add_y_block;
+    vp8_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
+} vp8_dequant_rtcd_vtable_t;
+
+/* Resolves to ctx->fn with runtime CPU detection, else to vp8_dequant_<fn>. */
+#if CONFIG_RUNTIME_CPU_DETECT
+#define DEQUANT_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define DEQUANT_INVOKE(ctx,fn) vp8_dequant_##fn
+#endif
+
+#endif
--- a/vp8/common/dma_desc.h
+++ b/vp8/common/dma_desc.h
@@ -1,125 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef _dma_desc_h
-#define _dma_desc_h
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-
-#define NDSIZE_LG   0x00000900  // Next Descriptor Size
-#define NDSIZE_SM   0x00000800  // Next Descriptor Size
-#define NDSIZE_7    0x00000700  // Next Descriptor Size
-#define NDSIZE_6    0x00000600  // Next Descriptor Size
-#define NDSIZE_5    0x00000500  // Next Descriptor Size
-#define NDSIZE_4    0x00000400  // Next Descriptor Size
-#define NDSIZE_3    0x00000300  // Next Descriptor Size
-#define NDSIZE_2    0x00000200  // Next Descriptor Size
-#define NDSIZE_1    0x00000100  // Next Descriptor Size
-
-#define FLOW_STOP       0x0000
-#define FLOW_AUTO       0x1000
-#define FLOW_DESC_AR    0x4000
-#define FLOW_DESC_SM    0x6000
-#define FLOW_DESC_LG    0x7000
-
-    typedef struct
-    {
-        unsigned int ndp;
-        //unsigned short ndpl;
-        //unsigned short ndph;
-        unsigned int sa;
-        //unsigned short sal;
-        //unsigned short sah;
-
-        unsigned short dmacfg;
-        unsigned short xcnt;
-        unsigned short xmod;
-        unsigned short ycnt;
-        unsigned short ymod;
-
-    } LARGE_DESC;
-
-    typedef struct
-    {
-        unsigned short ndpl;
-        unsigned short sal;
-        unsigned short sah;
-        unsigned short dmacfg;
-        unsigned short xcnt;
-        unsigned short xmod;
-        unsigned short ycnt;
-        unsigned short ymod;
-    } SMALL_DESC;
-
-    typedef struct
-    {
-        unsigned short sal;
-        unsigned short sah;
-        unsigned short dmacfg;
-        unsigned short xcnt;
-        unsigned short xmod;
-        unsigned short ycnt;
-        unsigned short ymod;
-    } ARRAY_DESC_7;
-
-    typedef struct
-    {
-        unsigned short sal;
-        unsigned short sah;
-        unsigned short dmacfg;
-        unsigned short xcnt;
-        unsigned short xmod;
-        unsigned short ycnt;
-    } ARRAY_DESC_6;
-
-    typedef struct
-    {
-        unsigned short sal;
-        unsigned short sah;
-        unsigned short dmacfg;
-        unsigned short xcnt;
-        unsigned short xmod;
-    } ARRAY_DESC_5;
-
-    typedef struct
-    {
-        unsigned short sal;
-        unsigned short sah;
-        unsigned short dmacfg;
-        unsigned short xcnt;
-    } ARRAY_DESC_4;
-
-    typedef struct
-    {
-        unsigned short sal;
-        unsigned short sah;
-        unsigned short dmacfg;
-    } ARRAY_DESC_3;
-
-    typedef struct
-    {
-        unsigned short sal;
-        unsigned short sah;
-    } ARRAY_DESC_2;
-
-    typedef struct
-    {
-        unsigned short sal;
-    } ARRAY_DESC_1;
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif //_dma_desc_h
--- a/vp8/common/duck_io.h
+++ b/vp8/common/duck_io.h
@@ -1,116 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef _duck_io_h
-#define _duck_io_h
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#if defined (_WIN32)
-    typedef __int64 int64_t;
-#elif defined(__MWERKS__)
-    typedef long long int64_t;
-#elif defined(__APPLE__) || defined(__POWERPC)
-#include <ppc/types.h>
-#else
-    typedef long long int64_t;
-#endif
-
-    typedef struct
-    {
-        int64_t  offset;     // offset to start from
-        int    blocking;    // non-zero for blocking
-    } re_open_t;
-
-
-    typedef enum
-    {
-        SAL_ERR_MAX                 = -10,
-        SAL_ERROR                   = -11, // Default error
-        SAL_ERR_WSASTARTUP          = -12,
-        SAL_ERR_SOCKET_CREATE       = -13,
-        SAL_ERR_RESOLVING_HOSTNAME  = -14,
-        SAL_ERR_SERVER_CONNECTION   = -15,
-        SAL_ERR_SENDING_DATA        = -16,
-        SAL_ERR_RECEIVING_DATA      = -17,
-        SAL_ERR_404_FILE_NOT_FOUND  = -18,
-        SAL_ERR_PARSING_HTTP_HEADER = -19,
-        SAL_ERR_PARSING_CONTENT_LEN = -20,
-        SAL_ERR_CONNECTION_TIMEOUT  = -21,
-        SAL_ERR_FILE_OPEN_FAILED    = -22,
-        SAL_ERR_MIN                 = -23
-    } SAL_ERR; /* EMH 1-15-03 */
-
-
-    typedef struct sal_err_map_temp
-    {
-        SAL_ERR code;
-        const char *decode;
-
-    } sal_err_map_t;
-
-
-    static char *sal_err_text(SAL_ERR e)
-    {
-        int t;
-        const sal_err_map_t g_sal_err_map[] =
-        {
-            {   SAL_ERR_WSASTARTUP,             "Error with WSAStartup"         },
-            {   SAL_ERR_SOCKET_CREATE,          "Error creating socket"         },
-            {   SAL_ERR_RESOLVING_HOSTNAME,     "Error resolving hostname"      },
-            {   SAL_ERR_SERVER_CONNECTION,      "Error connecting to server"    },
-            {   SAL_ERR_SENDING_DATA,           "Error sending data"            },
-            {   SAL_ERR_RECEIVING_DATA,         "Error receiving data"          },
-            {   SAL_ERR_404_FILE_NOT_FOUND,     "Error file not found "         },
-            {   SAL_ERR_PARSING_HTTP_HEADER,    "Error parsing http header"     },
-            {   SAL_ERR_PARSING_CONTENT_LEN,    "Error parsing content length"  },
-            {   SAL_ERR_CONNECTION_TIMEOUT,     "Error Connection timed out"    },
-            {   SAL_ERR_FILE_OPEN_FAILED,       "Error opening file"            }
-        };
-
-        for (t = 0; t < sizeof(g_sal_err_map) / sizeof(sal_err_map_t); t++)
-        {
-            if (e == g_sal_err_map[t].code)
-                return (char *) g_sal_err_map[t].decode;
-        }
-
-        return 0;
-    }
-
-
-
-
-
-
-
-    int duck_open(const char *fname, unsigned long user_data);
-
-    void duck_close(int ghndl);
-
-    int duck_read(int ghndl, unsigned char *buf, int nbytes);
-
-    int64_t duck_seek(int g_hndl, int64_t offs, int origin);
-
-    int duck_read_finished(int han, int flag); /* FWG 7-9-99 */
-
-    int duck_name(int handle, char name[], size_t max_len); /* EMH 9-23-03 */
-
-    int duck_read_blocking(int handle, unsigned char *buffer, int bytes); /* EMH 9-23-03 */
-
-    int64_t duck_available_data(int handle); /* EMH 10-23-03 */
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -134,13 +134,51 @@ void vp8_find_near_mvs
    best_mv->as_int = near_mvs[0].as_int;
    nearest->as_int = near_mvs[CNT_NEAREST].as_int;
    nearby->as_int = near_mvs[CNT_NEAR].as_int;
-
-    //TODO: move clamp outside findnearmv
-    vp8_clamp_mv2(nearest, xd);
-    vp8_clamp_mv2(nearby, xd);
-    vp8_clamp_mv2(best_mv, xd);
 }

+/* Store the negation of *src in *inv, then pass both to vp8_clamp_mv2(). */
+static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd)
+{
+    inv->as_mv.row = src->as_mv.row * -1;
+    inv->as_mv.col = src->as_mv.col * -1;
+    vp8_clamp_mv2(inv, xd);   /* the negated copy */
+    vp8_clamp_mv2(src, xd);   /* the caller's original vector as well */
+}
+
+/* Fill both sign-bias slots of mode_mv_sb/best_mv_sb; returns refframe's bias. */
+int vp8_find_near_mvs_bias
+(
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv mode_mv_sb[2][MB_MODE_COUNT],
+    int_mv best_mv_sb[2],
+    int cnt[4],
+    int refframe,
+    int *ref_frame_sign_bias
+)
+{
+    int sign_bias = ref_frame_sign_bias[refframe];
+    /* NOTE(review): used as an index into 2-entry arrays - assumes 0 or 1. */
+    vp8_find_near_mvs(xd,
+                      here,
+                      &mode_mv_sb[sign_bias][NEARESTMV],
+                      &mode_mv_sb[sign_bias][NEARMV],
+                      &best_mv_sb[sign_bias],
+                      cnt,
+                      refframe,
+                      ref_frame_sign_bias);
+    /* Negate each result into the [!sign_bias] slot; both copies get clamped. */
+    invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV],
+                         &mode_mv_sb[sign_bias][NEARESTMV], xd);
+    invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV],
+                         &mode_mv_sb[sign_bias][NEARMV], xd);
+    invert_and_clamp_mvs(&best_mv_sb[!sign_bias],
+                         &best_mv_sb[sign_bias], xd);
+
+    return sign_bias;
+}
+
+
 vp8_prob *vp8_mv_ref_probs(
    vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
 )
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -60,10 +60,10 @@ static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
                                int mb_to_bottom_edge)
 {
    unsigned int need_to_clamp;
-    need_to_clamp = (mv->as_mv.col < mb_to_left_edge) ? 1 : 0;
-    need_to_clamp |= (mv->as_mv.col > mb_to_right_edge) ? 1 : 0;
-    need_to_clamp |= (mv->as_mv.row < mb_to_top_edge) ? 1 : 0;
-    need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge) ? 1 : 0;
+    need_to_clamp = (mv->as_mv.col < mb_to_left_edge);
+    need_to_clamp |= (mv->as_mv.col > mb_to_right_edge);
+    need_to_clamp |= (mv->as_mv.row < mb_to_top_edge);
+    need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge);
    return need_to_clamp;
 }

@@ -77,6 +77,19 @@ void vp8_find_near_mvs
    int *ref_frame_sign_bias
 );

+
+int vp8_find_near_mvs_bias
+(
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv mode_mv_sb[2][MB_MODE_COUNT],
+    int_mv best_mv_sb[2],
+    int cnt[4],
+    int refframe,
+    int *ref_frame_sign_bias
+);
+
+
 vp8_prob *vp8_mv_ref_probs(
    vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
 );
--- a/vp8/common/g_common.h
+++ b/vp8/common/g_common.h
@@ -1,21 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-extern void (*vp8_clear_system_state)(void);
-extern void (*vp8_plane_add_noise)(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int DPitch, int q);
-extern void (*de_interlace)
-(
-    unsigned char *src_ptr,
-    unsigned char *dst_ptr,
-    int Width,
-    int Height,
-    int Stride
-);
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -10,7 +10,6 @@


 #include "vpx_config.h"
-#include "vp8/common/g_common.h"
 #include "vp8/common/subpixel.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/recon.h"
@@ -70,6 +69,14 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
 #if CONFIG_RUNTIME_CPU_DETECT
    VP8_COMMON_RTCD *rtcd = &ctx->rtcd;

+
+    rtcd->dequant.block             = vp8_dequantize_b_c;
+    rtcd->dequant.idct_add          = vp8_dequant_idct_add_c;
+    rtcd->dequant.idct_add_y_block  = vp8_dequant_idct_add_y_block_c;
+    rtcd->dequant.idct_add_uv_block =
+        vp8_dequant_idct_add_uv_block_c;
+
+
    rtcd->idct.idct16       = vp8_short_idct4x4llm_c;
    rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_c;
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c
@@ -10,6 +10,7 @@

 #include "vpx_config.h"
 #include "vp8/common/idct.h"
+#include "dequantize.h"

 void vp8_dequant_idct_add_c(short *input, short *dq,
                            unsigned char *dest, int stride);
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c
@@ -1,56 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "invtrans.h"
-
-
-void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
-                             int pitch)
-{
-    if (*b->eob > 1)
-    {
-        IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch,
-              *(b->base_dst) + b->dst, b->dst_stride);
-    }
-    else
-    {
-        IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch,
-                         *(b->base_dst) + b->dst, b->dst_stride);
-    }
-
-}
-
-void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
-    int i;
-
-    if(x->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        /* do 2nd order transform on the dc block */
-        IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->dqcoeff);
-    }
-
-    for (i = 0; i < 16; i++)
-    {
-        vp8_inverse_transform_b(rtcd, &x->block[i], 16);
-    }
-
-}
-void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
-    int i;
-
-    for (i = 16; i < 24; i++)
-    {
-        vp8_inverse_transform_b(rtcd, &x->block[i], 8);
-    }
-
-}
--- a/vp8/common/invtrans.h
+++ b/vp8/common/invtrans.h
@@ -15,9 +15,49 @@
 #include "vpx_config.h"
 #include "idct.h"
 #include "blockd.h"
-extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch);
-extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
-extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
-extern void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+#include "onyxc_int.h"

+#if CONFIG_MULTITHREAD
+#include "vpx_mem/vpx_mem.h"
+#endif
+
+static void eob_adjust(char *eobs, short *diff)
+{
+    /* The idct can only skip if both the dc and eob are zero: raise a zero eob when diff[0] is set. */
+    int js;
+    for(js = 0; js < 16; js++)      /* covers eobs[0..15] */
+    {
+        if((eobs[js] == 0) && (diff[0] != 0))
+            eobs[js]++;             /* 0 -> 1 */
+        diff+=16;                   /* advance 16 shorts for the next entry */
+    }
+}
+
+static void vp8_inverse_transform_mby(MACROBLOCKD *xd,
+                                      const VP8_COMMON_RTCD *rtcd)
+{
+    short *DQC = xd->dequant_y1;    /* kept only when mode == SPLITMV */
+    /* Other modes: inverse Walsh on block 24 first, then switch to dequant_y1_dc. */
+    if (xd->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        /* 2nd order transform on the dc block: iwalsh16, or iwalsh1 if eobs[24] <= 1 */
+        if (xd->eobs[24] > 1)
+        {
+            IDCT_INVOKE(&rtcd->idct, iwalsh16)
+                (&xd->block[24].dqcoeff[0], xd->qcoeff);
+        }
+        else
+        {
+            IDCT_INVOKE(&rtcd->idct, iwalsh1)
+                (&xd->block[24].dqcoeff[0], xd->qcoeff);
+        }
+        eob_adjust(xd->eobs, xd->qcoeff);   /* runs after the Walsh output lands in qcoeff */
+
+        DQC = xd->dequant_y1_dc;
+    }
+    DEQUANT_INVOKE (&rtcd->dequant, idct_add_y_block)   /* output goes to xd->dst.y_buffer */
+                    (xd->qcoeff, DQC,
+                     xd->dst.y_buffer,
+                     xd->dst.y_stride, xd->eobs);
+}
 #endif
--- a/vp8/common/mbpitch.c
+++ b/vp8/common/mbpitch.c
@@ -87,7 +87,6 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
    {
        for (c = 0; c < 4; c++)
        {
-            x->block[r*4+c].diff      = &x->diff[r * 4 * 16 + c * 4];
            x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
        }
    }
@@ -96,7 +95,6 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
    {
        for (c = 0; c < 2; c++)
        {
-            x->block[16+r*2+c].diff      = &x->diff[256 + r * 4 * 8 + c * 4];
            x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;

        }
@@ -106,14 +104,11 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
    {
        for (c = 0; c < 2; c++)
        {
-            x->block[20+r*2+c].diff      = &x->diff[320+ r * 4 * 8 + c * 4];
            x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;

        }
    }

-    x->block[24].diff = &x->diff[384];
-
    for (r = 0; r < 25; r++)
    {
        x->block[r].qcoeff  = x->qcoeff  + r * 16;
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -22,9 +22,9 @@ extern "C"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 #include "vpx_scale/yv12config.h"
-#include "type_aliases.h"
 #include "ppflags.h"
-    typedef int *VP8_PTR;
+
+    struct VP8_COMP;

    /* Create/destroy static data structures. */

@@ -147,10 +147,14 @@ extern "C"
        int over_shoot_pct;

        // buffering parameters
-        int64_t starting_buffer_level;  // in seconds
+        int64_t starting_buffer_level;  // in bytes
        int64_t optimal_buffer_level;
        int64_t maximum_buffer_size;

+        int64_t starting_buffer_level_in_ms;  // in milli-seconds
+        int64_t optimal_buffer_level_in_ms;
+        int64_t maximum_buffer_size_in_ms;
+
        // controlling quality
        int fixed_q;
        int worst_allowed_q;
@@ -226,27 +230,27 @@ extern "C"

    void vp8_initialize();

-    VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf);
-    void vp8_remove_compressor(VP8_PTR *comp);
+    struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf);
+    void vp8_remove_compressor(struct VP8_COMP* *comp);

-    void vp8_init_config(VP8_PTR onyx, VP8_CONFIG *oxcf);
-    void vp8_change_config(VP8_PTR onyx, VP8_CONFIG *oxcf);
+    void vp8_init_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
+    void vp8_change_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);

 // receive a frames worth of data caller can assume that a copy of this frame is made
 // and not just a copy of the pointer..
-    int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
-    int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush);
-    int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
+    int vp8_receive_raw_frame(struct VP8_COMP* comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
+    int vp8_get_compressed_data(struct VP8_COMP* comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush);
+    int vp8_get_preview_raw_frame(struct VP8_COMP* comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);

-    int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);
-    int vp8_update_reference(VP8_PTR comp, int ref_frame_flags);
-    int vp8_get_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
-    int vp8_set_reference(VP8_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
-    int vp8_update_entropy(VP8_PTR comp, int update);
-    int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
-    int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols);
-    int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
-    int vp8_get_quantizer(VP8_PTR c);
+    int vp8_use_as_reference(struct VP8_COMP* comp, int ref_frame_flags);
+    int vp8_update_reference(struct VP8_COMP* comp, int ref_frame_flags);
+    int vp8_get_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    int vp8_set_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    int vp8_update_entropy(struct VP8_COMP* comp, int update);
+    int vp8_set_roimap(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
+    int vp8_set_active_map(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols);
+    int vp8_set_internal_size(struct VP8_COMP* comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+    int vp8_get_quantizer(struct VP8_COMP* c);

 #ifdef __cplusplus
 }
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -22,6 +22,7 @@
 #if CONFIG_POSTPROC
 #include "postproc.h"
 #endif
+#include "dequantize.h"

 /*#ifdef PACKET_TESTING*/
 #include "header.h"
@@ -73,6 +74,7 @@ typedef enum
 typedef struct VP8_COMMON_RTCD
 {
 #if CONFIG_RUNTIME_CPU_DETECT
+    vp8_dequant_rtcd_vtable_t        dequant;
    vp8_idct_rtcd_vtable_t        idct;
    vp8_recon_rtcd_vtable_t       recon;
    vp8_subpix_rtcd_vtable_t      subpix;
@@ -91,9 +93,9 @@ typedef struct VP8Common
 {
    struct vpx_internal_error_info  error;

-    DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]);
+    DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]);
+    DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]);

    int Width;
    int Height;
@@ -112,6 +114,8 @@ typedef struct VP8Common
    YV12_BUFFER_CONFIG post_proc_buffer;
    YV12_BUFFER_CONFIG temp_scale_frame;

+    YV12_BUFFER_CONFIG post_proc_buffer_int;
+    int post_proc_buffer_int_used;

    FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
    FRAME_TYPE frame_type;
--- a/vp8/common/onyxd.h
+++ b/vp8/common/onyxd.h
@@ -18,13 +18,13 @@
 extern "C"
 {
 #endif
-#include "type_aliases.h"
 #include "vpx_scale/yv12config.h"
 #include "ppflags.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_codec.h"

-    typedef void   *VP8D_PTR;
+    struct VP8D_COMP;
+
    typedef struct
    {
        int     Width;
@@ -49,19 +49,19 @@ extern "C"

    void vp8dx_initialize(void);

-    void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x);
+    void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x);

-    int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);
+    int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst);

-    int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, int64_t time_stamp);
-    int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
+    int vp8dx_receive_compressed_data(struct VP8D_COMP* comp, unsigned long size, const unsigned char *dest, int64_t time_stamp);
+    int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);

-    vpx_codec_err_t vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
-    vpx_codec_err_t vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);

-    VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf);
+    struct VP8D_COMP* vp8dx_create_decompressor(VP8D_CONFIG *oxcf);

-    void vp8dx_remove_decompressor(VP8D_PTR comp);
+    void vp8dx_remove_decompressor(struct VP8D_COMP* comp);

 #ifdef __cplusplus
 }
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -12,9 +12,12 @@
 #include "vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "postproc.h"
+#include "common.h"
+#include "recon.h"
 #include "vpx_scale/yv12extend.h"
 #include "vpx_scale/vpxscale.h"
 #include "systemdependent.h"
+#include "../encoder/variance.h"

 #include <math.h>
 #include <stdlib.h>
@@ -26,6 +29,7 @@
    ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)

 /* global constants */
+#define MFQE_PRECISION 4
 #if CONFIG_POSTPROC_VISUALIZER
 static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
 {
@@ -121,7 +125,6 @@ const short vp8_rv[] =
    0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
 };

-
 extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
 extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
 /***********************************************************************************************************
@@ -175,6 +178,12 @@ void vp8_post_proc_down_and_across_c
        p_src = dst_ptr;
        p_dst = dst_ptr;

+        for (i = -8; i<0; i++)
+          p_src[i]=p_src[0];
+
+        for (i = cols; i<cols+8; i++)
+          p_src[i]=p_src[cols-1];
+
        for (i = 0; i < 8; i++)
            d[i] = p_src[i];

@@ -225,12 +234,19 @@ void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int co
    unsigned char *s = src;
    unsigned char d[16];

-
    for (r = 0; r < rows; r++)
    {
        int sumsq = 0;
        int sum   = 0;

+        for (i = -8; i<0; i++)
+          s[i]=s[0];
+
+        // 17 avoids valgrind warning - we buffer values in c in d
+        // and only write them when we've read 8 ahead...
+        for (i = cols; i<cols+17; i++)
+          s[i]=s[cols-1];
+
        for (i = -8; i <= 6; i++)
        {
            sumsq += s[i] * s[i];
@@ -269,7 +285,7 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
    int r, c, i;
    const short *rv3 = &vp8_rv[63&rand()];

-    for (c = 0; c < cols; c++)
+    for (c = 0; c < cols; c++ )
    {
        unsigned char *s = &dst[c];
        int sumsq = 0;
@@ -277,6 +293,14 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
        unsigned char d[16];
        const short *rv2 = rv3 + ((c * 17) & 127);

+        for (i = -8; i < 0; i++)
+          s[i*pitch]=s[0];
+
+        // 17 avoids valgrind warning - we buffer values in c in d
+        // and only write them when we've read 8 ahead...
+        for (i = rows; i < rows+17; i++)
+          s[i*pitch]=s[(rows-1)*pitch];
+
        for (i = -8; i <= 6; i++)
        {
            sumsq += s[i*pitch] * s[i*pitch];
@@ -317,17 +341,18 @@ static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG         *source,
    POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
    POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));

+
    POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
    POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);

 }

 void vp8_deblock(YV12_BUFFER_CONFIG         *source,
-                        YV12_BUFFER_CONFIG         *post,
-                        int                         q,
-                        int                         low_var_thresh,
-                        int                         flag,
-                        vp8_postproc_rtcd_vtable_t *rtcd)
+                 YV12_BUFFER_CONFIG         *post,
+                 int                         q,
+                 int                         low_var_thresh,
+                 int                         flag,
+                 vp8_postproc_rtcd_vtable_t *rtcd)
 {
    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
    int ppl = (int)(level + .5);
@@ -672,12 +697,219 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
 }


+/* Weighted in-place blend of a size x size region of src into dst. */
+static void mfqe_blend_plane(unsigned char *src, int src_stride, unsigned char *dst,
+                             int dst_stride, int size, int ifactor)
+{
+    static const int roundoff = (1 << (MFQE_PRECISION - 1));
+    const int icfactor = (1 << MFQE_PRECISION) - ifactor;
+    int row, col;
+
+    /* TODO: SIMD optimize this section */
+    for (row = 0; row < size; ++row, src += src_stride, dst += dst_stride)
+        for (col = 0; col < size; ++col)
+            dst[col] = (int)((src[col] * ifactor + dst[col] * icfactor + roundoff) >> MFQE_PRECISION);
+}
+
+/* Copies a size x size region from src to dst one row at a time. */
+static void mfqe_copy_plane(unsigned char *src, int src_stride,
+                            unsigned char *dst, int dst_stride, int size)
+{
+    int row;
+
+    for (row = 0; row < size; ++row, src += src_stride, dst += dst_stride)
+        vpx_memcpy(dst, src, size);
+}
+
+/* Multiframe quality enhancement of a single block.
+ *
+ * y/u/v address the block in the current frame and yd/ud/vd the matching
+ * block in the destination; the chroma blocks are blksize/2 square. If the
+ * scaled luma SAD between the two is below a threshold built from
+ * qcurr - qprev, the destination block's activity and qprev, the current
+ * block is blended into the destination (which stays untouched when the
+ * blend weight comes out as zero); otherwise it is copied over it.
+ */
+static void multiframe_quality_enhance_block
+(
+    int blksize, /* Currently only values supported are 16, 8, 4 */
+    int qcurr,
+    int qprev,
+    unsigned char *y,
+    unsigned char *u,
+    unsigned char *v,
+    int y_stride,
+    int uv_stride,
+    unsigned char *yd,
+    unsigned char *ud,
+    unsigned char *vd,
+    int yd_stride,
+    int uvd_stride
+)
+{
+    static const unsigned char VP8_ZEROS[16] = { 0 };
+    const int half = blksize >> 1;
+    const int qdiff = qcurr - qprev;
+    unsigned int act, sse, sad, thr;
+
+    switch (blksize)
+    {
+    case 16:
+        act = (vp8_variance_var16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        sad = (vp8_variance_sad16x16(y, y_stride, yd, yd_stride, 0)+128)>>8;
+        break;
+    case 8:
+        act = (vp8_variance_var8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        sad = (vp8_variance_sad8x8(y, y_stride, yd, yd_stride, 0)+32)>>6;
+        break;
+    default:
+        act = (vp8_variance_var4x4(yd, yd_stride, VP8_ZEROS, 0, &sse)+8)>>4;
+        sad = (vp8_variance_sad4x4(y, y_stride, yd, yd_stride, 0)+8)>>4;
+        break;
+    }
+
+    /* thr = qdiff/8 + log2(act) + log4(qprev) */
+    thr = (qdiff>>3);
+    for (act >>= 1; act; act >>= 1) thr++;
+    for (qprev >>= 2; qprev; qprev >>= 2) thr++;
+
+    if (sad >= thr)
+    {
+        /* Not similar enough to blend: take the current block as is. */
+        if (blksize == 16)
+        {
+            vp8_recon_copy16x16(y, y_stride, yd, yd_stride);
+            vp8_recon_copy8x8(u, uv_stride, ud, uvd_stride);
+            vp8_recon_copy8x8(v, uv_stride, vd, uvd_stride);
+            return;
+        }
+        if (blksize == 8)
+            vp8_recon_copy8x8(y, y_stride, yd, yd_stride);
+        else
+            mfqe_copy_plane(y, y_stride, yd, yd_stride, blksize);
+        mfqe_copy_plane(u, uv_stride, ud, uvd_stride, half);
+        mfqe_copy_plane(v, uv_stride, vd, uvd_stride, half);
+    }
+    else
+    {
+        int ifactor = (sad << MFQE_PRECISION) / thr;
+        ifactor >>= (qdiff >> 5);
+        if (ifactor)
+        {
+            mfqe_blend_plane(y, y_stride, yd, yd_stride, blksize, ifactor);
+            mfqe_blend_plane(u, uv_stride, ud, uvd_stride, half, ifactor);
+            mfqe_blend_plane(v, uv_stride, vd, uvd_stride, half, ifactor);
+        }
+    }
+}
+
 #if CONFIG_RUNTIME_CPU_DETECT
 #define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
 #else
 #define RTCD_VTABLE(oci) NULL
 #endif

+/* Runs multiframe quality enhancement over every macroblock of a frame.
+ *
+ * Reads cm->frame_to_show and writes cm->post_proc_buffer, passing
+ * cm->base_qindex and cm->postproc_state.last_base_qindex down as the
+ * current and previous quantizer indices. Key-frame macroblocks, and
+ * inter-frame macroblocks whose motion vector row and column are both within
+ * +/-10, are handed to multiframe_quality_enhance_block(): as four 8x8 luma
+ * quadrants (4x4 chroma) for B_PRED and SPLITMV, as a single 16x16 block
+ * otherwise. Every other macroblock is copied unchanged.
+ */
+void vp8_multiframe_quality_enhance
+(
+    VP8_COMMON *cm
+)
+{
+    YV12_BUFFER_CONFIG *src = cm->frame_to_show;
+    YV12_BUFFER_CONFIG *dst = &cm->post_proc_buffer;
+    const FRAME_TYPE frame_type = cm->frame_type;
+    const int qcurr = cm->base_qindex;
+    const int qprev = cm->postproc_state.last_base_qindex;
+
+    /* Mode info (motion vectors etc.) of the macroblock being processed;
+     * advanced once per macroblock and once more at the end of each row. */
+    const MODE_INFO *mi = cm->mi;
+
+    /* Start of the current macroblock row in each source/destination plane */
+    unsigned char *y_row = src->y_buffer;
+    unsigned char *u_row = src->u_buffer;
+    unsigned char *v_row = src->v_buffer;
+    unsigned char *yd_row = dst->y_buffer;
+    unsigned char *ud_row = dst->u_buffer;
+    unsigned char *vd_row = dst->v_buffer;
+    int mb_row, mb_col;
+
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++, mi++)
+        {
+            unsigned char *y = y_row + 16 * mb_col;
+            unsigned char *u = u_row + 8 * mb_col;
+            unsigned char *v = v_row + 8 * mb_col;
+            unsigned char *yd = yd_row + 16 * mb_col;
+            unsigned char *ud = ud_row + 8 * mb_col;
+            unsigned char *vd = vd_row + 8 * mb_col;
+
+            /* if motion is high there will likely be no benefit */
+            const int enhance =
+                frame_type == KEY_FRAME ||
+                (frame_type == INTER_FRAME &&
+                 abs(mi->mbmi.mv.as_mv.row) <= 10 &&
+                 abs(mi->mbmi.mv.as_mv.col) <= 10);
+
+            if (!enhance)
+            {
+                /* Plain copy of the whole macroblock. */
+                vp8_recon_copy16x16(y, src->y_stride, yd, dst->y_stride);
+                vp8_recon_copy8x8(u, src->uv_stride, ud, dst->uv_stride);
+                vp8_recon_copy8x8(v, src->uv_stride, vd, dst->uv_stride);
+            }
+            else if (mi->mbmi.mode == B_PRED || mi->mbmi.mode == SPLITMV)
+            {
+                /* Four 8x8 luma quadrants, each with its 4x4 chroma. */
+                int quad;
+
+                for (quad = 0; quad < 4; ++quad)
+                {
+                    const int i = quad >> 1; /* quadrant row */
+                    const int j = quad & 1;  /* quadrant column */
+                    multiframe_quality_enhance_block(8, qcurr, qprev,
+                        y + 8*(i*src->y_stride+j),
+                        u + 4*(i*src->uv_stride+j),
+                        v + 4*(i*src->uv_stride+j),
+                        src->y_stride, src->uv_stride,
+                        yd + 8*(i*dst->y_stride+j),
+                        ud + 4*(i*dst->uv_stride+j),
+                        vd + 4*(i*dst->uv_stride+j),
+                        dst->y_stride, dst->uv_stride);
+                }
+            }
+            else
+            {
+                /* Whole macroblock as a single 16x16 block. */
+                multiframe_quality_enhance_block(16, qcurr, qprev,
+                    y, u, v,
+                    src->y_stride, src->uv_stride,
+                    yd, ud, vd,
+                    dst->y_stride, dst->uv_stride);
+            }
+        }
+
+        /* Step every plane down by one macroblock row. */
+        y_row += 16 * src->y_stride;
+        u_row += 8 * src->uv_stride;
+        v_row += 8 * src->uv_stride;
+        yd_row += 16 * dst->y_stride;
+        ud_row += 8 * dst->uv_stride;
+        vd_row += 8 * dst->uv_stride;
+        mi++;         /* Skip border mb */
+    }
+}
+
 int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
 {
    int q = oci->filter_level * 10 / 6;
@@ -699,27 +931,69 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
        dest->y_width = oci->Width;
        dest->y_height = oci->Height;
        dest->uv_height = dest->y_height / 2;
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
        return 0;
+    }

+    /* Allocate post_proc_buffer_int if needed */
+    if ((flags & VP8D_MFQE) && !oci->post_proc_buffer_int_used)
+    {
+        if ((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK))
+        {
+            if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int, oci->Width, oci->Height, VP8BORDERINPIXELS) >= 0)
+            {
+                oci->post_proc_buffer_int_used = 1;
+            }
+            // ensure the intermediate buffer starts out filled with a fixed
+            // value (126) so that post proc doesn't pull random data in from edge
+            vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,126,(&oci->post_proc_buffer)->frame_size);
+
+        }
    }

 #if ARCH_X86||ARCH_X86_64
    vpx_reset_mmx_state();
 #endif

-    if (flags & VP8D_DEMACROBLOCK)
+    if ((flags & VP8D_MFQE) &&
+         oci->current_video_frame >= 2 &&
+         oci->base_qindex - oci->postproc_state.last_base_qindex >= 10)
+    {
+        vp8_multiframe_quality_enhance(oci);
+        if (((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) &&
+            oci->post_proc_buffer_int_used)
+        {
+            vp8_yv12_copy_frame_ptr(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
+            if (flags & VP8D_DEMACROBLOCK)
+            {
+                vp8_deblock_and_de_macro_block(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
+                                               q + (deblock_level - 5) * 10, 1, 0, RTCD_VTABLE(oci));
+            }
+            else if (flags & VP8D_DEBLOCK)
+            {
+                vp8_deblock(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
+                            q, 1, 0, RTCD_VTABLE(oci));
+            }
+        }
+        /* Move partially towards the base q of the previous frame */
+        oci->postproc_state.last_base_qindex = (3*oci->postproc_state.last_base_qindex + oci->base_qindex)>>2;
+    }
+    else if (flags & VP8D_DEMACROBLOCK)
    {
        vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
                                       q + (deblock_level - 5) * 10, 1, 0, RTCD_VTABLE(oci));
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
    }
    else if (flags & VP8D_DEBLOCK)
    {
        vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
                    q, 1, 0, RTCD_VTABLE(oci));
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
    }
    else
    {
        vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
    }

    if (flags & VP8D_ADDNOISE)
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -104,6 +104,7 @@ struct postproc_state
    int           last_q;
    int           last_noise;
    char          noise[3072];
+    int           last_base_qindex;
    DECLARE_ALIGNED(16, char, blackclamp[16]);
    DECLARE_ALIGNED(16, char, whiteclamp[16]);
    DECLARE_ALIGNED(16, char, bothclamp[16]);
--- a/vp8/common/ppc/systemdependent.c
+++ b/vp8/common/ppc/systemdependent.c
@@ -9,7 +9,6 @@
 */


-#include "g_common.h"
 #include "subpixel.h"
 #include "loopfilter.h"
 #include "recon.h"
--- a/vp8/common/ppflags.h
+++ b/vp8/common/ppflags.h
@@ -23,7 +23,8 @@ enum
    VP8D_DEBUG_TXT_RATE_INFO    = 1<<6,
    VP8D_DEBUG_DRAW_MV          = 1<<7,
    VP8D_DEBUG_CLR_BLK_MODES    = 1<<8,
-    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
+    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9,
+    VP8D_MFQE                   = 1<<10
 };

 typedef struct
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -334,11 +334,12 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)


 /*encoder only*/
-void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+                                         unsigned char *dst_y,
+                                         int dst_ystride)
 {
    unsigned char *ptr_base;
    unsigned char *ptr;
-    unsigned char *pred_ptr = x->predictor;
    int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
    int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
    int pre_stride = x->block[0].pre_stride;
@@ -348,11 +349,13 @@ void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)

    if ((mv_row | mv_col) & 7)
    {
-        x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
+        x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7,
+                                 dst_y, dst_ystride);
    }
    else
    {
-        RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
+        RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y,
+            dst_ystride);
    }
 }

@@ -596,69 +599,3 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
        build_inter4x4_predictors_mb(xd);
    }
 }
-/* encoder only*/
-static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x)
-{
-    int i;
-
-    if (x->mode_info_context->mbmi.partitioning < 3)
-    {
-        x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
-        x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
-        x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
-        x->block[10].bmi = x->mode_info_context->bmi[10];
-
-        build_inter_predictors4b(x, &x->block[ 0], x->block[ 0].predictor, 16);
-        build_inter_predictors4b(x, &x->block[ 2], x->block[ 2].predictor, 16);
-        build_inter_predictors4b(x, &x->block[ 8], x->block[ 8].predictor, 16);
-        build_inter_predictors4b(x, &x->block[10], x->block[10].predictor, 16);
-    }
-    else
-    {
-        for (i = 0; i < 16; i += 2)
-        {
-            BLOCKD *d0 = &x->block[i];
-            BLOCKD *d1 = &x->block[i+1];
-
-            x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
-            x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
-
-            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                build_inter_predictors2b(x, d0, d0->predictor, 16);
-            else
-            {
-                build_inter_predictors_b(d0, d0->predictor, 16, x->subpixel_predict);
-                build_inter_predictors_b(d1, d1->predictor, 16, x->subpixel_predict);
-            }
-
-        }
-
-    }
-
-    for (i = 16; i < 24; i += 2)
-    {
-        BLOCKD *d0 = &x->block[i];
-        BLOCKD *d1 = &x->block[i+1];
-
-        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-            build_inter_predictors2b(x, d0, d0->predictor, 8);
-        else
-        {
-            build_inter_predictors_b(d0, d0->predictor, 8, x->subpixel_predict);
-            build_inter_predictors_b(d1, d1->predictor, 8, x->subpixel_predict);
-        }
-    }
-}
-void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd)
-{
-    if (xd->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        vp8_build_inter16x16_predictors_mb(xd, xd->predictor, &xd->predictor[256],
-                                           &xd->predictor[320], 16, 8);
-    }
-    else
-    {
-        build_4x4uvmvs(xd);
-        build_inter4x4_predictors_mb_e(xd);
-    }
-}
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -21,11 +21,13 @@ extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
                                               int dst_uvstride);


-extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf);
+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+                                                unsigned char *dst_y,
+                                                int dst_ystride);
+extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
+                                         vp8_subpix_fn_t sppf);

 extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
 extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd);

 #endif
--- a/vp8/common/rtcd.c
+++ b/vp8/common/rtcd.c
@@ -1,12 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "vpx_config.h"
-#define RTCD_C
-#include "vpx_rtcd.h"
--- a/vp8/common/type_aliases.h
+++ b/vp8/common/type_aliases.h
@@ -1,117 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     type_aliases.h
-*
-*   Description  :     Standard type aliases
-*
-****************************************************************************/
-#ifndef __INC_TYPE_ALIASES_H
-#define __INC_TYPE_ALIASES_H
-
-/****************************************************************************
-* Macros
-****************************************************************************/
-#define EXPORT
-#define IMPORT          extern      /* Used to declare imported data & routines */
-#define PRIVATE         static      /* Used to declare & define module-local data */
-#define LOCAL           static      /* Used to define all persistent routine-local data */
-#define STD_IN_PATH     0           /* Standard input path */
-#define STD_OUT_PATH    1           /* Standard output path */
-#define STD_ERR_PATH    2           /* Standard error path */
-#define STD_IN_FILE     stdin       /* Standard input file pointer */
-#define STD_OUT_FILE    stdout      /* Standard output file pointer */
-#define STD_ERR_FILE    stderr      /* Standard error file pointer */
-#define max_int         0x7FFFFFFF
-
-#define __export
-#define _export
-
-#define CCONV
-
-#ifndef NULL
-#ifdef __cplusplus
-#define NULL    0
-#else
-#define NULL    ((void *)0)
-#endif
-#endif
-
-#ifndef FALSE
-#define FALSE   0
-#endif
-
-#ifndef TRUE
-#define TRUE    1
-#endif
-
-/****************************************************************************
-* Typedefs
-****************************************************************************/
-#ifndef TYPE_INT8
-#define TYPE_INT8
-typedef signed char     INT8;
-#endif
-
-#ifndef TYPE_INT16
-/*#define TYPE_INT16*/
-typedef signed short    INT16;
-#endif
-
-#ifndef TYPE_INT32
-/*#define TYPE_INT32*/
-typedef signed int      INT32;
-#endif
-
-#ifndef TYPE_UINT8
-/*#define TYPE_UINT8*/
-typedef unsigned char   UINT8;
-#endif
-
-#ifndef TYPE_UINT32
-/*#define TYPE_UINT32*/
-typedef unsigned int    UINT32;
-#endif
-
-#ifndef TYPE_UINT16
-/*#define TYPE_UINT16*/
-typedef unsigned short  UINT16;
-#endif
-
-#ifndef TYPE_BOOL
-/*#define TYPE_BOOL*/
-typedef int             BOOL;
-#endif
-
-typedef unsigned char   BOOLEAN;
-
-#ifdef _MSC_VER
-typedef __int64 INT64;
-#else
-
-#ifndef TYPE_INT64
-#ifdef _TMS320C6X
-/* for now we only have 40bits */
-typedef long INT64;
-#else
-typedef long long INT64;
-#endif
-#endif
-
-#endif
-
-/* Floating point */
-typedef  double         FLOAT64;
-typedef  float          FLOAT32;
-
-#endif
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/decoder/x86/dequantize_mmx.asm
--- a/vp8/common/x86/dequantize_x86.h
+++ b/vp8/common/x86/dequantize_x86.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DEQUANTIZE_X86_H
+#define DEQUANTIZE_X86_H
+
+
+/* Declarations and name mappings for the x86 (MMX / SSE2) dequantize routines.
+ *
+ * Note: this platform is commonly built for runtime CPU detection. If you
+ * modify any of the function mappings present in this file, be sure to also
+ * update them in the function pointer initialization code.
+ */
+#if HAVE_MMX
+extern prototype_dequant_block(vp8_dequantize_b_mmx);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);
+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT /* bind the generic names to the MMX versions */
+#undef  vp8_dequant_block
+#define vp8_dequant_block vp8_dequantize_b_mmx
+
+#undef  vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
+
+#undef vp8_dequant_idct_add_y_block
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx
+
+#undef vp8_dequant_idct_add_uv_block
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_mmx
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+#endif /* HAVE_MMX */
+
+#if HAVE_SSE2
+extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);
+extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT /* re-bind y/uv block names, replacing any earlier mapping */
+#undef vp8_dequant_idct_add_y_block
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
+
+#undef vp8_dequant_idct_add_uv_block
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+#endif /* HAVE_SSE2 */
+
+#endif /* DEQUANTIZE_X86_H */
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ b/vp8/decoder/x86/idct_blk_mmx.c
@@ -9,8 +9,18 @@
 */

 #include "vpx_config.h"
-#include "vpx_rtcd.h"
 #include "vp8/common/idct.h"
+#include "vp8/common/dequantize.h"
+
+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+/* C entry point for the MMX block dequantizer: forwards the block's qcoeff
+ * and dqcoeff buffers, together with DQC, to vp8_dequantize_b_impl_mmx(). */
+void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC)
+{
+    vp8_dequantize_b_impl_mmx((short *) d->qcoeff, (short *) d->dqcoeff,
+                              DQC);
+}

 void vp8_dequant_idct_add_y_block_mmx
            (short *q, short *dq,
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ b/vp8/decoder/x86/idct_blk_sse2.c
@@ -10,6 +10,7 @@

 #include "vpx_config.h"
 #include "vp8/common/idct.h"
+#include "vp8/common/dequantize.h"

 void vp8_idct_dequant_0_2x_sse2
            (short *q, short *dq ,
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -1385,52 +1385,54 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
-    push        rsi
-    push        rdi
    ; end prolog

-        mov         rsi, arg(0)             ;src_ptr
+        mov         rcx, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
-        mov         rdx, arg(2)             ;blimit
-        movdqa      xmm3, XMMWORD PTR [rdx]

-        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
+        lea         rdx, [rcx + rax]
        neg         rax

        ; calculate mask
-        movdqa      xmm1, [rsi+2*rax]       ; p1
-        movdqa      xmm0, [rdi]             ; q1
+        movdqa      xmm0, [rdx]             ; q1
+        mov         rdx, arg(2)             ;blimit
+        movdqa      xmm1, [rcx+2*rax]       ; p1
+
        movdqa      xmm2, xmm1
        movdqa      xmm7, xmm0
-        movdqa      xmm4, xmm0
+
        psubusb     xmm0, xmm1              ; q1-=p1
-        psubusb     xmm1, xmm4              ; p1-=q1
+        psubusb     xmm1, xmm7              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psrlw       xmm1, 1                 ; abs(p1-q1)/2

-        movdqa      xmm5, [rsi+rax]         ; p0
-        movdqa      xmm4, [rsi]             ; q0
+        movdqa      xmm3, XMMWORD PTR [rdx]
+
+        movdqa      xmm5, [rcx+rax]         ; p0
+        movdqa      xmm4, [rcx]             ; q0
        movdqa      xmm0, xmm4              ; q0
        movdqa      xmm6, xmm5              ; p0
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)
+
+        movdqa      xmm4, [GLOBAL(t80)]
+
        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
        pxor        xmm3, xmm3
        pcmpeqb     xmm5, xmm3

+
        ; start work on filters
-        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
+        pxor        xmm2, xmm4     ; p1 offset to convert to signed values
+        pxor        xmm7, xmm4     ; q1 offset to convert to signed values
        psubsb      xmm2, xmm7              ; p1 - q1

-        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
-        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
+        pxor        xmm6, xmm4     ; offset to convert to signed values
+        pxor        xmm0, xmm4     ; offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
@@ -1438,42 +1440,36 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about

-        ; do + 4 side
-        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        movdqa      xmm1, xmm5              ; get a copy of filters
-        psraw       xmm1, 11                ; arithmetic shift right 11
-        psllw       xmm1, 8                 ; shift left 8 to put it back
-
-        por         xmm0, xmm1              ; put the two together to get result
-
-        psubsb      xmm3, xmm0              ; q0-= q0 add
-        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi], xmm3             ; write back
-
-        ; now do +3 side
+        paddsb      xmm5, [GLOBAL(t4)]      ;  3* (q0 - p0) + (p1 - q1) + 4
+        movdqa      xmm0, xmm5
        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4

-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        psraw       xmm5, 11                ; arithmetic shift right 11
-        psllw       xmm5, 8                 ; shift left 8 to put it back
-        por         xmm0, xmm5              ; put the two together to get result
+        movdqa      xmm1, [GLOBAL(te0)]
+        movdqa      xmm2, [GLOBAL(t1f)]

+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm0              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm0, 3
+        pand        xmm0, xmm2              ;clear out upper 3 bits
+        por         xmm0, xmm7              ;add sign
+        psubsb      xmm3, xmm0              ; q0-= q0sz add

-        paddsb      xmm6, xmm0              ; p0+= p0 add
-        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi+rax], xmm6         ; write back
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm5              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm5, 3
+        pand        xmm5, xmm2              ;clear out upper 3 bits
+        por         xmm5, xmm7              ;add sign
+        paddsb      xmm6, xmm5              ; p0+= p0 add
+
+        pxor        xmm3, xmm4     ; unoffset
+        movdqa      [rcx], xmm3             ; write back
+
+        pxor        xmm6, xmm4     ; unoffset
+        movdqa      [rcx+rax], xmm6         ; write back

    ; begin epilog
-    pop rdi
-    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
@@ -1536,9 +1532,6 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

-        movdqa      t0,         xmm0                    ; save to t0
-        movdqa      t1,         xmm2                    ; save to t1
-
        lea         rsi,        [rsi + rax*8]
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
@@ -1551,26 +1544,24 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90

-        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
+        movd        xmm1,       [rsi + rax*2]           ; a3 a2 a1 a0
        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
-        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
+        movd        xmm3,       [rdi + rax*2]           ; b3 b2 b1 b0
        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
-        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
-        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
+        punpckldq   xmm1,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
+        punpckldq   xmm3,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
-        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+        punpcklbw   xmm1,       xmm3                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

-        movdqa      xmm1,       xmm4
-        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+        movdqa      xmm7,       xmm4
+        punpcklwd   xmm4,       xmm1                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+        punpckhwd   xmm7,       xmm1                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        movdqa      xmm6,       xmm4
-        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+        punpckldq   xmm4,       xmm7                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+        punpckhdq   xmm6,       xmm7                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

-        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        movdqa      xmm1,       xmm0
        movdqa      xmm3,       xmm2

@@ -1579,6 +1570,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

+        mov         rdx,        arg(2)                          ;blimit
+
        ; calculate mask
        movdqa      xmm6,       xmm0                            ; p1
        movdqa      xmm7,       xmm3                            ; q1
@@ -1588,6 +1581,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
        psrlw       xmm6,       1                               ; abs(p1-q1)/2

+        movdqa      xmm7, [rdx]
+
        movdqa      xmm5,       xmm1                            ; p0
        movdqa      xmm4,       xmm2                            ; q0
        psubusb     xmm5,       xmm2                            ; p0-=q0
@@ -1596,8 +1591,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2

-        mov         rdx,        arg(2)                          ;blimit
-        movdqa      xmm7, XMMWORD PTR [rdx]
+        movdqa      xmm4, [GLOBAL(t80)]

        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
        pxor        xmm7,        xmm7
@@ -1607,59 +1601,48 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
        movdqa        t0,        xmm0
        movdqa        t1,        xmm3

-        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
-        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
-
+        pxor        xmm0,        xmm4                  ; p1 offset to convert to signed values
+        pxor        xmm3,        xmm4                  ; q1 offset to convert to signed values
        psubsb      xmm0,        xmm3                           ; p1 - q1
+
        movdqa      xmm6,        xmm1                           ; p0
+;        movdqa      xmm7,        xmm2                           ; q0

-        movdqa      xmm7,        xmm2                           ; q0
-        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
-
-        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
-        movdqa      xmm3,        xmm7                           ; offseted ; q0
-
-        psubsb      xmm7,        xmm6                           ; q0 - p0
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
+        pxor        xmm6,        xmm4                  ; offset to convert to signed values
+        pxor        xmm2,        xmm4                  ; offset to convert to signed values

+        movdqa      xmm3,        xmm2                           ; offseted ; q0
+        psubsb      xmm2,        xmm6                           ; q0 - p0
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5,        xmm0                           ; mask filter values we don't care about

-
        paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-        psllw       xmm0,        8                              ; shift left 8
-
-        psraw       xmm0,        3                              ; arithmetic shift right 11
-        psrlw       xmm0,        8
-
-        movdqa      xmm7,        xmm5                           ; get a copy of filters
-        psraw       xmm7,        11                             ; arithmetic shift right 11
-
-        psllw       xmm7,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm7                           ; put the two together to get result
-
-        psubsb      xmm3,        xmm0                           ; q0-= q0sz add
-        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
-
-        ; now do +3 side
+        movdqa      xmm0, xmm5
        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
-        movdqa      xmm0,        xmm5                           ; get a copy of filters

-        psllw       xmm0,        8                              ; shift left 8
-        psraw       xmm0,        3                              ; arithmetic shift right 11
+        movdqa  xmm1, [GLOBAL(te0)]
+        movdqa  xmm2, [GLOBAL(t1f)]

-        psrlw       xmm0,        8
-        psraw       xmm5,        11                             ; arithmetic shift right 11
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm0              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm0, 3
+        pand        xmm0, xmm2              ;clear out upper 3 bits
+        por         xmm0, xmm7              ;add sign
+        psubsb      xmm3, xmm0              ; q0-= q0sz add

-        psllw       xmm5,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm5                           ; put the two together to get result
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm5              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm5, 3
+        pand        xmm5, xmm2              ;clear out upper 3 bits
+        por         xmm5, xmm7              ;add sign
+        paddsb      xmm6, xmm5              ; p0+= p0 add

-        paddsb      xmm6,        xmm0                           ; p0+= p0 add
-        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
+        pxor        xmm3,        xmm4                  ; unoffset   q0
+        pxor        xmm6,        xmm4                  ; unoffset   p0

        movdqa      xmm0,        t0                             ; p1
        movdqa      xmm4,        t1                             ; q1
@@ -1763,3 +1746,9 @@ s9:
 align 16
 s63:
    times 8 dw 0x003f
+align 16
+te0:
+    times 16 db 0xe0
+align 16
+t1f:
+    times 16 db 0x1f
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -151,6 +151,23 @@ sym(vp8_post_proc_down_and_across_mmx):
        sub         rsi, rdx
        sub         rdi, rdx

+        ; dup the first byte into the left border 8 times
+        movq        mm1,   [rdi]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+
+        mov         rdx,    -8
+        movq        [rdi+rdx], mm1
+
+        ; dup the last byte into the right border
+        movsxd      rdx,    dword arg(5)
+        movq        mm1,   [rdi + rdx + -1]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+        movq        [rdi+rdx], mm1
+

        push        rax
        xor         rdx,    rdx
@@ -298,8 +315,36 @@ sym(vp8_mbpost_proc_down_mmx):
            pxor        mm0,        mm0     ;

            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+
+            ; this copies the last row down into the border 8 rows
+            ; NOTE(review): the source row is taken at offset (arg(2) - 9) * pitch
+            ; from rsi - confirm that this is the intended "last row".
+            mov         rdi,        rsi
+            mov         rdx,        arg(2)
+            sub         rdx,        9
+            imul        rdx,        rax
+            lea         rdi,        [rdi+rdx]
+            movq        mm1,        QWORD ptr[rdi]              ; 8 bytes of the row to replicate
+            mov         rcx,        8
+.init_borderd:                                                   ; initialize borders
+            lea         rdi,        [rdi + rax]
+            movq        [rdi],      mm1                         ; store the row loaded above (MMX register, not xmm1)
+
+            dec         rcx
+            jne         .init_borderd
+
            neg         rax                                     ; rax = -pitch

+            ; this copies the first row up into the border 8 rows
+            mov         rdi,        rsi
+            movq        mm1,        QWORD ptr[rdi]              ; first row
+            mov         rcx,        8
+.init_border                                                    ; initialize borders
+            lea         rdi,        [rdi + rax]
+            movq        [rdi],      mm1
+
+            dec         rcx
+            jne         .init_border
+
+
            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
            neg         rax

--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -139,6 +139,24 @@ sym(vp8_post_proc_down_and_across_xmm):
        sub         rsi,        rdx
        sub         rdi,        rdx

+
+        ; dup the first byte into the left border 8 times
+        movq        mm1,   [rdi]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+
+        mov         rdx,    -8
+        movq        [rdi+rdx], mm1
+
+        ; dup the last byte into the right border
+        movsxd      rdx,    dword arg(5)
+        movq        mm1,   [rdi + rdx + -1]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+        movq        [rdi+rdx], mm1
+
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-8];

@@ -287,12 +305,40 @@ sym(vp8_mbpost_proc_down_xmm):
            pxor        xmm0,       xmm0        ;

            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+
+            ; this copies the last row down into the border 8 rows
+            mov         rdi,        rsi
+            mov         rdx,        arg(2)
+            sub         rdx,        9
+            imul        rdx,        rax
+            lea         rdi,        [rdi+rdx]                   ; rdi = rsi + (arg(2) - 9) * pitch
+            movq        xmm1,       QWORD ptr[rdi]              ; 8 bytes of the row to replicate
+            mov         rcx,        8                           ; 8 border rows
+.init_borderd                                                    ; initialize borders
+            lea         rdi,        [rdi + rax]                 ; step down one row
+            movq        [rdi],      xmm1
+
+            dec         rcx
+            jne         .init_borderd
+
            neg         rax                                     ; rax = -pitch

+            ; this copies the first row up into the border 8 rows
+            mov         rdi,        rsi
+            movq        xmm1,       QWORD ptr[rdi]              ; first row
+            mov         rcx,        8
+.init_border                                                    ; initialize borders
+            lea         rdi,        [rdi + rax]
+            movq        [rdi],      xmm1
+
+            dec         rcx
+            jne         .init_border
+
+
+
            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
            neg         rax

-
            pxor        xmm5,       xmm5
            pxor        xmm6,       xmm6        ;

@@ -480,7 +526,25 @@ sym(vp8_mbpost_proc_across_ip_xmm):
        xor         rdx,    rdx ;sumsq=0;
        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s
+
+
+        ; dup the first byte into the left border 8 times
+        movq        mm1,   [rsi]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1                  ; byte 0 replicated into all 8 bytes
+
         mov         rdi,    -8
+        movq        [rsi+rdi], mm1
+
+        ; dup the last byte into the right border
+        movsxd      rdx,    dword arg(3)
+        movq        mm1,   [rsi + rdx + -1]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+        movq        [rsi+rdx], mm1
+        ; rdx was zeroed above as sumsq and then borrowed for arg(3);
+        ; clear it again so the accumulation loop starts from 0.
+        xor         rdx,    rdx ;sumsq=0;
+
 .ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -559,12 +559,492 @@ sym(vp8_intra_pred_uv_ho_%1):
 vp8_intra_pred_uv_ho mmx2
 vp8_intra_pred_uv_ho ssse3

+;void vp8_intra_pred_y_dc_sse2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+; Fills 16 rows of 16 bytes at dst with
+; (sum of the 16 bytes above src + sum of the 16 bytes left of src + 16) >> 5.
+; The top-row load and all stores use movdqa, which requires 16-byte
+; aligned addresses.
+global sym(vp8_intra_pred_y_dc_sse2)
+sym(vp8_intra_pred_y_dc_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from top
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax                 ; rsi = row above the block
+    pxor        xmm0,       xmm0
+    movdqa      xmm1,       [rsi]               ; 16 top pixels
+    psadbw      xmm1,       xmm0                ; byte sum of each 8-byte half
+    movq        xmm2,       xmm1                ; sum of the low half
+    punpckhqdq  xmm1,       xmm1                ; sum of the high half -> low qword
+    paddw       xmm1,       xmm2                ; low word = sum of all 16 top pixels
+
+    ; from left
+    dec         rsi                             ; rsi = src - src_stride - 1
+    lea         rdi,        [rax*3]             ; rdi = 3 * src_stride
+    movzx       ecx,        byte [rsi+rax]      ; left pixel of row 0 starts the sum
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]         ; advance 4 rows
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]         ; advance 4 rows
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]         ; advance 4 rows
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*4]    ; left pixel of row 15
+    add         ecx,        edx                 ; ecx = sum of the 16 left pixels
+
+    ; add up
+    pextrw      edx,        xmm1, 0x0           ; edx = top sum
+    lea         edx,        [edx+ecx+16]        ; top + left + rounding term
+    sar         edx,        5                   ; divide by 32
+    movd        xmm1,       edx
+    ; FIXME use pshufb for ssse3 version
+    pshuflw     xmm1,       xmm1, 0x0           ; replicate word 0 into the 4 low words
+    punpcklqdq  xmm1,       xmm1                ; ... and into all 8 words
+    packuswb    xmm1,       xmm1                ; pack to 16 identical bytes
+
+    ; write out
+    mov         rsi,        2                   ; 2 iterations of 8 rows
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride
+
+.label
+    movdqa [rdi      ],     xmm1
+    movdqa [rdi+rcx  ],     xmm1
+    movdqa [rdi+rcx*2],     xmm1
+    movdqa [rdi+rax  ],     xmm1
+    lea         rdi,        [rdi+rcx*4]
+    movdqa [rdi      ],     xmm1
+    movdqa [rdi+rcx  ],     xmm1
+    movdqa [rdi+rcx*2],     xmm1
+    movdqa [rdi+rax  ],     xmm1
+    lea         rdi,        [rdi+rcx*4]
+    dec         rsi
+    jnz .label
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_y_dctop_sse2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+; Fills 16 rows of 16 bytes at dst with
+; (sum of the 16 bytes above src + 8) >> 4; the left column is not read.
+; The top-row load and all stores use movdqa, which requires 16-byte
+; aligned addresses.
+global sym(vp8_intra_pred_y_dctop_sse2)
+sym(vp8_intra_pred_y_dctop_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    GET_GOT     rbx
+    ; end prolog
+
+    ; from top
+    mov         rcx,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rcx,        rax                 ; rcx = row above the block
+    pxor        xmm0,       xmm0
+    movdqa      xmm1,       [rcx]               ; 16 top pixels
+    psadbw      xmm1,       xmm0                ; byte sum of each 8-byte half
+    movdqa      xmm2,       xmm1
+    punpckhqdq  xmm1,       xmm1                ; sum of the high half -> low qword
+    paddw       xmm1,       xmm2                ; low word = sum of all 16 top pixels
+
+    ; add up
+    paddw       xmm1,       [GLOBAL(dc_8)]      ; rounding term
+    psraw       xmm1,       4                   ; divide by 16
+    ; FIXME use pshufb for ssse3 version
+    pshuflw     xmm1,       xmm1, 0x0           ; replicate word 0 into the 4 low words
+    punpcklqdq  xmm1,       xmm1                ; ... and into all 8 words
+    packuswb    xmm1,       xmm1                ; pack to 16 identical bytes
+
+    ; write out
+    mov         rsi,        2                   ; 2 iterations of 8 rows
+    mov         rdx,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride
+
+.label
+    movdqa [rdx      ],     xmm1
+    movdqa [rdx+rcx  ],     xmm1
+    movdqa [rdx+rcx*2],     xmm1
+    movdqa [rdx+rax  ],     xmm1
+    lea         rdx,        [rdx+rcx*4]
+    movdqa [rdx      ],     xmm1
+    movdqa [rdx+rcx  ],     xmm1
+    movdqa [rdx+rcx*2],     xmm1
+    movdqa [rdx+rax  ],     xmm1
+    lea         rdx,        [rdx+rcx*4]
+    dec         rsi
+    jnz .label
+
+    ; begin epilog
+    RESTORE_GOT
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_y_dcleft_sse2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+; Fills 16 rows of 16 bytes at dst with
+; (sum of the 16 bytes in the column left of src + 8) >> 4; the row above
+; is not read. All stores use movdqa, which requires 16-byte aligned
+; addresses.
+global sym(vp8_intra_pred_y_dcleft_sse2)
+sym(vp8_intra_pred_y_dcleft_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from left
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    dec         rsi                             ; rsi = src - 1 (left pixel of row 0)
+    lea         rdi,        [rax*3]             ; rdi = 3 * src_stride
+    movzx       ecx,        byte [rsi]          ; row 0 starts the sum
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]         ; advance 4 rows
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]         ; advance 4 rows
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]         ; advance 4 rows
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]      ; left pixel of row 15
+    lea         edx,        [ecx+edx+8]         ; total of 16 pixels + rounding term
+
+    ; add up
+    shr         edx,        4                   ; divide by 16
+    movd        xmm1,       edx
+    ; FIXME use pshufb for ssse3 version
+    pshuflw     xmm1,       xmm1, 0x0           ; replicate word 0 into the 4 low words
+    punpcklqdq  xmm1,       xmm1                ; ... and into all 8 words
+    packuswb    xmm1,       xmm1                ; pack to 16 identical bytes
+
+    ; write out
+    mov         rsi,        2                   ; 2 iterations of 8 rows
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride
+
+.label
+    movdqa [rdi      ],     xmm1
+    movdqa [rdi+rcx  ],     xmm1
+    movdqa [rdi+rcx*2],     xmm1
+    movdqa [rdi+rax  ],     xmm1
+    lea         rdi,        [rdi+rcx*4]
+    movdqa [rdi      ],     xmm1
+    movdqa [rdi+rcx  ],     xmm1
+    movdqa [rdi+rcx*2],     xmm1
+    movdqa [rdi+rax  ],     xmm1
+    lea         rdi,        [rdi+rcx*4]
+    dec         rsi
+    jnz .label
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_y_dc128_sse2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+; Fills 16 rows of 16 bytes at dst with the dc_128 constant; src and
+; src_stride are not read. All stores use movdqa, which requires 16-byte
+; aligned addresses.
+global sym(vp8_intra_pred_y_dc128_sse2)
+sym(vp8_intra_pred_y_dc128_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    GET_GOT     rbx
+    ; end prolog
+
+    ; write out
+    mov         rsi,        2                   ; 2 iterations of 8 rows
+    movdqa      xmm1,       [GLOBAL(dc_128)]
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride
+
+.label
+    movdqa [rax      ],     xmm1
+    movdqa [rax+rdx  ],     xmm1
+    movdqa [rax+rdx*2],     xmm1
+    movdqa [rax+rcx  ],     xmm1
+    lea         rax,        [rax+rdx*4]
+    movdqa [rax      ],     xmm1
+    movdqa [rax+rdx  ],     xmm1
+    movdqa [rax+rdx*2],     xmm1
+    movdqa [rax+rcx  ],     xmm1
+    lea         rax,        [rax+rdx*4]
+    dec         rsi
+    jnz .label
+
+    ; begin epilog
+    RESTORE_GOT
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_y_tm_%1(        ; %1 = sse2 or ssse3
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+; For each of 16 rows writes 16 bytes:
+;   dst[r][c] = saturate_u8(left[r] + top[c] - topleft)
+; where top is the row above src, left is the column left of src and
+; topleft is the byte at src - src_stride - 1.
+; The top-row load and all stores use movdqa, which requires 16-byte
+; aligned addresses.
+; NOTE(review): xmm6 and xmm7 are written below but no SAVE_XMM/RESTORE_XMM
+; pair is present; those registers are callee-saved in the Win64 ABI -
+; confirm whether they need to be preserved here.
+%macro vp8_intra_pred_y_tm 1
+global sym(vp8_intra_pred_y_tm_%1)
+sym(vp8_intra_pred_y_tm_%1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    GET_GOT     rbx
+    ; end prolog
+
+    ; read top row
+    mov         edx,        8                   ; 8 iterations of 2 rows
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax                 ; rsi = row above the block
+    pxor        xmm0,       xmm0
+%ifidn %1, ssse3
+    movdqa      xmm3,       [GLOBAL(dc_1024)]   ; pshufb mask: byte 0 into every word's low byte
+%endif
+    movdqa      xmm1,       [rsi]
+    movdqa      xmm2,       xmm1
+    punpcklbw   xmm1,       xmm0                ; top[0..7] as words
+    punpckhbw   xmm2,       xmm0                ; top[8..15] as words
+
+    ; set up left ptrs and subtract topleft
+    movd        xmm4,       [rsi-1]             ; byte 0 = topleft
+    lea         rsi,        [rsi+rax-1]         ; rsi = left pixel of row 0
+%ifidn %1, sse2
+    punpcklbw   xmm4,       xmm0
+    pshuflw     xmm4,       xmm4, 0x0
+    punpcklqdq  xmm4,       xmm4                ; topleft in all 8 words
+%else
+    pshufb      xmm4,       xmm3                ; topleft in all 8 words
+%endif
+    psubw       xmm1,       xmm4                ; top[0..7]  - topleft
+    psubw       xmm2,       xmm4                ; top[8..15] - topleft
+
+    ; set up dest ptrs
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+vp8_intra_pred_y_tm_%1_loop:
+    movd        xmm4,       [rsi]               ; byte 0 = left pixel of this row
+    movd        xmm5,       [rsi+rax]           ; byte 0 = left pixel of the next row
+%ifidn %1, sse2
+    punpcklbw   xmm4,       xmm0
+    punpcklbw   xmm5,       xmm0
+    pshuflw     xmm4,       xmm4, 0x0
+    pshuflw     xmm5,       xmm5, 0x0
+    punpcklqdq  xmm4,       xmm4
+    punpcklqdq  xmm5,       xmm5
+%else
+    pshufb      xmm4,       xmm3
+    pshufb      xmm5,       xmm3
+%endif
+    movdqa      xmm6,       xmm4
+    movdqa      xmm7,       xmm5
+    paddw       xmm4,       xmm1                ; row r,   columns 0..7
+    paddw       xmm6,       xmm2                ; row r,   columns 8..15
+    paddw       xmm5,       xmm1                ; row r+1, columns 0..7
+    paddw       xmm7,       xmm2                ; row r+1, columns 8..15
+    packuswb    xmm4,       xmm6                ; saturate to unsigned bytes
+    packuswb    xmm5,       xmm7
+    movdqa [rdi    ],       xmm4
+    movdqa [rdi+rcx],       xmm5
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz vp8_intra_pred_y_tm_%1_loop
+
+    ; begin epilog
+    RESTORE_GOT
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endmacro
+
+vp8_intra_pred_y_tm sse2
+vp8_intra_pred_y_tm ssse3
+
+;void vp8_intra_pred_y_ve_sse2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+; Copies the 16 bytes of the row above src into each of 16 rows at dst.
+; The load and all stores use movdqa, which requires 16-byte aligned
+; addresses.
+global sym(vp8_intra_pred_y_ve_sse2)
+sym(vp8_intra_pred_y_ve_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    ; end prolog
+
+    ; read from top
+    mov         rax,        arg(2) ;src;
+    movsxd      rdx,        dword ptr arg(3) ;src_stride;
+    sub         rax,        rdx                 ; rax = row above the block
+    movdqa      xmm1,       [rax]
+
+    ; write out
+    mov         rsi,        2                   ; 2 iterations of 8 rows
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride
+
+.label
+    movdqa [rax      ],     xmm1
+    movdqa [rax+rdx  ],     xmm1
+    movdqa [rax+rdx*2],     xmm1
+    movdqa [rax+rcx  ],     xmm1
+    lea         rax,        [rax+rdx*4]
+    movdqa [rax      ],     xmm1
+    movdqa [rax+rdx  ],     xmm1
+    movdqa [rax+rdx*2],     xmm1
+    movdqa [rax+rcx  ],     xmm1
+    lea         rax,        [rax+rdx*4]
+    dec         rsi
+    jnz .label
+
+    ; begin epilog
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_intra_pred_y_ho_sse2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+; For each of 16 rows, replicates the byte immediately left of src on that
+; row into all 16 bytes of the matching dst row. Stores use movdqa, which
+; requires 16-byte aligned addresses.
+global sym(vp8_intra_pred_y_ho_sse2)
+sym(vp8_intra_pred_y_ho_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; read from left and write out
+    mov         edx,        8                   ; 8 iterations of 2 rows
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    dec         rsi                             ; rsi = src - 1 (left pixel of row 0)
+
+vp8_intra_pred_y_ho_sse2_loop:
+    movd        xmm0,       [rsi]               ; byte 0 = left pixel of this row
+    movd        xmm1,       [rsi+rax]           ; byte 0 = left pixel of the next row
+    ; FIXME use pshufb for ssse3 version
+    punpcklbw   xmm0,       xmm0                ; word 0 = pixel in both bytes
+    punpcklbw   xmm1,       xmm1
+    pshuflw     xmm0,       xmm0, 0x0           ; replicate word 0 into the 4 low words
+    pshuflw     xmm1,       xmm1, 0x0
+    punpcklqdq  xmm0,       xmm0                ; ... and into all 8 words (16 bytes)
+    punpcklqdq  xmm1,       xmm1
+    movdqa [rdi    ],       xmm0
+    movdqa [rdi+rcx],       xmm1
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz vp8_intra_pred_y_ho_sse2_loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
 SECTION_RODATA
+align 16
 dc_128:
-    times 8 db 128
+    times 16 db 128
 dc_4:
    times 4 dw 4
 align 16
+dc_8:
+    times 8 dw 8
+align 16
 dc_1024:
    times 8 dw 0x400
 align 16
--- a/vp8/common/x86/recon_wrapper_sse2.c
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -94,3 +94,69 @@ void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
                                        vp8_intra_pred_uv_tm_ssse3,
                                        vp8_intra_pred_uv_ho_ssse3);
 }
+
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dctop_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dcleft_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc128_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ho_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ve_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_ssse3);
+
+/* Dispatch 16x16 luma intra prediction to the x86 kernel for x's MB mode.
+ * Source pixels come from x->dst.y_buffer / x->dst.y_stride; the result is
+ * written to dst_y with row pitch dst_stride.  tm_func is the TM_PRED kernel
+ * (SSE2 or SSSE3).  Modes with no case below return without writing.
+ */
+static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
+                                               unsigned char *dst_y,
+                                               int dst_stride,
+                                               build_intra_predictors_mbuv_fn_t tm_func)
+{
+    /* Luma kernels share the function type declared for the mbuv kernels. */
+    build_intra_predictors_mbuv_fn_t fn;
+
+    switch (x->mode_info_context->mbmi.mode) {
+        case  V_PRED: fn = vp8_intra_pred_y_ve_sse2; break;
+        case  H_PRED: fn = vp8_intra_pred_y_ho_sse2; break;
+        case TM_PRED: fn = tm_func; break;
+        case DC_PRED:
+            /* The DC flavour depends on which neighbours are available. */
+            if (x->up_available)
+                fn = x->left_available ? vp8_intra_pred_y_dc_sse2
+                                       : vp8_intra_pred_y_dctop_sse2;
+            else
+                fn = x->left_available ? vp8_intra_pred_y_dcleft_sse2
+                                       : vp8_intra_pred_y_dc128_sse2;
+            break;
+        default: return;
+    }
+
+    fn(dst_y, dst_stride, x->dst.y_buffer, x->dst.y_stride);
+}
+
+void vp8_build_intra_predictors_mby_sse2(MACROBLOCKD *x)
+{
+    /* SSE2 TM kernel; the luma predictor lands in x->predictor (stride 16). */
+    vp8_build_intra_predictors_mby_x86(x, x->predictor, 16, vp8_intra_pred_y_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mby_ssse3(MACROBLOCKD *x)
+{
+    /* SSSE3 TM kernel; the luma predictor lands in x->predictor (stride 16). */
+    vp8_build_intra_predictors_mby_x86(x, x->predictor, 16, vp8_intra_pred_y_tm_ssse3);
+}
+
+void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x)
+{
+    /* SSE2 TM kernel; destination is x->dst.y_buffer with x->dst.y_stride. */
+    vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride, vp8_intra_pred_y_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x)
+{
+    /* SSSE3 TM kernel; destination is x->dst.y_buffer with x->dst.y_stride. */
+    vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,
+                                       vp8_intra_pred_y_tm_ssse3);
+}
--- a/vp8/common/x86/recon_x86.h
+++ b/vp8/common/x86/recon_x86.h
@@ -42,6 +42,8 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx);
 extern prototype_copy_block(vp8_copy_mem16x16_sse2);
 extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
 extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_sse2);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_sse2);

 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_copy16x16
@@ -53,12 +55,20 @@ extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
 #undef  vp8_recon_build_intra_predictors_mbuv_s
 #define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2

+#undef  vp8_recon_build_intra_predictors_mby
+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_sse2
+
+#undef  vp8_recon_build_intra_predictors_mby_s
+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_sse2
+
 #endif
 #endif

 #if HAVE_SSSE3
 extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_ssse3);
 extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_ssse3);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_ssse3);

 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_build_intra_predictors_mbuv
@@ -67,6 +77,12 @@ extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3)
 #undef  vp8_recon_build_intra_predictors_mbuv_s
 #define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3

+#undef  vp8_recon_build_intra_predictors_mby
+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_ssse3
+
+#undef  vp8_recon_build_intra_predictors_mby_s
+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_ssse3
+
 #endif
 #endif
 #endif
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -11,7 +11,6 @@

 #include "vpx_config.h"
 #include "vpx_ports/x86.h"
-#include "vp8/common/g_common.h"
 #include "vp8/common/subpixel.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/recon.h"
@@ -37,6 +36,11 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)

    if (flags & HAS_MMX)
    {
+        rtcd->dequant.block               = vp8_dequantize_b_mmx;
+        rtcd->dequant.idct_add            = vp8_dequant_idct_add_mmx;
+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_mmx;
+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_mmx;
+
        rtcd->idct.idct16       = vp8_short_idct4x4llm_mmx;
        rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_mmx;
@@ -81,6 +85,13 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
            vp8_build_intra_predictors_mbuv_sse2;
        rtcd->recon.build_intra_predictors_mbuv_s =
            vp8_build_intra_predictors_mbuv_s_sse2;
+        rtcd->recon.build_intra_predictors_mby =
+            vp8_build_intra_predictors_mby_sse2;
+        rtcd->recon.build_intra_predictors_mby_s =
+            vp8_build_intra_predictors_mby_s_sse2;
+
+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;
+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;

        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_sse2;

@@ -124,6 +135,10 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
            vp8_build_intra_predictors_mbuv_ssse3;
        rtcd->recon.build_intra_predictors_mbuv_s =
            vp8_build_intra_predictors_mbuv_s_ssse3;
+        rtcd->recon.build_intra_predictors_mby =
+            vp8_build_intra_predictors_mby_ssse3;
+        rtcd->recon.build_intra_predictors_mby_s =
+            vp8_build_intra_predictors_mby_s_ssse3;
    }
 #endif

--- a/vp8/decoder/arm/arm_dsystemdependent.c
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -9,25 +9,31 @@
 */


-#ifndef _littlend_h
-#define _littlend_h
+#include "vpx_config.h"
+#include "vpx_ports/arm.h"
+#include "vp8/decoder/onyxd_int.h"

-#if defined(__cplusplus)
-extern "C" {
+void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+    int flags = pbi->common.rtcd.flags;
+
+#if HAVE_ARMV5TE
+    if (flags & HAS_EDSP)
+    {
+    }
 #endif

-#define invert2(x) (x)
-#define invert4(x) (x)
+#if HAVE_ARMV6
+    if (flags & HAS_MEDIA)
+    {
+    }
+#endif

-#define low_byte(x) (unsigned char)x
-#define mid1Byte(x) (unsigned char)(x >> 8)
-#define mid2Byte(x) (unsigned char)(x >> 16)
-#define high_byte(x) (unsigned char)(x >> 24)
-
-#define SWAPENDS 0
-
-#if defined(__cplusplus)
+#if HAVE_ARMV7
+    if (flags & HAS_NEON)
+    {
+    }
+#endif
+#endif
 }
-#endif
-
-#endif
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -9,14 +9,13 @@
 */


-#include "vpx_config.h"
-#include "vpx_rtcd.h"
 #include "onyxd_int.h"
 #include "vp8/common/header.h"
 #include "vp8/common/reconintra.h"
 #include "vp8/common/reconintra4x4.h"
 #include "vp8/common/recon.h"
 #include "vp8/common/reconinter.h"
+#include "vp8/common/dequantize.h"
 #include "detokenize.h"
 #include "vp8/common/invtrans.h"
 #include "vp8/common/alloccommon.h"
@@ -33,6 +32,7 @@
 #endif
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/idct.h"
+
 #include "vp8/common/threading.h"
 #include "decoderthreading.h"
 #include "dboolhuff.h"
@@ -42,7 +42,6 @@

 void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
 {
-    int i;
    int Q;
    VP8_COMMON *const pc = & pbi->common;

@@ -52,15 +51,9 @@ void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
        pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
        pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);

-        /* all the ac values = ; */
-        for (i = 1; i < 16; i++)
-        {
-            int rc = vp8_default_zig_zag1d[i];
-
-            pc->Y1dequant[Q][rc] = (short)vp8_ac_yquant(Q);
-            pc->Y2dequant[Q][rc] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
-            pc->UVdequant[Q][rc] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
-        }
+        pc->Y1dequant[Q][1] = (short)vp8_ac_yquant(Q);
+        pc->Y2dequant[Q][1] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
+        pc->UVdequant[Q][1] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
    }
 }

@@ -88,19 +81,19 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
    else
        QIndex = pc->base_qindex;

-    /* Set up the block level dequant pointers */
-    for (i = 0; i < 16; i++)
+    /* Set up the macroblock dequant constants */
+    xd->dequant_y1_dc[0] = 1;
+    xd->dequant_y1[0] = pc->Y1dequant[QIndex][0];
+    xd->dequant_y2[0] = pc->Y2dequant[QIndex][0];
+    xd->dequant_uv[0] = pc->UVdequant[QIndex][0];
+
+    for (i = 1; i < 16; i++)
    {
-        xd->block[i].dequant = pc->Y1dequant[QIndex];
+        xd->dequant_y1_dc[i] =
+        xd->dequant_y1[i] = pc->Y1dequant[QIndex][1];
+        xd->dequant_y2[i] = pc->Y2dequant[QIndex][1];
+        xd->dequant_uv[i] = pc->UVdequant[QIndex][1];
    }
-
-    for (i = 16; i < 24; i++)
-    {
-        xd->block[i].dequant = pc->UVdequant[QIndex];
-    }
-
-    xd->block[24].dequant = pc->Y2dequant[QIndex];
-
 }

 #if CONFIG_RUNTIME_CPU_DETECT
@@ -109,32 +102,12 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
 #define RTCD_VTABLE(x) NULL
 #endif

-/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
- *  to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
- */
-static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
-{
-    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
-    {
-        RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
-        RECON_INVOKE(&pbi->common.rtcd.recon,
-                     build_intra_predictors_mby_s)(xd);
-    }
-    else
-    {
-        vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
-                                           xd->dst.u_buffer, xd->dst.v_buffer,
-                                           xd->dst.y_stride, xd->dst.uv_stride);
-    }
-}
-
 static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
                              unsigned int mb_idx)
 {
-    int eobtotal = 0;
-    int throw_residual = 0;
    MB_PREDICTION_MODE mode;
    int i;
+    int corruption_detected = 0;

    if (xd->mode_info_context->mbmi.mb_skip_coeff)
    {
@@ -142,28 +115,52 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
    }
    else if (!vp8dx_bool_error(xd->current_bc))
    {
+        int eobtotal;
        eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+        /* Special case:  Force the loopfilter to skip when eobtotal is zero */
+        xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
    }

-
-
    mode = xd->mode_info_context->mbmi.mode;

-    if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV &&
-            !vp8dx_bool_error(xd->current_bc))
-    {
-        /* Special case:  Force the loopfilter to skip when eobtotal and
-         * mb_skip_coeff are zero.
-         * */
-        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-
-        skip_recon_mb(pbi, xd);
-        return;
-    }
-
    if (xd->segmentation_enabled)
        mb_init_dequantizer(pbi, xd);

+
+#if CONFIG_ERROR_CONCEALMENT
+
+    if(pbi->ec_active)
+    {
+        int throw_residual;
+        /* When we have independent partitions we can apply residual even
+         * though other partitions within the frame are corrupt.
+         */
+        throw_residual = (!pbi->independent_partitions &&
+                          pbi->frame_corrupt_residual);
+        throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
+        if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+        {
+            /* MB with corrupt residuals or corrupt mode/motion vectors.
+             * Better to use the predictor as reconstruction.
+             */
+            pbi->frame_corrupt_residual = 1;
+            vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+            vp8_conceal_corrupt_mb(xd);
+
+
+            corruption_detected = 1;
+
+            /* force idct to be skipped for B_PRED and use the
+             * prediction only for reconstruction
+             * */
+            vpx_memset(xd->eobs, 0, 25);
+        }
+    }
+#endif
+
+
    /* do prediction */
    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
    {
@@ -173,121 +170,114 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
        {
            RECON_INVOKE(&pbi->common.rtcd.recon,
                         build_intra_predictors_mby_s)(xd);
-        } else {
+        }
+        else
+        {
+            short *DQC = xd->dequant_y1;
+
+            /* clear out residual eob info */
+            if(xd->mode_info_context->mbmi.mb_skip_coeff)
+                vpx_memset(xd->eobs, 0, 25);
+
            vp8_intra_prediction_down_copy(xd);
+
+            for (i = 0; i < 16; i++)
+            {
+                BLOCKD *b = &xd->block[i];
+                int b_mode = xd->mode_info_context->bmi[i].as_mode;
+
+                RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
+                              ( *(b->base_dst) + b->dst, b->dst_stride, b_mode,
+                                *(b->base_dst) + b->dst, b->dst_stride );
+
+                if (xd->eobs[i])
+                {
+                    if (xd->eobs[i] > 1)
+                    {
+                        DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
+                            (b->qcoeff, DQC,
+                            *(b->base_dst) + b->dst, b->dst_stride);
+                    }
+                    else
+                    {
+                        IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+                            (b->qcoeff[0] * DQC[0],
+                            *(b->base_dst) + b->dst, b->dst_stride,
+                            *(b->base_dst) + b->dst, b->dst_stride);
+                        ((int *)b->qcoeff)[0] = 0;
+                    }
+                }
+            }
        }
    }
    else
    {
        vp8_build_inter_predictors_mb(xd);
    }
-    /* When we have independent partitions we can apply residual even
-     * though other partitions within the frame are corrupt.
-     */
-    throw_residual = (!pbi->independent_partitions &&
-                      pbi->frame_corrupt_residual);
-    throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+

 #if CONFIG_ERROR_CONCEALMENT
-    if (pbi->ec_active &&
-        (mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+    if (corruption_detected)
    {
-        /* MB with corrupt residuals or corrupt mode/motion vectors.
-         * Better to use the predictor as reconstruction.
-         */
-        pbi->frame_corrupt_residual = 1;
-        vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
-        vp8_conceal_corrupt_mb(xd);
        return;
    }
 #endif

-    /* dequantization and idct */
-    if (mode == B_PRED)
+    if(!xd->mode_info_context->mbmi.mb_skip_coeff)
    {
-        for (i = 0; i < 16; i++)
+        /* dequantization and idct */
+        if (mode != B_PRED)
        {
-            BLOCKD *b = &xd->block[i];
-            int b_mode = xd->mode_info_context->bmi[i].as_mode;
+            short *DQC = xd->dequant_y1;

-            RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
-                          ( *(b->base_dst) + b->dst, b->dst_stride, b_mode,
-                            *(b->base_dst) + b->dst, b->dst_stride );
-
-            if (xd->eobs[i] )
+            if (mode != SPLITMV)
            {
-                if (xd->eobs[i] > 1)
+                BLOCKD *b = &xd->block[24];
+
+                /* do 2nd order transform on the dc block */
+                if (xd->eobs[24] > 1)
                {
-                    vp8_dequant_idct_add
-                        (b->qcoeff, b->dequant,
-                        *(b->base_dst) + b->dst, b->dst_stride);
+                    DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b,
+                        xd->dequant_y2);
+
+                    IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
+                        xd->qcoeff);
+                    ((int *)b->qcoeff)[0] = 0;
+                    ((int *)b->qcoeff)[1] = 0;
+                    ((int *)b->qcoeff)[2] = 0;
+                    ((int *)b->qcoeff)[3] = 0;
+                    ((int *)b->qcoeff)[4] = 0;
+                    ((int *)b->qcoeff)[5] = 0;
+                    ((int *)b->qcoeff)[6] = 0;
+                    ((int *)b->qcoeff)[7] = 0;
                }
                else
                {
-                    IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
-                        (b->qcoeff[0] * b->dequant[0],
-                        *(b->base_dst) + b->dst, b->dst_stride,
-                        *(b->base_dst) + b->dst, b->dst_stride);
+                    b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+                    IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],
+                        xd->qcoeff);
                    ((int *)b->qcoeff)[0] = 0;
                }
-            }
-        }
-    }
-    else
-    {
-        short *DQC = xd->block[0].dequant;

-        /* save the dc dequant constant in case it is overridden */
-        short dc_dequant_temp = DQC[0];
-
-        if (mode != SPLITMV)
-        {
-            BLOCKD *b = &xd->block[24];
-
-            /* do 2nd order transform on the dc block */
-            if (xd->eobs[24] > 1)
-            {
-                vp8_dequantize_b(b);
-
-                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
-                    xd->qcoeff);
-                ((int *)b->qcoeff)[0] = 0;
-                ((int *)b->qcoeff)[1] = 0;
-                ((int *)b->qcoeff)[2] = 0;
-                ((int *)b->qcoeff)[3] = 0;
-                ((int *)b->qcoeff)[4] = 0;
-                ((int *)b->qcoeff)[5] = 0;
-                ((int *)b->qcoeff)[6] = 0;
-                ((int *)b->qcoeff)[7] = 0;
-            }
-            else
-            {
-                b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
-                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],
-                    xd->qcoeff);
-                ((int *)b->qcoeff)[0] = 0;
+                /* override the dc dequant constant in order to preserve the
+                 * dc components
+                 */
+                DQC = xd->dequant_y1_dc;
            }

-            /* override the dc dequant constant */
-            DQC[0] = 1;
+            DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
+                            (xd->qcoeff, DQC,
+                             xd->dst.y_buffer,
+                             xd->dst.y_stride, xd->eobs);
        }

-        vp8_dequant_idct_add_y_block
-                        (xd->qcoeff, xd->block[0].dequant,
-                         xd->dst.y_buffer,
-                         xd->dst.y_stride, xd->eobs);
-
-        /* restore the dc dequant constant */
-        DQC[0] = dc_dequant_temp;
+        DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
+                        (xd->qcoeff+16*16, xd->dequant_uv,
+                         xd->dst.u_buffer, xd->dst.v_buffer,
+                         xd->dst.uv_stride, xd->eobs+16);
    }
-
-    vp8_dequant_idct_add_uv_block
-                    (xd->qcoeff+16*16, xd->block[16].dequant,
-                     xd->dst.u_buffer, xd->dst.v_buffer,
-                     xd->dst.uv_stride, xd->eobs+16);
 }

-
 static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
 {
    int ret_val = 0;
@@ -484,7 +474,8 @@ static void setup_token_decoder(VP8D_COMP *pbi,
                                const unsigned char* token_part_sizes)
 {
    vp8_reader *bool_decoder = &pbi->bc2;
-    int fragment_idx, partition_idx;
+    unsigned int partition_idx;
+    int fragment_idx;
    int num_token_partitions;
    const unsigned char *first_fragment_end = pbi->fragments[0] +
                                          pbi->fragment_sizes[0];
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -15,7 +15,7 @@
 #include "vpx_ports/mem.h"
 #include "detokenize.h"

-#define BOOL_DATA UINT8
+#define BOOL_DATA unsigned char

 #define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
 DECLARE_ALIGNED(16, static const unsigned char, coef_bands_x[16]) =
@@ -157,10 +157,10 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
    DECODE_AND_APPLYSIGN(val) \
    Prob = coef_probs + (ENTROPY_NODES*2); \
    if(c < 15){\
-        qcoeff_ptr [ scan[c] ] = (INT16) v; \
+        qcoeff_ptr [ scan[c] ] = (int16_t) v; \
        ++c; \
        goto DO_WHILE; }\
-    qcoeff_ptr [ 15 ] = (INT16) v; \
+    qcoeff_ptr [ 15 ] = (int16_t) v; \
    goto BLOCK_FINISHED;


@@ -172,7 +172,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
    {\
        range = range-split;\
        value = value-bigsplit;\
-        val += ((UINT16)1<<bits_count);\
+        val += ((uint16_t)1<<bits_count);\
    }\
    else\
    {\
@@ -340,12 +340,12 @@ ONE_CONTEXT_NODE_0_:

    if (c < 15)
    {
-        qcoeff_ptr [ scan[c] ] = (INT16) v;
+        qcoeff_ptr [ scan[c] ] = (int16_t) v;
        ++c;
        goto DO_WHILE;
    }

-    qcoeff_ptr [ 15 ] = (INT16) v;
+    qcoeff_ptr [ 15 ] = (int16_t) v;
 BLOCK_FINISHED:
    eobs[i] = c;
    eobtotal += c;
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -10,16 +10,24 @@


 #include "vpx_config.h"
-#include "vpx_rtcd.h"
+#include "vp8/common/dequantize.h"
 #include "vp8/decoder/onyxd_int.h"

+extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
+extern void vp8_arch_arm_decode_init(VP8D_COMP *pbi);
+
 void vp8_dmachine_specific_config(VP8D_COMP *pbi)
 {
    /* Pure C: */
 #if CONFIG_RUNTIME_CPU_DETECT
-    pbi->mb.rtcd                     = &pbi->common.rtcd;
+    pbi->mb.rtcd                               = &pbi->common.rtcd;
 #endif

-    /* Move this to common once we use it from more than one place. */
-    vpx_rtcd();
+#if ARCH_X86 || ARCH_X86_64
+    vp8_arch_x86_decode_init(pbi);
+#endif
+
+#if ARCH_ARM
+    vp8_arch_arm_decode_init(pbi);
+#endif
 }
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -20,7 +20,6 @@
 #include "vpx_scale/yv12extend.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/swapyv12buffer.h"
-#include "vp8/common/g_common.h"
 #include "vp8/common/threading.h"
 #include "decoderthreading.h"
 #include <stdio.h>
@@ -57,7 +56,7 @@ void vp8dx_initialize()
 }


-VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
+struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
 {
    VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));

@@ -117,14 +116,12 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
     */
    pbi->independent_partitions = 0;

-    return (VP8D_PTR) pbi;
+    return pbi;
 }


-void vp8dx_remove_decompressor(VP8D_PTR ptr)
+void vp8dx_remove_decompressor(VP8D_COMP *pbi)
 {
-    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
-
    if (!pbi)
        return;

@@ -142,9 +139,8 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
 }


-vpx_codec_err_t vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
-    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    VP8_COMMON *cm = &pbi->common;
    int ref_fb_idx;

@@ -174,9 +170,8 @@ vpx_codec_err_t vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, Y
 }


-vpx_codec_err_t vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
-    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    VP8_COMMON *cm = &pbi->common;
    int *ref_fb_ptr = NULL;
    int free_fb;
@@ -301,19 +296,18 @@ static int swap_frame_buffers (VP8_COMMON *cm)
    return err;
 }

-int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, int64_t time_stamp)
+int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsigned char *source, int64_t time_stamp)
 {
 #if HAVE_ARMV7
    int64_t dx_store_reg[8];
 #endif
-    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    VP8_COMMON *cm = &pbi->common;
    int retcode = 0;

    /*if(pbi->ready_for_new_data == 0)
        return -1;*/

-    if (ptr == 0)
+    if (pbi == 0)
    {
        return -1;
    }
@@ -575,10 +569,9 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
    pbi->common.error.setjmp = 0;
    return retcode;
 }
-int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
+int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
 {
    int ret = -1;
-    VP8D_COMP *pbi = (VP8D_COMP *) ptr;

    if (pbi->ready_for_new_data == 1)
        return ret;
@@ -613,3 +606,26 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, int64_t *time_stam
    vp8_clear_system_state();
    return ret;
 }
+
+
+/* This function as written isn't decoder specific, but the encoder has
+ * much faster ways of computing this, so it's ok for it to live in a
+ * decode specific file.
+ */
+int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame )
+{
+    /* Returns 1 as soon as any macroblock's mbmi.ref_frame matches ref_frame,
+     * 0 otherwise.  The cursor skips one extra oci->mi entry after each row. */
+    const MODE_INFO *cur = oci->mi;
+    int row, col;
+
+    for (row = 0; row < oci->mb_rows; row++, cur++)
+    {
+        for (col = 0; col < oci->mb_cols; col++, cur++)
+        {
+            if (cur->mbmi.ref_frame == ref_frame)
+                return 1;
+        }
+    }
+    return 0;
+}
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -16,6 +16,8 @@
 #include "treereader.h"
 #include "vp8/common/onyxc_int.h"
 #include "vp8/common/threading.h"
+
+
 #if CONFIG_ERROR_CONCEALMENT
 #include "ec_types.h"
 #endif
@@ -42,7 +44,7 @@ typedef struct
 } DATARATE;


-typedef struct VP8Decompressor
+typedef struct VP8D_COMP
 {
    DECLARE_ALIGNED(16, MACROBLOCKD, mb);

@@ -92,7 +94,6 @@ typedef struct VP8Decompressor

    DATARATE dr[16];

-
    vp8_prob prob_intra;
    vp8_prob prob_last;
    vp8_prob prob_gf;
--- a/vp8/decoder/rtcd_defs.sh
+++ b/vp8/decoder/rtcd_defs.sh
@@ -1,18 +0,0 @@
-decoder_forward_decls() {
-cat <<EOF
-struct blockd;
-EOF
-}
-forward_decls decoder_forward_decls
-
-prototype void vp8_dequantize_b "struct blockd*"
-specialize vp8_dequantize_b mmx v6 neon
-
-prototype void vp8_dequant_idct_add "short *input, short *dq, unsigned char *output, int stride"
-specialize vp8_dequant_idct_add mmx v6 neon
-
-prototype void vp8_dequant_idct_add_y_block "short *q, short *dq, unsigned char *dst, int stride, char *eobs"
-specialize vp8_dequant_idct_add_y_block mmx sse2 v6 neon
-
-prototype void vp8_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"
-specialize vp8_dequant_idct_add_uv_block mmx sse2 v6 neon
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -9,8 +9,6 @@
 */


-#include "vpx_config.h"
-#include "vpx_rtcd.h"
 #if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
 # include <unistd.h>
 #endif
@@ -39,7 +37,7 @@ extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
    VP8_COMMON *const pc = & pbi->common;
-    int i, j;
+    int i;

    for (i = 0; i < count; i++)
    {
@@ -79,10 +77,10 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D

        mbd->current_bc = &pbi->bc2;

-        for (j = 0; j < 25; j++)
-        {
-            mbd->block[j].dequant = xd->block[j].dequant;
-        }
+        vpx_memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+        vpx_memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+        vpx_memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+        vpx_memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));

        mbd->fullpixel_mask = 0xffffffff;
        if(pc->full_pixel)
@@ -179,6 +177,8 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
    /* dequantization and idct */
    if (xd->mode_info_context->mbmi.mode == B_PRED)
    {
+        short *DQC = xd->dequant_y1;
+
        for (i = 0; i < 16; i++)
        {
            BLOCKD *b = &xd->block[i];
@@ -191,14 +191,14 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
            {
                if (xd->eobs[i] > 1)
                {
-                    vp8_dequant_idct_add
-                        (b->qcoeff, b->dequant,
+                    DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
+                        (b->qcoeff, DQC,
                        *(b->base_dst) + b->dst, b->dst_stride);
                }
                else
                {
                    IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
-                        (b->qcoeff[0] * b->dequant[0],
+                        (b->qcoeff[0] * DQC[0],
                        *(b->base_dst) + b->dst, b->dst_stride,
                        *(b->base_dst) + b->dst, b->dst_stride);
                    ((int *)b->qcoeff)[0] = 0;
@@ -208,9 +208,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
    }
    else
    {
-        short *DQC = xd->block[0].dequant;
-
-        DECLARE_ALIGNED(16, short, local_dequant[16]);
+        short *DQC = xd->dequant_y1;

        if (xd->mode_info_context->mbmi.mode != SPLITMV)
        {
@@ -219,7 +217,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
            /* do 2nd order transform on the dc block */
            if (xd->eobs[24] > 1)
            {
-                vp8_dequantize_b(b);
+                DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b, xd->dequant_y2);

                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
                    xd->qcoeff);
@@ -234,30 +232,23 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
            }
            else
            {
-                b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
+                b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff);
                ((int *)b->qcoeff)[0] = 0;
            }

-            /* make a local copy of the dequant constants */
-            vpx_memcpy(local_dequant, xd->block[0].dequant,
-                       sizeof(local_dequant));
-
            /* override the dc dequant constant */
-            local_dequant[0] = 1;
-
-            /* use the new dequant constants */
-            DQC = local_dequant;
+            DQC = xd->dequant_y1_dc;
        }

-        vp8_dequant_idct_add_y_block
+        DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
                        (xd->qcoeff, DQC,
                         xd->dst.y_buffer,
                         xd->dst.y_stride, xd->eobs);
    }

-    vp8_dequant_idct_add_uv_block
-                    (xd->qcoeff+16*16, xd->block[16].dequant,
+    DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
+                    (xd->qcoeff+16*16, xd->dequant_uv,
                     xd->dst.u_buffer, xd->dst.v_buffer,
                     xd->dst.uv_stride, xd->eobs+16);
 }
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -10,18 +10,10 @@


 #include "vpx_config.h"
-#include "vpx_rtcd.h"
-#include "vp8/common/blockd.h"
+#include "vpx_ports/x86.h"
+#include "vp8/decoder/onyxd_int.h"

-
-#if HAVE_MMX
-void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
-
-void vp8_dequantize_b_mmx(BLOCKD *d)
+void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
 {
-    short *sq = (short *) d->qcoeff;
-    short *dq = (short *) d->dqcoeff;
-    short *q = (short *) d->dequant;
-    vp8_dequantize_b_impl_mmx(sq, dq, q);
+
 }
-#endif
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -11,9 +11,9 @@
 #include "vpx_config.h"
 #include "vp8/encoder/variance.h"
 #include "vp8/common/filter.h"
-#include "vp8/common/arm/bilinearfilter_arm.h"

 #if HAVE_ARMV6
+#include "vp8/common/arm/bilinearfilter_arm.h"

 unsigned int vp8_sub_pixel_variance8x8_armv6
 (
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1013,6 +1013,8 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
                    int ct[4];

                    vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, cpi->common.ref_frame_sign_bias);
+                    vp8_clamp_mv2(&best_mv, xd);
+
                    vp8_mv_ref_probs(mv_ref_p, ct);

 #ifdef ENTROPY_STATS
@@ -1206,7 +1208,7 @@ static void sum_probs_over_prev_coef_context(
    {
        for (j=0; j < PREV_COEF_CONTEXTS; ++j)
        {
-            const int tmp = out[i];
+            const unsigned int tmp = out[i];
            out[i] += probs[j][i];
            /* check for wrap */
            if (out[i] < tmp)
@@ -1644,20 +1646,20 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest


    // Signal whether or not Segmentation is enabled
-    vp8_write_bit(bc, (xd->segmentation_enabled) ? 1 : 0);
+    vp8_write_bit(bc, xd->segmentation_enabled);

    // Indicate which features are enabled
    if (xd->segmentation_enabled)
    {
        // Signal whether or not the segmentation map is being updated.
-        vp8_write_bit(bc, (xd->update_mb_segmentation_map) ? 1 : 0);
-        vp8_write_bit(bc, (xd->update_mb_segmentation_data) ? 1 : 0);
+        vp8_write_bit(bc, xd->update_mb_segmentation_map);
+        vp8_write_bit(bc, xd->update_mb_segmentation_data);

        if (xd->update_mb_segmentation_data)
        {
            signed char Data;

-            vp8_write_bit(bc, (xd->mb_segement_abs_delta) ? 1 : 0);
+            vp8_write_bit(bc, xd->mb_segement_abs_delta);

            // For each segmentation feature (Quant and loop filter level)
            for (i = 0; i < MB_LVL_MAX; i++)
@@ -1714,7 +1716,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
    vp8_write_literal(bc, pc->sharpness_level, 3);

    // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
-    vp8_write_bit(bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
+    vp8_write_bit(bc, xd->mode_ref_lf_delta_enabled);

    if (xd->mode_ref_lf_delta_enabled)
    {
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -45,10 +45,6 @@ typedef struct
    unsigned char **base_src;
    int src;
    int src_stride;
-
-//  MV  enc_mv;
-    int force_empty;
-
 } BLOCK;

 typedef struct
@@ -107,7 +103,6 @@ typedef struct
    int mv_row_min;
    int mv_row_max;

-    int vector_range;    // Used to monitor limiting range of recent vectors to guide search.
    int skip;

    int encode_breakout;
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -595,8 +595,6 @@ void init_encode_frame_mb_context(VP8_COMP *cpi)
    // Activity map pointer
    x->mb_activity_ptr = cpi->mb_activity_map;

-    x->vector_range = 32;
-
    x->act_zbin_adj = 0;

    x->partition_info = x->pi;
@@ -1091,8 +1089,10 @@ static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
 #endif
 }

-int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col)
+int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                   int mb_row, int mb_col)
 {
+    MACROBLOCKD *xd = &x->e_mbd;
    int rate;

    if (cpi->sf.RD && cpi->compressor_speed != 2)
@@ -1112,14 +1112,17 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
        vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);

    vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+
    sum_intra_stats(cpi, x);
    vp8_tokenize_mb(cpi, &x->e_mbd, t);

-    if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED)
-        vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
-
-    vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
+    if (xd->mode_info_context->mbmi.mode != B_PRED)
+        vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));

+    DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
+                    (xd->qcoeff+16*16, xd->dequant_uv,
+                     xd->dst.u_buffer, xd->dst.v_buffer,
+                     xd->dst.uv_stride, xd->eobs+16);
    return rate;
 }
 #ifdef SPEEDSTATS
@@ -1181,23 +1184,8 @@ int vp8cx_encode_inter_macroblock
    }
    else
    {
-#if CONFIG_MULTI_RES_ENCODING
-        if (cpi->oxcf.mr_encoder_id == 0)
-        {
-            /* Lowest-resolution encoding */
-            vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
-                                    &distortion, &intra_error);
-
-        }else
-        {
-            /* Higher-resolution encoding */
-            vp8_mr_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
-                                &distortion, &intra_error, mb_row, mb_col);
-        }
-#else
        vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
-                            &distortion, &intra_error);
-#endif
+                            &distortion, &intra_error, mb_row, mb_col);
    }

    cpi->prediction_error += distortion;
@@ -1312,12 +1300,14 @@ int vp8cx_encode_inter_macroblock
    if (!x->skip)
    {
        vp8_tokenize_mb(cpi, xd, t);
-        if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED)
-        {
-          vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct),
-                                      &x->e_mbd);
-        }
-        vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
+
+        if (xd->mode_info_context->mbmi.mode != B_PRED)
+            vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));
+
+        DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
+                        (xd->qcoeff+16*16, xd->dequant_uv,
+                         xd->dst.u_buffer, xd->dst.v_buffer,
+                         xd->dst.uv_stride, xd->eobs+16);
    }
    else
    {
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -18,7 +18,6 @@
 #include "vp8/common/invtrans.h"
 #include "vp8/common/recon.h"
 #include "dct.h"
-#include "vp8/common/g_common.h"
 #include "encodeintra.h"


@@ -45,7 +44,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)

        vp8_encode_intra16x16mby(rtcd, x);

-        vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+        vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(&cpi->common.rtcd));
    }
    else
    {
@@ -77,8 +76,17 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,

    x->quantize_b(be, b);

-    vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16);
-
+    if (*b->eob > 1)
+    {
+        IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct16)(b->dqcoeff,
+            b->predictor, 16, *(b->base_dst) + b->dst, b->dst_stride);
+    }
+    else
+    {
+        IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct1_scalar_add)
+            (b->dqcoeff[0], b->predictor, 16, *(b->base_dst) + b->dst,
+                b->dst_stride);
+    }
 }

 void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
@@ -96,11 +104,12 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
 void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
    BLOCK *b = &x->block[0];
+    MACROBLOCKD *xd = &x->e_mbd;

-    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby_s)(&x->e_mbd);

-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
-        b->src_stride, x->e_mbd.predictor, 16);
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby) (x->src_diff, *(b->base_src),
+        b->src_stride, xd->dst.y_buffer, xd->dst.y_stride);

    vp8_transform_intra_mby(x);

@@ -108,16 +117,17 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

    if (x->optimize)
        vp8_optimize_mby(x, rtcd);
-
 }

 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
-    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd);
+    MACROBLOCKD *xd = &x->e_mbd;
+
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv_s)(&x->e_mbd);

    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
-        x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],
-        &x->e_mbd.predictor[320], 8);
+        x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer,
+        xd->dst.v_buffer, xd->dst.uv_stride);

    vp8_transform_mbuv(x);

@@ -125,5 +135,4 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

    if (x->optimize)
        vp8_optimize_mbuv(x, rtcd);
-
 }
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -105,10 +105,10 @@ static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    BLOCK *b = &x->block[0];

    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
-        b->src_stride, x->e_mbd.predictor, 16);
+        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
-        x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],
-        &x->e_mbd.predictor[320], 8);
+        x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,
+        x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
 }

 static void build_dcblock(MACROBLOCK *x)
@@ -625,7 +625,7 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)

 void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
-    vp8_build_inter_predictors_mb_e(&x->e_mbd);
+    vp8_build_inter_predictors_mb(&x->e_mbd);

    vp8_subtract_mb(rtcd, x);

@@ -635,7 +635,6 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

    if (x->optimize)
        optimize_mb(x, rtcd);
-
 }

 /* this funciton is used by first pass only */
@@ -643,15 +642,15 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
    BLOCK *b = &x->block[0];

-    vp8_build_inter16x16_predictors_mby(&x->e_mbd);
+    vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
+                                        x->e_mbd.dst.y_stride);

    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
-        b->src_stride, x->e_mbd.predictor, 16);
+        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);

    transform_mby(x);

    vp8_quantize_mby(x);

-    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-
+    vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(rtcd->common));
 }
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -12,6 +12,7 @@
 #ifndef __INC_ENCODEMB_H
 #define __INC_ENCODEMB_H

+
 #include "vpx_config.h"
 #include "block.h"

--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -38,7 +38,7 @@ static THREAD_FUNCTION loopfilter_thread(void *p_data)

        if (sem_wait(&cpi->h_event_start_lpf) == 0)
        {
-            if (cpi->b_multi_threaded == FALSE) // we're shutting down
+            if (cpi->b_multi_threaded == 0) // we're shutting down
                break;

            loopfilter_frame(cpi, cm);
@@ -78,7 +78,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
            int *segment_counts = mbri->segment_counts;
            int *totalrate = &mbri->totalrate;

-            if (cpi->b_multi_threaded == FALSE) // we're shutting down
+            if (cpi->b_multi_threaded == 0) // we're shutting down
                break;

            for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
@@ -302,7 +302,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    z->mv_col_max    = x->mv_col_max;
    z->mv_row_min    = x->mv_row_min;
    z->mv_row_max    = x->mv_row_max;
-    z->vector_range = x->vector_range ;
    */

    z->vp8_short_fdct4x4     = x->vp8_short_fdct4x4;
@@ -350,8 +349,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
        z->block[i].src             = x->block[i].src;
        */
        z->block[i].src_stride       = x->block[i].src_stride;
-        z->block[i].force_empty      = x->block[i].force_empty;
-
    }

    {
@@ -387,10 +384,22 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
        zd->mb_segement_abs_delta      = xd->mb_segement_abs_delta;
        vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));

-        for (i = 0; i < 25; i++)
-        {
-            zd->block[i].dequant = xd->block[i].dequant;
-        }
+        vpx_memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+        vpx_memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+        vpx_memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+        vpx_memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+
+#if 1
+        /*TODO:  Remove dequant from BLOCKD.  This is a temporary solution until
+         * the quantizer code uses a passed in pointer to the dequant constants.
+         * This will also require modifications to the x86 and neon assembly.
+         * */
+        for (i = 0; i < 16; i++)
+            zd->block[i].dequant = zd->dequant_y1;
+        for (i = 16; i < 24; i++)
+            zd->block[i].dequant = zd->dequant_uv;
+        zd->block[24].dequant = zd->dequant_y2;
+#endif
    }
 }

@@ -421,8 +430,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
 #endif
        mb->gf_active_ptr            = x->gf_active_ptr;

-        mb->vector_range             = 32;
-
        vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
        mbr_ei[i].totalrate = 0;

--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -267,8 +267,8 @@ static void avg_stats(FIRSTPASS_STATS *section)
 // Calculate a modified Error used in distributing bits between easier and harder frames
 static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
-    double av_err = ( cpi->twopass.total_stats->ssim_weighted_pred_err /
-                      cpi->twopass.total_stats->count );
+    double av_err = ( cpi->twopass.total_stats.ssim_weighted_pred_err /
+                      cpi->twopass.total_stats.count );
    double this_err = this_frame->ssim_weighted_pred_err;
    double modified_err;

@@ -373,7 +373,7 @@ static int frame_max_bits(VP8_COMP *cpi)
    else
    {
        // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
-        max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+        max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
    }

    // Trap case where we are out of bits
@@ -385,12 +385,12 @@ static int frame_max_bits(VP8_COMP *cpi)

 void vp8_init_first_pass(VP8_COMP *cpi)
 {
-    zero_stats(cpi->twopass.total_stats);
+    zero_stats(&cpi->twopass.total_stats);
 }

 void vp8_end_first_pass(VP8_COMP *cpi)
 {
-    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
 }

 static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
@@ -804,17 +804,17 @@ void vp8_first_pass(VP8_COMP *cpi)
                       - cpi->source->ts_start;

        // don't want to do output stats with a stack variable!
-        memcpy(cpi->twopass.this_frame_stats,
+        memcpy(&cpi->twopass.this_frame_stats,
               &fps,
               sizeof(FIRSTPASS_STATS));
-        output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
-        accumulate_stats(cpi->twopass.total_stats, &fps);
+        output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+        accumulate_stats(&cpi->twopass.total_stats, &fps);
    }

    // Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met
    if ((cm->current_video_frame > 0) &&
-        (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
-        ((cpi->twopass.this_frame_stats->intra_error / cpi->twopass.this_frame_stats->coded_error) > 2.0))
+        (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+        ((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0))
    {
        vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
    }
@@ -861,7 +861,7 @@ double bitcost( double prob )
 {
    return -(log( prob ) / log( 2.0 ));
 }
-static long long estimate_modemvcost(VP8_COMP *cpi,
+static int64_t estimate_modemvcost(VP8_COMP *cpi,
                                     FIRSTPASS_STATS * fpstats)
 {
    int mv_cost;
@@ -1019,7 +1019,7 @@ static int estimate_max_q(VP8_COMP *cpi,
    // averaga q observed in clip for non kf/gf.arf frames
    // Give average a chance to settle though.
    if ( (cpi->ni_frames >
-                  ((unsigned int)cpi->twopass.total_stats->count >> 8)) &&
+                  ((unsigned int)cpi->twopass.total_stats.count >> 8)) &&
         (cpi->ni_frames > 150) )
    {
        cpi->twopass.maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
@@ -1075,8 +1075,8 @@ static int estimate_cq( VP8_COMP *cpi,
    }

    // II ratio correction factor for clip as a whole
-    clip_iiratio = cpi->twopass.total_stats->intra_error /
-                   DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+    clip_iiratio = cpi->twopass.total_stats.intra_error /
+                   DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
    clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
    if (clip_iifactor < 0.80)
        clip_iifactor = 0.80;
@@ -1260,25 +1260,25 @@ void vp8_init_second_pass(VP8_COMP *cpi)

    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);

-    zero_stats(cpi->twopass.total_stats);
-    zero_stats(cpi->twopass.total_left_stats);
+    zero_stats(&cpi->twopass.total_stats);
+    zero_stats(&cpi->twopass.total_left_stats);

    if (!cpi->twopass.stats_in_end)
        return;

-    *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-    *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+    cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+    cpi->twopass.total_left_stats = cpi->twopass.total_stats;

    // each frame can have a different duration, as the frame rate in the source
    // isn't guaranteed to be constant.   The frame rate prior to the first frame
    // encoded in the second pass is a guess.  However the sum duration is not.
    // Its calculated based on the actual durations of all frames from the first
    // pass.
-    vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats->count / cpi->twopass.total_stats->duration);
+    vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);

    cpi->output_frame_rate = cpi->frame_rate;
-    cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
-    cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration * two_pass_min_rate / 10000000.0);
+    cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
+    cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);

    // Calculate a minimum intra value to be used in determining the IIratio
    // scores used in the second pass. We have this minimum to make sure
@@ -1301,7 +1301,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
            sum_iiratio += IIRatio;
        }

-        cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+        cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);

        // Reset file position
        reset_fpf_position(cpi, start_pos);
@@ -1376,7 +1376,7 @@ static int detect_transition_to_still(
    double loop_decay_rate,
    double decay_accumulator )
 {
-    BOOL trans_to_still = FALSE;
+    int trans_to_still = 0;

    // Break clause to detect very still sections after motion
    // For example a static image after a fade or other transition
@@ -1406,7 +1406,7 @@ static int detect_transition_to_still(

        // Only if it does do we signal a transition to still
        if ( j == still_interval )
-            trans_to_still = TRUE;
+            trans_to_still = 1;
    }

    return trans_to_still;
@@ -1415,14 +1415,14 @@ static int detect_transition_to_still(
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
 // reflect this
-static BOOL detect_flash( VP8_COMP *cpi, int offset )
+static int detect_flash( VP8_COMP *cpi, int offset )
 {
    FIRSTPASS_STATS next_frame;

-    BOOL flash_detected = FALSE;
+    int flash_detected = 0;

    // Read the frame data.
-    // The return is FALSE (no flash detected) if not a valid frame
+    // The return is 0 (no flash detected) if not a valid frame
    if ( read_frame_stats(cpi, &next_frame, offset) != EOF )
    {
        // What we are looking for here is a situation where there is a
@@ -1433,7 +1433,7 @@ static BOOL detect_flash( VP8_COMP *cpi, int offset )
        if ( (next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
             (next_frame.pcnt_second_ref >= 0.5 ) )
        {
-            flash_detected = TRUE;
+            flash_detected = 1;

            /*if (1)
            {
@@ -1548,7 +1548,7 @@ static int calc_arf_boost(
    double mv_in_out_accumulator = 0.0;
    double abs_mv_in_out_accumulator = 0.0;
    double r;
-    BOOL flash_detected = FALSE;
+    int flash_detected = 0;

    // Search forward from the proposed arf/next gf position
    for ( i = 0; i < f_frames; i++ )
@@ -1677,7 +1677,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    int alt_boost = 0;
    int f_boost = 0;
    int b_boost = 0;
-    BOOL flash_detected;
+    int flash_detected;

    cpi->twopass.gf_group_bits = 0;
    cpi->twopass.gf_decay_rate = 0;
@@ -1751,7 +1751,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
                                         loop_decay_rate,
                                         decay_accumulator ) )
        {
-            allow_alt_ref = FALSE;
+            allow_alt_ref = 0;
            boost_score = old_boost_score;
            break;
        }
@@ -1923,7 +1923,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
            int frames_fwd = cpi->oxcf.arnr_max_frames - 1;

-            cpi->source_alt_ref_pending = TRUE;
+            cpi->source_alt_ref_pending = 1;

            // For alt ref frames the error score for the end frame of the
            // group (the alt ref frame) should not contribute to the group
@@ -1949,7 +1949,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            // Note: this_frame->frame has been updated in the loop
            // so it now points at the ARF frame.
            half_gf_int = cpi->baseline_gf_interval >> 1;
-            frames_after_arf = cpi->twopass.total_stats->count -
+            frames_after_arf = cpi->twopass.total_stats.count -
                               this_frame->frame - 1;

            switch (cpi->oxcf.arnr_type)
@@ -1989,13 +1989,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        }
        else
        {
-            cpi->source_alt_ref_pending = FALSE;
+            cpi->source_alt_ref_pending = 0;
            cpi->baseline_gf_interval = i;
        }
    }
    else
    {
-        cpi->source_alt_ref_pending = FALSE;
+        cpi->source_alt_ref_pending = 0;
        cpi->baseline_gf_interval = i;
    }

@@ -2005,7 +2005,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
    // This is also important for short clips where there may only be one
    // key frame.
-    if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+    if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
                                            cpi->common.current_video_frame))
    {
        cpi->twopass.kf_group_bits =
@@ -2296,7 +2296,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 void vp8_second_pass(VP8_COMP *cpi)
 {
    int tmp_q;
-    int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
+    int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);

    FIRSTPASS_STATS this_frame = {0};
    FIRSTPASS_STATS this_frame_copy;
@@ -2341,7 +2341,7 @@ void vp8_second_pass(VP8_COMP *cpi)
            cpi->twopass.gf_group_error_left = cpi->twopass.kf_group_error_left;
            cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
            cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-            cpi->source_alt_ref_pending = FALSE;
+            cpi->source_alt_ref_pending = 0;
        }

    }
@@ -2411,7 +2411,7 @@ void vp8_second_pass(VP8_COMP *cpi)

    // Account for mv, mode and other overheads.
    overhead_bits = estimate_modemvcost(
-                        cpi, cpi->twopass.total_left_stats );
+                        cpi, &cpi->twopass.total_left_stats );

    // Special case code for first frame.
    if (cpi->common.current_video_frame == 0)
@@ -2425,7 +2425,7 @@ void vp8_second_pass(VP8_COMP *cpi)

            est_cq =
                estimate_cq( cpi,
-                             cpi->twopass.total_left_stats,
+                             &cpi->twopass.total_left_stats,
                             (int)(cpi->twopass.bits_left / frames_left),
                             overhead_bits );

@@ -2440,7 +2440,7 @@ void vp8_second_pass(VP8_COMP *cpi)

        tmp_q = estimate_max_q(
                    cpi,
-                    cpi->twopass.total_left_stats,
+                    &cpi->twopass.total_left_stats,
                    (int)(cpi->twopass.bits_left / frames_left),
                    overhead_bits );

@@ -2463,16 +2463,16 @@ void vp8_second_pass(VP8_COMP *cpi)
    // radical adjustments to the allowed quantizer range just to use up a
    // few surplus bits or get beneath the target rate.
    else if ( (cpi->common.current_video_frame <
-                 (((unsigned int)cpi->twopass.total_stats->count * 255)>>8)) &&
+                 (((unsigned int)cpi->twopass.total_stats.count * 255)>>8)) &&
              ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-                 (unsigned int)cpi->twopass.total_stats->count) )
+                 (unsigned int)cpi->twopass.total_stats.count) )
    {
        if (frames_left < 1)
            frames_left = 1;

        tmp_q = estimate_max_q(
                    cpi,
-                    cpi->twopass.total_left_stats,
+                    &cpi->twopass.total_left_stats,
                    (int)(cpi->twopass.bits_left / frames_left),
                    overhead_bits );

@@ -2489,13 +2489,13 @@ void vp8_second_pass(VP8_COMP *cpi)
    cpi->twopass.frames_to_key --;

    // Update the total stats remaining sturcture
-    subtract_stats(cpi->twopass.total_left_stats, &this_frame );
+    subtract_stats(&cpi->twopass.total_left_stats, &this_frame );
 }


-static BOOL test_candidate_kf(VP8_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame)
+static int test_candidate_kf(VP8_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame)
 {
-    BOOL is_viable_kf = FALSE;
+    int is_viable_kf = 0;

    // Does the frame satisfy the primary criteria of a key frame
    //      If so, then examine how well it predicts subsequent frames
@@ -2569,13 +2569,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRST

        // If there is tolerable prediction for at least the next 3 frames then break out else discard this pottential key frame and move on
        if (boost_score > 5.0 && (i > 3))
-            is_viable_kf = TRUE;
+            is_viable_kf = 1;
        else
        {
            // Reset the file position
            reset_fpf_position(cpi, start_pos);

-            is_viable_kf = FALSE;
+            is_viable_kf = 0;
        }
    }

@@ -2611,7 +2611,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    cpi->this_key_frame_forced = cpi->next_key_frame_forced;

    // Clear the alt ref active flag as this can never be active on a key frame
-    cpi->source_alt_ref_active = FALSE;
+    cpi->source_alt_ref_active = 0;

    // Kf is always a gf so clear frames till next gf counter
    cpi->frames_till_gf_update_due = 0;
@@ -2727,10 +2727,10 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        // Reset to the start of the group
        reset_fpf_position(cpi, current_pos);

-        cpi->next_key_frame_forced = TRUE;
+        cpi->next_key_frame_forced = 1;
    }
    else
-        cpi->next_key_frame_forced = FALSE;
+        cpi->next_key_frame_forced = 0;

    // Special case for the last frame of the file
    if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
@@ -3034,8 +3034,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

    if (cpi->oxcf.allow_spatial_resampling)
    {
-        int resample_trigger = FALSE;
-        int last_kf_resampled = FALSE;
+        int resample_trigger = 0;
+        int last_kf_resampled = 0;
        int kf_q;
        int scale_val = 0;
        int hr, hs, vr, vs;
@@ -3053,14 +3053,14 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        double effective_size_ratio;

        if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height))
-            last_kf_resampled = TRUE;
+            last_kf_resampled = 1;

        // Set back to unscaled by defaults
        cpi->common.horiz_scale = NORMAL;
        cpi->common.vert_scale = NORMAL;

        // Calculate Average bits per frame.
-        //av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
+        //av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
        av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);
        //if ( av_bits_per_frame < 0.0 )
        //  av_bits_per_frame = 0.0
@@ -3117,21 +3117,21 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
                (last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))))
                //( ((cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))) &&
                //  ((projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))) ))
-                resample_trigger = TRUE;
+                resample_trigger = 1;
            else
-                resample_trigger = FALSE;
+                resample_trigger = 0;
        }
        else
        {
-            int64_t clip_bits = (int64_t)(cpi->twopass.total_stats->count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
+            int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
            int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;

            if ((last_kf_resampled && (kf_q > cpi->worst_quality)) ||                                               // If triggered last time the threshold for triggering again is reduced
                ((kf_q > cpi->worst_quality) &&                                                                  // Projected Q higher than allowed and ...
                 (over_spend > clip_bits / 20)))                                                               // ... Overspend > 5% of total bits
-                resample_trigger = TRUE;
+                resample_trigger = 1;
            else
-                resample_trigger = FALSE;
+                resample_trigger = 0;

        }

--- a/vp8/encoder/lookahead.c
+++ b/vp8/encoder/lookahead.c
@@ -48,7 +48,7 @@ vp8_lookahead_destroy(struct lookahead_ctx *ctx)
    {
        if(ctx->buf)
        {
-            int i;
+            unsigned int i;

            for(i = 0; i < ctx->max_sz; i++)
                vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
@@ -65,7 +65,7 @@ vp8_lookahead_init(unsigned int width,
                   unsigned int depth)
 {
    struct lookahead_ctx *ctx = NULL;
-    int i;
+    unsigned int i;

    /* Clamp the lookahead queue depth */
    if(depth < 1)
@@ -188,7 +188,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,

 struct lookahead_entry*
 vp8_lookahead_peek(struct lookahead_ctx *ctx,
-                   int                   index)
+                   unsigned int          index)
 {
    struct lookahead_entry* buf = NULL;

--- a/vp8/encoder/lookahead.h
+++ b/vp8/encoder/lookahead.h
@@ -92,7 +92,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
 */
 struct lookahead_entry*
 vp8_lookahead_peek(struct lookahead_ctx *ctx,
-                   int                   index);
+                   unsigned int          index);


 /**\brief Get the number of frames currently in the lookahead queue
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -1156,7 +1156,7 @@ int vp8_diamond_search_sadx4
    int tot_steps;
    int_mv this_mv;

-    int bestsad = INT_MAX;
+    unsigned int bestsad = UINT_MAX;
    int best_site = 0;
    int last_site = 0;

@@ -1397,7 +1397,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
    unsigned char *bestaddress;
    int_mv *best_mv = &d->bmi.mv;
    int_mv this_mv;
-    int bestsad = INT_MAX;
+    unsigned int bestsad = UINT_MAX;
    int r, c;

    unsigned char *check_here;
@@ -1527,7 +1527,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
    unsigned char *bestaddress;
    int_mv *best_mv = &d->bmi.mv;
    int_mv this_mv;
-    int bestsad = INT_MAX;
+    unsigned int bestsad = UINT_MAX;
    int r, c;

    unsigned char *check_here;
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -253,13 +253,16 @@ typedef struct
    int starting_buffer_level;
    int optimal_buffer_level;
    int maximum_buffer_size;
+    int starting_buffer_level_in_ms;
+    int optimal_buffer_level_in_ms;
+    int maximum_buffer_size_in_ms;

    int avg_frame_size_for_layer;

    int buffer_level;
    int bits_off_target;

-    long long total_actual_bits;
+    int64_t total_actual_bits;
    int total_target_vs_actual;

    int worst_quality;
@@ -279,7 +282,7 @@ typedef struct
    int zbin_over_quant;

    int inter_frame_target;
-    INT64 total_byte_count;
+    int64_t total_byte_count;

    int filter_level;

@@ -421,6 +424,7 @@ typedef struct VP8_COMP
    int buffered_mode;

    double frame_rate;
+    double ref_frame_rate;
    int64_t buffer_level;
    int bits_off_target;

@@ -579,10 +583,10 @@ typedef struct VP8_COMP
        double section_max_qfactor;
        unsigned int next_iiratio;
        unsigned int this_iiratio;
-        FIRSTPASS_STATS *total_stats;
-        FIRSTPASS_STATS *this_frame_stats;
+        FIRSTPASS_STATS total_stats;
+        FIRSTPASS_STATS this_frame_stats;
        FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
-        FIRSTPASS_STATS *total_left_stats;
+        FIRSTPASS_STATS total_left_stats;
        int first_pass_done;
        int64_t bits_left;
        int64_t clip_bits_total;
@@ -673,8 +677,8 @@ typedef struct VP8_COMP
    unsigned int current_layer;
    LAYER_CONTEXT layer_context[MAX_LAYERS];

-    long long frames_in_layer[MAX_LAYERS];
-    long long bytes_in_layer[MAX_LAYERS];
+    int64_t frames_in_layer[MAX_LAYERS];
+    int64_t bytes_in_layer[MAX_LAYERS];
    double sum_psnr[MAX_LAYERS];
    double sum_psnr_p[MAX_LAYERS];
    double total_error2[MAX_LAYERS];
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
--- a/vp8/encoder/pickinter.h
+++ b/vp8/encoder/pickinter.h
@@ -16,14 +16,8 @@

 extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                                int recon_uvoffset, int *returnrate,
-                                int *returndistortion, int *returnintra);
+                                int *returndistortion, int *returnintra,
+                                int mb_row, int mb_col);
 extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);

-#if CONFIG_MULTI_RES_ENCODING
-extern void vp8_mr_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x,
-                                   int recon_yoffset, int recon_uvoffset,
-                                   int *returnrate, int *returndistortion,
-                                   int *returnintra, int mb_row, int mb_col);
-#endif
-
 #endif
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -186,7 +186,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
    best_err = calc_partial_ssl_err(sd, cm->frame_to_show,
                                    IF_RTCD(&cpi->rtcd.variance));

-    filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+    filt_val -= 1 + (filt_val > 10);

    // Search lower filter levels
    while (filt_val >= min_filter_level)
@@ -209,11 +209,11 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
            break;

        // Adjust filter level
-        filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+        filt_val -= 1 + (filt_val > 10);
    }

    // Search up (note that we have already done filt_val = cm->filter_level)
-    filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
+    filt_val = cm->filter_level + 1 + (filt_val > 10);

    if (best_filt_val == cm->filter_level)
    {
@@ -243,7 +243,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
                break;

            // Adjust filter level
-            filt_val += (1 + ((filt_val > 10) ? 1 : 0));
+            filt_val += 1 + (filt_val > 10);
        }
    }

@@ -289,8 +289,12 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)

    int Bias = 0;                       // Bias against raising loop filter and in favor of lowering it

+    int ss_err[MAX_LOOP_FILTER + 1];
+
    YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;

+    vpx_memset(ss_err, 0, sizeof(ss_err));
+
    /* Replace unfiltered frame buffer with a new one */
    cm->frame_to_show = &cpi->pick_lf_lvl_frame;

@@ -320,6 +324,9 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)

    best_err = vp8_calc_ss_err(sd, cm->frame_to_show,
                               IF_RTCD(&cpi->rtcd.variance));
+
+    ss_err[filt_mid] = best_err;
+
    filt_best = filt_mid;

    while (filter_step > 0)
@@ -335,13 +342,19 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)

        if ((filt_direction <= 0) && (filt_low != filt_mid))
        {
-            // Get Low filter error score
-            vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
-            vp8cx_set_alt_lf_level(cpi, filt_low);
-            vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
+            if(ss_err[filt_low] == 0)
+            {
+                // Get Low filter error score
+                vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
+                vp8cx_set_alt_lf_level(cpi, filt_low);
+                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);

-            filt_err = vp8_calc_ss_err(sd, cm->frame_to_show,
-                                       IF_RTCD(&cpi->rtcd.variance));
+                filt_err = vp8_calc_ss_err(sd, cm->frame_to_show,
+                                           IF_RTCD(&cpi->rtcd.variance));
+                ss_err[filt_low] = filt_err;
+            }
+            else
+                filt_err = ss_err[filt_low];

            // If value is close to the best so far then bias towards a lower loop filter value.
            if ((filt_err - Bias) < best_err)
@@ -357,12 +370,18 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
        // Now look at filt_high
        if ((filt_direction >= 0) && (filt_high != filt_mid))
        {
-            vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
-            vp8cx_set_alt_lf_level(cpi, filt_high);
-            vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
+            if(ss_err[filt_high] == 0)
+            {
+                vp8_yv12_copy_y_ptr(saved_frame, cm->frame_to_show);
+                vp8cx_set_alt_lf_level(cpi, filt_high);
+                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);

-            filt_err = vp8_calc_ss_err(sd, cm->frame_to_show,
-                                       IF_RTCD(&cpi->rtcd.variance));
+                filt_err = vp8_calc_ss_err(sd, cm->frame_to_show,
+                                           IF_RTCD(&cpi->rtcd.variance));
+                ss_err[filt_high] = filt_err;
+            }
+            else
+                filt_err = ss_err[filt_high];

            // Was it better than the previous best?
            if (filt_err < (best_err - Bias))
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -436,7 +436,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
    int quant_val;
    int Q;

-    int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44};
+    int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44,
+                          44, 44};

    for (Q = 0; Q < QINDEX_RANGE; Q++)
    {
@@ -469,36 +470,58 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
        cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;

        // all the ac values = ;
-        for (i = 1; i < 16; i++)
+        quant_val = vp8_ac_yquant(Q);
+        cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 1,
+                     cpi->Y1quant_shift[Q] + 1, quant_val);
+        cpi->Y1zbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+        cpi->Y1round[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.Y1dequant[Q][1] = quant_val;
+        cpi->zrun_zbin_boost_y1[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+        quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+        cpi->Y2quant_fast[Q][1] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 1,
+                     cpi->Y2quant_shift[Q] + 1, quant_val);
+        cpi->Y2zbin[Q][1] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+        cpi->Y2round[Q][1] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+        cpi->common.Y2dequant[Q][1] = quant_val;
+        cpi->zrun_zbin_boost_y2[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+        quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+        cpi->UVquant_fast[Q][1] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 1,
+                     cpi->UVquant_shift[Q] + 1, quant_val);
+        cpi->UVzbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+        cpi->UVround[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.UVdequant[Q][1] = quant_val;
+        cpi->zrun_zbin_boost_uv[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+        for (i = 2; i < 16; i++)
        {
-            int rc = vp8_default_zig_zag1d[i];
+            cpi->Y1quant_fast[Q][i] = cpi->Y1quant_fast[Q][1];
+            cpi->Y1quant[Q][i] = cpi->Y1quant[Q][1];
+            cpi->Y1quant_shift[Q][i] = cpi->Y1quant_shift[Q][1];
+            cpi->Y1zbin[Q][i] = cpi->Y1zbin[Q][1];
+            cpi->Y1round[Q][i] = cpi->Y1round[Q][1];
+            cpi->zrun_zbin_boost_y1[Q][i] = (cpi->common.Y1dequant[Q][1] *
+                                             zbin_boost[i]) >> 7;

-            quant_val = vp8_ac_yquant(Q);
-            cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val;
-            invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc,
-                         cpi->Y1quant_shift[Q] + rc, quant_val);
-            cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-            cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
-            cpi->common.Y1dequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+            cpi->Y2quant_fast[Q][i] = cpi->Y2quant_fast[Q][1];
+            cpi->Y2quant[Q][i] = cpi->Y2quant[Q][1];
+            cpi->Y2quant_shift[Q][i] = cpi->Y2quant_shift[Q][1];
+            cpi->Y2zbin[Q][i] = cpi->Y2zbin[Q][1];
+            cpi->Y2round[Q][i] = cpi->Y2round[Q][1];
+            cpi->zrun_zbin_boost_y2[Q][i] = (cpi->common.Y2dequant[Q][1] *
+                                             zbin_boost[i]) >> 7;

-            quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
-            cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;
-            invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
-                         cpi->Y2quant_shift[Q] + rc, quant_val);
-            cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
-            cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
-            cpi->common.Y2dequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-            cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;
-            invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
-                         cpi->UVquant_shift[Q] + rc, quant_val);
-            cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-            cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
-            cpi->common.UVdequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+            cpi->UVquant_fast[Q][i] = cpi->UVquant_fast[Q][1];
+            cpi->UVquant[Q][i] = cpi->UVquant[Q][1];
+            cpi->UVquant_shift[Q][i] = cpi->UVquant_shift[Q][1];
+            cpi->UVzbin[Q][i] = cpi->UVzbin[Q][1];
+            cpi->UVround[Q][i] = cpi->UVround[Q][1];
+            cpi->zrun_zbin_boost_uv[Q][i] = (cpi->common.UVdequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
        }
    }
 }
@@ -615,6 +638,31 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
     */
    if (!ok_to_skip || QIndex != x->q_index)
    {
+
+        xd->dequant_y1_dc[0] = 1;
+        xd->dequant_y1[0] = cpi->common.Y1dequant[QIndex][0];
+        xd->dequant_y2[0] = cpi->common.Y2dequant[QIndex][0];
+        xd->dequant_uv[0] = cpi->common.UVdequant[QIndex][0];
+
+        for (i = 1; i < 16; i++)
+        {
+            xd->dequant_y1_dc[i] =
+            xd->dequant_y1[i] = cpi->common.Y1dequant[QIndex][1];
+            xd->dequant_y2[i] = cpi->common.Y2dequant[QIndex][1];
+            xd->dequant_uv[i] = cpi->common.UVdequant[QIndex][1];
+        }
+#if 1
+        /* TODO: Remove dequant from BLOCKD. This is a temporary solution until
+         * the quantizer code uses a passed-in pointer to the dequant constants.
+         * That change will also require modifying the x86 and NEON assembly.
+         */
+        for (i = 0; i < 16; i++)
+            x->e_mbd.block[i].dequant = xd->dequant_y1; //cpi->common.Y1dequant[QIndex];
+        for (i = 16; i < 24; i++)
+            x->e_mbd.block[i].dequant = xd->dequant_uv; //cpi->common.UVdequant[QIndex];
+        x->e_mbd.block[24].dequant = xd->dequant_y2; //cpi->common.Y2dequant[QIndex];
+#endif
+
        // Y
        zbin_extra = ZBIN_EXTRA_Y;

@@ -625,7 +673,6 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
            x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
            x->block[i].zbin = cpi->Y1zbin[QIndex];
            x->block[i].round = cpi->Y1round[QIndex];
-            x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
            x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
            x->block[i].zbin_extra = (short)zbin_extra;
        }
@@ -640,7 +687,6 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
            x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
            x->block[i].zbin = cpi->UVzbin[QIndex];
            x->block[i].round = cpi->UVround[QIndex];
-            x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
            x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
            x->block[i].zbin_extra = (short)zbin_extra;
        }
@@ -653,7 +699,6 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
        x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
        x->block[24].zbin = cpi->Y2zbin[QIndex];
        x->block[24].round = cpi->Y2round[QIndex];
-        x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
        x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
        x->block[24].zbin_extra = (short)zbin_extra;

@@ -663,6 +708,9 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
        cpi->last_zbin_over_quant = cpi->zbin_over_quant;
        cpi->last_zbin_mode_boost = cpi->zbin_mode_boost;
        x->last_act_zbin_adj = x->act_zbin_adj;
+
+
+
    }
    else if(cpi->last_zbin_over_quant != cpi->zbin_over_quant
            || cpi->last_zbin_mode_boost != cpi->zbin_mode_boost
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -332,8 +332,8 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
    else
        cpi->frames_till_gf_update_due = cpi->goldfreq;

-    cpi->common.refresh_golden_frame = TRUE;
-    cpi->common.refresh_alt_ref_frame = TRUE;
+    cpi->common.refresh_golden_frame = 1;
+    cpi->common.refresh_alt_ref_frame = 1;
 }


@@ -471,7 +471,7 @@ static void calc_gf_params(VP8_COMP *cpi)
    if (cpi->pass != 2)
    {
        // Single Pass lagged mode: TBD
-        if (FALSE)
+        if (0)
        {
        }

@@ -598,14 +598,14 @@ static void calc_gf_params(VP8_COMP *cpi)
    if (cpi->pass != 2)
    {
        // For now Alt ref is not allowed except in 2 pass modes.
-        cpi->source_alt_ref_pending = FALSE;
+        cpi->source_alt_ref_pending = 0;

        /*if ( cpi->oxcf.fixed_q == -1)
        {
            if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + (AF_THRESH*cpi->frames_till_gf_update_due)) ) )
-                cpi->source_alt_ref_pending = TRUE;
+                cpi->source_alt_ref_pending = 1;
            else
-                cpi->source_alt_ref_pending = FALSE;
+                cpi->source_alt_ref_pending = 0;
        }*/
    }
 }
@@ -940,6 +940,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
            if (cpi->active_worst_quality <= cpi->active_best_quality)
                cpi->active_worst_quality = cpi->active_best_quality + 1;

+            if(cpi->active_worst_quality > 127)
+                cpi->active_worst_quality = 127;
        }
        // Unbuffered mode (eg. video conferencing)
        else
@@ -980,7 +982,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
 #endif
            //vpx_log("Decoder: Drop frame due to bandwidth: %d \n",cpi->buffer_level, cpi->av_per_frame_bandwidth);

-            cpi->drop_frame = TRUE;
+            cpi->drop_frame = 1;
        }

 #if 0
@@ -988,7 +990,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
        else if ((cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) &&
                 (cpi->drop_count < cpi->max_drop_count) && (cpi->pass == 0))
        {
-            cpi->drop_frame = TRUE;
+            cpi->drop_frame = 1;
        }

 #endif
@@ -1034,11 +1036,11 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
        {
            // For one pass throw a GF if recent frame intra useage is low or the GF useage is high
            if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))
-                cpi->common.refresh_golden_frame = TRUE;
+                cpi->common.refresh_golden_frame = 1;

            // Two pass GF descision
            else if (cpi->pass == 2)
-                cpi->common.refresh_golden_frame = TRUE;
+                cpi->common.refresh_golden_frame = 1;
        }

 #if 0
@@ -1056,7 +1058,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)

 #endif

-        if (cpi->common.refresh_golden_frame == TRUE)
+        if (cpi->common.refresh_golden_frame == 1)
        {
 #if 0

@@ -1541,7 +1543,7 @@ int vp8_pick_frame_size(VP8_COMP *cpi)
        // Check if we're dropping the frame:
        if (cpi->drop_frame)
        {
-            cpi->drop_frame = FALSE;
+            cpi->drop_frame = 0;
            cpi->drop_count++;
            return 0;
        }
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -28,7 +28,6 @@
 #include "encodemb.h"
 #include "quantize.h"
 #include "vp8/common/idct.h"
-#include "vp8/common/g_common.h"
 #include "variance.h"
 #include "mcomp.h"
 #include "rdopt.h"
@@ -1385,8 +1384,8 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,

        if (bsi.segment_rd < best_rd)
        {
-            int col_min = (best_ref_mv->as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv->as_mv.col & 7)?1:0);
-            int row_min = (best_ref_mv->as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv->as_mv.row & 7)?1:0);
+            int col_min = ((best_ref_mv->as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+            int row_min = ((best_ref_mv->as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
            int col_max = (best_ref_mv->as_mv.col>>3) + MAX_FULL_PEL_VAL;
            int row_max = (best_ref_mv->as_mv.row>>3) + MAX_FULL_PEL_VAL;

@@ -1716,7 +1715,10 @@ static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
    }
 }

-void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
+
+void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+                            int recon_uvoffset, int *returnrate,
+                            int *returndistortion, int *returnintra)
 {
    BLOCK *b = &x->block[0];
    BLOCKD *d = &x->e_mbd.block[0];
@@ -1724,8 +1726,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
    union b_mode_info best_bmodes[16];
    MB_MODE_INFO best_mbmode;
    PARTITION_INFO best_partition;
+    int_mv best_ref_mv_sb[2];
+    int_mv mode_mv_sb[2][MB_MODE_COUNT];
    int_mv best_ref_mv;
-    int_mv mode_mv[MB_MODE_COUNT];
+    int_mv *mode_mv;
    MB_PREDICTION_MODE this_mode;
    int num00;
    int best_mode_index = 0;
@@ -1743,91 +1747,49 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
    int distortion_uv;
    int best_yrd = INT_MAX;

-    //int all_rds[MAX_MODES];        // Experimental debug code.
-    //int all_rates[MAX_MODES];
-    //int all_dist[MAX_MODES];
-    //int intermodecost[MAX_MODES];
-
    MB_PREDICTION_MODE uv_intra_mode;
    int_mv mvp;
    int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    int saddone=0;
    int sr=0;    //search range got from mv_pred(). It uses step_param levels. (0-7)

-    int_mv frame_nearest_mv[4];
-    int_mv frame_near_mv[4];
-    int_mv frame_best_ref_mv[4];
-    int frame_mdcounts[4][4];
-    int frame_lf_or_gf[4];
-    unsigned char *y_buffer[4];
-    unsigned char *u_buffer[4];
-    unsigned char *v_buffer[4];
+    unsigned char *plane[4][3];
    int ref_frame_map[4];
+    int sign_bias = 0;

+    mode_mv = mode_mv_sb[sign_bias];
+    best_ref_mv.as_int = 0;
+    vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
    vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
    vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));

    /* Setup search priorities */
-    i=0;
-    ref_frame_map[i++] = INTRA_FRAME;
-    if (cpi->ref_frame_flags & VP8_LAST_FLAG)
-        ref_frame_map[i++] = LAST_FRAME;
-    if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
-        ref_frame_map[i++] = GOLDEN_FRAME;
-    if (cpi->ref_frame_flags & VP8_ALT_FLAG)
-        ref_frame_map[i++] = ALTREF_FRAME;
-    for(; i<4; i++)
-        ref_frame_map[i] = -1;
+    get_reference_search_order(cpi, ref_frame_map);

-    if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+    /* Check whether there is at least one valid reference frame for which
+     * near_mvs need to be calculated.
+     */
+    if (ref_frame_map[1] > 0)
    {
-        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+        sign_bias = vp8_find_near_mvs_bias(&x->e_mbd,
+                                           x->e_mbd.mode_info_context,
+                                           mode_mv_sb,
+                                           best_ref_mv_sb,
+                                           mdcounts,
+                                           ref_frame_map[1],
+                                           cpi->common.ref_frame_sign_bias);

-        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[LAST_FRAME], &frame_near_mv[LAST_FRAME],
-                          &frame_best_ref_mv[LAST_FRAME], frame_mdcounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);
-
-        y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset;
-        u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
-        v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset;
-
-        frame_lf_or_gf[LAST_FRAME] = 0;
+        mode_mv = mode_mv_sb[sign_bias];
+        best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
    }

-    if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
-    {
-        YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx];
-
-        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[GOLDEN_FRAME], &frame_near_mv[GOLDEN_FRAME],
-                          &frame_best_ref_mv[GOLDEN_FRAME], frame_mdcounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);
-
-        y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset;
-        u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
-        v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset;
-
-        frame_lf_or_gf[GOLDEN_FRAME] = 1;
-    }
-
-    if (cpi->ref_frame_flags & VP8_ALT_FLAG)
-    {
-        YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];
-
-        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[ALTREF_FRAME], &frame_near_mv[ALTREF_FRAME],
-                          &frame_best_ref_mv[ALTREF_FRAME], frame_mdcounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);
-
-        y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
-        u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
-        v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset;
-
-        frame_lf_or_gf[ALTREF_FRAME] = 1;
-    }
+    get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);

    *returnintra = INT_MAX;
    cpi->mbs_tested_so_far++;          // Count of the number of MBs tested so far this frame

    x->skip = 0;

-    vpx_memset(mode_mv, 0, sizeof(mode_mv));
-
    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
    rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion);
    uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
@@ -1835,18 +1797,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
    for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
    {
        int this_rd = INT_MAX;
-        int lf_or_gf = 0;           // Lat Frame (01) or gf/arf (1)
        int disable_skip = 0;
        int other_cost = 0;
        int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];

-        // Experimental debug code.
-        // Record of rd values recorded for this MB. -1 indicates not measured
-        //all_rds[mode_index] = -1;
-        //all_rates[mode_index] = -1;
-        //all_dist[mode_index] = -1;
-        //intermodecost[mode_index] = -1;
-
        // Test best rd so far against threshold for trying this mode.
        if (best_rd <= cpi->rd_threshes[mode_index])
            continue;
@@ -1876,14 +1830,16 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
        /* everything but intra */
        if (x->e_mbd.mode_info_context->mbmi.ref_frame)
        {
-            x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
-            x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
-            x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
-            mode_mv[NEARESTMV] = frame_nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
-            mode_mv[NEARMV] = frame_near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
-            best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
-            vpx_memcpy(mdcounts, frame_mdcounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
-            lf_or_gf = frame_lf_or_gf[x->e_mbd.mode_info_context->mbmi.ref_frame];
+            x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+            x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+            x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+            if (sign_bias != cpi->common.ref_frame_sign_bias[this_ref_frame])
+            {
+                sign_bias = cpi->common.ref_frame_sign_bias[this_ref_frame];
+                mode_mv = mode_mv_sb[sign_bias];
+                best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+            }
        }

        // Check to see if the testing frequency for this mode is at its max
@@ -2016,8 +1972,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
            int sadpb = x->sadperbit16;
            int_mv mvp_full;

-            int col_min = (best_ref_mv.as_mv.col>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7)?1:0);
-            int row_min = (best_ref_mv.as_mv.row>>3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7)?1:0);
+            int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+            int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
            int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL;
            int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL;

@@ -2166,7 +2122,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                continue;

            vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
-            vp8_build_inter16x16_predictors_mby(&x->e_mbd);
+            vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);

            if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
                x->skip = 1;
@@ -2286,11 +2242,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
        }

-        // Experimental debug code.
-        //all_rds[mode_index] = this_rd;
-        //all_rates[mode_index] = rate2;
-        //all_dist[mode_index] = distortion2;
-
        // Keep record of best intra distortion
        if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
            (this_rd < best_intra_rd) )
@@ -2391,7 +2342,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
        x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
        x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
-                                        (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+                                        (cpi->common.mb_no_coeff_skip);
        x->e_mbd.mode_info_context->mbmi.partitioning = 0;

        return;
@@ -2418,10 +2369,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                                      x->partition_info->bmi[15].mv.as_int;
    }

-    rd_update_mvcount(cpi, x, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);
-
-
+    if (sign_bias
+        != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+        best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;

+    rd_update_mvcount(cpi, x, &best_ref_mv);
 }

 void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -69,6 +69,54 @@ extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
 extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
 extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);

+/* Fill plane[0..2] with fb's y/u/v buffer pointers advanced by the given offsets. */
+static void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
+                               unsigned char            *plane[3],
+                               unsigned int              recon_yoffset,
+                               unsigned int              recon_uvoffset)
+{
+    plane[0] = fb->y_buffer + recon_yoffset;
+    plane[1] = fb->u_buffer + recon_uvoffset;
+    plane[2] = fb->v_buffer + recon_uvoffset;
+}
+
+/* Set plane[ref][0..2] for each reference enabled in cpi->ref_frame_flags; other rows are not written. */
+static void get_predictor_pointers(const VP8_COMP *cpi,
+                                       unsigned char  *plane[4][3],
+                                       unsigned int    recon_yoffset,
+                                       unsigned int    recon_uvoffset)
+{
+    if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+        get_plane_pointers(&cpi->common.yv12_fb[cpi->common.lst_fb_idx],
+                           plane[LAST_FRAME], recon_yoffset, recon_uvoffset);
+
+    if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+        get_plane_pointers(&cpi->common.yv12_fb[cpi->common.gld_fb_idx],
+                           plane[GOLDEN_FRAME], recon_yoffset, recon_uvoffset);
+
+    if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+        get_plane_pointers(&cpi->common.yv12_fb[cpi->common.alt_fb_idx],
+                           plane[ALTREF_FRAME], recon_yoffset, recon_uvoffset);
+}
+
+/* List INTRA_FRAME, then each enabled reference (last, golden, altref); pad the rest of ref_frame_map with -1. */
+static void get_reference_search_order(const VP8_COMP *cpi,
+                                           int             ref_frame_map[4])
+{
+    int i=0;
+
+    ref_frame_map[i++] = INTRA_FRAME;
+    if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+        ref_frame_map[i++] = LAST_FRAME;
+    if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+        ref_frame_map[i++] = GOLDEN_FRAME;
+    if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+        ref_frame_map[i++] = ALTREF_FRAME;
+    for(; i<4; i++) /* slots left over when a reference is disabled */
+        ref_frame_map[i] = -1;
+}
+
+
 extern void vp8_mv_pred
 (
    VP8_COMP *cpi,
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -22,7 +22,6 @@
 #include "ratectrl.h"
 #include "vp8/common/quant_common.h"
 #include "segmentation.h"
-#include "vp8/common/g_common.h"
 #include "vpx_scale/yv12extend.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/swapyv12buffer.h"
@@ -98,7 +97,7 @@ void vp8_temporal_filter_apply_c
    unsigned short *count
 )
 {
-    int i, j, k;
+    unsigned int i, j, k;
    int modifier;
    int byte = 0;

@@ -186,7 +185,7 @@ static int vp8_temporal_filter_find_matching_mb_c
    if (cpi->Speed < 8)
    {
        step_param = cpi->sf.first_step +
-                    ((cpi->Speed > 5) ? 1 : 0);
+                    (cpi->Speed > 5);
        further_steps =
            (cpi->sf.max_step_search_steps - 1)-step_param;
    }
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -514,17 +514,19 @@ static __inline void stuff1st_order_b
    TOKENEXTRA **tp,
    ENTROPY_CONTEXT *a,
    ENTROPY_CONTEXT *l,
+    int type,
    VP8_COMP *cpi
 )
 {
    int pt; /* near block/prev token context index */
+    int band;
    TOKENEXTRA *t = *tp;        /* store tokens starting here */
    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-
+    band = type ? 0 : 1;
    t->Token = DCT_EOB_TOKEN;
-    t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt];
+    t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
    t->skip_eob_node = 0;
-    ++cpi->coef_counts       [0] [1] [pt] [DCT_EOB_TOKEN];
+    ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
    ++t;
    *tp = t;
    pt = 0; /* 0 <-> all coeff data is zero */
@@ -561,15 +563,19 @@ void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
    ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
    int plane_type;
    int b;
-
-    stuff2nd_order_b(t,
+    plane_type = 3;
+    if((x->mode_info_context->mbmi.mode != B_PRED
+                        && x->mode_info_context->mbmi.mode != SPLITMV))
+    {
+        stuff2nd_order_b(t,
                     A + vp8_block2above[24], L + vp8_block2left[24], cpi);
-    plane_type = 0;
+        plane_type = 0;
+    }

    for (b = 0; b < 16; b++)
        stuff1st_order_b(t,
                         A + vp8_block2above[b],
-                         L + vp8_block2left[b], cpi);
+                         L + vp8_block2left[b], plane_type, cpi);

    for (b = 16; b < 24; b++)
        stuff1st_order_buv(t,
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -80,6 +80,9 @@ sym(vp8_fast_quantize_b_ssse3):
    mov         rdi, [rsi + vp8_blockd_dequant]
    mov         rcx, [rsi + vp8_blockd_dqcoeff]

+    movdqa      xmm2, xmm1                  ;store y for getting eob
+    movdqa      xmm3, xmm5
+
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
@@ -88,35 +91,30 @@ sym(vp8_fast_quantize_b_ssse3):
    movdqa      [rax], xmm1
    movdqa      [rax + 16], xmm5

-    movdqa      xmm2, [rdi]
-    movdqa      xmm3, [rdi + 16]
+    movdqa      xmm0, [rdi]
+    movdqa      xmm4, [rdi + 16]

-    pxor        xmm4, xmm4
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
+    pmullw      xmm0, xmm1
+    pmullw      xmm4, xmm5
+    pxor        xmm1, xmm1

-    pcmpeqw     xmm1, xmm4                  ;non zero mask
-    pcmpeqw     xmm5, xmm4                  ;non zero mask
-    packsswb    xmm1, xmm5
-    pshufb      xmm1, [GLOBAL(zz_shuf)]
+    pcmpgtw     xmm2, xmm1                  ;calculate eob
+    pcmpgtw     xmm3, xmm1
+    packsswb    xmm2, xmm3
+    pshufb      xmm2, [GLOBAL(zz_shuf)]

-    pmovmskb    edx, xmm1
-
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax                      ;flip the bits for bsr
-    bsr         eax, edx
-
-    movdqa      [rcx], xmm2                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm3            ;store dqcoeff
+    pmovmskb    edx, xmm2

+    movdqa      [rcx], xmm0                 ;store dqcoeff
+    movdqa      [rcx + 16], xmm4            ;store dqcoeff
    mov         rcx, [rsi + vp8_blockd_eob]

-    sub         edi, edx                    ;check for all zeros in bit mask
-    sar         edi, 31                     ;0 or -1
+    bsr         eax, edx                    ;count 0
    add         eax, 1
-    and         eax, edi                    ;if the bit mask was all zero,
-                                            ;then eob = 0
+
+    cmp         edx, 0                      ;if all 0, eob=0
+    cmove       eax, edx
+
    mov         BYTE PTR [rcx], al          ;store eob

    ; begin epilog
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -9,7 +9,6 @@
 ##

 VP8_COMMON_SRCS-yes += vp8_common.mk
-VP8_COMMON_SRCS-yes += common/type_aliases.h
 VP8_COMMON_SRCS-yes += common/pragmas.h
 VP8_COMMON_SRCS-yes += common/ppflags.h
 VP8_COMMON_SRCS-yes += common/onyx.h
@@ -20,6 +19,8 @@ VP8_COMMON_SRCS-yes += common/blockd.c
 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
 VP8_COMMON_SRCS-yes += common/debugmodes.c
 VP8_COMMON_SRCS-yes += common/default_coef_probs.h
+VP8_COMMON_SRCS-yes += common/dequantize.c
+VP8_COMMON_SRCS-yes += common/dequantize.h
 VP8_COMMON_SRCS-yes += common/entropy.c
 VP8_COMMON_SRCS-yes += common/entropymode.c
 VP8_COMMON_SRCS-yes += common/entropymv.c
@@ -28,17 +29,16 @@ VP8_COMMON_SRCS-yes += common/filter.c
 VP8_COMMON_SRCS-yes += common/filter.h
 VP8_COMMON_SRCS-yes += common/findnearmv.c
 VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
+VP8_COMMON_SRCS-yes += common/idct_blk.c
 VP8_COMMON_SRCS-yes += common/idctllm.c
 VP8_COMMON_SRCS-yes += common/alloccommon.h
 VP8_COMMON_SRCS-yes += common/blockd.h
 VP8_COMMON_SRCS-yes += common/common.h
-VP8_COMMON_SRCS-yes += common/common_types.h
 VP8_COMMON_SRCS-yes += common/entropy.h
 VP8_COMMON_SRCS-yes += common/entropymode.h
 VP8_COMMON_SRCS-yes += common/entropymv.h
 VP8_COMMON_SRCS-yes += common/extend.h
 VP8_COMMON_SRCS-yes += common/findnearmv.h
-VP8_COMMON_SRCS-yes += common/g_common.h
 VP8_COMMON_SRCS-yes += common/header.h
 VP8_COMMON_SRCS-yes += common/idct.h
 VP8_COMMON_SRCS-yes += common/invtrans.h
@@ -51,14 +51,12 @@ VP8_COMMON_SRCS-yes += common/recon.h
 VP8_COMMON_SRCS-yes += common/reconinter.h
 VP8_COMMON_SRCS-yes += common/reconintra.h
 VP8_COMMON_SRCS-yes += common/reconintra4x4.h
-VP8_COMMON_SRCS-yes += common/rtcd.c
 VP8_COMMON_SRCS-yes += common/setupintrarecon.h
 VP8_COMMON_SRCS-yes += common/subpixel.h
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
 VP8_COMMON_SRCS-yes += common/systemdependent.h
 VP8_COMMON_SRCS-yes += common/threading.h
 VP8_COMMON_SRCS-yes += common/treecoder.h
-VP8_COMMON_SRCS-yes += common/invtrans.c
 VP8_COMMON_SRCS-yes += common/loopfilter.c
 VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
 VP8_COMMON_SRCS-yes += common/mbpitch.c
@@ -70,9 +68,13 @@ VP8_COMMON_SRCS-yes += common/reconintra.c
 VP8_COMMON_SRCS-yes += common/reconintra4x4.c
 VP8_COMMON_SRCS-yes += common/setupintrarecon.c
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
+
+
+
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
 VP8_COMMON_SRCS-yes += common/treecoder.c

+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/dequantize_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
@@ -85,11 +87,14 @@ VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
@@ -107,8 +112,6 @@ endif

 # common (c)
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.h
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/idct_arm.h
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c
@@ -116,8 +119,12 @@ VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.h
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/recon_arm.h
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/reconintra_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/subpixel_arm.h
+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.h

 # common (armv6)
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.h
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/bilinearfilter_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem8x4_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem8x8_v6$(ASM)
@@ -130,6 +137,9 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/loopfilter_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/simpleloopfilter_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/sixtappredict8x4_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/intra4x4_predict_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/dequant_idct_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/dequantize_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/idct_blk_v6.c

 # common (neon)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict4x4_neon$(ASM)
@@ -152,3 +162,8 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict8x8_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict16x16_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/save_neon_reg$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/dequant_idct_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/dequantizeb_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/idct_blk_neon.c
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -83,7 +83,7 @@ struct vpx_codec_alg_priv
    vpx_codec_enc_cfg_t     cfg;
    struct vp8_extracfg     vp8_cfg;
    VP8_CONFIG              oxcf;
-    VP8_PTR             cpi;
+    struct VP8_COMP        *cpi;
    unsigned char          *cx_data;
    unsigned int            cx_data_sz;
    vpx_image_t             preview_img;
@@ -137,7 +137,8 @@ update_error_state(vpx_codec_alg_priv_t                 *ctx,

 static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
                                       const vpx_codec_enc_cfg_t *cfg,
-                                       const struct vp8_extracfg *vp8_cfg)
+                                       const struct vp8_extracfg *vp8_cfg,
+                                       int                        finalize)
 {
    RANGE_CHECK(cfg, g_w,                   1, 16383); /* 14 bits available */
    RANGE_CHECK(cfg, g_h,                   1, 16383); /* 14 bits available */
@@ -193,6 +194,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
    RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
    RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
    RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+    if(finalize && cfg->rc_end_usage == VPX_CQ)
+        RANGE_CHECK(vp8_cfg, cq_level,
+                    cfg->rc_min_quantizer, cfg->rc_max_quantizer);

 #if !(CONFIG_REALTIME_ONLY)
    if (cfg->g_pass == VPX_RC_LAST_PASS)
@@ -331,6 +335,10 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
    oxcf->under_shoot_pct          = cfg.rc_undershoot_pct;
    oxcf->over_shoot_pct           = cfg.rc_overshoot_pct;

+    oxcf->maximum_buffer_size_in_ms   = cfg.rc_buf_sz;
+    oxcf->starting_buffer_level_in_ms = cfg.rc_buf_initial_sz;
+    oxcf->optimal_buffer_level_in_ms  = cfg.rc_buf_optimal_sz;
+
    oxcf->maximum_buffer_size      = cfg.rc_buf_sz;
    oxcf->starting_buffer_level    = cfg.rc_buf_initial_sz;
    oxcf->optimal_buffer_level     = cfg.rc_buf_optimal_sz;
@@ -357,6 +365,10 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
    }

 #if CONFIG_MULTI_RES_ENCODING
+    /* When mr_cfg is NULL, oxcf->mr_total_resolutions and oxcf->mr_encoder_id
+     * are both memset to 0, which ensures the correct logic under this
+     * situation.
+     */
    if(mr_cfg)
    {
        oxcf->mr_total_resolutions        = mr_cfg->mr_total_resolutions;
@@ -439,7 +451,7 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t       *ctx,
    if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
        ERROR("Cannot increase lag_in_frames");

-    res = validate_config(ctx, cfg, &ctx->vp8_cfg);
+    res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0);

    if (!res)
    {
@@ -505,7 +517,7 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,

    }

-    res = validate_config(ctx, &ctx->cfg, &xcfg);
+    res = validate_config(ctx, &ctx->cfg, &xcfg, 0);

    if (!res)
    {
@@ -548,7 +560,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
    vpx_codec_enc_cfg_t       *cfg;
    unsigned int               i;

-    VP8_PTR optr;
+    struct VP8_COMP *optr;

    if (!ctx->priv)
    {
@@ -602,7 +614,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,

        vp8_initialize();

-        res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
+        res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0);

        if (!res)
        {
@@ -732,6 +744,9 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
    if (img)
        res = validate_img(ctx, img);

+    if (!res)
+        res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
+
    pick_quickcompress_mode(ctx, duration, deadline);
    vpx_codec_pkt_list_init(&ctx->pkt_list);

@@ -1226,7 +1241,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
        /* keyframing settings (kf) */
        VPX_KF_AUTO,        /* g_kfmode*/
        0,                  /* kf_min_dist */
-        9999,               /* kf_max_dist */
+        128,                /* kf_max_dist */

 #if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
        1,                  /* g_delete_first_pass_file */
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -57,7 +57,7 @@ struct vpx_codec_alg_priv
    vp8_stream_info_t       si;
    int                     defer_alloc;
    int                     decoder_init;
-    VP8D_PTR                pbi;
+    struct VP8D_COMP       *pbi;
    int                     postproc_cfg_set;
    vp8_postproc_cfg_t      postproc_cfg;
 #if CONFIG_POSTPROC_VISUALIZER
@@ -389,7 +389,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
        if (!res)
        {
            VP8D_CONFIG oxcf;
-            VP8D_PTR optr;
+            struct VP8D_COMP* optr;

            vp8dx_initialize();

@@ -412,7 +412,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
                && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
            {
                ctx->postproc_cfg.post_proc_flag =
-                    VP8_DEBLOCK | VP8_DEMACROBLOCK;
+                    VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE;
                ctx->postproc_cfg.deblocking_level = 4;
                ctx->postproc_cfg.noise_level = 0;
            }
@@ -700,6 +700,27 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
        return VPX_CODEC_INVALID_PARAM;
 }

+extern int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame );
+/* VP8D_GET_LAST_REF_USED handler: writes a VP8_{LAST,GOLD,ALTR}_FRAME bitmask
+ * built from vp8dx_references_buffer(); fails if the decoder is not created. */
+static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
+                                              int ctrl_id,
+                                              va_list args)
+{
+    int *ref_info = va_arg(args, int *);
+    VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+    VP8_COMMON *oci;
+    if (!ref_info)
+        return VPX_CODEC_INVALID_PARAM;
+    if (!pbi) /* NOTE(review): pbi looks to be created lazily in vp8_decode() */
+        return VPX_CODEC_ERROR;
+    oci = &pbi->common;
+    *ref_info =
+        (vp8dx_references_buffer( oci, ALTREF_FRAME )?VP8_ALTR_FRAME:0) |
+        (vp8dx_references_buffer( oci, GOLDEN_FRAME )?VP8_GOLD_FRAME:0) |
+        (vp8dx_references_buffer( oci, LAST_FRAME )?VP8_LAST_FRAME:0);
+    return VPX_CODEC_OK;
+}

 static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
                                               int ctrl_id,
@@ -731,6 +752,7 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
    {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_options},
    {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
    {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
+    {VP8D_GET_LAST_REF_USED,        vp8_get_last_ref_frame},
    { -1, NULL},
 };

--- a/Show More
+++ b/Show More