Merge vp9-preview changes into experimental branch

Incorporate vp9-preview changes by merging master branch into experimental. Conflicts: test/test.mk vp9/common/vp9_filter.c vp9/common/vp9_idctllm.c vp9/common/vp9_invtrans.h vp9/common/vp9_mbpitch.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_systemdependent.h vp9/common/vp9_type_aliases.h vp9/common/x86/vp9_asm_stubs.c vp9/common/x86/vp9_subpixel_mmx.asm vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_dequantize.c vp9/decoder/vp9_dequantize.h vp9/decoder/vp9_onyxd_int.h vp9/encoder/vp9_bitstream.c vp9/encoder/vp9_encodeframe.c vp9/encoder/vp9_rdopt.c Change-Id: I17f51c3666d1b59cf1a699f87607cbc5d30a87c5
2013-01-08 10:11:26 -08:00 · 2013-01-08 10:11:26 -08:00 · 879cb7d962
commit 879cb7d962
parent c14439c3d3 bdca030caf
132 changed files with 1491 additions and 1192 deletions
--- a/29
+++ b/29
@ -1,3 +1,32 @@
+2012-12-21 v1.2.0
+  This release acts as a checkpoint for a large amount of internal refactoring
+  and testing. It also contains a number of small bugfixes, so all users are
+  encouraged to upgrade.
+
+  - Upgrading:
+    This release is ABI and API compatible with Duclair (v1.0.0). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+  - Enhancements:
+      VP8 optimizations for MIPS dspr2
+      vpxenc: add -quiet option
+
+  - Speed:
+      Encoder and decoder speed is consistent with the Eider release.
+
+  - Quality:
+      In general, quality is consistent with the Eider release.
+
+      Minor tweaks to ARNR filtering
+      Minor improvements to real time encoding with multiple temporal layers
+
+  - Bug Fixes:
+      Fixes multithreaded encoder race condition in loopfilter
+      Fixes multi-resolution threaded encoding
+      Fix potential encoder dead-lock after picture resize
+
+
 2012-05-09 v1.1.0 "Eider"
  This introduces a number of enhancements, mostly focused on real-time
  encoding. In addition, it fixes a decoder bug (first introduced in
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@ -27,7 +27,7 @@
 # Android.mk file in the libvpx directory:
 # LOCAL_PATH := $(call my-dir)
 # include $(CLEAR_VARS)
-# include libvpx/build/make/Android.mk
+# include jni/libvpx/build/make/Android.mk
 #
 # There are currently two TARGET_ARCH_ABI targets for ARM.
 # armeabi and armeabi-v7a.  armeabi-v7a is selected by creating an
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@ -61,26 +61,26 @@ while (<STDIN>)
    s/:SHR:/ >> /g;

    # Convert ELSE to .else
-    s/ELSE/.else/g;
+    s/\bELSE\b/.else/g;

    # Convert ENDIF to .endif
-    s/ENDIF/.endif/g;
+    s/\bENDIF\b/.endif/g;

    # Convert ELSEIF to .elseif
-    s/ELSEIF/.elseif/g;
+    s/\bELSEIF\b/.elseif/g;

    # Convert LTORG to .ltorg
-    s/LTORG/.ltorg/g;
+    s/\bLTORG\b/.ltorg/g;

    # Convert endfunc to nothing.
-    s/endfunc//ig;
+    s/\bendfunc\b//ig;

    # Convert FUNCTION to nothing.
-    s/FUNCTION//g;
-    s/function//g;
+    s/\bFUNCTION\b//g;
+    s/\bfunction\b//g;

-    s/ENTRY//g;
-    s/MSARMASM/0/g;
+    s/\bENTRY\b//g;
+    s/\bMSARMASM\b/0/g;
    s/^\s+end\s+$//g;

    # Convert IF :DEF:to .if
@ -149,11 +149,15 @@ while (<STDIN>)
    s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;

    # ALIGN directive
-    s/ALIGN/.balign/g;
+    s/\bALIGN\b/.balign/g;

    # ARM code
    s/\sARM/.arm/g;

+    # push/pop
+    s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g;
+    s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g;
+
    # NEON code
    s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g;
    s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g;
@ -189,7 +193,7 @@ while (<STDIN>)
    s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/;

    # Begin macro definition
-    if (/MACRO/) {
+    if (/\bMACRO\b/) {
        $_ = <STDIN>;
        s/^/.macro/;
        s/\$//g;                # remove formal param reference
@ -198,7 +202,7 @@ while (<STDIN>)

    # For macros, use \ to reference formal params
    s/\$/\\/g;                  # End macro definition
-    s/MEND/.endm/;              # No need to tell it where to stop assembling
+    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
    next if /^\s*END\s*$/;
    print;
    print "$comment_sub$comment\n" if defined $comment;
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@ -277,6 +277,7 @@ clean_temp_files() {
 # Toolchain Check Functions
 #
 check_cmd() {
+    enabled external_build && return
    log "$@"
    "$@" >>${logfile} 2>&1
 }
@ -767,6 +768,7 @@ process_common_toolchain() {
            ;;
        armv5te)
            soft_enable edsp
+            disable fast_unaligned
            ;;
        esac

@ -1000,7 +1002,11 @@ EOF
        soft_enable sse2
        soft_enable sse3
        soft_enable ssse3
-        soft_enable sse4_1
+        if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
+        else
+            soft_enable sse4_1
+        fi

        case  ${tgt_os} in
            win*)
@ -1175,9 +1181,6 @@ EOF
        ;;
    esac

-    # for sysconf(3) and friends.
-    check_header unistd.h
-
    # glibc needs these
    if enabled linux; then
        add_cflags -D_LARGEFILE_SOURCE
--- a/28
+++ b/28
@ -303,6 +303,7 @@ CONFIG_LIST="
    ${EXPERIMENT_LIST}
 "
 CMDLINE_SELECT="
+    external_build
    extra_warnings
    werror
    install_docs
@ -502,7 +503,7 @@ process_detect() {
            fi
        fi
    fi
-    if [ -z "$CC" ]; then
+    if [ -z "$CC" ] || enabled external_build; then
        echo "Bypassing toolchain for environment detection."
        enable external_build
        check_header() {
@ -511,6 +512,7 @@ process_detect() {
            shift
            var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
            disable $var
+            # Headers common to all environments
            case $header in
                stdio.h)
                    true;
@ -522,6 +524,25 @@ process_detect() {
                    done
                    ${result:-true}
            esac && enable $var
+
+            # Specialize windows and POSIX environments.
+            case $toolchain in
+                *-win*-*)
+                    case $header-$toolchain in
+                        stdint*-gcc) true;;
+                        *) false;;
+                    esac && enable $var
+                    ;;
+                *)
+                    case $header in
+                        stdint.h) true;;
+                        pthread.h) true;;
+                        sys/mman.h) true;;
+                        unistd.h) true;;
+                        *) false;;
+                    esac && enable $var
+            esac
+            enabled $var
        }
        check_ld() {
            true
@ -535,6 +556,7 @@ EOF
    check_header stdint.h
    check_header pthread.h
    check_header sys/mman.h
+    check_header unistd.h # for sysconf(3) and friends.

    check_header vpx/vpx_integer.h -I${source_path} && enable vpx_ports
 }
@ -643,6 +665,10 @@ process_toolchain() {
        *-android-*)
            # GTestLog must be modified to use Android logging utilities.
        ;;
+        *-darwin-*)
+            # iOS/ARM builds do not work with gtest. This does not match
+            # x86 targets.
+        ;;
        *)
            check_cxx "$@" <<EOF && soft_enable unit_tests
 int z;
--- a/examples/decode_with_partial_drops.txt
+++ b/examples/decode_with_partial_drops.txt
@ -0,0 +1,238 @@
+@TEMPLATE decoder_tmpl.c
+Decode With Partial Drops Example
+=========================
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
+This is an example utility which drops a series of frames (or parts of frames),
+as specified on the command line. This is useful for observing the error
+recovery features of the codec.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_INCLUDES
+#include <time.h>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_INCLUDES
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS
+struct parsed_header
+{
+    char key_frame;
+    int version;
+    char show_frame;
+    int first_part_size;
+};
+
+/* Return the size in bytes of the next piece to cut from a frame.
+ *
+ * hdr    - parsed frame header (key_frame and first_part_size are read)
+ * pos    - number of bytes of the frame already consumed
+ * length - total frame length in bytes
+ * mtu    - largest piece size allowed
+ *
+ * While the uncompressed header and first partition have not been fully
+ * consumed, the piece ends at the first partition boundary or after mtu
+ * bytes, whichever comes first. After that the rest of the frame is cut
+ * into mtu-sized pieces. The result never exceeds the bytes left in the
+ * frame, and is 0 once the frame has been consumed.
+ */
+int next_packet(struct parsed_header* hdr, int pos, int length, int mtu)
+{
+    const int remaining = length - pos;
+    /* Uncompressed part is 3 bytes for P frames and 10 bytes for I frames */
+    const int uncomp_part_size = (hdr->key_frame ? 10 : 3);
+    /* number of bytes yet to send from header and the first partition */
+    const int remain_first = uncomp_part_size + hdr->first_part_size - pos;
+    int size;
+
+    if (remaining <= 0)
+    {
+        return 0;
+    }
+
+    /* first partition if any of it is left, otherwise the second one */
+    size = (remain_first > 0) ? remain_first : remaining;
+
+    /* A header whose first_part_size is larger than the frame must not
+     * make the caller read past the end of the frame buffer. */
+    if (size > remaining)
+    {
+        size = remaining;
+    }
+    if (size > mtu)
+    {
+        size = mtu;
+    }
+    return size;
+}
+
+/* Simulate packet loss on a single compressed frame, in place.
+ *
+ * frame     - frame data; rewritten so that it holds only the kept pieces,
+ *             with the unused tail of the original length zeroed
+ * size      - in: frame length in bytes; out: number of bytes kept
+ * loss_rate - chance, in percent, that any one piece is dropped
+ * thrown    - incremented once per dropped piece; while it is non-zero (on
+ *             entry or after the first drop) every piece is dropped
+ * kept      - incremented once per kept piece; for key frames it is set to
+ *             the approximate piece count instead
+ *
+ * Frames shorter than 3 bytes are returned untouched. Key frames are never
+ * dropped. Progress is written to stdout: '|' starts a frame, '.' marks a
+ * kept piece and 'X' a dropped one.
+ */
+void throw_packets(unsigned char* frame, int* size, int loss_rate,
+                   int* thrown, int* kept)
+{
+    int pkg_size = 0;
+    int pos = 0;
+    int loss_pos = 0;
+    struct parsed_header hdr;
+    unsigned int tmp;
+    int mtu = 1500;
+
+    if (*size < 3)
+    {
+        return;
+    }
+    putc('|', stdout);
+    /* parse uncompressed 3 bytes */
+    tmp = (frame[2] << 16) | (frame[1] << 8) | frame[0];
+    hdr.key_frame = !(tmp & 0x1); /* inverse logic */
+    hdr.version = (tmp >> 1) & 0x7;
+    hdr.show_frame = (tmp >> 4) & 0x1;
+    hdr.first_part_size = (tmp >> 5) & 0x7FFFF;
+
+    /* don't drop key frames */
+    if (hdr.key_frame)
+    {
+        int i;
+        *kept = *size/mtu + ((*size % mtu > 0) ? 1 : 0); /* approximate */
+        for (i=0; i < *kept; i++)
+            putc('.', stdout);
+        return;
+    }
+
+    /* Stop at the end of the frame even if the header claims a first
+     * partition that extends past it. */
+    while (pos < *size &&
+           (pkg_size = next_packet(&hdr, pos, *size, mtu)) > 0)
+    {
+        int loss_event = ((rand() + 1.0)/(RAND_MAX + 1.0) < loss_rate/100.0);
+
+        /* never touch bytes beyond the end of the frame */
+        if (pkg_size > *size - pos)
+        {
+            pkg_size = *size - pos;
+        }
+
+        if (*thrown == 0 && !loss_event)
+        {
+            /* Compact in place instead of staging into a fixed-size stack
+             * buffer, which overflowed for frames larger than 256 KiB.
+             * loss_pos never exceeds pos, so kept bytes only move towards
+             * the start; memmove because the two ranges may overlap. */
+            memmove(frame + loss_pos, frame + pos, pkg_size);
+            loss_pos += pkg_size;
+            (*kept)++;
+            putc('.', stdout);
+        }
+        else
+        {
+            (*thrown)++;
+            putc('X', stdout);
+        }
+        pos += pkg_size;
+    }
+    /* clear whatever is left of the original frame after the kept bytes */
+    memset(frame + loss_pos, 0, *size - loss_pos);
+    *size = loss_pos;
+}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INIT
+/* Initialize codec */
+flags = VPX_CODEC_USE_ERROR_CONCEALMENT;
+res = vpx_codec_dec_init(&codec, interface, &dec_cfg, flags);
+if(res)
+    die_codec(&codec, "Failed to initialize decoder");
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INIT
+
+Usage
+-----
+This example adds a single argument to the `simple_decoder` example,
+which specifies the range or pattern of frames to drop. The parameter is
+parsed as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ USAGE
+/* Parse the command line: optional "-t <num threads>" followed by the drop
+ * pattern "N-M" (range), "N/M" (pattern) or "L,S" / "L" (loss rate, seed).
+ * NOTE(review): this relies on die() not returning - confirm in the
+ * template. */
+if(argc < 4 || argc > 6)
+    die("Usage: %s <infile> <outfile> [-t <num threads>] <N-M|N/M|L,S>\n",
+        argv[0]);
+{
+    const char *pattern;
+    char *nptr;
+    int arg_num = 3;
+    if (argc == 6 && strncmp(argv[arg_num++], "-t", 2) == 0)
+        dec_cfg.threads = strtol(argv[arg_num++], NULL, 0);
+    pattern = argv[arg_num];
+    n = strtol(pattern, &nptr, 0);
+    mode = (*nptr == '\0' || *nptr == ',') ? 2 : (*nptr == '-') ? 1 : 0;
+
+    /* Only look for a second number when a separator is present; nptr+1
+     * would otherwise point past the string's terminating NUL. */
+    m = (*nptr != '\0') ? strtol(nptr+1, NULL, 0) : 0;
+    /* An N/M pattern with M <= 0 is rejected here because the drop decision
+     * computes a remainder modulo m. */
+    if((!n && !m) || (*nptr != '-' && *nptr != '/' &&
+        *nptr != '\0' && *nptr != ',') ||
+       (*nptr == '/' && m <= 0))
+        die("Couldn't parse pattern %s\n", pattern);
+}
+/* a positive second number doubles as the RNG seed; otherwise use the clock */
+seed = (m > 0) ? m : (unsigned int)time(NULL);
+srand(seed);
+thrown_frame = 0;
+printf("Seed: %u\n", seed);
+printf("Threads: %d\n", dec_cfg.threads);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ USAGE
+
+
+Dropping A Range Of Frames
+--------------------------
+To drop a range of frames, specify the starting frame and the ending
+frame to drop, separated by a dash. The following command will drop
+frames 5 through 10 (base 1).
+
+  $ ./decode_with_partial_drops in.ivf out.i420 5-10
+
+
+Dropping A Pattern Of Frames
+----------------------------
+To drop a pattern of frames, specify the number of frames to drop and
+the number of frames after which to repeat the pattern, separated by
+a forward-slash. The following command will drop 3 of 7 frames.
+Specifically, it will decode 4 frames, then drop 3 frames, and then
+repeat.
+
+  $ ./decode_with_partial_drops in.ivf out.i420 3/7
+
+Dropping Random Parts Of Frames
+-------------------------------
+A third argument tuple is available to split the frame into 1500-byte pieces
+and randomly drop pieces rather than frames. The frame will be split at
+partition boundaries where possible. The following example will seed the RNG
+with the seed 123 and drop approximately 5% of the pieces. Pieces which
+depend on an already dropped piece will also be dropped.
+
+  $ ./decode_with_partial_drops in.ivf out.i420 5,123
+
+
+Extra Variables
+---------------
+This example maintains the pattern passed on the command line in the
+`n`, `m`, and `mode` variables:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_VARS
+int              n, m, mode;
+unsigned int     seed;
+int              thrown=0, kept=0;
+int              thrown_frame=0, kept_frame=0;
+vpx_codec_dec_cfg_t  dec_cfg = {0};
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_VARS
+
+
+Making The Drop Decision
+------------------------
+The example decides whether to drop the frame based on the current
+frame number, immediately before decoding the frame.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
+/* Decide whether to throw parts of the frame or the whole frame
+   depending on the drop mode:
+     mode 0 - N/M pattern: drop when m - (frame_cnt-1)%m <= n,
+     mode 1 - N-M range: drop frames n through m inclusive,
+     mode 2 - hand the frame to throw_packets() with n as its loss rate.
+   In modes 0 and 1 a dropped frame is signalled by setting frame_sz to 0,
+   and the per-frame counters are updated here; in mode 2 throw_packets()
+   receives the counters and updates them itself. The per-frame counts are
+   then added to the running totals. */
+thrown_frame = 0;
+kept_frame = 0;
+switch (mode)
+{
+case 0:
+    if (m - (frame_cnt-1)%m <= n)
+    {
+        frame_sz = 0;
+    }
+    break;
+case 1:
+    if (frame_cnt >= n && frame_cnt <= m)
+    {
+        frame_sz = 0;
+    }
+    break;
+case 2:
+    throw_packets(frame, &frame_sz, n, &thrown_frame, &kept_frame);
+    break;
+default: break;
+}
+if (mode < 2)
+{
+    if (frame_sz == 0)
+    {
+        putc('X', stdout);
+        thrown_frame++;
+    }
+    else
+    {
+        putc('.', stdout);
+        kept_frame++;
+    }
+}
+thrown += thrown_frame;
+kept += kept_frame;
+fflush(stdout);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
--- a/libs.mk
+++ b/libs.mk
@ -61,8 +61,16 @@ endef
 CODEC_SRCS-yes += CHANGELOG
 CODEC_SRCS-yes += libs.mk

+# If this is a universal (fat) binary, then all the subarchitectures have
+# already been built and our job is to stitch them together. The
+# BUILD_LIBVPX variable indicates whether we should be building
+# (compiling, linking) the library. The LIPO_LIBVPX variable indicates
+# that we're stitching.
+$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)
+
 include $(SRC_PATH_BARE)/vpx/vpx_codec.mk
 CODEC_SRCS-yes += $(addprefix vpx/,$(call enabled,API_SRCS))
+CODEC_DOC_SRCS += $(addprefix vpx/,$(call enabled,API_DOC_SRCS))

 include $(SRC_PATH_BARE)/vpx_mem/vpx_mem.mk
 CODEC_SRCS-yes += $(addprefix vpx_mem/,$(call enabled,MEM_SRCS))
@ -70,6 +78,9 @@ CODEC_SRCS-yes += $(addprefix vpx_mem/,$(call enabled,MEM_SRCS))
 include $(SRC_PATH_BARE)/vpx_scale/vpx_scale.mk
 CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))

+include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk
+CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))
+
 ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
  VP8_PREFIX=vp8/
  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
@ -79,11 +90,8 @@ ifeq ($(CONFIG_VP8_ENCODER),yes)
  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk
  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
-  CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h
-  CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk
  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
-  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
  CODEC_DOC_SECTIONS += vp8 vp8_encoder
 endif

@ -91,10 +99,8 @@ ifeq ($(CONFIG_VP8_DECODER),yes)
  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk
  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
-  CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
-  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
  CODEC_DOC_SECTIONS += vp8 vp8_decoder
 endif

@ -155,30 +161,13 @@ INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Release/%)
 INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Debug/%)
 endif

-# If this is a universal (fat) binary, then all the subarchitectures have
-# already been built and our job is to stitch them together. The
-# BUILD_LIBVPX variable indicates whether we should be building
-# (compiling, linking) the library. The LIPO_LIBVPX variable indicates
-# that we're stitching.
-$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)
-
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.sh
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx/vpx_integer.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/asm_offsets.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_timer.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem.h
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emmintrin_compat.h
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h
 CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
-ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emms.asm
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_abi_support.asm
 CODEC_SRCS-$(BUILD_LIBVPX) += third_party/x86inc/x86inc.asm
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_cpuid.c
-endif
-CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm_cpudetect.c
-CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm.h
 CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
@ -202,8 +191,7 @@ INSTALL-LIBS-$(CONFIG_STATIC) += $(LIBSUBDIR)/libvpx.a
 INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a
 endif

-CODEC_SRCS=$(filter-out %_offsets.c,\
-           $(filter-out %_test.cc,$(call enabled,CODEC_SRCS)))
+CODEC_SRCS=$(call enabled,CODEC_SRCS)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(CODEC_SRCS)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)

@ -306,6 +294,7 @@ CLEAN-OBJS += libvpx.syms
 define libvpx_symlink_template
 $(1): $(2)
 	@echo "    [LN]     $(2) $$@"
+	$(qexec)mkdir -p $$(dir $$@)
 	$(qexec)ln -sf $(2) $$@
 endef

@ -314,7 +303,7 @@ $(eval $(call libvpx_symlink_template,\
    $(BUILD_PFX)$(LIBVPX_SO)))
 $(eval $(call libvpx_symlink_template,\
    $(addprefix $(DIST_DIR)/,$(LIBVPX_SO_SYMLINKS)),\
-    $(DIST_DIR)/$(LIBSUBDIR)/$(LIBVPX_SO)))
+    $(LIBVPX_SO)))


 INSTALL-LIBS-$(BUILD_LIBVPX_SO) += $(LIBVPX_SO_SYMLINKS)
@ -375,10 +364,6 @@ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
 $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
 CLEAN-OBJS += $(BUILD_PFX)vpx_version.h

-CODEC_DOC_SRCS += vpx/vpx_codec.h \
-                  vpx/vpx_decoder.h \
-                  vpx/vpx_encoder.h \
-                  vpx/vpx_image.h

 ##
 ## libvpx test directives
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@ -59,9 +59,13 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
    /* Test the buffer model here before subtracting the frame. Do so because
     * the way the leaky bucket model works in libvpx is to allow the buffer to
     * empty - and then stop showing frames until we've got enough bits to
-     * show one. */
-    ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
-        << pkt->data.frame.pts;
+     * show one. As noted in comment below (issue 495), this does not currently
+     * apply to key frames. For now exclude key frames in condition below. */
+    bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true: false;
+    if (!key_frame) {
+      ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
+          << pkt->data.frame.pts;
+    }

    const int frame_size_in_bits = pkt->data.frame.sz * 8;

@ -125,7 +129,12 @@ TEST_P(DatarateTest, BasicBufferModel) {
  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                       30, 1, 0, 140);

-  for (int i = 70; i < 700; i += 200) {
+  // There is an issue for low bitrates in real-time mode, where the
+  // effective_datarate slightly overshoots the target bitrate.
+  // This is the same issue noted elsewhere in this file (#495).
+  // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100),
+  // when the issue is resolved.
+  for (int i = 100; i < 800; i += 200) {
    cfg_.rc_target_bitrate = i;
    ResetModel();
    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@ -9,6 +9,7 @@
 */
 #include "test/decode_test_driver.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
 #include "test/video_source.h"

 namespace libvpx_test {
@ -21,8 +22,9 @@ void Decoder::DecodeFrame(const uint8_t *cxdata, int size) {
    ASSERT_EQ(VPX_CODEC_OK, res_init) << DecodeError();
  }

-  const vpx_codec_err_t res_dec = vpx_codec_decode(&decoder_,
-                                                   cxdata, size, NULL, 0);
+  vpx_codec_err_t res_dec;
+  REGISTER_STATE_CHECK(res_dec = vpx_codec_decode(&decoder_,
+                                                  cxdata, size, NULL, 0));
  ASSERT_EQ(VPX_CODEC_OK, res_dec) << DecodeError();
 }

--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@ -12,6 +12,7 @@
 #if CONFIG_VP8_DECODER
 #include "test/decode_test_driver.h"
 #endif
+#include "test/register_state_check.h"
 #include "test/video_source.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"

@ -58,9 +59,10 @@ void Encoder::EncodeFrameInternal(const VideoSource &video,
  }

  // Encode the frame
-  res = vpx_codec_encode(&encoder_,
-                         video.img(), video.pts(), video.duration(),
-                         frame_flags, deadline_);
+  REGISTER_STATE_CHECK(
+      res = vpx_codec_encode(&encoder_,
+                             video.img(), video.pts(), video.duration(),
+                             frame_flags, deadline_));
  ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
 }

--- a/test/idctllm_test.cc
+++ b/test/idctllm_test.cc
@ -13,6 +13,7 @@ extern "C" {
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 }
+#include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"

 typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
@ -54,7 +55,7 @@ TEST_P(IDCTTest, TestAllZeros)
 {
    int i;

-    UUT(input, output, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

    for(i=0; i<256; i++)
        if((i&0xF) < 4 && i<64)
@ -68,7 +69,7 @@ TEST_P(IDCTTest, TestAllOnes)
    int i;

    input[0] = 4;
-    UUT(input, output, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

    for(i=0; i<256; i++)
        if((i&0xF) < 4 && i<64)
@ -85,7 +86,7 @@ TEST_P(IDCTTest, TestAddOne)
        predict[i] = i;

    input[0] = 4;
-    UUT(input, predict, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));

    for(i=0; i<256; i++)
        if((i&0xF) < 4 && i<64)
@ -101,7 +102,7 @@ TEST_P(IDCTTest, TestWithData)
    for(i=0; i<16; i++)
        input[i] = i;

-    UUT(input, output, 16, output, 16);
+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

    for(i=0; i<256; i++)
        if((i&0xF) > 3 || i>63)
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@ -11,6 +11,7 @@

 #include <string.h>
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
 #include "vpx_config.h"
@ -246,8 +247,10 @@ class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>,

  virtual void Predict(MB_PREDICTION_MODE mode) {
    mb_.mode_info_context->mbmi.mode = mode;
-    pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[0] - 1, kStride,
-             data_ptr_[0], kStride);
+    REGISTER_STATE_CHECK(pred_fn_(&mb_,
+                                  data_ptr_[0] - kStride,
+                                  data_ptr_[0] - 1, kStride,
+                                  data_ptr_[0], kStride));
  }

  intra_pred_y_fn_t pred_fn_;
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@ -7,6 +7,7 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
+#include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
 #include "vpx_config.h"
@ -74,8 +75,8 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
  // Initialize pixels in the output to 99.
  (void)vpx_memset(dst_image, 99, output_size);

-  GetParam()(src_image_ptr, dst_image_ptr, input_stride,
-             output_stride, block_width, flimits, 16);
+  REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr, input_stride,
+                                  output_stride, block_width, flimits, 16));

  static const uint8_t expected_data[block_height] = {
    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@ -0,0 +1,95 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+
+#ifdef _WIN64
+
+#define _WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <winnt.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace testing {
+namespace internal {
+
+inline bool operator==(const M128A& lhs, const M128A& rhs) {
+  return (lhs.Low == rhs.Low && lhs.High == rhs.High);
+}
+
+}  // namespace internal
+}  // namespace testing
+
+namespace libvpx_test {
+
+// Compares the state of xmm[6-15] at construction with their state at
+// destruction. These registers should be preserved by the callee on
+// Windows x64.
+// Usage:
+// {
+//   RegisterStateCheck reg_check;
+//   FunctionToVerify();
+// }
+class RegisterStateCheck {
+ public:
+  // Snapshots the register state; initialized_ records whether that worked.
+  RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); }
+  // Takes a second snapshot and reports a test failure if xmm6-xmm15 differ
+  // from the first one, or if either snapshot could not be taken.
+  ~RegisterStateCheck() { EXPECT_TRUE(Check()); }
+
+ private:
+  // Requests the floating-point part of the calling thread's context into
+  // |context|. Returns true on success; a failure is also reported to gtest
+  // together with GetLastError().
+  static bool StoreRegisters(CONTEXT* const context) {
+    const HANDLE this_thread = GetCurrentThread();
+    EXPECT_TRUE(this_thread != NULL);
+    context->ContextFlags = CONTEXT_FLOATING_POINT;
+    const bool context_saved = GetThreadContext(this_thread, context) == TRUE;
+    EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError();
+    return context_saved;
+  }
+
+  // Compares the register state. Returns true if the states match.
+  bool Check() const {
+    if (!initialized_) return false;
+    CONTEXT post_context;
+    if (!StoreRegisters(&post_context)) return false;
+
+    const M128A* xmm_pre = &pre_context_.Xmm6;
+    const M128A* xmm_post = &post_context.Xmm6;
+    // Track the outcome locally. Test::HasNonfatalFailure() is also true when
+    // an unrelated, earlier expectation in the same test has failed, which
+    // would make every later register check report a mismatch as well.
+    bool unchanged = true;
+    for (int i = 6; i <= 15; ++i) {
+      EXPECT_EQ(*xmm_pre, *xmm_post) << "xmm" << i << " has been modified!";
+      if (xmm_pre->Low != xmm_post->Low || xmm_pre->High != xmm_post->High) {
+        unchanged = false;
+      }
+      ++xmm_pre;
+      ++xmm_post;
+    }
+    return unchanged;
+  }
+
+  bool initialized_;   // true when the constructor's snapshot succeeded
+  CONTEXT pre_context_;  // register state captured at construction
+};
+
+#define REGISTER_STATE_CHECK(statement) do { \
+  libvpx_test::RegisterStateCheck reg_check; \
+  statement;                               \
+} while (false)
+
+}  // namespace libvpx_test
+
+#else  // !_WIN64
+
+namespace libvpx_test {
+
+class RegisterStateCheck {};
+#define REGISTER_STATE_CHECK(statement) statement
+
+}  // namespace libvpx_test
+
+#endif  // _WIN64
+
+#endif  // LIBVPX_TEST_REGISTER_STATE_CHECK_H_
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@ -21,6 +21,7 @@ extern "C" {
 }

 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"

@ -65,9 +66,11 @@ class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) {

  sad_m_by_n_fn_t sad_fn_;
  virtual unsigned int SAD(unsigned int max_sad) {
-    return sad_fn_(source_data_, source_stride_,
-                   reference_data_, reference_stride_,
-                   max_sad);
+    unsigned int ret;
+    REGISTER_STATE_CHECK(ret = sad_fn_(source_data_, source_stride_,
+                                       reference_data_, reference_stride_,
+                                       max_sad));
+    return ret;
  }

  // Sum of Absolute Differences. Given two blocks, calculate the absolute
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc
@ -12,6 +12,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
@ -136,8 +137,8 @@ TEST_P(SixtapPredictTest, TestWithPresetData) {

  uint8_t *src = const_cast<uint8_t*>(test_data);

-  sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
-                  2, 2, dst_, kDstStride);
+  REGISTER_STATE_CHECK(sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
+                                       2, 2, dst_, kDstStride));

  for (int i = 0; i < height_; ++i)
    for (int j = 0; j < width_; ++j)
@ -162,8 +163,9 @@ TEST_P(SixtapPredictTest, TestWithRandomData) {
                                xoffset, yoffset, dst_c_, kDstStride);

      // Run test.
-      sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
-                      xoffset, yoffset, dst_, kDstStride);
+      REGISTER_STATE_CHECK(
+          sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                          xoffset, yoffset, dst_, kDstStride));

      for (int i = 0; i < height_; ++i)
        for (int j = 0; j < width_; ++j)
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@ -10,6 +10,7 @@

 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/acm_random.h"
+#include "test/register_state_check.h"
 extern "C" {
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
@ -77,7 +78,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
      predictor += kDiffPredStride;
    }

-    GetParam()(&be, &bd, kDiffPredStride);
+    REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));

    base_src = *be.base_src;
    src_diff = be.src_diff;
--- a/test/test.mk
+++ b/test/test.mk
@ -1,3 +1,4 @@
+LIBVPX_TEST_SRCS-yes += register_state_check.h
 LIBVPX_TEST_SRCS-yes += test.mk
 LIBVPX_TEST_SRCS-yes += acm_random.h

@ -59,16 +60,18 @@ ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
 LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
+
+# IDCT test currently depends on FDCT function
+LIBVPX_TEST_SRCS-yes                   += idct8x8_test.cc
 endif

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 #LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_TX32X32),yesyes)
 LIBVPX_TEST_SRCS-yes += dct32x32_test.cc
 endif
-LIBVPX_TEST_SRCS-yes += idct8x8_test.cc
-LIBVPX_TEST_SRCS-yes += variance_test.cc
 endif # VP9


--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@ -9,9 +9,10 @@
 */
 #include <string>
 #include "vpx_config.h"
-#if ARCH_X86 || ARCH_X86_64
 extern "C" {
+#if ARCH_X86 || ARCH_X86_64
 #include "vpx_ports/x86.h"
+#endif
 #if CONFIG_VP8
 extern void vp8_rtcd();
 #endif
@ -19,7 +20,6 @@ extern void vp8_rtcd();
 extern void vp9_rtcd();
 #endif
 }
-#endif
 #include "third_party/googletest/src/include/gtest/gtest.h"

 static void append_gtest_filter(const char *str) {
@ -47,11 +47,14 @@ int main(int argc, char **argv) {
    append_gtest_filter(":-SSE4_1/*");
 #endif

+#if !CONFIG_SHARED
+  /* Shared library builds don't support whitebox tests that exercise internal symbols. */
 #if CONFIG_VP8
  vp8_rtcd();
 #endif
 #if CONFIG_VP9
  vp9_rtcd();
+#endif
 #endif

  return RUN_ALL_TESTS();
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@ -567,46 +567,28 @@ void vp8_loop_filter_partial_frame
    int mb_cols = post->y_width >> 4;
    int mb_rows = post->y_height >> 4;

-    int linestocopy, i;
+    int linestocopy;

    loop_filter_info_n *lfi_n = &cm->lf_info;
    loop_filter_info lfi;

    int filter_level;
-    int alt_flt_enabled = mbd->segmentation_enabled;
    FRAME_TYPE frame_type = cm->frame_type;

    const MODE_INFO *mode_info_context;

-    int lvl_seg[MAX_MB_SEGMENTS];
+#if 0
+    if(default_filt_lvl == 0) /* no filter applied */
+        return;
+#endif
+
+    /* Initialize the loop filter for this frame. */
+    vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);

    /* number of MB rows to use in partial filtering */
    linestocopy = mb_rows / PARTIAL_FRAME_FRACTION;
    linestocopy = linestocopy ? linestocopy << 4 : 16;     /* 16 lines per MB */

-    /* Note the baseline filter values for each segment */
-    /* See vp8_loop_filter_frame_init. Rather than call that for each change
-     * to default_filt_lvl, copy the relevant calculation here.
-     */
-    if (alt_flt_enabled)
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-        {    /* Abs value */
-            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-            {
-                lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-            }
-            /* Delta Value */
-            else
-            {
-                lvl_seg[i] = default_filt_lvl
-                        + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                lvl_seg[i] = (lvl_seg[i] > 0) ?
-                        ((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0;
-            }
-        }
-    }
-
    /* Set up the buffer pointers; partial image starts at ~middle of frame */
    y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride;
    mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
@ -620,10 +602,12 @@ void vp8_loop_filter_partial_frame
                           mode_info_context->mbmi.mode != SPLITMV &&
                           mode_info_context->mbmi.mb_skip_coeff);

-            if (alt_flt_enabled)
-                filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
-            else
-                filter_level = default_filt_lvl;
+            const int mode_index =
+                lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+            const int seg = mode_info_context->mbmi.segment_id;
+            const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];

            if (filter_level)
            {
--- a/vp8/common/x86/loopfilter_block_sse2.asm
+++ b/vp8/common/x86/loopfilter_block_sse2.asm
@ -150,6 +150,7 @@ sym(vp8_loop_filter_bh_y_sse2):

    push    rbp
    mov     rbp, rsp
+    SAVE_XMM 11
    push    r12
    push    r13
    mov     thresh, arg(4)
@ -258,6 +259,7 @@ LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
 %ifidn __OUTPUT_FORMAT__,x64
    pop    r13
    pop    r12
+    RESTORE_XMM
    pop    rbp
 %endif

--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@ -890,6 +890,7 @@ sym(vp8_intra_pred_y_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
    push        rsi
    push        rdi
    GET_GOT     rbx
@ -957,6 +958,7 @@ vp8_intra_pred_y_tm_%1_loop:
    RESTORE_GOT
    pop         rdi
    pop         rsi
+    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@ -352,6 +352,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
    pop rdi
    pop rsi
    RESTORE_GOT
+    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@ -29,6 +29,13 @@
 #include "error_concealment.h"
 #endif

+/* Allocate a zeroed array of n elements and store it in p. The element size
+ * is taken from *p and the allocation result is handed to CHECK_MEM_ERROR.
+ */
+#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc((n), sizeof(*(p))))
+/* Same as CALLOC_ARRAY, but the storage comes from vpx_memalign() with
+ * alignment algn and is cleared with an explicit memset.
+ * n is expanded twice and p several times, so pass expressions without side
+ * effects.
+ */
+#define CALLOC_ARRAY_ALIGNED(p, n, algn) do {                      \
+  CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n)));  \
+  memset((p), 0, (n) * sizeof(*(p)));                              \
+} while (0)
+
+
 extern void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);

 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
@ -668,11 +675,10 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
        pbi->b_multithreaded_rd = 1;
        pbi->decoding_thread_count = core_count - 1;

-        CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
-        CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
-        CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
-        vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
-        CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));
+        CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
+        CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count);
+        CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
+        CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);

        for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
        {
@ -796,32 +802,32 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
        uv_width = width >>1;

        /* Allocate an int for each mb row. */
-        CHECK_MEM_ERROR(pbi->mt_current_mb_col, vpx_malloc(sizeof(int) * pc->mb_rows));
+        CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);

        /* Allocate memory for above_row buffers. */
-        CHECK_MEM_ERROR(pbi->mt_yabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
            CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1))));

-        CHECK_MEM_ERROR(pbi->mt_uabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
            CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));

-        CHECK_MEM_ERROR(pbi->mt_vabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
            CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));

        /* Allocate memory for left_col buffers. */
-        CHECK_MEM_ERROR(pbi->mt_yleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
            CHECK_MEM_ERROR(pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1));

-        CHECK_MEM_ERROR(pbi->mt_uleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
            CHECK_MEM_ERROR(pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));

-        CHECK_MEM_ERROR(pbi->mt_vleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
-        for (i=0; i< pc->mb_rows; i++)
+        CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
+        for (i = 0; i < pc->mb_rows; i++)
            CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
    }
 }
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@ -118,7 +118,7 @@ static void update_mbintra_mode_probs(VP8_COMP *cpi)

        update_mode(
            w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree,
-            Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
+            Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->mb.ymode_count
        );
    }
    {
@ -127,7 +127,7 @@ static void update_mbintra_mode_probs(VP8_COMP *cpi)

        update_mode(
            w, VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree,
-            Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->uv_mode_count
+            Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->mb.uv_mode_count
        );
    }
 }
@ -493,7 +493,7 @@ static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACRO
 }
 void vp8_convert_rfct_to_prob(VP8_COMP *const cpi)
 {
-    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int *const rfct = cpi->mb.count_mb_ref_frame_usage;
    const int rf_intra = rfct[INTRA_FRAME];
    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];

@ -539,7 +539,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi)
    {
        int total_mbs = pc->mb_rows * pc->mb_cols;

-        prob_skip_false = (total_mbs - cpi->skip_true_count ) * 256 / total_mbs;
+        prob_skip_false = (total_mbs - cpi->mb.skip_true_count ) * 256 / total_mbs;

        if (prob_skip_false <= 1)
            prob_skip_false = 1;
@ -730,7 +730,7 @@ static void write_kfmodes(VP8_COMP *cpi)
    {
        int total_mbs = c->mb_rows * c->mb_cols;

-        prob_skip_false = (total_mbs - cpi->skip_true_count ) * 256 / total_mbs;
+        prob_skip_false = (total_mbs - cpi->mb.skip_true_count ) * 256 / total_mbs;

        if (prob_skip_false <= 1)
            prob_skip_false = 1;
@ -851,6 +851,7 @@ static int prob_update_savings(const unsigned int *ct,

 static int independent_coef_context_savings(VP8_COMP *cpi)
 {
+    MACROBLOCK *const x = & cpi->mb;
    int savings = 0;
    int i = 0;
    do
@ -867,7 +868,7 @@ static int independent_coef_context_savings(VP8_COMP *cpi)
             */

            probs = (const unsigned int (*)[MAX_ENTROPY_TOKENS])
-                                                    cpi->coef_counts[i][j];
+                x->coef_counts[i][j];

            /* Reset to default probabilities at key frames */
            if (cpi->common.frame_type == KEY_FRAME)
@ -926,6 +927,7 @@ static int independent_coef_context_savings(VP8_COMP *cpi)

 static int default_coef_context_savings(VP8_COMP *cpi)
 {
+    MACROBLOCK *const x = & cpi->mb;
    int savings = 0;
    int i = 0;
    do
@ -945,7 +947,7 @@ static int default_coef_context_savings(VP8_COMP *cpi)
                    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
                    cpi->frame_coef_probs [i][j][k],
                    cpi->frame_branch_ct [i][j][k],
-                    cpi->coef_counts [i][j][k],
+                    x->coef_counts [i][j][k],
                    256, 1
                );

@ -994,7 +996,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi)
 {
    int savings = 0;

-    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int *const rfct = cpi->mb.count_mb_ref_frame_usage;
    const int rf_intra = rfct[INTRA_FRAME];
    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
    int new_intra, new_last, new_garf, oldtotal, newtotal;
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@ -18,6 +18,9 @@
 #include "vp8/common/entropy.h"
 #include "vpx_ports/mem.h"

+#define MAX_MODES 20
+#define MAX_ERROR_BINS 1024
+
 /* motion search site */
 typedef struct
 {
@ -127,7 +130,26 @@ typedef struct macroblock
    unsigned char need_to_clamp_best_mvs;
 #endif

+    int skip_true_count;
+    unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+    unsigned int MVcount [2] [MVvals];  /* (row,col) MV cts this frame */
+    int ymode_count [VP8_YMODES];        /* intra MB type cts this frame */
+    int uv_mode_count[VP8_UV_MODES];     /* intra MB type cts this frame */
+    int64_t prediction_error;
+    int64_t intra_error;
+    int count_mb_ref_frame_usage[MAX_REF_FRAMES];

+    int rd_thresh_mult[MAX_MODES];
+    int rd_threshes[MAX_MODES];
+    unsigned int mbs_tested_so_far;
+    unsigned int mode_test_hit_counts[MAX_MODES];
+    int zbin_mode_boost_enabled;
+    int zbin_mode_boost;
+    int last_zbin_mode_boost;
+
+    int last_zbin_over_quant;
+    int zbin_over_quant;
+    int error_bins[MAX_ERROR_BINS];

    void (*short_fdct4x4)(short *input, short *output, int pitch);
    void (*short_fdct8x4)(short *input, short *output, int pitch);
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@ -140,8 +140,7 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
    int i;
    assert(denoiser);

-    /* don't need one for intra start at 1 */
-    for (i = 1; i < MAX_REF_FRAMES; i++)
+    for (i = 0; i < MAX_REF_FRAMES; i++)
    {
        denoiser->yv12_running_avg[i].flags = 0;

@ -175,8 +174,7 @@ void vp8_denoiser_free(VP8_DENOISER *denoiser)
    int i;
    assert(denoiser);

-    /* we don't have one for intra ref frame */
-    for (i = 1; i < MAX_REF_FRAMES ; i++)
+    for (i = 0; i < MAX_REF_FRAMES ; i++)
    {
        vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
    }
@ -291,7 +289,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
    {
        /* Filter. */
        decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
-                                       &denoiser->yv12_running_avg[LAST_FRAME],
+                                       &denoiser->yv12_running_avg[INTRA_FRAME],
                                       x,
                                       motion_magnitude2,
                                       recon_yoffset, recon_uvoffset);
@ -303,7 +301,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
         */
        vp8_copy_mem16x16(
                x->thismb, 16,
-                denoiser->yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset,
-                denoiser->yv12_running_avg[LAST_FRAME].y_stride);
+                denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
+                denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
    }
 }
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@ -33,7 +33,7 @@
 #endif
 #include "encodeframe.h"

-extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
+extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ;
 extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
                                     int prob_intra,
                                     int prob_last,
@ -45,7 +45,6 @@ extern void vp8_auto_select_speed(VP8_COMP *cpi);
 extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
                                      MACROBLOCK *x,
                                      MB_ROW_COMP *mbr_ei,
-                                      int mb_row,
                                      int count);
 static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x );

@ -530,7 +529,8 @@ void encode_mb_row(VP8_COMP *cpi,
             * segmentation map
             */
            if ((cpi->current_layer == 0) &&
-                (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled))
+                (cpi->cyclic_refresh_mode_enabled &&
+                 xd->segmentation_enabled))
            {
                cpi->segmentation_map[map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;

@ -642,10 +642,6 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi)

    xd->left_context = &cm->left_context;

-    vp8_zero(cpi->count_mb_ref_frame_usage)
-    vp8_zero(cpi->ymode_count)
-    vp8_zero(cpi->uv_mode_count)
-
    x->mvc = cm->fc.mvc;

    vpx_memset(cm->above_context, 0,
@ -674,6 +670,43 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi)
    xd->fullpixel_mask = 0xffffffff;
    if(cm->full_pixel)
        xd->fullpixel_mask = 0xfffffff8;
+
+    vp8_zero(x->coef_counts);
+    vp8_zero(x->ymode_count);
+    vp8_zero(x->uv_mode_count)
+    x->prediction_error = 0;
+    x->intra_error = 0;
+    vp8_zero(x->count_mb_ref_frame_usage);
+}
+
+/* Add the coefficient token counts held in x_thread (one encoding thread's
+ * MACROBLOCK) to the counts held in x.
+ */
+static void sum_coef_counts(MACROBLOCK *x, MACROBLOCK *x_thread)
+{
+    int i = 0;
+    do
+    {
+        int j = 0;
+        do
+        {
+            int k = 0;
+            do
+            {
+                /* at every context */
+                int t = 0;      /* token index */
+
+                /* coef_counts is declared with MAX_ENTROPY_TOKENS entries
+                 * per context, so the merge has to run over all of them;
+                 * stopping at ENTROPY_NODES would leave the last token's
+                 * count out of the total.
+                 */
+                do
+                {
+                    x->coef_counts [i][j][k][t] +=
+                        x_thread->coef_counts [i][j][k][t];
+                }
+                while (++t < MAX_ENTROPY_TOKENS);
+            }
+            while (++k < PREV_COEF_CONTEXTS);
+        }
+        while (++j < COEF_BANDS);
+    }
+    while (++i < BLOCK_TYPES);
 }

 void vp8_encode_frame(VP8_COMP *cpi)
@ -717,9 +750,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
        xd->subpixel_predict16x16   = vp8_bilinear_predict16x16;
    }

-    cpi->prediction_error = 0;
-    cpi->intra_error = 0;
-    cpi->skip_true_count = 0;
+    cpi->mb.skip_true_count = 0;
    cpi->tok_count = 0;

 #if 0
@ -730,13 +761,11 @@ void vp8_encode_frame(VP8_COMP *cpi)

    xd->mode_info_context = cm->mi;

-    vp8_zero(cpi->MVcount);
-
-    vp8_zero(cpi->coef_counts);
+    vp8_zero(cpi->mb.MVcount);

    vp8cx_frame_init_quantizer(cpi);

-    vp8_initialize_rd_consts(cpi,
+    vp8_initialize_rd_consts(cpi, x,
                             vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));

    vp8cx_initialize_me_consts(cpi, cm->base_qindex);
@ -775,7 +804,8 @@ void vp8_encode_frame(VP8_COMP *cpi)
        {
            int i;

-            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);
+            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei,
+                                      cpi->encoding_thread_count);

            for (i = 0; i < cm->mb_rows; i++)
                cpi->mt_current_mb_col[i] = -1;
@ -837,13 +867,49 @@ void vp8_encode_frame(VP8_COMP *cpi)

            for (i = 0; i < cpi->encoding_thread_count; i++)
            {
+                int mode_count;
+                int c_idx;
                totalrate += cpi->mb_row_ei[i].totalrate;
+
+                cpi->mb.skip_true_count += cpi->mb_row_ei[i].mb.skip_true_count;
+
+                for(mode_count = 0; mode_count < VP8_YMODES; mode_count++)
+                    cpi->mb.ymode_count[mode_count] +=
+                        cpi->mb_row_ei[i].mb.ymode_count[mode_count];
+
+                for(mode_count = 0; mode_count < VP8_UV_MODES; mode_count++)
+                    cpi->mb.uv_mode_count[mode_count] +=
+                        cpi->mb_row_ei[i].mb.uv_mode_count[mode_count];
+
+                for(c_idx = 0; c_idx < MVvals; c_idx++)
+                {
+                    cpi->mb.MVcount[0][c_idx] +=
+                        cpi->mb_row_ei[i].mb.MVcount[0][c_idx];
+                    cpi->mb.MVcount[1][c_idx] +=
+                        cpi->mb_row_ei[i].mb.MVcount[1][c_idx];
+                }
+
+                cpi->mb.prediction_error +=
+                    cpi->mb_row_ei[i].mb.prediction_error;
+                cpi->mb.intra_error += cpi->mb_row_ei[i].mb.intra_error;
+
+                for(c_idx = 0; c_idx < MAX_REF_FRAMES; c_idx++)
+                    cpi->mb.count_mb_ref_frame_usage[c_idx] +=
+                        cpi->mb_row_ei[i].mb.count_mb_ref_frame_usage[c_idx];
+
+                for(c_idx = 0; c_idx < MAX_ERROR_BINS; c_idx++)
+                    cpi->mb.error_bins[c_idx] +=
+                        cpi->mb_row_ei[i].mb.error_bins[c_idx];
+
+                /* add up counts for each thread */
+                sum_coef_counts(x, &cpi->mb_row_ei[i].mb);
            }

        }
        else
 #endif
        {
+
            /* for each macroblock row in image */
            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
            {
@ -929,13 +995,14 @@ void vp8_encode_frame(VP8_COMP *cpi)
    {
        int tot_modes;

-        tot_modes = cpi->count_mb_ref_frame_usage[INTRA_FRAME]
-                    + cpi->count_mb_ref_frame_usage[LAST_FRAME]
-                    + cpi->count_mb_ref_frame_usage[GOLDEN_FRAME]
-                    + cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+        tot_modes = cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME]
+                    + cpi->mb.count_mb_ref_frame_usage[LAST_FRAME]
+                    + cpi->mb.count_mb_ref_frame_usage[GOLDEN_FRAME]
+                    + cpi->mb.count_mb_ref_frame_usage[ALTREF_FRAME];

        if (tot_modes)
-            cpi->this_frame_percent_intra = cpi->count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes;
+            cpi->this_frame_percent_intra =
+                cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes;

    }

@ -1065,8 +1132,8 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)

 #endif

-    ++cpi->ymode_count[m];
-    ++cpi->uv_mode_count[uvm];
+    ++x->ymode_count[m];
+    ++x->uv_mode_count[uvm];

 }

@ -1093,15 +1160,16 @@ static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
 #endif
 }

-int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+                                  TOKENEXTRA **t)
 {
    MACROBLOCKD *xd = &x->e_mbd;
    int rate;

    if (cpi->sf.RD && cpi->compressor_speed != 2)
-        vp8_rd_pick_intra_mode(cpi, x, &rate);
+        vp8_rd_pick_intra_mode(x, &rate);
    else
-        vp8_pick_intra_mode(cpi, x, &rate);
+        vp8_pick_intra_mode(x, &rate);

    if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
    {
@ -1118,7 +1186,7 @@ int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)

    sum_intra_stats(cpi, x);

-    vp8_tokenize_mb(cpi, &x->e_mbd, t);
+    vp8_tokenize_mb(cpi, x, t);

    if (xd->mode_info_context->mbmi.mode != B_PRED)
        vp8_inverse_transform_mby(xd);
@ -1165,17 +1233,17 @@ int vp8cx_encode_inter_macroblock

    if (cpi->sf.RD)
    {
-        int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
+        int zbin_mode_boost_enabled = x->zbin_mode_boost_enabled;

        /* Are we using the fast quantizer for the mode selection? */
        if(cpi->sf.use_fastquant_for_pick)
        {
-            cpi->mb.quantize_b      = vp8_fast_quantize_b;
-            cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;
+            x->quantize_b      = vp8_fast_quantize_b;
+            x->quantize_b_pair = vp8_fast_quantize_b_pair;

            /* the fast quantizer does not use zbin_extra, so
             * do not recalculate */
-            cpi->zbin_mode_boost_enabled = 0;
+            x->zbin_mode_boost_enabled = 0;
        }
        vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
                               &distortion, &intra_error);
@ -1183,12 +1251,12 @@ int vp8cx_encode_inter_macroblock
        /* switch back to the regular quantizer for the encode */
        if (cpi->sf.improved_quant)
        {
-            cpi->mb.quantize_b      = vp8_regular_quantize_b;
-            cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
+            x->quantize_b      = vp8_regular_quantize_b;
+            x->quantize_b_pair = vp8_regular_quantize_b_pair;
        }

        /* restore cpi->zbin_mode_boost_enabled */
-        cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
+        x->zbin_mode_boost_enabled = zbin_mode_boost_enabled;

    }
    else
@ -1197,8 +1265,8 @@ int vp8cx_encode_inter_macroblock
                            &distortion, &intra_error, mb_row, mb_col);
    }

-    cpi->prediction_error += distortion;
-    cpi->intra_error += intra_error;
+    x->prediction_error += distortion;
+    x->intra_error += intra_error;

    if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
    {
@ -1234,22 +1302,22 @@ int vp8cx_encode_inter_macroblock
        /* Experimental code. Special case for gf and arf zeromv modes.
         * Increase zbin size to supress noise
         */
-        cpi->zbin_mode_boost = 0;
-        if (cpi->zbin_mode_boost_enabled)
+        x->zbin_mode_boost = 0;
+        if (x->zbin_mode_boost_enabled)
        {
            if ( xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME )
            {
                if (xd->mode_info_context->mbmi.mode == ZEROMV)
                {
                    if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
-                        cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+                        x->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
                    else
-                        cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+                        x->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
                }
                else if (xd->mode_info_context->mbmi.mode == SPLITMV)
-                    cpi->zbin_mode_boost = 0;
+                    x->zbin_mode_boost = 0;
                else
-                    cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+                    x->zbin_mode_boost = MV_ZBIN_BOOST;
            }
        }

@ -1259,7 +1327,7 @@ int vp8cx_encode_inter_macroblock
            vp8_update_zbin_extra(cpi, x);
    }

-    cpi->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
+    x->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;

    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
    {
@ -1304,7 +1372,7 @@ int vp8cx_encode_inter_macroblock

    if (!x->skip)
    {
-        vp8_tokenize_mb(cpi, xd, t);
+        vp8_tokenize_mb(cpi, x, t);

        if (xd->mode_info_context->mbmi.mode != B_PRED)
            vp8_inverse_transform_mby(xd);
@ -1321,12 +1389,12 @@ int vp8cx_encode_inter_macroblock

        if (cpi->common.mb_no_coeff_skip)
        {
-            cpi->skip_true_count ++;
+            x->skip_true_count ++;
            vp8_fix_contexts(xd);
        }
        else
        {
-            vp8_stuff_mb(cpi, xd, t);
+            vp8_stuff_mb(cpi, x, t);
        }
    }

--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@ -363,10 +363,12 @@ void vp8_write_mvprobs(VP8_COMP *cpi)
    active_section = 4;
 #endif
    write_component_probs(
-        w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0], cpi->MVcount[0], 0, &flags[0]
+        w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0],
+        cpi->mb.MVcount[0], 0, &flags[0]
    );
    write_component_probs(
-        w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1], cpi->MVcount[1], 1, &flags[1]
+        w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1],
+        cpi->mb.MVcount[1], 1, &flags[1]
    );

    if (flags[0] || flags[1])
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@ -17,12 +17,6 @@

 #if CONFIG_MULTITHREAD

-extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
-                                         TOKENEXTRA **t,
-                                         int recon_yoffset, int recon_uvoffset,
-                                         int mb_row, int mb_col);
-extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
-                                         TOKENEXTRA **t);
 extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip);

 extern void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
@ -220,7 +214,9 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                         * vp8cx_encode_inter_macroblock()) back into the
                         * global segmentation map
                         */
-                        if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
+                        if ((cpi->current_layer == 0) &&
+                            (cpi->cyclic_refresh_mode_enabled &&
+                             xd->segmentation_enabled))
                        {
                            const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
                            cpi->segmentation_map[map_index + mb_col] = mbmi->segment_id;
@ -422,13 +418,23 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
            zd->block[i].dequant = zd->dequant_uv;
        zd->block[24].dequant = zd->dequant_y2;
 #endif
+
+
+        vpx_memcpy(z->rd_threshes, x->rd_threshes, sizeof(x->rd_threshes));
+        vpx_memcpy(z->rd_thresh_mult, x->rd_thresh_mult,
+                   sizeof(x->rd_thresh_mult));
+
+        z->zbin_over_quant = x->zbin_over_quant;
+        z->zbin_mode_boost_enabled = x->zbin_mode_boost_enabled;
+        z->zbin_mode_boost = x->zbin_mode_boost;
+
+        vpx_memset(z->error_bins, 0, sizeof(z->error_bins));
    }
 }

 void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
                               MACROBLOCK *x,
                               MB_ROW_COMP *mbr_ei,
-                               int mb_row,
                               int count
                              )
 {
@ -436,7 +442,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
    VP8_COMMON *const cm = & cpi->common;
    MACROBLOCKD *const xd = & x->e_mbd;
    int i;
-    (void) mb_row;

    for (i = 0; i < count; i++)
    {
@ -477,6 +482,15 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
        mbd->fullpixel_mask = 0xffffffff;
        if(cm->full_pixel)
            mbd->fullpixel_mask = 0xfffffff8;
+
+        /* Reset the statistics owned by this thread's MACROBLOCK (mb, not
+         * the main-thread x). vp8_encode_frame adds each thread's counters
+         * into cpi->mb, so anything left over from an earlier frame would
+         * be counted again.
+         */
+        vp8_zero(mb->coef_counts);
+        vp8_zero(mb->ymode_count);
+        vp8_zero(mb->uv_mode_count);
+        mb->skip_true_count = 0;
+        vp8_zero(mb->MVcount);
+        mb->prediction_error = 0;
+        mb->intra_error = 0;
+        vp8_zero(mb->count_mb_ref_frame_usage);
+        mb->mbs_tested_so_far = 0;
    }
 }

--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@ -570,7 +570,7 @@ void vp8_first_pass(VP8_COMP *cpi)
    /* Initialise the MV cost table to the defaults */
    {
        int flag[2] = {1, 1};
-        vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+        vp8_initialize_rd_consts(cpi, x, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
        vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
        vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
    }
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@ -239,7 +239,7 @@ static void save_layer_context(VP8_COMP *cpi)
    lc->rate_correction_factor           = cpi->rate_correction_factor;
    lc->key_frame_rate_correction_factor = cpi->key_frame_rate_correction_factor;
    lc->gf_rate_correction_factor        = cpi->gf_rate_correction_factor;
-    lc->zbin_over_quant                  = cpi->zbin_over_quant;
+    lc->zbin_over_quant                  = cpi->mb.zbin_over_quant;
    lc->inter_frame_target               = cpi->inter_frame_target;
    lc->total_byte_count                 = cpi->total_byte_count;
    lc->filter_level                     = cpi->common.filter_level;
@ -247,8 +247,8 @@ static void save_layer_context(VP8_COMP *cpi)
    lc->last_frame_percent_intra         = cpi->last_frame_percent_intra;

    memcpy (lc->count_mb_ref_frame_usage,
-            cpi->count_mb_ref_frame_usage,
-            sizeof(cpi->count_mb_ref_frame_usage));
+            cpi->mb.count_mb_ref_frame_usage,
+            sizeof(cpi->mb.count_mb_ref_frame_usage));
 }

 static void restore_layer_context(VP8_COMP *cpi, const int layer)
@ -277,16 +277,16 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer)
    cpi->rate_correction_factor           = lc->rate_correction_factor;
    cpi->key_frame_rate_correction_factor = lc->key_frame_rate_correction_factor;
    cpi->gf_rate_correction_factor        = lc->gf_rate_correction_factor;
-    cpi->zbin_over_quant                  = lc->zbin_over_quant;
+    cpi->mb.zbin_over_quant                  = lc->zbin_over_quant;
    cpi->inter_frame_target               = lc->inter_frame_target;
    cpi->total_byte_count                 = lc->total_byte_count;
    cpi->common.filter_level              = lc->filter_level;

    cpi->last_frame_percent_intra         = lc->last_frame_percent_intra;

-    memcpy (cpi->count_mb_ref_frame_usage,
+    memcpy (cpi->mb.count_mb_ref_frame_usage,
            lc->count_mb_ref_frame_usage,
-            sizeof(cpi->count_mb_ref_frame_usage));
+            sizeof(cpi->mb.count_mb_ref_frame_usage));
 }

 static void setup_features(VP8_COMP *cpi)
@ -356,8 +356,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi)
    /* Activity mask based per mb zbin adjustments */
    vpx_free(cpi->mb_activity_map);
    cpi->mb_activity_map = 0;
-    vpx_free(cpi->mb_norm_activity_map);
-    cpi->mb_norm_activity_map = 0;

    vpx_free(cpi->mb.pip);
    cpi->mb.pip = 0;
@ -643,11 +641,10 @@ void vp8_set_speed_features(VP8_COMP *cpi)
    for (i = 0; i < MAX_MODES; i ++)
    {
        cpi->mode_check_freq[i] = 0;
-        cpi->mode_test_hit_counts[i] = 0;
        cpi->mode_chosen_counts[i] = 0;
    }

-    cpi->mbs_tested_so_far = 0;
+    cpi->mb.mbs_tested_so_far = 0;

    /* best quality defaults */
    sf->RD = 1;
@ -841,7 +838,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)

            for (i = 0; i < min; i++)
            {
-                sum += cpi->error_bins[i];
+                sum += cpi->mb.error_bins[i];
            }

            total_skip = sum;
@ -850,7 +847,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
            /* i starts from 2 to make sure thresh started from 2048 */
            for (; i < 1024; i++)
            {
-                sum += cpi->error_bins[i];
+                sum += cpi->mb.error_bins[i];

                if (10 * sum >= (unsigned int)(cpi->Speed - 6)*(total_mbs - total_skip))
                    break;
@ -905,7 +902,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
        if (Speed >= 15)
            sf->half_pixel_search = 0;

-        vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins));
+        vpx_memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));

    }; /* switch */

@ -1080,10 +1077,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
    }

    /* Data used for real time vc mode to see if gf needs refreshing */
-    cpi->inter_zz_count = 0;
    cpi->zeromv_count = 0;
-    cpi->gf_bad_count = 0;
-    cpi->gf_update_recommended = 0;


    /* Structures used to monitor GF usage */
@ -1098,11 +1092,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
                    vpx_calloc(sizeof(*cpi->mb_activity_map),
                    cm->mb_rows * cm->mb_cols));

-    vpx_free(cpi->mb_norm_activity_map);
-    CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
-                    vpx_calloc(sizeof(*cpi->mb_norm_activity_map),
-                    cm->mb_rows * cm->mb_cols));
-
    /* allocate memory for storing last frame's MVs for MV prediction. */
    vpx_free(cpi->lfmv);
    CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
@ -1932,7 +1921,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
    /* Set starting values of RD threshold multipliers (128 = *1) */
    for (i = 0; i < MAX_MODES; i++)
    {
-        cpi->rd_thresh_mult[i] = 128;
+        cpi->mb.rd_thresh_mult[i] = 128;
    }

 #ifdef ENTROPY_STATS
@ -2010,7 +1999,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->refining_search_sad = vp8_refining_search_sad;

    /* make sure frame 1 is okay */
-    cpi->error_bins[0] = cpi->common.MBs;
+    cpi->mb.error_bins[0] = cpi->common.MBs;

    /* vp8cx_init_quantizer() is first called here. Add check in
     * vp8cx_frame_init_quantizer() so that vp8cx_init_quantizer is only
@ -2783,10 +2772,14 @@ static void update_golden_frame_stats(VP8_COMP *cpi)

        if (cpi->common.frames_since_golden > 1)
        {
-            cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
-            cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
-            cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
-            cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+            cpi->recent_ref_frame_usage[INTRA_FRAME] +=
+                cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME];
+            cpi->recent_ref_frame_usage[LAST_FRAME] +=
+                cpi->mb.count_mb_ref_frame_usage[LAST_FRAME];
+            cpi->recent_ref_frame_usage[GOLDEN_FRAME] +=
+                cpi->mb.count_mb_ref_frame_usage[GOLDEN_FRAME];
+            cpi->recent_ref_frame_usage[ALTREF_FRAME] +=
+                cpi->mb.count_mb_ref_frame_usage[ALTREF_FRAME];
        }
    }
 }
@ -2798,7 +2791,7 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
 {
    VP8_COMMON *cm = &cpi->common;

-    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int *const rfct = cpi->mb.count_mb_ref_frame_usage;
    const int rf_intra = rfct[INTRA_FRAME];
    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];

@ -2865,38 +2858,17 @@ static int decide_key_frame(VP8_COMP *cpi)

    if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0))
    {
-        double change = 1.0 * abs((int)(cpi->intra_error - cpi->last_intra_error)) / (1 + cpi->last_intra_error);
-        double change2 = 1.0 * abs((int)(cpi->prediction_error - cpi->last_prediction_error)) / (1 + cpi->last_prediction_error);
+        double change = 1.0 * abs((int)(cpi->mb.intra_error -
+            cpi->last_intra_error)) / (1 + cpi->last_intra_error);
+        double change2 = 1.0 * abs((int)(cpi->mb.prediction_error -
+            cpi->last_prediction_error)) / (1 + cpi->last_prediction_error);
        double minerror = cm->MBs * 256;

-#if 0
+        cpi->last_intra_error = cpi->mb.intra_error;
+        cpi->last_prediction_error = cpi->mb.prediction_error;

-        if (10 * cpi->intra_error / (1 + cpi->prediction_error) < 15
-            && cpi->prediction_error > minerror
-            && (change > .25 || change2 > .25))
-        {
-            FILE *f = fopen("intra_inter.stt", "a");
-
-            if (cpi->prediction_error <= 0)
-                cpi->prediction_error = 1;
-
-            fprintf(f, "%d %d %d %d %14.4f\n",
-                    cm->current_video_frame,
-                    (int) cpi->prediction_error,
-                    (int) cpi->intra_error,
-                    (int)((10 * cpi->intra_error) / cpi->prediction_error),
-                    change);
-
-            fclose(f);
-        }
-
-#endif
-
-        cpi->last_intra_error = cpi->intra_error;
-        cpi->last_prediction_error = cpi->prediction_error;
-
-        if (10 * cpi->intra_error / (1 + cpi->prediction_error) < 15
-            && cpi->prediction_error > minerror
+        if (10 * cpi->mb.intra_error / (1 + cpi->mb.prediction_error) < 15
+            && cpi->mb.prediction_error > minerror
            && (change > .25 || change2 > .25))
        {
            /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/
@ -3160,6 +3132,57 @@ static void update_reference_frames(VP8_COMP *cpi)
        cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame;
 #endif
    }
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity)
+    {
+        /* Bring the denoiser's per-reference running averages up to date.
+         * Ideally we would not keep multiple copies, since we know in advance
+         * which buffer to start from; for now the buffers are simply copied.
+         */
+        if (cm->frame_type == KEY_FRAME)
+        {
+            int i;
+            vp8_yv12_copy_frame(
+                    cpi->Source,
+                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+            vp8_yv12_extend_frame_borders(
+                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+            for (i = 2; i < MAX_REF_FRAMES - 1; i++)
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[LAST_FRAME],
+                        &cpi->denoiser.yv12_running_avg[i]);
+        }
+        else /* Non-key frame: propagate the INTRA_FRAME-indexed average */
+        {
+            vp8_yv12_extend_frame_borders(
+                    &cpi->denoiser.yv12_running_avg[INTRA_FRAME]);
+
+            if (cm->refresh_alt_ref_frame || cm->copy_buffer_to_arf)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME],
+                        &cpi->denoiser.yv12_running_avg[ALTREF_FRAME]);
+            }
+            if (cm->refresh_golden_frame || cm->copy_buffer_to_gf)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME],
+                        &cpi->denoiser.yv12_running_avg[GOLDEN_FRAME]);
+            }
+            if(cm->refresh_last_frame)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME],
+                        &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+            }
+        }
+
+    }
+#endif
+
 }

 void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
@ -3203,51 +3226,6 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
    }

    vp8_yv12_extend_frame_borders(cm->frame_to_show);
-#if CONFIG_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity)
-    {
-
-
-        /* we shouldn't have to keep multiple copies as we know in advance which
-         * buffer we should start - for now to get something up and running
-         * I've chosen to copy the buffers
-         */
-        if (cm->frame_type == KEY_FRAME)
-        {
-            int i;
-            vp8_yv12_copy_frame(
-                    cpi->Source,
-                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            vp8_yv12_extend_frame_borders(
-                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            for (i = 2; i < MAX_REF_FRAMES - 1; i++)
-                vp8_yv12_copy_frame(
-                        cpi->Source,
-                        &cpi->denoiser.yv12_running_avg[i]);
-        }
-        else /* For non key frames */
-        {
-            vp8_yv12_extend_frame_borders(
-                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            if (cm->refresh_alt_ref_frame || cm->copy_buffer_to_arf)
-            {
-                vp8_yv12_copy_frame(
-                        &cpi->denoiser.yv12_running_avg[LAST_FRAME],
-                        &cpi->denoiser.yv12_running_avg[ALTREF_FRAME]);
-            }
-            if (cm->refresh_golden_frame || cm->copy_buffer_to_gf)
-            {
-                vp8_yv12_copy_frame(
-                        &cpi->denoiser.yv12_running_avg[LAST_FRAME],
-                        &cpi->denoiser.yv12_running_avg[GOLDEN_FRAME]);
-            }
-        }
-
-    }
-#endif

 }

@ -3331,19 +3309,19 @@ static void encode_frame_to_data_rate
    cm->copy_buffer_to_arf = 0;

    /* Clear zbin over-quant value and mode boost values. */
-    cpi->zbin_over_quant = 0;
-    cpi->zbin_mode_boost = 0;
+    cpi->mb.zbin_over_quant = 0;
+    cpi->mb.zbin_mode_boost = 0;

    /* Enable or disable mode based tweaking of the zbin
     * For 2 Pass Only used where GF/ARF prediction quality
     * is above a threshold
     */
-    cpi->zbin_mode_boost_enabled = 1;
+    cpi->mb.zbin_mode_boost_enabled = 1;
    if (cpi->pass == 2)
    {
        if ( cpi->gfu_boost <= 400 )
        {
-            cpi->zbin_mode_boost_enabled = 0;
+            cpi->mb.zbin_mode_boost_enabled = 0;
        }
    }

@ -3410,7 +3388,7 @@ static void encode_frame_to_data_rate
        /* Reset the RD threshold multipliers to default of * 1 (128) */
        for (i = 0; i < MAX_MODES; i++)
        {
-            cpi->rd_thresh_mult[i] = 128;
+            cpi->mb.rd_thresh_mult[i] = 128;
        }
    }

@ -4099,8 +4077,9 @@ static void encode_frame_to_data_rate
                q_low = (Q < q_high) ? (Q + 1) : q_high;

                /* If we are using over quant do the same for zbin_oq_low */
-                if (cpi->zbin_over_quant > 0)
-                    zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+                if (cpi->mb.zbin_over_quant > 0)
+                    zbin_oq_low = (cpi->mb.zbin_over_quant < zbin_oq_high) ?
+                        (cpi->mb.zbin_over_quant + 1) : zbin_oq_high;

                if (undershoot_seen)
                {
@ -4116,11 +4095,13 @@ static void encode_frame_to_data_rate
                     * is max)
                     */
                    if (Q < MAXQ)
-                        cpi->zbin_over_quant = 0;
+                        cpi->mb.zbin_over_quant = 0;
                    else
                    {
-                        zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
-                        cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+                        zbin_oq_low = (cpi->mb.zbin_over_quant < zbin_oq_high) ?
+                            (cpi->mb.zbin_over_quant + 1) : zbin_oq_high;
+                        cpi->mb.zbin_over_quant =
+                            (zbin_oq_high + zbin_oq_low) / 2;
                    }
                }
                else
@ -4133,7 +4114,9 @@ static void encode_frame_to_data_rate

                    Q = vp8_regulate_q(cpi, cpi->this_frame_target);

-                    while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10))
+                    while (((Q < q_low) ||
+                        (cpi->mb.zbin_over_quant < zbin_oq_low)) &&
+                        (Retries < 10))
                    {
                        vp8_update_rate_correction_factors(cpi, 0);
                        Q = vp8_regulate_q(cpi, cpi->this_frame_target);
@ -4146,12 +4129,13 @@ static void encode_frame_to_data_rate
            /* Frame is too small */
            else
            {
-                if (cpi->zbin_over_quant == 0)
+                if (cpi->mb.zbin_over_quant == 0)
                    /* Lower q_high if not using over quant */
                    q_high = (Q > q_low) ? (Q - 1) : q_low;
                else
                    /* else lower zbin_oq_high */
-                    zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
+                    zbin_oq_high = (cpi->mb.zbin_over_quant > zbin_oq_low) ?
+                        (cpi->mb.zbin_over_quant - 1) : zbin_oq_low;

                if (overshoot_seen)
                {
@ -4167,9 +4151,10 @@ static void encode_frame_to_data_rate
                     * is max)
                     */
                    if (Q < MAXQ)
-                        cpi->zbin_over_quant = 0;
+                        cpi->mb.zbin_over_quant = 0;
                    else
-                        cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+                        cpi->mb.zbin_over_quant =
+                            (zbin_oq_high + zbin_oq_low) / 2;
                }
                else
                {
@ -4192,7 +4177,9 @@ static void encode_frame_to_data_rate
                        q_low = Q;
                    }

-                    while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10))
+                    while (((Q > q_high) ||
+                        (cpi->mb.zbin_over_quant > zbin_oq_high)) &&
+                        (Retries < 10))
                    {
                        vp8_update_rate_correction_factors(cpi, 0);
                        Q = vp8_regulate_q(cpi, cpi->this_frame_target);
@ -4210,7 +4197,9 @@ static void encode_frame_to_data_rate
                Q = q_low;

-            /* Clamp cpi->zbin_over_quant */
+            /* Clamp cpi->mb.zbin_over_quant */
-            cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? zbin_oq_high : cpi->zbin_over_quant;
+            cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low) ?
+                zbin_oq_low : (cpi->mb.zbin_over_quant > zbin_oq_high) ?
+                    zbin_oq_high : cpi->mb.zbin_over_quant;

            Loop = Q != last_q;
        }
@ -4292,7 +4281,6 @@ static void encode_frame_to_data_rate
        /* Point to beginning of MODE_INFO arrays. */
        MODE_INFO *tmp = cm->mi;

-        cpi->inter_zz_count = 0;
        cpi->zeromv_count = 0;

        if(cm->frame_type != KEY_FRAME)
@ -4301,8 +4289,6 @@ static void encode_frame_to_data_rate
            {
                for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
                {
-                    if(tmp->mbmi.mode == ZEROMV && tmp->mbmi.ref_frame == LAST_FRAME)
-                        cpi->inter_zz_count++;
                    if(tmp->mbmi.mode == ZEROMV)
                        cpi->zeromv_count++;
                    tmp++;
@ -4732,67 +4718,6 @@ static void encode_frame_to_data_rate


 }
-
-
-static void check_gf_quality(VP8_COMP *cpi)
-{
-    VP8_COMMON *cm = &cpi->common;
-    int gf_active_pct = (100 * cpi->gf_active_count) / (cm->mb_rows * cm->mb_cols);
-    int gf_ref_usage_pct = (cpi->count_mb_ref_frame_usage[GOLDEN_FRAME] * 100) / (cm->mb_rows * cm->mb_cols);
-    int last_ref_zz_useage = (cpi->inter_zz_count * 100) / (cm->mb_rows * cm->mb_cols);
-
-    /* Gf refresh is not currently being signalled */
-    if (cpi->gf_update_recommended == 0)
-    {
-        if (cpi->common.frames_since_golden > 7)
-        {
-            /* Low use of gf */
-            if ((gf_active_pct < 10) || ((gf_active_pct + gf_ref_usage_pct) < 15))
-            {
-                /* ...but last frame zero zero usage is reasonbable so a
-                 * new gf might be appropriate
-                 */
-                if (last_ref_zz_useage >= 25)
-                {
-                    cpi->gf_bad_count ++;
-
-                    /* Check that the condition is stable */
-                    if (cpi->gf_bad_count >= 8)
-                    {
-                        cpi->gf_update_recommended = 1;
-                        cpi->gf_bad_count = 0;
-                    }
-                }
-                else
-                    /* Restart count as the background is not stable enough */
-                    cpi->gf_bad_count = 0;
-            }
-            else
-                /* Gf useage has picked up so reset count */
-                cpi->gf_bad_count = 0;
-        }
-    }
-    /* If the signal is set but has not been read should we cancel it. */
-    else if (last_ref_zz_useage < 15)
-    {
-        cpi->gf_update_recommended = 0;
-        cpi->gf_bad_count = 0;
-    }
-
-#if 0
-    {
-        FILE *f = fopen("gfneeded.stt", "a");
-        fprintf(f, "%10d %10d %10d %10d %10ld \n",
-                cm->current_video_frame,
-                cpi->common.frames_since_golden,
-                gf_active_pct, gf_ref_usage_pct,
-                cpi->gf_update_recommended);
-        fclose(f);
-    }
-
-#endif
-}
-
 #if !(CONFIG_REALTIME_ONLY)
 static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned char * dest_end, unsigned int *frame_flags)
 {
@ -5096,8 +5021,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l

    if (cpi->compressor_speed == 2)
    {
-        if (cpi->oxcf.number_of_layers == 1)
-            check_gf_quality(cpi);
        vpx_usec_timer_start(&tsctimer);
        vpx_usec_timer_start(&ticktimer);
    }
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@ -43,7 +43,7 @@
 #define AF_THRESH   25
 #define AF_THRESH2  100
 #define ARF_DECAY_THRESH 12
-#define MAX_MODES 20
+

 #define MIN_THRESHMULT  32
 #define MAX_THRESHMULT  512
@ -349,13 +349,9 @@ typedef struct VP8_COMP
    int ambient_err;

    unsigned int mode_check_freq[MAX_MODES];
-    unsigned int mode_test_hit_counts[MAX_MODES];
    unsigned int mode_chosen_counts[MAX_MODES];
-    unsigned int mbs_tested_so_far;

-    int rd_thresh_mult[MAX_MODES];
    int rd_baseline_thresh[MAX_MODES];
-    int rd_threshes[MAX_MODES];

    int RDMULT;
    int RDDIV ;
@ -363,9 +359,7 @@ typedef struct VP8_COMP
    CODING_CONTEXT coding_context;

    /* Rate targetting variables */
-    int64_t prediction_error;
    int64_t last_prediction_error;
-    int64_t intra_error;
    int64_t last_intra_error;

    int this_frame_target;
@ -418,12 +412,6 @@ typedef struct VP8_COMP
    int ni_frames;
    int avg_frame_qindex;

-    int zbin_over_quant;
-    int zbin_mode_boost;
-    int zbin_mode_boost_enabled;
-    int last_zbin_over_quant;
-    int last_zbin_mode_boost;
-
    int64_t total_byte_count;

    int buffered_mode;
@ -452,13 +440,6 @@ typedef struct VP8_COMP
    int drop_frames_allowed; /* Are we permitted to drop frames? */
    int drop_frame;          /* Drop this frame? */

-    int ymode_count [VP8_YMODES];        /* intra MB type cts this frame */
-    int uv_mode_count[VP8_UV_MODES];     /* intra MB type cts this frame */
-
-    unsigned int MVcount [2] [MVvals];  /* (row,col) MV cts this frame */
-
-    unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-
    vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
    char update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

@ -486,7 +467,6 @@ typedef struct VP8_COMP
    int Speed;
    int compressor_speed;

-    int interquantizer;
    int auto_gold;
    int auto_adjust_gold_quantizer;
    int auto_worst_q;
@ -502,25 +482,16 @@ typedef struct VP8_COMP
    int last_skip_probs_q[3];
    int recent_ref_frame_usage[MAX_REF_FRAMES];

-    int count_mb_ref_frame_usage[MAX_REF_FRAMES];
    int this_frame_percent_intra;
    int last_frame_percent_intra;

    int ref_frame_flags;

    SPEED_FEATURES sf;
-    int error_bins[1024];

-    /* Data used for real time conferencing mode to help determine if it
-     * would be good to update the gf
-     */
-    int inter_zz_count;
    /* Count ZEROMV on all reference frames. */
    int zeromv_count;
    int lf_zeromv_pct;
-    int gf_bad_count;
-    int gf_update_recommended;
-    int skip_true_count;

    unsigned char *segmentation_map;
    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
@ -659,7 +630,6 @@ typedef struct VP8_COMP
    /* Per MB activity measurement */
    unsigned int activity_avg;
    unsigned int * mb_activity_map;
-    int * mb_norm_activity_map;

    /* Record of which MBs still refer to last golden frame either
     * directly or through 0,0
@ -723,13 +693,10 @@ typedef struct VP8_COMP
    } rd_costs;
 } VP8_COMP;

-void control_data_rate(VP8_COMP *cpi);
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
+                        unsigned char *dest_end, unsigned long *size);

-void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char *dest_end, unsigned long *size);
-
-int rd_cost_intra_mb(MACROBLOCKD *x);
-
-void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **);
+void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **);

 void vp8_set_speed_features(VP8_COMP *cpi);

--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@ -389,15 +389,16 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb)

 }

-static void update_mvcount(VP8_COMP *cpi, MACROBLOCKD *xd, int_mv *best_ref_mv)
+static void update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
 {
+    MACROBLOCKD *xd = &x->e_mbd;
    /* Split MV modes currently not supported when RD is nopt enabled,
     * therefore, only need to modify MVcount in NEWMV mode. */
    if (xd->mode_info_context->mbmi.mode == NEWMV)
    {
-        cpi->MVcount[0][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.row -
+        x->MVcount[0][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.row -
                                      best_ref_mv->as_mv.row) >> 1)]++;
-        cpi->MVcount[1][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.col -
+        x->MVcount[1][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.col -
                                      best_ref_mv->as_mv.col) >> 1)]++;
    }
 }
@ -679,7 +680,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
    get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);

    /* Count of the number of MBs tested so far this frame */
-    cpi->mbs_tested_so_far++;
+    x->mbs_tested_so_far++;

    *returnintra = INT_MAX;
    x->skip = 0;
@ -700,7 +701,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
        int this_rd = INT_MAX;
        int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];

-        if (best_rd <= cpi->rd_threshes[mode_index])
+        if (best_rd <= x->rd_threshes[mode_index])
            continue;

        if (this_ref_frame < 0)
@ -745,22 +746,22 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
        /* Check to see if the testing frequency for this mode is at its max
         * If so then prevent it from being tested and increase the threshold
         * for its testing */
-        if (cpi->mode_test_hit_counts[mode_index] &&
+        if (x->mode_test_hit_counts[mode_index] &&
                                         (cpi->mode_check_freq[mode_index] > 1))
        {
-            if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] *
-                                         cpi->mode_test_hit_counts[mode_index]))
+            if (x->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] *
+                                         x->mode_test_hit_counts[mode_index]))
            {
                /* Increase the threshold for coding this mode to make it less
                 * likely to be chosen */
-                cpi->rd_thresh_mult[mode_index] += 4;
+                x->rd_thresh_mult[mode_index] += 4;

-                if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-                    cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+                if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                    x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;

-                cpi->rd_threshes[mode_index] =
+                x->rd_threshes[mode_index] =
                                 (cpi->rd_baseline_thresh[mode_index] >> 7) *
-                                 cpi->rd_thresh_mult[mode_index];
+                                 x->rd_thresh_mult[mode_index];
                continue;
            }
        }
@ -768,7 +769,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
        /* We have now reached the point where we are going to test the current
         * mode so increment the counter for the number of times it has been
         * tested */
-        cpi->mode_test_hit_counts[mode_index] ++;
+        x->mode_test_hit_counts[mode_index] ++;

        rate2 = 0;
        distortion2 = 0;
@ -1108,12 +1109,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
            /* Testing this mode gave rise to an improvement in best error
             * score. Lower threshold a bit for next time
             */
-            cpi->rd_thresh_mult[mode_index] =
-                     (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-                     cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-            cpi->rd_threshes[mode_index] =
+            x->rd_thresh_mult[mode_index] =
+                     (x->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+                     x->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+            x->rd_threshes[mode_index] =
                                   (cpi->rd_baseline_thresh[mode_index] >> 7) *
-                                   cpi->rd_thresh_mult[mode_index];
+                                   x->rd_thresh_mult[mode_index];
        }

        /* If the mode did not help improve the best error case then raise the
@ -1121,14 +1122,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         */
        else
        {
-            cpi->rd_thresh_mult[mode_index] += 4;
+            x->rd_thresh_mult[mode_index] += 4;

-            if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-                cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+            if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;

-            cpi->rd_threshes[mode_index] =
+            x->rd_threshes[mode_index] =
                         (cpi->rd_baseline_thresh[mode_index] >> 7) *
-                         cpi->rd_thresh_mult[mode_index];
+                         x->rd_thresh_mult[mode_index];
        }

        if (x->skip)
@ -1138,16 +1139,16 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
    /* Reduce the activation RD thresholds for the best choice mode */
    if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
    {
-        int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3);
+        int best_adjustment = (x->rd_thresh_mult[best_mode_index] >> 3);

-        cpi->rd_thresh_mult[best_mode_index] =
-                        (cpi->rd_thresh_mult[best_mode_index]
+        x->rd_thresh_mult[best_mode_index] =
+                        (x->rd_thresh_mult[best_mode_index]
                        >= (MIN_THRESHMULT + best_adjustment)) ?
-                        cpi->rd_thresh_mult[best_mode_index] - best_adjustment :
+                        x->rd_thresh_mult[best_mode_index] - best_adjustment :
                        MIN_THRESHMULT;
-        cpi->rd_threshes[best_mode_index] =
+        x->rd_threshes[best_mode_index] =
                        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
-                        cpi->rd_thresh_mult[best_mode_index];
+                        x->rd_thresh_mult[best_mode_index];
    }


@ -1159,7 +1160,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
            this_rdbin = 1023;
        }

-        cpi->error_bins[this_rdbin] ++;
+        x->error_bins[this_rdbin] ++;
    }

 #if CONFIG_TEMPORAL_DENOISING
@ -1240,11 +1241,11 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
      != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
        best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;

-    update_mvcount(cpi, &x->e_mbd, &best_ref_mv);
+    update_mvcount(cpi, x, &best_ref_mv);
 }


-void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
+void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
 {
    int error4x4, error16x16 = INT_MAX;
    int rate, best_rate = 0, distortion, best_sse;
--- a/vp8/encoder/pickinter.h
+++ b/vp8/encoder/pickinter.h
@ -18,7 +18,7 @@ extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                                int recon_uvoffset, int *returnrate,
                                int *returndistortion, int *returnintra,
                                int mb_row, int mb_col);
-extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
+extern void vp8_pick_intra_mode(MACROBLOCK *x, int *rate);

 extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
                                      const vp8_variance_fn_ptr_t *vfp,
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@ -587,20 +587,20 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)

 #define ZBIN_EXTRA_Y \
    (( cpi->common.Y1dequant[QIndex][1] *  \
-    ( cpi->zbin_over_quant +  \
-      cpi->zbin_mode_boost +  \
+    ( x->zbin_over_quant +  \
+      x->zbin_mode_boost +  \
      x->act_zbin_adj ) ) >> 7)

 #define ZBIN_EXTRA_UV \
    (( cpi->common.UVdequant[QIndex][1] *  \
-    ( cpi->zbin_over_quant +  \
-      cpi->zbin_mode_boost +  \
+    ( x->zbin_over_quant +  \
+      x->zbin_mode_boost +  \
      x->act_zbin_adj ) ) >> 7)

 #define ZBIN_EXTRA_Y2 \
    (( cpi->common.Y2dequant[QIndex][1] *  \
-    ( (cpi->zbin_over_quant / 2) +  \
-       cpi->zbin_mode_boost +  \
+    ( (x->zbin_over_quant / 2) +  \
+       x->zbin_mode_boost +  \
       x->act_zbin_adj ) ) >> 7)

 void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
@ -702,15 +702,15 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
        /* save this macroblock QIndex for vp8_update_zbin_extra() */
        x->q_index = QIndex;

-        cpi->last_zbin_over_quant = cpi->zbin_over_quant;
-        cpi->last_zbin_mode_boost = cpi->zbin_mode_boost;
+        x->last_zbin_over_quant = x->zbin_over_quant;
+        x->last_zbin_mode_boost = x->zbin_mode_boost;
        x->last_act_zbin_adj = x->act_zbin_adj;



    }
-    else if(cpi->last_zbin_over_quant != cpi->zbin_over_quant
-            || cpi->last_zbin_mode_boost != cpi->zbin_mode_boost
+    else if(x->last_zbin_over_quant != x->zbin_over_quant
+            || x->last_zbin_mode_boost != x->zbin_mode_boost
            || x->last_act_zbin_adj != x->act_zbin_adj)
    {
        /* Y */
@ -729,8 +729,8 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
        zbin_extra = ZBIN_EXTRA_Y2;
        x->block[24].zbin_extra = (short)zbin_extra;

-        cpi->last_zbin_over_quant = cpi->zbin_over_quant;
-        cpi->last_zbin_mode_boost = cpi->zbin_mode_boost;
+        x->last_zbin_over_quant = x->zbin_over_quant;
+        x->last_zbin_mode_boost = x->zbin_mode_boost;
        x->last_act_zbin_adj = x->act_zbin_adj;
    }
 }
@ -764,7 +764,7 @@ void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x)
 void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
 {
    /* Clear Zbin mode boost for default case */
-    cpi->zbin_mode_boost = 0;
+    cpi->mb.zbin_mode_boost = 0;

    /* MB level quantizer setup */
    vp8cx_mb_init_quantizer(cpi, &cpi->mb, 0);
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@ -242,8 +242,8 @@ void vp8_save_coding_context(VP8_COMP *cpi)
    vp8_copy(cc->ymode_prob,   cpi->common.fc.ymode_prob);
    vp8_copy(cc->uv_mode_prob,  cpi->common.fc.uv_mode_prob);

-    vp8_copy(cc->ymode_count, cpi->ymode_count);
-    vp8_copy(cc->uv_mode_count, cpi->uv_mode_count);
+    vp8_copy(cc->ymode_count, cpi->mb.ymode_count);
+    vp8_copy(cc->uv_mode_count, cpi->mb.uv_mode_count);


    /* Stats */
@ -280,8 +280,8 @@ void vp8_restore_coding_context(VP8_COMP *cpi)
    vp8_copy(cpi->common.fc.ymode_prob,   cc->ymode_prob);
    vp8_copy(cpi->common.fc.uv_mode_prob,  cc->uv_mode_prob);

-    vp8_copy(cpi->ymode_count, cc->ymode_count);
-    vp8_copy(cpi->uv_mode_count, cc->uv_mode_count);
+    vp8_copy(cpi->mb.ymode_count, cc->ymode_count);
+    vp8_copy(cpi->mb.uv_mode_count, cc->uv_mode_count);

    /* Stats */
 #ifdef MODE_STATS
@ -1109,7 +1109,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
    }
    else
    {
-        if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+        if (cpi->oxcf.number_of_layers == 1 &&
+           (cpi->common.refresh_alt_ref_frame ||
+            cpi->common.refresh_golden_frame))
            rate_correction_factor = cpi->gf_rate_correction_factor;
        else
            rate_correction_factor = cpi->rate_correction_factor;
@ -1122,9 +1124,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
    projected_size_based_on_q = (int)(((.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS));

    /* Make some allowance for cpi->zbin_over_quant */
-    if (cpi->zbin_over_quant > 0)
+    if (cpi->mb.zbin_over_quant > 0)
    {
-        int Z = cpi->zbin_over_quant;
+        int Z = cpi->mb.zbin_over_quant;
        double Factor = 0.99;
        double factor_adjustment = 0.01 / 256.0;

@ -1186,7 +1188,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
        cpi->key_frame_rate_correction_factor = rate_correction_factor;
    else
    {
-        if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+        if (cpi->oxcf.number_of_layers == 1 &&
+           (cpi->common.refresh_alt_ref_frame ||
+            cpi->common.refresh_golden_frame))
            cpi->gf_rate_correction_factor = rate_correction_factor;
        else
            cpi->rate_correction_factor = rate_correction_factor;
@ -1199,7 +1203,7 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
    int Q = cpi->active_worst_quality;

    /* Reset Zbin OQ value */
-    cpi->zbin_over_quant = 0;
+    cpi->mb.zbin_over_quant = 0;

    if (cpi->oxcf.fixed_q >= 0)
    {
@ -1209,11 +1213,13 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
        {
            Q = cpi->oxcf.key_q;
        }
-        else if (cpi->common.refresh_alt_ref_frame)
+        else if (cpi->oxcf.number_of_layers == 1 &&
+            cpi->common.refresh_alt_ref_frame)
        {
            Q = cpi->oxcf.alt_q;
        }
-        else if (cpi->common.refresh_golden_frame)
+        else if (cpi->oxcf.number_of_layers == 1  &&
+            cpi->common.refresh_golden_frame)
        {
            Q = cpi->oxcf.gold_q;
        }
@ -1232,7 +1238,9 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
            correction_factor = cpi->key_frame_rate_correction_factor;
        else
        {
-            if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+            if (cpi->oxcf.number_of_layers == 1 &&
+               (cpi->common.refresh_alt_ref_frame ||
+                cpi->common.refresh_golden_frame))
                correction_factor = cpi->gf_rate_correction_factor;
            else
                correction_factor = cpi->rate_correction_factor;
@ -1281,7 +1289,10 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)

            if (cpi->common.frame_type == KEY_FRAME)
                zbin_oqmax = 0;
-            else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
+            else if (cpi->oxcf.number_of_layers == 1 &&
+                (cpi->common.refresh_alt_ref_frame ||
+                (cpi->common.refresh_golden_frame &&
+                 !cpi->source_alt_ref_active)))
                zbin_oqmax = 16;
            else
                zbin_oqmax = ZBIN_OQ_MAX;
@ -1307,12 +1318,12 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
             * normal maximum by expanding the zero bin and hence
             * decreasing the number of low magnitude non zero coefficients.
             */
-            while (cpi->zbin_over_quant < zbin_oqmax)
+            while (cpi->mb.zbin_over_quant < zbin_oqmax)
            {
-                cpi->zbin_over_quant ++;
+                cpi->mb.zbin_over_quant ++;

-                if (cpi->zbin_over_quant > zbin_oqmax)
-                    cpi->zbin_over_quant = zbin_oqmax;
+                if (cpi->mb.zbin_over_quant > zbin_oqmax)
+                    cpi->mb.zbin_over_quant = zbin_oqmax;

                /* Adjust bits_per_mb_at_this_q estimate */
                bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@ -223,7 +223,7 @@ void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex)
    cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];
 }

-void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
+void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue)
 {
    int q;
    int i;
@ -238,15 +238,15 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
    cpi->RDMULT = (int)(rdconst * (capped_q * capped_q));

    /* Extend rate multiplier along side quantizer zbin increases */
-    if (cpi->zbin_over_quant  > 0)
+    if (cpi->mb.zbin_over_quant  > 0)
    {
        double oq_factor;
        double modq;

        /* Experimental code using the same basic equation as used for Q above
-         * The units of cpi->zbin_over_quant are 1/128 of Q bin size
+         * The units of cpi->mb.zbin_over_quant are 1/128 of Q bin size
         */
-        oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
+        oq_factor = 1.0 + ((double)0.0015625 * cpi->mb.zbin_over_quant);
        modq = (int)((double)capped_q * oq_factor);
        cpi->RDMULT = (int)(rdconst * (modq * modq));
    }
@ -265,6 +265,11 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)

    vp8_set_speed_features(cpi);

+    for (i = 0; i < MAX_MODES; i++)
+    {
+        x->mode_test_hit_counts[i] = 0;
+    }
+
    q = (int)pow(Qvalue, 1.25);

    if (q < 8)
@ -279,14 +284,14 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
        {
            if (cpi->sf.thresh_mult[i] < INT_MAX)
            {
-                cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
+                x->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
            }
            else
            {
-                cpi->rd_threshes[i] = INT_MAX;
+                x->rd_threshes[i] = INT_MAX;
            }

-            cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+            cpi->rd_baseline_thresh[i] = x->rd_threshes[i];
        }
    }
    else
@ -297,14 +302,14 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
        {
            if (cpi->sf.thresh_mult[i] < (INT_MAX / q))
            {
-                cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
+                x->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
            }
            else
            {
-                cpi->rd_threshes[i] = INT_MAX;
+                x->rd_threshes[i] = INT_MAX;
            }

-            cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+            cpi->rd_baseline_thresh[i] = x->rd_threshes[i];
        }
    }

@ -625,7 +630,6 @@ static void copy_predictor(unsigned char *dst, const unsigned char *predictor)
    d[12] = p[12];
 }
 static int rd_pick_intra4x4block(
-    VP8_COMP *cpi,
    MACROBLOCK *x,
    BLOCK *be,
    BLOCKD *b,
@ -701,7 +705,7 @@ static int rd_pick_intra4x4block(
    return best_rd;
 }

-static int rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
+static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate,
                                     int *rate_y, int *Distortion, int best_rd)
 {
    MACROBLOCKD *const xd = &mb->e_mbd;
@ -741,7 +745,7 @@ static int rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
        }

        total_rd += rd_pick_intra4x4block(
-            cpi, mb, mb->block + i, xd->block + i, &best_mode, bmode_costs,
+            mb, mb->block + i, xd->block + i, &best_mode, bmode_costs,
            ta + vp8_block2above[i],
            tl + vp8_block2left[i], &r, &ry, &d);

@ -766,8 +770,7 @@ static int rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
 }


-static int rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
-                                      MACROBLOCK *x,
+static int rd_pick_intra16x16mby_mode(MACROBLOCK *x,
                                      int *Rate,
                                      int *rate_y,
                                      int *Distortion)
@ -869,7 +872,8 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
    return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }

-static void rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion)
+static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
+                                    int *rate_tokenonly, int *distortion)
 {
    MB_PREDICTION_MODE mode;
    MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
@ -1739,18 +1743,18 @@ static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
        {
            if (x->partition_info->bmi[i].mode == NEW4X4)
            {
-                cpi->MVcount[0][mv_max+((x->partition_info->bmi[i].mv.as_mv.row
+                x->MVcount[0][mv_max+((x->partition_info->bmi[i].mv.as_mv.row
                                          - best_ref_mv->as_mv.row) >> 1)]++;
-                cpi->MVcount[1][mv_max+((x->partition_info->bmi[i].mv.as_mv.col
+                x->MVcount[1][mv_max+((x->partition_info->bmi[i].mv.as_mv.col
                                          - best_ref_mv->as_mv.col) >> 1)]++;
            }
        }
    }
    else if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
    {
-        cpi->MVcount[0][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row
+        x->MVcount[0][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row
                                          - best_ref_mv->as_mv.row) >> 1)]++;
-        cpi->MVcount[1][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col
+        x->MVcount[1][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col
                                          - best_ref_mv->as_mv.col) >> 1)]++;
    }
 }
@ -2011,7 +2015,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,

    *returnintra = INT_MAX;
    /* Count of the number of MBs tested so far this frame */
-    cpi->mbs_tested_so_far++;
+    x->mbs_tested_so_far++;

    x->skip = 0;

@ -2023,7 +2027,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
        int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];

        /* Test best rd so far against threshold for trying this mode. */
-        if (best_mode.rd <= cpi->rd_threshes[mode_index])
+        if (best_mode.rd <= x->rd_threshes[mode_index])
            continue;

        if (this_ref_frame < 0)
@ -2069,19 +2073,21 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         * max If so then prevent it from being tested and increase the
         * threshold for its testing
         */
-        if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
+        if (x->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
        {
-            if (cpi->mbs_tested_so_far  <= cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])
+            if (x->mbs_tested_so_far  <= cpi->mode_check_freq[mode_index] * x->mode_test_hit_counts[mode_index])
            {
                /* Increase the threshold for coding this mode to make it
                 * less likely to be chosen
                 */
-                cpi->rd_thresh_mult[mode_index] += 4;
+                x->rd_thresh_mult[mode_index] += 4;

-                if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-                    cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+                if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                    x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;

-                cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+                x->rd_threshes[mode_index] =
+                    (cpi->rd_baseline_thresh[mode_index] >> 7) *
+                    x->rd_thresh_mult[mode_index];

                continue;
            }
@ -2091,28 +2097,28 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         * current mode so increment the counter for the number of times
         * it has been tested
         */
-        cpi->mode_test_hit_counts[mode_index] ++;
+        x->mode_test_hit_counts[mode_index] ++;

        /* Experimental code. Special case for gf and arf zeromv modes.
         * Increase zbin size to supress noise
         */
-        if (cpi->zbin_mode_boost_enabled)
+        if (x->zbin_mode_boost_enabled)
        {
            if ( this_ref_frame == INTRA_FRAME )
-                cpi->zbin_mode_boost = 0;
+                x->zbin_mode_boost = 0;
            else
            {
                if (vp8_mode_order[mode_index] == ZEROMV)
                {
                    if (this_ref_frame != LAST_FRAME)
-                        cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+                        x->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
                    else
-                        cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+                        x->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
                }
                else if (vp8_mode_order[mode_index] == SPLITMV)
-                    cpi->zbin_mode_boost = 0;
+                    x->zbin_mode_boost = 0;
                else
-                    cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+                    x->zbin_mode_boost = MV_ZBIN_BOOST;
            }

            vp8_update_zbin_extra(cpi, x);
@ -2120,7 +2126,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,

        if(!uv_intra_done && this_ref_frame == INTRA_FRAME)
        {
-            rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
+            rd_pick_intra_mbuv_mode(x, &uv_intra_rate,
                                    &uv_intra_rate_tokenonly,
                                    &uv_intra_distortion);
            uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
@ -2146,7 +2152,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             * coding the BPRED mode: x->mbmode_cost[x->e_mbd.frame_type][BPRED]
             */
            int distortion;
-            tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rd.rate_y, &distortion, best_mode.yrd);
+            tmp_rd = rd_pick_intra4x4mby_modes(x, &rate, &rd.rate_y, &distortion, best_mode.yrd);
            rd.rate2 += rate;
            rd.distortion2 += distortion;

@ -2171,8 +2177,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
            int this_rd_thresh;
            int distortion;

-            this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ? cpi->rd_threshes[THR_NEW1] : cpi->rd_threshes[THR_NEW3];
-            this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ? cpi->rd_threshes[THR_NEW2] : this_rd_thresh;
+            this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ?
+                x->rd_threshes[THR_NEW1] : x->rd_threshes[THR_NEW3];
+            this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ?
+                x->rd_threshes[THR_NEW2] : this_rd_thresh;

            tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
                                                     best_mode.yrd, mdcounts,
@ -2465,8 +2473,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
            /* Testing this mode gave rise to an improvement in best error
             * score. Lower threshold a bit for next time
             */
-            cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-            cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+            x->rd_thresh_mult[mode_index] =
+                (x->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+                    x->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
        }

        /* If the mode did not help improve the best error case then raise
@ -2474,13 +2483,14 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         */
        else
        {
-            cpi->rd_thresh_mult[mode_index] += 4;
+            x->rd_thresh_mult[mode_index] += 4;

-            if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-                cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-            cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+            if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
        }
+        x->rd_threshes[mode_index] =
+            (cpi->rd_baseline_thresh[mode_index] >> 7) *
+                x->rd_thresh_mult[mode_index];

        if (x->skip)
            break;
@ -2490,10 +2500,16 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
    /* Reduce the activation RD thresholds for the best choice mode */
    if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
    {
-        int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+        int best_adjustment = (x->rd_thresh_mult[best_mode_index] >> 2);

-        cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-        cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+        x->rd_thresh_mult[best_mode_index] =
+            (x->rd_thresh_mult[best_mode_index] >=
+                (MIN_THRESHMULT + best_adjustment)) ?
+                    x->rd_thresh_mult[best_mode_index] - best_adjustment :
+                    MIN_THRESHMULT;
+        x->rd_threshes[best_mode_index] =
+            (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+                x->rd_thresh_mult[best_mode_index];
    }

    /* Note how often each mode chosen as best */
@ -2595,7 +2611,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
    rd_update_mvcount(cpi, x, &best_ref_mv);
 }

-void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
+void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_)
 {
    int error4x4, error16x16;
    int rate4x4, rate16x16 = 0, rateuv;
@ -2607,15 +2623,13 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)

    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

-    rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+    rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv);
    rate = rateuv;

-    error16x16 = rd_pick_intra16x16mby_mode(cpi, x,
-                                            &rate16x16, &rate16x16_tokenonly,
+    error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly,
                                            &dist16x16);

-    error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
-                                         &rate4x4, &rate4x4_tokenonly,
+    error4x4 = rd_pick_intra4x4mby_modes(x, &rate4x4, &rate4x4_tokenonly,
                                         &dist4x4, error16x16);

    if (error4x4 < error16x16)
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@ -65,9 +65,9 @@ static void insertsortsad(int arr[],int idx[], int len)
    }
 }

-extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
+extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue);
 extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
-extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
+extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);


 static void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@ -23,7 +23,7 @@
 #ifdef ENTROPY_STATS
 _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
 #endif
-void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ;
 void vp8_fix_contexts(MACROBLOCKD *x);

 #include "dct_value_tokens.h"
@ -102,11 +102,12 @@ static void fill_value_tokens()

 static void tokenize2nd_order_b
 (
-    MACROBLOCKD *x,
+    MACROBLOCK *x,
    TOKENEXTRA **tp,
    VP8_COMP *cpi
 )
 {
+    MACROBLOCKD *xd = &x->e_mbd;
    int pt;             /* near block/prev token context index */
    int c;              /* start at DC */
    TOKENEXTRA *t = *tp;/* store tokens starting here */
@ -117,11 +118,11 @@ static void tokenize2nd_order_b
    int band, rc, v, token;
    int eob;

-    b = x->block + 24;
+    b = xd->block + 24;
    qcoeff_ptr = b->qcoeff;
-    a = (ENTROPY_CONTEXT *)x->above_context + 8;
-    l = (ENTROPY_CONTEXT *)x->left_context + 8;
-    eob = x->eobs[24];
+    a = (ENTROPY_CONTEXT *)xd->above_context + 8;
+    l = (ENTROPY_CONTEXT *)xd->left_context + 8;
+    eob = xd->eobs[24];
    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);

    if(!eob)
@ -131,7 +132,7 @@ static void tokenize2nd_order_b
        t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
        t->skip_eob_node = 0;

-        ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
+        ++x->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
        t++;
        *tp = t;
        *a = *l = 0;
@ -145,7 +146,7 @@ static void tokenize2nd_order_b

    t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
    t->skip_eob_node = 0;
-    ++cpi->coef_counts       [1] [0] [pt] [token];
+    ++x->coef_counts       [1] [0] [pt] [token];
    pt = vp8_prev_token_class[token];
    t++;
    c = 1;
@ -164,7 +165,7 @@ static void tokenize2nd_order_b

        t->skip_eob_node = ((pt == 0));

-        ++cpi->coef_counts       [1] [band] [pt] [token];
+        ++x->coef_counts       [1] [band] [pt] [token];

        pt = vp8_prev_token_class[token];
        t++;
@ -177,7 +178,7 @@ static void tokenize2nd_order_b

        t->skip_eob_node = 0;

-        ++cpi->coef_counts       [1] [band] [pt] [DCT_EOB_TOKEN];
+        ++x->coef_counts       [1] [band] [pt] [DCT_EOB_TOKEN];

        t++;
    }
@ -189,12 +190,13 @@ static void tokenize2nd_order_b

 static void tokenize1st_order_b
 (
-    MACROBLOCKD *x,
+    MACROBLOCK *x,
    TOKENEXTRA **tp,
    int type,           /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
    VP8_COMP *cpi
 )
 {
+    MACROBLOCKD *xd = &x->e_mbd;
    unsigned int block;
    const BLOCKD *b;
    int pt;             /* near block/prev token context index */
@ -207,15 +209,15 @@ static void tokenize1st_order_b
    int band, rc, v;
    int tmp1, tmp2;

-    b = x->block;
+    b = xd->block;
    /* Luma */
    for (block = 0; block < 16; block++, b++)
    {
        tmp1 = vp8_block2above[block];
        tmp2 = vp8_block2left[block];
        qcoeff_ptr = b->qcoeff;
-        a = (ENTROPY_CONTEXT *)x->above_context + tmp1;
-        l = (ENTROPY_CONTEXT *)x->left_context + tmp2;
+        a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+        l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;

        VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);

@ -228,7 +230,7 @@ static void tokenize1st_order_b
            t->context_tree = cpi->common.fc.coef_probs [type] [c] [pt];
            t->skip_eob_node = 0;

-            ++cpi->coef_counts       [type] [c] [pt] [DCT_EOB_TOKEN];
+            ++x->coef_counts       [type] [c] [pt] [DCT_EOB_TOKEN];
            t++;
            *tp = t;
            *a = *l = 0;
@ -243,7 +245,7 @@ static void tokenize1st_order_b

        t->context_tree = cpi->common.fc.coef_probs [type] [c] [pt];
        t->skip_eob_node = 0;
-        ++cpi->coef_counts       [type] [c] [pt] [token];
+        ++x->coef_counts       [type] [c] [pt] [token];
        pt = vp8_prev_token_class[token];
        t++;
        c++;
@ -261,7 +263,7 @@ static void tokenize1st_order_b
            t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];

            t->skip_eob_node = (pt == 0);
-            ++cpi->coef_counts       [type] [band] [pt] [token];
+            ++x->coef_counts       [type] [band] [pt] [token];

            pt = vp8_prev_token_class[token];
            t++;
@ -273,7 +275,7 @@ static void tokenize1st_order_b
            t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];

            t->skip_eob_node = 0;
-            ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
+            ++x->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];

            t++;
        }
@ -287,8 +289,8 @@ static void tokenize1st_order_b
        tmp1 = vp8_block2above[block];
        tmp2 = vp8_block2left[block];
        qcoeff_ptr = b->qcoeff;
-        a = (ENTROPY_CONTEXT *)x->above_context + tmp1;
-        l = (ENTROPY_CONTEXT *)x->left_context + tmp2;
+        a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+        l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;

        VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);

@ -299,7 +301,7 @@ static void tokenize1st_order_b
            t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
            t->skip_eob_node = 0;

-            ++cpi->coef_counts       [2] [0] [pt] [DCT_EOB_TOKEN];
+            ++x->coef_counts       [2] [0] [pt] [DCT_EOB_TOKEN];
            t++;
            *tp = t;
            *a = *l = 0;
@ -314,7 +316,7 @@ static void tokenize1st_order_b

        t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
        t->skip_eob_node = 0;
-        ++cpi->coef_counts       [2] [0] [pt] [token];
+        ++x->coef_counts       [2] [0] [pt] [token];
        pt = vp8_prev_token_class[token];
        t++;
        c = 1;
@ -333,7 +335,7 @@ static void tokenize1st_order_b

            t->skip_eob_node = (pt == 0);

-            ++cpi->coef_counts       [2] [band] [pt] [token];
+            ++x->coef_counts       [2] [band] [pt] [token];

            pt = vp8_prev_token_class[token];
            t++;
@ -346,7 +348,7 @@ static void tokenize1st_order_b

            t->skip_eob_node = 0;

-            ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
+            ++x->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];

            t++;
        }
@ -374,16 +376,18 @@ static int mb_is_skippable(MACROBLOCKD *x, int has_y2_block)
 }


-void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
+void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 {
+    MACROBLOCKD *xd = &x->e_mbd;
    int plane_type;
    int has_y2_block;

-    has_y2_block = (x->mode_info_context->mbmi.mode != B_PRED
-                    && x->mode_info_context->mbmi.mode != SPLITMV);
+    has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED
+                    && xd->mode_info_context->mbmi.mode != SPLITMV);

-    x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x, has_y2_block);
-    if (x->mode_info_context->mbmi.mb_skip_coeff)
+    xd->mode_info_context->mbmi.mb_skip_coeff =
+        mb_is_skippable(xd, has_y2_block);
+    if (xd->mode_info_context->mbmi.mb_skip_coeff)
    {
        if (!cpi->common.mb_no_coeff_skip)
        {
@ -391,8 +395,8 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
        }
        else
        {
-            vp8_fix_contexts(x);
-            cpi->skip_true_count++;
+            vp8_fix_contexts(xd);
+            x->skip_true_count++;
        }

        return;
@ -488,7 +492,8 @@ static void stuff2nd_order_b
    TOKENEXTRA **tp,
    ENTROPY_CONTEXT *a,
    ENTROPY_CONTEXT *l,
-    VP8_COMP *cpi
+    VP8_COMP *cpi,
+    MACROBLOCK *x
 )
 {
    int pt; /* near block/prev token context index */
@ -498,13 +503,12 @@ static void stuff2nd_order_b
    t->Token = DCT_EOB_TOKEN;
    t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
    t->skip_eob_node = 0;
-    ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
+    ++x->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
    ++t;

    *tp = t;
    pt = 0;
    *a = *l = pt;
-
 }

 static void stuff1st_order_b
@ -513,7 +517,8 @@ static void stuff1st_order_b
    ENTROPY_CONTEXT *a,
    ENTROPY_CONTEXT *l,
    int type,
-    VP8_COMP *cpi
+    VP8_COMP *cpi,
+    MACROBLOCK *x
 )
 {
    int pt; /* near block/prev token context index */
@ -524,20 +529,21 @@ static void stuff1st_order_b
    t->Token = DCT_EOB_TOKEN;
    t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
    t->skip_eob_node = 0;
-    ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
+    ++x->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
    ++t;
    *tp = t;
    pt = 0; /* 0 <-> all coeff data is zero */
    *a = *l = pt;
-
 }
+
 static
 void stuff1st_order_buv
 (
    TOKENEXTRA **tp,
    ENTROPY_CONTEXT *a,
    ENTROPY_CONTEXT *l,
-    VP8_COMP *cpi
+    VP8_COMP *cpi,
+    MACROBLOCK *x
 )
 {
    int pt; /* near block/prev token context index */
@ -547,38 +553,38 @@ void stuff1st_order_buv
    t->Token = DCT_EOB_TOKEN;
    t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
    t->skip_eob_node = 0;
-    ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
+    ++x->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
    ++t;
    *tp = t;
    pt = 0; /* 0 <-> all coeff data is zero */
    *a = *l = pt;
-
 }

-void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 {
-    ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
-    ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
+    MACROBLOCKD *xd = &x->e_mbd;
+    ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
+    ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
    int plane_type;
    int b;
    plane_type = 3;
-    if((x->mode_info_context->mbmi.mode != B_PRED
-                        && x->mode_info_context->mbmi.mode != SPLITMV))
+    if((xd->mode_info_context->mbmi.mode != B_PRED
+                        && xd->mode_info_context->mbmi.mode != SPLITMV))
    {
        stuff2nd_order_b(t,
-                     A + vp8_block2above[24], L + vp8_block2left[24], cpi);
+                     A + vp8_block2above[24], L + vp8_block2left[24], cpi, x);
        plane_type = 0;
    }

    for (b = 0; b < 16; b++)
        stuff1st_order_b(t,
                         A + vp8_block2above[b],
-                         L + vp8_block2left[b], plane_type, cpi);
+                         L + vp8_block2left[b], plane_type, cpi, x);

    for (b = 16; b < 24; b++)
        stuff1st_order_buv(t,
                           A + vp8_block2above[b],
-                           L + vp8_block2left[b], cpi);
+                           L + vp8_block2left[b], cpi, x);

 }
 void vp8_fix_contexts(MACROBLOCKD *x)
--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c
@ -15,6 +15,7 @@
 #include "vp8_rtcd.h"

 #include <emmintrin.h>
+#include "vpx_ports/emmintrin_compat.h"

 union sum_union {
    __m128i v;
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@ -20,16 +20,9 @@ ifeq ($(ARCH_ARM),yes)
  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk
 endif

-VP8_CX_SRCS-yes += vp8_cx_iface.c
+VP8_CX_SRCS-yes += vp8cx.mk

-# encoder
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += algo/vpx_ref/cpu_id/include
-#INCLUDES += common
-#INCLUDES += encoder
+VP8_CX_SRCS-yes += vp8_cx_iface.c

 VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
 VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@ -9,7 +9,7 @@
 ##


-#VP8_CX_SRCS list is modified according to different platforms.
+VP8_CX_SRCS-$(ARCH_ARM)  += vp8cx_arm.mk

 #File list for arm
 # encoder
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@ -16,6 +16,8 @@ VP8_DX_SRCS-no  += $(VP8_COMMON_SRCS-no)
 VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
 VP8_DX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no)

+VP8_DX_SRCS-yes += vp8dx.mk
+
 VP8_DX_SRCS-yes += vp8_dx_iface.c

 VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
--- a/vp9/common/generic/vp9_systemdependent.c
+++ b/vp9/common/generic/vp9_systemdependent.c
@ -9,7 +9,7 @@
 */


-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_subpixel.h"
 #include "vp9/common/vp9_loopfilter.h"
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@ -9,7 +9,7 @@
 */


-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_onyxc_int.h"
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@ -14,7 +14,7 @@

 void vpx_log(const char *format, ...);

-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_treecoder.h"
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@ -24,7 +24,7 @@
 **************************************************************************/
 #include <assert.h>
 #include <math.h>
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
@ -33,60 +33,6 @@ static const int cospi8sqrt2minus1 = 20091;
 static const int sinpi8sqrt2      = 35468;
 static const int rounding = 0;

-// TODO: these transforms can be further converted into integer forms
-//       for complexity optimization
-static const float idct_4[16] = {
-  0.500000000000000,   0.653281482438188,   0.500000000000000,   0.270598050073099,
-  0.500000000000000,   0.270598050073099,  -0.500000000000000,  -0.653281482438188,
-  0.500000000000000,  -0.270598050073099,  -0.500000000000000,   0.653281482438188,
-  0.500000000000000,  -0.653281482438188,   0.500000000000000,  -0.270598050073099
-};
-
-static const float iadst_4[16] = {
-  0.228013428883779,   0.577350269189626,   0.656538502008139,   0.428525073124360,
-  0.428525073124360,   0.577350269189626,  -0.228013428883779,  -0.656538502008139,
-  0.577350269189626,                   0,  -0.577350269189626,   0.577350269189626,
-  0.656538502008139,  -0.577350269189626,   0.428525073124359,  -0.228013428883779
-};
-
-static const float idct_8[64] = {
-  0.353553390593274,   0.490392640201615,   0.461939766255643,   0.415734806151273,
-  0.353553390593274,   0.277785116509801,   0.191341716182545,   0.097545161008064,
-  0.353553390593274,   0.415734806151273,   0.191341716182545,  -0.097545161008064,
- -0.353553390593274,  -0.490392640201615,  -0.461939766255643,  -0.277785116509801,
-  0.353553390593274,   0.277785116509801,  -0.191341716182545,  -0.490392640201615,
- -0.353553390593274,   0.097545161008064,   0.461939766255643,   0.415734806151273,
-  0.353553390593274,   0.097545161008064,  -0.461939766255643,  -0.277785116509801,
-  0.353553390593274,   0.415734806151273,  -0.191341716182545,  -0.490392640201615,
-  0.353553390593274,  -0.097545161008064,  -0.461939766255643,   0.277785116509801,
-  0.353553390593274,  -0.415734806151273,  -0.191341716182545,   0.490392640201615,
-  0.353553390593274,  -0.277785116509801,  -0.191341716182545,   0.490392640201615,
- -0.353553390593274,  -0.097545161008064,   0.461939766255643,  -0.415734806151273,
-  0.353553390593274,  -0.415734806151273,   0.191341716182545,   0.097545161008064,
- -0.353553390593274,   0.490392640201615,  -0.461939766255643,   0.277785116509801,
-  0.353553390593274,  -0.490392640201615,   0.461939766255643,  -0.415734806151273,
-  0.353553390593274,  -0.277785116509801,   0.191341716182545,  -0.097545161008064
-};
-
-static const float iadst_8[64] = {
-  0.089131608307533,   0.255357107325376,   0.387095214016349,   0.466553967085785,
-  0.483002021635509,   0.434217976756762,   0.326790388032145,   0.175227946595735,
-  0.175227946595735,   0.434217976756762,   0.466553967085785,   0.255357107325376,
- -0.089131608307533,  -0.387095214016348,  -0.483002021635509,  -0.326790388032145,
-  0.255357107325376,   0.483002021635509,   0.175227946595735,  -0.326790388032145,
- -0.466553967085785,  -0.089131608307533,   0.387095214016349,   0.434217976756762,
-  0.326790388032145,   0.387095214016349,  -0.255357107325376,  -0.434217976756762,
-  0.175227946595735,   0.466553967085786,  -0.089131608307534,  -0.483002021635509,
-  0.387095214016349,   0.175227946595735,  -0.483002021635509,   0.089131608307533,
-  0.434217976756762,  -0.326790388032145,  -0.255357107325377,   0.466553967085785,
-  0.434217976756762,  -0.089131608307533,  -0.326790388032145,   0.483002021635509,
- -0.255357107325376,  -0.175227946595735,   0.466553967085785,  -0.387095214016348,
-  0.466553967085785,  -0.326790388032145,   0.089131608307533,   0.175227946595735,
- -0.387095214016348,   0.483002021635509,  -0.434217976756762,   0.255357107325376,
-  0.483002021635509,  -0.466553967085785,   0.434217976756762,  -0.387095214016348,
-  0.326790388032145,  -0.255357107325375,   0.175227946595736,  -0.089131608307532
-};
-
 static const int16_t idct_i4[16] = {
  8192,  10703,  8192,   4433,
  8192,   4433, -8192, -10703,
@ -139,75 +85,7 @@ static const int16_t iadst_i8[64] = {
   5354, -4184,  2871, -1460
 };

-static float idct_16[256] = {
-  0.250000,  0.351851,  0.346760,  0.338330,  0.326641,  0.311806,  0.293969,  0.273300,
-  0.250000,  0.224292,  0.196424,  0.166664,  0.135299,  0.102631,  0.068975,  0.034654,
-  0.250000,  0.338330,  0.293969,  0.224292,  0.135299,  0.034654, -0.068975, -0.166664,
- -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631,
-  0.250000,  0.311806,  0.196424,  0.034654, -0.135299, -0.273300, -0.346760, -0.338330,
- -0.250000, -0.102631,  0.068975,  0.224292,  0.326641,  0.351851,  0.293969,  0.166664,
-  0.250000,  0.273300,  0.068975, -0.166664, -0.326641, -0.338330, -0.196424,  0.034654,
-  0.250000,  0.351851,  0.293969,  0.102631, -0.135299, -0.311806, -0.346760, -0.224292,
-  0.250000,  0.224292, -0.068975, -0.311806, -0.326641, -0.102631,  0.196424,  0.351851,
-  0.250000, -0.034654, -0.293969, -0.338330, -0.135299,  0.166664,  0.346760,  0.273300,
-  0.250000,  0.166664, -0.196424, -0.351851, -0.135299,  0.224292,  0.346760,  0.102631,
- -0.250000, -0.338330, -0.068975,  0.273300,  0.326641,  0.034654, -0.293969, -0.311806,
-  0.250000,  0.102631, -0.293969, -0.273300,  0.135299,  0.351851,  0.068975, -0.311806,
- -0.250000,  0.166664,  0.346760,  0.034654, -0.326641, -0.224292,  0.196424,  0.338330,
-  0.250000,  0.034654, -0.346760, -0.102631,  0.326641,  0.166664, -0.293969, -0.224292,
-  0.250000,  0.273300, -0.196424, -0.311806,  0.135299,  0.338330, -0.068975, -0.351851,
-  0.250000, -0.034654, -0.346760,  0.102631,  0.326641, -0.166664, -0.293969,  0.224292,
-  0.250000, -0.273300, -0.196424,  0.311806,  0.135299, -0.338330, -0.068975,  0.351851,
-  0.250000, -0.102631, -0.293969,  0.273300,  0.135299, -0.351851,  0.068975,  0.311806,
- -0.250000, -0.166664,  0.346760, -0.034654, -0.326641,  0.224292,  0.196424, -0.338330,
-  0.250000, -0.166664, -0.196424,  0.351851, -0.135299, -0.224292,  0.346760, -0.102631,
- -0.250000,  0.338330, -0.068975, -0.273300,  0.326641, -0.034654, -0.293969,  0.311806,
-  0.250000, -0.224292, -0.068975,  0.311806, -0.326641,  0.102631,  0.196424, -0.351851,
-  0.250000,  0.034654, -0.293969,  0.338330, -0.135299, -0.166664,  0.346760, -0.273300,
-  0.250000, -0.273300,  0.068975,  0.166664, -0.326641,  0.338330, -0.196424, -0.034654,
-  0.250000, -0.351851,  0.293969, -0.102631, -0.135299,  0.311806, -0.346760,  0.224292,
-  0.250000, -0.311806,  0.196424, -0.034654, -0.135299,  0.273300, -0.346760,  0.338330,
- -0.250000,  0.102631,  0.068975, -0.224292,  0.326641, -0.351851,  0.293969, -0.166664,
-  0.250000, -0.338330,  0.293969, -0.224292,  0.135299, -0.034654, -0.068975,  0.166664,
- -0.250000,  0.311806, -0.346760,  0.351851, -0.326641,  0.273300, -0.196424,  0.102631,
-  0.250000, -0.351851,  0.346760, -0.338330,  0.326641, -0.311806,  0.293969, -0.273300,
-  0.250000, -0.224292,  0.196424, -0.166664,  0.135299, -0.102631,  0.068975, -0.034654
-};

-static float iadst_16[256] = {
-  0.033094,  0.098087,  0.159534,  0.215215,  0.263118,  0.301511,  0.329007,  0.344612,
-  0.347761,  0.338341,  0.316693,  0.283599,  0.240255,  0.188227,  0.129396,  0.065889,
-  0.065889,  0.188227,  0.283599,  0.338341,  0.344612,  0.301511,  0.215215,  0.098087,
- -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396,
-  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,  0.000000, -0.188227, -0.316693,
- -0.344612, -0.263118, -0.098087,  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,
-  0.129396,  0.316693,  0.329007,  0.159534, -0.098087, -0.301511, -0.338341, -0.188227,
-  0.065889,  0.283599,  0.344612,  0.215215, -0.033094, -0.263118, -0.347761, -0.240255,
-  0.159534,  0.344612,  0.240255, -0.065889, -0.316693, -0.301511, -0.033094,  0.263118,
-  0.338341,  0.129396, -0.188227, -0.347761, -0.215215,  0.098087,  0.329007,  0.283599,
-  0.188227,  0.344612,  0.098087, -0.263118, -0.316693, -0.000000,  0.316693,  0.263118,
- -0.098087, -0.344612, -0.188227,  0.188227,  0.344612,  0.098087, -0.263118, -0.316693,
-  0.215215,  0.316693, -0.065889, -0.347761, -0.098087,  0.301511,  0.240255, -0.188227,
- -0.329007,  0.033094,  0.344612,  0.129396, -0.283599, -0.263118,  0.159534,  0.338341,
-  0.240255,  0.263118, -0.215215, -0.283599,  0.188227,  0.301511, -0.159534, -0.316693,
-  0.129396,  0.329007, -0.098087, -0.338341,  0.065889,  0.344612, -0.033094, -0.347761,
-  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,  0.000000, -0.344612,  0.098087,
-  0.316693, -0.188227, -0.263118,  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,
-  0.283599,  0.098087, -0.347761,  0.129396,  0.263118, -0.301511, -0.065889,  0.344612,
- -0.159534, -0.240255,  0.316693,  0.033094, -0.338341,  0.188227,  0.215215, -0.329007,
-  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000,
- -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,
-  0.316693, -0.098087, -0.188227,  0.344612, -0.263118, -0.000000,  0.263118, -0.344612,
-  0.188227,  0.098087, -0.316693,  0.316693, -0.098087, -0.188227,  0.344612, -0.263118,
-  0.329007, -0.188227, -0.033094,  0.240255, -0.344612,  0.301511, -0.129396, -0.098087,
-  0.283599, -0.347761,  0.263118, -0.065889, -0.159534,  0.316693, -0.338341,  0.215215,
-  0.338341, -0.263118,  0.129396,  0.033094, -0.188227,  0.301511, -0.347761,  0.316693,
- -0.215215,  0.065889,  0.098087, -0.240255,  0.329007, -0.344612,  0.283599, -0.159534,
-  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,  0.000000, -0.098087,  0.188227,
- -0.263118,  0.316693, -0.344612,  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,
-  0.347761, -0.344612,  0.338341, -0.329007,  0.316693, -0.301511,  0.283599, -0.263118,
-  0.240255, -0.215215,  0.188227, -0.159534,  0.129396, -0.098087,  0.065889, -0.033094
-};

 static const int16_t idct_i16[256] = {
   4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,
@ -279,124 +157,6 @@ static const int16_t iadst_i16[256] = {
   3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542
 };

-void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
-                  TX_TYPE tx_type, int tx_dim) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int i, j, k;
-    float bufa[256], bufb[256];  // buffers are for floating-point test purpose
-                                 // the implementation could be simplified in
-                                 // conjunction with integer transform
-    const int16_t *ip = input;
-    int16_t *op = output;
-    int shortpitch = pitch >> 1;
-
-    float *pfa = &bufa[0];
-    float *pfb = &bufb[0];
-
-    // pointers to vertical and horizontal transforms
-    const float *ptv, *pth;
-
-    assert(tx_type != DCT_DCT);
-    // load and convert residual array into floating-point
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfa[i] = (float)ip[i];
-      }
-      pfa += tx_dim;
-      ip  += tx_dim;
-    }
-
-    // vertical transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case ADST_DCT  :
-        ptv = (tx_dim == 4) ? &iadst_4[0] :
-                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-        break;
-
-      default :
-        ptv = (tx_dim == 4) ? &idct_4[0] :
-                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-        break;
-    }
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfb[i] = 0 ;
-        for(k = 0; k < tx_dim; k++) {
-          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
-        }
-        pfa += 1;
-      }
-
-      pfb += tx_dim;
-      ptv += tx_dim;
-      pfa = &bufa[0];
-    }
-
-    // horizontal transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = (tx_dim == 4) ? &iadst_4[0] :
-                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-        break;
-
-      default :
-        pth = (tx_dim == 4) ? &idct_4[0] :
-                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-        break;
-    }
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfa[i] = 0;
-        for(k = 0; k < tx_dim; k++) {
-          pfa[i] += pfb[k] * pth[k];
-        }
-        pth += tx_dim;
-       }
-
-      pfa += tx_dim;
-      pfb += tx_dim;
-
-      switch(tx_type) {
-        case ADST_ADST :
-        case  DCT_ADST :
-          pth = (tx_dim == 4) ? &iadst_4[0] :
-                                ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-          break;
-
-        default :
-          pth = (tx_dim == 4) ? &idct_4[0] :
-                                ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-          break;
-      }
-    }
-
-    // convert to short integer format and load BLOCKD buffer
-    op  = output;
-    pfa = &bufa[0];
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        op[i] = (pfa[i] > 0 ) ? (int16_t)( pfa[i] / 8 + 0.49) :
-                               -(int16_t)( - pfa[i] / 8 + 0.49);
-      }
-
-      op += shortpitch;
-      pfa += tx_dim;
-    }
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}

 /* Converted the transforms to integer form. */
 #define HORIZONTAL_SHIFT 14  // 16
@ -404,8 +164,9 @@ void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
 #define VERTICAL_SHIFT 17  // 15
 #define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
 void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
-                      TX_TYPE tx_type, int tx_dim) {
+                      TX_TYPE tx_type, int tx_dim, uint16_t eobs) {
  int i, j, k;
+  int nz_dim;
  int16_t imbuf[256];

  const int16_t *ip = input;
@ -444,6 +205,19 @@ void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
      break;
  }

+  nz_dim = tx_dim;
+  if(tx_dim > 4) {
+    if(eobs < 36) {
+      vpx_memset(im, 0, 512);
+      nz_dim = 8;
+      if(eobs < 3) {
+        nz_dim = 2;
+      } else if(eobs < 10) {
+        nz_dim = 4;
+      }
+    }
+  }
+
  /* 2-D inverse transform X = M1*Z*Transposed_M2 is calculated in 2 steps
   * from right to left:
   * 1. horizontal transform: Y= Z*Transposed_M2
@ -453,10 +227,10 @@ void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
   */
  /* Horizontal transformation */
  for (j = 0; j < tx_dim; j++) {
-    for (i = 0; i < tx_dim; i++) {
+    for (i = 0; i < nz_dim; i++) {
      int temp = 0;

-      for (k = 0; k < tx_dim; k++) {
+      for (k = 0; k < nz_dim; k++) {
        temp += ip[k] * pth[k];
      }

@ -476,7 +250,7 @@ void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
    for (j = 0; j < tx_dim; j++) {
      int temp = 0;

-      for (k = 0; k < tx_dim; k++) {
+      for (k = 0; k < nz_dim; k++) {
        temp += ptv[k] * im[k];
      }

--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@ -52,7 +52,7 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
    TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
    if (tx_type != DCT_DCT) {
      vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32,
-                   tx_type, 4);
+                   tx_type, 4, xd->block[i].eob);
    } else {
      vp9_inverse_transform_b_4x4(xd, i, 32);
    }
@ -91,7 +91,8 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
  for (i = 0; i < 9; i += 8) {
    TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
    if (tx_type != DCT_DCT) {
-      vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8);
+      vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8,
+                 xd->block[i].eob);
    } else {
      vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
                                  &blockd[i].diff[0], 32);
@ -100,7 +101,8 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
  for (i = 2; i < 11; i += 8) {
    TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
    if (tx_type != DCT_DCT) {
-      vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8);
+      vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8,
+                 xd->block[i + 2].eob);
    } else {
      vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0],
                                  &blockd[i].diff[0], 32);
@ -132,7 +134,7 @@ void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) {
  BLOCKD *bd = &xd->block[0];
  TX_TYPE tx_type = get_tx_type_16x16(xd, bd);
  if (tx_type != DCT_DCT) {
-    vp9_ihtllm(bd->dqcoeff, bd->diff, 32, tx_type, 16);
+    vp9_ihtllm(bd->dqcoeff, bd->diff, 32, tx_type, 16, bd->eob);
  } else {
    vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0],
                                  &xd->block[0].diff[0], 32);
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@ -11,7 +11,7 @@
 #ifndef VP9_COMMON_VP9_INVTRANS_H_
 #define VP9_COMMON_VP9_INVTRANS_H_

-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"

--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@ -39,7 +39,7 @@ static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
  int block;

  uint8_t **y, **u, **v;
-  uint8_t **y2, **u2, **v2;
+  uint8_t **y2 = NULL, **u2 = NULL, **v2 = NULL;
  BLOCKD *blockd = xd->block;
  int stride;

--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@ -9,7 +9,7 @@
 */


-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_postproc.h"
 #include "vp9/common/vp9_textblit.h"
@ -32,7 +32,7 @@
    (0.071*(float)(t & 0xff)) + 128)

 /* global constants */
-#if CONFIG_POSTPROC_VISUALIZER
+#if 0 && CONFIG_POSTPROC_VISUALIZER
 static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
  { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
  { RGB_TO_YUV(0x00FF00) },   /* Green */
@ -672,7 +672,7 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
                        oci->post_proc_buffer.y_stride);
  }

-#if CONFIG_POSTPROC_VISUALIZER
+#if 0 && CONFIG_POSTPROC_VISUALIZER
  if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
    char message[512];
    sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c
@ -9,7 +9,7 @@
 */


-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_blockd.h"

--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@ -9,7 +9,7 @@
 */


-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_reconinter.h"
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@ -9,7 +9,7 @@
 */

 #include <stdio.h>
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vpx_mem/vpx_mem.h"
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@ -9,7 +9,7 @@
 */


-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9_rtcd.h"
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@ -42,7 +42,7 @@ fi
 # Dequant
 #
 prototype void vp9_dequantize_b "struct blockd *x"
-specialize vp9_dequantize_b mmx
+specialize vp9_dequantize_b

 prototype void vp9_dequantize_b_2x2 "struct blockd *x"
 specialize vp9_dequantize_b_2x2
@ -69,13 +69,13 @@ prototype void vp9_dequant_dc_idct_add "int16_t *input, const int16_t *dq, uint8
 specialize vp9_dequant_dc_idct_add

 prototype void vp9_dequant_dc_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dcs"
-specialize vp9_dequant_dc_idct_add_y_block mmx
+specialize vp9_dequant_dc_idct_add_y_block

 prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs"
-specialize vp9_dequant_idct_add_y_block mmx
+specialize vp9_dequant_idct_add_y_block

 prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
-specialize vp9_dequant_idct_add_uv_block mmx
+specialize vp9_dequant_idct_add_uv_block

 #
 # RECON
@ -218,6 +218,7 @@ vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
 #
 # post proc
 #
+if [ "$CONFIG_POSTPROC" = "yes" ]; then
 prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit"
 specialize vp9_mbpost_proc_down mmx sse2
 vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm
@ -233,6 +234,7 @@ vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
 prototype void vp9_plane_add_noise "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"
 specialize vp9_plane_add_noise mmx sse2
 vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt
+fi

 prototype void vp9_blend_mb_inner "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
 specialize vp9_blend_mb_inner
@ -343,10 +345,10 @@ specialize vp9_bilinear_predict_avg4x4
 # dct
 #
 prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4llm_1 mmx
+specialize vp9_short_idct4x4llm_1

 prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4llm mmx
+specialize vp9_short_idct4x4llm

 prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct8x8
@ -366,7 +368,7 @@ specialize vp9_short_idct10_16x16
 prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct32x32

-prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim"
+prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
 specialize vp9_ihtllm

 #
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@ -15,7 +15,7 @@
 #include <math.h>
 #endif

-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #if ARCH_X86 || ARCH_X86_64
 void vpx_reset_mmx_state(void);
 #define vp9_clear_system_state() vpx_reset_mmx_state()
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@ -9,11 +9,11 @@
 */


-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_subpixel.h"

-extern const short vp9_six_tap_mmx[16][6 * 8];
+extern const short vp9_six_tap_mmx[8][6 * 8];

 extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
                                      unsigned short  *output_ptr,
--- a/vp9/common/x86/vp9_filter_sse2.c
+++ b/vp9/common/x86/vp9_filter_sse2.c
@ -11,6 +11,7 @@
 #include <assert.h> // for alignment checks
 #include <emmintrin.h> // SSE2
 #include "vp9/common/vp9_filter.h"
+#include "vpx_ports/emmintrin_compat.h"
 #include "vpx_ports/mem.h" // for DECLARE_ALIGNED
 #include "vp9_rtcd.h"

--- a/vp9/common/x86/vp9_idctllm_sse2.asm
+++ b/vp9/common/x86/vp9_idctllm_sse2.asm
@ -21,7 +21,7 @@
 ;   int blk_stride      - 5
 ; )

-global sym(vp9_idct_dequant_0_2x_sse2)
+global sym(vp9_idct_dequant_0_2x_sse2) PRIVATE
 sym(vp9_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
@ -97,7 +97,7 @@ sym(vp9_idct_dequant_0_2x_sse2):
    pop         rbp
    ret

-global sym(vp9_idct_dequant_full_2x_sse2)
+global sym(vp9_idct_dequant_full_2x_sse2) PRIVATE
 sym(vp9_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
@ -362,7 +362,7 @@ sym(vp9_idct_dequant_full_2x_sse2):
 ;   int dst_stride      - 4
 ;   short *dc           - 5
 ; )
-global sym(vp9_idct_dequant_dc_0_2x_sse2)
+global sym(vp9_idct_dequant_dc_0_2x_sse2) PRIVATE
 sym(vp9_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
@ -438,7 +438,7 @@ sym(vp9_idct_dequant_dc_0_2x_sse2):
    pop         rbp
    ret

-global sym(vp9_idct_dequant_dc_full_2x_sse2)
+global sym(vp9_idct_dequant_dc_full_2x_sse2) PRIVATE
 sym(vp9_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_iwalsh_mmx.asm
+++ b/vp9/common/x86/vp9_iwalsh_mmx.asm
@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"

 ;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_1_mmx)
+global sym(vp9_short_inv_walsh4x4_1_mmx) PRIVATE
 sym(vp9_short_inv_walsh4x4_1_mmx):
    push        rbp
    mov         rbp, rsp
@ -48,7 +48,7 @@ sym(vp9_short_inv_walsh4x4_1_mmx):
    ret

 ;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_mmx)
+global sym(vp9_short_inv_walsh4x4_mmx) PRIVATE
 sym(vp9_short_inv_walsh4x4_mmx):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_iwalsh_sse2.asm
+++ b/vp9/common/x86/vp9_iwalsh_sse2.asm
@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"

 ;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_sse2)
+global sym(vp9_short_inv_walsh4x4_sse2) PRIVATE
 sym(vp9_short_inv_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@ -21,7 +21,7 @@
 ;    const char *thresh,
 ;    int  count
 ;)
-global sym(vp9_loop_filter_horizontal_edge_mmx)
+global sym(vp9_loop_filter_horizontal_edge_mmx) PRIVATE
 sym(vp9_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
@ -233,7 +233,7 @@ sym(vp9_loop_filter_horizontal_edge_mmx):
 ;    const char *thresh,
 ;    int count
 ;)
-global sym(vp9_loop_filter_vertical_edge_mmx)
+global sym(vp9_loop_filter_vertical_edge_mmx) PRIVATE
 sym(vp9_loop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
@ -600,7 +600,7 @@ sym(vp9_loop_filter_vertical_edge_mmx):
 ;    int  src_pixel_step,
 ;    const char *blimit
 ;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx)
+global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE
 sym(vp9_loop_filter_simple_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
@ -716,7 +716,7 @@ sym(vp9_loop_filter_simple_horizontal_edge_mmx):
 ;    int  src_pixel_step,
 ;    const char *blimit
 ;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx)
+global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE
 sym(vp9_loop_filter_simple_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_loopfilter_sse2.asm
+++ b/vp9/common/x86/vp9_loopfilter_sse2.asm
@ -281,7 +281,7 @@
 ;    const char    *thresh,
 ;    int            count
 ;)
-global sym(vp9_loop_filter_horizontal_edge_sse2)
+global sym(vp9_loop_filter_horizontal_edge_sse2) PRIVATE
 sym(vp9_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
@ -331,7 +331,7 @@ sym(vp9_loop_filter_horizontal_edge_sse2):
 ;    const char    *thresh,
 ;    int            count
 ;)
-global sym(vp9_loop_filter_horizontal_edge_uv_sse2)
+global sym(vp9_loop_filter_horizontal_edge_uv_sse2) PRIVATE
 sym(vp9_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
@ -719,7 +719,7 @@ sym(vp9_loop_filter_horizontal_edge_uv_sse2):
 ;    const char    *thresh,
 ;    int            count
 ;)
-global sym(vp9_loop_filter_vertical_edge_sse2)
+global sym(vp9_loop_filter_vertical_edge_sse2) PRIVATE
 sym(vp9_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
@ -786,7 +786,7 @@ sym(vp9_loop_filter_vertical_edge_sse2):
 ;    const char    *thresh,
 ;    unsigned char *v
 ;)
-global sym(vp9_loop_filter_vertical_edge_uv_sse2)
+global sym(vp9_loop_filter_vertical_edge_uv_sse2) PRIVATE
 sym(vp9_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
@ -851,7 +851,7 @@ sym(vp9_loop_filter_vertical_edge_uv_sse2):
 ;    int  src_pixel_step,
 ;    const char *blimit,
 ;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2)
+global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE
 sym(vp9_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
@ -960,7 +960,7 @@ sym(vp9_loop_filter_simple_horizontal_edge_sse2):
 ;    int  src_pixel_step,
 ;    const char *blimit,
 ;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2)
+global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE
 sym(vp9_loop_filter_simple_vertical_edge_sse2):
    push        rbp         ; save old base pointer value.
    mov         rbp, rsp    ; set new base pointer value.
--- a/vp9/common/x86/vp9_loopfilter_x86.c
+++ b/vp9/common/x86/vp9_loopfilter_x86.c
@ -11,6 +11,7 @@
 #include <emmintrin.h>  // SSE2
 #include "vpx_config.h"
 #include "vp9/common/vp9_loopfilter.h"
+#include "vpx_ports/emmintrin_compat.h"

 prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
 prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
--- a/vp9/common/x86/vp9_mask_sse3.asm
+++ b/vp9/common/x86/vp9_mask_sse3.asm
@ -25,7 +25,7 @@
 ;    int yt,
 ;    int ut,
 ;    int vt)
-global sym(vp8_makemask_sse3)
+global sym(vp8_makemask_sse3) PRIVATE
 sym(vp8_makemask_sse3):
    push        rbp
    mov         rbp, rsp
@ -181,7 +181,7 @@ NextPairOfRows:
 ;void int vp8_growmaskmb_sse3(
 ;    unsigned char *om,
 ;    unsigned char *nm,
-global sym(vp8_growmaskmb_sse3)
+global sym(vp8_growmaskmb_sse3) PRIVATE
 sym(vp8_growmaskmb_sse3):
    push        rbp
    mov         rbp, rsp
@ -234,7 +234,7 @@ sym(vp8_growmaskmb_sse3):
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned char *mask)
-global sym(vp8_sad16x16_masked_wmt)
+global sym(vp8_sad16x16_masked_wmt) PRIVATE
 sym(vp8_sad16x16_masked_wmt):
    push        rbp
    mov         rbp, rsp
@ -288,7 +288,7 @@ NextSadRow:
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned char *mask)
-global sym(vp8_sad16x16_unmasked_wmt)
+global sym(vp8_sad16x16_unmasked_wmt) PRIVATE
 sym(vp8_sad16x16_unmasked_wmt):
    push        rbp
    mov         rbp, rsp
@ -343,7 +343,7 @@ next_vp8_sad16x16_unmasked_wmt:
 ;    unsigned char *dst_ptr,
 ;    int  dst_stride,
 ;    unsigned char *mask)
-global sym(vp8_masked_predictor_wmt)
+global sym(vp8_masked_predictor_wmt) PRIVATE
 sym(vp8_masked_predictor_wmt):
    push        rbp
    mov         rbp, rsp
@ -395,7 +395,7 @@ next_vp8_masked_predictor_wmt:
 ;    unsigned char *dst_ptr,
 ;    int  dst_stride,
 ;    unsigned char *mask)
-global sym(vp8_masked_predictor_uv_wmt)
+global sym(vp8_masked_predictor_uv_wmt) PRIVATE
 sym(vp8_masked_predictor_uv_wmt):
    push        rbp
    mov         rbp, rsp
@ -444,7 +444,7 @@ next_vp8_masked_predictor_uv_wmt:
 ;unsigned int vp8_uv_from_y_mask(
 ;    unsigned char *ymask,
 ;    unsigned char *uvmask)
-global sym(vp8_uv_from_y_mask)
+global sym(vp8_uv_from_y_mask) PRIVATE
 sym(vp8_uv_from_y_mask):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_postproc_mmx.asm
+++ b/vp9/common/x86/vp9_postproc_mmx.asm
@ -24,7 +24,7 @@
 ;    int cols,
 ;    int flimit
 ;)
-global sym(vp9_post_proc_down_and_across_mmx)
+global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
 sym(vp9_post_proc_down_and_across_mmx):
    push        rbp
    mov         rbp, rsp
@ -265,7 +265,7 @@ sym(vp9_post_proc_down_and_across_mmx):
 ;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
 ;                             int pitch, int rows, int cols,int flimit)
 extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_mmx)
+global sym(vp9_mbpost_proc_down_mmx) PRIVATE
 sym(vp9_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
@ -465,7 +465,7 @@ sym(vp9_mbpost_proc_down_mmx):
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int Width, unsigned int Height, int Pitch)
 extern sym(rand)
-global sym(vp9_plane_add_noise_mmx)
+global sym(vp9_plane_add_noise_mmx) PRIVATE
 sym(vp9_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ b/vp9/common/x86/vp9_postproc_sse2.asm
@ -21,7 +21,7 @@
 ;    int cols,
 ;    int flimit
 ;)
-global sym(vp9_post_proc_down_and_across_xmm)
+global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
 sym(vp9_post_proc_down_and_across_xmm):
    push        rbp
    mov         rbp, rsp
@ -251,7 +251,7 @@ sym(vp9_post_proc_down_and_across_xmm):
 ;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
 ;                            int pitch, int rows, int cols,int flimit)
 extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_xmm)
+global sym(vp9_mbpost_proc_down_xmm) PRIVATE
 sym(vp9_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
@ -451,7 +451,7 @@ sym(vp9_mbpost_proc_down_xmm):

 ;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
 ;                                int pitch, int rows, int cols,int flimit)
-global sym(vp9_mbpost_proc_across_ip_xmm)
+global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
 sym(vp9_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
@ -630,7 +630,7 @@ sym(vp9_mbpost_proc_across_ip_xmm):
 ;                            unsigned char bothclamp[16],
 ;                            unsigned int Width, unsigned int Height, int Pitch)
 extern sym(rand)
-global sym(vp9_plane_add_noise_wmt)
+global sym(vp9_plane_add_noise_wmt) PRIVATE
 sym(vp9_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_recon_mmx.asm
+++ b/vp9/common/x86/vp9_recon_mmx.asm
@ -11,7 +11,7 @@

 %include "vpx_ports/x86_abi_support.asm"
 ;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon_b_mmx)
+global sym(vp9_recon_b_mmx) PRIVATE
 sym(vp9_recon_b_mmx):
    push        rbp
    mov         rbp, rsp
@ -65,7 +65,7 @@ sym(vp9_recon_b_mmx):
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp9_copy_mem8x8_mmx)
+global sym(vp9_copy_mem8x8_mmx) PRIVATE
 sym(vp9_copy_mem8x8_mmx):
    push        rbp
    mov         rbp, rsp
@ -128,7 +128,7 @@ sym(vp9_copy_mem8x8_mmx):
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp9_copy_mem8x4_mmx)
+global sym(vp9_copy_mem8x4_mmx) PRIVATE
 sym(vp9_copy_mem8x4_mmx):
    push        rbp
    mov         rbp, rsp
@ -172,7 +172,7 @@ sym(vp9_copy_mem8x4_mmx):
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp9_copy_mem16x16_mmx)
+global sym(vp9_copy_mem16x16_mmx) PRIVATE
 sym(vp9_copy_mem16x16_mmx):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_recon_sse2.asm
+++ b/vp9/common/x86/vp9_recon_sse2.asm
@ -11,7 +11,7 @@

 %include "vpx_ports/x86_abi_support.asm"
 ;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon2b_sse2)
+global sym(vp9_recon2b_sse2) PRIVATE
 sym(vp9_recon2b_sse2):
    push        rbp
    mov         rbp, rsp
@ -62,7 +62,7 @@ sym(vp9_recon2b_sse2):


 ;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon4b_sse2)
+global sym(vp9_recon4b_sse2) PRIVATE
 sym(vp9_recon4b_sse2):
    push        rbp
    mov         rbp, rsp
@ -132,7 +132,7 @@ sym(vp9_recon4b_sse2):
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp9_copy_mem16x16_sse2)
+global sym(vp9_copy_mem16x16_sse2) PRIVATE
 sym(vp9_copy_mem16x16_sse2):
    push        rbp
    mov         rbp, rsp
@ -237,7 +237,7 @@ sym(vp9_copy_mem16x16_sse2):
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp9_intra_pred_uv_dc_mmx2)
+global sym(vp9_intra_pred_uv_dc_mmx2) PRIVATE
 sym(vp9_intra_pred_uv_dc_mmx2):
    push        rbp
    mov         rbp, rsp
@ -310,7 +310,7 @@ sym(vp9_intra_pred_uv_dc_mmx2):
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp9_intra_pred_uv_dctop_mmx2)
+global sym(vp9_intra_pred_uv_dctop_mmx2) PRIVATE
 sym(vp9_intra_pred_uv_dctop_mmx2):
    push        rbp
    mov         rbp, rsp
@ -363,7 +363,7 @@ sym(vp9_intra_pred_uv_dctop_mmx2):
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp9_intra_pred_uv_dcleft_mmx2)
+global sym(vp9_intra_pred_uv_dcleft_mmx2) PRIVATE
 sym(vp9_intra_pred_uv_dcleft_mmx2):
    push        rbp
    mov         rbp, rsp
@ -428,7 +428,7 @@ sym(vp9_intra_pred_uv_dcleft_mmx2):
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp9_intra_pred_uv_dc128_mmx)
+global sym(vp9_intra_pred_uv_dc128_mmx) PRIVATE
 sym(vp9_intra_pred_uv_dc128_mmx):
    push        rbp
    mov         rbp, rsp
@ -465,7 +465,7 @@ sym(vp9_intra_pred_uv_dc128_mmx):
 ;    int src_stride,
 ;    )
 %macro vp9_intra_pred_uv_tm 1
-global sym(vp9_intra_pred_uv_tm_%1)
+global sym(vp9_intra_pred_uv_tm_%1) PRIVATE
 sym(vp9_intra_pred_uv_tm_%1):
    push        rbp
    mov         rbp, rsp
@ -545,7 +545,7 @@ vp9_intra_pred_uv_tm ssse3
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp9_intra_pred_uv_ve_mmx)
+global sym(vp9_intra_pred_uv_ve_mmx) PRIVATE
 sym(vp9_intra_pred_uv_ve_mmx):
    push        rbp
    mov         rbp, rsp
@ -585,7 +585,7 @@ sym(vp9_intra_pred_uv_ve_mmx):
 ;    int src_stride,
 ;    )
 %macro vp9_intra_pred_uv_ho 1
-global sym(vp9_intra_pred_uv_ho_%1)
+global sym(vp9_intra_pred_uv_ho_%1) PRIVATE
 sym(vp9_intra_pred_uv_ho_%1):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_recon_wrapper_sse2.c
+++ b/vp9/common/x86/vp9_recon_wrapper_sse2.c
@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_blockd.h"

--- a/vp9/common/x86/vp9_sadmxn_x86.c
+++ b/vp9/common/x86/vp9_sadmxn_x86.c
@ -12,6 +12,7 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/emmintrin_compat.h"

 #if HAVE_SSE2
 unsigned int vp9_sad16x3_sse2(
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@ -30,7 +30,7 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d8_v8_ssse3)
+global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
 sym(vp9_filter_block1d8_v8_ssse3):
    push        rbp
    mov         rbp, rsp
@ -148,7 +148,7 @@ sym(vp9_filter_block1d8_v8_ssse3):
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d16_v8_ssse3)
+global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
 sym(vp9_filter_block1d16_v8_ssse3):
    push        rbp
    mov         rbp, rsp
@ -298,7 +298,7 @@ sym(vp9_filter_block1d16_v8_ssse3):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d8_h8_ssse3)
+global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
 sym(vp9_filter_block1d8_h8_ssse3):
    push        rbp
    mov         rbp, rsp
@ -405,7 +405,7 @@ sym(vp9_filter_block1d8_h8_ssse3):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d16_h8_ssse3)
+global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
 sym(vp9_filter_block1d16_h8_ssse3):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_subpixel_mmx.asm
+++ b/vp9/common/x86/vp9_subpixel_mmx.asm
@ -27,7 +27,7 @@
 ;    unsigned int    output_width,
 ;    short           * vp9_filter
 ;)
-global sym(vp9_filter_block1d_h6_mmx)
+global sym(vp9_filter_block1d_h6_mmx) PRIVATE
 sym(vp9_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
@ -124,7 +124,7 @@ sym(vp9_filter_block1d_h6_mmx):
 ;   unsigned int output_width,
 ;   short * vp9_filter
 ;)
-global sym(vp9_filter_block1dc_v6_mmx)
+global sym(vp9_filter_block1dc_v6_mmx) PRIVATE
 sym(vp9_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_subpixel_sse2.asm
+++ b/vp9/common/x86/vp9_subpixel_sse2.asm
@ -32,7 +32,7 @@
 ;    unsigned int    output_width,
 ;    short           *vp9_filter
 ;)
-global sym(vp9_filter_block1d8_h6_sse2)
+global sym(vp9_filter_block1d8_h6_sse2) PRIVATE
 sym(vp9_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
@ -152,7 +152,7 @@ sym(vp9_filter_block1d8_h6_sse2):
 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
 ; rows each iteration to take advantage of the 128 bits operations.
 ;*************************************************************************************/
-global sym(vp9_filter_block1d16_h6_sse2)
+global sym(vp9_filter_block1d16_h6_sse2) PRIVATE
 sym(vp9_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
@ -328,7 +328,7 @@ sym(vp9_filter_block1d16_h6_sse2):
 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
 ; input pixel array has output_height rows.
 ;*************************************************************************************/
-global sym(vp9_filter_block1d8_v6_sse2)
+global sym(vp9_filter_block1d8_v6_sse2) PRIVATE
 sym(vp9_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
@ -423,7 +423,7 @@ sym(vp9_filter_block1d8_v6_sse2):
 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
 ; input pixel array has output_height rows.
 ;*************************************************************************************/
-global sym(vp9_filter_block1d16_v6_sse2)
+global sym(vp9_filter_block1d16_v6_sse2) PRIVATE
 sym(vp9_filter_block1d16_v6_sse2):
    push        rbp
    mov         rbp, rsp
@ -533,7 +533,7 @@ sym(vp9_filter_block1d16_v6_sse2):
 ;    const short    *vp9_filter
 ;)
 ; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d8_h6_only_sse2)
+global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE
 sym(vp9_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
@ -646,7 +646,7 @@ sym(vp9_filter_block1d8_h6_only_sse2):
 ;    const short    *vp9_filter
 ;)
 ; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d16_h6_only_sse2)
+global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE
 sym(vp9_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
@ -811,7 +811,7 @@ sym(vp9_filter_block1d16_h6_only_sse2):
 ;    const short    *vp9_filter
 ;)
 ; Second-pass filter only when xoffset==0
-global sym(vp9_filter_block1d8_v6_only_sse2)
+global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE
 sym(vp9_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
@ -903,7 +903,7 @@ sym(vp9_filter_block1d8_v6_only_sse2):
 ;    unsigned int    output_height,
 ;    unsigned int    output_width
 ;)
-global sym(vp9_unpack_block1d16_h6_sse2)
+global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE
 sym(vp9_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
@ -962,7 +962,7 @@ sym(vp9_unpack_block1d16_h6_sse2):
 ;    int dst_pitch
 ;)
 extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict16x16_sse2)
+global sym(vp9_bilinear_predict16x16_sse2) PRIVATE
 sym(vp9_bilinear_predict16x16_sse2):
    push        rbp
    mov         rbp, rsp
@ -1231,7 +1231,7 @@ sym(vp9_bilinear_predict16x16_sse2):
 ;    int dst_pitch
 ;)
 extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict8x8_sse2)
+global sym(vp9_bilinear_predict8x8_sse2) PRIVATE
 sym(vp9_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
--- a/vp9/common/x86/vp9_subpixel_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_ssse3.asm
@ -34,7 +34,7 @@
 ;    unsigned int    output_height,
 ;    unsigned int    vp9_filter_index
 ;)
-global sym(vp9_filter_block1d8_h6_ssse3)
+global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE
 sym(vp9_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
@ -177,7 +177,7 @@ vp9_filter_block1d8_h4_ssse3:
 ;    unsigned int    output_height,
 ;    unsigned int    vp9_filter_index
 ;)
-global sym(vp9_filter_block1d16_h6_ssse3)
+global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE
 sym(vp9_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
@ -284,7 +284,7 @@ sym(vp9_filter_block1d16_h6_ssse3):
 ;    unsigned int    output_height,
 ;    unsigned int    vp9_filter_index
 ;)
-global sym(vp9_filter_block1d4_h6_ssse3)
+global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE
 sym(vp9_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
@ -413,7 +413,7 @@ sym(vp9_filter_block1d4_h6_ssse3):
 ;    unsigned int   output_height,
 ;    unsigned int   vp9_filter_index
 ;)
-global sym(vp9_filter_block1d16_v6_ssse3)
+global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE
 sym(vp9_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
@ -601,7 +601,7 @@ sym(vp9_filter_block1d16_v6_ssse3):
 ;    unsigned int   output_height,
 ;    unsigned int   vp9_filter_index
 ;)
-global sym(vp9_filter_block1d8_v6_ssse3)
+global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE
 sym(vp9_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
@ -741,7 +741,7 @@ sym(vp9_filter_block1d8_v6_ssse3):
 ;    unsigned int   output_height,
 ;    unsigned int   vp9_filter_index
 ;)
-global sym(vp9_filter_block1d4_v6_ssse3)
+global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE
 sym(vp9_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
@ -880,7 +880,7 @@ sym(vp9_filter_block1d4_v6_ssse3):
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp9_bilinear_predict16x16_ssse3)
+global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE
 sym(vp9_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
@ -1143,7 +1143,7 @@ sym(vp9_bilinear_predict16x16_ssse3):
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp9_bilinear_predict8x8_ssse3)
+global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE
 sym(vp9_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@ -13,7 +13,7 @@

 #include <stddef.h>
 #include <limits.h>
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"

--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@ -264,7 +264,8 @@ static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
  if (tx_type != DCT_DCT) {
    vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff,
                                    xd->block[0].dequant, xd->predictor,
-                                    xd->dst.y_buffer, 16, xd->dst.y_stride);
+                                    xd->dst.y_buffer, 16, xd->dst.y_stride,
+                                    xd->eobs[0]);
  } else {
    vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
                               xd->predictor, xd->dst.y_buffer,
@ -310,7 +311,8 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
      }
      tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride);
+        vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride,
+                                      xd->eobs[idx]);
      } else {
        vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride,
                                   0, xd->eobs[idx]);
@ -409,7 +411,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
          vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                    b->dequant, b->predictor,
                                    *(b->base_dst) + b->dst, 16,
-                                    b->dst_stride);
+                                    b->dst_stride, b->eob);
        } else {
          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
                               *(b->base_dst) + b->dst, 16, b->dst_stride);
@ -454,7 +456,8 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
      if (tx_type != DCT_DCT) {
        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                  b->dequant, b->predictor,
-                                  *(b->base_dst) + b->dst, 16, b->dst_stride);
+                                  *(b->base_dst) + b->dst, 16, b->dst_stride,
+                                  b->eob);
      } else {
        vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
                             *(b->base_dst) + b->dst, 16, b->dst_stride);
@ -516,7 +519,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
          vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
                                    b->dequant, b->predictor,
                                    *(b->base_dst) + b->dst, 16,
-                                    b->dst_stride);
+                                    b->dst_stride, b->eob);
        } else {
          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
                               *(b->base_dst) + b->dst, 16, b->dst_stride);
@ -570,7 +573,7 @@ static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
        tx_type, xd->qcoeff, xd->block[0].dequant,
        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_stride, xd->dst.y_stride);
+        xd->dst.y_stride, xd->dst.y_stride, xd->block[0].eob);
  } else {
    vp9_dequant_idct_add_16x16(
        xd->qcoeff, xd->block[0].dequant,
@ -609,7 +612,7 @@ static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
            + x_idx * 16 + (i & 1) * 8,
            xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride
            + x_idx * 16 + (i & 1) * 8,
-            stride, stride);
+            stride, stride, b->eob);
      } else {
        vp9_dequant_idct_add_8x8_c(
            q, dq,
@ -666,7 +669,7 @@ static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
            + x_idx * 16 + (i & 3) * 4,
            xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride
            + x_idx * 16 + (i & 3) * 4,
-            xd->dst.y_stride, xd->dst.y_stride);
+            xd->dst.y_stride, xd->dst.y_stride, b->eob);
      } else {
        vp9_dequant_idct_add_c(
            b->qcoeff, b->dequant,
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@ -14,7 +14,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/common/vp9_common.h"
-
 static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
                         uint8_t *dest, int stride, int width, int height) {
  int r, c;
@ -61,7 +60,7 @@ void vp9_dequantize_b_c(BLOCKD *d) {
 void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
                               const int16_t *dq,
                               uint8_t *pred, uint8_t *dest,
-                               int pitch, int stride) {
+                               int pitch, int stride, uint16_t eobs) {
  int16_t output[16];
  int16_t *diff_ptr = output;
  int i;
@ -70,7 +69,7 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
    input[i] = dq[i] * input[i];
  }

-  vp9_ihtllm(input, output, 4 << 1, tx_type, 4);
+  vp9_ihtllm(input, output, 4 << 1, tx_type, 4, eobs);

  vpx_memset(input, 0, 32);

@ -80,21 +79,25 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
                                   const int16_t *dq,
                                   uint8_t *pred, uint8_t *dest,
-                                   int pitch, int stride) {
+                                   int pitch, int stride, uint16_t eobs) {
  int16_t output[64];
  int16_t *diff_ptr = output;
  int i;
+  if (eobs == 0) {
+    /* All DCT coefficients are zero: copy the prediction unchanged. */
+    vp9_copy_mem8x8(pred, pitch, dest, stride);
+  } else if (eobs > 0) {
+    input[0] = dq[0] * input[0];
+    for (i = 1; i < 64; i++) {
+      input[i] = dq[1] * input[i];
+    }

-  input[0] = dq[0] * input[0];
-  for (i = 1; i < 64; i++) {
-    input[i] = dq[1] * input[i];
+    vp9_ihtllm(input, output, 16, tx_type, 8, eobs);
+
+    vpx_memset(input, 0, 128);
+
+    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
  }
-
-  vp9_ihtllm(input, output, 16, tx_type, 8);
-
-  vpx_memset(input, 0, 128);
-
-  add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
 }

 void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
@ -256,26 +259,31 @@ void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,

 void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
                                     const int16_t *dq, uint8_t *pred,
-                                     uint8_t *dest, int pitch, int stride) {
+                                     uint8_t *dest, int pitch, int stride,
+                                     uint16_t eobs) {
  int16_t output[256];
  int16_t *diff_ptr = output;
  int i;
+  if (eobs == 0) {
+    /* All DCT coefficients are zero: copy the prediction unchanged. */
+    vp9_copy_mem16x16(pred, pitch, dest, stride);
+  } else if (eobs > 0) {
+    input[0]= input[0] * dq[0];

-  input[0]= input[0] * dq[0];
+    // recover quantizer for 4 4x4 blocks
+    for (i = 1; i < 256; i++)
+      input[i] = input[i] * dq[1];

-  // recover quantizer for 4 4x4 blocks
-  for (i = 1; i < 256; i++)
-    input[i] = input[i] * dq[1];
+    // inverse hybrid transform
+    vp9_ihtllm(input, output, 32, tx_type, 16, eobs);

-  // inverse hybrid transform
-  vp9_ihtllm(input, output, 32, tx_type, 16);
+    // the idct halves ( >> 1) the pitch
+    // vp9_short_idct16x16_c(input, output, 32);

-  // the idct halves ( >> 1) the pitch
-  // vp9_short_idct16x16_c(input, output, 32);
+    vpx_memset(input, 0, 512);

-  vpx_memset(input, 0, 512);
-
-  add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
+  }
 }

 void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
--- a/vp9/decoder/vp9_dequantize.h
+++ b/vp9/decoder/vp9_dequantize.h
@ -11,108 +11,93 @@

 #ifndef VP9_DECODER_VP9_DEQUANTIZE_H_
 #define VP9_DECODER_VP9_DEQUANTIZE_H_
-
 #include "vp9/common/vp9_blockd.h"

 #if CONFIG_LOSSLESS
-extern void vp9_dequant_idct_add_lossless_c(int16_t *input,
-                                            const int16_t *dq,
-                                            uint8_t *pred,
-                                            uint8_t *output,
+extern void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
+                                            unsigned char *pred,
+                                            unsigned char *output,
                                            int pitch, int stride);
-extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input,
-                                               const int16_t *dq,
-                                               uint8_t *pred,
-                                               uint8_t *output,
+extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
+                                               unsigned char *pred,
+                                               unsigned char *output,
                                               int pitch, int stride, int dc);
 extern void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,
                                                       const int16_t *dq,
-                                                       uint8_t *pre,
-                                                       uint8_t *dst,
+                                                       unsigned char *pre,
+                                                       unsigned char *dst,
                                                       int stride,
                                                       uint16_t *eobs,
                                                       const int16_t *dc);
-extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q,
-                                                    const int16_t *dq,
-                                                    uint8_t *pre,
-                                                    uint8_t *dst,
+extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
+                                                    unsigned char *pre,
+                                                    unsigned char *dst,
                                                    int stride,
                                                    uint16_t *eobs);
-extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q,
-                                                     const int16_t *dq,
-                                                     uint8_t *pre,
-                                                     uint8_t *dst_u,
-                                                     uint8_t *dst_v,
+extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
+                                                     unsigned char *pre,
+                                                     unsigned char *dst_u,
+                                                     unsigned char *dst_v,
                                                     int stride,
                                                     uint16_t *eobs);
-#endif  // CONFIG_LOSSLESS
+#endif

 typedef void (*vp9_dequant_idct_add_fn_t)(int16_t *input, const int16_t *dq,
-                                          uint8_t *pred, uint8_t *output,
-                                          int pitch, int stride);
+    unsigned char *pred, unsigned char *output, int pitch, int stride);
 typedef void(*vp9_dequant_dc_idct_add_fn_t)(int16_t *input, const int16_t *dq,
-                                            uint8_t *pred, uint8_t *output,
-                                            int pitch, int stride, int dc);
+    unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);

-typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q,
-                                                    const int16_t *dq,
-                                                    uint8_t *pre, uint8_t *dst,
-                                                    int stride, uint16_t *eobs,
-                                                    const int16_t *dc);
+typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
+    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs,
+    const int16_t *dc);
 typedef void(*vp9_dequant_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
-                                                 uint8_t *pre, uint8_t *dst,
-                                                 int stride, uint16_t *eobs);
+    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs);
 typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(int16_t *q, const int16_t *dq,
-                                                  uint8_t *pre, uint8_t *dst_u,
-                                                  uint8_t *dst_v, int stride,
-                                                  uint16_t *eobs);
+    unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
+    uint16_t *eobs);

-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
-                               const int16_t *dq,
-                               uint8_t *pred, uint8_t *dest,
-                               int pitch, int stride);
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
+                                    unsigned char *pred, unsigned char *dest,
+                                    int pitch, int stride, uint16_t eobs);

 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
-                                   const int16_t *dq, uint8_t *pred,
-                                   uint8_t *dest, int pitch, int stride);
+                                   const int16_t *dq, unsigned char *pred,
+                                   unsigned char *dest, int pitch, int stride,
+                                   uint16_t eobs);

 void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
-                                     const int16_t *dq, uint8_t *pred,
-                                     uint8_t *dest,
-                                     int pitch, int stride);
+                                     const int16_t *dq, unsigned char *pred,
+                                     unsigned char *dest,
+                                     int pitch, int stride, uint16_t eobs);

 #if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q,
-                                                   const int16_t *dq,
-                                                   uint8_t *dst,
+void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
+                                                   unsigned char *dst,
                                                   int stride,
                                                   uint16_t *eobs,
                                                   const int16_t *dc,
                                                   MACROBLOCKD *xd);

-void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q,
-                                                   const int16_t *dq,
-                                                   uint8_t *dst,
+void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
+                                                   unsigned char *dst,
                                                   int stride,
                                                   uint16_t *eobs,
                                                   const int16_t *dc,
                                                   MACROBLOCKD *xd);

-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q,
-                                                 const int16_t *dq,
-                                                 uint8_t *dstu,
-                                                 uint8_t *dstv,
+void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
                                                 int stride,
                                                 uint16_t *eobs,
                                                 MACROBLOCKD *xd);

-void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q,
-                                                 const int16_t *dq,
-                                                 uint8_t *dstu,
-                                                 uint8_t *dstv,
+void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
                                                 int stride,
                                                 uint16_t *eobs,
                                                 MACROBLOCKD *xd);
-#endif  // CONFIG_SUPERBLOCKS
+#endif

-#endif  // VP9_DECODER_VP9_DEQUANTIZE_H_
+#endif
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@ -10,8 +10,7 @@

 #ifndef VP9_DECODER_VP9_ONYXD_INT_H_
 #define VP9_DECODER_VP9_ONYXD_INT_H_
-
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/decoder/vp9_onyxd.h"
 #include "vp9/decoder/vp9_treereader.h"
 #include "vp9/common/vp9_onyxc_int.h"
--- a/vp9/decoder/x86/vp9_idct_blk_mmx.c
+++ b/vp9/decoder/x86/vp9_idct_blk_mmx.c
@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/decoder/vp9_dequantize.h"
 #include "vp9/decoder/x86/vp9_idct_mmx.h"
--- a/vp9/decoder/x86/vp9_idct_blk_sse2.c
+++ b/vp9/decoder/x86/vp9_idct_blk_sse2.c
@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/decoder/vp9_dequantize.h"

--- a/vp9/decoder/x86/vp9_x86_dsystemdependent.c
+++ b/vp9/decoder/x86/vp9_x86_dsystemdependent.c
@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx_ports/x86.h"
 #include "vp9/decoder/vp9_onyxd_int.h"

--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@ -11,7 +11,7 @@

 #include <assert.h>
 #include <math.h>
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_systemdependent.h"

 #include "vp9/common/vp9_blockd.h"
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@ -9,7 +9,7 @@
 */


-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
@ -2123,8 +2123,6 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
  MACROBLOCK *const x = &cpi->mb;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  unsigned char *segment_id = &mbmi->segment_id;
-  int seg_ref_active;
  unsigned char ref_pred_flag;

 #if CONFIG_SUPERBLOCKS
@ -2170,8 +2168,6 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,

    vp9_update_zbin_extra(cpi, x);

-    seg_ref_active = vp9_segfeature_active(xd, *segment_id, SEG_LVL_REF_FRAME);
-
    // SET VARIOUS PREDICTION FLAGS

    // Did the chosen reference frame match its predicted value.
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/common/vp9_reconintra.h"
@ -70,7 +70,7 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
  if (tx_type != DCT_DCT) {
    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
    vp9_ht_quantize_b_4x4(be, b, tx_type);
-    vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4);
+    vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
  } else {
    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
    x->quantize_b_4x4(be, b) ;
@ -191,7 +191,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
                tx_type, 8);
      x->quantize_b_8x8(x->block + idx, xd->block + idx);
      vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
-                   tx_type, 8);
+                   tx_type, 8, xd->block[idx].eob);
    } else {
      x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
      x->quantize_b_8x8(x->block + idx, xd->block + idx);
@ -205,7 +205,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
      if (tx_type != DCT_DCT) {
        vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
        vp9_ht_quantize_b_4x4(be, b, tx_type);
-        vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4);
+        vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
      } else {
        x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
        x->quantize_b_4x4(be, b);
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@ -8,7 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_quantize.h"
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@ -11,7 +11,7 @@
 #ifndef VP9_ENCODER_VP9_ENCODEMB_H_
 #define VP9_ENCODER_VP9_ENCODEMB_H_

-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/encoder/vp9_block.h"

 typedef struct {
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@ -41,9 +41,10 @@
 #define RMAX       128.0
 #define GF_RMAX    96.0
 #define ERR_DIVISOR   150.0
+#define MIN_DECAY_FACTOR 0.1

-#define KF_MB_INTRA_MIN 300
-#define GF_MB_INTRA_MIN 200
+#define KF_MB_INTRA_MIN 150
+#define GF_MB_INTRA_MIN 100

 #define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)

@ -800,6 +801,7 @@ static double bitcost(double prob) {

 static long long estimate_modemvcost(VP9_COMP *cpi,
                                     FIRSTPASS_STATS *fpstats) {
+#if 0
  int mv_cost;
  int mode_cost;

@ -828,6 +830,7 @@ static long long estimate_modemvcost(VP9_COMP *cpi,

  // return mv_cost + mode_cost;
  // TODO PGW Fix overhead costs for extended Q range
+#endif
  return 0;
 }

@ -1405,10 +1408,9 @@ static int calc_arf_boost(
    // Cumulative effect of prediction quality decay
    if (!flash_detected) {
      decay_accumulator =
-        decay_accumulator *
-        get_prediction_decay_rate(cpi, &this_frame);
-      decay_accumulator =
-        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                          ? MIN_DECAY_FACTOR : decay_accumulator;
    }

    boost_score += (decay_accumulator *
@ -1443,10 +1445,9 @@ static int calc_arf_boost(
    // Cumulative effect of prediction quality decay
    if (!flash_detected) {
      decay_accumulator =
-        decay_accumulator *
-        get_prediction_decay_rate(cpi, &this_frame);
-      decay_accumulator =
-        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                          ? MIN_DECAY_FACTOR : decay_accumulator;
    }

    boost_score += (decay_accumulator *
@ -1632,7 +1633,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
        ((mv_ratio_accumulator > 100.0) ||
         (abs_mv_in_out_accumulator > 3.0) ||
         (mv_in_out_accumulator < -2.0) ||
-         ((boost_score - old_boost_score) < 12.5))
+         ((boost_score - old_boost_score) < IIFACTOR))
      )) {
      boost_score = old_boost_score;
      break;
@ -1952,12 +1953,9 @@ void vp9_second_pass(VP9_COMP *cpi) {
  FIRSTPASS_STATS this_frame;
  FIRSTPASS_STATS this_frame_copy;

-  double this_frame_error;
  double this_frame_intra_error;
  double this_frame_coded_error;

-  FIRSTPASS_STATS *start_pos;
-
  int overhead_bits;

  if (!cpi->twopass.stats_in) {
@ -1971,12 +1969,9 @@ void vp9_second_pass(VP9_COMP *cpi) {
  if (EOF == input_stats(cpi, &this_frame))
    return;

-  this_frame_error = this_frame.ssim_weighted_pred_err;
  this_frame_intra_error = this_frame.intra_error;
  this_frame_coded_error = this_frame.coded_error;

-  start_pos = cpi->twopass.stats_in;
-
  // keyframe and section processing !
  if (cpi->twopass.frames_to_key == 0) {
    // Define next KF group and assign bits to it
@ -2396,7 +2391,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
    if (!detect_flash(cpi, 0)) {
      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
      decay_accumulator = decay_accumulator * loop_decay_rate;
-      decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                            ? MIN_DECAY_FACTOR : decay_accumulator;
    }

    boost_score += (decay_accumulator * r);
@ -2436,14 +2432,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
    int allocation_chunks;
    int alt_kf_bits;

-    if (kf_boost < 300) {
-      kf_boost += (cpi->twopass.frames_to_key * 3);
-      if (kf_boost > 300)
-        kf_boost = 300;
-    }
+    if (kf_boost < (cpi->twopass.frames_to_key * 5))
+      kf_boost = (cpi->twopass.frames_to_key * 5);

-    if (kf_boost < 250)                                                      // Min KF boost
-      kf_boost = 250;
+    if (kf_boost < 300) // Min KF boost
+      kf_boost = 300;

    // Make a note of baseline boost and the zero motion
    // accumulator value for use elsewhere.
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@ -27,7 +27,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
  BLOCKD *d = &xd->block[0];
  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
  unsigned int best_err;
-  int step_param, further_steps;
+  int step_param;

  int tmp_col_min = x->mv_col_min;
  int tmp_col_max = x->mv_col_max;
@ -38,10 +38,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
  // Further step/diamond searches as necessary
  if (cpi->Speed < 8) {
    step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
-    further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
  } else {
    step_param = cpi->sf.first_step + 2;
-    further_steps = 0;
  }

  vp9_clamp_mv_min_max(x, ref_mv);
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@ -12,7 +12,7 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include <stdio.h>
 #include <limits.h>
 #include <math.h>
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@ -148,7 +148,6 @@ static int calculate_minq_index(double maxq,
                                double x3, double x2, double x, double c) {
  int i;
  double minqtarget;
-  double thisq;

  minqtarget = ((x3 * maxq * maxq * maxq) +
                (x2 * maxq * maxq) +
@ -159,7 +158,6 @@ static int calculate_minq_index(double maxq,
    minqtarget = maxq;

  for (i = 0; i < QINDEX_RANGE; i++) {
-    thisq = vp9_convert_qindex_to_q(i);
    if (minqtarget <= vp9_convert_qindex_to_q(i))
      return i;
  }
@ -2925,8 +2923,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,

  int Loop = FALSE;
  int loop_count;
-  int this_q;
-  int last_zbin_oq;

  int q_low;
  int q_high;
@ -2940,8 +2936,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
  int overshoot_seen = FALSE;
  int undershoot_seen = FALSE;

-  int loop_size_estimate = 0;
-
  SPEED_FEATURES *sf = &cpi->sf;
 #if RESET_FOREACH_FILTER
  int q_low0;
@ -2949,6 +2943,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
  int zbin_oq_high0;
  int zbin_oq_low0 = 0;
  int Q0;
+  int last_zbin_oq;
  int last_zbin_oq0;
  int active_best_quality0;
  int active_worst_quality0;
@ -3163,7 +3158,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
    // Determine initial Q to try
    Q = vp9_regulate_q(cpi, cpi->this_frame_target);
  }
+#if RESET_FOREACH_FILTER
  last_zbin_oq = cpi->zbin_over_quant;
+#endif

  // Set highest allowed value for Zbin over quant
  if (cm->frame_type == KEY_FRAME)
@ -3267,7 +3264,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
    vp9_clear_system_state();  // __asm emms;

    vp9_set_quantizer(cpi, Q);
-    this_q = Q;

    if (loop_count == 0) {

@ -3503,7 +3499,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,

      // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
      Loop = ((Q != last_q)) ? TRUE : FALSE;
+#if RESET_FOREACH_FILTER
      last_zbin_oq = cpi->zbin_over_quant;
+#endif
    } else
      Loop = FALSE;

@ -3692,9 +3690,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   * needed in motion search besides loopfilter */
  cm->last_frame_type = cm->frame_type;

-  // Keep a copy of the size estimate used in the loop
-  loop_size_estimate = cpi->projected_frame_size;
-
  // Update rate control heuristics
  cpi->total_byte_count += (*size);
  cpi->projected_frame_size = (*size) << 3;
@ -3795,7 +3790,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
              "%10.3f %8d %10d %10d %10d\n",
              cpi->common.current_video_frame, cpi->this_frame_target,
-              cpi->projected_frame_size, loop_size_estimate,
+              cpi->projected_frame_size, 0, //loop_size_estimate,
              (cpi->projected_frame_size - cpi->this_frame_target),
              (int)cpi->total_target_vs_actual,
              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
@ -3825,7 +3820,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
              "%8d %10d %10d %10d\n",
              cpi->common.current_video_frame,
              cpi->this_frame_target, cpi->projected_frame_size,
-              loop_size_estimate,
+              0, //loop_size_estimate,
              (cpi->projected_frame_size - cpi->this_frame_target),
              (int)cpi->total_target_vs_actual,
              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@ -13,7 +13,7 @@
 #define VP9_ENCODER_VP9_ONYX_INT_H_

 #include <stdio.h>
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_onyx.h"
 #include "vp9/encoder/vp9_treewriter.h"
 #include "vp9/encoder/vp9_tokenize.h"
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@ -24,11 +24,9 @@ void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
  uint8_t *src_y, *dst_y;
  int yheight;
  int ystride;
-  int border;
  int yoffset;
  int linestocopy;

-  border   = src_ybc->border;
  yheight  = src_ybc->y_height;
  ystride  = src_ybc->y_stride;

--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@ -1328,7 +1328,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,

  // inverse transform
  if (best_tx_type != DCT_DCT)
-    vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4);
+    vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob);
  else
    xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32);

@ -1518,7 +1518,7 @@ static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
                                          int *skippable,
                                          int64_t txfm_cache[NB_TXFM_MODES]) {
  MB_PREDICTION_MODE mode;
-  TX_SIZE UNINITIALIZED_IS_SAFE(txfm_size);
+  TX_SIZE txfm_size = 0;
  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
 #if CONFIG_COMP_INTRA_PRED
  MB_PREDICTION_MODE mode2;
@ -1562,7 +1562,6 @@ static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,

      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);

-
      if (this_rd < best_rd) {
        mode_selected = mode;
        txfm_size = mbmi->txfm_size;
@ -1796,6 +1795,7 @@ static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
    mic->bmi[ib].as_mode.second = best_second_mode;
 #endif
  }
+
  *Rate = cost;
  *rate_y = tot_rate_y;
  *Distortion = distortion;
@ -3889,6 +3889,9 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  unsigned int ref_costs[MAX_REF_FRAMES];
  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];

+  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
+                                             cpi->common.y1dc_delta_q);
+
  vpx_memset(mode8x8, 0, sizeof(mode8x8));
  vpx_memset(&frame_mv, 0, sizeof(frame_mv));
  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
@ -4086,16 +4089,17 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    if (!mbmi->ref_frame) {
      switch (this_mode) {
        default:
-        case DC_PRED:
        case V_PRED:
        case H_PRED:
-        case TM_PRED:
        case D45_PRED:
        case D135_PRED:
        case D117_PRED:
        case D153_PRED:
        case D27_PRED:
        case D63_PRED:
+          rate2 += intra_cost_penalty;
+        case DC_PRED:
+        case TM_PRED:
          mbmi->ref_frame = INTRA_FRAME;
          // FIXME compound intra prediction
          vp9_build_intra_predictors_mby(&x->e_mbd);
@ -4129,6 +4133,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
                                             cpi->update_context);
          rate2 += rate;
+          rate2 += intra_cost_penalty;
          distortion2 += distortion;

          if (tmp_rd < best_yrd) {
@ -4221,6 +4226,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
          }

          rate2 += rate;
+          rate2 += intra_cost_penalty;
          distortion2 += distortion;

          /* TODO: uv rate maybe over-estimated here since there is UV intra
@ -4730,7 +4736,7 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
  int mode16x16;
  int mode8x8[2][4];
  int dist;
-  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
+  int modeuv, uv_intra_skippable, uv_intra_skippable_8x8;
  int y_intra16x16_skippable = 0;
  int64_t txfm_cache[NB_TXFM_MODES];
  TX_SIZE txfm_size_16x16;
@ -4743,13 +4749,11 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
  if (cpi->common.txfm_mode != ONLY_4X4) {
    rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
                                &distuv8x8, &uv_intra_skippable_8x8);
-    modeuv8x8 = mbmi->uv_mode;
  } else {
    uv_intra_skippable_8x8 = uv_intra_skippable;
    rateuv8x8 = rateuv;
    distuv8x8 = distuv;
    rateuv8x8_tokenonly = rateuv_tokenonly;
-    modeuv8x8 = modeuv;
  }

  // current macroblock under rate-distortion optimization test loop
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@ -11,7 +11,7 @@

 #include <stdlib.h>
 #include "vp9/common/vp9_sadmxn.h"
-#include "vpx_ports/config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"

 unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@ -130,7 +130,6 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                              int error_thresh) {
  MACROBLOCK *x = &cpi->mb;
  int step_param;
-  int further_steps;
  int sadpb = x->sadperbit16;
  int bestsme = INT_MAX;

@ -164,11 +163,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
  if (cpi->Speed < 8) {
    step_param = cpi->sf.first_step +
                 ((cpi->Speed > 5) ? 1 : 0);
-    further_steps =
-      (cpi->sf.max_step_search_steps - 1) - step_param;
  } else {
    step_param = cpi->sf.first_step + 2;
-    further_steps = 0;
  }

  /*cpi->sf.search_method == HEX*/
--- a/vp9/encoder/x86/vp9_dct_mmx.asm
+++ b/vp9/encoder/x86/vp9_dct_mmx.asm
@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"

 ;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_mmx)
+global sym(vp9_short_fdct4x4_mmx) PRIVATE
 sym(vp9_short_fdct4x4_mmx):
    push        rbp
    mov         rbp,        rsp
--- a/Show More
+++ b/Show More