Compare commits

...

268 Commits

Author SHA1 Message Date
John Koleszar
c40ea3f0c6 Use memcpy for save/restore_predictor
The save_predictor and restore_predictor functions perform a 1D backup
of the 2D predictor block. Use memcpy to get a faster copy operation
than 4 individual load/stores.

Change-Id: Ia609ed71fbff1ade6fa677186efce9ee29167fd6
2011-02-15 10:22:21 -05:00
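
Note: a minimal sketch in C of the idea above; the 16-byte row size and the helper name are hypothetical, since the actual predictor layout is not shown in this log.

    #include <string.h>

    /* sketch only: back up one hypothetical 16-byte predictor row */
    static void save_predictor_row(unsigned char *backup,
                                   const unsigned char *pred)
    {
        /* one bulk copy per row ... */
        memcpy(backup, pred, 16);
        /* ... instead of four separate 32-bit load/stores such as
         * ((unsigned int *)backup)[0] = ((const unsigned int *)pred)[0]; etc. */
    }
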
Yunqing Wang
7725a7eb56 Merge "Improve vp8_sad16x16_sse3 function" 2011-02-14 14:09:25 -08:00
Yaowu Xu
27dad21548 Merge "Improved vp8_rd_pick_intra_mbuv_mode" 2011-02-14 13:58:12 -08:00
Scott LaVarnway
94d4fee08f Improved vp8_rd_pick_intra_mbuv_mode
Eliminated unnecessary calculations. Very small change
to performance.

Change-Id: Ib7213d43c64e36955177c4d47950ff472266f822
2011-02-14 16:34:33 -05:00
Yunqing Wang
2debd5b5f7 Improve vp8_sad16x16_sse3 function
In real-time mode, vp8_sad16x16 function is called heavily in
motion search part. Improvement of this function gives 1.2%
encoding performance gain (real-time mode, tulip clip).

Change-Id: I23c401fc40c061f732a9767e8d383737a179bd58
2011-02-14 16:23:49 -05:00
Yaowu Xu
404e998eb7 Merge "mem leak fix for cpi->tplist" 2011-02-14 11:29:22 -08:00
James Berry
d3dfcde0f7 mem leak fix for cpi->tplist
checks added to make sure that cpi->tplist
is freed correctly in vp8_dealloc_compressor_data
and vp8_alloc_compressor_data.

Change-Id: I66149dbbd25c958800ad94f4379d723191d9680d
2011-02-14 14:02:52 -05:00
Scott LaVarnway
d419b93e3e Improved rd_pick_intra4x4block
Eliminated unnecessary calculations.  Improved performance
by 10% on keyframes and 1.6% overall for the test clip used.

Change-Id: I87671b26af5e2cc439e81d0fee3b15c7cd2a3309
2011-02-14 13:32:58 -05:00
Johann
0ff10bb1f7 Merge "remove assembly detokenizer" 2011-02-14 05:10:16 -08:00
Johann
bb6bcbccda remove assembly detokenizer
hasn't been kept up to date. remove it to avoid confusion.

Change-Id: I52ffde19b59fec5c7a381299ca2e85cb38330be7
2011-02-11 11:09:00 -05:00
Yunqing Wang
353246bd60 Merge "Add improved_mv_pred flag in real-time mode" 2011-02-11 07:20:17 -08:00
Yunqing Wang
9d0b2cbbce Add improved_mv_pred flag in real-time mode
As mentioned in check-in "Improve motion search in real-time mode",
MV prediction calculation causes speed loss for speed 7 and above.
This change added a flag to turn off this calculation for speed>6
in real-time mode.

Change-Id: I9f4ae5a8bf449222d1784b54e7d315fc8347b2d1
2011-02-11 09:59:41 -05:00
Yaowu Xu
4f8a166058 Merge "Redefining good quality speed settings" 2011-02-10 21:38:19 -08:00
Yunqing Wang
6f53e59641 Merge "Improve motion search in real-time mode" 2011-02-10 12:42:44 -08:00
Yunqing Wang
41e6eceb28 Improve motion search in real-time mode
Applied better MV prediction in real-time mode, which improves
the encoding quality.

Used quarter-pixel search instead of iterative sub-pixel search
for speed >=5 to improve encoding performance.

Tests on the test set showed:
1. For speed=-5, quality improvement: 1.7% on AvgPSNR and 2.1%
on SSIM, performance improvement: 3.6% (this includes the
performance loss caused by the MV prediction calculation in "Improve
MV prediction in vp8_pick_inter_mode() for speed>3").
2. For speed=-8, quality improvement: 2.1% on AvgPSNR and 2.5%
on SSIM, but a 6.9% performance decrease because of the MV prediction
calculation. This should be improved later.

Change-Id: I349a96c452bd691081d8c8e3e54419e7f477bebd
2011-02-10 13:40:24 -05:00
Johann
7d8199f0c3 Merge "Adds armv6 optimized variance calculation" 2011-02-10 06:06:46 -08:00
Scott LaVarnway
19054ab6da Redefining good quality speed settings
Created a new speed 1 which is in the middle of the old
speed 0 and speed 1. (for both quality and performance)

Change-Id: I4802133cdb43f359ca787646c090899679dd5d84
2011-02-09 17:18:28 -05:00
James Berry
fffa2a61d7 fixed stride in vp8_temporal_filter_predictors_mb_c
stride would not be calculated correctly for material
with odd sized frame widths.

Change-Id: I1710f6aef9ebb93d36249c9239c68c5baa9791f8
2011-02-09 16:55:39 -05:00
John Koleszar
c2b43164bd Merge "correct cost for implicit bit in mvs" 2011-02-09 11:20:12 -08:00
John Koleszar
9954d05ca6 correct cost for implicit bit in mvs
Use 0xFFF0 vice 240 (0xF0) for determining whether the sometimes
implicit bit 3 will be transmitted. This is consistent with the decoder
and encode_mvcomponent().

Change-Id: Ic1304d0ab56844bed8236edd1c5243a6767fc6b1
2011-02-09 12:50:17 -05:00
John Koleszar
a39b5af10b Merge "Put more code under #if CONFIG_MULTITHREAD." 2011-02-09 08:31:36 -08:00
Gaute Strokkenes
315e3c2518 Put more code under #if CONFIG_MULTITHREAD.
Change-Id: Icf4b692099d7d249fe3553852b1022b027b28e4b
2011-02-09 11:21:18 -05:00
Scott LaVarnway
85e79ce288 Merge "Added early breakout for vp8_rd_pick_intra4x4mby_modes" 2011-02-09 07:55:04 -08:00
John Koleszar
c96031da69 Merge "vp8e_get_preview fixed for resized frames" 2011-02-09 07:41:40 -08:00
Tero Rintaluoma
cb14764fab Adds armv6 optimized variance calculation
Adds vp8_sub_pixel_variance16x16_armv6 function to encoder. Integrates
ARMv6 optimized bilinear interpolations from vp8/common/arm/armv6
and adds new assembly file for variance16x16 calculation.
 - vp8_filter_block2d_bil_first_pass_armv6   (integrated)
 - vp8_filter_block2d_bil_second_pass_armv6  (integrated)
 - vp8_variance16x16_armv6 (new)
 - bilinearfilter_arm.h (new)
Change-Id: I18a8331ce7d031ceedd6cd415ecacb0c8f3392db
2011-02-09 10:23:43 -05:00
Johann
e5aaac24bb clean up bilinear filter
make reference version of bilinear_filters short.
use reference versions of bilinear_filters and sub_pel_filters when
possible.

recognize that Width was being passed into
filter_block2d_bil_first_pass multiple times. ARM version had already
fixed this. propagate to C.

change references to src_pixels_per_line to src_pitch and standardize on
src/dst (instead of input/output).

recognize that first_pass is only run in the vertical direction and
second_pass only in the horizontal. ARM version had already fixed this.
propagate to C.

Change-Id: I292d376d239a9a7ca37ec2bf03cc0720606983e2
2011-02-08 17:42:54 -05:00
Fritz Koenig
cc17629f30 Merge "build: Change to iOS SDK 4.2" 2011-02-08 13:59:12 -08:00
Scott LaVarnway
13db80c282 Added early breakout for vp8_rd_pick_intra4x4mby_modes
Improved performance of good quality, speed 0 (3% average)
with no average quality loss.

Change-Id: Ica34473f99bd74260eaebde6b132185e09e3c09d
2011-02-08 16:50:43 -05:00
Johann
40dcae9c2e clarify *_offsets.asm differences
it's difficult to mux the *_offsets.c files because of header conflicts.
make three instead, name them consistently and partition the contents
to allow building them as required.

Change-Id: I8f9768c09279f934f44b6c5b0ec363f7943bb796
2011-02-08 16:35:43 -05:00
Fritz Koenig
615c90c948 build: Change to iOS SDK 4.2
Brings configure/build system inline with current iOS SDK.

Change-Id: If391693a80cab371f75708214f3882424ead9e96
2011-02-08 14:50:33 -05:00
James Berry
ddacf1cf69 vp8e_get_preview fixed for resized frames
preview_img d_w and d_h along with w and h
would not be updated for resized frames.

now uses sd.y_width and sd.y_height

Change-Id: I52241de4cc1de5e73f865e668bd70a7cbd954390
2011-02-08 14:27:00 -05:00
Johann
3273c7b679 move one of the offset files
common/arm/vpx_asm_offsets moves up a level. prepare for muxing with
encoder/arm/vpx_vp8_enc_asm_offsets

Change-Id: I89a04a5235447e66571995c9d9b4b6edcb038e24
2011-02-07 11:35:30 -05:00
John Koleszar
eaadfb5869 Merge "Translates -g from LDFLAGS as --debug in armlink_adapter.sh" 2011-02-07 06:21:56 -08:00
John Koleszar
adaf2b697c Merge "remove unused dboolhuff code" 2011-02-07 05:36:26 -08:00
Yunqing Wang
58d2e70fc5 Fix link error in real-time mode
make vp8_mv_pred() and vp8_cal_sad() available in real-time mode.

Change-Id: I71dbae241b486ba943458dcbae552ec4a51689d3
2011-02-07 08:21:14 -05:00
Attila Nagy
0905af38fc Translates -g from LDFLAGS as --debug in armlink_adapter.sh
Change-Id: I23ad88db2149ab788ff39aed8624a7ef0e97da2e
2011-02-07 08:58:19 +02:00
Johann
bb9c95ea53 remove unused dboolhuff code
we were holding on to this "just in case." purge it instead

Change-Id: I77a367b36d0821d731019f2566ecfffdae1d4b8a
2011-02-04 16:00:00 -05:00
Yunqing Wang
350ffe8dae Merge "Improve MV prediction in vp8_pick_inter_mode() for speed>3" 2011-02-04 10:10:15 -08:00
John Koleszar
63fc44dfa5 correct quantizer initialization
The encoder was not correctly catching transitions in the quantizer
deltas. If a delta_q was set, then the quantizer would be reinitialized
on every frame, but if they transitioned to 0, the quantizer would
not be reinitialized, leading to a encode-decode mismatch.

This bug was triggered by commit 999e155, which sets a Y2 delta Q
for very low base Q levels.

Change-Id: Ia6733464a55ee4ff2edbb82c0873980d345446f5
2011-02-04 11:37:47 -05:00
John Koleszar
6bf7e2cc37 Merge "Remove duplicate loopfilter parameters." 2011-02-04 07:07:45 -08:00
Gaute Strokkenes
ffc6aeef14 Remove duplicate loopfilter parameters.
Change-Id: I0d41415e3961c2c9492d342290c1999f9d02e6d8
2011-02-04 14:55:02 +00:00
John Koleszar
c0a9cbebe1 Merge "Delay auto key frame insertion in realtime configuration" 2011-02-04 05:16:15 -08:00
Gaute Strokkenes
bf5f585b0d Make vp8_adjust_mb_lf_value return the updated value rather than
manipulating it in situ via a pointer.

Change-Id: If4a87a4eccd84f39577c0e91e171245f4954c5cf
2011-02-03 19:24:16 +00:00
John Koleszar
209def2d72 Merge "Avoid using an anonymous union." 2011-02-03 09:08:50 -08:00
Scott LaVarnway
4aa12b6c5f Merge "Zero out block mv when an intra mode is selected" 2011-02-03 07:16:52 -08:00
Yunqing Wang
a870315629 Merge "Improved encoder threading" 2011-02-03 05:44:57 -08:00
Gaute Strokkenes
72ebafff51 Avoid using an anonymous union.
Change-Id: I5744269a35e2d696ecf40c1665efd572bfc9b6cb
2011-02-02 15:22:51 +00:00
Attila Nagy
e5904f2d5e Delay auto key frame insertion in realtime configuration
When auto keyframe insertion is enabled and conditions are right (scene change),
the encoder can decide to insert a key frame and does a re-encoding. This can
introduce extra latency. In RT mode we do not re-encode the current
frame but force the next frame to be a key frame.

Change-Id: I15c175fa845ac4c1a1f18bea3676e154669522a7
2011-02-02 13:54:40 +02:00
Scott LaVarnway
07a7c08aef Zero out block mv when an intra mode is selected
instead of each time mode is tested.

Change-Id: Ief0f5586dafde54cc14d348dcecdacb182e7c1d5
2011-02-01 12:55:51 -05:00
Scott LaVarnway
a5ecaca6a7 Removed unnecessary B_MODE_INFO memset.
Change-Id: I2bcef6a8e47f88542861fd1356631ca934e2a0e7
2011-02-01 11:35:08 -05:00
Scott LaVarnway
b18df82e1d Moved rd calculation into vp8_pick_intra4x4mby_modes
Then removed unnecessary code.

Change-Id: I142658815d843c9396b07881dbdd8d387c43c90e
2011-02-01 11:26:04 -05:00
Scott LaVarnway
4e7e79f770 Removed intra_modes from vp8cx_encode_intra_macro_block
Restructured function in order to eliminate the prediction
modes save/restore.  Code cleanup also.

Change-Id: I816e3b910de64d0f0f0ddc2398805c63263191e8
2011-02-01 10:05:35 -05:00
Attila Nagy
385c2a76d1 Improved encoder threading
Reduce the number of sync points by letting each thread
continue immediately with a new MB row.
Better multicore scaling, improves performance by 5-20% on ARM multicore.

Change-Id: Ic97e4d1c4886a842c85dd3539a93cb217188ed1b
2011-02-01 12:17:58 +02:00
Scott LaVarnway
9e7fec216e Removed prediction_error accumulation
from vp8cx_encode_intra_macro_block.  prediction_error is used when
deciding if a frame should be a keyframe.  After reviewing this with
Yaowu, it was pointed out that vp8cx_encode_intra_macro_block
is only called for keyframes, so the accumulation is unnecessary.

Change-Id: Id79dc81b80d4f5d124f3a0dba1b923887e2e1ec8
2011-01-31 19:53:02 -05:00
Scott LaVarnway
317f0da91e Removed last_auto_filter_prediction_error
last_auto_filter_prediction_error is not really used.

Change-Id: Ic6e56c4076bbd250ef783ee1be46964c85f62864
2011-01-31 19:41:09 -05:00
Scott LaVarnway
4a15e55793 Possible bug in vp8cx_encode_intra_macro_block
vp8_pick_intra4x4mby_modes uses the passed in distortion
for an early breakout.  The best distortion was never saved
and the distortion for TM_PRED was always used.

Change-Id: Idbaf73027408a4bba26601713725191a5d7b325e
2011-01-31 17:43:18 -05:00
Scott LaVarnway
60fde4d342 Merge "Performance improvement of first pass" 2011-01-31 13:02:23 -08:00
Yaowu Xu
6d19d40718 Merge "change the threshold of DC check for encode breakout" 2011-01-31 11:00:46 -08:00
John Koleszar
f6214d1db8 Merge "validate min_q against max_q" 2011-01-31 07:33:55 -08:00
John Koleszar
2d03f073a7 validate min_q against max_q
min_q is required to be <= max_q.

Change-Id: I28eccf96df3b52a94913762b54c4fbe0d021ce5e
2011-01-31 10:33:00 -05:00
Adrian Grange
408a8adc15 Merge "Changed condition for using RD in Intra Mode" 2011-01-31 02:18:40 -08:00
Yaowu Xu
8f279596cb change the threshold of DC check for encode breakout
Previously, the DC check was to make sure there is no code-able
DC shift for quantizer Q0, which has been verified to be rather
conservative. This commit changes the criteria to have two
components, DC and AC, to address the conservativeness. First,
it checks if all AC energy is enough to contribute a single
non-zero quantized AC coefficient. Second, for DC, the decision
to skip further considers two possible scenarios: 1. There is
no code-able 2nd order DC coefficient at all; 2. The residue is
relatively flat, but the uniform DC change is very small, i.e.
less than 1/2 gray level per pixel.

Compared to the previous criteria, the new criteria are about 10%
to 15% faster in encoding time, with a very small quality loss.
(threshold ~1000 and quality range 33db-45db)

It should be noted that this commit enables "automatic" static
threshold for encodebreakout if a non-zero small value is passed
in to the encoder.

Change-Id: I0f77719a1ac2c2dfddbd950d84920df374515ce3
2011-01-28 09:43:23 -08:00
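
Note: a rough C sketch of the two-part skip test described above; all names and thresholds here are hypothetical stand-ins, not the actual libvpx code.

    /* sketch: skip encoding the 16x16 block only when the AC energy cannot
     * yield a non-zero quantized AC coefficient AND the DC part is either
     * not code-able as a 2nd order coefficient or is a very small uniform
     * shift (less than 1/2 gray level per pixel over 256 pixels). */
    static int breakout_skip(unsigned int sse, int sum,
                             unsigned int ac_thresh,   /* hypothetical */
                             unsigned int dc2_thresh)  /* hypothetical */
    {
        unsigned int dc_energy = (unsigned int)(((long long)sum * sum) / 256);
        unsigned int ac_energy = sse - dc_energy;
        int dc_is_tiny = (sum > -128) && (sum < 128);  /* < 0.5 per pixel */

        return (ac_energy < ac_thresh) &&
               (dc_energy < dc2_thresh || dc_is_tiny);
    }
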
Johann
f3cb9ae459 Merge "Adds "armvX-none-rvct" targets" 2011-01-28 09:03:58 -08:00
Yunqing Wang
7cbe684ef5 Improve MV prediction in vp8_pick_inter_mode() for speed>3
Applied same method used in vp8_rd_pick_inter_mode() to improve
the accuracy of MV prediction.

Change-Id: Ia50ae26208b18482695601f32febd99fe89fbc17
2011-01-28 10:00:20 -05:00
Adrian Grange
e9f513d74a Changed condition for using RD in Intra Mode
The condition for using RD when selecting the intra coding mode
for a MB is that the RD flag is set AND we're not in real-time
mode.

Previously the code used RD if either the RD flag was set OR
we were not using real-time mode.

Change-Id: Ic711151298468a3f99babad39ba8375f66d55a08
2011-01-28 14:47:36 +00:00
Paul Wilkins
dcb23e2aaa Inconsistent distortion metric in vp8_rd_pick_intra_mbuv_mode
This function was using a variance metric compared to an SSE metric in
other places (e.g. vp8_rd_inter_uv).

Change-Id: I9109fcc5a13bca9db1d7ead500fe14999ab233eb
2011-01-28 13:13:30 +00:00
Tero Rintaluoma
11a222f5d9 Adds "armvX-none-rvct" targets
Adds following targets to configure script to support RVCT compilation
without operating system support (for Profiler or bare metal images).
 - armv5te-none-rvct
 - armv6-none-rvct
 - armv7-none-rvct

To strip OS specific parts from the code, an "os_support" config was added
to the script and the CONFIG_OS_SUPPORT flag is used in the code to exclude OS
specific parts such as OS specific includes and function calls for
timers and threads etc. This was done to enable RVCT compilation for
profiling purposes or running the image on bare metal target with
Lauterbach.

Removed separate AREA directives for READONLY data in armv6 and neon
assembly files to fix the RVCT compilation. Otherwise
"ldr <reg>, =label" syntax would have been needed to prevent linker
errors. This syntax is not supported by older gnu assemblers.

Change-Id: I14f4c68529e8c27397502fbc3010a54e505ddb43
2011-01-28 12:47:39 +02:00
Johann
73207a1d8b warning: pointer targets differ in signedness
vp8/encoder/rdopt.c:728: warning: pointer targets in passing argument 3
of 'macro_block_yrd' differ in signedness
vp8/encoder/rdopt.c:541: note: expected 'int *' but argument is of type
'unsigned int *'

distortion is signed when calling macro_block_yrd in both other cases,
as well as for RDCOST.

Change-Id: I5e22358b7da76a116f498793253aac8099cb3461
2011-01-27 11:53:26 -05:00
Johann
27000ed6d9 clean up implicit declaration warnings for neon
Change-Id: I6ca2d89f355839c4c770773c09fc69dcea7c1406
warning: implicit declaration of function
  'vp8_variance_halfpixvar16x16_[h|v|hv]_neon'
  'vp8_sub_pixel_variance16x16_neon_func'
2011-01-27 11:31:59 -05:00
Scott LaVarnway
8a5c255b3d Merge "Removed unused members from VP8_COMP" 2011-01-27 08:12:22 -08:00
Yunqing Wang
bb30ffc4dc Merge "Remove copies of same functions" 2011-01-27 08:11:26 -08:00
Yunqing Wang
3ee4e1e79f Merge "Refine motion vector prediction for NEWMV mode" 2011-01-27 08:10:53 -08:00
Scott LaVarnway
3c18a2bb2e Performance improvement of first pass
Improved the performance of the first pass only
(~6% on 720p test clip) by making use of a LUT instead of the
float calculations.  Might try a SIMD version later.
Also started to make use of int_mv instead of
MV.

Change-Id: If2a217c7d6b59cd2c25c5553e0ca7e0502403af8
2011-01-26 16:42:56 -05:00
Yunqing Wang
cac54404b9 Remove copies of same functions
Reduce the code size.

Change-Id: I2e1998557a3c8776e262c442fd758c25e17aff7a
2011-01-26 15:37:00 -05:00
Scott LaVarnway
c4887da39c Removed unused members from VP8_COMP
Change-Id: I8f3f2642b02975fbdb14982984a29821f80d30d3
2011-01-26 15:07:17 -05:00
Paul Wilkins
35bb74a6bd Rationalize vp8_rd_pick_intra16x16mby_mode()
Use the function macro_block_yrd() to calculate error and distortion
in keeping with what is done for inter frames.

The old code was using a variance metric for once case and an
SSE function for measuring distortion in the other case.

The function vp8_encode_intra16x16mbyrd() is no longer used.

Change-Id: Ic228cb00a78ff637f4365b43f58fbe5a9273d36f
2011-01-26 18:46:34 +00:00
Paul Wilkins
e8e09d33df Merge "Correction to buffer update for non-viewable frames." 2011-01-26 09:33:48 -08:00
Yaowu Xu
82266a1ac9 Merge "cap the best quantizer for 2nd order DC" 2011-01-26 09:27:11 -08:00
John Koleszar
be3e0ff7c3 Merge "Adds vpx_vp8_enc_asm_offsets.c.o to OBJS-yes list" 2011-01-26 07:29:19 -08:00
Attila Nagy
0def48b60f Adds vpx_vp8_enc_asm_offsets.c.o to OBJS-yes list
Change-Id: Ibd6e3bc82471839904b1086b499efc55f7c5cbaf
2011-01-26 17:06:09 +02:00
Paul Wilkins
a3f71ccff6 Correction to buffer update for non-viewable frames.
The code previously tested cpi->common.refresh_alt_ref_frame
but there are situations where this flag may be set for viewable frames.

The correct test should be !cm->show_frame.

Change-Id: Ia1a600622992a4a68fe1d38ac23bf6b34b133688
2011-01-26 12:52:31 +00:00
Paul Wilkins
2caa36aa4f Merge "Fix for incorrect variable declaration." 2011-01-26 01:53:53 -08:00
Yaowu Xu
999e155f55 cap the best quantizer for 2nd order DC
This commit also removes artificial RDMULT cap for low quantizers.
The intention is to address some abnormal behavior of mode selections
at the low quantizer end, where many macroblocks were coded with
SPLITMV with all partitions using same motion vector including (0,0).
This change improves the compression quality substantially for high
quality encodings in both PSNR and SSIM terms. Overall effect on
mid/low rate range is also positive for all metrics, but smaller
in magnitude.

Change-Id: I864b29c4bd9ff610d2545fa94a19cc7e80c02667
2011-01-25 22:26:18 -08:00
Fritz Koenig
53d8e9dc97 Fix for incorrect variable declaration.
Commit 336aa0b7da incorrectly
declared current_pos as an int, when it should have been
a FIRSTPASS_STATS pointer.

Change-Id: I0a51c7a86ebba8546c95dd5d9d1c1143d4613e40
2011-01-25 15:41:41 -08:00
Johann
907e98fbb5 Merge "update sse2 regular quantizer" 2011-01-25 13:40:28 -08:00
Johann
58f19cc697 Merge "move new neon subpixel function" 2011-01-25 13:09:05 -08:00
Yunqing Wang
dcaaadd8ed Refine motion vector prediction for NEWMV mode
Adjust checking points in motion vector prediction to better cover
possible movements, and get a better prediction. Tests on test
clips showed a 0.1% improvement in SSIM, and no change in PSNR
and performance.

Change-Id: Ifdab05d35e10faea1445c61bb73debf888c9d2f8
2011-01-25 15:54:34 -05:00
Johann
af7d23c9b4 Merge "Fix issue 262, vp8cx_pack_tokens_into_partitions_armv5" 2011-01-25 12:49:52 -08:00
Johann
2168a94495 move new neon subpixel function
previously wasn't guarded with ifdef ARMV7, causing a link error with
ARMV6

Change-Id: I0526858be0b5f49b2bf11e9090180b2a6c48926d
2011-01-25 15:48:37 -05:00
Yunqing Wang
4e149bb447 Merge "Modify calling of NEON code in sub-pixel search" 2011-01-25 09:54:23 -08:00
Attila Nagy
3bf235a4c9 Fix issue 262, vp8cx_pack_tokens_into_partitions_armv5
http://code.google.com/p/webm/issues/detail?id=262
Function was assuming that partitions have an equal number of mb_rows,
which is not always true.

Change-Id: I59ed40117fd408392a85c633beeb5340ed2f4b25
2011-01-25 15:55:02 +02:00
Paul Wilkins
a69c18980f Merge "Incorrect bit allocation in forced KF groups." 2011-01-25 05:32:26 -08:00
Paul Wilkins
336aa0b7da Incorrect bit allocation in forced KF groups.
The old 2 pass code estimated error distribution when coding a
forced (by interval) key frame. The result of this was that in some
cases, when allocating bits at the GF group level within a KF
group there was either a glut of bits or starvation of bits at the end
of the KF group.

Added code to rescan and get the correct data once the position of
a forced key frame has been determined.

Change-Id: I0c811675ef3f9e4109d14bd049d7641682ffcf11
2011-01-25 12:29:06 +00:00
James Berry
eb8b4d9a99 configure.sh fix for visual studio
-For targets with external build systems like Visual
Studio, CC is not set, so check_add_cflags will fail.
Only call this function if extra_cflags is set.

Change-Id: I3531bad69e9b6a59c5be1b0e8b6053ccccbc332c
2011-01-24 16:57:20 -05:00
Scott LaVarnway
0ee525d6de Added vp8_update_zbin_extra
vp8cx_mb_init_quantizer was being called for every mode checked
in vp8_rd_pick_inter_mode.  zbin_extra is the only value that
really needs to be recalculated.  This calculation is disabled
when using the fast quantizer for mode selection.
This gave a small performance boost (~.5% to 1%).
Note: This needs to be verified with segmentation_enabled.

Change-Id: I62716a870b3c82b4a998bdf95130ff0b02106f1e
2011-01-24 11:00:56 -05:00
Yunqing Wang
d3e9409bb0 Merge "Modify sub-pixel filters to eliminate unnecessary calculations" 2011-01-21 11:07:17 -08:00
Yunqing Wang
0822a62f40 Modify sub-pixel filters to eliminate unnecessary calculations
In sub-pixel calculation, xoffset and yoffset mostly take some
specific values. Modified sub-pixel filter functions according to
these possible values to improve performance.

Change-Id: I83083570af8b00ff65093467914fbb97a4e9ea21
2011-01-21 13:59:27 -05:00
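
Note: an illustrative dispatch sketch in C of the special-casing described above; the helper names are hypothetical and only the offset handling mirrors the description.

    /* sketch: route the common offset cases to cheaper paths instead of
     * always running the full two-pass bilinear filter */
    static int filter_copy(void)       { return 0; } /* xoffset==0, yoffset==0 */
    static int filter_horiz_only(void) { return 1; } /* yoffset==0 */
    static int filter_vert_only(void)  { return 2; } /* xoffset==0 */
    static int filter_generic(void)    { return 3; } /* both non-zero */

    static int subpel_dispatch(int xoffset, int yoffset)
    {
        if (xoffset == 0 && yoffset == 0) return filter_copy();
        if (yoffset == 0)                 return filter_horiz_only();
        if (xoffset == 0)                 return filter_vert_only();
        return filter_generic();
    }
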
Paul Wilkins
0cdfef1e22 Modified static scene check.
Added code to scan ahead a few frames when we see what
we think is a static scene in the two pass GF loop to see if the
conditions persist.

Moved calculation of decay rate out into a function.

Change-Id: I6e9c67e01ec9f555144deafc8ae67ef25bffb449
2011-01-21 17:52:00 +00:00
Paul Wilkins
8064583d26 Further work to reduce pulsing.
These changes are specifically targeted at fade transitions to
static scenes. Here we want to place a GF/ARF immediately
after the fade and prevent an ARF just  before the fade.

Also some code lines and comment lines shortened to 80 chars
while I was there.

Change-Id: Iefdc09a4fa7b265048fc017246b73e138693950f
2011-01-20 18:01:20 +00:00
Attila Nagy
419553258d Update configure scripts
Add --extra-cflags as config parameter for user defined extra CFLAGS.
Add -g to asflags when debug enabled for arm targets.

Change-Id: Ibdde7cfdda6736c1c1db45e6466bd08504a51f15
2011-01-20 17:59:27 +02:00
Adrian Grange
815e1e9fe4 Fixed use of motion percentage in KF/GF group calc
In both vp8_find_next_key_frame and define_gf_group,
motion_pct was initialised at the top of the loop before
next_frame stats had been read in.

This fix sets motion_pct after next_frame stats have
been read.

Change-Id: I8c0bebf372ef8aa97b97fd35b42973d1d831ee73
2011-01-20 13:13:33 +00:00
Paul Wilkins
06e7320c3e Merge "First pass loop bug." 2011-01-19 08:33:34 -08:00
Paul Wilkins
e867516843 First pass loop bug.
The incorrect value loop_decay_rate was used in the GF loop.

The intent was to test the cumulative value decay_accumulator.

Change-Id: I62928c63eb09f4f6936a45ebd1c23784d1c9681b
2011-01-19 15:50:22 +00:00
John Koleszar
2f0331c90c Merge "Implement error tracking in the decoder" 2011-01-19 05:51:00 -08:00
Henrik Lundin
67fb3a5155 Implement error tracking in the decoder
A new vpx_codec_control called VP8D_GET_FRAME_CORRUPTED was added. The output
from the function is non-zero if the last decoded frame contains
corruption due to packet losses.

The decoder is also modified to accept encoded frames of zero length.
A zero length frame indicates to the decoder that one or more frames
have been completely lost. This will mark the last decoded reference
buffer as corrupted. The data pointer can be NULL if the length is
zero.

Change-Id: Ic5902c785a281c6e05329deea958554b7a6c75ce
2011-01-19 09:53:21 +01:00
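
Note: a minimal usage sketch of the control described above (decoder setup and error handling omitted); VP8D_GET_FRAME_CORRUPTED and vpx_codec_control() are taken from the public vpx headers.

    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    /* returns non-zero if the last decoded frame contains corruption */
    static int last_frame_corrupt(vpx_codec_ctx_t *decoder)
    {
        int corrupted = 0;
        vpx_codec_control(decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted);
        return corrupted;
    }
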
John Koleszar
f97f2b1bb6 Merge "fix last frame buffer copy logic regression" 2011-01-18 12:54:57 -08:00
Yunqing Wang
ce6c954d2e Modify calling of NEON code in sub-pixel search
In vp8_find_best_sub_pixel_step_iteratively(), many times xoffset
and yoffset are specific values - (4,0) (0,4) and (4,4). Modified
code to call simplified NEON version at these specific offsets to
help with the performance.

Change-Id: Iaf896a0f7aae4697bd36a49e182525dd1ef1ab4d
2011-01-18 14:19:52 -05:00
Jim Bankoski
edcf74c6ad vp8e -removed undefined max call
Change-Id: I42a86b0488f44115f09551fc5ad6d711fd470f0d
2011-01-18 11:21:32 -05:00
Paul Wilkins
d6d5d43708 Merge "Further CQ, Key frame and ARF changes" 2011-01-18 08:04:46 -08:00
Paul Wilkins
57136a268a Further CQ, Key frame and ARF changes
This code fixes a bug in the calculation of
the minimum Q for alt ref frames.

It also allows an extended gf/arf interval for sections
of clips that are completely static (or nearly so).

Change-Id: I1a21aaa16d4f0578e5f99b13bebd78d59403c73b
2011-01-18 15:19:05 +00:00
Attila Nagy
cb791aaa2f Fix encoder real-time only configuration.
Remove allocation/deallocation of stats storage.
Remove full search functions in machine specific encoder inits.
Remove last pass validation in  validate_config.

Change-Id: I7f29be69273981a4fef6e80ecdb6217c68cbad4e
2011-01-18 08:19:21 -05:00
Paul Wilkins
339c512762 Fix CQ range and experimental KF sizing changes.
The CQ level was not using the q_trans[] array to convert
to a 0-127 range as per the min and max Q.

Experimental change to try and match the reconstruction
error for forced key frames approximately to that of the
previous frame by means of the recode loop. Though this
may cause extra recodes and the recode behavior has not
been optimized, it can only happen on forced key frames.

Change-Id: I1f7e42d526f1b1cb556dd461eff1a692bd1b5b2f
2011-01-17 17:24:45 +00:00
Johann
15f9bea73b update sse2 regular quantizer
about ~5% gain on 32bit. disabled for 64bit

unset executable bit on ssse3 version (cosmetic)

Change-Id: I1a5860839eb294ce4261f819caea2dcfa78e57ca
2011-01-14 14:26:10 -05:00
Paul Wilkins
a1a4d23797 Merge "KF/GF Pulsing" 2011-01-14 09:20:37 -08:00
Paul Wilkins
3aafb47729 Merge "Testing of modes with Alt Ref frame" 2011-01-14 07:26:37 -08:00
Paul Wilkins
8f711db4e8 Merge "Experimental change to help with ARNR problem." 2011-01-14 07:26:01 -08:00
Paul Wilkins
415371c9d9 Testing of modes with Alt Ref frame
Previously when a frame was being overlaid on a previously coded
alt ref frame, we only checked the alt ref 0,0 mode. Where there is
a possibility that the alt ref buffer is a filtered frame, we should allow
the other prediction modes as normal, or at the least allow use of
the last frame buffer.

Change-Id: I4d6227223d125c96b4f3066ec6ec9484fee7768c
2011-01-14 15:20:45 +00:00
Adrian Grange
2c1b06e672 ARNR filter pointer update bug fix
In cases where the frame width is not a multiple of 16 the
ARNR filter would go wrong.

In vp8_temporal_filter_iterate_c when updating pointers
at the end of a row of MBs,  the image size was
incorrectly used rather than using Num_MBs_In_Row
times 16 (Y) or 8 (U,V).

This worked when the width is a multiple of 16 but failed
otherwise.

Change-Id: I008919062715bd3d17c7aa2562ab58d1cb37053a
2011-01-14 15:04:39 +00:00
Paul Wilkins
72e22b0bb8 Experimental change to help with ARNR problem.
Allow use of other reference frames for the ARF overlay frame
when ARNR filtering is enabled

Change-Id: Icd6a9fb38977a88fbe7cc9b9c18198eb454c0273
2011-01-14 12:07:12 +00:00
Paul Wilkins
c8338ebf7a KF/GF Pulsing
This change is designed to try and reduce pulsing effects when moving
with a complex transition like a fade, into an easy or static section in
an otherwise difficult clip in CQ mode.

The active CQ level is relaxed down to the user entered level for frames that
are generating less than the passed in minimum bandwidth.

Change-Id: Id6d8b551daad4f489c087bd742bc95418a95f3f0
2011-01-14 11:37:26 +00:00
Scott LaVarnway
b082790c7d Merge "Moved ref frame calculations" 2011-01-13 06:59:28 -08:00
Paul Wilkins
eda7d538bf One pass rate control correction.
Fixed a discrepancy between cpi->ni_frames and cm->current_video_frame > 150.

Make one pass path explicit.

There is still scope for some odd behaviour around the transition
point at cpi->ni_frames > 150.

Change-Id: Icdee130fe6e2a832206d30e45bf65963edd7a74d
2011-01-13 12:51:41 +00:00
Paul Wilkins
55acda98f7 Limit key frame quantizer for forced key frames.
Where a key frame occurs because of a minimum interval
selected by the user, then these forced key frames ideally need
to be more closely matched in quality to the surrounding frame.

Change-Id: Ia55b1f047e77dc7fbd78379c45869554f25b3df7
2011-01-12 17:43:59 +00:00
Scott LaVarnway
96fd758ea9 Moved ref frame calculations
Moved ref frame calculations to outside of the
mode_index loop.

Change-Id: I06103fc7e8af88b54b84443acf6691d29b1272ac
2011-01-11 15:00:00 -05:00
Yunqing Wang
6ff2b0883a Merge "Add no_skip_block4x4_search flag in SPLITMV mode" 2011-01-11 08:34:24 -08:00
Johann
e88d7ab245 Merge "use unaligned load" 2011-01-11 08:25:22 -08:00
Johann
f50f2fd2a7 use unaligned load
source buffer is not guaranteed to be aligned for odd size buffers

Change-Id: Id0b1fd40ba3bd6c994bcfada788feccd2b53c5a9
2011-01-11 11:22:29 -05:00
Yunqing Wang
1546e6a8c9 Add no_skip_block4x4_search flag in SPLITMV mode
Add a flag to always enable block4x4 search for speed=0 (good
quality) to guarantee no quality loss for speed0.

Change-Id: Ie04bbc25f7e6a33a7bfa30e05775d33148731c81
2011-01-11 09:50:13 -05:00
Henrik Lundin
48c28fc42c Remove unused local variables
Removing unused local variables causing compiler warnings in
Visual Studio.

Change-Id: I0e2096303be1fdbc01428a6e57cca9796bb32c8a
2011-01-11 15:22:19 +01:00
Yunqing Wang
3675b2291c Fix bug in motion search
The maximum possible MV in 1/8 pel units is (1<<11), which could
push mvcost out of its range, which is 1023. Changing the maximum
possible MV in 1/8 pel units to (1<<11)-8 fixes this problem.

Change-Id: I5788ed1de773f66658c14f225fb4ab5b1679b74b
2011-01-10 16:16:59 -05:00
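
Note: a small C sketch of the arithmetic above; the constant name and the clamp helper are hypothetical, shown only to illustrate keeping the 1/8-pel MV component inside the mvcost range.

    /* a component of (1 << 11) in 1/8-pel units can push the mvcost lookup
     * past its range of 1023; backing off by one full pel (8 units) avoids that */
    #define MAX_EIGHTH_PEL_MV ((1 << 11) - 8)   /* previously (1 << 11) */

    static int clamp_mv_component(int v)
    {
        if (v >  MAX_EIGHTH_PEL_MV) return  MAX_EIGHTH_PEL_MV;
        if (v < -MAX_EIGHTH_PEL_MV) return -MAX_EIGHTH_PEL_MV;
        return v;
    }
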
Paul Wilkins
cf7c4732e5 Two Pass VBR change
Further experiment with restriction of the Q range.

This uses the average non KF/GF/ARF quantizer,  instead
of just relying on the initial value. It is not such a strong constraint
but there may be a reduced risk of rate misses.

Change-Id: I424fe782a37a2f4e18c70805e240db55bfaa25ec
2011-01-10 16:41:53 +00:00
Paul Wilkins
405499d835 Revert BASE_ERRPERMB
Constant value reverted pending more tests
on different video formats.

Change-Id: I07d11a0e0185e60724698c835416caf2e0774e61
2011-01-10 16:02:51 +00:00
Paul Wilkins
c28b10adeb Merge "CQ Mode" 2011-01-07 11:05:56 -08:00
Paul Wilkins
e0846c9c8c CQ Mode
The merge includes hooks for CQ mode and other code
changes merged from the test branch.

CQ mode attempts to maintain a more stable quantizer within a clip
whilst also trying to adhere to a guideline maximum bitrate.

The existing target data rate parameter is used to specify the
guideline maximum bitrate.

A new parameter allows the user to specify a target CQ level.

For normal (non kf/gf/arf) frames, the quantizer will not drop BELOW the
user specified value (0-63). However, in some cases the encoder may
choose to impose a target CQ that is above that specified by the user,
if it estimates that consistent use of the target value is not compatible
with the guideline maximum bitrate.

Change-Id: I2221f9eecae8cc3c431d36caf83503941b25e4c1
2011-01-07 18:46:29 +00:00
Paul Wilkins
ba976eaa9b Merge "Limit Q variability in two pass." 2011-01-07 09:32:29 -08:00
Paul Wilkins
3af3593c8e Limit Q variability in two pass.
In two pass encoding each frame is given an active
Q range to work with. This change limits how much this
Q range can be altered over time from the initial estimate
made for the clip as a whole.

There is some danger this could lead to overshoot or undershoot
in some corner cases but it helps considerably in regard to
clips where either there is a glut or famine of bits in some sections,
particularly near the end of a clip.

Change-Id: I34fcd1af31d2ee3d5444f93e334645254043026e
2011-01-07 17:23:50 +00:00
Paul Wilkins
f7e2f1fedf Merge "Disable some features for first pass." 2011-01-07 08:34:27 -08:00
Scott LaVarnway
dd314351e6 Merge "Removed cpi->target_bits_per_mb" 2011-01-07 06:46:45 -08:00
Scott LaVarnway
6dbdfe3422 Removed cpi->target_bits_per_mb
cpi->target_bits_per_mb is currently not being used,
so delete it.  Also removed other unused code in rdopt.c.

Change-Id: I98449f9030bcd2f15451d9b7a3b9b93dd1409923
2011-01-07 09:41:13 -05:00
Johann
8b0cf5f79d x86 sse2 temporal_filter_apply
count can be reduced to short because the max number of filtered frames
is set to 15. the max value for any frame is 32 (modifier = 16,
filter_weight = 2). 15*32 = 480 which requires 9 bits

this function goes from about 7000 us / 1000 iterations for the C code
to < 275 us / 1000 iterations for sse2 for block_size = 16 and from
about 1800 us / 1000 iters to < 100 us / 1000 iters for block_size = 8

Change-Id: I64a32607f58a2d33c39286f468b04ccd457d9e6e
2011-01-06 14:00:30 -05:00
John Koleszar
1942eeb886 fix last frame buffer copy logic regression
Commit 0ce3901 introduced a change in the frame buffer copy logic where
the NEW frame could be copied to the ARF or GF buffer through the
copy_buffer_to_{arf,gf}==1 flags, if the LAST frame was not being
refreshed. This is not correct. The intent of the
copy_buffer_to_{arf,gf}==1 flag is to copy the LAST buffer. To copy the
NEW buffer, the refresh_{alt_ref,golden}_frame flag should be used.

The original buffer copy logic is fairly convoluted. For example:

    if (cm->refresh_last_frame)
    {
        vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);

        cm->frame_to_show = &cm->last_frame;
    }
    else
    {
        cm->frame_to_show = &cm->new_frame;
    }
    ...
    if (cm->copy_buffer_to_arf)
    {
        if (cm->copy_buffer_to_arf == 1)
        {
            if (cm->refresh_last_frame)
                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->alt_ref_frame);
            else
                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->alt_ref_frame);
        }
        else if (cm->copy_buffer_to_arf == 2)
            vp8_yv12_copy_frame_ptr(&cm->golden_frame, &cm->alt_ref_frame);
    }

Effectively, if refresh_last_frame, then new and last are swapped, so
when "new" is copied to ARF, it's equivalent to copying LAST to ARF. If
not refresh_last_frame, then LAST is copied to ARF. So LAST is copied to
ARF in both cases.

Commit 0ce3901 removed the first buffer swap but kept the
refresh_last_frame?new:last behavior, changing the sense since the first
swap wasn't done to the more readable refresh_last_frame?last:new, but
this logic is not correct when !refresh_last_frame.

This commit restores the correct behavior from v0.9.1 and prior. This
case is missing from the test vector set.

Change-Id: I8369fc13a37ae882e31a8a104da808a08bc8428f
2011-01-06 13:07:42 -05:00
Paul Wilkins
431dac08d1 Disable some features for first pass.
The following features don't make sense for the first
pass in its current form and have a significant impact on its
speed (up to 50%).

Slow quantizer, slow dct and trellis optimization.

Change-Id: Id9943f6765ffbd71fc0084ec7dfbc9d376fd6fcd
2011-01-06 17:10:07 +00:00
Paul Wilkins
b095d9df3c Adjustment to boost calculation in two pass.
Calculate a minimum intra value to be used in determining the
IIratio scores used in two pass, second pass.

This is to make sure sections that are "low complexity" in the
intra domain are still boosted appropriately for KF/GF/ARF.

For now I have commented out the Q based adjustment of
KF boost.

Change-Id: I15deb09c5bd9b53180a2ddd3e5f575b2aba244b3
2011-01-04 18:11:28 +00:00
Scott LaVarnway
de4e8185e9 Fixed encoder crash when multi-threading is enabled.
Happens in real-time mode.  Will happen in good quality, speed 1.

Change-Id: I3e5b68827b1a5798d0431b088a709256d1ce2c95
2010-12-29 16:41:22 -05:00
Yunqing Wang
a864678cdb Always update last_frame_type
Scott pointed out that last_frame_type only gets updated while
loopfilter exists. Since last_frame_type is also needed in
motion search now, it needs to be updated every frame.

Change-Id: I9203532fd67361588d4024628d9ddb8e391ad912
2010-12-29 10:28:35 -05:00
Scott LaVarnway
3fb4abf3d1 Merge "Use the fast quantizer for inter mode selection" 2010-12-28 11:56:11 -08:00
Scott LaVarnway
516ea8460b Use the fast quantizer for inter mode selection
Use the fast quantizer for inter mode selection and the
regular quantizer for the rest of the encode for good quality,
speed 1.  Both performance and quality were improved.  The
quality gains will make up for the quality loss mentioned in
I9dc089007ca08129fb6c11fe7692777ebb8647b0.

Change-Id: Ia90bc9cf326a7c65d60d31fa32f6465ab6984d21
2010-12-28 14:51:46 -05:00
Yunqing Wang
bf53ec492d Adjust MV borders for SPLITMV mode
Add limits to avoid MV going out of range.

Change-Id: I8a5deb40bf393488d29f694b5a56804d578e68b5
2010-12-28 13:23:07 -05:00
Yunqing Wang
e463b95b4e Merge "Modify motion estimation for SPLITMV mode" 2010-12-28 08:12:26 -08:00
Yunqing Wang
a5a8d92976 Modify motion estimation for SPLITMV mode
1. Search for block8x16/block16x8 uses block8x8's search results.
2. Check block4x4 only if block8x8 is chosen. (This hurts quality,
   which will be improved in another check-in.)
3. In block4x4 search, the previous block's result is used as
   MV predictor for next block.

This change improves performance.

Change-Id: I9dc089007ca08129fb6c11fe7692777ebb8647b0
2010-12-28 10:34:42 -05:00
Yaowu Xu
95dbe9ccfd Merge "adjusted sad_per_bit to correlate with quantizer" 2010-12-26 13:45:37 -08:00
Yaowu Xu
0f5264b584 adjusted sad_per_bit to correlate with quantizer
Re-calibrated sad_per_bit16 and sad_per_bit4 tables to be linearly
correlated to quantizer values; these two variables are used in
motion search for costing motion vectors. This change has a small
positive effect on compression.

Change-Id: Ic9b5ea6fb8d5078ef663ba4899db019cc51f4166
2010-12-23 22:59:38 -08:00
James Berry
74e8446e58 vpxenc stats_close() memleak fix
stats_close() was not freeing memory for
single pass runs.  It now takes in arg_passes
to determine when it should free memory.

Change-Id: I6623b7e30b76f9bf2e16008490f9b20484d03f31
2010-12-23 14:47:56 -05:00
Johann
8c4552fb36 Merge "improve integer version of filter" 2010-12-23 06:14:28 -08:00
Johann
d3c7365b46 Merge "temporal filter naming changes" 2010-12-23 06:14:20 -08:00
Johann
e2de094c99 Merge "abstract apply_temporal_filter" 2010-12-23 06:14:07 -08:00
John Koleszar
bd9b383db2 Merge "make yasm generate cv8 debug data on win32" 2010-12-22 11:11:08 -08:00
John Koleszar
30830d5a7c make yasm generate cv8 debug data on win32
Native Windows targets should use CV8 format debugging symbols, not
DWARF.

Change-Id: I9489163fcd9d749b72f6c70ecbce67a6f0790802
2010-12-22 12:53:45 -05:00
Johann
20b855c33e improve integer version of filter
the lookup table is based on floating point calculations (see source)

by moving the *3 before the downshift and adding the rounding bit, the
delta (LUT - integer) goes from:
______________________________________
__ 1__ 1______________________________
__ 1__ 1______________________________
____ 1______ 1________________________
____ 1 2__ 2 1________________________
______ 1 1 2__ 2__ 2__ 2 1 1__________
________ 1 1 2 2__ 1 2 3 1 2__ 2__ 2__
to:
__-1__-1______________________________
______________________________________
____-1______-1________________________
______________________________________
________-1______________-1____________
______________________________________

it's important to be able to use the integer version because the LUT
more or less precludes SIMD optimizations

Change-Id: I45a81127dc7b72a06fba951649135d9d918386c0
2010-12-22 11:33:59 -05:00
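
Note: an illustrative before/after in C of the integer change described above; the shift amount is a placeholder rather than the filter's actual value, but multiplying before the downshift and adding a rounding bit is the change being described.

    /* before: truncate first, then scale by 3 */
    static int scale_truncating(int v, int shift)
    {
        return (v >> shift) * 3;
    }

    /* after: scale first, add the rounding bit, then shift */
    static int scale_rounding(int v, int shift)
    {
        return (v * 3 + (1 << (shift - 1))) >> shift;
    }
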
Johann
4b6219cb33 temporal filter naming changes
be more consistent with the naming pattern, especially wrt rtcd

Change-Id: I3df50686a09f1dab0a9620b5adbb8a1577b40f2f
2010-12-22 11:32:15 -05:00
Johann
092b5bef37 abstract apply_temporal_filter
allow for optimized versions of apply_temporal_filter
(now vp8_apply_temporal_filter_c)

the function was previously declared as static and appears to have been
inlined. with this change, that's no longer possible. performance takes
a small hit.

the declaration for vp8_cx_temp_filter_c was moved to onyx_if.c because
of a circular dependency. for rtcd, temporal_filter.h holds the
definition for the rtcd table, so it needs to be included by onyx_int.h.
however, onyx_int.h holds the definition for VP8_COMP which is needed
for the function prototype. blah.

Change-Id: I499c055fdc652ac4659c21c5a55fe10ceb7e95e3
2010-12-22 11:31:54 -05:00
Jim Bankoski
6cb708d501 Merge "Add psnr/ssim tuning option" 2010-12-20 09:32:13 -08:00
John Koleszar
c49f49b113 propagate user private data on decode
The pointer passed in the user_priv argument to vpx_codec_decode()
should be propagated through to the corresponding output frame and
made available in the image's user_priv member. Fixes issue #252

Change-Id: I182746a6882c8549fb146b4a4fdb64f1789eb750
2010-12-17 11:34:02 -05:00
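
Note: a minimal usage sketch of the behavior above (decoder setup omitted): the pointer handed to vpx_codec_decode() as user_priv comes back on the corresponding output image.

    #include "vpx/vpx_decoder.h"

    static void decode_with_context(vpx_codec_ctx_t *decoder,
                                    const unsigned char *buf,
                                    unsigned int buf_sz,
                                    void *per_frame_ctx)
    {
        vpx_codec_iter_t iter = NULL;
        vpx_image_t *img;

        vpx_codec_decode(decoder, buf, buf_sz, per_frame_ctx, 0);

        while ((img = vpx_codec_get_frame(decoder, &iter)) != NULL) {
            /* img->user_priv is the same pointer passed in above */
            void *ctx = img->user_priv;
            (void)ctx;
        }
    }
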
John Koleszar
fc6ce744a6 Merge "Inform caller of decoder about updated references" 2010-12-17 07:08:21 -08:00
John Koleszar
b0da9b399d Add psnr/ssim tuning option
Add a new encoder control, VP8E_SET_TUNING, to allow the application
to inform the encoder that the material will benefit from certain
tuning. Expose this control as the --tune option to vpxenc. The args
helper is expanded to support enumerated arguments by name or value.

Two tunings are provided by this patch, PSNR (default) and SSIM.
Activity masking is made dependent on setting --tune=ssim, as the
current implementation hurts speed (10%) and PSNR (2.7% avg,
10% peak) too much for it to be a default yet.

Change-Id: I110d969381c4805347ff5a0ffaf1a14ca1965257
2010-12-17 10:01:05 -05:00
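
Note: a minimal usage sketch of the new control (encoder setup omitted); on the vpxenc command line the equivalent is --tune=ssim.

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* ask the encoder to tune for SSIM, which also enables activity masking
     * as described above; the default tuning is VP8_TUNE_PSNR */
    static void tune_for_ssim(vpx_codec_ctx_t *encoder)
    {
        vpx_codec_control(encoder, VP8E_SET_TUNING, VP8_TUNE_SSIM);
    }
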
Henrik Lundin
2a87491fb0 Inform caller of decoder about updated references
Inform the caller of the decoder if a decoded frame updated the last,
golden, or altref frames, as required for the realtime communication
proposed in the VP8 RTP payload format document.

Added a new vpx_codec_control called VP8D_GET_LAST_REF_UPDATES, to be
called after vpx_codec_decode. The control will indicate which of the
reference frames that were updated by setting the 3 LSBs in the input
int (pointer).

Change-Id: Iac9db60dac414356c7ffa0b0fede88cb91e11bd7
2010-12-17 14:43:13 +01:00
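
Note: a minimal usage sketch of the control described above (decoder setup omitted). The 3-LSB reading follows the description in the commit; which bit maps to which buffer is an assumption noted in the comment.

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    /* call after vpx_codec_decode(); the 3 LSBs indicate which reference
     * buffers the last decoded frame updated
     * (bit assignment assumed here: bit0=last, bit1=golden, bit2=altref) */
    static void report_ref_updates(vpx_codec_ctx_t *decoder)
    {
        int updates = 0;
        vpx_codec_control(decoder, VP8D_GET_LAST_REF_UPDATES, &updates);
        printf("last:%d golden:%d altref:%d\n",
               updates & 1, (updates >> 1) & 1, (updates >> 2) & 1);
    }
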
Scott LaVarnway
64baa8df2e Changed segmentation check order
In SPLITMV, the 8x8 segment will be checked first.  If the 8x8 rd
is better than the best, we check the other segments.  Otherwise
bail.  Adjustments to the thresh_mult were necessary to make
up for the initial quality loss.
The performance improved by 20% (average) for good quality,
speed 0 and speed 1, while the overall quality remained the same.

Change-Id: I717aef401323c8a254fba3e9777d2a316c774cc3
2010-12-16 17:01:27 -05:00
Scott LaVarnway
81cdeb7117 Adjusted breakout RD for SPLITMV
vp8_rd_pick_best_mbsegmentation looks at y only.  The new
breakout does not include the frame cost, the prob_skip_false
cost, or the uv rate.  Performance improved by a few percent
and the quality remained the same.

Change-Id: I94ff013998ac51e8ecce7130870f7b6600758e15
2010-12-16 09:38:02 -05:00
Yunqing Wang
4fbd0227f5 Merge "Fix a bug in motion search code(2)" 2010-12-15 08:10:34 -08:00
Yunqing Wang
08706a3ea7 Fix a bug in motion search code(2)
This fix added MV range checks for NEWMV mode as suggested by Jim.
To reduce unnecessary MV range checks, I tried Yaowu's suggestion.
Update the UMV borders in NEWMV mode to also cover the MV range check.
Also, in this way, every valid MV gets checked in the diamond
search function.

Change-Id: I95a89ce0daf6f178c454448f13d4249f19b30f3a
2010-12-14 17:39:25 -05:00
Yaowu Xu
3ac73173a4 Merge "fix a bug that "optimize" flag is not set for sub-threads" 2010-12-14 13:32:04 -08:00
Yunqing Wang
23aa13d92c Merge "Fix a bug in motion search code" 2010-12-14 13:25:34 -08:00
Yunqing Wang
7fb0f86863 Fix a bug in motion search code
The MV's range is 256. Since the new motion search uses a different
starting MV than the center ref MV, an MV range check needs to
be done to avoid corruption.

Change-Id: I8ae0721d1bd203639e13891e2e54a2e87276f306
2010-12-14 13:59:38 -05:00
Yaowu Xu
64f3d91579 fix a bug that "optimize" flag is not set for sub-threads
The flag for quantization optimization was not properly propagated to
mb row encoding threads.

Change-Id: Ic561599c35acd94cd5698c9b314bccd596ac2deb
2010-12-14 10:12:21 -08:00
Johann
825adc464f shrink TOKENEXTRA and vp8_extra_bit_struct
Per John's previous change, shrink TOKENEXTRA from 20 to 8 bytes
original: b7b1e6fb
reverted: 41f4458a

Also drop unused field from vp8_extra_bit_struct

Update ARM ASM to deal with this change. In particular, Extra is signed
and needs to be sign-extended when loaded.

Change-Id: Ibd0ddc058432bc7bb09222d6ce4ef77e93a30b41
2010-12-14 10:32:50 -05:00
John Koleszar
41f4458a03 Revert "Reduce size of TOKENEXTRA struct"
This reverts commit b7b1e6fb55. Previous
fix is incomplete, breaks ARM. Itchy submit finger.

Change-Id: I939dc0d3bf4173cf951c1d152338ab6ea2184bb9
2010-12-13 17:12:51 -05:00
John Koleszar
3809d7bbd9 Merge "remove unused temporal preproc code" 2010-12-13 13:57:59 -08:00
John Koleszar
398aa81849 Merge "Reduce size of TOKENEXTRA struct" 2010-12-13 13:57:55 -08:00
John Koleszar
b1aa54ab26 remove unused temporal preproc code
This code is unused, as the current preproc implementation uses the
same spatial filter that postproc uses.

Change-Id: Ia06d5664917d67283f279e2480016bebed602ea7
2010-12-13 16:47:59 -05:00
John Koleszar
b7b1e6fb55 Reduce size of TOKENEXTRA struct
Change the size of structure elements to reduce memory utilization.
Removed the 'section' member entirely, as it is set but never read.

Change-Id: Iad043830392fb4168cb3cd6075fb0eb70c7f691c
2010-12-13 16:37:37 -05:00
James Berry
136bd2455e fixed vpxenc bug where ivf files would be read incorrectly
read_frame would incorrectly insert detect->buf into img
for ivf files.  detect->position is now set to 4 if the input file is
detected to be ivf in file_is_ivf to keep this from occurring.

Change-Id: I5e235dd3033985bc62707a35c13af5984620208e
2010-12-13 14:40:18 -05:00
Yaowu Xu
97a86c5b13 fix a bug in multithreaded encoding with active_map enabled
Added the initialization of the pointer to active map. Also added the
same logic for cyclic refresh in mbrow encoding threads.

Change-Id: Ic48d0849dc706b27fba72d07dcc498075725663d
2010-12-10 10:48:30 -08:00
Fritz Koenig
0ced701487 Merge "vp8 fast quantizer sse2 optimizations for eob." 2010-12-10 09:25:04 -08:00
Fritz Koenig
e0cf330cde vp8 fast quantizer sse2 optimizations for eob.
Changed the end of block computation to use pmaxw.  Removed
additional pushing and popping of registers that was not needed.

Change-Id: I08cb9b424513cd8a2c7ad8cea53b4e2adc66ef98
2010-12-09 15:00:30 -08:00
John Koleszar
cb9698951c fix uninitialized read in encode breakout
Change I3430820 performed an uninitialized read when
encode_breakout == 0, since the sum and sse wouldn't be set:

   if(x->encode_breakout)
       VARIANCE_INVOKE(..., get16x16var)(..., &sum, &sse);
   if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
       ...
   } else if (sse < x->encode_breakout)

Change-Id: I915eb76d1227b4b6d1137a0dedf2c143860098a2
2010-12-09 16:05:26 -05:00
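
Note: one possible shape of the fix, as a standalone C sketch; the helper and variable names are hypothetical. The point is simply that sse/sum get defined defaults and are only trusted when the variance was actually computed.

    #include <limits.h>

    /* hypothetical stand-in for the 16x16 variance call quoted above */
    static void get_var16x16(int *sum, unsigned int *sse)
    {
        *sum = 0;
        *sse = 0;
    }

    static int allow_breakout(unsigned int encode_breakout, int active_map_skip)
    {
        unsigned int sse = UINT_MAX;   /* defined default: never below threshold */
        int sum = 0;

        if (encode_breakout)
            get_var16x16(&sum, &sse);  /* only computed when it will be read */

        (void)sum;
        if (active_map_skip)
            return 1;
        return encode_breakout && sse < encode_breakout;
    }
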
Paul Wilkins
c63fc881e1 Correct q_low and q_high limits for the recode loop
Corrected the initial Q range limits for the recode loop
to reflect the current allowed range for the frame.

In experimental work on constrained quality this bug was
causing unnecessary recodes.

Change-Id: I7e256fbfa681293b0223fe21ec329933d76c229f
2010-12-09 15:02:04 +00:00
Yaowu Xu
160f3c7e9e Merge "vp8e - static threshold play" 2010-12-08 13:08:04 -08:00
Yaowu Xu
d88da98614 Merge "vp8e - remove unnecessary variance calc" 2010-12-08 09:19:22 -08:00
Jim Bankoski
718c19711a vp8e - static threshold play
Realized there is no need for new assembly code; sum is already
calculated.

Change-Id: Ie2d94feb4b7c1f77c5359bca29b66228e41638c9
2010-12-07 16:07:23 -05:00
Scott LaVarnway
f661fa1f24 Merge "vp8_rd_pick_best_mbsegmentation code restructure" 2010-12-07 07:53:12 -08:00
Yaowu Xu
062980cc48 Merge "adjust RDMULT for UV plane in quantization RDO" 2010-12-06 22:04:45 -08:00
Yaowu Xu
7c03a1c308 adjust RDMULT for UV plane in quantization RDO
This patch adds a weighting factor on RDMULT for UV blocks. The change
has an overall gain about 0.5% based on ssim, between 0.1 and 0.2% by
psnr numbers.

Change-Id: I97781b077ce3bb7e34241b03268491917e8d1d72
2010-12-06 20:53:59 -08:00
Yunqing Wang
9520f4b3cc Fix a memory leak problem in encoder
Deallocating the buffers before re-allocating them.

The fix passed James Berry's test program for memory
leak check.

Change-Id: I18c3cf665412c0e313a523e3d435106c03ca438d
2010-12-06 17:21:37 -05:00
Scott LaVarnway
2fa5d5a26d vp8_rd_pick_best_mbsegmentation code restructure
Moved the code from the segmentation loop into a function
which is now called for each segment. This will allow us
to change the segment order checking more easily.

Change-Id: I9510d26f0acae5a73043fcca8f1984b121d3e052
2010-12-06 16:42:52 -05:00
Scott LaVarnway
d283d9bb30 Merge "Improve MV prediction accuracy to achieve performance gain" 2010-12-06 09:41:09 -08:00
Patrik Westin
8534071de0 Fix for manual Golden frame frequency
When auto_golden wasn't set it forced all frames to be a golden
frame. Now the manually configured frequency is adhered to.

Change-Id: I360acac9bc487db0d9c4d4da6ee41f70c227c539
2010-12-06 09:53:41 -05:00
Paul Wilkins
ccb0348473 Merge "Change to inter_minq table." 2010-12-04 02:06:33 -08:00
Paul Wilkins
cec6a596b5 Change to inter_minq table.
The inter_minq table controls the range of quantizers available
for a particular frame in two pass relative to a max Q value.

The change reduces the range somewhat. The effect of this
was a small increase (0.3% average) in psnr for the test set
but it should also help encode speed somewhat for higher
quality modes as it will reduce the number of iterations in the
recode loop.

The change damps the range of quantizers available locally
within a section of a clip and should therefore help keep quality
more uniform. If there is systematic overshoot or undershoot the
range can shift gradually to accommodate. However, there is
some increased risk of overshoot or undershoot against the target
bit rate in VBR mode and this risk will be more pronounced for short
clips.

Change-Id: I84465567d49ae767c6c73ff2a2aac30c895adb52
2010-12-04 10:04:12 +00:00
Yunqing Wang
c3bbb29164 Improve MV prediction accuracy to achieve performance gain
Add vp8_mv_pred() to better predict starting MV for NEWMV
mode in vp8_rd_pick_inter_mode(). Set different search
ranges according to MV prediction accuracy, which improves
encoder performance without hurting the quality. Also,
as Yaowu suggested, using the diamond search result as the full
search starting point and therefore adjusting (reducing) the
full search range helps the performance.

Change-Id: Ie4a3c8df87e697c1f4f6e2ddb693766bba1b77b6
2010-12-03 15:23:35 -05:00
John Koleszar
5e76dfcc70 Merge 'Add simple version of activity masking.'
Merge commit 'refs/changes/79/779/2' of
    https://review.webmproject.org/p/libvpx

Conflicts:
	vp8/encoder/encodeintra.c
	vp8/encoder/encodemb.c

Change-Id: Id607063fabe92d99eeb3c380e8ca670b01bfb3ef
2010-12-03 13:30:50 -05:00
Fritz Koenig
9c8ad79fdc Set refresh_alt_ref_frame on keyframe encode.
On a keyframe alt ref and golden are refreshed.  The flag was
not being set and so on the frame after a keyframe, motion
search would occur on the alt ref frame.  This is not necessary
because the alt ref frame is identical to the last frame in this
scenario.

Handle corner case where a forward alt-ref frame is put
directly after a keyframe.

Change-Id: I9be4cf290d694f8cf2f9a31852014b5ccf1504d3
2010-12-01 12:48:22 -08:00
Jim Bankoski
3430820bbe vp8e - remove unnecessary variance calc
only do the variance calculation if necessary
(e.g. needed for the breakout test)
2010-11-27 14:02:59 -05:00
Pascal Massimino
fd9f9dc054 allow dimensions as low as 1 pixel
remove warning comment in vpxenc.c: in case of 1x1 picture,
detect_bytes will be equal to '3' and we'll fall back to
RAW_TYPE.
fix read_frame() by tracking the pre-read buffer length
in the struct detect

Change-Id: If1ed86ee5260dcdbc8f9d10da6cbb84a4cc2f151
2010-11-24 16:44:33 -08:00
John Koleszar
19e32ac7c7 Merge "vpxdec: fix use of uninitialized memory for raw files" 2010-11-23 12:39:03 -08:00
John Koleszar
78cbe51bc3 Merge changes I3aed713e,I9ef7f56e,Ic18c60df
* changes:
  vp8_set_maps: remove hard-coded width/height
  vp8mt_alloc_temp_buffers: make prototype return void
  Disable compile warning for ERROR macro
2010-11-23 12:38:20 -08:00
John Koleszar
19255b8fe0 vpxdec: fix use of uninitialized memory for raw files
The sz member of the vpx_codec_stream_info_t structure must be
initialized when passed to vpx_codec_peek_stream_info().

Change-Id: I2d13d287d9639262b932cf44671a595fdf3c38ef
2010-11-23 13:49:40 -05:00
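
Note: a minimal usage sketch of the correct call pattern implied above: sz must be set before vpx_codec_peek_stream_info() is called, which is exactly the uninitialized memory being fixed. The wrapper function name is hypothetical.

    #include <string.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    static int peek_vp8_dimensions(const unsigned char *buf, unsigned int buf_sz,
                                   unsigned int *w, unsigned int *h)
    {
        vpx_codec_stream_info_t si;

        memset(&si, 0, sizeof(si));
        si.sz = sizeof(si);   /* the member that must be initialized */

        if (vpx_codec_peek_stream_info(vpx_codec_vp8_dx(), buf, buf_sz, &si))
            return -1;

        *w = si.w;
        *h = si.h;
        return 0;
    }
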
Paul Wilkins
ad6150f769 Recalibration of bits per MB tables
The baseline bits per MB prediction tables have been
re-calibrated based on the assumption that bits per MB
is inversely proportional to the quantizer level.

Change-Id: Ibd355c7acac4b8053dda1baf1032fe35f11da7f7
2010-11-22 13:17:35 +00:00
Paul Wilkins
1753f0d208 Merge "Added extra two pass stats gathering." 2010-11-22 04:11:20 -08:00
Paul Wilkins
70b885a0e8 Added extra two pass stats gathering.
Added code to record spend so far against planned budget.

Change-Id: I5a3335346fa1771b2b1219df9f6127f9993d2594
2010-11-19 14:12:33 -05:00
Pascal Massimino
ed5ab7fa49 remove warning
was having: "vp8/encoder/onyx_if.c:5365: warning: comparison of unsigned expression >= 0 is always true"
2010-11-17 16:50:02 -08:00
Scott LaVarnway
9a6740af80 Merge "Removed unnecessary checks." 2010-11-17 11:28:22 -08:00
Scott LaVarnway
f7670acc68 Removed unnecessary checks.
macro_block_yrd and vp8_rdcost_mby are not called for SPLITMV.

Change-Id: I2224d3c8725df526d48426447482768d543752f1
2010-11-17 14:25:48 -05:00
Paul Wilkins
f874391e02 Replaced recode loop test with a function call
Replaced existing code to decide if a frame recode is required
with a function call. This is to simplify addition of extra clauses
that may be needed for the planned constrained quality mode.

Also fixed a bug whereby the alt ref was not considered in the test.

Change-Id: I3d40bb21abe3e19f8456761e6849deb171738b60
2010-11-17 15:12:04 +00:00
John Koleszar
7ee516d2b3 vp8_set_maps: remove hard-coded width/height
The example for disabling the active map used a hard-coded 320x240
resolution, rather than using what was passed on the command line.

Fixes #218

Change-Id: I3aed713e8aa7fcbf18dfbffd57f142b5cd9ee492
2010-11-17 09:24:05 -05:00
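
Note: a small sketch of the corrected idea (encoder setup omitted): size the active map from the configured frame dimensions rather than a hard-coded 320x240, then mark every macroblock active. The wrapper name is hypothetical; vpx_active_map_t and VP8E_SET_ACTIVEMAP come from the public vpx headers.

    #include <stdlib.h>
    #include <string.h>
    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static int set_all_active(vpx_codec_ctx_t *encoder,
                              const vpx_codec_enc_cfg_t *cfg)
    {
        vpx_active_map_t map;
        int ret;

        map.cols = (cfg->g_w + 15) / 16;   /* macroblocks across */
        map.rows = (cfg->g_h + 15) / 16;   /* macroblocks down */
        map.active_map = malloc(map.rows * map.cols);
        if (!map.active_map)
            return -1;
        memset(map.active_map, 1, map.rows * map.cols);  /* all MBs active */

        ret = vpx_codec_control(encoder, VP8E_SET_ACTIVEMAP, &map);
        free(map.active_map);
        return ret;
    }
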
John Koleszar
8d94796cad vp8mt_alloc_temp_buffers: make prototype return void
This function was never called in a context expecting a return value,
the return value was always a constant, and the !CONFIG_MULTITHREAD
path didn't have a return statement, which caused a compiler warning.
This patch changes the function to return void instead.

Fixes issue #231

Change-Id: I9ef7f56e54418b7265026c54fc4ed5660c1418d1
2010-11-17 09:13:57 -05:00
John Koleszar
79e2b1f39b Disable compile warning for ERROR macro
The ERROR macro collides with the MS SDK on Windows. Since we're not
making any win32 calls in this function, just #undef it first to take
ownership.

Change-Id: Ic18c60dfa3a33c52e6c49d3f4f8d3e7e3ac3341d
2010-11-17 09:08:51 -05:00
Fritz Koenig
99d02c0f9f Merge "Comments for alt ref flags." 2010-11-16 16:11:39 -08:00
Fritz Koenig
69ee697fef Comments for alt ref flags.
Clarify what the alt ref flags do when encoding.

Change-Id: I71f78e0f42edae633fb91840f29dfbe64362c44c
2010-11-16 15:16:24 -08:00
Yaowu Xu
4fedfa75f8 Merge "correct errors in token alphabet descriptions" 2010-11-16 14:06:44 -08:00
tomfinegan
faaa57b945 Add x86_64-darwin10-gcc target.
Adds native build configuration for Snow Leopard.  Useful when
users configure without arguments on OSX 10.6.

Change-Id: I0bd63912a25bbfb9d4c8d58a781d0f390792429c
2010-11-16 14:52:05 -05:00
Yaowu Xu
d49da085c0 correct errors in token alphabet descriptions
There were a few errors in the comment section that describes the VP8 token
alphabet table.

Change-Id: Ie6728a0e08bc3798893221b60408d5b201064bdc
2010-11-16 10:51:43 -08:00
Fritz Koenig
e180255375 Remove stack shadowing for x86-x64 for SAD functions.
x86-64 passes arguments in registers.  There is no need to push
them to the stack before using them.

This fixes 15acc84f10 where ebx
was not getting preserved on x86.

Change-Id: I1214b5f818a0201f75ab6ad7d5c6f448e09b16c2
2010-11-15 10:56:02 -08:00
Paul Wilkins
f4709d2895 Merge "Bad cost tables used in ARNR filtering." 2010-11-15 09:55:35 -08:00
Paul Wilkins
373f5c3144 Bad cost tables used in ARNR filtering.
The use of incorrect mv costing tables in the ARNR sub-pel
filtering code led to corruption of the altref buffer in some cases,
particularly at low data rates.

The average gain from this fix is about 0.3% but there are a few
extreme cases where nasty and visible artifacts manifested and
for these few data points the improvement is > 10%.

PGW and AWG

Change-Id: I95cc02b196a433e71d0d2bd2b933fe68ed31e796
2010-11-15 17:47:12 +00:00
Yaowu Xu
73189f21b3 Merge "make rdmult adaptive for intra in quantizer RDO" 2010-11-15 09:22:45 -08:00
Frank Galligan
8c2dfde3ed Fixed bug first cluster timecode of webm file is wrong.
When the first pts equaled 0, ivfenc incorrectly increased the
pts by 1. I changed the pts and last pts to be signed, and set
the default value of the last pts to -1.

Change-Id: I30bcec5af9b16d93fa9e3abbea7764b133e9cd73
2010-11-12 11:48:17 -05:00
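A hedged sketch of the described fix (names are illustrative): with signed timestamps and last_pts defaulting to -1, a first frame at pts 0 already satisfies pts > last_pts and is no longer bumped; only genuine collisions are adjusted.

    #include <stdint.h>

    /* Sketch: keep pts strictly increasing without disturbing a valid 0. */
    static int64_t next_pts(int64_t pts, int64_t *last_pts /* starts at -1 */)
    {
        if (pts <= *last_pts)
            pts = *last_pts + 1;   /* bump only on a real collision */
        *last_pts = pts;
        return pts;
    }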
Yaowu Xu
ef2f27f10e make rdmult adaptive for intra in quantizer RDO
This is intended to correct VP8's tendency to aggressively favor rate
on intra coded frames. Experiments tested different numbers in [0, 1]
and found that 9/16 overall provided about 2-4% gains for all-intra coded
clips based on the vpx-ssim metric. The impact on regular encoded clips
is much smaller but positive overall. The overall impact on PSNR is also
positive, even though very small.

Change-Id: If808553aaaa87fdd44691f9787820ac9856d9f8a
2010-11-11 11:33:35 -08:00
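As a rough sketch of the idea (exactly where the encoder applies the factor is not shown in the message; the helper below is illustrative), the rate-distortion multiplier used during quantizer RDO would be scaled by 9/16 for intra coding.

    /* Sketch: weight rate less aggressively for intra blocks. */
    static int intra_rd_mult(int rdmult)
    {
        return (rdmult * 9) >> 4;   /* rdmult * 9/16 */
    }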
John Koleszar
0a49747b01 quantizer: fix assertion in fast quantizer path
The fast quantizer assembly code has not been updated to match the new
exact quantizer, which was made the default in commit 6adbe09.
Specifically, it is not aware of the potential for the coefficient
to be scaled, which results in the quantized result exceeding the range
of the DCT. This patch restores the previous behavior of using the
non-shifted coefficients when in the fast quantizer code path, but
unfortunately requires rebuilding the tables when switching between the
two.

Change-Id: I0a33f5b3850335011a06906f49fafed54dda9546
2010-11-11 13:05:20 -05:00
Fritz Koenig
58083cb34d Revert "Remove stack shadowing for x86-64"
This reverts commit 15acc84f10.

Change-Id: Ia640be8cbc134432914849c1750f62575ea084e6
2010-11-11 08:20:02 -08:00
Paul Wilkins
213f7b0907 Merge "Relax rate control for last few frames" 2010-11-11 02:39:20 -08:00
Fritz Koenig
692b10858d configure : Incorrect syntax in configure
The check for whether postproc was enabled when enabling the
postproc visualizer was wrong.

Fix for bug introduced in Change Ia74f357d

Change-Id: I4bee9ad2caee3cfe3bac6972047f6af7c54cad4e
2010-11-10 14:54:59 -08:00
Fritz Koenig
9b1ece2cca Merge "Remove stack shadowing for x86-64" 2010-11-10 14:36:10 -08:00
Fritz Koenig
5f0e0617ba FDCT optimizations.
Fixed up the FDCT for MMX and 8x4 SSE2 to match the
most recent changes.

Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719
2010-11-10 14:34:02 -08:00
Fritz Koenig
647df00f30 postproc : Re-work postproc calling to allow more flags.
Debugging in postproc needs more flags to allow for specific
block types to be turned on or off in the visualizations.

Must be enabled with --enable-postproc-visualizer during
configuration time.

Change-Id: Ia74f357ddc3ad4fb8082afd3a64f62384e4fcb2d
2010-11-10 14:14:46 -08:00
Paul Wilkins
513f8e6814 Relax rate control for last few frames
VBR rate control can become very noisy for the last few frames.
If there are a few bits to spare or a small overshoot then the
target rate and hence quantizer may start to fluctuate wildly.

This patch prevents further adjustment of the active Q limits for
the last few frames.

Patch also removes some redundant variables and makes one small bug fix.

Change-Id: Ic167831bec79acc9f0d7e4698bcc4bb188840c45
2010-11-10 10:09:45 +00:00
Paul Wilkins
6adbe09058 Tuning for the more exact quantizer.
Small changes to the default zero bin and rounding tables.
Though the tables are currently the same for the Y1 and Y2 cases
I have left them as separate tables in case we want to tune this later.

There is now some adjustment of the zbin based on the prediction mode.
Previously this was restricted to an adjustment for gf/arf 0,0 MV.

The exact quantizer now marginally outperforms and is the default.

The overall average gain is about 0.5%

Change-Id: I5e4353f3d5326dde4e86823684b236a1e9ea7f47
2010-11-10 09:52:58 +00:00
John Koleszar
458f4fedd2 Merge "improve average framerate calculation" 2010-11-09 08:52:16 -08:00
John Koleszar
4d1b0d2a2d Merge commit 'fix integer promotion bug in partition size check'
Change-Id: I4081917b46013fa8f4218cade8bd12cb2d013aee
2010-11-05 16:49:32 -04:00
John Koleszar
9fb80f7170 fix integer promotion bug in partition size check
The check '(user_data_end - partition < partition_size)' must be
evaluated as a signed comparison, but because partition_size was
unsigned, the LHS was promoted to unsigned, causing an incorrect
result on 32-bit. Instead, check the upper and lower bounds of
the segment separately.

Change-Id: I6266aba7fd7de084268712a3d2a81424ead7aa06
2010-11-05 14:52:53 -04:00
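A self-contained illustration of the promotion pitfall (the surrounding decoder code is not reproduced; only the names from the commit text are reused): because partition_size is unsigned, the signed pointer difference on the left is converted to unsigned on 32-bit targets, so a negative difference compares as a huge positive value and the check passes when it should fail.

    #include <stddef.h>

    /* Buggy form: (user_data_end - partition) is a signed ptrdiff_t, but the
     * unsigned right-hand side promotes the comparison to unsigned on 32-bit. */
    static int bad_check(const unsigned char *partition,
                         const unsigned char *user_data_end,
                         unsigned int partition_size)
    {
        return user_data_end - partition < partition_size;
    }

    /* Fixed form, per the commit: check the bounds of the segment separately. */
    static int good_check(const unsigned char *partition,
                          const unsigned char *user_data_end,
                          unsigned int partition_size)
    {
        return partition + partition_size > user_data_end ||
               partition + partition_size < partition;
    }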
John Koleszar
f7e187d362 improve average framerate calculation
Change Ice204e86 identified a problem with bitrate undershoot due to
low precision in the timestamps passed to the library. This patch
takes a different approach by calculating the duration of this frame
and passing it to the library, rather than using a fixed duration
and letting the library average it out with higher precision
timestamps. This part of the fix only applies to vpxenc.

This patch also attempts to fix the problem for generic applications
that may have made the same mistake vpxenc did. Instead of
calculating this frame's duration by the difference of this frame's
and the last frame's start time, we use the end times instead. This
allows the framerate calculation to scavenge "unclaimed" time from
the last frame. For instance:

  start |  end  | calculated duration
  ======+=======+====================
    0ms    33ms   33ms
   33ms    66ms   33ms
   66ms    99ms   33ms
  100ms   133ms   34ms

Change-Id: I92be4b3518e0bd530e97f90e69e75330a4c413fc
2010-11-05 08:42:46 -04:00
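A minimal sketch of the end-time approach (variable names are illustrative): deriving this frame's duration from the difference of consecutive end timestamps lets any time left unclaimed by the previous frame's rounding carry into the current one, as in the 100ms/133ms row above.

    #include <stdint.h>

    /* Sketch: per-frame duration from end times rather than start times. */
    static int64_t frame_duration(int64_t this_end, int64_t *last_end)
    {
        int64_t duration = this_end - *last_end;   /* picks up rounding slack */
        *last_end = this_end;
        return duration;
    }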
John Koleszar
5551ef0ef4 Merge "vpxdec: report parse errors from webm_guess_framerate()" 2010-11-04 19:18:53 -07:00
John Koleszar
bd05d9e480 vpxdec: report parse errors from webm_guess_framerate()
If this function fails silently, the nestegg context is destroyed and
future nestegg calls will segfault.

Change-Id: Ie6a0ea284ab9ddfa97b1843ef8030a953937c8cd
2010-11-04 14:56:48 -04:00
Fritz Koenig
507eb4b577 Merge "postproc : Update visualizations." 2010-11-04 11:28:18 -07:00
Fritz Koenig
0e7b60617f postproc : Update visualizations.
Change color reference frame to blend the macro block edge.
This helps with layering of visualizations.

Add block coloring for intra prediction modes.

Change-Id: Icefe0e189e26719cd6937cebd6727efac0b4d278
2010-11-04 10:35:02 -07:00
Yaowu Xu
a5397dbaf1 Increase the resolution of default timebase
The old value 1000 was too low, which caused the effective duration and
frame rate calculation to have a 1% error for typical 30 frame/second
inputs. The symptom of the issue was that most two-pass encodings were
undershooting the target bit rate by 1% or so for 30 fps input.

Change-Id: Ice204e86f844ceb9ce973456f2b995cc095283cf
2010-11-04 09:26:47 +00:00
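The 1% figure is plain truncation arithmetic: with a 1/1000-second timebase, a 30 fps frame lasting 1/30 s = 33.33 ms gets a 33-tick step, so durations and rate targets are computed against 33/33.33, about 0.99 of the real time. A small sketch of the effect, with illustrative parameters:

    /* Sketch: relative error from truncating frame durations to a coarse
     * timebase (e.g. den = 1000, fps = 30 gives roughly 0.01). */
    static double timebase_error(long timebase_den, double fps)
    {
        double exact_ticks = timebase_den / fps;   /* 33.33 for 1000/30 */
        long   trunc_ticks = (long)exact_ticks;    /* 33                */
        return 1.0 - trunc_ticks / exact_ticks;    /* ~1%               */
    }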
John Koleszar
77e6b4504b vpxenc: require width and height for raw streams
Defaulting to 320x240 for raw streams is arbitrary and error-prone.
Instead, require that the width and height be set manually if they
can't be parsed from the input file.

Change-Id: Ic61979857e372eed0779c2677247e894f9fd6160
2010-11-03 13:58:44 -04:00
John Koleszar
4b9dc57260 Merge "fix pipe support on windows" 2010-11-02 17:01:54 -07:00
Fritz Koenig
0a29bd9793 postproc : Fix display of motion vectors.
Split motion vectors were all being treated as 4x4
blocks.  Now correctly handle 16x8, 8x16, 8x8, 4x4
blocks.

Change-Id: Icf345c5e69b5e374e12456877ed7c41213ad88cc
2010-11-02 13:29:13 -07:00
Scott LaVarnway
b8f43aec66 Merge "SSSE3 version of fast quantizer" 2010-11-02 06:27:29 -07:00
John Koleszar
c377bf0eec fix pipe support on windows
STDIO streams are opened in text mode by default on Windows. This patch
changes the stdin/stdout streams to be in binary mode if they are being
used for I/O from the vpxenc or vpxdec tools.

Fixes issue #216. Thanks to mw AT hesotech.de for the fix.

Change-Id: I34525b3ce2a4a031d5a48d36df4667589372225b
2010-11-02 09:14:24 -04:00
Fritz Koenig
90c505f218 Merge "postproc : Added SPLITMV visualization, fix line constrain." 2010-11-01 14:41:41 -07:00
Fritz Koenig
9f61a83bf9 postproc : Added SPLITMV visualization, fix line constrain.
Now draw 16 vectors for SPLITMV mode.

Fixed divide-by-zero issues when constraining lines to blocks.

Blend block was not centering the shaded area correctly.

Change-Id: I1edabd8b4e553aac8d980f7b45c80159e9202434
2010-11-01 13:27:13 -07:00
Scott LaVarnway
ff4a71f4c2 SSSE3 version of fast quantizer
(test clip: tulip)
For good quality mode with speed=1, this gave the encoder
a small (2 - 3%) performance boost.

Change-Id: I8a1d4269465944ac0819986c2f0be4b0a2ee0b35
2010-11-01 16:24:15 -04:00
Scott LaVarnway
20745f8442 Merge "Finding first label" 2010-11-01 08:42:37 -07:00
John Koleszar
0684c647ef cosmetic: remove alt_ref from vpxenc usage message
Undo an automatic transform.

Change-Id: Ie730a6a31b4680b34e54b61691d67c4b3ed2f2aa
2010-10-29 11:07:31 -04:00
Scott LaVarnway
dcee88ea37 Finding first label
Using tables for the label count and label offset.

Change-Id: Iac3d5b292c37341a881be0af282f5cac3b3e01eb
2010-10-29 10:01:04 -04:00
Yunqing Wang
6614563b8f Save XMM registers in asm functions
XMM6/7 are used in these functions, and need to be saved.

Change-Id: I3dfaddaf2a69cd4bf8e8735c7064b17bac5a14e5
2010-10-28 16:59:03 -04:00
Yunqing Wang
f57fc7bcc6 Merge "Fix full-search SAD function crash in Visual Studio" 2010-10-28 13:46:35 -07:00
John Koleszar
9d93dabee0 Merge branch 'aylesbury' 2010-10-28 16:01:03 -04:00
Yunqing Wang
7e3a1e7361 Fix full-search SAD function crash in Visual Studio
Unlike GCC, the Visual Studio compiler doesn't allocate the SAD output
array 16-byte aligned, which causes a crash in Visual Studio.

Change-Id: Ia755cf5a807f12929bda8db94032bb3c9d0c2362
2010-10-28 15:26:58 -04:00
Timothy B. Terriberry
c4d7e5e67e Eliminate more warnings.
This eliminates a large set of warnings exposed by the Mozilla build
 system (Use of C++ comments in ISO C90 source, commas at the end of
 enum lists, a couple incomplete initializers, and signed/unsigned
 comparisons).
It also eliminates many (but not all) of the warnings exposed by newer
 GCC versions and _FORTIFY_SOURCE (e.g., calling fread and fwrite
 without checking the return values).
There are a few spurious warnings left on my system:

../vp8/encoder/encodemb.c:274:9: warning: 'sz' may be used
 uninitialized in this function
gcc seems to be unable to figure out that the value shortcut doesn't
 change between the two if blocks that test it here.

../vp8/encoder/onyx_if.c:5314:5: warning: comparison of unsigned
 expression >= 0 is always true
../vp8/encoder/onyx_if.c:5319:5: warning: comparison of unsigned
 expression >= 0 is always true
This is true, so far as it goes, but it's comparing against an enum, and the C
 standard does not mandate that enums be unsigned, so the checks can't be
 removed.

Change-Id: Iaf689ae3e3d0ddc5ade00faa474debe73b8d3395
2010-10-27 18:08:04 -07:00
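A stand-alone example of the last pair of warnings (hypothetical names, not the onyx_if.c code): when a compiler picks an unsigned underlying type for the enum, the lower-bound check below warns as always true, yet it cannot be removed because another conforming compiler may pick a signed type.

    enum mode { MODE_A, MODE_B, MODE_C };

    /* Sketch: 'm >= MODE_A' may warn as always-true when the enum is
     * unsigned, but the C standard does not require that. */
    static int mode_is_valid(enum mode m)
    {
        return m >= MODE_A && m <= MODE_C;
    }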
Fritz Koenig
2b4913eb0d Merge "postproc: Tweaks to line drawing and blending." 2010-10-27 13:20:56 -07:00
Fritz Koenig
a097e18964 postproc: Tweaks to line drawing and blending.
Turned down the blending level to make colored blocks obscure
the video less.
Not blending the entire block to give distinction to macro
block edges.
Added configuration so that macro block blending function can
be optimized.
Changed the constrain-line code with respect to when dx and dy are computed.
Now draw two lines to form an arrow.

Change-Id: Id3ef0fdeeab2949a6664b2c63e2a3e1a89503f6c
2010-10-27 13:20:03 -07:00
John Koleszar
f26fe7d93b Merge "Output the PSNR for the entire file." 2010-10-27 12:06:23 -07:00
Frank Galligan
3d84da6b8d Output the PSNR for the entire file.
If --psnr option is enabled vpxenc will output PSNR values for the
entire file. Added a \n before final output to make sure the output
is on its own line. Overall and Avg psnr matches the values written
to opsnr.stt file.

Change-Id: I869268b704fe8b0c8389d318cceb6072fea102f8
2010-10-27 14:31:07 -04:00
Yunqing Wang
71ecb5d7d9 Full search SAD function optimization in SSE4.1
Use mpsadbw, and calculate 8 SADs at once. Function list:
vp8_sad16x16x8_sse4
vp8_sad16x8x8_sse4
vp8_sad8x16x8_sse4
vp8_sad8x8x8_sse4
vp8_sad4x4x8_sse4

(test clip: tulip)
For best quality mode, this gave the encoder a 5% performance boost.
For good quality mode with speed=1, this gave the encoder a 3%
performance boost.

Change-Id: I083b5a39d39144f88dcbccbef95da6498e490134
2010-10-27 13:36:31 -04:00
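A hedged sketch of what mpsadbw provides (this is the bare SSE4.1 intrinsic, not the library's vp8_sad16x16x8_sse4 kernel): one instruction yields eight 16-bit SADs of a 4-byte source group against eight consecutive 4-byte windows of the reference row.

    #include <string.h>
    #include <smmintrin.h>   /* SSE4.1 */

    /* Sketch: 8 SADs of the first 4 src bytes vs. ref byte offsets 0..7. */
    static __m128i sad_8_positions(const unsigned char *src,
                                   const unsigned char *ref)
    {
        __m128i r = _mm_loadu_si128((const __m128i *)ref);
        int s4;
        memcpy(&s4, src, 4);                      /* avoid unaligned access */
        return _mm_mpsadbw_epu8(r, _mm_cvtsi32_si128(s4), 0);
    }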
Fritz Koenig
15acc84f10 Remove stack shadowing for x86-64
x86-64 passes most arguments in registers.  There is no need to
push them to the stack before using them.

Change-Id: I13c683f1358782682ecafaf1df3fb0af23b978ea
2010-10-21 10:28:08 -07:00
Timothy B. Terriberry
8d0f7a01e6 Add simple version of activity masking.
This uses MB variance to change the RDO weight for mode decision
 and quantization.
Activity is normalized against the average for the frame, which is
 currently tracked using feed-forward statistics.
This could also be used to adjust the quantizer for the entire
 frame, but that requires more extensive rate control changes.
This does not yet attempt to adapt the quantizer within the frame,
 but the signaling cost means that will likely only be useful at
 very high rates.

Change-Id: I26cd7c755cac3ff33cfe0688b1da50b2b87b9c93
2010-10-12 08:41:03 -04:00
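A rough sketch of the normalization described above (how the weight feeds into the encoder's RD macros, and in which direction, is not spelled out in the message; the names are illustrative): per-MB activity divided by the frame average gives a ratio that scales the RD weight.

    /* Sketch: scale an RD multiplier by normalized macroblock activity. */
    static int activity_masked_rdmult(int rdmult,
                                      unsigned int mb_variance,
                                      unsigned int avg_frame_variance)
    {
        double norm = (double)(mb_variance + 1) / (avg_frame_variance + 1);
        return (int)(rdmult * norm + 0.5);
    }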
153 changed files with 7293 additions and 6080 deletions

45
args.c
View File

@@ -135,6 +135,17 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs)
def->long_name, long_val); def->long_name, long_val);
fprintf(fp, " %-37s\t%s\n", option_text, def->desc); fprintf(fp, " %-37s\t%s\n", option_text, def->desc);
if(def->enums)
{
const struct arg_enum_list *listptr;
fprintf(fp, " %-37s\t ", "");
for(listptr = def->enums; listptr->name; listptr++)
fprintf(fp, "%s%s", listptr->name,
listptr[1].name ? ", " : "\n");
}
} }
} }
@@ -218,3 +229,37 @@ struct vpx_rational arg_parse_rational(const struct arg *arg)
return rat; return rat;
} }
int arg_parse_enum(const struct arg *arg)
{
const struct arg_enum_list *listptr;
long int rawval;
char *endptr;
/* First see if the value can be parsed as a raw value */
rawval = strtol(arg->val, &endptr, 10);
if (arg->val[0] != '\0' && endptr[0] == '\0')
{
/* Got a raw value, make sure it's valid */
for(listptr = arg->def->enums; listptr->name; listptr++)
if(listptr->val == rawval)
return rawval;
}
/* Next see if it can be parsed as a string */
for(listptr = arg->def->enums; listptr->name; listptr++)
if(!strcmp(arg->val, listptr->name))
return listptr->val;
die("Option %s: Invalid value '%s'\n", arg->name, arg->val);
return 0;
}
int arg_parse_enum_or_int(const struct arg *arg)
{
if(arg->def->enums)
return arg_parse_enum(arg);
return arg_parse_int(arg);
}

12
args.h
View File

@@ -22,14 +22,23 @@ struct arg
const struct arg_def *def; const struct arg_def *def;
}; };
struct arg_enum_list
{
const char *name;
int val;
};
#define ARG_ENUM_LIST_END {0}
typedef struct arg_def typedef struct arg_def
{ {
const char *short_name; const char *short_name;
const char *long_name; const char *long_name;
int has_val; int has_val;
const char *desc; const char *desc;
const struct arg_enum_list *enums;
} arg_def_t; } arg_def_t;
#define ARG_DEF(s,l,v,d) {s,l,v,d} #define ARG_DEF(s,l,v,d) {s,l,v,d, NULL}
#define ARG_DEF_ENUM(s,l,v,d,e) {s,l,v,d,e}
#define ARG_DEF_LIST_END {0} #define ARG_DEF_LIST_END {0}
struct arg arg_init(char **argv); struct arg arg_init(char **argv);
@@ -41,4 +50,5 @@ char **argv_dup(int argc, const char **argv);
unsigned int arg_parse_uint(const struct arg *arg); unsigned int arg_parse_uint(const struct arg *arg);
int arg_parse_int(const struct arg *arg); int arg_parse_int(const struct arg *arg);
struct vpx_rational arg_parse_rational(const struct arg *arg); struct vpx_rational arg_parse_rational(const struct arg *arg);
int arg_parse_enum_or_int(const struct arg *arg);
#endif #endif
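A hypothetical usage of the enum support added above (the option name and values are made up for illustration): an arg_def declared with ARG_DEF_ENUM accepts either a symbolic name or its raw integer value through arg_parse_enum_or_int().

    static const struct arg_enum_list mode_enum[] = {
        {"good", 0},
        {"best", 1},
        ARG_ENUM_LIST_END
    };
    static const arg_def_t mode_arg =
        ARG_DEF_ENUM(NULL, "mode", 1, "Encoding mode", mode_enum);

    /* In the option loop: arg_parse_enum_or_int(&arg) accepts "best" or "1". */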

View File

@@ -17,6 +17,8 @@ for i; do
on_of=1 on_of=1
elif [ "$i" == "-v" ]; then elif [ "$i" == "-v" ]; then
verbose=1 verbose=1
elif [ "$i" == "-g" ]; then
args="${args} --debug"
elif [ "$on_of" == "1" ]; then elif [ "$on_of" == "1" ]; then
outfile=$i outfile=$i
on_of=0 on_of=0

View File

@@ -78,6 +78,7 @@ Build options:
--log=yes|no|FILE file configure log is written to [config.err] --log=yes|no|FILE file configure log is written to [config.err]
--target=TARGET target platform tuple [generic-gnu] --target=TARGET target platform tuple [generic-gnu]
--cpu=CPU optimize for a specific cpu rather than a family --cpu=CPU optimize for a specific cpu rather than a family
--extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS]
${toggle_extra_warnings} emit harmless warnings (always non-fatal) ${toggle_extra_warnings} emit harmless warnings (always non-fatal)
${toggle_werror} treat warnings as errors, if possible ${toggle_werror} treat warnings as errors, if possible
(not available with all compilers) (not available with all compilers)
@@ -442,6 +443,9 @@ process_common_cmdline() {
;; ;;
--cpu=*) tune_cpu="$optval" --cpu=*) tune_cpu="$optval"
;; ;;
--extra-cflags=*)
extra_cflags="${optval}"
;;
--enable-?*|--disable-?*) --enable-?*|--disable-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
echo "${CMDLINE_SELECT} ${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null || die_unknown $opt echo "${CMDLINE_SELECT} ${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null || die_unknown $opt
@@ -547,6 +551,10 @@ process_common_toolchain() {
tgt_isa=universal tgt_isa=universal
tgt_os=darwin9 tgt_os=darwin9
;; ;;
*darwin10*)
tgt_isa=x86_64
tgt_os=darwin10
;;
*mingw32*|*cygwin*) *mingw32*|*cygwin*)
[ -z "$tgt_isa" ] && tgt_isa=x86 [ -z "$tgt_isa" ] && tgt_isa=x86
tgt_os=win32 tgt_os=win32
@@ -606,6 +614,12 @@ process_common_toolchain() {
add_ldflags "-isysroot /Developer/SDKs/MacOSX10.5.sdk" add_ldflags "-isysroot /Developer/SDKs/MacOSX10.5.sdk"
add_ldflags "-mmacosx-version-min=10.5" add_ldflags "-mmacosx-version-min=10.5"
;; ;;
*-darwin10-*)
add_cflags "-isysroot /Developer/SDKs/MacOSX10.6.sdk"
add_cflags "-mmacosx-version-min=10.6"
add_ldflags "-isysroot /Developer/SDKs/MacOSX10.6.sdk"
add_ldflags "-mmacosx-version-min=10.6"
;;
esac esac
# Handle Solaris variants. Solaris 10 needs -lposix4 # Handle Solaris variants. Solaris 10 needs -lposix4
@@ -655,7 +669,7 @@ process_common_toolchain() {
check_add_cflags -march=${tgt_isa} check_add_cflags -march=${tgt_isa}
check_add_asflags -march=${tgt_isa} check_add_asflags -march=${tgt_isa}
fi fi
enabled debug && add_asflags -g
asm_conversion_cmd="${source_path}/build/make/ads2gas.pl" asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
;; ;;
rvct) rvct)
@@ -680,16 +694,24 @@ process_common_toolchain() {
arch_int=${tgt_isa##armv} arch_int=${tgt_isa##armv}
arch_int=${arch_int%%te} arch_int=${arch_int%%te}
check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\"" check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
enabled debug && add_asflags -g
add_cflags --gnu
add_cflags --enum_is_int
add_cflags --wchar32
;; ;;
esac esac
case ${tgt_os} in case ${tgt_os} in
none*)
disable multithread
disable os_support
;;
darwin*) darwin*)
SDK_PATH=/Developer/Platforms/iPhoneOS.platform/Developer SDK_PATH=/Developer/Platforms/iPhoneOS.platform/Developer
TOOLCHAIN_PATH=${SDK_PATH}/usr/bin TOOLCHAIN_PATH=${SDK_PATH}/usr/bin
CC=${TOOLCHAIN_PATH}/gcc CC=${TOOLCHAIN_PATH}/gcc
AR=${TOOLCHAIN_PATH}/ar AR=${TOOLCHAIN_PATH}/ar
LD=${TOOLCHAIN_PATH}/arm-apple-darwin9-gcc-4.2.1 LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-gcc-4.2.1
AS=${TOOLCHAIN_PATH}/as AS=${TOOLCHAIN_PATH}/as
STRIP=${TOOLCHAIN_PATH}/strip STRIP=${TOOLCHAIN_PATH}/strip
NM=${TOOLCHAIN_PATH}/nm NM=${TOOLCHAIN_PATH}/nm
@@ -703,14 +725,14 @@ process_common_toolchain() {
add_cflags -arch ${tgt_isa} add_cflags -arch ${tgt_isa}
add_ldflags -arch_only ${tgt_isa} add_ldflags -arch_only ${tgt_isa}
add_cflags "-isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS3.1.sdk" add_cflags "-isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.sdk"
# This should be overridable # This should be overridable
alt_libc=${SDK_PATH}/SDKs/iPhoneOS3.1.sdk alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.2.sdk
# Add the paths for the alternate libc # Add the paths for the alternate libc
# for d in usr/include usr/include/gcc/darwin/4.0/; do # for d in usr/include usr/include/gcc/darwin/4.0/; do
for d in usr/include usr/include/gcc/darwin/4.0/ usr/lib/gcc/arm-apple-darwin9/4.0.1/include/; do for d in usr/include usr/include/gcc/darwin/4.0/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do
try_dir="${alt_libc}/${d}" try_dir="${alt_libc}/${d}"
[ -d "${try_dir}" ] && add_cflags -I"${try_dir}" [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
done done
@@ -732,13 +754,9 @@ process_common_toolchain() {
|| die "Must supply --libc when targetting *-linux-rvct" || die "Must supply --libc when targetting *-linux-rvct"
# Set up compiler # Set up compiler
add_cflags --gnu
add_cflags --enum_is_int
add_cflags --library_interface=aeabi_glibc add_cflags --library_interface=aeabi_glibc
add_cflags --no_hide_all add_cflags --no_hide_all
add_cflags --wchar32
add_cflags --dwarf2 add_cflags --dwarf2
add_cflags --gnu
# Set up linker # Set up linker
add_ldflags --sysv --no_startup --no_ref_cpp_init add_ldflags --sysv --no_startup --no_ref_cpp_init
@@ -824,6 +842,7 @@ process_common_toolchain() {
soft_enable sse2 soft_enable sse2
soft_enable sse3 soft_enable sse3
soft_enable ssse3 soft_enable ssse3
soft_enable sse4_1
case ${tgt_os} in case ${tgt_os} in
win*) win*)
@@ -879,7 +898,7 @@ process_common_toolchain() {
case ${tgt_os} in case ${tgt_os} in
win*) win*)
add_asflags -f win${bits} add_asflags -f win${bits}
enabled debug && add_asflags -g dwarf2 enabled debug && add_asflags -g cv8
;; ;;
linux*|solaris*) linux*|solaris*)
add_asflags -f elf${bits} add_asflags -f elf${bits}
@@ -961,6 +980,12 @@ EOF
add_cflags -D_LARGEFILE_SOURCE add_cflags -D_LARGEFILE_SOURCE
add_cflags -D_FILE_OFFSET_BITS=64 add_cflags -D_FILE_OFFSET_BITS=64
fi fi
# append any user defined extra cflags
if [ -n "${extra_cflags}" ] ; then
check_add_cflags ${extra_cflags} || \
die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
fi
} }
process_toolchain() { process_toolchain() {

View File

@@ -590,7 +590,7 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
//log_msg("COFF: Symbol table at offset %u\n", symtab_ptr); //log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
//log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr); //log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);
fp = fopen("vpx_asm_offsets.asm", "w"); fp = fopen("assembly_offsets.asm", "w");
if (fp == NULL) if (fp == NULL)
{ {

21
configure vendored
View File

@@ -40,7 +40,7 @@ Advanced options:
${toggle_runtime_cpu_detect} runtime cpu detection ${toggle_runtime_cpu_detect} runtime cpu detection
${toggle_shared} shared library support ${toggle_shared} shared library support
${toggle_small} favor smaller size over speed ${toggle_small} favor smaller size over speed
${toggle_arm_asm_detok} assembly version of the detokenizer (ARM platforms only) ${toggle_postproc_visualizer} macro block / block level visualizers
Codecs: Codecs:
Codecs can be selectively enabled or disabled individually, or by family: Codecs can be selectively enabled or disabled individually, or by family:
@@ -78,11 +78,13 @@ EOF
# alphabetically by architecture, generic-gnu last. # alphabetically by architecture, generic-gnu last.
all_platforms="${all_platforms} armv5te-linux-rvct" all_platforms="${all_platforms} armv5te-linux-rvct"
all_platforms="${all_platforms} armv5te-linux-gcc" all_platforms="${all_platforms} armv5te-linux-gcc"
all_platforms="${all_platforms} armv5te-none-rvct"
all_platforms="${all_platforms} armv5te-symbian-gcc" all_platforms="${all_platforms} armv5te-symbian-gcc"
all_platforms="${all_platforms} armv5te-wince-vs8" all_platforms="${all_platforms} armv5te-wince-vs8"
all_platforms="${all_platforms} armv6-darwin-gcc" all_platforms="${all_platforms} armv6-darwin-gcc"
all_platforms="${all_platforms} armv6-linux-rvct" all_platforms="${all_platforms} armv6-linux-rvct"
all_platforms="${all_platforms} armv6-linux-gcc" all_platforms="${all_platforms} armv6-linux-gcc"
all_platforms="${all_platforms} armv6-none-rvct"
all_platforms="${all_platforms} armv6-symbian-gcc" all_platforms="${all_platforms} armv6-symbian-gcc"
all_platforms="${all_platforms} armv6-wince-vs8" all_platforms="${all_platforms} armv6-wince-vs8"
all_platforms="${all_platforms} iwmmxt-linux-rvct" all_platforms="${all_platforms} iwmmxt-linux-rvct"
@@ -94,6 +96,7 @@ all_platforms="${all_platforms} iwmmxt2-wince-vs8"
all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8
all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8
all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} ppc32-darwin8-gcc" all_platforms="${all_platforms} ppc32-darwin8-gcc"
all_platforms="${all_platforms} ppc32-darwin9-gcc" all_platforms="${all_platforms} ppc32-darwin9-gcc"
@@ -114,6 +117,7 @@ all_platforms="${all_platforms} x86-win32-vs7"
all_platforms="${all_platforms} x86-win32-vs8" all_platforms="${all_platforms} x86-win32-vs8"
all_platforms="${all_platforms} x86-win32-vs9" all_platforms="${all_platforms} x86-win32-vs9"
all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin9-gcc"
all_platforms="${all_platforms} x86_64-darwin10-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc" all_platforms="${all_platforms} x86_64-linux-icc"
all_platforms="${all_platforms} x86_64-solaris-gcc" all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -157,6 +161,7 @@ enable fast_unaligned #allow unaligned accesses, if supported by hw
enable md5 enable md5
enable spatial_resampling enable spatial_resampling
enable multithread enable multithread
enable os_support
[ -d ${source_path}/../include ] && enable alt_tree_layout [ -d ${source_path}/../include ] && enable alt_tree_layout
for d in vp8; do for d in vp8; do
@@ -199,6 +204,7 @@ ARCH_EXT_LIST="
sse2 sse2
sse3 sse3
ssse3 ssse3
sse4_1
altivec altivec
" "
@@ -248,7 +254,8 @@ CONFIG_LIST="
realtime_only realtime_only
shared shared
small small
arm_asm_detok postproc_visualizer
os_support
" "
CMDLINE_SELECT=" CMDLINE_SELECT="
extra_warnings extra_warnings
@@ -287,7 +294,7 @@ CMDLINE_SELECT="
realtime_only realtime_only
shared shared
small small
arm_asm_detok postproc_visualizer
" "
process_cmdline() { process_cmdline() {
@@ -295,7 +302,7 @@ process_cmdline() {
optval="${opt#*=}" optval="${opt#*=}"
case "$opt" in case "$opt" in
--disable-codecs) for c in ${CODECS}; do disable $c; done ;; --disable-codecs) for c in ${CODECS}; do disable $c; done ;;
*) process_common_cmdline $opt *) process_common_cmdline "$opt"
;; ;;
esac esac
done done
@@ -324,8 +331,6 @@ post_process_cmdline() {
for c in ${CODECS}; do for c in ${CODECS}; do
enabled ${c} && enable ${c##*_}s enabled ${c} && enable ${c##*_}s
done done
} }
@@ -535,6 +540,10 @@ process_toolchain() {
# Other toolchain specific defaults # Other toolchain specific defaults
case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
if enabled postproc_visualizer; then
enabled postproc || die "postproc_visualizer requires postproc to be enabled"
fi
} }

View File

@@ -17,6 +17,7 @@ vpxdec.SRCS += md5_utils.c md5_utils.h
vpxdec.SRCS += vpx_ports/vpx_timer.h vpxdec.SRCS += vpx_ports/vpx_timer.h
vpxdec.SRCS += vpx/vpx_integer.h vpxdec.SRCS += vpx/vpx_integer.h
vpxdec.SRCS += args.c args.h vpx_ports/config.h vpxdec.SRCS += args.c args.h vpx_ports/config.h
vpxdec.SRCS += tools_common.c tools_common.h
vpxdec.SRCS += nestegg/halloc/halloc.h vpxdec.SRCS += nestegg/halloc/halloc.h
vpxdec.SRCS += nestegg/halloc/src/align.h vpxdec.SRCS += nestegg/halloc/src/align.h
vpxdec.SRCS += nestegg/halloc/src/halloc.c vpxdec.SRCS += nestegg/halloc/src/halloc.c
@@ -28,6 +29,7 @@ vpxdec.GUID = BA5FE66F-38DD-E034-F542-B1578C5FB950
vpxdec.DESCRIPTION = Full featured decoder vpxdec.DESCRIPTION = Full featured decoder
UTILS-$(CONFIG_ENCODERS) += vpxenc.c UTILS-$(CONFIG_ENCODERS) += vpxenc.c
vpxenc.SRCS += args.c args.h y4minput.c y4minput.h vpxenc.SRCS += args.c args.h y4minput.c y4minput.h
vpxenc.SRCS += tools_common.c tools_common.h
vpxenc.SRCS += vpx_ports/config.h vpx_ports/mem_ops.h vpxenc.SRCS += vpx_ports/config.h vpx_ports/mem_ops.h
vpxenc.SRCS += vpx_ports/mem_ops_aligned.h vpxenc.SRCS += vpx_ports/mem_ops_aligned.h
vpxenc.SRCS += libmkv/EbmlIDs.h vpxenc.SRCS += libmkv/EbmlIDs.h
@@ -91,8 +93,16 @@ vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame
# Handle extra library flags depending on codec configuration # Handle extra library flags depending on codec configuration
CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m
# We should not link to math library (libm) on RVCT
# when building for bare-metal targets
ifeq ($(CONFIG_OS_SUPPORT), yes)
CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m
else
ifeq ($(CONFIG_GCC), yes)
CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m
endif
endif
# #
# End of specified files. The rest of the build rules should happen # End of specified files. The rest of the build rules should happen
# automagically from here. # automagically from here.

View File

@@ -78,8 +78,8 @@ if(frame_cnt + 1 == 22) {
} else if(frame_cnt + 1 == 44) { } else if(frame_cnt + 1 == 44) {
vpx_active_map_t active; vpx_active_map_t active;
active.rows = 240/16; active.rows = cfg.g_h/16;
active.cols = 320/16; active.cols = cfg.g_w/16;
/* pass in null map to disable active_map*/ /* pass in null map to disable active_map*/
active.active_map = NULL; active.active_map = NULL;

31
libs.mk
View File

@@ -230,10 +230,39 @@ endif
# #
# Add assembler dependencies for configuration and offsets # Add assembler dependencies for configuration and offsets
# #
#$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(BUILD_PFX)vpx_asm_offsets.asm
$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
$(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
#
# Calculate platform- and compiler-specific offsets for hand coded assembly
#
ifeq ($(ARCH_ARM), yes)
asm_com_offsets.asm: obj_int_extract
asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
CLEAN-OBJS += asm_com_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
ifeq ($(CONFIG_VP8_ENCODER), yes)
asm_enc_offsets.asm: obj_int_extract
asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
CLEAN-OBJS += asm_enc_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
endif
ifeq ($(CONFIG_VP8_DECODER), yes)
asm_dec_offsets.asm: obj_int_extract
asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
CLEAN-OBJS += asm_dec_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
endif
endif
$(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h) $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
CLEAN-OBJS += $(BUILD_PFX)vpx_version.h CLEAN-OBJS += $(BUILD_PFX)vpx_version.h

View File

@@ -20,8 +20,6 @@
* Still in the public domain. * Still in the public domain.
*/ */
#include <sys/types.h> /* for stupid systems */
#include <string.h> /* for memcpy() */ #include <string.h> /* for memcpy() */
#include "md5_utils.h" #include "md5_utils.h"

24
tools_common.c Normal file
View File

@@ -0,0 +1,24 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include "tools_common.h"
#ifdef _WIN32
#include <io.h>
#include <fcntl.h>
#endif
FILE* set_binary_mode(FILE *stream)
{
(void)stream;
#ifdef _WIN32
_setmode(_fileno(stream), _O_BINARY);
#endif
return stream;
}
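A short usage sketch for the helper above (the open_input wrapper and its caller are illustrative): when the input path is "-", the corresponding stdio stream is switched to binary mode before use, which compiles to a no-op outside Windows.

    #include <stdio.h>
    #include <string.h>
    #include "tools_common.h"

    /* Sketch: open an input path, treating "-" as binary stdin. */
    static FILE *open_input(const char *fn)
    {
        return strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
    }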

View File

@@ -7,16 +7,10 @@
* in the file PATENTS. All contributing project authors may * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#ifndef TOOLS_COMMON_H
#define TOOLS_COMMON_H
/* Sets a stdio stream into binary mode */
#ifndef DETOKENIZE_ARM_H FILE* set_binary_mode(FILE *stream);
#define DETOKENIZE_ARM_H
#if HAVE_ARMV6
#if CONFIG_ARM_ASM_DETOK
void vp8_init_detokenizer(VP8D_COMP *dx);
void vp8_decode_mb_tokens_v6(DETOK *detoken, int type);
#endif
#endif
#endif #endif

View File

@@ -16,10 +16,10 @@
;------------------------------------- ;-------------------------------------
; r0 unsigned char *src_ptr, ; r0 unsigned char *src_ptr,
; r1 unsigned short *output_ptr, ; r1 unsigned short *dst_ptr,
; r2 unsigned int src_pixels_per_line, ; r2 unsigned int src_pitch,
; r3 unsigned int output_height, ; r3 unsigned int height,
; stack unsigned int output_width, ; stack unsigned int width,
; stack const short *vp8_filter ; stack const short *vp8_filter
;------------------------------------- ;-------------------------------------
; The output is transposed stroed in output array to make it easy for second pass filtering. ; The output is transposed stroed in output array to make it easy for second pass filtering.
@@ -27,7 +27,7 @@
stmdb sp!, {r4 - r11, lr} stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp8_filter address ldr r11, [sp, #40] ; vp8_filter address
ldr r4, [sp, #36] ; output width ldr r4, [sp, #36] ; width
mov r12, r3 ; outer-loop counter mov r12, r3 ; outer-loop counter
sub r2, r2, r4 ; src increment for height loop sub r2, r2, r4 ; src increment for height loop
@@ -38,10 +38,10 @@
ldr r5, [r11] ; load up filter coefficients ldr r5, [r11] ; load up filter coefficients
mov r3, r3, lsl #1 ; output_height*2 mov r3, r3, lsl #1 ; height*2
add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
mov r11, r1 ; save output_ptr for each row mov r11, r1 ; save dst_ptr for each row
cmp r5, #128 ; if filter coef = 128, then skip the filter cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_1st_filter beq bil_null_1st_filter
@@ -140,17 +140,17 @@
;--------------------------------- ;---------------------------------
; r0 unsigned short *src_ptr, ; r0 unsigned short *src_ptr,
; r1 unsigned char *output_ptr, ; r1 unsigned char *dst_ptr,
; r2 int output_pitch, ; r2 int dst_pitch,
; r3 unsigned int output_height, ; r3 unsigned int height,
; stack unsigned int output_width, ; stack unsigned int width,
; stack const short *vp8_filter ; stack const short *vp8_filter
;--------------------------------- ;---------------------------------
|vp8_filter_block2d_bil_second_pass_armv6| PROC |vp8_filter_block2d_bil_second_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr} stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp8_filter address ldr r11, [sp, #40] ; vp8_filter address
ldr r4, [sp, #36] ; output width ldr r4, [sp, #36] ; width
ldr r5, [r11] ; load up filter coefficients ldr r5, [r11] ; load up filter coefficients
mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix

View File

@@ -243,8 +243,6 @@ skip_secondpass_hloop
ENDP ENDP
;----------------- ;-----------------
AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data. ;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... ;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_filter8_coeff_ _filter8_coeff_

View File

@@ -10,112 +10,15 @@
#include <math.h> #include <math.h>
#include "filter.h"
#include "subpixel.h" #include "subpixel.h"
#include "arm/bilinearfilter_arm.h"
#define BLOCK_HEIGHT_WIDTH 4
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT 7
static const short bilinear_filters[8][2] =
{
{ 128, 0 },
{ 112, 16 },
{ 96, 32 },
{ 80, 48 },
{ 64, 64 },
{ 48, 80 },
{ 32, 96 },
{ 16, 112 }
};
extern void vp8_filter_block2d_bil_first_pass_armv6
(
unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
);
extern void vp8_filter_block2d_bil_second_pass_armv6
(
unsigned short *src_ptr,
unsigned char *output_ptr,
int output_pitch,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
);
#if 0
void vp8_filter_block2d_bil_first_pass_6
(
unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
)
{
unsigned int i, j;
for ( i=0; i<output_height; i++ )
{
for ( j=0; j<output_width; j++ )
{
/* Apply bilinear filter */
output_ptr[j] = ( ( (int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[1] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;
src_ptr++;
}
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
void vp8_filter_block2d_bil_second_pass_6
(
unsigned short *src_ptr,
unsigned char *output_ptr,
int output_pitch,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
)
{
unsigned int i,j;
int Temp;
for ( i=0; i<output_height; i++ )
{
for ( j=0; j<output_width; j++ )
{
/* Apply filter */
Temp = ((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[output_width] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT/2);
output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
src_ptr++;
}
/* Next row... */
/*src_ptr += src_pixels_per_line - output_width;*/
output_ptr += output_pitch;
}
}
#endif
void vp8_filter_block2d_bil_armv6 void vp8_filter_block2d_bil_armv6
( (
unsigned char *src_ptr, unsigned char *src_ptr,
unsigned char *output_ptr, unsigned char *dst_ptr,
unsigned int src_pixels_per_line, unsigned int src_pitch,
unsigned int dst_pitch, unsigned int dst_pitch,
const short *HFilter, const short *HFilter,
const short *VFilter, const short *VFilter,
@@ -123,15 +26,13 @@ void vp8_filter_block2d_bil_armv6
int Height int Height
) )
{ {
unsigned short FData[36*16]; /* Temp data buffer used in filtering */
unsigned short FData[36*16]; /* Temp data bufffer used in filtering */
/* First filter 1-D horizontally... */ /* First filter 1-D horizontally... */
/* pixel_step = 1; */ vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);
/* then 1-D vertically... */ /* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter); vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
} }
@@ -148,8 +49,8 @@ void vp8_bilinear_predict4x4_armv6
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
HFilter = bilinear_filters[xoffset]; HFilter = vp8_bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset]; VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
} }
@@ -167,8 +68,8 @@ void vp8_bilinear_predict8x8_armv6
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
HFilter = bilinear_filters[xoffset]; HFilter = vp8_bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset]; VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
} }
@@ -186,8 +87,8 @@ void vp8_bilinear_predict8x4_armv6
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
HFilter = bilinear_filters[xoffset]; HFilter = vp8_bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset]; VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
} }
@@ -205,8 +106,8 @@ void vp8_bilinear_predict16x16_armv6
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
HFilter = bilinear_filters[xoffset]; HFilter = vp8_bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset]; VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
} }

View File

@@ -0,0 +1,35 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef BILINEARFILTER_ARM_H
#define BILINEARFILTER_ARM_H
extern void vp8_filter_block2d_bil_first_pass_armv6
(
const unsigned char *src_ptr,
unsigned short *dst_ptr,
unsigned int src_pitch,
unsigned int height,
unsigned int width,
const short *vp8_filter
);
extern void vp8_filter_block2d_bil_second_pass_armv6
(
const unsigned short *src_ptr,
unsigned char *dst_ptr,
int dst_pitch,
unsigned int height,
unsigned int width,
const short *vp8_filter
);
#endif /* BILINEARFILTER_ARM_H */

View File

@@ -11,26 +11,10 @@
#include "vpx_ports/config.h" #include "vpx_ports/config.h"
#include <math.h> #include <math.h>
#include "filter.h"
#include "subpixel.h" #include "subpixel.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#define BLOCK_HEIGHT_WIDTH 4
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT 7
DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
{
{ 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
{ 0, -6, 123, 12, -1, 0 },
{ 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
{ 0, -9, 93, 50, -6, 0 },
{ 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */
{ 0, -6, 50, 93, -9, 0 },
{ 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
{ 0, -1, 12, 123, -6, 0 },
};
extern void vp8_filter_block2d_first_pass_armv6 extern void vp8_filter_block2d_first_pass_armv6
( (
unsigned char *src_ptr, unsigned char *src_ptr,
@@ -93,11 +77,11 @@ void vp8_sixtap_predict_armv6
{ {
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data bufffer used in filtering */ DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* Vfilter is null. First pass only */ /* Vfilter is null. First pass only */
if (xoffset && !yoffset) if (xoffset && !yoffset)
@@ -129,47 +113,6 @@ void vp8_sixtap_predict_armv6
} }
} }
#if 0
void vp8_sixtap_predict8x4_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */
/*if (xoffset && !yoffset)
{
vp8_filter_block2d_first_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
}*/
/* Hfilter is null. Second pass only */
/*else if (!xoffset && yoffset)
{
vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
}
else
{
if (yoffset & 0x1)
vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
else*/
vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
/*}*/
}
#endif
void vp8_sixtap_predict8x8_armv6 void vp8_sixtap_predict8x8_armv6
( (
unsigned char *src_ptr, unsigned char *src_ptr,
@@ -182,10 +125,10 @@ void vp8_sixtap_predict8x8_armv6
{ {
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */ DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
if (xoffset && !yoffset) if (xoffset && !yoffset)
{ {
@@ -224,10 +167,10 @@ void vp8_sixtap_predict16x16_armv6
{ {
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); /* Temp data bufffer used in filtering */ DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); /* Temp data buffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
if (xoffset && !yoffset) if (xoffset && !yoffset)
{ {

View File

@@ -41,13 +41,13 @@ void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{ {
(void) simpler_lpf; (void) simpler_lpf;
vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
if (v_ptr) if (v_ptr)
vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
} }
void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -57,7 +57,7 @@ void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi
(void) v_ptr; (void) v_ptr;
(void) uv_stride; (void) uv_stride;
(void) simpler_lpf; (void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
} }
/* Vertical MB Filtering */ /* Vertical MB Filtering */
@@ -65,13 +65,13 @@ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{ {
(void) simpler_lpf; (void) simpler_lpf;
vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
if (v_ptr) if (v_ptr)
vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
} }
void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -81,7 +81,7 @@ void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi
(void) v_ptr; (void) v_ptr;
(void) uv_stride; (void) uv_stride;
(void) simpler_lpf; (void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
} }
/* Horizontal B Filtering */ /* Horizontal B Filtering */
@@ -94,10 +94,10 @@ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
if (v_ptr) if (v_ptr)
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
} }
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -122,10 +122,10 @@ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
if (v_ptr) if (v_ptr)
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
} }
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -148,10 +148,10 @@ void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{ {
(void) simpler_lpf; (void) simpler_lpf;
vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr); vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
} }
void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -161,7 +161,7 @@ void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsig
(void) v_ptr; (void) v_ptr;
(void) uv_stride; (void) uv_stride;
(void) simpler_lpf; (void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
} }
/* Vertical MB Filtering */ /* Vertical MB Filtering */
@@ -169,10 +169,10 @@ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{ {
(void) simpler_lpf; (void) simpler_lpf;
vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr); vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
} }
void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -182,7 +182,7 @@ void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsig
(void) v_ptr; (void) v_ptr;
(void) uv_stride; (void) uv_stride;
(void) simpler_lpf; (void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
} }
/* Horizontal B Filtering */ /* Horizontal B Filtering */
@@ -195,7 +195,7 @@ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride); vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
} }
void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -220,7 +220,7 @@ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4); vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
} }
void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,


@@ -350,10 +350,7 @@ filt_blk2d_spo16x16_loop_neon
ENDP ENDP
;----------------- ;-----------------
AREA bifilters16_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_bifilter16_coeff_ _bifilter16_coeff_
DCD bifilter16_coeff DCD bifilter16_coeff
bifilter16_coeff bifilter16_coeff


@@ -123,10 +123,7 @@ skip_secondpass_filter
ENDP ENDP
;----------------- ;-----------------
AREA bilinearfilters4_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_bifilter4_coeff_ _bifilter4_coeff_
DCD bifilter4_coeff DCD bifilter4_coeff
bifilter4_coeff bifilter4_coeff


@@ -128,10 +128,7 @@ skip_secondpass_filter
ENDP ENDP
;----------------- ;-----------------
AREA bifilters8x4_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_bifilter8x4_coeff_ _bifilter8x4_coeff_
DCD bifilter8x4_coeff DCD bifilter8x4_coeff
bifilter8x4_coeff bifilter8x4_coeff


@@ -176,10 +176,7 @@ skip_secondpass_filter
ENDP ENDP
;----------------- ;-----------------
AREA bifilters8_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_bifilter8_coeff_ _bifilter8_coeff_
DCD bifilter8_coeff DCD bifilter8_coeff
bifilter8_coeff bifilter8_coeff


@@ -397,7 +397,8 @@
bx lr bx lr
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
AREA loopfilter_dat, DATA, READONLY ;-----------------
_lf_coeff_ _lf_coeff_
DCD lf_coeff DCD lf_coeff
lf_coeff lf_coeff


@@ -104,10 +104,7 @@
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon| ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
;----------------- ;-----------------
AREA hloopfiltery_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_lfhy_coeff_ _lfhy_coeff_
DCD lfhy_coeff DCD lfhy_coeff
lfhy_coeff lfhy_coeff


@@ -145,10 +145,7 @@
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon| ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
;----------------- ;-----------------
AREA vloopfiltery_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_vlfy_coeff_ _vlfy_coeff_
DCD vlfy_coeff DCD vlfy_coeff
vlfy_coeff vlfy_coeff


@@ -505,7 +505,8 @@
bx lr bx lr
ENDP ; |vp8_mbloop_filter_neon| ENDP ; |vp8_mbloop_filter_neon|
AREA mbloopfilter_dat, DATA, READONLY ;-----------------
_mblf_coeff_ _mblf_coeff_
DCD mblf_coeff DCD mblf_coeff
mblf_coeff mblf_coeff


@@ -113,10 +113,7 @@
ENDP ENDP
;----------------- ;-----------------
AREA idct4x4_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_idct_coeff_ _idct_coeff_
DCD idct_coeff DCD idct_coeff
idct_coeff idct_coeff


@@ -476,10 +476,7 @@ secondpass_only_inner_loop_neon
ENDP ENDP
;----------------- ;-----------------
AREA subpelfilters16_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_filter16_coeff_ _filter16_coeff_
DCD filter16_coeff DCD filter16_coeff
filter16_coeff filter16_coeff


@@ -407,10 +407,7 @@ secondpass_filter4x4_only
ENDP ENDP
;----------------- ;-----------------
AREA subpelfilters4_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_filter4_coeff_ _filter4_coeff_
DCD filter4_coeff DCD filter4_coeff
filter4_coeff filter4_coeff


@@ -458,10 +458,7 @@ secondpass_filter8x4_only
ENDP ENDP
;----------------- ;-----------------
AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_filter8_coeff_ _filter8_coeff_
DCD filter8_coeff DCD filter8_coeff
filter8_coeff filter8_coeff


@@ -509,10 +509,7 @@ filt_blk2d_spo8x8_loop_neon
ENDP ENDP
;----------------- ;-----------------
AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_filter8_coeff_ _filter8_coeff_
DCD filter8_coeff DCD filter8_coeff
filter8_coeff filter8_coeff


@@ -0,0 +1,49 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include <stddef.h>
#include "vpx_scale/yv12config.h"
#define ct_assert(name,cond) \
static void assert_##name(void) UNUSED;\
static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
#define DEFINE(sym, val) int sym = val;
/*
#define BLANK() asm volatile("\n->" : : )
*/
/*
* int main(void)
* {
*/
//vpx_scale
DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
//add asserts for any offset that is not supported by assembly code
//add asserts for any size that is not supported by assembly code
/*
* return 0;
* }
*/
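For readers unfamiliar with the ct_assert trick introduced above, here is a minimal, self-contained sketch (not part of the patch) of how the switch-based compile-time assertion behaves; the stand-in struct and the expected offset value are hypothetical.

#include <stddef.h>

/* Hypothetical stand-in for YV12_BUFFER_CONFIG; real offsets differ. */
typedef struct { int y_width; int y_height; int y_stride; } buf_cfg;

/* If cond is false, both case labels evaluate to 0 and the duplicate
 * label is a compile-time error, so a wrong offset can never build. */
#define ct_assert(name, cond) \
    static void assert_##name(void) { switch (0) { case 0: case !!(cond):; } }

/* Assumes a 4-byte int ABI; adjust the constant for other platforms. */
ct_assert(y_stride_offset, offsetof(buf_cfg, y_stride) == 8)

int main(void) { return 0; }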


@@ -282,6 +282,8 @@ typedef struct
void *current_bc; void *current_bc;
int corrupted;
#if CONFIG_RUNTIME_CPU_DETECT #if CONFIG_RUNTIME_CPU_DETECT
struct VP8_COMMON_RTCD *rtcd; struct VP8_COMMON_RTCD *rtcd;
#endif #endif


@@ -36,6 +36,14 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
7, 11, 14, 15, 7, 11, 14, 15,
}; };
DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
{
1, 2, 6, 7,
3, 5, 8, 13,
4, 9, 12, 14,
10, 11, 15, 16
};
DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]); DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]);
const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6}; const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
@@ -106,23 +114,20 @@ static void init_bit_trees()
init_bit_tree(cat6, 11); init_bit_tree(cat6, 11);
} }
static vp8bc_index_t bcc1[1], bcc2[2], bcc3[3], bcc4[4], bcc5[5], bcc6[11];
vp8_extra_bit_struct vp8_extra_bits[12] = vp8_extra_bit_struct vp8_extra_bits[12] =
{ {
{ 0, 0, 0, 0, 0}, { 0, 0, 0, 0},
{ 0, 0, 0, 0, 1}, { 0, 0, 0, 1},
{ 0, 0, 0, 0, 2}, { 0, 0, 0, 2},
{ 0, 0, 0, 0, 3}, { 0, 0, 0, 3},
{ 0, 0, 0, 0, 4}, { 0, 0, 0, 4},
{ cat1, Pcat1, bcc1, 1, 5}, { cat1, Pcat1, 1, 5},
{ cat2, Pcat2, bcc2, 2, 7}, { cat2, Pcat2, 2, 7},
{ cat3, Pcat3, bcc3, 3, 11}, { cat3, Pcat3, 3, 11},
{ cat4, Pcat4, bcc4, 4, 19}, { cat4, Pcat4, 4, 19},
{ cat5, Pcat5, bcc5, 5, 35}, { cat5, Pcat5, 5, 35},
{ cat6, Pcat6, bcc6, 11, 67}, { cat6, Pcat6, 11, 67},
{ 0, 0, 0, 0, 0} { 0, 0, 0, 0}
}; };
#include "defaultcoefcounts.h" #include "defaultcoefcounts.h"


@@ -24,10 +24,10 @@
#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ #define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ #define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ #define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
#define DCT_VAL_CATEGORY3 7 /* 11-26 Extra Bits 4+1 */ #define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */
#define DCT_VAL_CATEGORY4 8 /* 11-26 Extra Bits 5+1 */ #define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */
#define DCT_VAL_CATEGORY5 9 /* 27-58 Extra Bits 5+1 */ #define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */
#define DCT_VAL_CATEGORY6 10 /* 59+ Extra Bits 11+1 */ #define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */
#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ #define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
#define vp8_coef_tokens 12 #define vp8_coef_tokens 12
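The corrected comments and the slimmed-down vp8_extra_bits table encode each large coefficient as a category token (base_val) plus Len extra magnitude bits and a sign bit. A minimal worked example, assuming nothing beyond the base/Len pairs shown above (coefficients 0-4 use the literal tokens and never reach this table):

#include <stdio.h>

int main(void)
{
    const int base[] = { 5, 7, 11, 19, 35, 67 };   /* DCT_VAL_CATEGORY1..6 base values */
    const int len[]  = { 1, 2,  3,  4,  5, 11 };   /* extra magnitude bits per category */
    int v = 25, c;

    for (c = 5; c >= 0; c--)
        if (v >= base[c])
            break;

    /* v = 25 falls in category 4 (range 19..34): residual 6 coded in 4
     * extra bits, followed by one sign bit. */
    printf("category %d, residual %d in %d bits + sign\n",
           c + 1, v - base[c], len[c]);
    return 0;
}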
@@ -42,7 +42,6 @@ typedef struct
{ {
vp8_tree_p tree; vp8_tree_p tree;
const vp8_prob *prob; const vp8_prob *prob;
vp8bc_index_t *prob_bc;
int Len; int Len;
int base_val; int base_val;
} vp8_extra_bit_struct; } vp8_extra_bit_struct;
@@ -95,6 +94,7 @@ struct VP8Common;
void vp8_default_coef_probs(struct VP8Common *); void vp8_default_coef_probs(struct VP8Common *);
extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]); extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
extern short vp8_default_zig_zag_mask[16]; extern short vp8_default_zig_zag_mask[16];
extern const int vp8_mb_feature_data_bits[MB_LVL_MAX]; extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];


@@ -10,13 +10,10 @@
#include <stdlib.h> #include <stdlib.h>
#include "filter.h"
#include "vpx_ports/mem.h"
#define BLOCK_HEIGHT_WIDTH 4 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT 7
static const int bilinear_filters[8][2] =
{ {
{ 128, 0 }, { 128, 0 },
{ 112, 16 }, { 112, 16 },
@@ -28,8 +25,7 @@ static const int bilinear_filters[8][2] =
{ 16, 112 } { 16, 112 }
}; };
DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
static const short sub_pel_filters[8][6] =
{ {
{ 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */ { 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
@@ -40,9 +36,6 @@ static const short sub_pel_filters[8][6] =
{ 0, -6, 50, 93, -9, 0 }, { 0, -6, 50, 93, -9, 0 },
{ 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */ { 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
{ 0, -1, 12, 123, -6, 0 }, { 0, -1, 12, 123, -6, 0 },
}; };
void vp8_filter_block2d_first_pass void vp8_filter_block2d_first_pass
@@ -146,7 +139,7 @@ void vp8_filter_block2d
const short *VFilter const short *VFilter
) )
{ {
int FData[9*4]; /* Temp data bufffer used in filtering */ int FData[9*4]; /* Temp data buffer used in filtering */
/* First filter 1-D horizontally... */ /* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter); vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
@@ -195,8 +188,8 @@ void vp8_sixtap_predict_c
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
HFilter = sub_pel_filters[xoffset]; /* 6 tap */ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter); vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
} }
@@ -212,10 +205,10 @@ void vp8_sixtap_predict8x8_c
{ {
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
int FData[13*16]; /* Temp data bufffer used in filtering */ int FData[13*16]; /* Temp data buffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* First filter 1-D horizontally... */ /* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter); vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
@@ -238,10 +231,10 @@ void vp8_sixtap_predict8x4_c
{ {
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
int FData[13*16]; /* Temp data bufffer used in filtering */ int FData[13*16]; /* Temp data buffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* First filter 1-D horizontally... */ /* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter); vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
@@ -264,11 +257,11 @@ void vp8_sixtap_predict16x16_c
{ {
const short *HFilter; const short *HFilter;
const short *VFilter; const short *VFilter;
int FData[21*24]; /* Temp data bufffer used in filtering */ int FData[21*24]; /* Temp data buffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* First filter 1-D horizontally... */ /* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter); vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
@@ -284,56 +277,49 @@ void vp8_sixtap_predict16x16_c
* ROUTINE : filter_block2d_bil_first_pass * ROUTINE : filter_block2d_bil_first_pass
* *
* INPUTS : UINT8 *src_ptr : Pointer to source block. * INPUTS : UINT8 *src_ptr : Pointer to source block.
* UINT32 src_pixels_per_line : Stride of input block. * UINT32 src_stride : Stride of source block.
* UINT32 pixel_step : Offset between filter input samples (see notes). * UINT32 height : Block height.
* UINT32 output_height : Input block height. * UINT32 width : Block width.
* UINT32 output_width : Input block width.
* INT32 *vp8_filter : Array of 2 bi-linear filter taps. * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
* *
* OUTPUTS : INT32 *output_ptr : Pointer to filtered block. * OUTPUTS : INT32 *dst_ptr : Pointer to filtered block.
* *
* RETURNS : void * RETURNS : void
* *
* FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
* either horizontal or vertical direction to produce the * in the horizontal direction to produce the filtered output
* filtered output block. Used to implement first-pass * block. Used to implement first-pass of 2-D separable filter.
* of 2-D separable filter.
* *
* SPECIAL NOTES : Produces INT32 output to retain precision for next pass. * SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
* Two filter taps should sum to VP8_FILTER_WEIGHT. * Two filter taps should sum to VP8_FILTER_WEIGHT.
* pixel_step defines whether the filter is applied
* horizontally (pixel_step=1) or vertically (pixel_step=stride).
* It defines the offset required to move from one input
* to the next.
* *
****************************************************************************/ ****************************************************************************/
void vp8_filter_block2d_bil_first_pass void vp8_filter_block2d_bil_first_pass
( (
unsigned char *src_ptr, unsigned char *src_ptr,
unsigned short *output_ptr, unsigned short *dst_ptr,
unsigned int src_pixels_per_line, unsigned int src_stride,
int pixel_step, unsigned int height,
unsigned int output_height, unsigned int width,
unsigned int output_width, const short *vp8_filter
const int *vp8_filter
) )
{ {
unsigned int i, j; unsigned int i, j;
for (i = 0; i < output_height; i++) for (i = 0; i < height; i++)
{ {
for (j = 0; j < output_width; j++) for (j = 0; j < width; j++)
{ {
/* Apply bilinear filter */ /* Apply bilinear filter */
output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) + dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[pixel_step] * vp8_filter[1]) + ((int)src_ptr[1] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT; (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
src_ptr++; src_ptr++;
} }
/* Next row... */ /* Next row... */
src_ptr += src_pixels_per_line - output_width; src_ptr += src_stride - width;
output_ptr += output_width; dst_ptr += width;
} }
} }
@@ -342,59 +328,50 @@ void vp8_filter_block2d_bil_first_pass
* ROUTINE : filter_block2d_bil_second_pass * ROUTINE : filter_block2d_bil_second_pass
* *
* INPUTS : INT32 *src_ptr : Pointer to source block. * INPUTS : INT32 *src_ptr : Pointer to source block.
* UINT32 src_pixels_per_line : Stride of input block. * UINT32 dst_pitch : Destination block pitch.
* UINT32 pixel_step : Offset between filter input samples (see notes). * UINT32 height : Block height.
* UINT32 output_height : Input block height. * UINT32 width : Block width.
* UINT32 output_width : Input block width.
* INT32 *vp8_filter : Array of 2 bi-linear filter taps. * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
* *
* OUTPUTS : UINT16 *output_ptr : Pointer to filtered block. * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block.
* *
* RETURNS : void * RETURNS : void
* *
* FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
* either horizontal or vertical direction to produce the * in the vertical direction to produce the filtered output
* filtered output block. Used to implement second-pass * block. Used to implement second-pass of 2-D separable filter.
* of 2-D separable filter.
* *
* SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
* Two filter taps should sum to VP8_FILTER_WEIGHT. * Two filter taps should sum to VP8_FILTER_WEIGHT.
* pixel_step defines whether the filter is applied
* horizontally (pixel_step=1) or vertically (pixel_step=stride).
* It defines the offset required to move from one input
* to the next.
* *
****************************************************************************/ ****************************************************************************/
void vp8_filter_block2d_bil_second_pass void vp8_filter_block2d_bil_second_pass
( (
unsigned short *src_ptr, unsigned short *src_ptr,
unsigned char *output_ptr, unsigned char *dst_ptr,
int output_pitch, int dst_pitch,
unsigned int src_pixels_per_line, unsigned int height,
unsigned int pixel_step, unsigned int width,
unsigned int output_height, const short *vp8_filter
unsigned int output_width,
const int *vp8_filter
) )
{ {
unsigned int i, j; unsigned int i, j;
int Temp; int Temp;
for (i = 0; i < output_height; i++) for (i = 0; i < height; i++)
{ {
for (j = 0; j < output_width; j++) for (j = 0; j < width; j++)
{ {
/* Apply filter */ /* Apply filter */
Temp = ((int)src_ptr[0] * vp8_filter[0]) + Temp = ((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[pixel_step] * vp8_filter[1]) + ((int)src_ptr[width] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2); (VP8_FILTER_WEIGHT / 2);
output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
src_ptr++; src_ptr++;
} }
/* Next row... */ /* Next row... */
src_ptr += src_pixels_per_line - output_width; dst_ptr += dst_pitch;
output_ptr += output_pitch;
} }
} }
@@ -404,11 +381,14 @@ void vp8_filter_block2d_bil_second_pass
* ROUTINE : filter_block2d_bil * ROUTINE : filter_block2d_bil
* *
* INPUTS : UINT8 *src_ptr : Pointer to source block. * INPUTS : UINT8 *src_ptr : Pointer to source block.
* UINT32 src_pixels_per_line : Stride of input block. * UINT32 src_pitch : Stride of source block.
* UINT32 dst_pitch : Stride of destination block.
* INT32 *HFilter : Array of 2 horizontal filter taps. * INT32 *HFilter : Array of 2 horizontal filter taps.
* INT32 *VFilter : Array of 2 vertical filter taps. * INT32 *VFilter : Array of 2 vertical filter taps.
* INT32 Width : Block width
* INT32 Height : Block height
* *
* OUTPUTS : UINT16 *output_ptr : Pointer to filtered block. * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block.
* *
* RETURNS : void * RETURNS : void
* *
@@ -422,23 +402,23 @@ void vp8_filter_block2d_bil_second_pass
void vp8_filter_block2d_bil void vp8_filter_block2d_bil
( (
unsigned char *src_ptr, unsigned char *src_ptr,
unsigned char *output_ptr, unsigned char *dst_ptr,
unsigned int src_pixels_per_line, unsigned int src_pitch,
unsigned int dst_pitch, unsigned int dst_pitch,
const int *HFilter, const short *HFilter,
const int *VFilter, const short *VFilter,
int Width, int Width,
int Height int Height
) )
{ {
unsigned short FData[17*16]; /* Temp data bufffer used in filtering */ unsigned short FData[17*16]; /* Temp data buffer used in filtering */
/* First filter 1-D horizontally... */ /* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter); vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
/* then 1-D vertically... */ /* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter); vp8_filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
} }
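Both bilinear passes are rounded weighted averages whose two taps sum to VP8_FILTER_WEIGHT (128) and are normalised by VP8_FILTER_SHIFT (7). A minimal sketch of one first-pass output value, using a {96, 32} tap pair that the table is assumed to hold for a 2/8-pel offset:

#include <stdio.h>

int main(void)
{
    /* Sketch of one first-pass (horizontal) bilinear output; pixel values
     * are arbitrary. */
    unsigned char a = 100, b = 180;
    short taps[2] = { 96, 32 };             /* taps sum to VP8_FILTER_WEIGHT */
    int out = (a * taps[0] + b * taps[1] + (128 / 2)) >> 7;  /* VP8_FILTER_SHIFT */

    printf("%d\n", out);                    /* prints 120 */
    return 0;
}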
@@ -452,11 +432,11 @@ void vp8_bilinear_predict4x4_c
int dst_pitch int dst_pitch
) )
{ {
const int *HFilter; const short *HFilter;
const int *VFilter; const short *VFilter;
HFilter = bilinear_filters[xoffset]; HFilter = vp8_bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset]; VFilter = vp8_bilinear_filters[yoffset];
#if 0 #if 0
{ {
int i; int i;
@@ -490,11 +470,11 @@ void vp8_bilinear_predict8x8_c
int dst_pitch int dst_pitch
) )
{ {
const int *HFilter; const short *HFilter;
const int *VFilter; const short *VFilter;
HFilter = bilinear_filters[xoffset]; HFilter = vp8_bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset]; VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
@@ -510,11 +490,11 @@ void vp8_bilinear_predict8x4_c
int dst_pitch int dst_pitch
) )
{ {
const int *HFilter; const short *HFilter;
const int *VFilter; const short *VFilter;
HFilter = bilinear_filters[xoffset]; HFilter = vp8_bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset]; VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
@@ -530,11 +510,11 @@ void vp8_bilinear_predict16x16_c
int dst_pitch int dst_pitch
) )
{ {
const int *HFilter; const short *HFilter;
const int *VFilter; const short *VFilter;
HFilter = bilinear_filters[xoffset]; HFilter = vp8_bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset]; VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
} }

vp8/common/filter.h

@@ -0,0 +1,22 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef FILTER_H
#define FILTER_H
#define BLOCK_HEIGHT_WIDTH 4
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT 7
extern const short vp8_bilinear_filters[8][2];
extern const short vp8_sub_pel_filters[8][6];
#endif //FILTER_H


@@ -11,47 +11,9 @@
#include "findnearmv.h" #include "findnearmv.h"
#define FINDNEAR_SEARCH_SITES 3
/* Predict motion vectors using those from already-decoded nearby blocks. /* Predict motion vectors using those from already-decoded nearby blocks.
Note that we only consider one 4x4 subblock from each candidate 16x16 Note that we only consider one 4x4 subblock from each candidate 16x16
macroblock. */ macroblock. */
typedef union
{
unsigned int as_int;
MV as_mv;
} int_mv; /* facilitates rapid equality tests */
static void mv_bias(const MODE_INFO *x, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
{
MV xmv;
xmv = x->mbmi.mv.as_mv;
if (ref_frame_sign_bias[x->mbmi.ref_frame] != ref_frame_sign_bias[refframe])
{
xmv.row *= -1;
xmv.col *= -1;
}
mvp->as_mv = xmv;
}
void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
{
if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
}
void vp8_find_near_mvs void vp8_find_near_mvs
( (
MACROBLOCKD *xd, MACROBLOCKD *xd,
@@ -82,7 +44,7 @@ void vp8_find_near_mvs
if (above->mbmi.mv.as_int) if (above->mbmi.mv.as_int)
{ {
(++mv)->as_int = above->mbmi.mv.as_int; (++mv)->as_int = above->mbmi.mv.as_int;
mv_bias(above, refframe, mv, ref_frame_sign_bias); mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
++cntx; ++cntx;
} }
@@ -97,7 +59,7 @@ void vp8_find_near_mvs
int_mv this_mv; int_mv this_mv;
this_mv.as_int = left->mbmi.mv.as_int; this_mv.as_int = left->mbmi.mv.as_int;
mv_bias(left, refframe, &this_mv, ref_frame_sign_bias); mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
if (this_mv.as_int != mv->as_int) if (this_mv.as_int != mv->as_int)
{ {
@@ -119,7 +81,7 @@ void vp8_find_near_mvs
int_mv this_mv; int_mv this_mv;
this_mv.as_int = aboveleft->mbmi.mv.as_int; this_mv.as_int = aboveleft->mbmi.mv.as_int;
mv_bias(aboveleft, refframe, &this_mv, ref_frame_sign_bias); mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
if (this_mv.as_int != mv->as_int) if (this_mv.as_int != mv->as_int)
{ {


@@ -17,6 +17,41 @@
#include "modecont.h" #include "modecont.h"
#include "treecoder.h" #include "treecoder.h"
typedef union
{
unsigned int as_int;
MV as_mv;
} int_mv; /* facilitates rapid equality tests */
static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
{
MV xmv;
xmv = mvp->as_mv;
if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
{
xmv.row *= -1;
xmv.col *= -1;
}
mvp->as_mv = xmv;
}
#define LEFT_TOP_MARGIN (16 << 3)
#define RIGHT_BOTTOM_MARGIN (16 << 3)
static void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
{
if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
}
void vp8_find_near_mvs void vp8_find_near_mvs
( (
MACROBLOCKD *xd, MACROBLOCKD *xd,
@@ -35,8 +70,4 @@ const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b);
const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride); const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride);
#define LEFT_TOP_MARGIN (16 << 3)
#define RIGHT_BOTTOM_MARGIN (16 << 3)
#endif #endif
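The clamping margins moved into findnearmv.h are expressed in 1/8-pel units, so LEFT_TOP_MARGIN and RIGHT_BOTTOM_MARGIN (16 << 3 = 128) let a vector point at most 16 whole pixels beyond the frame edge. A small sketch with hypothetical edge values:

#include <stdio.h>

int main(void)
{
    /* Sketch only: edge and MV values are hypothetical, in 1/8-pel units. */
    int mb_to_left_edge = -640;                 /* MB starts 80 px from the left */
    int col = -800;                             /* candidate MV column           */

    if (col < mb_to_left_edge - (16 << 3))      /* -800 < -640 - 128 = -768      */
        col = mb_to_left_edge - (16 << 3);      /* clamped to -768 (-96 px)      */

    printf("%d\n", col);                        /* prints -768 */
    return 0;
}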


@@ -69,7 +69,9 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->postproc.across = vp8_mbpost_proc_across_ip_c; rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
rtcd->postproc.downacross = vp8_post_proc_down_and_across_c; rtcd->postproc.downacross = vp8_post_proc_down_and_across_c;
rtcd->postproc.addnoise = vp8_plane_add_noise_c; rtcd->postproc.addnoise = vp8_plane_add_noise_c;
rtcd->postproc.blend_mb = vp8_blend_mb_c; rtcd->postproc.blend_mb_inner = vp8_blend_mb_inner_c;
rtcd->postproc.blend_mb_outer = vp8_blend_mb_outer_c;
rtcd->postproc.blend_b = vp8_blend_b_c;
#endif #endif
#endif #endif


@@ -28,13 +28,13 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{ {
(void) simpler_lpf; (void) simpler_lpf;
vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
if (v_ptr) if (v_ptr)
vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
} }
void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -44,7 +44,7 @@ void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
(void) v_ptr; (void) v_ptr;
(void) uv_stride; (void) uv_stride;
(void) simpler_lpf; (void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
} }
/* Vertical MB Filtering */ /* Vertical MB Filtering */
@@ -52,13 +52,13 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{ {
(void) simpler_lpf; (void) simpler_lpf;
vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
if (v_ptr) if (v_ptr)
vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1); vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
} }
void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -68,7 +68,7 @@ void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
(void) v_ptr; (void) v_ptr;
(void) uv_stride; (void) uv_stride;
(void) simpler_lpf; (void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2); vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
} }
/* Horizontal B Filtering */ /* Horizontal B Filtering */
@@ -81,10 +81,10 @@ void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
if (v_ptr) if (v_ptr)
vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
} }
void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -109,10 +109,10 @@ void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr) if (u_ptr)
vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
if (v_ptr) if (v_ptr)
vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1); vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
} }
void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -137,8 +137,6 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
int block_inside_limit = 0; int block_inside_limit = 0;
int HEVThresh; int HEVThresh;
const int yhedge_boost = 2;
const int uvhedge_boost = 2;
/* For each possible value for the loop filter fill out a "loop_filter_info" entry. */ /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++) for (i = 0; i <= MAX_LOOP_FILTER; i++)
@@ -182,15 +180,9 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
for (j = 0; j < 16; j++) for (j = 0; j < 16; j++)
{ {
lfi[i].lim[j] = block_inside_limit; lfi[i].lim[j] = block_inside_limit;
lfi[i].mbflim[j] = filt_lvl + yhedge_boost; lfi[i].mbflim[j] = filt_lvl + 2;
lfi[i].mbthr[j] = HEVThresh;
lfi[i].flim[j] = filt_lvl; lfi[i].flim[j] = filt_lvl;
lfi[i].thr[j] = HEVThresh; lfi[i].thr[j] = HEVThresh;
lfi[i].uvlim[j] = block_inside_limit;
lfi[i].uvmbflim[j] = filt_lvl + uvhedge_boost;
lfi[i].uvmbthr[j] = HEVThresh;
lfi[i].uvflim[j] = filt_lvl;
lfi[i].uvthr[j] = HEVThresh;
} }
} }
@@ -249,57 +241,52 @@ void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
for (j = 0; j < 16; j++) for (j = 0; j < 16; j++)
{ {
/*lfi[i].lim[j] = block_inside_limit; /*lfi[i].lim[j] = block_inside_limit;
lfi[i].mbflim[j] = filt_lvl+yhedge_boost;*/ lfi[i].mbflim[j] = filt_lvl+2;*/
lfi[i].mbthr[j] = HEVThresh;
/*lfi[i].flim[j] = filt_lvl;*/ /*lfi[i].flim[j] = filt_lvl;*/
lfi[i].thr[j] = HEVThresh; lfi[i].thr[j] = HEVThresh;
/*lfi[i].uvlim[j] = block_inside_limit;
lfi[i].uvmbflim[j] = filt_lvl+uvhedge_boost;*/
lfi[i].uvmbthr[j] = HEVThresh;
/*lfi[i].uvflim[j] = filt_lvl;*/
lfi[i].uvthr[j] = HEVThresh;
} }
} }
} }
void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level) int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level)
{ {
MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi; MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi;
if (mbd->mode_ref_lf_delta_enabled) if (mbd->mode_ref_lf_delta_enabled)
{ {
/* Apply delta for reference frame */ /* Apply delta for reference frame */
*filter_level += mbd->ref_lf_deltas[mbmi->ref_frame]; filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
/* Apply delta for mode */ /* Apply delta for mode */
if (mbmi->ref_frame == INTRA_FRAME) if (mbmi->ref_frame == INTRA_FRAME)
{ {
/* Only the split mode BPRED has a further special case */ /* Only the split mode BPRED has a further special case */
if (mbmi->mode == B_PRED) if (mbmi->mode == B_PRED)
*filter_level += mbd->mode_lf_deltas[0]; filter_level += mbd->mode_lf_deltas[0];
} }
else else
{ {
/* Zero motion mode */ /* Zero motion mode */
if (mbmi->mode == ZEROMV) if (mbmi->mode == ZEROMV)
*filter_level += mbd->mode_lf_deltas[1]; filter_level += mbd->mode_lf_deltas[1];
/* Split MB motion mode */ /* Split MB motion mode */
else if (mbmi->mode == SPLITMV) else if (mbmi->mode == SPLITMV)
*filter_level += mbd->mode_lf_deltas[3]; filter_level += mbd->mode_lf_deltas[3];
/* All other inter motion modes (Nearest, Near, New) */ /* All other inter motion modes (Nearest, Near, New) */
else else
*filter_level += mbd->mode_lf_deltas[2]; filter_level += mbd->mode_lf_deltas[2];
} }
/* Range check */ /* Range check */
if (*filter_level > MAX_LOOP_FILTER) if (filter_level > MAX_LOOP_FILTER)
*filter_level = MAX_LOOP_FILTER; filter_level = MAX_LOOP_FILTER;
else if (*filter_level < 0) else if (filter_level < 0)
*filter_level = 0; filter_level = 0;
} }
return filter_level;
} }
@@ -373,7 +360,7 @@ void vp8_loop_filter_frame
* These specified to 8th pel as they are always compared to values that are in 1/8th pel units * These specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment * Apply any context driven MB level adjustment
*/ */
vp8_adjust_mb_lf_value(mbd, &filter_level); filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);
if (filter_level) if (filter_level)
{ {
@@ -473,7 +460,7 @@ void vp8_loop_filter_frame_yonly
filter_level = baseline_filter_level[Segment]; filter_level = baseline_filter_level[Segment];
/* Apply any context driven MB level adjustment */ /* Apply any context driven MB level adjustment */
vp8_adjust_mb_lf_value(mbd, &filter_level); filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);
if (filter_level) if (filter_level)
{ {


@@ -32,12 +32,6 @@ typedef struct
DECLARE_ALIGNED(16, signed char, flim[16]); DECLARE_ALIGNED(16, signed char, flim[16]);
DECLARE_ALIGNED(16, signed char, thr[16]); DECLARE_ALIGNED(16, signed char, thr[16]);
DECLARE_ALIGNED(16, signed char, mbflim[16]); DECLARE_ALIGNED(16, signed char, mbflim[16]);
DECLARE_ALIGNED(16, signed char, mbthr[16]);
DECLARE_ALIGNED(16, signed char, uvlim[16]);
DECLARE_ALIGNED(16, signed char, uvflim[16]);
DECLARE_ALIGNED(16, signed char, uvthr[16]);
DECLARE_ALIGNED(16, signed char, uvmbflim[16]);
DECLARE_ALIGNED(16, signed char, uvmbthr[16]);
} loop_filter_info; } loop_filter_info;


@@ -18,6 +18,7 @@ extern "C"
#endif #endif
#include "vpx/internal/vpx_codec_internal.h" #include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8cx.h"
#include "vpx_scale/yv12config.h" #include "vpx_scale/yv12config.h"
#include "type_aliases.h" #include "type_aliases.h"
#include "ppflags.h" #include "ppflags.h"
@@ -45,7 +46,8 @@ extern "C"
typedef enum typedef enum
{ {
USAGE_STREAM_FROM_SERVER = 0x0, USAGE_STREAM_FROM_SERVER = 0x0,
USAGE_LOCAL_FILE_PLAYBACK = 0x1 USAGE_LOCAL_FILE_PLAYBACK = 0x1,
USAGE_CONSTRAINED_QUALITY = 0x2
} END_USAGE; } END_USAGE;
@@ -149,6 +151,7 @@ extern "C"
int fixed_q; int fixed_q;
int worst_allowed_q; int worst_allowed_q;
int best_allowed_q; int best_allowed_q;
int cq_level;
// allow internal resizing ( currently disabled in the build !!!!!) // allow internal resizing ( currently disabled in the build !!!!!)
int allow_spatial_resampling; int allow_spatial_resampling;
@@ -186,9 +189,10 @@ extern "C"
int arnr_strength ; int arnr_strength ;
int arnr_type ; int arnr_type ;
struct vpx_fixed_buf two_pass_stats_in; struct vpx_fixed_buf two_pass_stats_in;
struct vpx_codec_pkt_list *output_pkt_list; struct vpx_codec_pkt_list *output_pkt_list;
vp8e_tuning tuning;
} VP8_CONFIG; } VP8_CONFIG;
@@ -204,7 +208,7 @@ extern "C"
// and not just a copy of the pointer.. // and not just a copy of the pointer..
int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp); int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp);
int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush); int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush);
int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags); int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags); int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);
int vp8_update_reference(VP8_PTR comp, int ref_frame_flags); int vp8_update_reference(VP8_PTR comp, int ref_frame_flags);


@@ -105,7 +105,7 @@ typedef struct VP8Common
YV12_BUFFER_CONFIG post_proc_buffer; YV12_BUFFER_CONFIG post_proc_buffer;
YV12_BUFFER_CONFIG temp_scale_frame; YV12_BUFFER_CONFIG temp_scale_frame;
FRAME_TYPE last_frame_type; /* Add to check if vp8_frame_init_loop_filter() can be skipped. */ FRAME_TYPE last_frame_type; /* Save last frame's frame type for loopfilter init checking and motion search. */
FRAME_TYPE frame_type; FRAME_TYPE frame_type;
int show_frame; int show_frame;
@@ -200,7 +200,7 @@ typedef struct VP8Common
} VP8_COMMON; } VP8_COMMON;
void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level); int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level);
void vp8_init_loop_filter(VP8_COMMON *cm); void vp8_init_loop_filter(VP8_COMMON *cm);
void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type); void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type);
extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val); extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val);


@@ -51,7 +51,7 @@ extern "C"
int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst); int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);
int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp); int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp);
int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level, int noise_level, int flags); int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags);
int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);


@@ -26,7 +26,7 @@
( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128) ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
/* global constants */ /* global constants */
#if CONFIG_POSTPROC_VISUALIZER
static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
{ {
{ RGB_TO_YUV(0x98FB98) }, /* PaleGreen */ { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
@@ -41,13 +41,32 @@ static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
{ RGB_TO_YUV(0xFF0000) } /* Red */ { RGB_TO_YUV(0xFF0000) } /* Red */
}; };
static const unsigned char MV_REFERENCE_FRAME_colors[MB_MODE_COUNT][3] = static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] =
{
{ RGB_TO_YUV(0x6633ff) }, /* Purple */
{ RGB_TO_YUV(0xcc33ff) }, /* Magenta */
{ RGB_TO_YUV(0xff33cc) }, /* Pink */
{ RGB_TO_YUV(0xff3366) }, /* Coral */
{ RGB_TO_YUV(0x3366ff) }, /* Blue */
{ RGB_TO_YUV(0xed00f5) }, /* Dark Blue */
{ RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */
{ RGB_TO_YUV(0xff6633) }, /* Orange */
{ RGB_TO_YUV(0x33ccff) }, /* Light Blue */
{ RGB_TO_YUV(0x8ab800) }, /* Green */
{ RGB_TO_YUV(0xffcc33) }, /* Light Orange */
{ RGB_TO_YUV(0x33ffcc) }, /* Aqua */
{ RGB_TO_YUV(0x66ff33) }, /* Light Green */
{ RGB_TO_YUV(0xccff33) }, /* Yellow */
};
static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
{ {
{ RGB_TO_YUV(0x00ff00) }, /* Blue */ { RGB_TO_YUV(0x00ff00) }, /* Blue */
{ RGB_TO_YUV(0x0000ff) }, /* Green */ { RGB_TO_YUV(0x0000ff) }, /* Green */
{ RGB_TO_YUV(0xffff00) }, /* Yellow */ { RGB_TO_YUV(0xffff00) }, /* Yellow */
{ RGB_TO_YUV(0xff0000) }, /* Red */ { RGB_TO_YUV(0xff0000) }, /* Red */
}; };
#endif
static const short kernel5[] = static const short kernel5[] =
{ {
@@ -476,7 +495,7 @@ void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
* edges unblended to give distinction to macro blocks in areas * edges unblended to give distinction to macro blocks in areas
* filled with the same color block. * filled with the same color block.
*/ */
void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v, void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
int y1, int u1, int v1, int alpha, int stride) int y1, int u1, int v1, int alpha, int stride)
{ {
int i, j; int i, j;
@@ -484,10 +503,10 @@ void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v,
int u1_const = u1*((1<<16)-alpha); int u1_const = u1*((1<<16)-alpha);
int v1_const = v1*((1<<16)-alpha); int v1_const = v1*((1<<16)-alpha);
y += stride + 2; y += 2*stride + 2;
for (i = 0; i < 14; i++) for (i = 0; i < 12; i++)
{ {
for (j = 0; j < 14; j++) for (j = 0; j < 12; j++)
{ {
y[j] = (y[j]*alpha + y1_const)>>16; y[j] = (y[j]*alpha + y1_const)>>16;
} }
@@ -511,6 +530,104 @@ void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v,
} }
} }
/* Blend only the edge of the macro block. Leave center
* unblended to allow for other visualizations to be layered.
*/
void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v,
int y1, int u1, int v1, int alpha, int stride)
{
int i, j;
int y1_const = y1*((1<<16)-alpha);
int u1_const = u1*((1<<16)-alpha);
int v1_const = v1*((1<<16)-alpha);
for (i = 0; i < 2; i++)
{
for (j = 0; j < 16; j++)
{
y[j] = (y[j]*alpha + y1_const)>>16;
}
y += stride;
}
for (i = 0; i < 12; i++)
{
y[0] = (y[0]*alpha + y1_const)>>16;
y[1] = (y[1]*alpha + y1_const)>>16;
y[14] = (y[14]*alpha + y1_const)>>16;
y[15] = (y[15]*alpha + y1_const)>>16;
y += stride;
}
for (i = 0; i < 2; i++)
{
for (j = 0; j < 16; j++)
{
y[j] = (y[j]*alpha + y1_const)>>16;
}
y += stride;
}
stride >>= 1;
for (j = 0; j < 8; j++)
{
u[j] = (u[j]*alpha + u1_const)>>16;
v[j] = (v[j]*alpha + v1_const)>>16;
}
u += stride;
v += stride;
for (i = 0; i < 6; i++)
{
u[0] = (u[0]*alpha + u1_const)>>16;
v[0] = (v[0]*alpha + v1_const)>>16;
u[7] = (u[7]*alpha + u1_const)>>16;
v[7] = (v[7]*alpha + v1_const)>>16;
u += stride;
v += stride;
}
for (j = 0; j < 8; j++)
{
u[j] = (u[j]*alpha + u1_const)>>16;
v[j] = (v[j]*alpha + v1_const)>>16;
}
}
void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
int y1, int u1, int v1, int alpha, int stride)
{
int i, j;
int y1_const = y1*((1<<16)-alpha);
int u1_const = u1*((1<<16)-alpha);
int v1_const = v1*((1<<16)-alpha);
for (i = 0; i < 4; i++)
{
for (j = 0; j < 4; j++)
{
y[j] = (y[j]*alpha + y1_const)>>16;
}
y += stride;
}
stride >>= 1;
for (i = 0; i < 2; i++)
{
for (j = 0; j < 2; j++)
{
u[j] = (u[j]*alpha + u1_const)>>16;
v[j] = (v[j]*alpha + v1_const)>>16;
}
u += stride;
v += stride;
}
}
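The blend helpers above work in 16.16 fixed point: the overlay colour is pre-multiplied by (1 - alpha) into the *_const terms, so each blended sample costs one multiply, one add and a shift. A short worked example with arbitrary inputs:

#include <stdio.h>

int main(void)
{
    /* Worked example of the 16.16 blend used above; inputs are arbitrary. */
    int alpha = 52428;                          /* ~0.8 in 16.16 fixed point */
    int y1 = 50, y = 200;
    int y1_const = y1 * ((1 << 16) - alpha);    /* overlay * (1 - alpha)     */

    y = (y * alpha + y1_const) >> 16;           /* 0.8*200 + 0.2*50 = 170    */
    printf("%d\n", y);                          /* prints 170 */
    return 0;
}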
static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int height) static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int height)
{ {
int dx; int dx;
@@ -522,7 +639,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
dy = *y1 - y0; dy = *y1 - y0;
*x1 = width; *x1 = width;
if (dy) if (dx)
*y1 = ((width-x0)*dy)/dx + y0; *y1 = ((width-x0)*dy)/dx + y0;
} }
if (*x1 < 0) if (*x1 < 0)
@@ -531,7 +648,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
dy = *y1 - y0; dy = *y1 - y0;
*x1 = 0; *x1 = 0;
if (dy) if (dx)
*y1 = ((0-x0)*dy)/dx + y0; *y1 = ((0-x0)*dy)/dx + y0;
} }
if (*y1 > height) if (*y1 > height)
@@ -540,7 +657,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
dy = *y1 - y0; dy = *y1 - y0;
*y1 = height; *y1 = height;
if (dx) if (dy)
*x1 = ((height-y0)*dx)/dy + x0; *x1 = ((height-y0)*dx)/dy + x0;
} }
if (*y1 < 0) if (*y1 < 0)
@@ -549,7 +666,7 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
dy = *y1 - y0; dy = *y1 - y0;
*y1 = 0; *y1 = 0;
if (dx) if (dy)
*x1 = ((0-y0)*dx)/dy + x0; *x1 = ((0-y0)*dx)/dy + x0;
} }
} }
@@ -561,10 +678,12 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
#define RTCD_VTABLE(oci) NULL #define RTCD_VTABLE(oci) NULL
#endif #endif
int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags) int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
{ {
char message[512];
int q = oci->filter_level * 10 / 6; int q = oci->filter_level * 10 / 6;
int flags = ppflags->post_proc_flag;
int deblock_level = ppflags->deblocking_level;
int noise_level = ppflags->noise_level;
if (!oci->frame_to_show) if (!oci->frame_to_show)
return -1; return -1;
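Callers now hand vp8_post_proc_frame a single vp8_ppflags_t instead of separate deblock, noise and flag arguments. A hedged sketch of what a call site might look like; only the type, field and flag names are taken from this change, everything else is assumed:

/* Sketch only, not a verbatim call site: oci and dest are assumed to be a
 * valid VP8_COMMON* and YV12_BUFFER_CONFIG*; names come from ppflags.h as
 * used in this change. */
vp8_ppflags_t ppflags = {0};

ppflags.post_proc_flag   = VP8D_DEBUG_TXT_FRAME_INFO | VP8D_DEBUG_DRAW_MV;
ppflags.deblocking_level = 5;       /* replaces the old deblock_level argument */
ppflags.noise_level      = 0;       /* replaces the old noise_level argument   */
ppflags.display_mv_flag  = ~0;      /* 1<<mode bits: draw MVs for every mode   */

vp8_post_proc_frame(oci, dest, &ppflags);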
@@ -621,8 +740,10 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
oci->post_proc_buffer.y_stride); oci->post_proc_buffer.y_stride);
} }
if (flags & VP8D_DEBUG_LEVEL1) #if CONFIG_POSTPROC_VISUALIZER
if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
{ {
char message[512];
sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d", sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
(oci->frame_type == KEY_FRAME), (oci->frame_type == KEY_FRAME),
oci->refresh_golden_frame, oci->refresh_golden_frame,
@@ -633,7 +754,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
} }
if (flags & VP8D_DEBUG_LEVEL2) if (flags & VP8D_DEBUG_TXT_MBLK_MODES)
{ {
int i, j; int i, j;
unsigned char *y_ptr; unsigned char *y_ptr;
@@ -665,7 +786,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
} }
} }
if (flags & VP8D_DEBUG_LEVEL3) if (flags & VP8D_DEBUG_TXT_DC_DIFF)
{ {
int i, j; int i, j;
unsigned char *y_ptr; unsigned char *y_ptr;
@@ -700,45 +821,15 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
} }
} }
if (flags & VP8D_DEBUG_LEVEL4) if (flags & VP8D_DEBUG_TXT_RATE_INFO)
{ {
char message[512];
sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate); sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
#if 0
int i, j;
unsigned char *y_ptr;
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int mb_rows = post->y_height >> 4;
int mb_cols = post->y_width >> 4;
int mb_index = 0;
MODE_INFO *mi = oci->mi;
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
/* vp8_filter each macro block */
for (i = 0; i < mb_rows; i++)
{
for (j = 0; j < mb_cols; j++)
{
char zz[4];
sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
vp8_blit_text(zz, y_ptr, post->y_stride);
mb_index ++;
y_ptr += 16;
}
mb_index ++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
}
#endif
} }
/* Draw motion vectors */ /* Draw motion vectors */
if (flags & VP8D_DEBUG_LEVEL5) if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag)
{ {
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width; int width = post->y_width;
@@ -749,29 +840,144 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
MODE_INFO *mi = oci->mi; MODE_INFO *mi = oci->mi;
int x0, y0; int x0, y0;
for (y0 = 8; y0 < (height + 8); y0 += 16) for (y0 = 0; y0 < height; y0 += 16)
{ {
for (x0 = 8; x0 < (width + 8); x0 += 16) for (x0 = 0; x0 < width; x0 += 16)
{ {
int x1, y1; int x1, y1;
if (mi->mbmi.mode >= NEARESTMV)
if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode)))
{
mi++;
continue;
}
if (mi->mbmi.mode == SPLITMV)
{
switch (mi->mbmi.partitioning)
{
case 0 : /* mv_top_bottom */
{
B_MODE_INFO *bmi = &mi->bmi[0];
MV *mv = &bmi->mv.as_mv;
x1 = x0 + 8 + (mv->col >> 3);
y1 = y0 + 4 + (mv->row >> 3);
constrain_line (x0+8, &x1, y0+4, &y1, width, height);
vp8_blit_line (x0+8, x1, y0+4, y1, y_buffer, y_stride);
bmi = &mi->bmi[8];
x1 = x0 + 8 + (mv->col >> 3);
y1 = y0 +12 + (mv->row >> 3);
constrain_line (x0+8, &x1, y0+12, &y1, width, height);
vp8_blit_line (x0+8, x1, y0+12, y1, y_buffer, y_stride);
break;
}
case 1 : /* mv_left_right */
{
B_MODE_INFO *bmi = &mi->bmi[0];
MV *mv = &bmi->mv.as_mv;
x1 = x0 + 4 + (mv->col >> 3);
y1 = y0 + 8 + (mv->row >> 3);
constrain_line (x0+4, &x1, y0+8, &y1, width, height);
vp8_blit_line (x0+4, x1, y0+8, y1, y_buffer, y_stride);
bmi = &mi->bmi[2];
x1 = x0 +12 + (mv->col >> 3);
y1 = y0 + 8 + (mv->row >> 3);
constrain_line (x0+12, &x1, y0+8, &y1, width, height);
vp8_blit_line (x0+12, x1, y0+8, y1, y_buffer, y_stride);
break;
}
case 2 : /* mv_quarters */
{
B_MODE_INFO *bmi = &mi->bmi[0];
MV *mv = &bmi->mv.as_mv;
x1 = x0 + 4 + (mv->col >> 3);
y1 = y0 + 4 + (mv->row >> 3);
constrain_line (x0+4, &x1, y0+4, &y1, width, height);
vp8_blit_line (x0+4, x1, y0+4, y1, y_buffer, y_stride);
bmi = &mi->bmi[2];
x1 = x0 +12 + (mv->col >> 3);
y1 = y0 + 4 + (mv->row >> 3);
constrain_line (x0+12, &x1, y0+4, &y1, width, height);
vp8_blit_line (x0+12, x1, y0+4, y1, y_buffer, y_stride);
bmi = &mi->bmi[8];
x1 = x0 + 4 + (mv->col >> 3);
y1 = y0 +12 + (mv->row >> 3);
constrain_line (x0+4, &x1, y0+12, &y1, width, height);
vp8_blit_line (x0+4, x1, y0+12, y1, y_buffer, y_stride);
bmi = &mi->bmi[10];
x1 = x0 +12 + (mv->col >> 3);
y1 = y0 +12 + (mv->row >> 3);
constrain_line (x0+12, &x1, y0+12, &y1, width, height);
vp8_blit_line (x0+12, x1, y0+12, y1, y_buffer, y_stride);
break;
}
default :
{
B_MODE_INFO *bmi = mi->bmi;
int bx0, by0;
for (by0 = y0; by0 < (y0+16); by0 += 4)
{
for (bx0 = x0; bx0 < (x0+16); bx0 += 4)
{
MV *mv = &bmi->mv.as_mv;
x1 = bx0 + 2 + (mv->col >> 3);
y1 = by0 + 2 + (mv->row >> 3);
constrain_line (bx0+2, &x1, by0+2, &y1, width, height);
vp8_blit_line (bx0+2, x1, by0+2, y1, y_buffer, y_stride);
bmi++;
}
}
}
}
}
+                else if (mi->mbmi.mode >= NEARESTMV)
                 {
                     MV *mv = &mi->mbmi.mv.as_mv;
+                    const int lx0 = x0 + 8;
+                    const int ly0 = y0 + 8;
-                    x1 = x0 + (mv->col >> 3);
-                    y1 = y0 + (mv->row >> 3);
+                    x1 = lx0 + (mv->col >> 3);
+                    y1 = ly0 + (mv->row >> 3);
-                    if (x1 != x0 && y1 != y0)
+                    if (x1 != lx0 && y1 != ly0)
                     {
-                        constrain_line (x0, &x1, y0-1, &y1, width, height);
-                        vp8_blit_line  (x0,  x1, y0-1,  y1, y_buffer, y_stride);
-                        constrain_line (x0, &x1, y0+1, &y1, width, height);
-                        vp8_blit_line  (x0,  x1, y0+1,  y1, y_buffer, y_stride);
+                        constrain_line (lx0, &x1, ly0-1, &y1, width, height);
+                        vp8_blit_line  (lx0,  x1, ly0-1,  y1, y_buffer, y_stride);
+                        constrain_line (lx0, &x1, ly0+1, &y1, width, height);
+                        vp8_blit_line  (lx0,  x1, ly0+1,  y1, y_buffer, y_stride);
                     }
                     else
-                        vp8_blit_line  (x0,  x1, y0,  y1, y_buffer, y_stride);
+                        vp8_blit_line  (lx0,  x1, ly0,  y1, y_buffer, y_stride);
                 }
                 mi++;
             }
             mi++;
@@ -779,9 +985,10 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
     }
     /* Color in block modes */
-    if (flags & VP8D_DEBUG_LEVEL6)
+    if ((flags & VP8D_DEBUG_CLR_BLK_MODES)
+        && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag))
     {
-        int i, j;
+        int y, x;
         YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
         int width = post->y_width;
         int height = post->y_height;
@@ -791,18 +998,54 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
         int y_stride = oci->post_proc_buffer.y_stride;
         MODE_INFO *mi = oci->mi;
-        for (i = 0; i < height; i += 16)
+        for (y = 0; y < height; y += 16)
         {
-            for (j = 0; j < width; j += 16)
+            for (x = 0; x < width; x += 16)
             {
                 int Y = 0, U = 0, V = 0;
if (mi->mbmi.mode == B_PRED &&
((ppflags->display_mb_modes_flag & B_PRED) || ppflags->display_b_modes_flag))
{
int by, bx;
unsigned char *yl, *ul, *vl;
B_MODE_INFO *bmi = mi->bmi;
yl = y_ptr + x;
ul = u_ptr + (x>>1);
vl = v_ptr + (x>>1);
for (by = 0; by < 16; by += 4)
{
for (bx = 0; bx < 16; bx += 4)
{
if ((ppflags->display_b_modes_flag & (1<<mi->mbmi.mode))
|| (ppflags->display_mb_modes_flag & B_PRED))
{
Y = B_PREDICTION_MODE_colors[bmi->mode][0];
U = B_PREDICTION_MODE_colors[bmi->mode][1];
V = B_PREDICTION_MODE_colors[bmi->mode][2];
POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
(yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
}
bmi++;
}
yl += y_stride*4;
ul += y_stride*1;
vl += y_stride*1;
}
}
else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode))
{
                     Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
                     U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
                     V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
-                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)
-                        (&y_ptr[j], &u_ptr[j>>1], &v_ptr[j>>1], Y, U, V, 0xc000, y_stride);
+                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)
+                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+                }
                 mi++;
             }
@@ -815,9 +1058,9 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
     }
     /* Color in frame reference blocks */
-    if (flags & VP8D_DEBUG_LEVEL7)
+    if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag)
     {
-        int i, j;
+        int y, x;
         YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
         int width = post->y_width;
         int height = post->y_height;
@@ -827,18 +1070,21 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
         int y_stride = oci->post_proc_buffer.y_stride;
         MODE_INFO *mi = oci->mi;
-        for (i = 0; i < height; i += 16)
+        for (y = 0; y < height; y += 16)
        {
-            for (j = 0; j < width; j +=16)
+            for (x = 0; x < width; x +=16)
            {
                 int Y = 0, U = 0, V = 0;
+                if (ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame))
+                {
                     Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
                     U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
                     V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
-                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)
-                        (&y_ptr[j], &u_ptr[j>>1], &v_ptr[j>>1], Y, U, V, 0xc000, y_stride);
+                    POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)
+                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+                }
                 mi++;
             }
@@ -849,6 +1095,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
             mi++;
         }
     }
+#endif
     *dest = oci->post_proc_buffer;
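For readers unfamiliar with the overlay arithmetic above: VP8 motion vectors are stored in 1/8-pel units, so `>> 3` converts them to whole pixels, and the `+ 8` (or `+ 4`/`+ 12` for split partitions) anchors each arrow at the centre of the 16x16 macroblock or 8x8/4x4 sub-block. The following is an illustrative sketch only, not part of the patch, restating that computation for a single macroblock with made-up variable names:

/* Sketch: arrow geometry for one 16x16 block at top-left (x0, y0).
 * mv->col = 20 in 1/8-pel units means a 2-pixel horizontal displacement. */
int cx = x0 + 8;                  /* arrow tail: macroblock centre */
int cy = y0 + 8;
int tip_x = cx + (mv->col >> 3);  /* arrow head: centre plus MV in pixels */
int tip_y = cy + (mv->row >> 3);
constrain_line(cx, &tip_x, cy, &tip_y, width, height); /* clip to the frame */
vp8_blit_line(cx, tip_x, cy, tip_y, y_buffer, y_stride);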

View File

@@ -24,7 +24,15 @@
                           char whiteclamp[16], char bothclamp[16],\
                           unsigned int w, unsigned int h, int pitch)
-#define prototype_postproc_blend_mb(sym)\
+#define prototype_postproc_blend_mb_inner(sym)\
+    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
+              int y1, int u1, int v1, int alpha, int stride)
+#define prototype_postproc_blend_mb_outer(sym)\
+    void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
+              int y1, int u1, int v1, int alpha, int stride)
+#define prototype_postproc_blend_b(sym)\
     void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
               int y1, int u1, int v1, int alpha, int stride)
@@ -52,22 +60,36 @@ extern prototype_postproc(vp8_postproc_downacross);
 #endif
 extern prototype_postproc_addnoise(vp8_postproc_addnoise);
-#ifndef vp8_postproc_blend_mb
-#define vp8_postproc_blend_mb vp8_blend_mb_c
+#ifndef vp8_postproc_blend_mb_inner
+#define vp8_postproc_blend_mb_inner vp8_blend_mb_inner_c
 #endif
-extern prototype_postproc_blend_mb(vp8_postproc_blend_mb);
+extern prototype_postproc_blend_mb_inner(vp8_postproc_blend_mb_inner);
+#ifndef vp8_postproc_blend_mb_outer
+#define vp8_postproc_blend_mb_outer vp8_blend_mb_outer_c
+#endif
+extern prototype_postproc_blend_mb_outer(vp8_postproc_blend_mb_outer);
+#ifndef vp8_postproc_blend_b
+#define vp8_postproc_blend_b vp8_blend_b_c
+#endif
+extern prototype_postproc_blend_b(vp8_postproc_blend_b);
 typedef prototype_postproc((*vp8_postproc_fn_t));
 typedef prototype_postproc_inplace((*vp8_postproc_inplace_fn_t));
 typedef prototype_postproc_addnoise((*vp8_postproc_addnoise_fn_t));
-typedef prototype_postproc_blend_mb((*vp8_postproc_blend_mb_fn_t));
+typedef prototype_postproc_blend_mb_inner((*vp8_postproc_blend_mb_inner_fn_t));
+typedef prototype_postproc_blend_mb_outer((*vp8_postproc_blend_mb_outer_fn_t));
+typedef prototype_postproc_blend_b((*vp8_postproc_blend_b_fn_t));
 typedef struct
 {
     vp8_postproc_inplace_fn_t down;
     vp8_postproc_inplace_fn_t across;
     vp8_postproc_fn_t downacross;
     vp8_postproc_addnoise_fn_t addnoise;
-    vp8_postproc_blend_mb_fn_t blend_mb;
+    vp8_postproc_blend_mb_inner_fn_t blend_mb_inner;
+    vp8_postproc_blend_mb_outer_fn_t blend_mb_outer;
+    vp8_postproc_blend_b_fn_t blend_b;
 } vp8_postproc_rtcd_vtable_t;
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -89,7 +111,7 @@ struct postproc_state
 #include "onyxc_int.h"
 #include "ppflags.h"
 int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
-                        int deblock_level, int noise_level, int flags);
+                        vp8_ppflags_t *flags);
 void vp8_de_noise(YV12_BUFFER_CONFIG *source,

View File

@@ -56,10 +56,10 @@ void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
     (void)simpler_lpf;
-    mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr);
+    mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
     if (u_ptr)
-        mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr);
+        mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
 }
 void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -77,10 +77,10 @@ void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
     (void)simpler_lpf;
-    mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr);
+    mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
     if (u_ptr)
-        mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr);
+        mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
 }
 void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -104,7 +104,7 @@ void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha
     loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
     if (u_ptr)
-        loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr);
+        loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
 }
 void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -127,7 +127,7 @@ void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha
     loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
     if (u_ptr)
-        loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr);
+        loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
 }
 void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,

View File

@@ -17,13 +17,24 @@ enum
     VP8D_DEBLOCK                = 1<<0,
     VP8D_DEMACROBLOCK           = 1<<1,
     VP8D_ADDNOISE               = 1<<2,
-    VP8D_DEBUG_LEVEL1           = 1<<3,
-    VP8D_DEBUG_LEVEL2           = 1<<4,
-    VP8D_DEBUG_LEVEL3           = 1<<5,
-    VP8D_DEBUG_LEVEL4           = 1<<6,
-    VP8D_DEBUG_LEVEL5           = 1<<7,
-    VP8D_DEBUG_LEVEL6           = 1<<8,
-    VP8D_DEBUG_LEVEL7           = 1<<9
+    VP8D_DEBUG_TXT_FRAME_INFO   = 1<<3,
+    VP8D_DEBUG_TXT_MBLK_MODES   = 1<<4,
+    VP8D_DEBUG_TXT_DC_DIFF      = 1<<5,
+    VP8D_DEBUG_TXT_RATE_INFO    = 1<<6,
+    VP8D_DEBUG_DRAW_MV          = 1<<7,
+    VP8D_DEBUG_CLR_BLK_MODES    = 1<<8,
+    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
 };
+typedef struct
+{
+    int post_proc_flag;
+    int deblocking_level;
+    int noise_level;
+    int display_ref_frame_flag;
+    int display_mb_modes_flag;
+    int display_b_modes_flag;
+    int display_mv_flag;
+} vp8_ppflags_t;
 #endif
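To make the renamed flags concrete: the numeric VP8D_DEBUG_LEVELn bits become named visualizer switches, and the loose deblock/noise arguments move into the new vp8_ppflags_t that vp8_post_proc_frame now takes. The snippet below is a hypothetical caller-side sketch, not part of the change set; it assumes the decoder was built with CONFIG_POSTPROC_VISUALIZER, that the VP8D_* bits are carried in post_proc_flag, and that the per-mode/per-reference masks are formed as (1 << mode) / (1 << ref_frame), as the postproc loops above do. `oci` and `dest_buffer` are placeholder names.

/* Sketch only: drive the postproc visualizer through the new flag struct. */
vp8_ppflags_t ppflags = {0};
ppflags.post_proc_flag = VP8D_DEBLOCK | VP8D_DEBUG_DRAW_MV
                         | VP8D_DEBUG_CLR_FRM_REF_BLKS;
ppflags.deblocking_level = 5;
ppflags.display_mv_flag = (1 << NEWMV) | (1 << SPLITMV);   /* only these modes   */
ppflags.display_ref_frame_flag = (1 << LAST_FRAME);        /* tint LAST refs only */
vp8_post_proc_frame(&oci, &dest_buffer, &ppflags);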

View File

@@ -1,46 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/****************************************************************************
*
* Module Title : preproc.h
*
* Description : simple preprocessor
*
****************************************************************************/
#ifndef __INC_PREPROC_H
#define __INC_PREPROC_H
/****************************************************************************
* Types
****************************************************************************/
typedef struct
{
unsigned char *frame_buffer;
int frame;
unsigned int *fixed_divide;
unsigned char *frame_buffer_alloc;
unsigned int *fixed_divide_alloc;
} pre_proc_instance;
/****************************************************************************
* Functions.
****************************************************************************/
void pre_proc_machine_specific_config(void);
void delete_pre_proc(pre_proc_instance *ppi);
int init_pre_proc(pre_proc_instance *ppi, int frame_size);
extern void spatial_filter_c(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int width, int height, int pitch, int strength);
extern void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
#endif

View File

@@ -1,76 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/****************************************************************************
*
* Module Title : preproc_if.h
*
* Description : Pre-processor interface header file.
*
****************************************************************************/
#ifndef __PREPROC_IF_H
#define __PREPROC_IF_H
/****************************************************************************
* Header Files
****************************************************************************/
#include "type_aliases.h"
/****************************************************************************
* Types
****************************************************************************/
typedef struct
{
UINT8 *Yuv0ptr;
UINT8 *Yuv1ptr;
UINT8 *frag_info; // blocks coded : passed in
UINT32 frag_info_element_size; // size of each element
UINT32 frag_info_coded_mask; // mask to get at whether fragment is coded
UINT32 *region_index; // Gives pixel index for top left of each block
UINT32 video_frame_height;
UINT32 video_frame_width;
UINT8 hfrag_pixels;
UINT8 vfrag_pixels;
} SCAN_CONFIG_DATA;
typedef enum
{
SCP_FILTER_ON_OFF,
SCP_SET_SRF_OFFSET,
SCP_SET_EBO_ON_OFF,
SCP_SET_VCAP_LEVEL_OFFSET,
SCP_SET_SHOW_LOCAL
} SCP_SETTINGS;
typedef struct PP_INSTANCE *x_pp_inst;
/****************************************************************************
* Module statics
****************************************************************************/
/* Controls whether Early break out is on or off in default case */
#define EARLY_BREAKOUT_DEFAULT TRUE
/****************************************************************************
* Functions
****************************************************************************/
extern void set_scan_param(x_pp_inst ppi, UINT32 param_id, INT32 param_value);
extern UINT32 yuvanalyse_frame(x_pp_inst ppi, UINT32 *KFIndicator);
extern x_pp_inst create_pp_instance(void);
extern void delete_pp_instance(x_pp_inst *);
extern BOOL scan_yuvinit(x_pp_inst, SCAN_CONFIG_DATA *scan_config_ptr);
#endif

View File

@@ -14,6 +14,8 @@
 #define VPXINFINITE 10000       /* 10second. */
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
 /* Thread management macros */
 #ifdef _WIN32
 /* Win32 */
@@ -88,4 +90,6 @@
 #define x86_pause_hint()
 #endif
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
 #endif

View File

@@ -45,13 +45,13 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
     (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
     if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }
@@ -62,7 +62,7 @@ void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign
     (void) v_ptr;
     (void) uv_stride;
     (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
@@ -71,13 +71,13 @@ void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
     (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
     if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }
@@ -88,7 +88,7 @@ void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign
     (void) v_ptr;
     (void) uv_stride;
     (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
@@ -102,10 +102,10 @@ void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
     vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
     if (v_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }
@@ -132,10 +132,10 @@ void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
     vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
     if (v_ptr)
-        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }
@@ -159,10 +159,10 @@ void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
     (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
 }
@@ -173,7 +173,7 @@ void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig
     (void) v_ptr;
     (void) uv_stride;
     (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
@@ -182,10 +182,10 @@ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
     (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
 }
@@ -196,7 +196,7 @@ void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig
     (void) v_ptr;
     (void) uv_stride;
     (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }
@@ -210,7 +210,7 @@ void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
     vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
+        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
 }
@@ -237,7 +237,7 @@ void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
     vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
+        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
 }

View File

@@ -14,7 +14,6 @@
#include "blockd.h" #include "blockd.h"
#include "pragmas.h" #include "pragmas.h"
#include "postproc.h" #include "postproc.h"
#include "dboolhuff.h"
#include "dequantize.h" #include "dequantize.h"
#include "onyxd_int.h" #include "onyxd_int.h"
@@ -35,12 +34,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6; pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6; pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6; pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
#if 0 /*For use with RTCD, when implemented*/
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
pbi->dboolhuff.debool = vp8dx_decode_bool_c;
pbi->dboolhuff.devalue = vp8dx_decode_value_c;
#endif
} }
#endif #endif
@@ -54,12 +47,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon; pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon; pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon; pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
#if 0 /*For use with RTCD, when implemented*/
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
pbi->dboolhuff.debool = vp8dx_decode_bool_c;
pbi->dboolhuff.devalue = vp8dx_decode_value_c;
#endif
} }
#endif #endif
#endif #endif

View File

@@ -1,163 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_decode_value_v6|
EXPORT |vp8dx_start_decode_v6|
EXPORT |vp8dx_stop_decode_v6|
EXPORT |vp8dx_decode_bool_v6|
ARM
REQUIRE8
PRESERVE8
INCLUDE vpx_asm_offsets.asm
br RN r0
prob RN r1
bits RN r1
AREA |.text|, CODE, READONLY ; name this block of code
; int z = 0;
; int bit;
; for ( bit=bits-1; bit>=0; bit-- )
; {
; z |= (vp8dx_decode_bool(br, 0x80)<<bit);
; }
; return z;
;int vp8_decode_value_v6 ( BOOL_DECODER *br, int bits )
|vp8_decode_value_v6| PROC
stmdb sp!, {r4 - r6, lr}
mov r4, br
mov r5, bits
mov r6, #0
subs r5, r5, #1
bmi decode_value_exit
decode_value_loop
mov prob, #0x80
mov br, r4
bl vp8dx_decode_bool_v6_internal ; needed for conversion to s file
orr r6, r6, r0, lsl r5
subs r5, r5, #1
bpl decode_value_loop
decode_value_exit
mov r0, r6
ldmia sp!, {r4 - r6, pc}
ENDP ; |vp8_decode_value_v6|
;void vp8dx_start_decode_v6 ( BOOL_DECODER *br, unsigned char *source )
|vp8dx_start_decode_v6| PROC
stmdb sp!, {r4 - r5, lr}
mov r2, #0
mov r3, #255
str r2, [br, #bool_decoder_lowvalue]
str r3, [br, #bool_decoder_range]
str r1, [br, #bool_decoder_buffer]
mov r3, #8
mov r2, #4
str r3, [br, #bool_decoder_count]
str r2, [br, #bool_decoder_pos]
ldrb r2, [r1, #3]
ldrb r3, [r1, #2]
ldrb r4, [r1, #1]
ldrb r5, [r1]
orr r1, r2, r3, lsl #8
orr r1, r1, r4, lsl #16
orr r1, r1, r5, lsl #24
str r1, [br, #bool_decoder_value]
ldmia sp!, {r4 - r5, pc}
ENDP ; |vp8dx_start_decode_v6|
;void vp8dx_stop_decode_v6 ( BOOL_DECODER *bc );
|vp8dx_stop_decode_v6| PROC
mov pc, lr
ENDP ; |vp8dx_stop_decode_v6|
; bigsplit RN r1
; buffer_v RN r1
; count_v RN r4
; range_v RN r2
; value_v RN r3
; pos_v RN r5
; split RN r6
; bit RN lr
;int vp8dx_decode_bool_v6 ( BOOL_DECODER *br, int probability )
|vp8dx_decode_bool_v6| PROC
vp8dx_decode_bool_v6_internal
stmdb sp!, {r4 - r6, lr}
ldr r2, [br, #bool_decoder_range]
ldr r3, [br, #bool_decoder_value]
mov r6, r2, lsl #8
sub r6, r6, #256 ; split = 1 + (((range-1) * probability) >> 8)
mov r12, #1
smlawb r6, r6, prob, r12
mov lr, #0
subs r5, r3, r6, lsl #24
;cmp r3, r1
movhs lr, #1
movhs r3, r5
subhs r2, r2, r6
movlo r2, r6
cmp r2, #0x80
blt range_less_0x80
;strd r2, r3, [br, #bool_decoder_range]
str r2, [br, #bool_decoder_range]
str r3, [br, #bool_decoder_value]
mov r0, lr
ldmia sp!, {r4 - r6, pc}
range_less_0x80
ldr r5, [br, #bool_decoder_pos]
ldr r1, [br, #bool_decoder_buffer]
ldr r4, [br, #bool_decoder_count]
add r1, r1, r5
clz r12, r2
sub r12, r12, #24
subs r4, r4, r12
ldrleb r6, [r1], #1
mov r2, r2, lsl r12
mov r3, r3, lsl r12
addle r4, r4, #8
rsble r12, r4, #8
addle r5, r5, #1
orrle r3, r3, r6, lsl r12
;strd r2, r3, [br, #bool_decoder_range]
;strd r4, r5, [br, #bool_decoder_count]
str r2, [br, #bool_decoder_range]
str r3, [br, #bool_decoder_value]
str r4, [br, #bool_decoder_count]
str r5, [br, #bool_decoder_pos]
mov r0, lr
ldmia sp!, {r4 - r6, pc}
ENDP ; |vp8dx_decode_bool_v6|
END

View File

@@ -1,43 +0,0 @@
#ifndef DBOOLHUFF_ARM_H
#define DBOOLHUFF_ARM_H
/* JLK
* There are currently no arm-optimized versions of
* these functions. As they are implemented, they
* can be uncommented below and added to
* arm/dsystemdependent.c
*
* The existing asm code is likely so different as
* to be useless. However, its been left (for now)
* for reference.
*/
#if 0
#if HAVE_ARMV6
#undef vp8_dbool_start
#define vp8_dbool_start vp8dx_start_decode_v6
#undef vp8_dbool_fill
#define vp8_dbool_fill vp8_bool_decoder_fill_v6
#undef vp8_dbool_debool
#define vp8_dbool_debool vp8_decode_bool_v6
#undef vp8_dbool_devalue
#define vp8_dbool_devalue vp8_decode_value_v6
#endif /* HAVE_ARMV6 */
#if HAVE_ARMV7
#undef vp8_dbool_start
#define vp8_dbool_start vp8dx_start_decode_neon
#undef vp8_dbool_fill
#define vp8_dbool_fill vp8_bool_decoder_fill_neon
#undef vp8_dbool_debool
#define vp8_dbool_debool vp8_decode_bool_neon
#undef vp8_dbool_devalue
#define vp8_dbool_devalue vp8_decode_value_neon
#endif /* HAVE_ARMV7 */
#endif
#endif /* DBOOLHUFF_ARM_H */

View File

@@ -1,320 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_decode_mb_tokens_v6|
AREA |.text|, CODE, READONLY ; name this block of code
INCLUDE vpx_asm_offsets.asm
l_qcoeff EQU 0
l_i EQU 4
l_type EQU 8
l_stop EQU 12
l_c EQU 16
l_l_ptr EQU 20
l_a_ptr EQU 24
l_bc EQU 28
l_coef_ptr EQU 32
l_stacksize EQU 64
;; constant offsets -- these should be created at build time
c_block2above_offset EQU 25
c_entropy_nodes EQU 11
c_dct_eob_token EQU 11
|vp8_decode_mb_tokens_v6| PROC
stmdb sp!, {r4 - r11, lr}
sub sp, sp, #l_stacksize
mov r7, r1 ; type
mov r9, r0 ; detoken
ldr r1, [r9, #detok_current_bc]
ldr r0, [r9, #detok_qcoeff_start_ptr]
mov r11, #0 ; i
mov r3, #16 ; stop
cmp r7, #1 ; type ?= 1
addeq r11, r11, #24 ; i = 24
addeq r3, r3, #8 ; stop = 24
addeq r0, r0, #3, 24 ; qcoefptr += 24*16
str r0, [sp, #l_qcoeff]
str r11, [sp, #l_i]
str r7, [sp, #l_type]
str r3, [sp, #l_stop]
str r1, [sp, #l_bc]
add lr, r9, r7, lsl #2 ; detoken + type*4
ldr r8, [r1, #bool_decoder_user_buffer]
ldr r10, [lr, #detok_coef_probs]
ldr r5, [r1, #bool_decoder_count]
ldr r6, [r1, #bool_decoder_range]
ldr r4, [r1, #bool_decoder_value]
str r10, [sp, #l_coef_ptr]
BLOCK_LOOP
ldr r3, [r9, #detok_ptr_block2leftabove]
ldr r1, [r9, #detok_L]
ldr r2, [r9, #detok_A]
ldrb r12, [r3, r11]! ; block2left[i]
ldrb r3, [r3, #c_block2above_offset]; block2above[i]
cmp r7, #0 ; c = !type
moveq r7, #1
movne r7, #0
ldrb r0, [r1, r12]! ; *(L += block2left[i])
ldrb r3, [r2, r3]! ; *(A += block2above[i])
mov lr, #c_entropy_nodes ; ENTROPY_NODES = 11
; VP8_COMBINEENTROPYCONTETEXTS(t, *a, *l) => t = ((*a) != 0) + ((*l) !=0)
cmp r0, #0 ; *l ?= 0
movne r0, #1
cmp r3, #0 ; *a ?= 0
addne r0, r0, #1 ; t
str r1, [sp, #l_l_ptr] ; save &l
str r2, [sp, #l_a_ptr] ; save &a
smlabb r0, r0, lr, r10 ; Prob = coef_probs + (t * ENTROPY_NODES)
mov r1, #0 ; t = 0
str r7, [sp, #l_c]
;align 4
COEFF_LOOP
ldr r3, [r9, #detok_ptr_coef_bands_x]
ldr lr, [r9, #detok_coef_tree_ptr]
;STALL
ldrb r3, [r3, r7] ; coef_bands_x[c]
;STALL
;STALL
add r0, r0, r3 ; Prob += coef_bands_x[c]
get_token_loop
ldrb r2, [r0, +r1, asr #1] ; Prob[t >> 1]
mov r3, r6, lsl #8 ; range << 8
sub r3, r3, #256 ; (range << 8) - (1 << 8)
mov r10, #1 ; 1
smlawb r2, r3, r2, r10 ; split = 1 + (((range-1) * probability) >> 8)
ldrb r12, [r8] ; load cx data byte in stall slot : r8 = bufptr
;++
subs r3, r4, r2, lsl #24 ; value-(split<<24): used later to calculate shift for NORMALIZE
addhs r1, r1, #1 ; t += 1
movhs r4, r3 ; value -= bigsplit (split << 24)
subhs r2, r6, r2 ; range -= split
; movlo r6, r2 ; range = split
ldrsb r1, [lr, r1] ; t = onyx_coef_tree_ptr[t]
; NORMALIZE
clz r3, r2 ; vp8dx_bitreader_norm[range] + 24
sub r3, r3, #24 ; vp8dx_bitreader_norm[range]
subs r5, r5, r3 ; count -= shift
mov r6, r2, lsl r3 ; range <<= shift
mov r4, r4, lsl r3 ; value <<= shift
; if count <= 0, += BR_COUNT; value |= *bufptr++ << (BR_COUNT-count); BR_COUNT = 8, but need to upshift values by +16
addle r5, r5, #8 ; count += 8
rsble r3, r5, #24 ; 24 - count
addle r8, r8, #1 ; bufptr++
orrle r4, r4, r12, lsl r3 ; value |= *bufptr << shift + 16
cmp r1, #0 ; t ?= 0
bgt get_token_loop ; while (t > 0)
cmn r1, #c_dct_eob_token ; if(t == -DCT_EOB_TOKEN)
beq END_OF_BLOCK ; break
rsb lr, r1, #0 ; v = -t;
cmp lr, #4 ; if(v > FOUR_TOKEN)
ble SKIP_EXTRABITS
ldr r3, [r9, #detok_teb_base_ptr]
mov r11, #1 ; 1 in split = 1 + ... nope, v+= 1 << bits_count
add r7, r3, lr, lsl #4 ; detok_teb_base_ptr + (v << 4)
ldrsh lr, [r7, #tokenextrabits_min_val] ; v = teb_ptr->min_val
ldrsh r0, [r7, #tokenextrabits_length] ; bits_count = teb_ptr->Length
extrabits_loop
add r3, r0, r7 ; &teb_ptr->Probs[bits_count]
ldrb r2, [r3, #4] ; probability. why +4?
mov r3, r6, lsl #8 ; range << 8
sub r3, r3, #256 ; range << 8 + 1 << 8
smlawb r2, r3, r2, r11 ; split = 1 + (((range-1) * probability) >> 8)
ldrb r12, [r8] ; *bufptr
;++
subs r10, r4, r2, lsl #24 ; value - (split<<24)
movhs r4, r10 ; value = value - (split << 24)
subhs r2, r6, r2 ; range = range - split
addhs lr, lr, r11, lsl r0 ; v += ((UINT16)1<<bits_count)
; NORMALIZE
clz r3, r2 ; shift - leading zeros in split
sub r3, r3, #24 ; don't count first 3 bytes
subs r5, r5, r3 ; count -= shift
mov r6, r2, lsl r3 ; range = range << shift
mov r4, r4, lsl r3 ; value <<= shift
addle r5, r5, #8 ; count += BR_COUNT
addle r8, r8, #1 ; bufptr++
rsble r3, r5, #24 ; BR_COUNT - count
orrle r4, r4, r12, lsl r3 ; value |= *bufptr << (BR_COUNT - count)
subs r0, r0, #1 ; bits_count --
bpl extrabits_loop
SKIP_EXTRABITS
ldr r11, [sp, #l_qcoeff]
ldr r0, [sp, #l_coef_ptr] ; Prob = coef_probs
cmp r1, #0 ; check for nonzero token - if (t)
beq SKIP_EOB_CHECK ; if t is zero, we will skip the eob table chec
add r3, r6, #1 ; range + 1
mov r2, r3, lsr #1 ; split = (range + 1) >> 1
subs r3, r4, r2, lsl #24 ; value - (split<<24)
movhs r4, r3 ; value -= (split << 24)
subhs r2, r6, r2 ; range -= split
mvnhs r3, lr ; -v
addhs lr, r3, #1 ; v = (v ^ -1) + 1
; NORMALIZE
clz r3, r2 ; leading 0s in split
sub r3, r3, #24 ; shift
subs r5, r5, r3 ; count -= shift
mov r6, r2, lsl r3 ; range <<= shift
mov r4, r4, lsl r3 ; value <<= shift
ldrleb r2, [r8], #1 ; *(bufptr++)
addle r5, r5, #8 ; count += 8
rsble r3, r5, #24 ; BR_COUNT - count
orrle r4, r4, r2, lsl r3 ; value |= *bufptr << (BR_COUNT - count)
add r0, r0, #11 ; Prob += ENTROPY_NODES (11)
cmn r1, #1 ; t < -ONE_TOKEN
addlt r0, r0, #11 ; Prob += ENTROPY_NODES (11)
mvn r1, #1 ; t = -1 ???? C is -2
SKIP_EOB_CHECK
ldr r7, [sp, #l_c] ; c
ldr r3, [r9, #detok_scan]
add r1, r1, #2 ; t+= 2
cmp r7, #15 ; c should will be one higher
ldr r3, [r3, +r7, lsl #2] ; scan[c] this needs pre-inc c value
add r7, r7, #1 ; c++
add r3, r11, r3, lsl #1 ; qcoeff + scan[c]
str r7, [sp, #l_c] ; store c
strh lr, [r3] ; qcoef_ptr[scan[c]] = v
blt COEFF_LOOP
sub r7, r7, #1 ; if(t != -DCT_EOB_TOKEN) --c
END_OF_BLOCK
ldr r3, [sp, #l_type] ; type
ldr r10, [sp, #l_coef_ptr] ; coef_ptr
ldr r0, [sp, #l_qcoeff] ; qcoeff
ldr r11, [sp, #l_i] ; i
ldr r12, [sp, #l_stop] ; stop
cmp r3, #0 ; type ?= 0
moveq r1, #1
movne r1, #0
add r3, r11, r9 ; detok + i
cmp r7, r1 ; c ?= !type
strb r7, [r3, #detok_eob] ; eob[i] = c
ldr r7, [sp, #l_l_ptr] ; l
ldr r2, [sp, #l_a_ptr] ; a
movne r3, #1 ; t
moveq r3, #0
add r0, r0, #32 ; qcoeff += 32 (16 * 2?)
add r11, r11, #1 ; i++
strb r3, [r7] ; *l = t
strb r3, [r2] ; *a = t
str r0, [sp, #l_qcoeff] ; qcoeff
str r11, [sp, #l_i] ; i
cmp r11, r12 ; i < stop
ldr r7, [sp, #l_type] ; type
blt BLOCK_LOOP
cmp r11, #25 ; i ?= 25
bne ln2_decode_mb_to
ldr r12, [r9, #detok_qcoeff_start_ptr]
ldr r10, [r9, #detok_coef_probs]
mov r7, #0 ; type/i = 0
mov r3, #16 ; stop = 16
str r12, [sp, #l_qcoeff] ; qcoeff_ptr = qcoeff_start_ptr
str r7, [sp, #l_i]
str r7, [sp, #l_type]
str r3, [sp, #l_stop]
str r10, [sp, #l_coef_ptr] ; coef_probs = coef_probs[type=0]
b BLOCK_LOOP
ln2_decode_mb_to
cmp r11, #16 ; i ?= 16
bne ln1_decode_mb_to
mov r10, #detok_coef_probs
add r10, r10, #2*4 ; coef_probs[type]
ldr r10, [r9, r10] ; detok + detok_coef_probs[type]
mov r7, #2 ; type = 2
mov r3, #24 ; stop = 24
str r7, [sp, #l_type]
str r3, [sp, #l_stop]
str r10, [sp, #l_coef_ptr] ; coef_probs = coef_probs[type]
b BLOCK_LOOP
ln1_decode_mb_to
ldr r2, [sp, #l_bc]
mov r0, #0
nop
str r8, [r2, #bool_decoder_user_buffer]
str r5, [r2, #bool_decoder_count]
str r4, [r2, #bool_decoder_value]
str r6, [r2, #bool_decoder_range]
add sp, sp, #l_stacksize
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_decode_mb_tokens_v6|
END

View File

@@ -1,160 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_decode_value_neon|
EXPORT |vp8dx_start_decode_neon|
EXPORT |vp8dx_stop_decode_neon|
EXPORT |vp8dx_decode_bool_neon|
ARM
REQUIRE8
PRESERVE8
INCLUDE vpx_asm_offsets.asm
AREA |.text|, CODE, READONLY ; name this block of code
; int z = 0;
; int bit;
; for ( bit=bits-1; bit>=0; bit-- )
; {
; z |= (vp8dx_decode_bool(br, 0x80)<<bit);
; }
; return z;
;int vp8_decode_value_neon ( BOOL_DECODER *br, int bits )
|vp8_decode_value_neon| PROC
stmdb sp!, {r4 - r6, lr}
mov r4, r0
mov r5, r1
mov r6, #0
subs r5, r5, #1
bmi decode_value_exit
decode_value_loop
mov r1, #0x80
mov r0, r4
bl vp8dx_decode_bool_neon_internal ; needed for conversion to s file
orr r6, r6, r0, lsl r5
subs r5, r5, #1
bpl decode_value_loop
decode_value_exit
mov r0, r6
ldmia sp!, {r4 - r6, pc}
ENDP ; |vp8_decode_value_neon|
;void vp8dx_start_decode_neon ( BOOL_DECODER *br, unsigned char *source )
|vp8dx_start_decode_neon| PROC
stmdb sp!, {r4 - r5, lr}
mov r2, #0
mov r3, #255
str r2, [r0, #bool_decoder_lowvalue]
str r3, [r0, #bool_decoder_range]
str r1, [r0, #bool_decoder_buffer]
mov r3, #8
mov r2, #4
str r3, [r0, #bool_decoder_count]
str r2, [r0, #bool_decoder_pos]
ldrb r2, [r1, #3]
ldrb r3, [r1, #2]
ldrb r4, [r1, #1]
ldrb r5, [r1]
orr r1, r2, r3, lsl #8
orr r1, r1, r4, lsl #16
orr r1, r1, r5, lsl #24
str r1, [r0, #bool_decoder_value]
ldmia sp!, {r4 - r5, pc}
ENDP ; |vp8dx_start_decode_neon|
;void vp8dx_stop_decode_neon ( BOOL_DECODER *bc );
|vp8dx_stop_decode_neon| PROC
mov pc, lr
ENDP ; |vp8dx_stop_decode_neon|
; bigsplit RN r1
; buffer_v RN r1
; count_v RN r4
; range_v RN r2
; value_v RN r3
; pos_v RN r5
; split RN r6
; bit RN lr
;int vp8dx_decode_bool_neon ( BOOL_DECODER *br, int probability )
|vp8dx_decode_bool_neon| PROC
vp8dx_decode_bool_neon_internal
;LDRD and STRD doubleword data transfers must be eight-byte aligned. Use ALIGN 8
;before memory allocation
stmdb sp!, {r4 - r5, lr}
ldr r2, [r0, #bool_decoder_range] ;load range (r2), value(r3)
ldr r3, [r0, #bool_decoder_value]
;ldrd r2, r3, [r0, #bool_decoder_range] ;ldrd costs 2 cycles
;
mov r4, r2, lsl #8
sub r4, r4, #256
mov r12, #1
smlawb r4, r4, r1, r12 ;split = 1 + (((range-1) * probability) >> 8)
mov lr, r0
mov r0, #0 ;bit = 0
;
subs r5, r3, r4, lsl #24
subhs r2, r2, r4 ;range = br->range-split
movlo r2, r4 ;range = split
movhs r0, #1 ;bit = 1
movhs r3, r5 ;value = value-bigsplit
cmp r2, #0x80
blt range_less_0x80
strd r2, r3, [lr, #bool_decoder_range] ;store result
ldmia sp!, {r4 - r5, pc}
range_less_0x80
ldrd r4, r5, [lr, #bool_decoder_count] ;load count, pos, buffer
ldr r1, [lr, #bool_decoder_buffer]
clz r12, r2
add r1, r1, r5
sub r12, r12, #24
subs r4, r4, r12 ;count -= shift
mov r2, r2, lsl r12 ;range <<= shift
mov r3, r3, lsl r12 ;value <<= shift
addle r4, r4, #8 ;count += 8
ldrleb r12, [r1], #1 ;br->buffer[br->pos]
rsble r1, r4, #8 ;-count
addle r5, r5, #1 ;br->pos++
orrle r3, r3, r12, lsl r1 ;value |= (br->buffer[br->pos]) << (-count)
strd r2, r3, [lr, #bool_decoder_range] ;store result
strd r4, r5, [lr, #bool_decoder_count]
ldmia sp!, {r4 - r5, pc}
ENDP ; |vp8dx_decode_bool_neon|
END

View File

@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -12,13 +12,7 @@
 #include "vpx_ports/config.h"
 #include <stddef.h>
-#if CONFIG_VP8_ENCODER
-#include "vpx_scale/yv12config.h"
-#endif
-#if CONFIG_VP8_DECODER
 #include "onyxd_int.h"
-#endif
 #define DEFINE(sym, val) int sym = val;
@@ -31,29 +25,6 @@
  * {
  */
-#if CONFIG_VP8_DECODER || CONFIG_VP8_ENCODER
-DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
-#endif
-#if CONFIG_VP8_DECODER
-DEFINE(mb_diff, offsetof(MACROBLOCKD, diff));
-DEFINE(mb_predictor, offsetof(MACROBLOCKD, predictor));
-DEFINE(mb_dst_y_stride, offsetof(MACROBLOCKD, dst.y_stride));
-DEFINE(mb_dst_y_buffer, offsetof(MACROBLOCKD, dst.y_buffer));
-DEFINE(mb_dst_u_buffer, offsetof(MACROBLOCKD, dst.u_buffer));
-DEFINE(mb_dst_v_buffer, offsetof(MACROBLOCKD, dst.v_buffer));
-DEFINE(mb_up_available, offsetof(MACROBLOCKD, up_available));
-DEFINE(mb_left_available, offsetof(MACROBLOCKD, left_available));
 DEFINE(detok_scan, offsetof(DETOK, scan));
 DEFINE(detok_ptr_block2leftabove, offsetof(DETOK, ptr_block2leftabove));
 DEFINE(detok_coef_tree_ptr, offsetof(DETOK, vp8_coef_tree_ptr));
@@ -77,7 +48,6 @@ DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range));
 DEFINE(tokenextrabits_min_val, offsetof(TOKENEXTRABITS, min_val));
 DEFINE(tokenextrabits_length, offsetof(TOKENEXTRABITS, Length));
-#endif
 //add asserts for any offset that is not supported by assembly code
 //add asserts for any size that is not supported by assembly code

View File

@@ -26,7 +26,8 @@ DECLARE_ALIGNED(16, const unsigned char, vp8dx_bitreader_norm[256]) =
 };
-int vp8dx_start_decode_c(BOOL_DECODER *br, const unsigned char *source,
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
                        unsigned int source_sz)
 {
     br->user_buffer_end = source+source_sz;
@@ -39,13 +40,13 @@ int vp8dx_start_decode_c(BOOL_DECODER *br, const unsigned char *source,
         return 1;
     /* Populate the buffer */
-    vp8dx_bool_decoder_fill_c(br);
+    vp8dx_bool_decoder_fill(br);
     return 0;
 }
-void vp8dx_bool_decoder_fill_c(BOOL_DECODER *br)
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
 {
     const unsigned char *bufptr;
     const unsigned char *bufend;
@@ -62,69 +63,3 @@ void vp8dx_bool_decoder_fill_c(BOOL_DECODER *br)
     br->value = value;
     br->count = count;
 }
#if 0
/*
* Until optimized versions of these functions are available, we
* keep the implementation in the header to allow inlining.
*
* The RTCD-style invocations are still in place so this can
* be switched by just uncommenting these functions here and
* the DBOOLHUFF_INVOKE calls in the header.
*/
int vp8dx_decode_bool_c(BOOL_DECODER *br, int probability)
{
unsigned int bit=0;
VP8_BD_VALUE value;
unsigned int split;
VP8_BD_VALUE bigsplit;
int count;
unsigned int range;
value = br->value;
count = br->count;
range = br->range;
split = 1 + (((range-1) * probability) >> 8);
bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
range = split;
if(value >= bigsplit)
{
range = br->range-split;
value = value-bigsplit;
bit = 1;
}
/*if(range>=0x80)
{
br->value = value;
br->range = range;
return bit;
}*/
{
register unsigned int shift = vp8dx_bitreader_norm[range];
range <<= shift;
value <<= shift;
count -= shift;
}
br->value = value;
br->count = count;
br->range = range;
if (count < 0)
vp8dx_bool_decoder_fill_c(br);
return bit;
}
int vp8dx_decode_value_c(BOOL_DECODER *br, int bits)
{
int z = 0;
int bit;
for ( bit=bits-1; bit>=0; bit-- )
{
z |= (vp8dx_decode_bool(br, 0x80)<<bit);
}
return z;
}
#endif

View File

@@ -25,10 +25,6 @@ typedef size_t VP8_BD_VALUE;
   Even relatively modest values like 100 would work fine.*/
 # define VP8_LOTS_OF_BITS (0x40000000)
-struct vp8_dboolhuff_rtcd_vtable;
 typedef struct
 {
     const unsigned char *user_buffer_end;
@@ -36,82 +32,15 @@
     VP8_BD_VALUE value;
     int count;
     unsigned int range;
-#if CONFIG_RUNTIME_CPU_DETECT
-    struct vp8_dboolhuff_rtcd_vtable *rtcd;
-#endif
 } BOOL_DECODER;
#define prototype_dbool_start(sym) int sym(BOOL_DECODER *br, \
const unsigned char *source, unsigned int source_sz)
#define prototype_dbool_fill(sym) void sym(BOOL_DECODER *br)
#define prototype_dbool_debool(sym) int sym(BOOL_DECODER *br, int probability)
#define prototype_dbool_devalue(sym) int sym(BOOL_DECODER *br, int bits)
#if ARCH_ARM
#include "arm/dboolhuff_arm.h"
#endif
#ifndef vp8_dbool_start
#define vp8_dbool_start vp8dx_start_decode_c
#endif
#ifndef vp8_dbool_fill
#define vp8_dbool_fill vp8dx_bool_decoder_fill_c
#endif
#ifndef vp8_dbool_debool
#define vp8_dbool_debool vp8dx_decode_bool_c
#endif
#ifndef vp8_dbool_devalue
#define vp8_dbool_devalue vp8dx_decode_value_c
#endif
extern prototype_dbool_start(vp8_dbool_start);
extern prototype_dbool_fill(vp8_dbool_fill);
extern prototype_dbool_debool(vp8_dbool_debool);
extern prototype_dbool_devalue(vp8_dbool_devalue);
typedef prototype_dbool_start((*vp8_dbool_start_fn_t));
typedef prototype_dbool_fill((*vp8_dbool_fill_fn_t));
typedef prototype_dbool_debool((*vp8_dbool_debool_fn_t));
typedef prototype_dbool_devalue((*vp8_dbool_devalue_fn_t));
typedef struct vp8_dboolhuff_rtcd_vtable {
vp8_dbool_start_fn_t start;
vp8_dbool_fill_fn_t fill;
vp8_dbool_debool_fn_t debool;
vp8_dbool_devalue_fn_t devalue;
} vp8_dboolhuff_rtcd_vtable_t;
/* There are no processor-specific versions of these
* functions right now. Disable RTCD to avoid using
* function pointers which gives a speed boost
*/
/*#ifdef ENABLE_RUNTIME_CPU_DETECT
#define DBOOLHUFF_INVOKE(ctx,fn) (ctx)->fn
#define IF_RTCD(x) (x)
#else*/
#define DBOOLHUFF_INVOKE(ctx,fn) vp8_dbool_##fn
#define IF_RTCD(x) NULL
/*#endif*/
 DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
-/* wrapper functions to hide RTCD. static means inline means hopefully no
- * penalty
- */
-static int vp8dx_start_decode(BOOL_DECODER *br,
-                              struct vp8_dboolhuff_rtcd_vtable *rtcd,
-                              const unsigned char *source, unsigned int source_sz) {
-#if CONFIG_RUNTIME_CPU_DETECT
-    br->rtcd = rtcd;
-#endif
-    return DBOOLHUFF_INVOKE(rtcd, start)(br, source, source_sz);
-}
-static void vp8dx_bool_decoder_fill(BOOL_DECODER *br) {
-    DBOOLHUFF_INVOKE(br->rtcd, fill)(br);
-}
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz);
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
 /*The refill loop is used in several places, so define it in a macro to make
   sure they're all consistent.
@@ -138,12 +67,6 @@ static void vp8dx_bool_decoder_fill(BOOL_DECODER *br) {
 static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
-    /*
-     * Until optimized versions of this function are available, we
-     * keep the implementation in the header to allow inlining.
-     *
-     *return DBOOLHUFF_INVOKE(br->rtcd, debool)(br, probability);
-     */
     unsigned int bit = 0;
     VP8_BD_VALUE value;
     unsigned int split;
@@ -167,13 +90,6 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
         bit = 1;
     }
-    /*if(range>=0x80)
-    {
-        br->value = value;
-        br->range = range;
-        return bit
-    }*/
     {
         register unsigned int shift = vp8dx_bitreader_norm[range];
         range <<= shift;
@@ -190,12 +106,6 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
 static int vp8_decode_value(BOOL_DECODER *br, int bits)
 {
-    /*
-     * Until optimized versions of this function are available, we
-     * keep the implementation in the header to allow inlining.
-     *
-     *return DBOOLHUFF_INVOKE(br->rtcd, devalue)(br, bits);
-     */
     int z = 0;
     int bit;
@@ -206,4 +116,29 @@ static int vp8_decode_value(BOOL_DECODER *br, int bits)
     return z;
 }
static int vp8dx_bool_error(BOOL_DECODER *br)
{
/* Check if we have reached the end of the buffer.
*
* Variable 'count' stores the number of bits in the 'value' buffer,
* minus 8. So if count == 8, there are 16 bits available to be read.
* Normally, count is filled with 8 and one byte is filled into the
* value buffer. When we reach the end of the buffer, count is instead
* filled with VP8_LOTS_OF_BITS, 8 of which represent the last 8 real
* bits from the bitstream. So the last bit in the bitstream will be
* represented by count == VP8_LOTS_OF_BITS - 16.
*/
if ((br->count > VP8_BD_VALUE_SIZE)
&& (br->count <= VP8_LOTS_OF_BITS - 16))
{
/* We have tried to decode bits after the end of
* stream was encountered.
*/
return 1;
}
/* No error. */
return 0;
}
 #endif
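With the RTCD wrappers gone, the bool decoder reduces to the plain entry points declared above: vp8dx_start_decode, vp8dx_decode_bool, vp8_decode_value, and the new vp8dx_bool_error end-of-buffer check. The fragment below is an illustrative sketch only, not part of the patch; `buf`, `buf_sz`, and `handle_corrupt_partition()` are hypothetical names standing in for a real caller.

/* Sketch: typical call sequence against the declarations above. */
BOOL_DECODER bd;
if (vp8dx_start_decode(&bd, buf, buf_sz))
    return -1;                           /* setup failed */
int bit  = vp8dx_decode_bool(&bd, 128);  /* one bit at probability 128/256 */
int bits = vp8_decode_value(&bd, 4);     /* four raw bits */
if (vp8dx_bool_error(&bd))
    handle_corrupt_partition();          /* reads ran past the end of the data */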

View File

@@ -19,7 +19,7 @@
 extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
 extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
 extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
-extern int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+extern void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
 extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
 #endif

View File

@@ -381,6 +381,12 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
     xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
     xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+    if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
+    {
+        /* propagate errors from reference frames */
+        xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
+    }
     vp8_build_uvmvs(xd, pc->full_pixel);
     /*
@@ -391,6 +397,8 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
      */
     vp8_decode_macroblock(pbi, xd);
+    /* check if the boolean decoder has suffered an error */
+    xd->corrupted |= vp8dx_bool_error(xd->current_bc);
     recon_yoffset += 16;
     recon_uvoffset += 8;
@@ -461,13 +469,13 @@ static void setup_token_decoder(VP8D_COMP *pbi,
             partition_size = user_data_end - partition;
         }
-        if (user_data_end - partition < partition_size)
+        if (partition + partition_size > user_data_end
+            || partition + partition_size < partition)
             vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                                "Truncated packet or corrupt partition "
                                "%d length", i + 1);
-        if (vp8dx_start_decode(bool_decoder, IF_RTCD(&pbi->dboolhuff),
-                               partition, partition_size))
+        if (vp8dx_start_decode(bool_decoder, partition, partition_size))
            vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                               "Failed to allocate bool decoder %d", i + 1);
@@ -476,15 +484,16 @@ static void setup_token_decoder(VP8D_COMP *pbi,
bool_decoder++; bool_decoder++;
} }
#if CONFIG_MULTITHREAD
/* Clamp number of decoder threads */ /* Clamp number of decoder threads */
if (pbi->decoding_thread_count > num_part - 1) if (pbi->decoding_thread_count > num_part - 1)
pbi->decoding_thread_count = num_part - 1; pbi->decoding_thread_count = num_part - 1;
#endif
} }
static void stop_token_decoder(VP8D_COMP *pbi) static void stop_token_decoder(VP8D_COMP *pbi)
{ {
int i;
VP8_COMMON *pc = &pbi->common; VP8_COMMON *pc = &pbi->common;
if (pc->multi_token_partition != ONE_PARTITION) if (pc->multi_token_partition != ONE_PARTITION)
@@ -555,6 +564,7 @@ static void init_frame(VP8D_COMP *pbi)
xd->frame_type = pc->frame_type;
xd->mode_info_context->mbmi.mode = DC_PRED;
xd->mode_info_stride = pc->mode_info_stride;
xd->corrupted = 0; /* init without corruption */
}
int vp8_decode_frame(VP8D_COMP *pbi)
@@ -570,6 +580,10 @@ int vp8_decode_frame(VP8D_COMP *pbi)
int i, j, k, l;
const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
/* start with no corruption of current frame */
xd->corrupted = 0;
pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
if (data_end - data < 3)
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet");
@@ -580,7 +594,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
(data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
data += 3;
-if (data_end - data < first_partition_length_in_bytes)
+if (data + first_partition_length_in_bytes > data_end
+    || data + first_partition_length_in_bytes < data)
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt partition 0 length");
vp8_setup_version(pc);
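The rewritten length tests above (for partition 0 here and for the token partitions earlier) guard against more than simple truncation: with a corrupt length, data + length can wrap around the address space and the old subtraction-based comparison would still pass. A stripped-down version of the same idiom, with hypothetical names:

#include <stddef.h>

/* Nonzero if the claimed partition does not fit in the remaining buffer,
 * either because it runs past data_end or because the addition wrapped. */
static int partition_truncated(const unsigned char *partition,
                               const unsigned char *data_end,
                               size_t partition_size)
{
    return partition + partition_size > data_end ||
           partition + partition_size < partition;
}

(Comparing the remaining byte count instead, e.g. (size_t)(data_end - partition) < partition_size, avoids relying on out-of-range pointer arithmetic, but the pointer form above is what the diff uses.)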
@@ -637,8 +652,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
init_frame(pbi);
-if (vp8dx_start_decode(bc, IF_RTCD(&pbi->dboolhuff),
-data, data_end - data))
+if (vp8dx_start_decode(bc, data, data_end - data))
vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate bool decoder 0");
if (pc->frame_type == KEY_FRAME) {
@@ -832,7 +846,9 @@ int vp8_decode_frame(VP8D_COMP *pbi)
vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
/* set up frame new frame for intra coded blocks */
#if CONFIG_MULTITHREAD
if (!(pbi->b_multithreaded_rd) || pc->multi_token_partition == ONE_PARTITION || !(pc->filter_level))
#endif
vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
vp8_setup_block_dptrs(xd);
@@ -852,6 +868,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));
#if CONFIG_MULTITHREAD
if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
{
vp8mt_decode_mb_rows(pbi, xd);
@@ -866,6 +883,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
vp8_yv12_extend_frame_borders_ptr(&pc->yv12_fb[pc->new_fb_idx]); /*cm->frame_to_show);*/
}
else
#endif
{
int ibc = 0;
int num_part = 1 << pc->multi_token_partition;
@@ -890,6 +908,14 @@ int vp8_decode_frame(VP8D_COMP *pbi)
stop_token_decoder(pbi);
/* Collect information about decoder corruption. */
/* 1. Check first boolean decoder for errors. */
pc->yv12_fb[pc->new_fb_idx].corrupted =
vp8dx_bool_error(bc);
/* 2. Check the macroblock information */
pc->yv12_fb[pc->new_fb_idx].corrupted |=
xd->corrupted;
/* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes \n",bc->pos+pbi->bc2.pos); */
/* If this was a kf or Gf note the Q used */
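Taken together, the additions in this file give the decoder a simple corruption data flow: xd->corrupted starts at zero for every frame, picks up damage inherited from whichever reference frame an inter macroblock predicts from, picks up bool-decoder overruns after each macroblock, and is finally folded into the new frame buffer's corrupted flag so the state follows the reference chain into later frames. A condensed sketch of that flow, using the names from the diff (not a complete decode path):

/* Sketch only: the real logic is spread across init_frame(),
 * vp8_decode_mb_row() and vp8_decode_frame() above. */
static void corruption_flow_sketch(VP8D_COMP *pbi, MACROBLOCKD *xd,
                                   VP8_COMMON *pc, int ref_fb_idx)
{
    xd->corrupted = 0;                                        /* per frame */
    pc->yv12_fb[pc->new_fb_idx].corrupted = 0;

    /* per macroblock */
    if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
        xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;   /* inherit   */
    xd->corrupted |= vp8dx_bool_error(xd->current_bc);        /* bitstream */

    /* per frame, after all rows are decoded */
    pc->yv12_fb[pc->new_fb_idx].corrupted |= xd->corrupted;
}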


@@ -74,37 +74,6 @@ void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
}
}
#if CONFIG_ARM_ASM_DETOK
/* mashup of vp8_block2left and vp8_block2above so we only need one pointer
* for the assembly version.
*/
DECLARE_ALIGNED(16, const UINT8, vp8_block2leftabove[25*2]) =
{
/* vp8_block2left */
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
/* vp8_block2above */
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
};
void vp8_init_detokenizer(VP8D_COMP *dx)
{
const VP8_COMMON *const oc = & dx->common;
MACROBLOCKD *x = & dx->mb;
dx->detoken.vp8_coef_tree_ptr = vp8_coef_tree;
dx->detoken.ptr_block2leftabove = vp8_block2leftabove;
dx->detoken.ptr_coef_bands_x = vp8_coef_bands_x;
dx->detoken.scan = vp8_default_zig_zag1d;
dx->detoken.teb_base_ptr = vp8d_token_extra_bits2;
dx->detoken.qcoeff_start_ptr = &x->qcoeff[0];
dx->detoken.coef_probs[0] = (oc->fc.coef_probs [0] [ 0 ] [0]);
dx->detoken.coef_probs[1] = (oc->fc.coef_probs [1] [ 0 ] [0]);
dx->detoken.coef_probs[2] = (oc->fc.coef_probs [2] [ 0 ] [0]);
dx->detoken.coef_probs[3] = (oc->fc.coef_probs [3] [ 0 ] [0]);
}
#endif
DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
#define FILL \
if(count < 0) \
@@ -202,35 +171,6 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
}\
NORMALIZE
#if CONFIG_ARM_ASM_DETOK
int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
{
int eobtotal = 0;
int i, type;
dx->detoken.current_bc = x->current_bc;
dx->detoken.A = x->above_context;
dx->detoken.L = x->left_context;
type = 3;
if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
{
type = 1;
eobtotal -= 16;
}
vp8_decode_mb_tokens_v6(&dx->detoken, type);
for (i = 0; i < 25; i++)
{
x->eobs[i] = dx->detoken.eob[i];
eobtotal += dx->detoken.eob[i];
}
return eobtotal;
}
#else
int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
{
ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
@@ -423,4 +363,3 @@ BLOCK_FINISHED:
return eobtotal;
}
#endif /*!CONFIG_ASM_DETOK*/


@@ -14,10 +14,6 @@
#include "onyxd_int.h"
#if ARCH_ARM
#include "arm/detokenize_arm.h"
#endif
void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);


@@ -27,12 +27,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
#if 0 /*For use with RTCD, when implemented*/
pbi->dboolhuff.debool = vp8dx_decode_bool_c;
pbi->dboolhuff.devalue = vp8dx_decode_value_c;
#endif
#endif
#if ARCH_X86 || ARCH_X86_64


@@ -114,8 +114,10 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
pbi->ready_for_new_data = 1;
pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/
#if CONFIG_MULTITHREAD
pbi->max_threads = oxcf->max_threads;
vp8_decoder_create_threads(pbi);
#endif
/* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
* unnecessary calling of vp8cx_init_de_quantizer() for every frame.
@@ -131,9 +133,6 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
cm->last_sharpness_level = cm->sharpness_level;
}
#if CONFIG_ARM_ASM_DETOK
vp8_init_detokenizer(pbi);
#endif
pbi->common.error.setjmp = 0;
return (VP8D_PTR) pbi;
}
@@ -149,8 +148,8 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
#if CONFIG_MULTITHREAD
if (pbi->b_multithreaded_rd)
vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
#endif
vp8_decoder_remove_threads(pbi);
#endif
vp8_remove_common(&pbi->common);
vpx_free(pbi);
}
@@ -254,12 +253,7 @@ static void ref_cnt_fb (int *buf, int *idx, int new_idx)
/* If any buffer copy / swapping is signalled it should be done here. */
static int swap_frame_buffers (VP8_COMMON *cm)
{
-int fb_to_update_with, err = 0;
+int err = 0;
if (cm->refresh_last_frame)
fb_to_update_with = cm->lst_fb_idx;
else
fb_to_update_with = cm->new_fb_idx;
/* The alternate reference frame or golden frame can be updated
* using the new, last, or golden/alt ref frame. If it
@@ -271,7 +265,7 @@ static int swap_frame_buffers (VP8_COMMON *cm)
int new_fb = 0;
if (cm->copy_buffer_to_arf == 1)
-new_fb = fb_to_update_with;
+new_fb = cm->lst_fb_idx;
else if (cm->copy_buffer_to_arf == 2)
new_fb = cm->gld_fb_idx;
else
@@ -285,7 +279,7 @@ static int swap_frame_buffers (VP8_COMMON *cm)
int new_fb = 0;
if (cm->copy_buffer_to_gf == 1)
-new_fb = fb_to_update_with;
+new_fb = cm->lst_fb_idx;
else if (cm->copy_buffer_to_gf == 2)
new_fb = cm->alt_fb_idx;
else
@@ -334,6 +328,23 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
pbi->common.error.error_code = VPX_CODEC_OK;
if (size == 0)
{
/* This is used to signal that we are missing frames.
* We do not know if the missing frame(s) was supposed to update
* any of the reference buffers, but we act conservative and
* mark only the last buffer as corrupted.
*/
cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
/* Signal that we have no frame to show. */
cm->show_frame = 0;
/* Nothing more to do. */
return 0;
}
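At the public libvpx API level this path is reached by handing the decoder an empty packet; since the decoder cannot know which reference buffers the lost frame would have refreshed, it conservatively flags only the last-frame buffer and produces nothing to show. A hedged usage sketch, assuming the standard vpx_codec_decode() entry point maps onto the size == 0 branch above:

#include "vpx/vpx_decoder.h"

/* Tell the decoder a frame was dropped upstream, then continue as normal. */
static void notify_dropped_frame(vpx_codec_ctx_t *decoder)
{
    /* NULL data with zero size signals a missing frame. */
    vpx_codec_decode(decoder, NULL, 0, NULL, 0);
}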
#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
if (cm->rtcd.flags & HAS_NEON)
@@ -356,6 +367,13 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
}
#endif
pbi->common.error.setjmp = 0;
/* We do not know if the missing frame(s) was supposed to update
* any of the reference buffers, but we act conservative and
* mark only the last buffer as corrupted.
*/
cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
return -1;
@@ -388,6 +406,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
return retcode;
}
#if CONFIG_MULTITHREAD
if (pbi->b_multithreaded_rd && cm->multi_token_partition != ONE_PARTITION)
{
if (swap_frame_buffers (cm))
@@ -405,6 +424,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
return -1;
}
} else
#endif
{
if (swap_frame_buffers (cm))
{
@@ -506,7 +526,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
pbi->common.error.setjmp = 0;
return retcode;
}
-int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level, int noise_level, int flags)
+int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags)
{
int ret = -1;
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -524,7 +544,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp,
sd->clrtype = pbi->common.clr_type;
#if CONFIG_POSTPROC
-ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
+ret = vp8_post_proc_frame(&pbi->common, sd, flags);
#else
if (pbi->common.frame_to_show)


@@ -87,14 +87,15 @@ typedef struct VP8Decompressor
unsigned int time_decoding;
unsigned int time_loop_filtering;
#if CONFIG_MULTITHREAD
/* variable for threading */
volatile int b_multithreaded_rd;
int max_threads;
int current_mb_col_main;
int decoding_thread_count;
int allocated_decoding_thread_count;
/* variable for threading */
#if CONFIG_MULTITHREAD
int mt_baseline_filter_level[MAX_MB_SEGMENTS];
int sync_range;
int *mt_current_mb_col; /* Each row remembers its already decoded column. */
@@ -125,7 +126,6 @@ typedef struct VP8Decompressor
#if CONFIG_RUNTIME_CPU_DETECT
vp8_dequant_rtcd_vtable_t dequant;
struct vp8_dboolhuff_rtcd_vtable dboolhuff;
#endif


@@ -21,7 +21,6 @@
void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
{
#if CONFIG_MULTITHREAD
unsigned char *yabove_row; /* = x->dst.y_buffer - x->dst.y_stride; */
unsigned char *yleft_col;
unsigned char yleft_buf[16];
@@ -146,17 +145,10 @@ void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row
case MB_MODE_COUNT:
break;
}
#else
(void) pbi;
(void) x;
(void) mb_row;
(void) mb_col;
#endif
}
void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
{
#if CONFIG_MULTITHREAD
unsigned char *yabove_row; /* = x->dst.y_buffer - x->dst.y_stride; */
unsigned char *yleft_col;
unsigned char yleft_buf[16];
@@ -289,17 +281,10 @@ void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_r
case MB_MODE_COUNT:
break;
}
#else
(void) pbi;
(void) x;
(void) mb_row;
(void) mb_col;
#endif
}
void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
{
#if CONFIG_MULTITHREAD
unsigned char *uabove_row; /* = x->dst.u_buffer - x->dst.uv_stride; */
unsigned char *uleft_col; /*[16];*/
unsigned char uleft_buf[8];
@@ -452,17 +437,10 @@ void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_ro
case MB_MODE_COUNT:
break;
}
#else
(void) pbi;
(void) x;
(void) mb_row;
(void) mb_col;
#endif
}
void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
{
#if CONFIG_MULTITHREAD
unsigned char *uabove_row; /* = x->dst.u_buffer - x->dst.uv_stride; */
unsigned char *uleft_col; /*[16];*/
unsigned char uleft_buf[8];
@@ -621,12 +599,6 @@ void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_
case MB_MODE_COUNT:
break;
}
#else
(void) pbi;
(void) x;
(void) mb_row;
(void) mb_col;
#endif
}
@@ -638,7 +610,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
int mb_col,
int num)
{
#if CONFIG_MULTITHREAD
int i, r, c;
unsigned char *Above; /* = *(x->base_dst) + x->dst - x->dst_stride; */
@@ -935,15 +906,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
}
#else
(void) pbi;
(void) xd;
(void) b_mode;
(void) predictor;
(void) mb_row;
(void) mb_col;
(void) num;
#endif
}
/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
@@ -951,7 +913,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
*/
void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
{
#if CONFIG_MULTITHREAD
unsigned char *above_right; /* = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; */
unsigned int *src_ptr;
unsigned int *dst_ptr0;
@@ -973,10 +934,4 @@ void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row
*dst_ptr0 = *src_ptr;
*dst_ptr1 = *src_ptr;
*dst_ptr2 = *src_ptr;
#else
(void) pbi;
(void) x;
(void) mb_row;
(void) mb_col;
#endif
}


@@ -9,7 +9,7 @@
*/
-#ifndef WIN32
+#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
# include <unistd.h>
#endif
#ifdef __APPLE__
@@ -38,7 +38,6 @@ extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
{
#if CONFIG_MULTITHREAD
VP8_COMMON *const pc = & pbi->common;
int i, j;
@@ -88,18 +87,11 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
for (i=0; i< pc->mb_rows; i++)
pbi->mt_current_mb_col[i]=-1;
#else
(void) pbi;
(void) xd;
(void) mbrd;
(void) count;
#endif
}
void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
{
#if CONFIG_MULTITHREAD
int eobtotal = 0;
int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
VP8_COMMON *pc = &pbi->common;
@@ -222,18 +214,11 @@ void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb
(xd->qcoeff+16*16, xd->block[16].dequant,
xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
#else
(void) pbi;
(void) xd;
(void) mb_row;
(void) mb_col;
#endif
}
THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
{
#if CONFIG_MULTITHREAD
int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
@@ -320,7 +305,7 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
* These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
-vp8_adjust_mb_lf_value(xd, &filter_level);
+filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
}
/* Distance of Mb to the various image edges.
@@ -438,9 +423,6 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
sem_post(&pbi->h_event_end_decoding);
}
}
#else
(void) p_data;
#endif
return 0 ;
}
@@ -448,10 +430,8 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
void vp8_decoder_create_threads(VP8D_COMP *pbi)
{
#if CONFIG_MULTITHREAD
int core_count = 0;
int ithread;
int i;
pbi->b_multithreaded_rd = 0;
pbi->allocated_decoding_thread_count = 0;
@@ -483,16 +463,11 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
}
#else
(void) pbi;
#endif
}
void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
{
#if CONFIG_MULTITHREAD
VP8_COMMON *const pc = & pbi->common;
int i;
@@ -590,15 +565,11 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
pbi->mt_vleft_col = NULL ;
}
}
#else
(void) pbi;
#endif
}
-int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
+void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
{
#if CONFIG_MULTITHREAD
VP8_COMMON *const pc = & pbi->common;
int i;
int uv_width;
@@ -647,18 +618,11 @@ int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
for (i=0; i< pc->mb_rows; i++)
CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
}
return 0;
#else
(void) pbi;
(void) width;
#endif
}
void vp8_decoder_remove_threads(VP8D_COMP *pbi)
{
#if CONFIG_MULTITHREAD
/* shutdown MB Decoding thread; */
if (pbi->b_multithreaded_rd)
{
@@ -704,15 +668,11 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
pbi->de_thread_data = NULL;
}
}
#else
(void) pbi;
#endif
}
void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
{
#if CONFIG_MULTITHREAD
VP8_COMMON *cm = &pbi->common;
MACROBLOCKD *mbd = &pbi->mb;
/*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/ /*frame_to_show;*/
@@ -722,7 +682,6 @@ void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
/*int mb_row;
int mb_col;
int baseline_filter_level[MAX_MB_SEGMENTS];*/
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
int i;
@@ -755,22 +714,17 @@ void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
#else
(void) pbi;
(void) default_filt_lvl;
#endif
}
void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
{
#if CONFIG_MULTITHREAD
int mb_row;
VP8_COMMON *pc = &pbi->common;
int ibc = 0;
int num_part = 1 << pbi->common.multi_token_partition;
-int i, j;
+int i;
volatile int *last_row_current_mb_col = NULL;
int nsync = pbi->sync_range;
@@ -810,7 +764,6 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
{
int i;
xd->current_bc = &pbi->mbc[mb_row%num_part];
@@ -867,7 +820,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
* These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
-vp8_adjust_mb_lf_value(xd, &filter_level);
+filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
}
/* Distance of Mb to the various image edges.
@@ -894,9 +847,18 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
{
/* propagate errors from reference frames */
xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
}
vp8_build_uvmvs(xd, pc->full_pixel);
vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
/* check if the boolean decoder has suffered an error */
xd->corrupted |= vp8dx_bool_error(xd->current_bc);
if (pbi->common.filter_level)
{
/* Save decoded MB last row data for next-row decoding */
@@ -976,8 +938,4 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
}
sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
#else
(void) pbi;
(void) xd;
#endif
}
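The worker threads coordinate through the fields visible above: mt_current_mb_col records, for every macroblock row, how far that row has been decoded, and sync_range (nsync) is the lead the row above must have before the row below may proceed, since a macroblock only needs its above and above-right neighbours. A simplified sketch of that wait, with a hypothetical helper name:

/* Wait until the row above has decoded at least nsync columns past mb_col.
 * Illustrative only; the real busy-wait lives in vp8_thread_decoding_proc()
 * and vp8mt_decode_mb_rows(). */
static void wait_for_row_above(volatile const int *last_row_current_mb_col,
                               int mb_col, int nsync)
{
    while (*last_row_current_mb_col < mb_col + nsync)
        ;   /* spin; the volatile read sees the other thread's progress */
}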


@@ -38,14 +38,14 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
-cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;
-cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;*/
+cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;*/
+cpi->rtcd.variance.var16x16 = vp8_variance16x16_armv6;
/*cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
-cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
-cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;*/
+cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;*/
+cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_armv6;
/*cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/


@@ -14,7 +14,7 @@
EXPORT |vp8_stop_encode|
EXPORT |vp8_encode_value|
-INCLUDE vpx_vp8_enc_asm_offsets.asm
+INCLUDE asm_enc_offsets.asm
ARM
REQUIRE8


@@ -11,7 +11,7 @@
EXPORT |vp8cx_pack_tokens_armv5|
-INCLUDE vpx_vp8_enc_asm_offsets.asm
+INCLUDE asm_enc_offsets.asm
ARM
REQUIRE8
@@ -29,10 +29,9 @@
push {r4-r11, lr}
; Add size of xcount * sizeof (TOKENEXTRA) to get stop
-; sizeof (TOKENEXTRA) is 20
-add r2, r2, r2, lsl #2 ; xcount
+; sizeof (TOKENEXTRA) is 8
sub sp, sp, #12
-add r2, r1, r2, lsl #2 ; stop = p + xcount
+add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
str r2, [sp, #0]
str r3, [sp, #8] ; save vp8_coef_encodings
ldr r2, [r0, #vp8_writer_lowvalue]
@@ -41,13 +40,13 @@
b check_p_lt_stop
while_p_lt_stop
-ldr r6, [r1, #tokenextra_token] ; t
+ldrb r6, [r1, #tokenextra_token] ; t
ldr r4, [sp, #8] ; vp8_coef_encodings
mov lr, #0
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
ldr r9, [r1, #tokenextra_context_tree] ; pp
-ldr r7, [r1, #tokenextra_skip_eob_node]
+ldrb r7, [r1, #tokenextra_skip_eob_node]
ldr r6, [r4, #vp8_token_value] ; v
ldr r8, [r4, #vp8_token_len] ; n
@@ -142,12 +141,11 @@ token_count_lt_zero
subs r8, r8, #1 ; --n
bne token_loop
-ldr r6, [r1, #tokenextra_token] ; t
+ldrb r6, [r1, #tokenextra_token] ; t
ldr r7, [sp, #48] ; vp8_extra_bits
; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-; element. Here vp8_extra_bit_struct == 20
-add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t
-add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t
+; element. Here vp8_extra_bit_struct == 16
+add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
ldr r4, [r12, #vp8_extra_bit_struct_base_val]
cmp r4, #0
@@ -155,7 +153,7 @@ token_count_lt_zero
; if( b->base_val)
ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
-ldr lr, [r1, #tokenextra_extra] ; e = p->Extra
+ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
cmp r8, #0 ; if( L)
beq no_extra_bits


@@ -11,7 +11,7 @@
EXPORT |vp8cx_pack_mb_row_tokens_armv5|
-INCLUDE vpx_vp8_enc_asm_offsets.asm
+INCLUDE asm_enc_offsets.asm
ARM
REQUIRE8
@@ -62,13 +62,13 @@ mb_row_loop
; actuall work gets done here!
while_p_lt_stop
-ldr r6, [r1, #tokenextra_token] ; t
+ldrb r6, [r1, #tokenextra_token] ; t
ldr r4, [sp, #20] ; vp8_coef_encodings
mov lr, #0
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
ldr r9, [r1, #tokenextra_context_tree] ; pp
-ldr r7, [r1, #tokenextra_skip_eob_node]
+ldrb r7, [r1, #tokenextra_skip_eob_node]
ldr r6, [r4, #vp8_token_value] ; v
ldr r8, [r4, #vp8_token_len] ; n
@@ -163,12 +163,11 @@ token_count_lt_zero
subs r8, r8, #1 ; --n
bne token_loop
-ldr r6, [r1, #tokenextra_token] ; t
+ldrb r6, [r1, #tokenextra_token] ; t
ldr r7, [sp, #8] ; vp8_extra_bits
; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-; element. Here vp8_extra_bit_struct == 20
-add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t
-add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t
+; element. Here vp8_extra_bit_struct == 16
+add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
ldr r4, [r12, #vp8_extra_bit_struct_base_val]
cmp r4, #0
@@ -176,7 +175,7 @@ token_count_lt_zero
; if( b->base_val)
ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
-ldr lr, [r1, #tokenextra_extra] ; e = p->Extra
+ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
cmp r8, #0 ; if( L)
beq no_extra_bits


@@ -11,7 +11,7 @@
EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
-INCLUDE vpx_vp8_enc_asm_offsets.asm
+INCLUDE asm_enc_offsets.asm
ARM
REQUIRE8
@@ -65,6 +65,8 @@
numparts_loop
ldr r10, [sp, #40] ; ptr
ldr r5, [sp, #36] ; move mb_rows to the counting section
sub r5, r5, r11 ; move start point with each partition
; mb_rows starts at i
str r5, [sp, #12]
; Reset all of the VP8 Writer data for each partition that
@@ -90,13 +92,13 @@ mb_row_loop
; actual work gets done here!
while_p_lt_stop
-ldr r6, [r1, #tokenextra_token] ; t
+ldrb r6, [r1, #tokenextra_token] ; t
ldr r4, [sp, #80] ; vp8_coef_encodings
mov lr, #0
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
ldr r9, [r1, #tokenextra_context_tree] ; pp
-ldr r7, [r1, #tokenextra_skip_eob_node]
+ldrb r7, [r1, #tokenextra_skip_eob_node]
ldr r6, [r4, #vp8_token_value] ; v
ldr r8, [r4, #vp8_token_len] ; n
@@ -191,12 +193,11 @@ token_count_lt_zero
subs r8, r8, #1 ; --n
bne token_loop
-ldr r6, [r1, #tokenextra_token] ; t
+ldrb r6, [r1, #tokenextra_token] ; t
ldr r7, [sp, #84] ; vp8_extra_bits
; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-; element. Here vp8_extra_bit_struct == 20
-add r6, r6, r6, lsl #2 ; b = vp8_extra_bits + t
-add r12, r7, r6, lsl #2 ; b = vp8_extra_bits + t
+; element. Here vp8_extra_bit_struct == 16
+add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
ldr r4, [r12, #vp8_extra_bit_struct_base_val]
cmp r4, #0
@@ -204,7 +205,7 @@ token_count_lt_zero
; if( b->base_val)
ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
-ldr lr, [r1, #tokenextra_extra] ; e = p->Extra
+ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
cmp r8, #0 ; if( L)
beq no_extra_bits


@@ -0,0 +1,147 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance16x16_armv6|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_armv6| PROC
stmfd sp!, {r4-r12, lr}
mov r12, #16 ; set loop counter to 16 (=block height)
mov r8, #0 ; initialize sum = 0
mov r11, #0 ; initialize sse = 0
loop
; 1st 4 pixels
ldr r4, [r0, #0x0] ; load 4 src pixels
ldr r5, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #0x4] ; load 4 src pixels
ldr r5, [r2, #0x4] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #0x8] ; load 4 src pixels
ldr r5, [r2, #0x8] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #0xc] ; load 4 src pixels
ldr r5, [r2, #0xc] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop
; return stuff
ldr r6, [sp, #0x28] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
END
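The value returned at the end of the routine is the usual block variance: SSE minus the squared sum divided by the number of pixels (256 for a 16x16 block, hence the arithmetic shift by 8). A plain C reference model of the same computation, useful for checking the assembly against scalar results (a sketch, not the library's own C path):

#include <stdint.h>

static unsigned int variance16x16_ref(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse)
{
    int r, c;
    int32_t  sum = 0;    /* sum of signed differences, |sum| <= 16*16*255  */
    uint32_t sq  = 0;    /* sum of squared differences, <= 16*16*255*255   */

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            int d = src[c] - ref[c];
            sum += d;
            sq  += (uint32_t)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
    }

    *sse = sq;
    /* widen before squaring so the product cannot overflow before the >> 8 */
    return sq - (unsigned int)(((int64_t)sum * sum) >> 8);
}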


@@ -112,10 +112,7 @@
ENDP
;-----------------
AREA fastfdct_dat, DATA, READONLY
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_ffdct_coeff_
DCD ffdct_coeff
ffdct_coeff


@@ -165,10 +165,7 @@
ENDP
;-----------------
AREA fastfdct8x4_dat, DATA, READONLY
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_ffdct8_coeff_
DCD ffdct8_coeff
ffdct8_coeff


@@ -122,10 +122,7 @@
ENDP
;-----------------
AREA dct4x4_dat, DATA, READONLY
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_dct_matrix_
DCD dct_matrix
dct_matrix


@@ -9,7 +9,7 @@
;
-EXPORT |vp8_sub_pixel_variance16x16_neon|
+EXPORT |vp8_sub_pixel_variance16x16_neon_func|
ARM
REQUIRE8
PRESERVE8
@@ -24,7 +24,7 @@
; stack(r6) unsigned int *sse
;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
-|vp8_sub_pixel_variance16x16_neon| PROC
+|vp8_sub_pixel_variance16x16_neon_func| PROC
push {r4-r6, lr}
ldr r12, _BilinearTaps_coeff_
@@ -416,10 +416,7 @@ sub_pixel_variance16x16_neon_loop
ENDP
;-----------------
AREA vp8e_bilinear_taps_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_BilinearTaps_coeff_
DCD bilinear_taps_coeff
bilinear_taps_coeff


@@ -215,10 +215,7 @@ sub_pixel_variance8x8_neon_loop
ENDP
;-----------------
AREA bilinear_taps_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_BilinearTaps_coeff_
DCD bilinear_taps_coeff
bilinear_taps_coeff


@@ -29,7 +29,7 @@ extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, shor
void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
{
-d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant);
+d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast);
}
/*


@@ -0,0 +1,71 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "variance.h"
#include "filter.h"
#include "arm/bilinearfilter_arm.h"
#if HAVE_ARMV6
unsigned int vp8_sub_pixel_variance16x16_armv6
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short first_pass[36*16];
unsigned char second_pass[20*16];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
src_pixels_per_line,
17, 16, HFilter);
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
16, 16, 16, VFilter);
return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
}
#endif
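The ARMv6 wrapper above follows the usual separable scheme: a horizontal bilinear pass over 17 source rows (one extra row so every output row of the vertical pass has a neighbour below it), a vertical pass down to 16x16, then the whole-pel variance. A generic sketch of the two passes with simple 1/8-step bilinear weights (illustrative coefficients, not the project's filter tables):

/* xoff/yoff are eighth-pel offsets in 0..7; out receives a 16x16 block. */
static void bilinear16x16_sketch(const unsigned char *src, int stride,
                                 int xoff, int yoff, unsigned char *out)
{
    unsigned short tmp[17 * 16];   /* 17 filtered rows feed the vertical pass */
    int r, c;

    for (r = 0; r < 17; r++)
        for (c = 0; c < 16; c++)
            tmp[r * 16 + c] = (unsigned short)
                (((8 - xoff) * src[r * stride + c] +
                  xoff * src[r * stride + c + 1] + 4) >> 3);

    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
            out[r * 16 + c] = (unsigned char)
                (((8 - yoff) * tmp[r * 16 + c] +
                  yoff * tmp[(r + 1) * 16 + c] + 4) >> 3);
}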
#if HAVE_ARMV7
unsigned int vp8_sub_pixel_variance16x16_neon
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
if (xoffset == 4 && yoffset == 0)
return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
else if (xoffset == 0 && yoffset == 4)
return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
else if (xoffset == 4 && yoffset == 4)
return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
else
return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
}
#endif


@@ -12,6 +12,23 @@
#ifndef VARIANCE_ARM_H
#define VARIANCE_ARM_H
#if HAVE_ARMV6
extern prototype_variance(vp8_variance16x16_armv6);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
#undef vp8_variance_var16x16
#define vp8_variance_var16x16 vp8_variance16x16_armv6
#endif /* !CONFIG_RUNTIME_CPU_DETECT */
#endif /* HAVE_ARMV6 */
#if HAVE_ARMV7
extern prototype_sad(vp8_sad4x4_neon);
extern prototype_sad(vp8_sad8x8_neon);
@@ -30,6 +47,7 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_neon);
//extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c);
//extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon_func);
extern prototype_variance(vp8_variance_halfpixvar16x16_h_neon);
extern prototype_variance(vp8_variance_halfpixvar16x16_v_neon);
extern prototype_variance(vp8_variance_halfpixvar16x16_hv_neon);
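As elsewhere in the codebase, the header declares the optimized routines through prototype_* macros and, when runtime CPU detection is compiled out, rebinds the generic hooks to the ARMv6 symbols with #undef/#define so calls resolve at compile time instead of through the RTCD table. A stripped-down sketch of that mechanism (macro names here are hypothetical; the argument list follows the register comments in the assembly above):

/* Hypothetical illustration of the compile-time rebinding pattern. */
#define sketch_prototype_variance(sym)                                   \
    unsigned int (sym)(const unsigned char *src_ptr, int source_stride,  \
                       const unsigned char *ref_ptr, int recon_stride,   \
                       unsigned int *sse)

sketch_prototype_variance(vp8_variance16x16_c);
sketch_prototype_variance(vp8_variance16x16_armv6);

#define SKETCH_VAR16X16 vp8_variance16x16_c        /* default binding */

#if HAVE_ARMV6 && !CONFIG_RUNTIME_CPU_DETECT
#undef  SKETCH_VAR16X16
#define SKETCH_VAR16X16 vp8_variance16x16_armv6    /* static override */
#endif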


@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -12,9 +12,9 @@
#include "vpx_ports/config.h"
#include <stddef.h>
-#include "../treewriter.h"
-#include "../tokenize.h"
-#include "../onyx_int.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "onyx_int.h"
#define ct_assert(name,cond) \
static void assert_##name(void) UNUSED;\
@@ -31,6 +31,7 @@
* {
*/
//pack tokens
DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue));
DEFINE(vp8_writer_range, offsetof(vp8_writer, range));
DEFINE(vp8_writer_value, offsetof(vp8_writer, value));
@@ -51,7 +52,6 @@ DEFINE(vp8_token_len, offsetof(vp8_token, Len));
DEFINE(vp8_extra_bit_struct_tree, offsetof(vp8_extra_bit_struct, tree));
DEFINE(vp8_extra_bit_struct_prob, offsetof(vp8_extra_bit_struct, prob));
DEFINE(vp8_extra_bit_struct_prob_bc, offsetof(vp8_extra_bit_struct, prob_bc));
DEFINE(vp8_extra_bit_struct_len, offsetof(vp8_extra_bit_struct, Len));
DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, base_val));
@@ -65,10 +65,12 @@ DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows));
-// These two sizes are used in vp7cx_pack_tokens. They are hard coded
+// These two sizes are used in vp8cx_pack_tokens. They are hard coded
// so if the size changes this will have to be adjusted.
-ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 20)
-ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 20)
+#if HAVE_ARMV5TE
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
+ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
+#endif
//add asserts for any offset that is not supported by assembly code
//add asserts for any size that is not supported by assembly code
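The tighter asserts are what the ARMv5 token packers above rely on: with an 8-byte TOKENEXTRA the stop pointer becomes p + (count << 3) and the fields can be fetched with ldrb/ldrsh, and with a 16-byte vp8_extra_bit_struct the table index becomes a single lsl #4. A hedged sketch of layouts that would produce those sizes on a 32-bit build (an assumption for illustration, not necessarily the project's exact declarations):

/* Assumed ILP32 layouts, for illustration only. */
typedef struct
{
    const unsigned char *context_tree;   /* 4 bytes (vp8_prob pointer)  */
    short                Extra;          /* 2 bytes, read with ldrsh    */
    unsigned char        Token;          /* 1 byte,  read with ldrb     */
    unsigned char        skip_eob_node;  /* 1 byte,  read with ldrb     */
} tokenextra_sketch;                     /* sizeof == 8  -> lsl #3      */

typedef struct
{
    const int           *tree;           /* 4 bytes                     */
    const unsigned char *prob;           /* 4 bytes                     */
    int                  Len;            /* 4 bytes                     */
    int                  base_val;       /* 4 bytes                     */
} extra_bit_struct_sketch;               /* sizeof == 16 -> lsl #4      */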


@@ -1654,10 +1654,12 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
{
vp8_start_encode(&cpi->bc2, cx_data + bc->pos);
-if (!cpi->b_multi_threaded)
-pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
-else
+#if CONFIG_MULTITHREAD
+if (cpi->b_multi_threaded)
pack_mb_row_tokens(cpi, &cpi->bc2);
+else
+#endif
+pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
vp8_stop_encode(&cpi->bc2);
oh.first_partition_length_in_bytes = cpi->bc.pos ;


@@ -33,6 +33,7 @@ typedef struct
// 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
short *quant;
short *quant_fast;
short *quant_shift;
short *zbin;
short *zrun_zbin_boost;
@@ -81,6 +82,7 @@ typedef struct
int errthresh;
int rddiv;
int rdmult;
INT64 activity_sum;
int mvcosts[2][MVvals+1];
int *mvcost[2];
@@ -110,6 +112,7 @@ typedef struct
unsigned int token_costs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];
int optimize;
int q_index;
void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);


@@ -62,7 +62,6 @@ unsigned int b_modes[14] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const int qrounding_factors[129] =
{
56, 56, 56, 56, 48, 48, 56, 56,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
@@ -78,12 +77,18 @@ static const int qrounding_factors[129] =
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48,
48
};
static const int qzbin_factors[129] =
{
-72, 72, 72, 72, 80, 80, 72, 72,
+84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
@@ -94,17 +99,11 @@ static const int qzbin_factors[129] =
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80,
};
static const int qrounding_factors_y2[129] =
{
56, 56, 56, 56, 48, 48, 56, 56,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
@@ -120,12 +119,18 @@ static const int qrounding_factors_y2[129] =
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48,
48
};
static const int qzbin_factors_y2[129] =
{
-72, 72, 72, 72, 80, 80, 72, 72,
+84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
@@ -136,17 +141,15 @@ static const int qzbin_factors_y2[129] =
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80,
};
-//#define EXACT_QUANT
+#define EXACT_QUANT
#ifdef EXACT_QUANT
-static void vp8cx_invert_quant(short *quant, short *shift, short d)
+static void vp8cx_invert_quant(int improved_quant, short *quant,
+short *shift, short d)
+{
+if(improved_quant)
{
unsigned t;
int l;
@@ -157,6 +160,12 @@ static void vp8cx_invert_quant(short *quant, short *shift, short d)
*quant = (short)(t - (1<<16));
*shift = l;
}
else
{
*quant = (1 << 16) / d;
*shift = 0;
}
}
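The two branches above are two grades of reciprocal quantization. The fast path keeps a plain 16-bit reciprocal, (1 << 16) / d, so dividing by the quantizer step becomes a multiply and a shift but can undershoot the true quotient slightly; the improved path derives a multiplier/shift pair intended to track integer division much more closely over the coefficient range. A standalone illustration of the difference (toy code, not the encoder's quantizer):

#include <stdio.h>

int main(void)
{
    const int d = 84;                        /* example quantizer step value */
    const unsigned recip = (1u << 16) / d;   /* what the fast path stores    */
    int x, worst = 0;

    for (x = 0; x < 4096; x++)
    {
        int fast  = (int)(((unsigned)x * recip) >> 16);  /* multiply + shift */
        int exact = x / d;
        if (exact - fast > worst)
            worst = exact - fast;
    }

    /* The truncated reciprocal may undershoot; the improved_quant path picks
     * quant/shift so the multiply-shift result stays on x / d. */
    printf("d=%d: max undershoot of the fast path for x < 4096 is %d\n", d, worst);
    return 0;
}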
void vp8cx_init_quantizer(VP8_COMP *cpi)
{
@@ -170,7 +179,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
{ {
// dc values // dc values
quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q); quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
vp8cx_invert_quant(cpi->Y1quant[Q] + 0, cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,
cpi->Y1quant_shift[Q] + 0, quant_val); cpi->Y1quant_shift[Q] + 0, quant_val);
cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7; cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -178,7 +188,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7; cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q); quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
vp8cx_invert_quant(cpi->Y2quant[Q] + 0, cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0,
cpi->Y2quant_shift[Q] + 0, quant_val); cpi->Y2quant_shift[Q] + 0, quant_val);
cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7; cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7; cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
@@ -186,7 +197,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7; cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q); quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
vp8cx_invert_quant(cpi->UVquant[Q] + 0, cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0,
cpi->UVquant_shift[Q] + 0, quant_val); cpi->UVquant_shift[Q] + 0, quant_val);
cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;; cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7; cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -199,7 +211,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
int rc = vp8_default_zig_zag1d[i]; int rc = vp8_default_zig_zag1d[i];
quant_val = vp8_ac_yquant(Q); quant_val = vp8_ac_yquant(Q);
vp8cx_invert_quant(cpi->Y1quant[Q] + rc, cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc,
cpi->Y1quant_shift[Q] + rc, quant_val); cpi->Y1quant_shift[Q] + rc, quant_val);
cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7; cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -207,7 +220,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7; cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q); quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
vp8cx_invert_quant(cpi->Y2quant[Q] + rc, cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
cpi->Y2quant_shift[Q] + rc, quant_val); cpi->Y2quant_shift[Q] + rc, quant_val);
cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7; cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7; cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
@@ -215,7 +229,8 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7; cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q); quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
vp8cx_invert_quant(cpi->UVquant[Q] + rc, cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;
vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
cpi->UVquant_shift[Q] + rc, quant_val); cpi->UVquant_shift[Q] + rc, quant_val);
cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7; cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
@@ -316,6 +331,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
    for (i = 0; i < 16; i++)
    {
        x->block[i].quant = cpi->Y1quant[QIndex];
+       x->block[i].quant_fast = cpi->Y1quant_fast[QIndex];
        x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
        x->block[i].zbin = cpi->Y1zbin[QIndex];
        x->block[i].round = cpi->Y1round[QIndex];
@@ -330,6 +346,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
    for (i = 16; i < 24; i++)
    {
        x->block[i].quant = cpi->UVquant[QIndex];
+       x->block[i].quant_fast = cpi->UVquant_fast[QIndex];
        x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
        x->block[i].zbin = cpi->UVzbin[QIndex];
        x->block[i].round = cpi->UVround[QIndex];
@@ -340,6 +357,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
    // Y2
    zbin_extra = (cpi->common.Y2dequant[QIndex][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
+   x->block[24].quant_fast = cpi->Y2quant_fast[QIndex];
    x->block[24].quant = cpi->Y2quant[QIndex];
    x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
    x->block[24].zbin = cpi->Y2zbin[QIndex];
@@ -347,22 +365,100 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
    x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
    x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
    x->block[24].zbin_extra = (short)zbin_extra;
/* save this macroblock QIndex for vp8_update_zbin_extra() */
x->q_index = QIndex;
}
void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x)
{
int i;
int QIndex = x->q_index;
int zbin_extra;
// Y
zbin_extra = (cpi->common.Y1dequant[QIndex][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
for (i = 0; i < 16; i++)
{
x->block[i].zbin_extra = (short)zbin_extra;
}
// UV
zbin_extra = (cpi->common.UVdequant[QIndex][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
for (i = 16; i < 24; i++)
{
x->block[i].zbin_extra = (short)zbin_extra;
}
// Y2
zbin_extra = (cpi->common.Y2dequant[QIndex][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
x->block[24].zbin_extra = (short)zbin_extra;
}
void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
{
-   // vp8cx_init_quantizer() is first called in vp8_create_compressor(). A check is added here so that vp8cx_init_quantizer() is only called
-   // when these values are not all zero.
-   if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q)
-   {
-       vp8cx_init_quantizer(cpi);
-   }
+   // Clear Zbin mode boost for default case
+   cpi->zbin_mode_boost = 0;
    // MB level quantizer setup
    vp8cx_mb_init_quantizer(cpi, &cpi->mb);
}
/* activity_avg must be positive, or flat regions could get a zero weight
* (infinite lambda), which confounds analysis.
* This also avoids the need for divide by zero checks in
* vp8_activity_masking().
*/
#define VP8_ACTIVITY_AVG_MIN (64)
/* This is used as a reference when computing the source variance for the
* purposes of activity masking.
* Eventually this should be replaced by custom no-reference routines,
* which will be faster.
*/
static const unsigned char VP8_VAR_OFFS[16]=
{
128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
};
unsigned int vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
{
unsigned int act;
unsigned int sse;
int sum;
unsigned int a;
unsigned int b;
/* TODO: This could also be done over smaller areas (8x8), but that would
* require extensive changes elsewhere, as lambda is assumed to be fixed
* over an entire MB in most of the code.
* Another option is to compute four 8x8 variances, and pick a single
* lambda using a non-linear combination (e.g., the smallest, or second
* smallest, etc.).
*/
VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer,
x->src.y_stride, VP8_VAR_OFFS, 0, &sse, &sum);
/* This requires a full 32 bits of precision. */
act = (sse<<8) - sum*sum;
/* Drop 4 to give us some headroom to work with. */
act = (act + 8) >> 4;
/* If the region is flat, lower the activity some more. */
if (act < 8<<12)
act = act < 5<<12 ? act : 5<<12;
/* TODO: For non-flat regions, edge regions should receive less masking
* than textured regions, but identifying edge regions quickly and
* reliably enough is still a subject of experimentation.
* This will be most noticable near edges with a complex shape (e.g.,
* text), but the 4x4 transform size should make this less of a problem
* than it would be for an 8x8 transform.
*/
/* Apply the masking to the RD multiplier. */
a = act + 4*cpi->activity_avg;
b = 4*act + cpi->activity_avg;
x->rdmult = (unsigned int)(((INT64)x->rdmult*b + (a>>1))/a);
return act;
}
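A small aside (not taken from the diff): with a = act + 4*avg and b = 4*act + avg as in the function above, the factor applied to rdmult is b/a, which is 1 when a block's activity matches the running average and tends toward 1/4 or 4 at the extremes. A quick illustration with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        const double avg = 64 << 12;            /* hypothetical activity_avg */
        const double acts[3] = { avg / 16, avg, avg * 16 };
        int i;
        for (i = 0; i < 3; i++)
        {
            double a = acts[i] + 4 * avg;       /* same weighting as above */
            double b = 4 * acts[i] + avg;
            printf("act = %9.0f  rdmult scale = %.2f\n", acts[i], b / a);
        }
        return 0;                               /* roughly 0.31, 1.00, 3.25 */
    }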
static
void encode_mb_row(VP8_COMP *cpi,
@@ -374,6 +470,7 @@ void encode_mb_row(VP8_COMP *cpi,
                   int *segment_counts,
                   int *totalrate)
{
+   INT64 activity_sum = 0;
    int i;
    int recon_yoffset, recon_uvoffset;
    int mb_col;
@@ -383,6 +480,16 @@ void encode_mb_row(VP8_COMP *cpi,
    int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
    int seg_map_index = (mb_row * cpi->common.mb_cols);
#if CONFIG_MULTITHREAD
const int nsync = cpi->mt_sync_range;
const int rightmost_col = cm->mb_cols - 1;
volatile const int *last_row_current_mb_col;
if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
else
last_row_current_mb_col = &rightmost_col;
#endif
    // reset above block coeffs
    xd->above_context = cm->above_context;
@@ -425,6 +532,27 @@ void encode_mb_row(VP8_COMP *cpi,
        xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
        xd->left_available = (mb_col != 0);
x->rddiv = cpi->RDDIV;
x->rdmult = cpi->RDMULT;
#if CONFIG_MULTITHREAD
if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
{
if ((mb_col & (nsync - 1)) == 0)
{
while (mb_col > (*last_row_current_mb_col - nsync)
&& (*last_row_current_mb_col) != (cm->mb_cols - 1))
{
x86_pause_hint();
thread_sleep(0);
}
}
}
#endif
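For what it is worth, the wait loop above amounts to a simple row-lag rule: a macroblock at column mb_col may be encoded only once the row above has progressed at least nsync columns past it, or has finished its row, so the data the current macroblock depends on is already written. A hedged restatement with illustrative names and values:

    #include <stdio.h>

    /* Sketch only; in the real code nsync is cpi->mt_sync_range and the
     * progress counter lives in cpi->mt_current_mb_col[mb_row - 1]. */
    static int row_above_far_enough(int mb_col, int above_col, int nsync, int last_col)
    {
        return (above_col - mb_col >= nsync) || (above_col == last_col);
    }

    int main(void)
    {
        /* With nsync = 4 and 40 columns: column 10 may start once the row
         * above has reached column 14, or the end of its row. */
        printf("%d %d\n", row_above_far_enough(10, 12, 4, 39),
                          row_above_far_enough(10, 14, 4, 39));   /* 0 1 */
        return 0;
    }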
if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
activity_sum += vp8_activity_masking(cpi, x);
        // Is segmentation enabled
        // MB level adjutment to quantizer
        if (xd->segmentation_enabled)
@@ -518,7 +646,12 @@ void encode_mb_row(VP8_COMP *cpi,
        x->partition_info++;
        xd->above_context++;
-       cpi->current_mb_col_main = mb_col;
+#if CONFIG_MULTITHREAD
+       if (cpi->b_multi_threaded != 0)
+       {
+           cpi->mt_current_mb_col[mb_row] = mb_col;
+       }
+#endif
    }
    //extend the recon for intra prediction
@@ -531,11 +664,15 @@ void encode_mb_row(VP8_COMP *cpi,
    // this is to account for the border
    xd->mode_info_context++;
    x->partition_info++;
x->activity_sum += activity_sum;
#if CONFIG_MULTITHREAD
if ((cpi->b_multi_threaded != 0) && (mb_row == cm->mb_rows - 1))
{
sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
}
#endif
}

void vp8_encode_frame(VP8_COMP *cpi)
{
@@ -544,7 +681,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
    VP8_COMMON *const cm = & cpi->common;
    MACROBLOCKD *const xd = & x->e_mbd;
-   int i;
    TOKENEXTRA *tp = cpi->tok;
    int segment_counts[MAX_MB_SEGMENTS];
    int totalrate;
@@ -627,9 +763,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
    }

    vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
-   //vp8_initialize_rd_consts( cpi, vp8_dc_quant(cpi->avg_frame_qindex, cm->y1dc_delta_q) );
    vp8cx_initialize_me_consts(cpi, cm->base_qindex);
-   //vp8cx_initialize_me_consts( cpi, cpi->avg_frame_qindex);

    // Copy data over into macro block data sturctures.
@@ -647,22 +781,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
    vp8_setup_block_ptrs(x);

-   x->rddiv = cpi->RDDIV;
-   x->rdmult = cpi->RDMULT;
-#if 0
-   // Experimental rd code
-   // 2 Pass - Possibly set Rdmult based on last frame distortion + this frame target bits or other metrics
-   // such as cpi->rate_correction_factor that indicate relative complexity.
-   /*if ( cpi->pass == 2 && (cpi->last_frame_distortion > 0) && (cpi->target_bits_per_mb > 0) )
-   {
-       //x->rdmult = ((cpi->last_frame_distortion * 256)/cpi->common.MBs)/ cpi->target_bits_per_mb;
-       x->rdmult = (int)(cpi->RDMULT * cpi->rate_correction_factor);
-   }
-   else
-       x->rdmult = cpi->RDMULT; */
-   //x->rdmult = (int)(cpi->RDMULT * pow( (cpi->rate_correction_factor * 2.0), 0.75 ));
-#endif
+   x->activity_sum = 0;
    xd->mode_info_context->mbmi.mode = DC_PRED;
    xd->mode_info_context->mbmi.uv_mode = DC_PRED;
@@ -681,7 +800,76 @@ void vp8_encode_frame(VP8_COMP *cpi)
        struct vpx_usec_timer emr_timer;
        vpx_usec_timer_start(&emr_timer);

-       if (!cpi->b_multi_threaded)
+#if CONFIG_MULTITHREAD
if (cpi->b_multi_threaded)
{
int i;
vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1, cpi->encoding_thread_count);
for (i = 0; i < cm->mb_rows; i++)
cpi->mt_current_mb_col[i] = 0;
for (i = 0; i < cpi->encoding_thread_count; i++)
{
sem_post(&cpi->h_event_start_encoding[i]);
}
for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
{
vp8_zero(cm->left_context)
tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
// adjust to the next row of mbs
x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
}
sem_wait(&cpi->h_event_end_encoding); /* wait for other threads to finish */
cpi->tok_count = 0;
for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
{
cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
}
if (xd->segmentation_enabled)
{
int i, j;
if (xd->segmentation_enabled)
{
for (i = 0; i < cpi->encoding_thread_count; i++)
{
for (j = 0; j < 4; j++)
segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
}
}
}
for (i = 0; i < cpi->encoding_thread_count; i++)
{
totalrate += cpi->mb_row_ei[i].totalrate;
}
for (i = 0; i < cpi->encoding_thread_count; i++)
{
x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;
}
}
else
#endif
        {
            // for each macroblock row in image
            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
@@ -699,94 +887,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
            cpi->tok_count = tp - cpi->tok;
}
else
{
#if CONFIG_MULTITHREAD
vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1, cpi->encoding_thread_count);
for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
{
int i;
cpi->current_mb_col_main = -1;
for (i = 0; i < cpi->encoding_thread_count; i++)
{
if ((mb_row + i + 1) >= cm->mb_rows)
break;
cpi->mb_row_ei[i].mb_row = mb_row + i + 1;
cpi->mb_row_ei[i].tp = cpi->tok + (mb_row + i + 1) * (cm->mb_cols * 16 * 24);
cpi->mb_row_ei[i].current_mb_col = -1;
//SetEvent(cpi->h_event_mbrencoding[i]);
sem_post(&cpi->h_event_mbrencoding[i]);
}
vp8_zero(cm->left_context)
tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
// adjust to the next row of mbs
x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
if (mb_row < cm->mb_rows - 1)
//WaitForSingleObject(cpi->h_event_main, INFINITE);
sem_wait(&cpi->h_event_main);
}
/*
for( ;mb_row<cm->mb_rows; mb_row ++)
{
vp8_zero( cm->left_context)
tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
// adjust to the next row of mbs
x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
}
*/
cpi->tok_count = 0;
for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
{
cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
}
if (xd->segmentation_enabled)
{
int i, j;
if (xd->segmentation_enabled)
{
for (i = 0; i < cpi->encoding_thread_count; i++)
{
for (j = 0; j < 4; j++)
segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
}
}
}
for (i = 0; i < cpi->encoding_thread_count; i++)
{
totalrate += cpi->mb_row_ei[i].totalrate;
}
#endif
        }

        vpx_usec_timer_mark(&emr_timer);
@@ -920,6 +1020,14 @@ void vp8_encode_frame(VP8_COMP *cpi)
    cpi->last_frame_distortion = cpi->frame_distortion;
#endif
/* Update the average activity for the next frame.
* This is feed-forward for now; it could also be saved in two-pass, or
* done during lookahead when that is eventually added.
*/
cpi->activity_avg = (unsigned int )(x->activity_sum/cpi->common.MBs);
if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN)
cpi->activity_avg = VP8_ACTIVITY_AVG_MIN;
}
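One more aside on the threaded path above (a sketch, not taken from the diff): given the stride mb_row += encoding_thread_count + 1 in vp8_encode_frame() and the worker loop starting at mb_row = ithread + 1 further down, macroblock rows are interleaved across the main thread and the workers roughly as follows.

    #include <stdio.h>

    int main(void)
    {
        const int T = 3;                        /* illustrative worker count */
        const int mb_rows = 10;                 /* illustrative frame height */
        int r;
        for (r = 0; r < mb_rows; r++)
        {
            int slot = r % (T + 1);
            if (slot == 0)
                printf("row %2d -> main thread\n", r);
            else
                printf("row %2d -> worker %d\n", r, slot - 1);
        }
        return 0;
    }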
void vp8_setup_block_ptrs(MACROBLOCK *x)
{
@@ -1040,77 +1148,41 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
{
    int Error4x4, Error16x16, error_uv;
-   B_PREDICTION_MODE intra_bmodes[16];
    int rate4x4, rate16x16, rateuv;
    int dist4x4, dist16x16, distuv;
    int rate = 0;
    int rate4x4_tokenonly = 0;
    int rate16x16_tokenonly = 0;
    int rateuv_tokenonly = 0;
-   int i;

    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
#if !(CONFIG_REALTIME_ONLY)
-   if (cpi->sf.RD || cpi->compressor_speed != 2)
+   if (cpi->sf.RD && cpi->compressor_speed != 2)
    {
-       Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4);
+       error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+       rate += rateuv;
-       //save the b modes for possible later use
-       for (i = 0; i < 16; i++)
-           intra_bmodes[i] = x->e_mbd.block[i].bmi.mode;
        Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16);
-       error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+       Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4, Error16x16);
-       vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+       rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16;
-       rate += rateuv;
-       if (Error4x4 < Error16x16)
-       {
-           rate += rate4x4;
-           x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
-           // get back the intra block modes
-           for (i = 0; i < 16; i++)
-               x->e_mbd.block[i].bmi.mode = intra_bmodes[i];
-           vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-           cpi->prediction_error += Error4x4 ;
-#if 0
-           // Experimental RD code
-           cpi->frame_distortion += dist4x4;
-#endif
-       }
-       else
-       {
-           vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-           rate += rate16x16;
-#if 0
-           // Experimental RD code
-           cpi->prediction_error += Error16x16;
-           cpi->frame_distortion += dist16x16;
-#endif
-       }
-       sum_intra_stats(cpi, x);
-       vp8_tokenize_mb(cpi, &x->e_mbd, t);
    }
    else
#endif
    {
-       int rate2, distortion2;
+       int rate2, best_distortion;
        MB_PREDICTION_MODE mode, best_mode = DC_PRED;
        int this_rd;
        Error16x16 = INT_MAX;
+       vp8_pick_intra_mbuv_mode(x);
        for (mode = DC_PRED; mode <= TM_PRED; mode ++)
        {
+           int distortion2;
            x->e_mbd.mode_info_context->mbmi.mode = mode;
            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
@@ -1121,34 +1193,27 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
            {
                Error16x16 = this_rd;
                best_mode = mode;
+               best_distortion = distortion2;
            }
        }
+       x->e_mbd.mode_info_context->mbmi.mode = best_mode;
-       vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &distortion2);
+       Error4x4 = vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &best_distortion);
+   }
-       if (distortion2 == INT_MAX)
-           Error4x4 = INT_MAX;
-       else
-           Error4x4 = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
    if (Error4x4 < Error16x16)
    {
        x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
        vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-       cpi->prediction_error += Error4x4;
    }
    else
    {
-       x->e_mbd.mode_info_context->mbmi.mode = best_mode;
        vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-       cpi->prediction_error += Error16x16;
    }
-   vp8_pick_intra_mbuv_mode(x);
    vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
    sum_intra_stats(cpi, x);
    vp8_tokenize_mb(cpi, &x->e_mbd, t);
-   }
    return rate;
}
@@ -1181,7 +1246,28 @@ int vp8cx_encode_inter_macroblock
    if (cpi->sf.RD)
    {
int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
/* Are we using the fast quantizer for the mode selection? */
if(cpi->sf.use_fastquant_for_pick)
{
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
/* the fast quantizer does not use zbin_extra, so
* do not recalculate */
cpi->zbin_mode_boost_enabled = 0;
}
        inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
/* switch back to the regular quantizer for the encode */
if (cpi->sf.improved_quant)
{
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
}
/* restore cpi->zbin_mode_boost_enabled */
cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
    }
    else
#endif
@@ -1198,7 +1284,7 @@ int vp8cx_encode_inter_macroblock
#endif

    // MB level adjutment to quantizer setup
-   if (xd->segmentation_enabled || cpi->zbin_mode_boost_enabled)
+   if (xd->segmentation_enabled)
    {
        // If cyclic update enabled
        if (cpi->cyclic_refresh_mode_enabled)
@@ -1208,19 +1294,38 @@ int vp8cx_encode_inter_macroblock
                ((xd->mode_info_context->mbmi.ref_frame != LAST_FRAME) || (xd->mode_info_context->mbmi.mode != ZEROMV)))
            {
                xd->mode_info_context->mbmi.segment_id = 0;
+               /* segment_id changed, so update */
+               vp8cx_mb_init_quantizer(cpi, x);
+           }
        }
    }
+   {
        // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to supress noise
        if (cpi->zbin_mode_boost_enabled)
        {
-           if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME))
+           if ( xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME )
+               cpi->zbin_mode_boost = 0;
+           else
+           {
+               if (xd->mode_info_context->mbmi.mode == ZEROMV)
+               {
+                   if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
                        cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
                    else
-                       cpi->zbin_mode_boost = 0;
+                       cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+               }
+               else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+                   cpi->zbin_mode_boost = 0;
+               else
+                   cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+           }
        }
+       else
+           cpi->zbin_mode_boost = 0;
-       vp8cx_mb_init_quantizer(cpi, x);
+       vp8_update_zbin_extra(cpi, x);
    }

    cpi->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
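Read flat, the boost ladder above can be restated as a small helper (a sketch, not part of the change; the frame and mode enums below are stand-ins for the encoder's own definitions, and the returned strings name the real constants rather than guessing their values):

    #include <stdio.h>

    enum { INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME };   /* stand-ins */
    enum { ZEROMV, NEWMV, SPLITMV };                                 /* stand-ins */

    static const char *pick_zbin_mode_boost(int ref_frame, int mode)
    {
        if (ref_frame == INTRA_FRAME)  return "0";
        if (mode == ZEROMV)
            return (ref_frame != LAST_FRAME) ? "GF_ZEROMV_ZBIN_BOOST"
                                             : "LF_ZEROMV_ZBIN_BOOST";
        if (mode == SPLITMV)           return "0";
        return "MV_ZBIN_BOOST";
    }

    int main(void)
    {
        printf("%s / %s / %s\n",
               pick_zbin_mode_boost(GOLDEN_FRAME, ZEROMV),
               pick_zbin_mode_boost(LAST_FRAME, ZEROMV),
               pick_zbin_mode_boost(LAST_FRAME, NEWMV));
        return 0;
    }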

---- next file ----
@@ -58,21 +58,6 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK
    RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
}
void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
{
vp8_predict_intra4x4(b, best_mode, b->predictor);
ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
x->quantize_b(be, b);
IDCT_INVOKE(&rtcd->common->idct, idct16)(b->dqcoeff, b->diff, 32);
RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
}
void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
{
    int i;
@@ -105,7 +90,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
#if !(CONFIG_REALTIME_ONLY)
#if 1
-   if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
+   if (x->optimize)
        vp8_optimize_mby(x, rtcd);
#endif
@@ -144,51 +129,6 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    }
}
void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
int b;
vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
vp8_transform_intra_mby(x);
vp8_quantize_mby(x);
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
RECON_INVOKE(&rtcd->common->recon, recon_mby)
(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
// make sure block modes are set the way we want them for context updates
for (b = 0; b < 16; b++)
{
BLOCKD *d = &x->e_mbd.block[b];
switch (x->e_mbd.mode_info_context->mbmi.mode)
{
case DC_PRED:
d->bmi.mode = B_DC_PRED;
break;
case V_PRED:
d->bmi.mode = B_VE_PRED;
break;
case H_PRED:
d->bmi.mode = B_HE_PRED;
break;
case TM_PRED:
d->bmi.mode = B_TM_PRED;
break;
default:
d->bmi.mode = B_DC_PRED;
break;
}
}
}
void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
    vp8_build_intra_predictors_mbuv(&x->e_mbd);
@@ -213,17 +153,3 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}
void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
vp8_build_intra_predictors_mbuv(&x->e_mbd);
ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
vp8_transform_mbuv(x);
vp8_quantize_mbuv(x);
vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}

---- next file ----
@@ -19,7 +19,5 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb);
void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode);
void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
-void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
-void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
#endif

---- next file ----
@@ -243,9 +243,9 @@ struct vp8_token_state{
};

// TODO: experiments to find optimal multiple numbers
-#define Y1_RD_MULT 1
-#define UV_RD_MULT 1
-#define Y2_RD_MULT 4
+#define Y1_RD_MULT 4
+#define UV_RD_MULT 2
+#define Y2_RD_MULT 16

static const int plane_rd_mult[4]=
{
@@ -273,7 +273,6 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
    int x;
    int sz;
    int next;
-   int path;
    int rdmult;
    int rddiv;
    int final_eob;
@@ -309,8 +308,10 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
    eob = d->eob;

    /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-   /* TODO: These should vary with the block type, since the quantizer does. */
-   rdmult = (mb->rdmult << 2)*err_mult;
+   rdmult = mb->rdmult * err_mult;
+   if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
+       rdmult = (rdmult * 9)>>4;
    rddiv = mb->rddiv;
    best_mask[0] = best_mask[1] = 0;
    /* Initialize the sentinel node of the trellis. */
@@ -633,7 +634,7 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    vp8_quantize_mb(x);
#if !(CONFIG_REALTIME_ONLY)
-   if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
+   if (x->optimize)
        vp8_optimize_mb(x, rtcd);
#endif

---- next file ----
@@ -128,7 +128,7 @@ static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)
        while (--i > 3);

-       if (x & 240)
+       if (x & 0xFFF0)
            cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
    }
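A quick note on the mask change above (illustration only, not part of the diff): 240 is 0x00F0, so it tests only bits 4 to 7 of the magnitude, and the bit-3 cost guarded by this test was silently skipped for values such as 256 whose set bits all lie higher; 0xFFF0 tests every bit above bit 3.

    #include <stdio.h>

    int main(void)
    {
        unsigned x = 256;                       /* example magnitude */
        printf("old mask: %d, new mask: %d\n", (x & 240) != 0, (x & 0xFFF0) != 0);
        return 0;                               /* prints: old mask: 0, new mask: 1 */
    }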

---- next file ----
@@ -8,15 +8,18 @@
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "onyx_int.h"
#include "threading.h"
#include "common.h"
#include "extend.h"

+#if CONFIG_MULTITHREAD
-extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);
-extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+                                         TOKENEXTRA **t, int recon_yoffset,
+                                         int recon_uvoffset);
+extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x,
+                                          TOKENEXTRA **t);
extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
extern void vp8_build_block_offsets(MACROBLOCK *x);
extern void vp8_setup_block_ptrs(MACROBLOCK *x);
@@ -24,12 +27,12 @@ extern void vp8_setup_block_ptrs(MACROBLOCK *x);
static
THREAD_FUNCTION thread_encoding_proc(void *p_data)
{
-#if CONFIG_MULTITHREAD
    int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
    VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
    MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
    ENTROPY_CONTEXT_PLANES mb_row_left_context;
+   const int nsync = cpi->mt_sync_range;

    //printf("Started thread %d\n", ithread);
    while (1)
@@ -38,21 +41,23 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
            break;

        //if(WaitForSingleObject(cpi->h_event_mbrencoding[ithread], INFINITE) == WAIT_OBJECT_0)
-       if (sem_wait(&cpi->h_event_mbrencoding[ithread]) == 0)
+       if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0)
        {
-           if (cpi->b_multi_threaded == FALSE) // we're shutting down
-               break;
-           else
            {
                VP8_COMMON *cm = &cpi->common;
-               int mb_row = mbri->mb_row;
+               int mb_row;
                MACROBLOCK *x = &mbri->mb;
                MACROBLOCKD *xd = &x->e_mbd;
-               TOKENEXTRA **tp = &mbri->tp;
+               TOKENEXTRA *tp ;
                int *segment_counts = mbri->segment_counts;
                int *totalrate = &mbri->totalrate;

+               if (cpi->b_multi_threaded == FALSE) // we're shutting down
+                   break;

+               for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
                {
                    int i;
int recon_yoffset, recon_uvoffset; int recon_yoffset, recon_uvoffset;
int mb_col; int mb_col;
@@ -61,11 +66,11 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                    int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
                    int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
                    volatile int *last_row_current_mb_col;
+                   INT64 activity_sum = 0;

-                   if (ithread > 0)
-                       last_row_current_mb_col = &cpi->mb_row_ei[ithread-1].current_mb_col;
-                   else
-                       last_row_current_mb_col = &cpi->current_mb_col_main;
+                   tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24));
+                   last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];

                    // reset above block coeffs
                    xd->above_context = cm->above_context;
@@ -77,8 +82,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                    recon_yoffset = (mb_row * recon_y_stride * 16);
                    recon_uvoffset = (mb_row * recon_uv_stride * 8);

-                   cpi->tplist[mb_row].start = *tp;
+                   cpi->tplist[mb_row].start = tp;

                    //printf("Thread mb_row = %d\n", mb_row);
@@ -87,11 +91,14 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                    {
                        int seg_map_index = (mb_row * cm->mb_cols);

-                       while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != cm->mb_cols - 1)
+                       if ((mb_col & (nsync - 1)) == 0)
+                       {
+                           while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != cm->mb_cols - 1)
                            {
                                x86_pause_hint();
                                thread_sleep(0);
                            }
+                       }
// Distance of Mb to the various image edges. // Distance of Mb to the various image edges.
// These specified to 8th pel as they are always compared to values that are in 1/8th pel units // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
@@ -111,6 +118,12 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                        xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
                        xd->left_available = (mb_col != 0);

+                       x->rddiv = cpi->RDDIV;
+                       x->rdmult = cpi->RDMULT;

+                       if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+                           activity_sum += vp8_activity_masking(cpi, x);

                        // Is segmentation enabled
                        // MB level adjutment to quantizer
                        if (xd->segmentation_enabled)
@@ -126,17 +139,18 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                        else
                            xd->mode_info_context->mbmi.segment_id = 0; // Set to Segment 0 by default

+                       x->active_ptr = cpi->active_map + seg_map_index + mb_col;

                        if (cm->frame_type == KEY_FRAME)
                        {
-                           *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp);
+                           *totalrate += vp8cx_encode_intra_macro_block(cpi, x, &tp);
#ifdef MODE_STATS
                            y_modes[xd->mbmi.mode] ++;
#endif
                        }
                        else
                        {
-                           *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset);
+                           *totalrate += vp8cx_encode_inter_macroblock(cpi, x, &tp, recon_yoffset, recon_uvoffset);
#ifdef MODE_STATS
                            inter_y_modes[xd->mbmi.mode] ++;
@@ -157,9 +171,30 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                            if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
                                cpi->inter_zz_count++;
-                       }
+                           // Special case code for cyclic refresh
// If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
// during vp8cx_encode_inter_macroblock()) back into the global sgmentation map
if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
{
const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
cpi->segmentation_map[seg_map_index + mb_col] = mbmi->segment_id;
-                   cpi->tplist[mb_row].stop = *tp;
+                           // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
// Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
// else mark it as dirty (1).
if (mbmi->segment_id)
cpi->cyclic_refresh_map[seg_map_index + mb_col] = -1;
else if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
{
if (cpi->cyclic_refresh_map[seg_map_index + mb_col] == 1)
cpi->cyclic_refresh_map[seg_map_index + mb_col] = 0;
}
else
cpi->cyclic_refresh_map[seg_map_index + mb_col] = 1;
}
}
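Read as a table, the cyclic-refresh bookkeeping above works out to: -1 marks a block that was just refreshed, 0 marks a clean (LAST, zero-motion) block that becomes a candidate for the next refresh pass, and 1 marks everything else as dirty. A hedged restatement, not code from the diff, with made-up argument names:

    /* Illustrative helper only; the real update writes cpi->cyclic_refresh_map
     * in place as shown above. */
    static signed char cyclic_refresh_mark(int segment_id, int is_last_zeromv,
                                           signed char prev_mark)
    {
        if (segment_id)                     /* refreshed this frame */
            return -1;
        if (is_last_zeromv)                 /* coded (LAST, 0,0) */
            return (prev_mark == 1) ? 0 : prev_mark;
        return 1;                           /* dirty */
    }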
cpi->tplist[mb_row].stop = tp;
x->gf_active_ptr++; // Increment pointer into gf useage flags structure for next mb x->gf_active_ptr++; // Increment pointer into gf useage flags structure for next mb
@@ -180,11 +215,9 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                    // skip to next mb
                    xd->mode_info_context++;
                    x->partition_info++;
                    xd->above_context++;

-                   cpi->mb_row_ei[ithread].current_mb_col = mb_col;
+                   cpi->mt_current_mb_col[mb_row] = mb_col;
                }
//extend the recon for intra prediction //extend the recon for intra prediction
@@ -197,6 +230,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
// this is to account for the border // this is to account for the border
xd->mode_info_context++; xd->mode_info_context++;
x->partition_info++; x->partition_info++;
x->activity_sum += activity_sum;
x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols; x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols; x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
@@ -205,21 +239,14 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
                x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;

-               if (ithread == (cpi->encoding_thread_count - 1) || mb_row == cm->mb_rows - 1)
+               if (mb_row == cm->mb_rows - 1)
                {
                    //SetEvent(cpi->h_event_main);
-                   sem_post(&cpi->h_event_main);
+                   sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
                }
-           }
            }
        }
    }
#else
(void) p_data;
#endif
//printf("exit thread %d\n", ithread); //printf("exit thread %d\n", ithread);
return 0; return 0;
@@ -240,8 +267,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    z->sadperbit16 = x->sadperbit16;
    z->sadperbit4 = x->sadperbit4;
    z->errthresh = x->errthresh;
-   z->rddiv = x->rddiv;
-   z->rdmult = x->rdmult;
/* /*
z->mv_col_min = x->mv_col_min; z->mv_col_min = x->mv_col_min;
@@ -255,6 +280,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4; z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
z->short_walsh4x4 = x->short_walsh4x4; z->short_walsh4x4 = x->short_walsh4x4;
z->quantize_b = x->quantize_b; z->quantize_b = x->quantize_b;
z->optimize = x->optimize;
/* /*
z->mvc = x->mvc; z->mvc = x->mvc;
@@ -282,6 +308,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    for (i = 0; i < 25; i++)
    {
        z->block[i].quant = x->block[i].quant;
+       z->block[i].quant_fast = x->block[i].quant_fast;
        z->block[i].quant_shift = x->block[i].quant_shift;
        z->block[i].zbin = x->block[i].zbin;
        z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost;
@@ -334,7 +361,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    }
}

void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
                               MACROBLOCK *x,
                               MB_ROW_COMP *mbr_ei,
@@ -385,15 +411,13 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
        mb->src.u_buffer += 8 * x->src.uv_stride * (i + 1);
        mb->src.v_buffer += 8 * x->src.uv_stride * (i + 1);

        vp8_build_block_offsets(mb);
        vp8_setup_block_dptrs(mbd);
        vp8_setup_block_ptrs(mb);

-       mb->rddiv = cpi->RDDIV;
-       mb->rdmult = cpi->RDMULT;
+       mb->activity_sum = 0;

        mbd->left_context = &cm->left_context;
        mb->mvc = cm->fc.mvc;
@@ -403,17 +427,12 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
    }
}

void vp8cx_create_encoder_threads(VP8_COMP *cpi)
{
    cpi->b_multi_threaded = 0;
    cpi->processor_core_count = 32; //vp8_get_proc_core_count();

+   CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));

-#if CONFIG_MULTITHREAD
    if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
    {
        int ithread;
@@ -423,14 +442,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
        else
            cpi->encoding_thread_count = cpi->oxcf.multi_threaded - 1;

        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * cpi->encoding_thread_count));
-       CHECK_MEM_ERROR(cpi->h_event_mbrencoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
+       CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count));
        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count);
        CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * cpi->encoding_thread_count));
+       CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cpi->common.mb_rows));

        //cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL);
-       sem_init(&cpi->h_event_main, 0, 0);
+       sem_init(&cpi->h_event_end_encoding, 0, 0);

        cpi->b_multi_threaded = 1;
@@ -438,11 +458,13 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
        for (ithread = 0; ithread < cpi->encoding_thread_count; ithread++)
        {
+           ENCODETHREAD_DATA * ethd = &cpi->en_thread_data[ithread];

            //cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL);
-           sem_init(&cpi->h_event_mbrencoding[ithread], 0, 0);
-           cpi->en_thread_data[ithread].ithread = ithread;
-           cpi->en_thread_data[ithread].ptr1 = (void *)cpi;
-           cpi->en_thread_data[ithread].ptr2 = (void *)&cpi->mb_row_ei[ithread];
+           sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+           ethd->ithread = ithread;
+           ethd->ptr1 = (void *)cpi;
+           ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];

            //printf(" call begin thread %d \n", ithread);
@@ -454,19 +476,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
            //  0,
            //  NULL);
-           pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, (&cpi->en_thread_data[ithread]));
+           pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);
        }
    }
-#endif
}

void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
{
-#if CONFIG_MULTITHREAD
    if (cpi->b_multi_threaded)
    {
        //shutdown other threads
@@ -477,20 +495,21 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
        for (i = 0; i < cpi->encoding_thread_count; i++)
        {
            //SetEvent(cpi->h_event_mbrencoding[i]);
-           sem_post(&cpi->h_event_mbrencoding[i]);
+           sem_post(&cpi->h_event_start_encoding[i]);
            pthread_join(cpi->h_encoding_thread[i], 0);
+           sem_destroy(&cpi->h_event_start_encoding[i]);
        }

-       for (i = 0; i < cpi->encoding_thread_count; i++)
-           sem_destroy(&cpi->h_event_mbrencoding[i]);
+       sem_destroy(&cpi->h_event_end_encoding);

        //free thread related resources
-       vpx_free(cpi->h_event_mbrencoding);
+       vpx_free(cpi->h_event_start_encoding);
        vpx_free(cpi->h_encoding_thread);
        vpx_free(cpi->mb_row_ei);
        vpx_free(cpi->en_thread_data);
+       vpx_free(cpi->mt_current_mb_col);
    }
#endif #endif
vpx_free(cpi->tplist);
}

---- next file ----
@@ -8,7 +8,6 @@
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "math.h"
#include "limits.h"
#include "block.h"
@@ -54,7 +53,10 @@ extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE];
#define IIKFACTOR1 1.40
#define IIKFACTOR2 1.5
#define RMAX    14.0
-#define GF_RMAX 48.0 // 128.0
+#define GF_RMAX 48.0
+#define KF_MB_INTRA_MIN 300
+#define GF_MB_INTRA_MIN 200

#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
@@ -65,6 +67,18 @@ static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};
const int cq_level[QINDEX_RANGE] =
{
0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
};
void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps);
@@ -163,40 +177,68 @@ static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    return modified_err;
}
static const double weight_table[256] = {
0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
};
double vp8_simple_weight(YV12_BUFFER_CONFIG *source)
{
    int i, j;
    unsigned char *src = source->y_buffer;
-   unsigned char value;
    double sum_weights = 0.0;
-   double Weight;

    // Loop throught the Y plane raw examining levels and creating a weight for the image
-   for (i = 0; i < source->y_height; i++)
+   i = source->y_height;
+   do
    {
-       for (j = 0; j < source->y_width; j++)
+       j = source->y_width;
+       do
        {
-           value = src[j];
+           sum_weights += weight_table[ *src];
+           src++;
-           if (value >= 64)
-               Weight = 1.0;
-           else if (value > 32)
-               Weight = (value - 32.0f) / 32.0f;
-           else
-               Weight = 0.02;
-           sum_weights += Weight;
-       }
+       }while(--j);
+       src -= source->y_width;
        src += source->y_stride;
-   }
+   }while(--i);

    sum_weights /= (source->y_height * source->y_width);
    return sum_weights;
}
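For reference (a sketch, not part of the change): the weight_table added above appears to precompute exactly the piecewise weight that the removed per-pixel branches calculated, 0.02 up to 32, a ramp of (v - 32)/32 up to 64, and 1.0 from 64 on, so a generator like the one below reproduces its entries.

    #include <stdio.h>

    int main(void)
    {
        double w[256];
        int v;
        for (v = 0; v < 256; v++)
        {
            if (v >= 64)
                w[v] = 1.0;
            else if (v > 32)
                w[v] = (v - 32) / 32.0;
            else
                w[v] = 0.02;
        }
        printf("%f %f %f\n", w[16], w[48], w[200]);   /* 0.02 0.50 1.00 */
        return 0;
    }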
// This function returns the current per frame maximum bitrate target
int frame_max_bits(VP8_COMP *cpi)
{
@@ -247,7 +289,6 @@ extern size_t vp8_firstpass_stats_sz(unsigned int mb_count)
     * macroblock.
     */
    size_t stats_sz;
-   FIRSTPASS_STATS stats;

    stats_sz = sizeof(FIRSTPASS_STATS) + mb_count;
    stats_sz = (stats_sz + 7) & ~7;
@@ -374,8 +415,6 @@ unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi)
}

void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos)
{
-   int Offset;
    cpi->fp_motion_map_stats = target_pos;
}
@@ -428,7 +467,6 @@ void vp8_end_first_pass(VP8_COMP *cpi)
    vp8_output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
}

void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
{
    MACROBLOCKD * const xd = & x->e_mbd;
@@ -448,7 +486,6 @@ void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * r
    VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
}

void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
{
    MACROBLOCKD *const xd = & x->e_mbd;
@@ -472,7 +509,7 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
    // Initial step/diamond search centred on best mv
-   tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
+   tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
    if ( tmp_err < INT_MAX-new_mv_mode_penalty )
        tmp_err += new_mv_mode_penalty;
@@ -495,7 +532,7 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
            num00--;
        else
        {
-           tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
+           tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
            if ( tmp_err < INT_MAX-new_mv_mode_penalty )
                tmp_err += new_mv_mode_penalty;
@@ -536,7 +573,6 @@ void vp8_first_pass(VP8_COMP *cpi)
    int sum_in_vectors = 0;
-   MV best_ref_mv = {0, 0};
    MV zero_ref_mv = {0, 0};
    unsigned char *fp_motion_map_ptr = cpi->fp_motion_map;
@@ -574,13 +610,20 @@ void vp8_first_pass(VP8_COMP *cpi)
    // for each macroblock row in image
    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
    {
-       MV best_ref_mv = {0, 0};
+       int_mv best_ref_mv;
+       best_ref_mv.as_int = 0;
        // reset above block coeffs
        xd->up_available = (mb_row != 0);
        recon_yoffset = (mb_row * recon_y_stride * 16);
        recon_uvoffset = (mb_row * recon_uv_stride * 8);
+       // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+       x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+       x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
        // for each macroblock col in image
        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
        {
@@ -613,8 +656,6 @@ void vp8_first_pass(VP8_COMP *cpi)
            // Set up limit values for motion vectors to prevent them extending outside the UMV borders
            x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
            x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
-           x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-           x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
            // Other than for the first frame do a motion search
            if (cm->current_video_frame > 0)
@@ -635,12 +676,12 @@ void vp8_first_pass(VP8_COMP *cpi)
                // Test last reference frame using the previous best mv as the
                // starting point (best reference) for the search
-               vp8_first_pass_motion_search(cpi, x, &best_ref_mv,
+               vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
                                             &d->bmi.mv.as_mv, lst_yv12,
                                             &motion_error, recon_yoffset);
                // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
-               if ((best_ref_mv.col != 0) || (best_ref_mv.row != 0))
+               if (best_ref_mv.as_int)
                {
                    tmp_err = INT_MAX;
                    vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
@@ -652,7 +693,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                        d->bmi.mv.as_mv.row = tmp_mv.row;
                        d->bmi.mv.as_mv.col = tmp_mv.col;
                    }
                }
                // Experimental search in a second reference frame ((0,0) based only)
@@ -681,6 +721,9 @@ void vp8_first_pass(VP8_COMP *cpi)
                    xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
                }
+               /* Intra assumed best */
+               best_ref_mv.as_int = 0;
                if (motion_error <= this_error)
                {
                    d->bmi.mv.as_mv.row <<= 3;
@@ -696,13 +739,10 @@ void vp8_first_pass(VP8_COMP *cpi)
                    sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col;
                    intercount++;
-                   best_ref_mv.row = d->bmi.mv.as_mv.row;
-                   best_ref_mv.col = d->bmi.mv.as_mv.col;
-                   //best_ref_mv.row = 0;
-                   //best_ref_mv.col = 0;
+                   best_ref_mv.as_int = d->bmi.mv.as_int;
                    // Was the vector non-zero
-                   if (d->bmi.mv.as_mv.row || d->bmi.mv.as_mv.col)
+                   if (d->bmi.mv.as_int)
                    {
                        mvcount++;
@@ -758,12 +798,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                            *fp_motion_map_ptr = 1;
                        }
                    }
-               }
-               else
-               {
-                   // Intra was best
-                   best_ref_mv.row = 0;
-                   best_ref_mv.col = 0;
-               }
+               }
            }
            coded_error += this_error;
@@ -801,6 +835,7 @@ void vp8_first_pass(VP8_COMP *cpi)
        fps.coded_error = coded_error >> 8;
        weight = vp8_simple_weight(cpi->Source);
        if (weight < 0.1)
            weight = 0.1;
@@ -905,7 +940,7 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
    double pow_lowq = 0.40;
    if (section_target_bandwitdh <= 0)
-       return MAXQ;
+       return cpi->maxq_max_limit;          // Highest value allowed
    target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs);
@@ -941,10 +976,12 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
    // Correction factor used for Q values >= 20
    corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
-   corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
-   // Try and pick a Q that should be high enough to encode the content at the given rate.
-   for (Q = 0; Q < MAXQ; Q++)
+   corr_high = (corr_high < 0.05)
+                   ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+   // Try and pick a max Q that will be high enough to encode the
+   // content at the given rate.
+   for (Q = cpi->maxq_min_limit; Q < cpi->maxq_max_limit; Q++)
    {
        int bits_per_mb_at_this_q;
@@ -963,6 +1000,28 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
            break;
    }
// Restriction on active max q for constrained quality mode.
if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
(Q < cpi->cq_target_quality) )
//(Q < cpi->oxcf.cq_level;) )
{
Q = cpi->cq_target_quality;
//Q = cpi->oxcf.cq_level;
}
// Adjust maxq_min_limit and maxq_max_limit limits based on
// averaga q observed in clip for non kf/gf.arf frames
// Give average a chance to settle though.
if ( (cpi->ni_frames >
((unsigned int)cpi->total_stats->count >> 8)) &&
(cpi->ni_frames > 150) )
{
cpi->maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
? (cpi->ni_av_qi + 32) : cpi->worst_quality;
cpi->maxq_min_limit = ((cpi->ni_av_qi - 32) > cpi->best_quality)
? (cpi->ni_av_qi - 32) : cpi->best_quality;
}
    return Q;
}
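To make the new adaptive range concrete: once enough frames have been coded, the code above keeps the max Q search window within roughly plus or minus 32 of the running average Q, clamped to the user limits. A small stand-alone illustration with invented numbers (the variable names only mirror the fields used above, and the values are not from any real clip):

#include <stdio.h>

/* Illustrative only: clamp a +/-32 window around an average Q to the
 * user's [best_quality, worst_quality] range, as the code above does. */
int main(void)
{
    int ni_av_qi = 48;          /* example running average Q */
    int best_quality = 4;       /* example user minimum Q    */
    int worst_quality = 63;     /* example user maximum Q    */

    int maxq_max_limit = (ni_av_qi + 32 < worst_quality)
                             ? ni_av_qi + 32 : worst_quality;   /* 63 */
    int maxq_min_limit = (ni_av_qi - 32 > best_quality)
                             ? ni_av_qi - 32 : best_quality;    /* 16 */

    printf("max Q search range: [%d, %d]\n", maxq_min_limit, maxq_max_limit);
    return 0;
}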
static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
@@ -1111,6 +1170,79 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta
    return Q;
}
// For cq mode estimate a cq level that matches the observed
// complexity and data rate.
static int estimate_cq(VP8_COMP *cpi, double section_err,
int section_target_bandwitdh, int Height, int Width)
{
int Q;
int num_mbs = ((Height * Width) / (16 * 16));
int target_norm_bits_per_mb;
double err_per_mb = section_err / num_mbs;
double correction_factor;
double corr_high;
double speed_correction = 1.0;
double pow_highq = 0.90;
double pow_lowq = 0.40;
double clip_iiratio;
double clip_iifactor;
target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
? (512 * section_target_bandwitdh) / num_mbs
: 512 * (section_target_bandwitdh / num_mbs);
// Corrections for higher compression speed settings
// (reduced compression expected)
if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
{
if (cpi->oxcf.cpu_used <= 5)
speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
else
speed_correction = 1.25;
}
// II ratio correction factor for clip as a whole
clip_iiratio = cpi->total_stats->intra_error /
DOUBLE_DIVIDE_CHECK(cpi->total_stats->coded_error);
clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
if (clip_iifactor < 0.80)
clip_iifactor = 0.80;
// Correction factor used for Q values >= 20
corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
// Try and pick a Q that can encode the content at the given rate.
for (Q = 0; Q < MAXQ; Q++)
{
int bits_per_mb_at_this_q;
if (Q < 50)
{
correction_factor =
pow( err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01));
correction_factor = (correction_factor < 0.05) ? 0.05
: (correction_factor > 5.0) ? 5.0
: correction_factor;
}
else
correction_factor = corr_high;
bits_per_mb_at_this_q =
(int)( .5 + correction_factor *
speed_correction *
clip_iifactor *
(double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0);
if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
break;
}
return cq_level[Q];
}
extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);
void vp8_init_second_pass(VP8_COMP *cpi)
@@ -1145,6 +1277,14 @@ void vp8_init_second_pass(VP8_COMP *cpi)
    cpi->output_frame_rate = cpi->oxcf.frame_rate;
    cpi->bits_left = (long long)(cpi->total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
    cpi->bits_left -= (long long)(cpi->total_stats->duration * two_pass_min_rate / 10000000.0);
cpi->clip_bits_total = cpi->bits_left;
// Calculate a minimum intra value to be used in determining the IIratio
// scores used in the second pass. We have this minimum to make sure
// that clips that are static but "low complexity" in the intra domain
// are still boosted appropriately for KF/GF/ARF
cpi->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
cpi->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
    vp8_avg_stats(cpi->total_stats);
@@ -1173,17 +1313,25 @@ void vp8_init_second_pass(VP8_COMP *cpi)
    {
        start_pos = cpi->stats_in;  // Note starting "file" position
-       cpi->modified_total_error_left = 0.0;
+       cpi->modified_error_total = 0.0;
+       cpi->modified_error_used = 0.0;
        while (vp8_input_stats(cpi, &this_frame) != EOF)
        {
-           cpi->modified_total_error_left += calculate_modified_err(cpi, &this_frame);
+           cpi->modified_error_total += calculate_modified_err(cpi, &this_frame);
        }
+       cpi->modified_error_left = cpi->modified_error_total;
        reset_fpf_position(cpi, start_pos);  // Reset file position
    }
// Calculate the clip target modified bits per error
// The observed bpe starts as the same number.
cpi->clip_bpe = cpi->bits_left /
DOUBLE_DIVIDE_CHECK(cpi->modified_error_total);
cpi->observed_bpe = cpi->clip_bpe;
    cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in;
}
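The kf_intra_err_min and gf_intra_err_min values computed above act as a floor on the intra error that feeds the inter/intra (II) ratio boosts later in this pass (the r calculations further down compare next_frame.intra_error against these minimums). A hedged restatement of that pattern as a stand-alone helper, with iikfactor standing in for the IIKFACTOR2 constant and all numbers assumed rather than taken from the code:

/* Sketch of the boosted II ratio with the new per-clip intra floor. */
static double ii_boost(double intra_error, double coded_error,
                       double intra_err_min, double iikfactor)
{
    double intra = (intra_error > intra_err_min) ? intra_error
                                                 : intra_err_min;
    /* DOUBLE_DIVIDE_CHECK in the real code guards against divide by 0 */
    if (coded_error < 1e-10)
        coded_error = 1e-10;
    return iikfactor * intra / coded_error;
}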
@@ -1191,6 +1339,43 @@ void vp8_end_second_pass(VP8_COMP *cpi)
{
}
// This function gives and estimate of how badly we believe
// the predicition quality is decaying from frame to frame.
double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
{
double prediction_decay_rate;
double motion_decay;
double motion_pct = next_frame->pcnt_motion;
// Initial basis is the % mbs inter coded
prediction_decay_rate = next_frame->pcnt_inter;
// High % motion -> somewhat higher decay rate
motion_decay = (1.0 - (motion_pct / 20.0));
if (motion_decay < prediction_decay_rate)
prediction_decay_rate = motion_decay;
// Adjustment to decay rate based on speed of motion
{
double this_mv_rabs;
double this_mv_cabs;
double distance_factor;
this_mv_rabs = fabs(next_frame->mvr_abs * motion_pct);
this_mv_cabs = fabs(next_frame->mvc_abs * motion_pct);
distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
(this_mv_cabs * this_mv_cabs)) / 250.0;
distance_factor = ((distance_factor > 1.0)
? 0.0 : (1.0 - distance_factor));
if (distance_factor < prediction_decay_rate)
prediction_decay_rate = distance_factor;
}
return prediction_decay_rate;
}
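As a quick sanity check of the decay formula above, here is a hedged, stand-alone rerun of the same arithmetic with invented first-pass statistics (90% of macroblocks inter coded, 40% showing motion, mean absolute MV components of 8 and 2):

#include <math.h>
#include <stdio.h>

/* Re-statement of gf_prediction_decay_rate() above, driven with
 * invented numbers purely to show the scale of the result. */
int main(void)
{
    double pcnt_inter = 0.90;    /* fraction of MBs inter coded */
    double motion_pct = 0.40;    /* fraction of MBs with motion */
    double mvr_abs    = 8.0;     /* mean |row MV| (invented)    */
    double mvc_abs    = 2.0;     /* mean |col MV| (invented)    */

    double decay = pcnt_inter;                        /* 0.90 */
    double motion_decay = 1.0 - (motion_pct / 20.0);  /* 0.98 */
    if (motion_decay < decay)
        decay = motion_decay;

    double rabs = fabs(mvr_abs * motion_pct);         /* 3.2 */
    double cabs = fabs(mvc_abs * motion_pct);         /* 0.8 */
    double dist = sqrt(rabs * rabs + cabs * cabs) / 250.0;
    double dist_factor = (dist > 1.0) ? 0.0 : 1.0 - dist;
    if (dist_factor < decay)
        decay = dist_factor;

    printf("prediction decay rate ~= %.3f\n", decay); /* ~0.90 */
    return 0;
}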
// Analyse and define a gf/arf group .
static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
@@ -1223,6 +1408,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    unsigned char *fpmm_pos;
+   unsigned int allow_alt_ref =
+                   cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
    cpi->gf_group_bits = 0;
    cpi->gf_decay_rate = 0;
@@ -1237,26 +1425,31 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    // Preload the stats for the next frame.
    mod_frame_err = calculate_modified_err(cpi, this_frame);
-   // Note the error of the frame at the start of the group (this will be the GF frame error if we code a normal gf
+   // Note the error of the frame at the start of the group (this will be
+   // the GF frame error if we code a normal gf
    gf_first_frame_err = mod_frame_err;
-   // Special treatment if the current frame is a key frame (which is also a gf).
-   // If it is then its error score (and hence bit allocation) need to be subtracted out
-   // from the calculation for the GF group
+   // Special treatment if the current frame is a key frame (which is also
+   // a gf). If it is then its error score (and hence bit allocation) need
+   // to be subtracted out from the calculation for the GF group
    if (cpi->common.frame_type == KEY_FRAME)
        gf_group_err -= gf_first_frame_err;
-   // Scan forward to try and work out how many frames the next gf group should contain and
-   // what level of boost is appropriate for the GF or ARF that will be coded with the group
+   // Scan forward to try and work out how many frames the next gf group
+   // should contain and what level of boost is appropriate for the GF
+   // or ARF that will be coded with the group
    i = 0;
-   while (((i < cpi->max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key))
+   while (((i < cpi->static_scene_max_gf_interval) ||
+           ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) &&
+          (i < cpi->frames_to_key))
    {
        double r;
        double this_frame_mvr_ratio;
        double this_frame_mvc_ratio;
        double motion_decay;
-       double motion_pct = next_frame.pcnt_motion;
+       //double motion_pct = next_frame.pcnt_motion;
+       double motion_pct;
        i++;    // Increment the loop counter
@@ -1265,19 +1458,24 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        gf_group_err += mod_frame_err;
-       mod_err_per_mb_accumulator += mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
+       mod_err_per_mb_accumulator +=
+           mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
        if (EOF == vp8_input_stats(cpi, &next_frame))
            break;
        // Accumulate motion stats.
+       motion_pct = next_frame.pcnt_motion;
        mv_accumulator_rabs += fabs(next_frame.mvr_abs * motion_pct);
        mv_accumulator_cabs += fabs(next_frame.mvc_abs * motion_pct);
        //Accumulate Motion In/Out of frame stats
-       this_frame_mv_in_out = next_frame.mv_in_out_count * next_frame.pcnt_motion;
-       mv_in_out_accumulator += next_frame.mv_in_out_count * next_frame.pcnt_motion;
-       abs_mv_in_out_accumulator += fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
+       this_frame_mv_in_out =
+           next_frame.mv_in_out_count * motion_pct;
+       mv_in_out_accumulator +=
+           next_frame.mv_in_out_count * motion_pct;
+       abs_mv_in_out_accumulator +=
+           fabs(next_frame.mv_in_out_count * motion_pct);
        // If there is a significant amount of motion
        if (motion_pct > 0.05)
@@ -1306,65 +1504,98 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        }
        // Underlying boost factor is based on inter intra error ratio
-       r = (boost_factor * (next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)));
+       r = ( boost_factor *
+             ( next_frame.intra_error /
+               DOUBLE_DIVIDE_CHECK(next_frame.coded_error)));
-       // Increase boost for frames where new data coming into frame (eg zoom out)
-       // Slightly reduce boost if there is a net balance of motion out of the frame (zoom in)
+       if (next_frame.intra_error > cpi->gf_intra_err_min)
+           r = (IIKFACTOR2 * next_frame.intra_error /
+                DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+       else
+           r = (IIKFACTOR2 * cpi->gf_intra_err_min /
+                DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+       // Increase boost for frames where new data coming into frame
+       // (eg zoom out). Slightly reduce boost if there is a net balance
+       // of motion out of the frame (zoom in).
        // The range for this_frame_mv_in_out is -1.0 to +1.0
        if (this_frame_mv_in_out > 0.0)
            r += r * (this_frame_mv_in_out * 2.0);
+       // In extreme case boost is halved
        else
-           r += r * (this_frame_mv_in_out / 2.0);   // In extreme case boost is halved
+           r += r * (this_frame_mv_in_out / 2.0);
        if (r > GF_RMAX)
            r = GF_RMAX;
-       // Adjust loop decay rate
-       //if ( next_frame.pcnt_inter < loop_decay_rate )
-       loop_decay_rate = next_frame.pcnt_inter;
-       // High % motion -> somewhat higher decay rate
-       motion_decay = (1.0 - (motion_pct / 20.0));
-       if (motion_decay < loop_decay_rate)
-           loop_decay_rate = motion_decay;
-       // Adjustment to decay rate based on speed of motion
-       {
-           double this_mv_rabs;
-           double this_mv_cabs;
-           double distance_factor;
-           this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
-           this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
-           distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
-                                  (this_mv_cabs * this_mv_cabs)) / 250.0;
-           distance_factor = ((distance_factor > 1.0)
-                               ? 0.0 : (1.0 - distance_factor));
-           if (distance_factor < loop_decay_rate)
-               loop_decay_rate = distance_factor;
-       }
+       loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);
        // Cumulative effect of decay
        decay_accumulator = decay_accumulator * loop_decay_rate;
        decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
-       //decay_accumulator = ( loop_decay_rate < decay_accumulator ) ? loop_decay_rate : decay_accumulator;
        boost_score += (decay_accumulator * r);
// Break clause to detect very still sections after motion
// For example a staic image after a fade or other transition
// instead of a clean key frame.
if ( (i > MIN_GF_INTERVAL) &&
(loop_decay_rate >= 0.999) &&
(decay_accumulator < 0.9) )
{
int j;
FIRSTPASS_STATS * position = cpi->stats_in;
FIRSTPASS_STATS tmp_next_frame;
double decay_rate;
// Look ahead a few frames to see if static condition
// persists...
for ( j = 0; j < 4; j++ )
{
if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
break;
decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
if ( decay_rate < 0.999 )
break;
}
reset_fpf_position(cpi, position); // Reset file position
// Force GF not alt ref
if ( j == 4 )
{
if (0)
{
FILE *f = fopen("fadegf.stt", "a");
fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
cpi->common.current_video_frame+i, i,
loop_decay_rate, decay_accumulator,
boost_score );
fclose(f);
}
allow_alt_ref = FALSE;
boost_score = old_boost_score;
break;
}
}
        // Break out conditions.
        if ( /* i>4 || */
+           // Break at cpi->max_gf_interval unless almost totally static
+           (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) ||
            (
-               (i > MIN_GF_INTERVAL) &&                         // Dont break out with a very short interval
-               ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) && // Dont break out very close to a key frame
+               // Dont break out with a very short interval
+               (i > MIN_GF_INTERVAL) &&
+               // Dont break out very close to a key frame
+               ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) &&
                ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
                ((mv_ratio_accumulator > 100.0) ||
                 (abs_mv_in_out_accumulator > 3.0) ||
                 (mv_in_out_accumulator < -2.0) ||
-                ((boost_score - old_boost_score) < 2.0)
-           )
-           )
-       )
+                ((boost_score - old_boost_score) < 2.0))
+           ) )
        {
            boost_score = old_boost_score;
            break;
@@ -1375,7 +1606,8 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        old_boost_score = boost_score;
    }
-   cpi->gf_decay_rate = (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;
+   cpi->gf_decay_rate =
+       (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;
    // When using CBR apply additional buffer related upper limits
    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
@@ -1385,7 +1617,8 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        // For cbr apply buffer related limits
        if (cpi->drop_frames_allowed)
        {
-           int df_buffer_level = cpi->oxcf.drop_frames_water_mark * (cpi->oxcf.optimal_buffer_level / 100);
+           int df_buffer_level = cpi->oxcf.drop_frames_water_mark *
+                                 (cpi->oxcf.optimal_buffer_level / 100);
            if (cpi->buffer_level > df_buffer_level)
                max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
@@ -1408,10 +1641,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    cpi->gfu_boost = (int)(boost_score * 100.0) >> 4;
    // Should we use the alternate refernce frame
-   if (cpi->oxcf.play_alternate &&
-       cpi->oxcf.lag_in_frames &&
+   if (allow_alt_ref &&
        (i >= MIN_GF_INTERVAL) &&
-       (i <= (cpi->frames_to_key - MIN_GF_INTERVAL)) &&   // dont use ARF very near next kf
+       // dont use ARF very near next kf
+       (i <= (cpi->frames_to_key - MIN_GF_INTERVAL)) &&
        (((next_frame.pcnt_inter > 0.75) &&
          ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) &&
          //(cpi->gfu_boost>150) &&
@@ -1439,7 +1672,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        // Boost for arf frame
        Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
-       Boost += (cpi->baseline_gf_interval * 50);
+       Boost += (i * 50);
        allocation_chunks = (i * 100) + Boost;
// Normalize Altboost and allocations chunck down to prevent overflow // Normalize Altboost and allocations chunck down to prevent overflow
@@ -1585,6 +1818,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    // Reset the file position
    reset_fpf_position(cpi, start_pos);
// Update the record of error used so far (only done once per gf group)
cpi->modified_error_used += gf_group_err;
    // Assign bits to the arf or gf.
    {
        int Boost;
@@ -1738,17 +1974,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        vp8_avg_stats(&sectionstats);
-       if (sectionstats.pcnt_motion < .17)
-           cpi->section_is_low_motion = 1;
-       else
-           cpi->section_is_low_motion = 0;
-       if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
-           cpi->section_is_fast_motion = 1;
-       else
-           cpi->section_is_fast_motion = 0;
-       cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+       cpi->section_intra_rating =
+           sectionstats.intra_error /
+           DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
        //if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
@@ -1892,6 +2120,16 @@ void vp8_second_pass(VP8_COMP *cpi)
    // Is this a GF / ARF (Note that a KF is always also a GF)
    if (cpi->frames_till_gf_update_due == 0)
    {
// Update monitor of the bits per error observed so far.
// Done once per gf group based on what has gone before
// so do nothing if this is the first frame.
if (cpi->common.current_video_frame > 0)
{
cpi->observed_bpe =
(double)(cpi->clip_bits_total - cpi->bits_left) /
cpi->modified_error_used;
}
        // Define next gf group and assign bits to it
        vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
        define_gf_group(cpi, &this_frame_copy);
@@ -1965,22 +2203,56 @@ void vp8_second_pass(VP8_COMP *cpi)
    if (cpi->common.current_video_frame == 0)
    {
-       // guess at 2nd pass q
        cpi->est_max_qcorrection_factor = 1.0;
-       tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);
-       if (tmp_q < cpi->worst_quality)
+       // Experimental code to try and set a cq_level in constrained
+       // quality mode.
+       if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
        {
int est_cq;
est_cq =
estimate_cq( cpi,
(cpi->total_coded_error_left / frames_left),
(int)(cpi->bits_left / frames_left),
cpi->common.Height, cpi->common.Width);
cpi->cq_target_quality = cpi->oxcf.cq_level;
if ( est_cq > cpi->cq_target_quality )
cpi->cq_target_quality = est_cq;
}
// guess at maxq needed in 2nd pass
cpi->maxq_max_limit = cpi->worst_quality;
cpi->maxq_min_limit = cpi->best_quality;
tmp_q = estimate_max_q( cpi,
(cpi->total_coded_error_left / frames_left),
(int)(cpi->bits_left / frames_left),
cpi->common.Height,
cpi->common.Width);
// Limit the maxq value returned subsequently.
// This increases the risk of overspend or underspend if the initial
// estimate for the clip is bad, but helps prevent excessive
// variation in Q, especially near the end of a clip
// where for example a small overspend may cause Q to crash
cpi->maxq_max_limit = ((tmp_q + 32) < cpi->worst_quality)
? (tmp_q + 32) : cpi->worst_quality;
cpi->maxq_min_limit = ((tmp_q - 32) > cpi->best_quality)
? (tmp_q - 32) : cpi->best_quality;
        cpi->active_worst_quality = tmp_q;
        cpi->ni_av_qi = tmp_q;
    }
-   else
-   {
-       cpi->active_worst_quality = cpi->worst_quality;
-       cpi->ni_av_qi = cpi->worst_quality;
-   }
-   }
-   else
+   // The last few frames of a clip almost always have to few or too many
+   // bits and for the sake of over exact rate control we dont want to make
+   // radical adjustments to the allowed quantizer range just to use up a
+   // few surplus bits or get beneath the target rate.
+   else if ( (cpi->common.current_video_frame <
+                 (((unsigned int)cpi->total_stats->count * 255)>>8)) &&
+             ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+                 (unsigned int)cpi->total_stats->count) )
    {
        if (frames_left < 1)
            frames_left = 1;
@@ -1994,13 +2266,6 @@ void vp8_second_pass(VP8_COMP *cpi)
            cpi->active_worst_quality --;
        cpi->active_worst_quality = ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4;
-       // Clamp to user set limits
-       if (cpi->active_worst_quality > cpi->worst_quality)
-           cpi->active_worst_quality = cpi->worst_quality;
-       else if (cpi->active_worst_quality < cpi->best_quality)
-           cpi->active_worst_quality = cpi->best_quality;
    }
    cpi->frames_to_key --;
@@ -2122,6 +2387,9 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    cpi->common.frame_type = KEY_FRAME;
+   // is this a forced key frame by interval
+   cpi->this_key_frame_forced = cpi->next_key_frame_forced;
    // Clear the alt ref active flag as this can never be active on a key frame
    cpi->source_alt_ref_active = FALSE;
@@ -2178,14 +2446,41 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    if (cpi->oxcf.auto_key
        && cpi->frames_to_key > (int)cpi->key_frame_frequency )
    {
+       FIRSTPASS_STATS *current_pos = cpi->stats_in;
+       FIRSTPASS_STATS tmp_frame;
        cpi->frames_to_key /= 2;
-       // Estimate corrected kf group error
-       kf_group_err /= 2.0;
-       kf_group_intra_err /= 2.0;
-       kf_group_coded_err /= 2.0;
+       // Copy first frame details
+       vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+       // Reset to the start of the group
+       reset_fpf_position(cpi, start_position);
+       kf_group_err = 0;
+       kf_group_intra_err = 0;
+       kf_group_coded_err = 0;
+       // Rescan to get the correct error data for the forced kf group
+       for( i = 0; i < cpi->frames_to_key; i++ )
+       {
+           // Accumulate kf group errors
+           kf_group_err += calculate_modified_err(cpi, &tmp_frame);
+           kf_group_intra_err += tmp_frame.intra_error;
+           kf_group_coded_err += tmp_frame.coded_error;
+           // Load a the next frame's stats
+           vp8_input_stats(cpi, &tmp_frame);
+       }
+       // Reset to the start of the group
+       reset_fpf_position(cpi, current_pos);
+       cpi->next_key_frame_forced = TRUE;
    }
+   else
+       cpi->next_key_frame_forced = FALSE;
    // Special case for the last frame of the file
    if (cpi->stats_in >= cpi->stats_in_end)
    {
@@ -2199,7 +2494,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    }
    // Calculate the number of bits that should be assigned to the kf group.
-   if ((cpi->bits_left > 0) && ((int)cpi->modified_total_error_left > 0))
+   if ((cpi->bits_left > 0) && ((int)cpi->modified_error_left > 0))
    {
        // Max for a single normal frame (not key frame)
        int max_bits = frame_max_bits(cpi);
@@ -2211,7 +2506,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        // complexity of the section
        cpi->kf_group_bits = (long long)( cpi->bits_left *
                                          ( kf_group_err /
-                                           cpi->modified_total_error_left ));
+                                           cpi->modified_error_left ));
        // Clip based on maximum per frame rate defined by the user.
        max_grp_bits = (long long)max_bits * (long long)cpi->frames_to_key;
@@ -2278,12 +2573,17 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    {
        double r;
        double motion_decay;
-       double motion_pct = next_frame.pcnt_motion;
+       double motion_pct;
        if (EOF == vp8_input_stats(cpi, &next_frame))
            break;
-       r = (IIKFACTOR2 * next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)) ;
+       if (next_frame.intra_error > cpi->kf_intra_err_min)
+           r = (IIKFACTOR2 * next_frame.intra_error /
+                DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+       else
+           r = (IIKFACTOR2 * cpi->kf_intra_err_min /
+                DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
        if (r > RMAX)
            r = RMAX;
@@ -2293,6 +2593,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        loop_decay_rate = next_frame.pcnt_inter;
        // High % motion -> somewhat higher decay rate
+       motion_pct = next_frame.pcnt_motion;
        motion_decay = (1.0 - (motion_pct / 20.0));
        if (motion_decay < loop_decay_rate)
            loop_decay_rate = motion_decay;
@@ -2344,16 +2645,6 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        vp8_avg_stats(&sectionstats);
-       if (sectionstats.pcnt_motion < .17)
-           cpi->section_is_low_motion = 1;
-       else
-           cpi->section_is_low_motion = 0;
-       if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
-           cpi->section_is_fast_motion = 1;
-       else
-           cpi->section_is_fast_motion = 0;
        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
@@ -2434,7 +2725,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    kf_boost = (int)((double)kf_boost * 100.0) >> 4;   // Scale 16 to 100
    // Adjustment to boost based on recent average q
-   kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;
+   //kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;
    if (kf_boost < 250)                                // Min KF boost
        kf_boost = 250;
@@ -2474,7 +2765,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            double alt_kf_grp_bits =
                ((double)cpi->bits_left *
                 (kf_mod_err * (double)cpi->frames_to_key) /
-                DOUBLE_DIVIDE_CHECK(cpi->modified_total_error_left));
+                DOUBLE_DIVIDE_CHECK(cpi->modified_error_left));
            alt_kf_bits = (int)((double)kf_boost *
                                (alt_kf_grp_bits / (double)allocation_chunks));
@@ -2492,7 +2783,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            alt_kf_bits =
                (int)((double)cpi->bits_left *
                      (kf_mod_err /
-                      DOUBLE_DIVIDE_CHECK(cpi->modified_total_error_left)));
+                      DOUBLE_DIVIDE_CHECK(cpi->modified_error_left)));
            if (alt_kf_bits > cpi->kf_bits)
            {
@@ -2512,7 +2803,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    // Adjust the count of total modified error left.
    // The count of bits left is adjusted elsewhere based on real coded frame sizes
-   cpi->modified_total_error_left -= kf_group_err;
+   cpi->modified_error_left -= kf_group_err;
    if (cpi->oxcf.allow_spatial_resampling)
    {


@@ -40,6 +40,12 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
    cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c;
    cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c;
cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_c;
cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_c;
cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_c;
cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_c;
cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_c;
    cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c;
    cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c;
    cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c;
@@ -85,9 +91,12 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
    cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
    cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
+#if !(CONFIG_REALTIME_ONLY)
    cpi->rtcd.search.full_search = vp8_full_search_sad;
+#endif
    cpi->rtcd.search.diamond_search = vp8_diamond_search_sad;
+   cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_c;
#endif
    // Pure C:


@@ -408,6 +408,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
        diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
        break;
    case 3:
+   default:
        this_mv.col += 4;
        this_mv.row += 4;
        diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
@@ -778,15 +779,17 @@ int vp8_hex_search
    int *num00,
    const vp8_variance_fn_ptr_t *vfp,
    int *mvsadcost[2],
-   int *mvcost[2]
+   int *mvcost[2],
+   MV *center_mv
)
{
    MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
-   MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
+   MV neighbors[8] = { { -1, -1}, {0, -1}, {1, -1}, { -1, 0}, {1, 0}, { -1, 1}, {0, 1}, {1, 1} } ;
    int i, j;
    unsigned char *src = (*(b->base_src) + b->src);
    int src_stride = b->src_stride;
-   int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
+   int rr = center_mv->row, rc = center_mv->col;
+   int br = ref_mv->row >> 3, bc = ref_mv->col >> 3, tr, tc;
    unsigned int besterr, thiserr = 0x7fffffff;
    int k = -1, tk;
@@ -891,7 +894,7 @@ cal_neighbors:
    best_mv->row = br;
    best_mv->col = bc;
-   return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
+   return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + vp8_mv_err_cost(best_mv, center_mv, mvcost, error_per_bit) ;
}
#undef MVC
#undef PRE
@@ -913,7 +916,8 @@ int vp8_diamond_search_sad
    int *num00,
    vp8_variance_fn_ptr_t *fn_ptr,
    int *mvsadcost[2],
-   int *mvcost[2]
+   int *mvcost[2],
+   MV *center_mv
)
{
    int i, j, step;
@@ -940,6 +944,8 @@ int vp8_diamond_search_sad
    unsigned char *check_here;
    int thissad;
+   *num00 = 0;
    // Work out the start point for the search
    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
    best_address = in_what;
@@ -949,7 +955,7 @@ int vp8_diamond_search_sad
        (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
    {
        // Check the starting position
-       bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+       bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
    }
    // search_param determines the length of the initial step and hence the number of iterations
@@ -961,8 +967,6 @@ int vp8_diamond_search_sad
    best_mv->row = ref_row;
    best_mv->col = ref_col;
-   *num00 = 0;
    for (step = 0; step < tot_steps ; step++)
    {
        for (j = 0 ; j < x->searches_per_step ; j++)
@@ -982,7 +986,7 @@ int vp8_diamond_search_sad
            {
                this_mv.row = this_row_offset << 3;
                this_mv.col = this_col_offset << 3;
-               thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+               thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
                if (thissad < bestsad)
                {
@@ -1013,7 +1017,7 @@ int vp8_diamond_search_sad
        return INT_MAX;
    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-          + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+          + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
}
int vp8_diamond_search_sadx4
@@ -1028,7 +1032,8 @@ int vp8_diamond_search_sadx4
    int *num00,
    vp8_variance_fn_ptr_t *fn_ptr,
    int *mvsadcost[2],
-   int *mvcost[2]
+   int *mvcost[2],
+   MV *center_mv
)
{
    int i, j, step;
@@ -1055,6 +1060,8 @@ int vp8_diamond_search_sadx4
    unsigned char *check_here;
    unsigned int thissad;
+   *num00 = 0;
    // Work out the start point for the search
    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
    best_address = in_what;
@@ -1064,7 +1071,7 @@ int vp8_diamond_search_sadx4
        (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
    {
        // Check the starting position
-       bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+       bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
    }
    // search_param determines the length of the initial step and hence the number of iterations
@@ -1076,8 +1083,6 @@ int vp8_diamond_search_sadx4
    best_mv->row = ref_row;
    best_mv->col = ref_col;
-   *num00 = 0;
    for (step = 0; step < tot_steps ; step++)
    {
        int all_in = 1, t;
@@ -1108,7 +1113,7 @@ int vp8_diamond_search_sadx4
                {
                    this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
                    this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
-                   sad_array[t] += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                   sad_array[t] += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
                    if (sad_array[t] < bestsad)
                    {
@@ -1137,7 +1142,7 @@ int vp8_diamond_search_sadx4
                {
                    this_mv.row = this_row_offset << 3;
                    this_mv.col = this_col_offset << 3;
-                   thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                   thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
                    if (thissad < bestsad)
                    {
@@ -1168,12 +1173,12 @@ int vp8_diamond_search_sadx4
        return INT_MAX;
    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-          + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+          + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
}
#if !(CONFIG_REALTIME_ONLY)
-int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
{
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
@@ -1211,7 +1216,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
        // Baseline value at the centre
        //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
-       bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+       bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
    }
    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1239,7 +1244,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
            this_mv.col = c << 3;
            //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
            //thissad += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
-           thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+           thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
            if (thissad < bestsad)
            {
@@ -1258,12 +1263,12 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
    if (bestsad < INT_MAX)
        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-              + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+              + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
    else
        return INT_MAX;
}
-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
{
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
@@ -1301,7 +1306,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
        (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
    {
        // Baseline value at the centre
-       bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+       bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
    }
    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1323,7 +1328,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
        check_here = r * mv_stride + in_what + col_min;
        c = col_min;
-       while ((c + 3) < col_max)
+       while ((c + 2) < col_max)
        {
            int i;
@@ -1336,7 +1341,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
                if (thissad < bestsad)
                {
                    this_mv.col = c << 3;
-                   thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                   thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
                    if (thissad < bestsad)
                    {
@@ -1359,7 +1364,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
            if (thissad < bestsad)
            {
                this_mv.col = c << 3;
-               thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+               thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
                if (thissad < bestsad)
                {
@@ -1381,12 +1386,163 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
    if (bestsad < INT_MAX)
        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-              + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+              + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
    else
        return INT_MAX;
}
-#endif
int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
{
unsigned char *what = (*(b->base_src) + b->src);
int what_stride = b->src_stride;
unsigned char *in_what;
int in_what_stride = d->pre_stride;
int mv_stride = d->pre_stride;
unsigned char *bestaddress;
MV *best_mv = &d->bmi.mv.as_mv;
MV this_mv;
int bestsad = INT_MAX;
int r, c;
unsigned char *check_here;
unsigned int thissad;
int ref_row = ref_mv->row >> 3;
int ref_col = ref_mv->col >> 3;
int row_min = ref_row - distance;
int row_max = ref_row + distance;
int col_min = ref_col - distance;
int col_max = ref_col + distance;
unsigned short sad_array8[8];
unsigned int sad_array[3];
// Work out the mid point for the search
in_what = *(d->base_pre) + d->pre;
bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
best_mv->row = ref_row;
best_mv->col = ref_col;
// We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
if (col_min < x->mv_col_min)
col_min = x->mv_col_min;
if (col_max > x->mv_col_max)
col_max = x->mv_col_max;
if (row_min < x->mv_row_min)
row_min = x->mv_row_min;
if (row_max > x->mv_row_max)
row_max = x->mv_row_max;
for (r = row_min; r < row_max ; r++)
{
this_mv.row = r << 3;
check_here = r * mv_stride + in_what + col_min;
c = col_min;
while ((c + 7) < col_max)
{
int i;
fn_ptr->sdx8f(what, what_stride, check_here , in_what_stride, sad_array8);
for (i = 0; i < 8; i++)
{
thissad = (unsigned int)sad_array8[i];
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
bestsad = thissad;
best_mv->row = r;
best_mv->col = c;
bestaddress = check_here;
}
}
check_here++;
c++;
}
}
while ((c + 2) < col_max)
{
int i;
fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
for (i = 0; i < 3; i++)
{
thissad = sad_array[i];
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
bestsad = thissad;
best_mv->row = r;
best_mv->col = c;
bestaddress = check_here;
}
}
check_here++;
c++;
}
}
while (c < col_max)
{
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
bestsad = thissad;
best_mv->row = r;
best_mv->col = c;
bestaddress = check_here;
}
}
check_here ++;
c ++;
}
}
this_mv.row = best_mv->row << 3;
this_mv.col = best_mv->col << 3;
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
else
return INT_MAX;
}
#endif /* !(CONFIG_REALTIME_ONLY) */
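vp8_full_search_sadx8 above relies on fn_ptr->sdx8f returning the SADs for eight consecutive column offsets in a single call, stored as unsigned short values. A plain C sketch of the contract it assumes (an illustrative reference only, not the optimized routine the encoder actually wires up):

/* Hedged sketch of an 8-wide SAD helper: compute the 16x16 SAD at
 * eight consecutive horizontal positions of the reference block. */
static void sad16x16x8_sketch(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              unsigned short sad_array[8])
{
    int off, row, col;

    for (off = 0; off < 8; off++)
    {
        unsigned int sad = 0;

        for (row = 0; row < 16; row++)
            for (col = 0; col < 16; col++)
            {
                int diff = src[row * src_stride + col] -
                           ref[row * ref_stride + col + off];
                sad += (diff < 0) ? -diff : diff;
            }

        sad_array[off] = (unsigned short)sad;  /* 16x16x255 fits in 16 bits */
    }
}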
#ifdef ENTROPY_STATS
void print_mode_context(void)


@@ -25,7 +25,6 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS+3)) - 8)   // Max full pel mv specified in 1/8 pel units
#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))           // Maximum size of the first step in full pel units
extern void print_mode_context(void);
extern int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight);
extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
@@ -44,8 +43,8 @@ extern int vp8_hex_search
    int *num00,
    const vp8_variance_fn_ptr_t *vf,
    int *mvsadcost[2],
-   int *mvcost[2]
+   int *mvcost[2],
+   MV *center_mv
);

typedef int (fractional_mv_step_fp)
@@ -67,7 +66,8 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
    int distance, \
    vp8_variance_fn_ptr_t *fn_ptr, \
    int *mvcost[2], \
-   int *mvsadcost[2] \
+   int *mvsadcost[2], \
+   MV *center_mv \
    )

#define prototype_diamond_search_sad(sym)\
@@ -83,7 +83,8 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
    int *num00, \
    vp8_variance_fn_ptr_t *fn_ptr, \
    int *mvsadcost[2], \
-   int *mvcost[2] \
+   int *mvcost[2], \
+   MV *center_mv \
    )

#if ARCH_X86 || ARCH_X86_64
@@ -93,6 +94,7 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
typedef prototype_full_search_sad(*vp8_full_search_fn_t);
extern prototype_full_search_sad(vp8_full_search_sad);
extern prototype_full_search_sad(vp8_full_search_sadx3);
+extern prototype_full_search_sad(vp8_full_search_sadx8);

typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
extern prototype_diamond_search_sad(vp8_diamond_search_sad);
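The visible change in mcomp.h is the MV *center_mv argument threaded through the hex, diamond and full search prototypes: it lets callers charge motion-vector bits against the reference MV even when the search itself is seeded from a different predicted MV, which the pickinter.c hunks further down rely on. For readers unfamiliar with the prototype_* macros themselves: one macro stamps out a shared signature, the same macro applied to a pointer name yields the hook type, and the encoder selects the C/SSE3/x8 variants at runtime through that pointer. The toy below shows the pattern under assumed names (prototype_search, search_fn_t, pick_search); it is a sketch of the idiom, not the libvpx definitions.

#include <stddef.h>

/* One macro defines the shared signature ... */
#define prototype_search(sym) \
    int (sym)(const unsigned char *src, const unsigned char *ref, size_t len)

/* ... applied to a pointer name it yields the selectable hook type ... */
typedef prototype_search(*search_fn_t);

/* ... and each implementation is declared/defined with it. */
static prototype_search(search_c)
{
    int sad = 0;
    size_t i;

    for (i = 0; i < len; i++)
        sad += src[i] > ref[i] ? src[i] - ref[i] : ref[i] - src[i];

    return sad;
}

static prototype_search(search_simd)   /* stand-in for an sadx8-style kernel */
{
    return search_c(src, ref, len);    /* a real version would be vectorized */
}

/* Runtime selection, the role vp8_full_search_fn_t plays for
 * vp8_full_search_sad / _sadx3 / _sadx8. */
static search_fn_t pick_search(int have_simd)
{
    return have_simd ? search_simd : search_c;
}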

File diff suppressed because it is too large.

View File

@@ -18,7 +18,6 @@
#include "treewriter.h" #include "treewriter.h"
#include "tokenize.h" #include "tokenize.h"
#include "onyxc_int.h" #include "onyxc_int.h"
#include "preproc.h"
#include "variance.h" #include "variance.h"
#include "dct.h" #include "dct.h"
#include "encodemb.h" #include "encodemb.h"
@@ -28,6 +27,8 @@
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
#include "vpx/internal/vpx_codec_internal.h" #include "vpx/internal/vpx_codec_internal.h"
#include "mcomp.h" #include "mcomp.h"
#include "temporal_filter.h"
#include "findnearmv.h"
//#define SPEEDSTATS 1 //#define SPEEDSTATS 1
#define MIN_GF_INTERVAL 4 #define MIN_GF_INTERVAL 4
@@ -46,6 +47,8 @@
#define MAX_THRESHMULT 512

#define GF_ZEROMV_ZBIN_BOOST 24
+#define LF_ZEROMV_ZBIN_BOOST 12
+#define MV_ZBIN_BOOST 4
#define ZBIN_OQ_MAX 192

#define VP8_TEMPORAL_ALT_REF 1
@@ -180,16 +183,17 @@ typedef struct
    int first_step;
    int optimize_coefficients;
+   int use_fastquant_for_pick;
+   int no_skip_block4x4_search;
+   int improved_mv_pred;
} SPEED_FEATURES;

typedef struct
{
    MACROBLOCK mb;
-   int mb_row;
-   TOKENEXTRA *tp;
    int segment_counts[MAX_MB_SEGMENTS];
    int totalrate;
-   int current_mb_col;
} MB_ROW_COMP;

typedef struct
@@ -227,6 +231,7 @@ typedef struct VP8_ENCODER_RTCD
    vp8_encodemb_rtcd_vtable_t encodemb;
    vp8_quantize_rtcd_vtable_t quantize;
    vp8_search_rtcd_vtable_t search;
+   vp8_temporal_rtcd_vtable_t temporal;
} VP8_ENCODER_RTCD;

enum
@@ -260,6 +265,9 @@ typedef struct
    DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+   DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]);
+   DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]);
+   DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]);

    MACROBLOCK mb;
@@ -276,14 +284,14 @@ typedef struct
    unsigned int source_frame_flags;
    YV12_BUFFER_CONFIG scaled_source;

-   int source_buffer_count;
-   int source_encode_index;
-   int source_alt_ref_pending;
-   int source_alt_ref_active;
-   int last_alt_ref_sei;
-   int is_src_frame_alt_ref;
-   int is_next_src_alt_ref;
+   int source_buffer_count;    // number of src_buffers in use for lagged encoding
+   int source_encode_index;    // index of buffer in src_buffer to encode
+   int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
+   int source_alt_ref_active;  // an alt ref frame has been encoded and is usable
+   int last_alt_ref_sei;       // index into src_buffers of frame used as alt reference
+   int is_src_frame_alt_ref;   // source of frame to encode is an exact copy of an alt ref frame
+   int is_next_src_alt_ref;    // source of next frame to encode is an exact copy of an alt ref frame

    int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
    int alt_is_last; // Alt reference frame same as last ( short circuit altref search)
@@ -294,15 +302,17 @@ typedef struct
    YV12_BUFFER_CONFIG last_frame_uf;

-   char *Dest;

    TOKENEXTRA *tok;
    unsigned int tok_count;

    unsigned int frames_since_key;
    unsigned int key_frame_frequency;
-   unsigned int next_key;
+   unsigned int this_key_frame_forced;
+   unsigned int next_key_frame_forced;
+   // Ambient reconstruction err target for force key frames
+   int ambient_err;

    unsigned int mode_check_freq[MAX_MODES];
    unsigned int mode_test_hit_counts[MAX_MODES];
@@ -319,15 +329,11 @@ typedef struct
    int mvcostmultiplier;
    int subseqblockweight;
    int errthresh;
+   unsigned int activity_avg;

    int RDMULT;
    int RDDIV ;

-   TOKENEXTRA *rdtok;
-   vp8_writer rdbc;
-   int intra_mode_costs[10];

    CODING_CONTEXT coding_context;

    // Rate targetting variables
@@ -335,7 +341,6 @@ typedef struct
    long long last_prediction_error;
    long long intra_error;
    long long last_intra_error;
-   long long last_auto_filter_prediction_error;

#if 0
    // Experimental RD code
@@ -350,7 +355,6 @@ typedef struct
    int this_frame_target;
    int projected_frame_size;
    int last_q[2]; // Separate values for Intra/Inter
-   int target_bits_per_mb;

    double rate_correction_factor;
    double key_frame_rate_correction_factor;
@@ -383,6 +387,7 @@ typedef struct
    int kf_overspend_bits; // Extra bits spent on key frames that need to be recovered on inter frames
    int kf_bitrate_adjustment; // Current number of bit s to try and recover on each inter frame.
    int max_gf_interval;
+   int static_scene_max_gf_interval;
    int baseline_gf_interval;
    int gf_decay_rate;
    int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames
@@ -399,6 +404,7 @@ typedef struct
    int inter_frame_target;
    double output_frame_rate;
    long long last_time_stamp_seen;
+   long long last_end_time_stamp_seen;
    long long first_time_stamp_ever;

    int ni_av_qi;
@@ -431,6 +437,10 @@ typedef struct
    int best_quality;
    int active_best_quality;
+   int cq_target_quality;
+   int maxq_max_limit;
+   int maxq_min_limit;

    int drop_frames_allowed; // Are we permitted to drop frames?
    int drop_frame; // Drop this frame?
    int drop_count; // How many frames have we dropped?
@@ -454,8 +464,6 @@ typedef struct
    unsigned char *output_partition2;
    size_t output_partition2size;

-   pre_proc_instance ppi;

    int frames_to_key;
    int gfu_boost;
    int kf_boost;
@@ -465,12 +473,20 @@ typedef struct
    double total_coded_error_left;
    double start_tot_err_left;
    double min_error;
+   double kf_intra_err_min;
+   double gf_intra_err_min;
+   double modified_error_total;
+   double modified_error_used;
+   double modified_error_left;
+   double clip_bpe;
+   double observed_bpe;
-   double modified_total_error_left;
    double avg_iiratio;

    int target_bandwidth;
    long long bits_left;
+   long long clip_bits_total;
    FIRSTPASS_STATS *total_stats;
    FIRSTPASS_STATS *this_frame_stats;
    FIRSTPASS_STATS *stats_in, *stats_in_end;
@@ -529,8 +545,6 @@ typedef struct
    int ref_frame_flags;

-   int exp[512];

    SPEED_FEATURES sf;
    int error_bins[1024];
@@ -576,22 +590,21 @@ typedef struct
    int cyclic_refresh_q;
    signed char *cyclic_refresh_map;

+#if CONFIG_MULTITHREAD
    // multithread data
-   int current_mb_col_main;
+   int * mt_current_mb_col;
+   int mt_sync_range;
    int processor_core_count;
    int b_multi_threaded;
    int encoding_thread_count;

-#if CONFIG_MULTITHREAD
    pthread_t *h_encoding_thread;
-#endif
    MB_ROW_COMP *mb_row_ei;
    ENCODETHREAD_DATA *en_thread_data;

-#if CONFIG_MULTITHREAD
    //events
-   sem_t *h_event_mbrencoding;
-   sem_t h_event_main;
+   sem_t *h_event_start_encoding;
+   sem_t h_event_end_encoding;
#endif

    TOKENLIST *tplist;
@@ -611,9 +624,6 @@ typedef struct
    unsigned int tempdata2;

    int base_skip_false_prob[128];

-   unsigned int section_is_low_motion;
-   unsigned int section_benefits_from_aggresive_q;
-   unsigned int section_is_fast_motion;
    unsigned int section_intra_rating;

    double section_max_qfactor;
@@ -661,7 +671,14 @@ typedef struct
    unsigned char *gf_active_flags; // Record of which MBs still refer to last golden frame either directly or through 0,0
    int gf_active_count;

+   //Store last frame's MV info for next frame MV prediction
+   int_mv *lfmv;
+   int *lf_ref_frame_sign_bias;
+   int *lf_ref_frame;

+#if CONFIG_REALTIME_ONLY
+   int force_next_frame_intra; /* force next frame to intra when kf_auto says so */
+#endif

} VP8_COMP;

void control_data_rate(VP8_COMP *cpi);
@@ -670,6 +687,8 @@ void vp8_encode_frame(VP8_COMP *cpi);
void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size);

+unsigned int vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);

int rd_cost_intra_mb(MACROBLOCKD *x);
void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **);
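The multithreading fields that change above (mt_current_mb_col, mt_sync_range, and the start/end encoding semaphores) support row-parallel encoding: a thread working on one macroblock row may only advance once the row above is far enough ahead, because intra prediction and MV prediction read from that row. The stand-alone sketch below shows the row-sync idea with a mutex/condition-variable pair; the real encoder's mechanism is different (per-row progress counters plus semaphores), and every name here is illustrative rather than libvpx API.

#include <pthread.h>

typedef struct
{
    int            *current_mb_col;  /* per-row progress, indexed by mb_row */
    int             sync_range;      /* how far ahead the row above must be  */
    pthread_mutex_t lock;
    pthread_cond_t  cond;
} row_sync;

/* Called by the thread encoding `mb_row` before it starts column `mb_col`. */
static void wait_for_row_above(row_sync *s, int mb_row, int mb_col)
{
    if (mb_row == 0)
        return;

    pthread_mutex_lock(&s->lock);

    while (s->current_mb_col[mb_row - 1] < mb_col + s->sync_range)
        pthread_cond_wait(&s->cond, &s->lock);

    pthread_mutex_unlock(&s->lock);
}

/* Called after the macroblock at (mb_row, mb_col) is finished. */
static void publish_progress(row_sync *s, int mb_row, int mb_col)
{
    pthread_mutex_lock(&s->lock);
    s->current_mb_col[mb_row] = mb_col;
    pthread_cond_broadcast(&s->cond);
    pthread_mutex_unlock(&s->lock);
}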

View File

@@ -24,7 +24,7 @@
#include "g_common.h" #include "g_common.h"
#include "variance.h" #include "variance.h"
#include "mcomp.h" #include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h" #include "vpx_mem/vpx_mem.h"
#if CONFIG_RUNTIME_CPU_DETECT #if CONFIG_RUNTIME_CPU_DETECT
@@ -168,8 +168,6 @@ static int pick_intra4x4block(
    B_PREDICTION_MODE *best_mode,
    B_PREDICTION_MODE above,
    B_PREDICTION_MODE left,
-   ENTROPY_CONTEXT *a,
-   ENTROPY_CONTEXT *l,

    int *bestrate,
    int *bestdistortion)
@@ -179,8 +177,6 @@ static int pick_intra4x4block(
    int rate;
    int distortion;
    unsigned int *mode_costs;
-   (void) l;
-   (void) a;

    if (x->e_mbd.frame_type == KEY_FRAME)
    {
@@ -211,6 +207,7 @@ static int pick_intra4x4block(
    b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
    vp8_encode_intra4x4block(rtcd, x, be, b, b->bmi.mode);

    return best_rd;
}
@@ -220,17 +217,8 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
    MACROBLOCKD *const xd = &mb->e_mbd;
    int i;
    int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
-   int error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, 0); // Rd estimate for the cost of the block prediction mode
+   int error;
    int distortion = 0;
-   ENTROPY_CONTEXT_PLANES t_above, t_left;
-   ENTROPY_CONTEXT *ta;
-   ENTROPY_CONTEXT *tl;
-   vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-   vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-   ta = (ENTROPY_CONTEXT *)&t_above;
-   tl = (ENTROPY_CONTEXT *)&t_left;

    vp8_intra_prediction_down_copy(xd);
@@ -243,10 +231,8 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
        B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
        int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d);

-       error += pick_intra4x4block(rtcd,
-                     mb, mb->block + i, xd->block + i, &best_mode, A, L,
-                     ta + vp8_block2above[i],
-                     tl + vp8_block2left[i], &r, &d);
+       pick_intra4x4block(rtcd, mb, mb->block + i, xd->block + i,
+                          &best_mode, A, L, &r, &d);

        cost += r;
        distortion += d;
@@ -264,10 +250,15 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
    *Rate = cost;

    if (i == 16)
+   {
        *best_dist = distortion;
+       error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, distortion);
+   }
    else
+   {
        *best_dist = INT_MAX;
+       error = INT_MAX;
+   }

    return error;
}
@@ -421,7 +412,6 @@ int vp8_pick_intra_mbuv_mode(MACROBLOCK *mb)
}

int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
{
    BLOCK *b = &x->block[0];
@@ -430,7 +420,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
    B_MODE_INFO best_bmodes[16];
    MB_MODE_INFO best_mbmode;
    PARTITION_INFO best_partition;
-   MV best_ref_mv1;
+   MV best_ref_mv;
    MV mode_mv[MB_MODE_COUNT];
    MB_PREDICTION_MODE this_mode;
    int num00;
@@ -448,9 +438,14 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
    int best_mode_index = 0;
    int sse = INT_MAX;

+   MV mvp;
+   int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+   int saddone=0;
+   int sr=0; //search range got from mv_pred(). It uses step_param levels. (0-7)

    MV nearest_mv[4];
    MV near_mv[4];
-   MV best_ref_mv[4];
+   MV frame_best_ref_mv[4];
    int MDCounts[4][4];
    unsigned char *y_buffer[4];
    unsigned char *u_buffer[4];
@@ -470,7 +465,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];

        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[LAST_FRAME], &near_mv[LAST_FRAME],
-                         &best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);
+                         &frame_best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);

        y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset;
        u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
@@ -484,7 +479,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
        YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx];

        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[GOLDEN_FRAME], &near_mv[GOLDEN_FRAME],
-                         &best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);
+                         &frame_best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);

        y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset;
        u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
@@ -498,7 +493,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
        YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];

        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[ALTREF_FRAME], &near_mv[ALTREF_FRAME],
-                         &best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);
+                         &frame_best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);

        y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
        u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
@@ -538,10 +533,6 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                      + vp8_cost_one(cpi->prob_gf_coded);
    }

-   best_rd = INT_MAX;

    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

    // if we encode a new mv this is important
@@ -604,17 +595,41 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
            x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];

            mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
            mode_mv[NEARMV] = near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
-           best_ref_mv1 = best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
+           best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
            memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
        }

-       //Only consider ZEROMV/ALTREF_FRAME for alt ref frame.
-       if (cpi->is_src_frame_alt_ref)
+       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+       // unless ARNR filtering is enabled in which case we want
+       // an unfiltered alternative
+       if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
        {
            if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
                continue;
        }

+       if(cpi->sf.improved_mv_pred && x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
+       {
+           if(!saddone)
+           {
+               vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+               saddone = 1;
+           }
+
+           vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+                       x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
+           /* adjust mvp to make sure it is within MV range */
+           if(mvp.row > best_ref_mv.row + MAX_FULL_PEL_VAL)
+               mvp.row = best_ref_mv.row + MAX_FULL_PEL_VAL;
+           else if(mvp.row < best_ref_mv.row - MAX_FULL_PEL_VAL)
+               mvp.row = best_ref_mv.row - MAX_FULL_PEL_VAL;
+           if(mvp.col > best_ref_mv.col + MAX_FULL_PEL_VAL)
+               mvp.col = best_ref_mv.col + MAX_FULL_PEL_VAL;
+           else if(mvp.col < best_ref_mv.col - MAX_FULL_PEL_VAL)
+               mvp.col = best_ref_mv.col - MAX_FULL_PEL_VAL;
+       }

        switch (this_mode)
        {
        case B_PRED:
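The new improved_mv_pred path above predicts a starting MV (mvp) from neighbouring and last-frame motion, then clamps it to within MAX_FULL_PEL_VAL (in 1/8-pel units) of the MV it will be coded against, so the seed can never push the search outside the representable range. Below is the same clamp in a reusable form; the helper name is hypothetical and mv_sketch simply mirrors the row/col layout of vp8's MV type.

typedef struct { short row, col; } mv_sketch;   /* mirrors vp8's MV layout */

/* Keep a predicted MV within +/-max_1_8pel of the reference MV it will be
 * differentially coded against.  Hypothetical helper, not a libvpx function. */
static void clamp_mv_to_ref(mv_sketch *mvp, const mv_sketch *ref, int max_1_8pel)
{
    if (mvp->row > ref->row + max_1_8pel)
        mvp->row = (short)(ref->row + max_1_8pel);
    else if (mvp->row < ref->row - max_1_8pel)
        mvp->row = (short)(ref->row - max_1_8pel);

    if (mvp->col > ref->col + max_1_8pel)
        mvp->col = (short)(ref->col + max_1_8pel);
    else if (mvp->col < ref->col - max_1_8pel)
        mvp->col = (short)(ref->col - max_1_8pel);
}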
@@ -670,61 +685,59 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
            int n = 0;
            int sadpb = x->sadperbit16;

+           int col_min;
+           int col_max;
+           int row_min;
+           int row_max;
+           int tmp_col_min = x->mv_col_min;
+           int tmp_col_max = x->mv_col_max;
+           int tmp_row_min = x->mv_row_min;
+           int tmp_row_max = x->mv_row_max;
+           int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8)? 3 : 2) : 1;

            // Further step/diamond searches as necessary
-           if (cpi->Speed < 8)
-           {
-               step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
-               further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-           }
-           else
-           {
-               step_param = cpi->sf.first_step + 2;
-               further_steps = 0;
-           }
-#if 0
-           // Initial step Search
-           bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost);
-           mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-           mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-           // Further step searches
-           while (n < further_steps)
-           {
-               n++;
-               if (num00)
-                   num00--;
-               else
-               {
-                   thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost);
-                   if (thissme < bestsme)
-                   {
-                       bestsme = thissme;
-                       mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-                       mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-                   }
-                   else
-                   {
-                       d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
-                       d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
-                   }
-               }
-           }
-#else
+           step_param = cpi->sf.first_step + speed_adjust;
+
+           if(cpi->sf.improved_mv_pred)
+           {
+               sr += speed_adjust;
+               //adjust search range according to sr from mv prediction
+               if(sr > step_param)
+                   step_param = sr;
+
+               col_min = (best_ref_mv.col - MAX_FULL_PEL_VAL) >>3;
+               col_max = (best_ref_mv.col + MAX_FULL_PEL_VAL) >>3;
+               row_min = (best_ref_mv.row - MAX_FULL_PEL_VAL) >>3;
+               row_max = (best_ref_mv.row + MAX_FULL_PEL_VAL) >>3;
+
+               // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search.
+               if (x->mv_col_min < col_min )
+                   x->mv_col_min = col_min;
+               if (x->mv_col_max > col_max )
+                   x->mv_col_max = col_max;
+               if (x->mv_row_min < row_min )
+                   x->mv_row_min = row_min;
+               if (x->mv_row_max > row_max )
+                   x->mv_row_max = row_max;
+           }else
+           {
+               mvp.row = best_ref_mv.row;
+               mvp.col = best_ref_mv.col;
+           }
+
+           further_steps = (cpi->Speed >= 8)? 0: (cpi->sf.max_step_search_steps - 1 - step_param);

            if (cpi->sf.search_method == HEX)
            {
-               bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost);
+               bestsme = vp8_hex_search(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv);
                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
            }
            else
            {
-               bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb < 9
+               bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb < 9
                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
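When improved_mv_pred is on, the block above also narrows the macroblock's legal full-pel search window to the intersection of the UMV limits and a +/-MAX_FULL_PEL_VAL box around best_ref_mv (note the >>3 converting 1/8-pel to full-pel units), and a later hunk restores the saved tmp_* limits once the search is done. A small sketch of that intersection step, under assumed names:

typedef struct { int col_min, col_max, row_min, row_max; } mv_window;

/* Shrink the current full-pel window to the +/-range box around the
 * reference MV (ref_row/ref_col and range in 1/8-pel units).
 * Illustrative helper; the encoder does this in place on MACROBLOCK. */
static mv_window intersect_mv_window(mv_window cur, int ref_row, int ref_col,
                                     int range_1_8pel)
{
    int col_min = (ref_col - range_1_8pel) >> 3;  /* 1/8 pel -> full pel */
    int col_max = (ref_col + range_1_8pel) >> 3;
    int row_min = (ref_row - range_1_8pel) >> 3;
    int row_max = (ref_row + range_1_8pel) >> 3;

    if (cur.col_min < col_min) cur.col_min = col_min;
    if (cur.col_max > col_max) cur.col_max = col_max;
    if (cur.row_min < row_min) cur.row_min = row_min;
    if (cur.row_max > row_max) cur.row_max = row_max;

    return cur;
}

A caller would copy the current limits aside, overwrite them with the returned intersection for the duration of the search, and then copy the saved values back, which is exactly what the tmp_col_min/.../tmp_row_max variables do here.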
@@ -743,7 +756,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                    num00--;
                else
                {
-                   thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb = 9
+                   thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb = 9

                    if (thissme < bestsme)
                    {
@@ -760,18 +773,23 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                }
            }

-#endif
+           if(cpi->sf.improved_mv_pred)
+           {
+               x->mv_col_min = tmp_col_min;
+               x->mv_col_max = tmp_col_max;
+               x->mv_row_min = tmp_row_min;
+               x->mv_row_max = tmp_row_max;
+           }

            if (bestsme < INT_MAX)
-               cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
+               cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);

            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;

            // mv cost;
-           rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv1, cpi->mb.mvcost, 128);
+           rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128);
        }

    case NEARESTMV:
    case NEARMV:

Some files were not shown because too many files have changed in this diff.