Use memcpy for save/restore_predictor

The save_predictor and restore_predictor functions perform a 1D backup of the 2D predictor block. Use memcpy to get a faster copy operation than 4 individual load/stores. Change-Id: Ia609ed71fbff1ade6fa677186efce9ee29167fd6
Merge "Improve vp8_sad16x16_sse3 function"
2011-02-15 10:22:21 -05:00 · 2011-02-14 14:09:25 -08:00 · 2011-02-14 13:58:12 -08:00 · 2011-02-14 16:34:33 -05:00 · 2011-02-14 16:23:49 -05:00 · 2011-02-14 11:29:22 -08:00
107 changed files with 2405 additions and 3150 deletions
--- a/build/make/armlink_adapter.sh
+++ b/build/make/armlink_adapter.sh
@@ -17,15 +17,17 @@ for i; do
        on_of=1
    elif [ "$i" == "-v" ]; then
        verbose=1
+    elif [ "$i" == "-g" ]; then
+        args="${args} --debug"
    elif [ "$on_of" == "1" ]; then
        outfile=$i
-    on_of=0
+        on_of=0
    elif [ -f "$i" ]; then
        infiles="$infiles $i"
    elif [ "${i:0:2}" == "-l" ]; then
        libs="$libs ${i#-l}"
    elif [ "${i:0:2}" == "-L" ]; then
-    libpaths="${libpaths} ${i#-L}"
+        libpaths="${libpaths} ${i#-L}"
    else
        args="${args} ${i}"
    fi
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -78,6 +78,7 @@ Build options:
  --log=yes|no|FILE           file configure log is written to [config.err]
  --target=TARGET             target platform tuple [generic-gnu]
  --cpu=CPU                   optimize for a specific cpu rather than a family
+  --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
  ${toggle_extra_warnings}    emit harmless warnings (always non-fatal)
  ${toggle_werror}            treat warnings as errors, if possible
                              (not available with all compilers)
@@ -442,6 +443,9 @@ process_common_cmdline() {
        ;;
        --cpu=*) tune_cpu="$optval"
        ;;
+        --extra-cflags=*)
+        extra_cflags="${optval}"
+        ;;
        --enable-?*|--disable-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
        echo "${CMDLINE_SELECT} ${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null || die_unknown $opt
@@ -660,12 +664,12 @@ process_common_toolchain() {
            elif enabled armv7
            then
                check_add_cflags -march=armv7-a -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-ftree-vectorize
-        check_add_asflags -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-march=armv7-a
+                check_add_asflags -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-march=armv7-a
            else
                check_add_cflags -march=${tgt_isa}
                check_add_asflags -march=${tgt_isa}
            fi
-
+            enabled debug && add_asflags -g
            asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
            ;;
        rvct)
@@ -690,16 +694,24 @@ process_common_toolchain() {
            arch_int=${tgt_isa##armv}
            arch_int=${arch_int%%te}
            check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
+            enabled debug && add_asflags -g
+            add_cflags --gnu
+            add_cflags --enum_is_int
+            add_cflags --wchar32
        ;;
        esac

        case ${tgt_os} in
+        none*)
+            disable multithread
+            disable os_support
+            ;;
        darwin*)
            SDK_PATH=/Developer/Platforms/iPhoneOS.platform/Developer
            TOOLCHAIN_PATH=${SDK_PATH}/usr/bin
            CC=${TOOLCHAIN_PATH}/gcc
            AR=${TOOLCHAIN_PATH}/ar
-            LD=${TOOLCHAIN_PATH}/arm-apple-darwin9-gcc-4.2.1
+            LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-gcc-4.2.1
            AS=${TOOLCHAIN_PATH}/as
            STRIP=${TOOLCHAIN_PATH}/strip
            NM=${TOOLCHAIN_PATH}/nm
@@ -713,14 +725,14 @@ process_common_toolchain() {
            add_cflags -arch ${tgt_isa}
            add_ldflags -arch_only ${tgt_isa}

-            add_cflags  "-isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS3.1.sdk"
+            add_cflags  "-isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.sdk"

            # This should be overridable
-            alt_libc=${SDK_PATH}/SDKs/iPhoneOS3.1.sdk
+            alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.2.sdk

            # Add the paths for the alternate libc
 #            for d in usr/include usr/include/gcc/darwin/4.0/; do
-            for d in usr/include usr/include/gcc/darwin/4.0/ usr/lib/gcc/arm-apple-darwin9/4.0.1/include/; do
+            for d in usr/include usr/include/gcc/darwin/4.0/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do
                try_dir="${alt_libc}/${d}"
                [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
            done
@@ -742,13 +754,9 @@ process_common_toolchain() {
                    || die "Must supply --libc when targetting *-linux-rvct"

                # Set up compiler
-                add_cflags --gnu
-                add_cflags --enum_is_int
                add_cflags --library_interface=aeabi_glibc
                add_cflags --no_hide_all
-                add_cflags --wchar32
                add_cflags --dwarf2
-                add_cflags --gnu

                # Set up linker
                add_ldflags --sysv --no_startup --no_ref_cpp_init
@@ -972,6 +980,12 @@ EOF
        add_cflags -D_LARGEFILE_SOURCE
        add_cflags -D_FILE_OFFSET_BITS=64
    fi
+
+    # append any user defined extra cflags
+    if [ -n "${extra_cflags}" ] ; then
+        check_add_cflags ${extra_cflags} || \
+        die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
+    fi
 }

 process_toolchain() {
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -590,7 +590,7 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
    //log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
    //log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);

-    fp = fopen("vpx_asm_offsets.asm", "w");
+    fp = fopen("assembly_offsets.asm", "w");

    if (fp == NULL)
    {
--- a/configure
+++ b/configure
@@ -40,7 +40,6 @@ Advanced options:
  ${toggle_runtime_cpu_detect}    runtime cpu detection
  ${toggle_shared}                shared library support
  ${toggle_small}                 favor smaller size over speed
-  ${toggle_arm_asm_detok}         assembly version of the detokenizer (ARM platforms only)
  ${toggle_postproc_visualizer}   macro block / block level visualizers

 Codecs:
@@ -79,11 +78,13 @@ EOF
 # alphabetically by architecture, generic-gnu last.
 all_platforms="${all_platforms} armv5te-linux-rvct"
 all_platforms="${all_platforms} armv5te-linux-gcc"
+all_platforms="${all_platforms} armv5te-none-rvct"
 all_platforms="${all_platforms} armv5te-symbian-gcc"
 all_platforms="${all_platforms} armv5te-wince-vs8"
 all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
+all_platforms="${all_platforms} armv6-none-rvct"
 all_platforms="${all_platforms} armv6-symbian-gcc"
 all_platforms="${all_platforms} armv6-wince-vs8"
 all_platforms="${all_platforms} iwmmxt-linux-rvct"
@@ -95,6 +96,7 @@ all_platforms="${all_platforms} iwmmxt2-wince-vs8"
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
+all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} ppc32-darwin8-gcc"
 all_platforms="${all_platforms} ppc32-darwin9-gcc"
@@ -159,6 +161,7 @@ enable fast_unaligned #allow unaligned accesses, if supported by hw
 enable md5
 enable spatial_resampling
 enable multithread
+enable os_support

 [ -d ${source_path}/../include ] && enable alt_tree_layout
 for d in vp8; do
@@ -251,8 +254,8 @@ CONFIG_LIST="
    realtime_only
    shared
    small
-    arm_asm_detok
    postproc_visualizer
+    os_support
 "
 CMDLINE_SELECT="
    extra_warnings
@@ -291,7 +294,6 @@ CMDLINE_SELECT="
    realtime_only
    shared
    small
-    arm_asm_detok
    postproc_visualizer
 "

@@ -300,7 +302,7 @@ process_cmdline() {
        optval="${opt#*=}"
        case "$opt" in
        --disable-codecs) for c in ${CODECS}; do disable $c; done ;;
-        *) process_common_cmdline $opt
+        *) process_common_cmdline "$opt"
        ;;
        esac
    done
--- a/examples.mk
+++ b/examples.mk
@@ -93,8 +93,16 @@ vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame


 # Handle extra library flags depending on codec configuration
-CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m

+# We should not link to math library (libm) on RVCT
+# when building for bare-metal targets
+ifeq ($(CONFIG_OS_SUPPORT), yes)
+CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
+else
+    ifeq ($(CONFIG_GCC), yes)
+    CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
+    endif
+endif
 #
 # End of specified files. The rest of the build rules should happen
 # automagically from here.
--- a/libs.mk
+++ b/libs.mk
@@ -230,10 +230,39 @@ endif
 #
 # Add assembler dependencies for configuration and offsets
 #
-#$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(BUILD_PFX)vpx_asm_offsets.asm
 $(filter %.s.o,$(OBJS-yes)):   $(BUILD_PFX)vpx_config.asm
 $(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm

+#
+# Calculate platform- and compiler-specific offsets for hand coded assembly
+#
+ifeq ($(ARCH_ARM), yes)
+  asm_com_offsets.asm: obj_int_extract
+  asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+  OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
+  CLEAN-OBJS += asm_com_offsets.asm
+  $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
+
+  ifeq ($(CONFIG_VP8_ENCODER), yes)
+    asm_enc_offsets.asm: obj_int_extract
+    asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+    OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+    CLEAN-OBJS += asm_enc_offsets.asm
+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
+  endif
+
+  ifeq ($(CONFIG_VP8_DECODER), yes)
+    asm_dec_offsets.asm: obj_int_extract
+    asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+    OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+    CLEAN-OBJS += asm_dec_offsets.asm
+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
+  endif
+endif
+
 $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
 CLEAN-OBJS += $(BUILD_PFX)vpx_version.h

--- a/md5_utils.c
+++ b/md5_utils.c
@@ -20,8 +20,6 @@
 * Still in the public domain.
 */

-#include <sys/types.h>    /* for stupid systems */
-
 #include <string.h>   /* for memcpy() */

 #include "md5_utils.h"
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -15,19 +15,19 @@
    AREA    |.text|, CODE, READONLY  ; name this block of code

 ;-------------------------------------
-; r0    unsigned char *src_ptr,
-; r1    unsigned short *output_ptr,
-; r2    unsigned int src_pixels_per_line,
-; r3    unsigned int output_height,
-; stack    unsigned int output_width,
-; stack    const short *vp8_filter
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
 ;-------------------------------------
 ; The output is transposed stroed in output array to make it easy for second pass filtering.
 |vp8_filter_block2d_bil_first_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}

    ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width

    mov     r12, r3                         ; outer-loop counter
    sub     r2, r2, r4                      ; src increment for height loop
@@ -38,10 +38,10 @@

    ldr     r5, [r11]                       ; load up filter coefficients

-    mov     r3, r3, lsl #1                  ; output_height*2
+    mov     r3, r3, lsl #1                  ; height*2
    add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)

-    mov     r11, r1                         ; save output_ptr for each row
+    mov     r11, r1                         ; save dst_ptr for each row

    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
    beq     bil_null_1st_filter
@@ -140,17 +140,17 @@

 ;---------------------------------
 ; r0    unsigned short *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    int output_pitch,
-; r3    unsigned int  output_height,
-; stack unsigned int  output_width,
-; stack const short *vp8_filter
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
 ;---------------------------------
 |vp8_filter_block2d_bil_second_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}

    ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width

    ldr     r5, [r11]                       ; load up filter coefficients
    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -243,8 +243,6 @@ skip_secondpass_hloop
    ENDP

 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
 ;One word each is reserved. Label filter_coeff can be used to access the data.
 ;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
 _filter8_coeff_
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -10,128 +10,29 @@


 #include <math.h>
+#include "filter.h"
 #include "subpixel.h"
-
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-static const short bilinear_filters[8][2] =
-{
-    { 128,   0 },
-    { 112,  16 },
-    {  96,  32 },
-    {  80,  48 },
-    {  64,  64 },
-    {  48,  80 },
-    {  32,  96 },
-    {  16, 112 }
-};
-
-
-extern void vp8_filter_block2d_bil_first_pass_armv6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-);
-
-extern void vp8_filter_block2d_bil_second_pass_armv6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-);
-
-#if 0
-void vp8_filter_block2d_bil_first_pass_6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int i, j;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply bilinear filter */
-            output_ptr[j] = ( ( (int)src_ptr[0]          * vp8_filter[0]) +
-                               ((int)src_ptr[1] * vp8_filter[1]) +
-                                (VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-void vp8_filter_block2d_bil_second_pass_6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int  i,j;
-    int  Temp;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply filter */
-            Temp =  ((int)src_ptr[0]         * vp8_filter[0]) +
-                    ((int)src_ptr[output_width] * vp8_filter[1]) +
-                    (VP8_FILTER_WEIGHT/2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
-            src_ptr++;
-        }
-
-        /* Next row... */
-        /*src_ptr    += src_pixels_per_line - output_width;*/
-        output_ptr += output_pitch;
-    }
-}
-#endif
+#include "arm/bilinearfilter_arm.h"

 void vp8_filter_block2d_bil_armv6
 (
    unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
    unsigned int   dst_pitch,
-    const short      *HFilter,
-    const short      *VFilter,
+    const short   *HFilter,
+    const short   *VFilter,
    int            Width,
    int            Height
 )
 {
-
-    unsigned short FData[36*16]; /* Temp data bufffer used in filtering */
+    unsigned short FData[36*16]; /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
-    /* pixel_step = 1; */
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

    /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }


@@ -148,8 +49,8 @@ void vp8_bilinear_predict4x4_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
 }
@@ -167,8 +68,8 @@ void vp8_bilinear_predict8x8_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
 }
@@ -186,8 +87,8 @@ void vp8_bilinear_predict8x4_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
 }
@@ -205,8 +106,8 @@ void vp8_bilinear_predict16x16_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
--- a/vp8/common/arm/bilinearfilter_arm.h
+++ b/vp8/common/arm/bilinearfilter_arm.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+    const unsigned char  *src_ptr,
+    unsigned short       *dst_ptr,
+    unsigned int          src_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+    const unsigned short *src_ptr,
+    unsigned char        *dst_ptr,
+    int                   dst_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short         *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -11,26 +11,10 @@

 #include "vpx_ports/config.h"
 #include <math.h>
+#include "filter.h"
 #include "subpixel.h"
 #include "vpx_ports/mem.h"

-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
-{
-    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
-    { 0, -6,  123,   12,  -1,  0 },
-    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
-    { 0, -9,   93,   50,  -6,  0 },
-    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
-    { 0, -6,   50,   93,  -9,  0 },
-    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
-    { 0, -1,   12,  123,  -6,  0 },
-};
-
-
 extern void vp8_filter_block2d_first_pass_armv6
 (
    unsigned char *src_ptr,
@@ -93,11 +77,11 @@ void vp8_sixtap_predict_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */


-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* Vfilter is null. First pass only */
    if (xoffset && !yoffset)
@@ -129,47 +113,6 @@ void vp8_sixtap_predict_armv6
    }
 }

-#if 0
-void vp8_sixtap_predict8x4_armv6
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const short  *HFilter;
-    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
-
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
-
-
-    /*if (xoffset && !yoffset)
-    {
-        vp8_filter_block2d_first_pass_only_armv6 (  src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
-    }*/
-    /* Hfilter is null. Second pass only */
-    /*else if (!xoffset && yoffset)
-    {
-        vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
-    }
-    else
-    {
-        if (yoffset & 0x1)
-            vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
-        else*/
-
-        vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
-
-        vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
-    /*}*/
-}
-#endif
-
 void vp8_sixtap_predict8x8_armv6
 (
    unsigned char  *src_ptr,
@@ -182,10 +125,10 @@ void vp8_sixtap_predict8x8_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    if (xoffset && !yoffset)
    {
@@ -224,10 +167,10 @@ void vp8_sixtap_predict16x16_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    if (xoffset && !yoffset)
    {
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -41,13 +41,13 @@ void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -57,7 +57,7 @@ void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Vertical MB Filtering */
@@ -65,13 +65,13 @@ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -81,7 +81,7 @@ void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Horizontal B Filtering */
@@ -94,10 +94,10 @@ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -122,10 +122,10 @@ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
    vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -148,10 +148,10 @@ void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
 }

 void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -161,7 +161,7 @@ void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsig
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Vertical MB Filtering */
@@ -169,10 +169,10 @@ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
 }

 void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -182,7 +182,7 @@ void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsig
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Horizontal B Filtering */
@@ -195,7 +195,7 @@ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
+        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
 }

 void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -220,7 +220,7 @@ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
+        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
 }

 void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
--- a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
@@ -350,10 +350,7 @@ filt_blk2d_spo16x16_loop_neon
    ENDP

 ;-----------------
-    AREA    bifilters16_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _bifilter16_coeff_
    DCD     bifilter16_coeff
 bifilter16_coeff
--- a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
@@ -123,10 +123,7 @@ skip_secondpass_filter
    ENDP

 ;-----------------
-    AREA    bilinearfilters4_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _bifilter4_coeff_
    DCD     bifilter4_coeff
 bifilter4_coeff
--- a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -128,10 +128,7 @@ skip_secondpass_filter
    ENDP

 ;-----------------
-    AREA    bifilters8x4_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _bifilter8x4_coeff_
    DCD     bifilter8x4_coeff
 bifilter8x4_coeff
--- a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -176,10 +176,7 @@ skip_secondpass_filter
    ENDP

 ;-----------------
-    AREA    bifilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _bifilter8_coeff_
    DCD     bifilter8_coeff
 bifilter8_coeff
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -397,7 +397,8 @@
    bx          lr
    ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|

-    AREA    loopfilter_dat, DATA, READONLY
+;-----------------
+
 _lf_coeff_
    DCD     lf_coeff
 lf_coeff
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -104,10 +104,7 @@
    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|

 ;-----------------
-    AREA    hloopfiltery_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _lfhy_coeff_
    DCD     lfhy_coeff
 lfhy_coeff
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -145,10 +145,7 @@
    ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|

 ;-----------------
-    AREA    vloopfiltery_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _vlfy_coeff_
    DCD     vlfy_coeff
 vlfy_coeff
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -505,7 +505,8 @@
    bx          lr
    ENDP        ; |vp8_mbloop_filter_neon|

-    AREA    mbloopfilter_dat, DATA, READONLY
+;-----------------
+
 _mblf_coeff_
    DCD     mblf_coeff
 mblf_coeff
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -113,10 +113,7 @@
    ENDP

 ;-----------------
-    AREA    idct4x4_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _idct_coeff_
    DCD     idct_coeff
 idct_coeff
--- a/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -476,10 +476,7 @@ secondpass_only_inner_loop_neon
    ENDP

 ;-----------------
-    AREA    subpelfilters16_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _filter16_coeff_
    DCD     filter16_coeff
 filter16_coeff
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -407,10 +407,7 @@ secondpass_filter4x4_only
    ENDP

 ;-----------------
-    AREA    subpelfilters4_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _filter4_coeff_
    DCD     filter4_coeff
 filter4_coeff
--- a/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
@@ -458,10 +458,7 @@ secondpass_filter8x4_only
    ENDP

 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _filter8_coeff_
    DCD     filter8_coeff
 filter8_coeff
--- a/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
@@ -509,10 +509,7 @@ filt_blk2d_spo8x8_loop_neon
    ENDP

 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _filter8_coeff_
    DCD     filter8_coeff
 filter8_coeff
--- a/vp8/common/asm_com_offsets.c
+++ b/vp8/common/asm_com_offsets.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <stddef.h>
+
+#include "vpx_scale/yv12config.h"
+
+#define ct_assert(name,cond) \
+    static void assert_##name(void) UNUSED;\
+    static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
+
+#define DEFINE(sym, val) int sym = val;
+
+/*
+#define BLANK() asm volatile("\n->" : : )
+*/
+
+/*
+ * int main(void)
+ * {
+ */
+
+//vpx_scale
+DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
+
+//add asserts for any offset that is not supported by assembly code
+//add asserts for any size that is not supported by assembly code
+/*
+ * return 0;
+ * }
+ */
--- a/vp8/common/filter_c.c
+++ b/vp8/common/filter_c.c
@@ -10,13 +10,10 @@


 #include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"

-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-
-static const int bilinear_filters[8][2] =
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
 {
    { 128,   0 },
    { 112,  16 },
@@ -28,8 +25,7 @@ static const int bilinear_filters[8][2] =
    {  16, 112 }
 };

-
-static const short sub_pel_filters[8][6] =
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
 {

    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
@@ -40,9 +36,6 @@ static const short sub_pel_filters[8][6] =
    { 0, -6,   50,   93,  -9,  0 },
    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
    { 0, -1,   12,  123,  -6,  0 },
-
-
-
 };

 void vp8_filter_block2d_first_pass
@@ -146,7 +139,7 @@ void vp8_filter_block2d
    const short  *VFilter
 )
 {
-    int FData[9*4]; /* Temp data bufffer used in filtering */
+    int FData[9*4]; /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
@@ -195,8 +188,8 @@ void vp8_sixtap_predict_c
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
 }
@@ -212,10 +205,10 @@ void vp8_sixtap_predict8x8_c
 {
    const short  *HFilter;
    const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
+    int FData[13*16];   /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
@@ -238,10 +231,10 @@ void vp8_sixtap_predict8x4_c
 {
    const short  *HFilter;
    const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
+    int FData[13*16];   /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
@@ -264,11 +257,11 @@ void vp8_sixtap_predict16x16_c
 {
    const short  *HFilter;
    const short  *VFilter;
-    int FData[21*24];   /* Temp data bufffer used in filtering */
+    int FData[21*24];   /* Temp data buffer used in filtering */


-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
@@ -283,57 +276,50 @@ void vp8_sixtap_predict16x16_c
 *
 *  ROUTINE       : filter_block2d_bil_first_pass
 *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp8_filter : Array of 2 bi-linear filter taps.
 *
- *  OUTPUTS       : INT32 *output_ptr        : Pointer to filtered block.
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement first-pass of 2-D separable filter.
 *
 *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
 *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
 *
 ****************************************************************************/
 void vp8_filter_block2d_bil_first_pass
 (
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const int *vp8_filter
+    unsigned char  *src_ptr,
+    unsigned short *dst_ptr,
+    unsigned int    src_stride,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 )
 {
    unsigned int i, j;

-    for (i = 0; i < output_height; i++)
+    for (i = 0; i < height; i++)
    {
-        for (j = 0; j < output_width; j++)
+        for (j = 0; j < width; j++)
        {
            /* Apply bilinear filter */
-            output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +
-                             ((int)src_ptr[pixel_step] * vp8_filter[1]) +
-                             (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+            dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+                          ((int)src_ptr[1] * vp8_filter[1]) +
+                          (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
            src_ptr++;
        }

        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
+        src_ptr += src_stride - width;
+        dst_ptr += width;
    }
 }

@@ -341,60 +327,51 @@ void vp8_filter_block2d_bil_first_pass
 *
 *  ROUTINE       : filter_block2d_bil_second_pass
 *
- *  INPUTS        : INT32  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *  INPUTS        : UINT16 *src_ptr    : Pointer to source block.
+ *                  INT32   dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp8_filter : Array of 2 bi-linear filter taps.
 *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : UINT8  *dst_ptr    : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement second-pass of 2-D separable filter.
 *
 *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
 *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
 *
 ****************************************************************************/
 void vp8_filter_block2d_bil_second_pass
 (
    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  src_pixels_per_line,
-    unsigned int  pixel_step,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const int *vp8_filter
+    unsigned char  *dst_ptr,
+    int             dst_pitch,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 )
 {
    unsigned int  i, j;
    int  Temp;

-    for (i = 0; i < output_height; i++)
+    for (i = 0; i < height; i++)
    {
-        for (j = 0; j < output_width; j++)
+        for (j = 0; j < width; j++)
        {
            /* Apply filter */
-            Temp = ((int)src_ptr[0]         * vp8_filter[0]) +
-                   ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+            Temp = ((int)src_ptr[0]     * vp8_filter[0]) +
+                   ((int)src_ptr[width] * vp8_filter[1]) +
                   (VP8_FILTER_WEIGHT / 2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+            dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
            src_ptr++;
        }

        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_pitch;
+        dst_ptr += dst_pitch;
    }
 }

@@ -404,11 +381,14 @@ void vp8_filter_block2d_bil_second_pass
 *  ROUTINE       : filter_block2d_bil
 *
 *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  INT32  *HFilter         : Array of 2 horizontal filter taps.
- *                  INT32  *VFilter         : Array of 2 vertical filter taps.
+ *                  UINT32  src_pitch        : Stride of source block.
+ *                  UINT32  dst_pitch        : Stride of destination block.
+ *                  INT16  *HFilter          : Array of 2 horizontal filter taps.
+ *                  INT16  *VFilter          : Array of 2 vertical filter taps.
+ *                  INT32   Width            : Block width.
+ *                  INT32   Height           : Block height.
 *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : UINT8  *dst_ptr          : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
@@ -422,23 +402,23 @@ void vp8_filter_block2d_bil_second_pass
 void vp8_filter_block2d_bil
 (
    unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
    unsigned int   dst_pitch,
-    const int      *HFilter,
-    const int      *VFilter,
+    const short   *HFilter,
+    const short   *VFilter,
    int            Width,
    int            Height
 )
 {

-    unsigned short FData[17*16];    /* Temp data bufffer used in filtering */
+    unsigned short FData[17*16];    /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter);
+    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

    /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter);
+    vp8_filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }


@@ -452,11 +432,11 @@ void vp8_bilinear_predict4x4_c
    int dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 #if 0
    {
        int i;
@@ -490,11 +470,11 @@ void vp8_bilinear_predict8x8_c
    int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);

@@ -510,11 +490,11 @@ void vp8_bilinear_predict8x4_c
    int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);

@@ -530,11 +510,11 @@ void vp8_bilinear_predict16x16_c
    int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
--- a/vp8/decoder/arm/detokenize_arm.h
+++ b/vp8/decoder/arm/detokenize_arm.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -9,14 +9,14 @@
 */


-#ifndef DETOKENIZE_ARM_H
-#define DETOKENIZE_ARM_H
+#ifndef FILTER_H
+#define FILTER_H

-#if HAVE_ARMV6
-#if CONFIG_ARM_ASM_DETOK
-void vp8_init_detokenizer(VP8D_COMP *dx);
-void vp8_decode_mb_tokens_v6(DETOK *detoken, int type);
-#endif
-#endif
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT  7

-#endif
+extern const short vp8_bilinear_filters[8][2];
+extern const short vp8_sub_pel_filters[8][6];
+
+#endif //FILTER_H
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -11,47 +11,9 @@

 #include "findnearmv.h"

-#define FINDNEAR_SEARCH_SITES   3
-
 /* Predict motion vectors using those from already-decoded nearby blocks.
   Note that we only consider one 4x4 subblock from each candidate 16x16
   macroblock.   */
-
-typedef union
-{
-    unsigned int as_int;
-    MV           as_mv;
-} int_mv;        /* facilitates rapid equality tests */
-
-static void mv_bias(const MODE_INFO *x, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
-{
-    MV xmv;
-    xmv = x->mbmi.mv.as_mv;
-
-    if (ref_frame_sign_bias[x->mbmi.ref_frame] != ref_frame_sign_bias[refframe])
-    {
-        xmv.row *= -1;
-        xmv.col *= -1;
-    }
-
-    mvp->as_mv = xmv;
-}
-
-
-void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
-{
-    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
-        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
-    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
-        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
-
-    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
-        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
-    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
-        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
-}
-
-
 void vp8_find_near_mvs
 (
    MACROBLOCKD *xd,
@@ -82,7 +44,7 @@ void vp8_find_near_mvs
        if (above->mbmi.mv.as_int)
        {
            (++mv)->as_int = above->mbmi.mv.as_int;
-            mv_bias(above, refframe, mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
            ++cntx;
        }

@@ -97,7 +59,7 @@ void vp8_find_near_mvs
            int_mv this_mv;

            this_mv.as_int = left->mbmi.mv.as_int;
-            mv_bias(left, refframe, &this_mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);

            if (this_mv.as_int != mv->as_int)
            {
@@ -119,7 +81,7 @@ void vp8_find_near_mvs
            int_mv this_mv;

            this_mv.as_int = aboveleft->mbmi.mv.as_int;
-            mv_bias(aboveleft, refframe, &this_mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);

            if (this_mv.as_int != mv->as_int)
            {
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -17,6 +17,41 @@
 #include "modecont.h"
 #include "treecoder.h"

+typedef union
+{
+    unsigned int as_int;
+    MV           as_mv;
+} int_mv;        /* facilitates rapid equality tests */
+
+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
+{
+    MV xmv;
+    xmv = mvp->as_mv;
+
+    if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
+    {
+        xmv.row *= -1;
+        xmv.col *= -1;
+    }
+
+    mvp->as_mv = xmv;
+}
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+static void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
+{
+    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
 void vp8_find_near_mvs
 (
    MACROBLOCKD *xd,
@@ -35,8 +70,4 @@ const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b);

 const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride);

-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
-
-
 #endif
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -28,13 +28,13 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -44,7 +44,7 @@ void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Vertical MB Filtering */
@@ -52,13 +52,13 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -68,7 +68,7 @@ void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Horizontal B Filtering */
@@ -81,10 +81,10 @@ void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -109,10 +109,10 @@ void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -137,8 +137,6 @@ void vp8_init_loop_filter(VP8_COMMON *cm)

    int block_inside_limit = 0;
    int HEVThresh;
-    const int yhedge_boost  = 2;
-    const int uvhedge_boost = 2;

    /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
    for (i = 0; i <= MAX_LOOP_FILTER; i++)
@@ -182,15 +180,9 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
        for (j = 0; j < 16; j++)
        {
            lfi[i].lim[j] = block_inside_limit;
-            lfi[i].mbflim[j] = filt_lvl + yhedge_boost;
-            lfi[i].mbthr[j] = HEVThresh;
+            lfi[i].mbflim[j] = filt_lvl + 2;
            lfi[i].flim[j] = filt_lvl;
            lfi[i].thr[j] = HEVThresh;
-            lfi[i].uvlim[j] = block_inside_limit;
-            lfi[i].uvmbflim[j] = filt_lvl + uvhedge_boost;
-            lfi[i].uvmbthr[j] = HEVThresh;
-            lfi[i].uvflim[j] = filt_lvl;
-            lfi[i].uvthr[j] = HEVThresh;
        }

    }
@@ -249,57 +241,52 @@ void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
        for (j = 0; j < 16; j++)
        {
            /*lfi[i].lim[j] = block_inside_limit;
-            lfi[i].mbflim[j] = filt_lvl+yhedge_boost;*/
-            lfi[i].mbthr[j] = HEVThresh;
+            lfi[i].mbflim[j] = filt_lvl+2;*/
            /*lfi[i].flim[j] = filt_lvl;*/
            lfi[i].thr[j] = HEVThresh;
-            /*lfi[i].uvlim[j] = block_inside_limit;
-            lfi[i].uvmbflim[j] = filt_lvl+uvhedge_boost;*/
-            lfi[i].uvmbthr[j] = HEVThresh;
-            /*lfi[i].uvflim[j] = filt_lvl;*/
-            lfi[i].uvthr[j] = HEVThresh;
        }
    }
 }


-void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level)
+int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level)
 {
    MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi;

    if (mbd->mode_ref_lf_delta_enabled)
    {
        /* Apply delta for reference frame */
-        *filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
+        filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];

        /* Apply delta for mode */
        if (mbmi->ref_frame == INTRA_FRAME)
        {
            /* Only the split mode BPRED has a further special case */
            if (mbmi->mode == B_PRED)
-                *filter_level +=  mbd->mode_lf_deltas[0];
+                filter_level +=  mbd->mode_lf_deltas[0];
        }
        else
        {
            /* Zero motion mode */
            if (mbmi->mode == ZEROMV)
-                *filter_level +=  mbd->mode_lf_deltas[1];
+                filter_level +=  mbd->mode_lf_deltas[1];

            /* Split MB motion mode */
            else if (mbmi->mode == SPLITMV)
-                *filter_level +=  mbd->mode_lf_deltas[3];
+                filter_level +=  mbd->mode_lf_deltas[3];

            /* All other inter motion modes (Nearest, Near, New) */
            else
-                *filter_level +=  mbd->mode_lf_deltas[2];
+                filter_level +=  mbd->mode_lf_deltas[2];
        }

        /* Range check */
-        if (*filter_level > MAX_LOOP_FILTER)
-            *filter_level = MAX_LOOP_FILTER;
-        else if (*filter_level < 0)
-            *filter_level = 0;
+        if (filter_level > MAX_LOOP_FILTER)
+            filter_level = MAX_LOOP_FILTER;
+        else if (filter_level < 0)
+            filter_level = 0;
    }
+    return filter_level;
 }


@@ -373,7 +360,7 @@ void vp8_loop_filter_frame
             * These specified to 8th pel as they are always compared to values that are in 1/8th pel units
             * Apply any context driven MB level adjustment
             */
-            vp8_adjust_mb_lf_value(mbd, &filter_level);
+            filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);

            if (filter_level)
            {
@@ -473,7 +460,7 @@ void vp8_loop_filter_frame_yonly
            filter_level = baseline_filter_level[Segment];

            /* Apply any context driven MB level adjustment */
-            vp8_adjust_mb_lf_value(mbd, &filter_level);
+            filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);

            if (filter_level)
            {
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -32,12 +32,6 @@ typedef struct
    DECLARE_ALIGNED(16, signed char, flim[16]);
    DECLARE_ALIGNED(16, signed char, thr[16]);
    DECLARE_ALIGNED(16, signed char, mbflim[16]);
-    DECLARE_ALIGNED(16, signed char, mbthr[16]);
-    DECLARE_ALIGNED(16, signed char, uvlim[16]);
-    DECLARE_ALIGNED(16, signed char, uvflim[16]);
-    DECLARE_ALIGNED(16, signed char, uvthr[16]);
-    DECLARE_ALIGNED(16, signed char, uvmbflim[16]);
-    DECLARE_ALIGNED(16, signed char, uvmbthr[16]);
 } loop_filter_info;


--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -200,7 +200,7 @@ typedef struct VP8Common
 } VP8_COMMON;


-void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level);
+int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level);
 void vp8_init_loop_filter(VP8_COMMON *cm);
 void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type);
 extern void vp8_loop_filter_frame(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val);
--- a/vp8/common/ppc/loopfilter_altivec.c
+++ b/vp8/common/ppc/loopfilter_altivec.c
@@ -56,10 +56,10 @@ void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void)simpler_lpf;
-    mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr);
+    mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);

    if (u_ptr)
-        mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr);
+        mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
 }

 void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -77,10 +77,10 @@ void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void)simpler_lpf;
-    mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr);
+    mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);

    if (u_ptr)
-        mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr);
+        mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
 }

 void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -104,7 +104,7 @@ void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha
    loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);

    if (u_ptr)
-        loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr);
+        loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
 }

 void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -127,7 +127,7 @@ void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha
    loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);

    if (u_ptr)
-        loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr);
+        loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
 }

 void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -14,6 +14,8 @@

 #define VPXINFINITE 10000       /* 10second. */

+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
 /* Thread management macros */
 #ifdef _WIN32
 /* Win32 */
@@ -88,4 +90,6 @@
 #define x86_pause_hint()
 #endif

+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
 #endif
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -45,13 +45,13 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }


@@ -62,7 +62,7 @@ void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }


@@ -71,13 +71,13 @@ void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }


@@ -88,7 +88,7 @@ void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }


@@ -102,10 +102,10 @@ void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }


@@ -132,10 +132,10 @@ void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
    vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }


@@ -159,10 +159,10 @@ void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
 }


@@ -173,7 +173,7 @@ void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }


@@ -182,10 +182,10 @@ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
 }


@@ -196,7 +196,7 @@ void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }


@@ -210,7 +210,7 @@ void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
+        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
 }


@@ -237,7 +237,7 @@ void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
    vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
+        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
 }


--- a/vp8/decoder/arm/arm_dsystemdependent.c
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -14,7 +14,6 @@
 #include "blockd.h"
 #include "pragmas.h"
 #include "postproc.h"
-#include "dboolhuff.h"
 #include "dequantize.h"
 #include "onyxd_int.h"

@@ -35,12 +34,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;
        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;
-#if 0 /*For use with RTCD, when implemented*/
-        pbi->dboolhuff.start             = vp8dx_start_decode_c;
-        pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
-        pbi->dboolhuff.debool            = vp8dx_decode_bool_c;
-        pbi->dboolhuff.devalue           = vp8dx_decode_value_c;
-#endif
    }
 #endif

@@ -54,12 +47,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;
        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;
-#if 0 /*For use with RTCD, when implemented*/
-        pbi->dboolhuff.start             = vp8dx_start_decode_c;
-        pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
-        pbi->dboolhuff.debool            = vp8dx_decode_bool_c;
-        pbi->dboolhuff.devalue           = vp8dx_decode_value_c;
-#endif
    }
 #endif
 #endif
--- a/vp8/decoder/arm/armv6/dboolhuff_v6.asm
+++ b/vp8/decoder/arm/armv6/dboolhuff_v6.asm
@@ -1,163 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_decode_value_v6|
-    EXPORT  |vp8dx_start_decode_v6|
-    EXPORT  |vp8dx_stop_decode_v6|
-    EXPORT  |vp8dx_decode_bool_v6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    INCLUDE vpx_asm_offsets.asm
-
-br      RN  r0
-prob    RN  r1
-bits    RN  r1
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;   int z = 0;
-;   int bit;
-;   for ( bit=bits-1; bit>=0; bit-- )
-;   {
-;       z |= (vp8dx_decode_bool(br, 0x80)<<bit);
-;   }
-;   return z;
-
-;int vp8_decode_value_v6 ( BOOL_DECODER *br, int bits )
-|vp8_decode_value_v6| PROC
-    stmdb   sp!, {r4 - r6, lr}
-    mov     r4, br
-    mov     r5, bits
-    mov     r6, #0
-
-    subs    r5, r5, #1
-    bmi     decode_value_exit
-
-decode_value_loop
-    mov     prob, #0x80
-    mov     br, r4
-    bl      vp8dx_decode_bool_v6_internal     ; needed for conversion to s file
-    orr     r6, r6, r0, lsl r5
-    subs    r5, r5, #1
-    bpl     decode_value_loop
-
-decode_value_exit
-    mov     r0, r6
-    ldmia   sp!, {r4 - r6, pc}
-    ENDP    ; |vp8_decode_value_v6|
-
-
-;void vp8dx_start_decode_v6 ( BOOL_DECODER *br, unsigned char *source )
-|vp8dx_start_decode_v6| PROC
-    stmdb   sp!, {r4 - r5, lr}
-    mov     r2, #0
-    mov     r3, #255
-
-    str     r2, [br, #bool_decoder_lowvalue]
-    str     r3, [br, #bool_decoder_range]
-    str     r1, [br, #bool_decoder_buffer]
-
-    mov     r3, #8
-    mov     r2, #4
-    str     r3, [br, #bool_decoder_count]
-    str     r2, [br, #bool_decoder_pos]
-
-    ldrb    r2, [r1, #3]
-    ldrb    r3, [r1, #2]
-    ldrb    r4, [r1, #1]
-    ldrb    r5, [r1]
-
-    orr     r1, r2, r3, lsl #8
-    orr     r1, r1, r4, lsl #16
-    orr     r1, r1, r5, lsl #24
-
-    str     r1, [br, #bool_decoder_value]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ; |vp8dx_start_decode_v6|
-
-
-;void vp8dx_stop_decode_v6 ( BOOL_DECODER *bc );
-|vp8dx_stop_decode_v6| PROC
-    mov     pc, lr
-    ENDP    ; |vp8dx_stop_decode_v6|
-
-
-; bigsplit  RN  r1
-; buffer_v  RN  r1
-; count_v       RN  r4
-; range_v       RN  r2
-; value_v       RN  r3
-; pos_v     RN  r5
-; split     RN  r6
-; bit           RN  lr
-;int vp8dx_decode_bool_v6 ( BOOL_DECODER *br, int probability )
-|vp8dx_decode_bool_v6| PROC
-vp8dx_decode_bool_v6_internal
-    stmdb   sp!, {r4 - r6, lr}
-
-    ldr     r2, [br, #bool_decoder_range]
-    ldr     r3, [br, #bool_decoder_value]
-
-    mov     r6, r2, lsl #8
-    sub     r6, r6, #256                ;   split = 1 +  (((range-1) * probability) >> 8)
-    mov     r12, #1
-    smlawb  r6, r6, prob, r12
-
-    mov     lr, #0
-    subs    r5, r3, r6, lsl #24
-
-    ;cmp        r3, r1
-    movhs   lr, #1
-    movhs   r3, r5
-    subhs   r2, r2, r6
-    movlo   r2, r6
-
-    cmp     r2, #0x80
-    blt     range_less_0x80
-    ;strd   r2, r3, [br, #bool_decoder_range]
-    str     r2, [br, #bool_decoder_range]
-    str     r3, [br, #bool_decoder_value]
-    mov     r0, lr
-    ldmia   sp!, {r4 - r6, pc}
-
-range_less_0x80
-    ldr     r5, [br, #bool_decoder_pos]
-    ldr     r1, [br, #bool_decoder_buffer]
-    ldr     r4, [br, #bool_decoder_count]
-    add     r1, r1, r5
-
-    clz       r12, r2
-    sub       r12, r12, #24
-    subs      r4, r4, r12
-    ldrleb    r6, [r1], #1
-    mov       r2, r2, lsl r12
-    mov       r3, r3, lsl r12
-    addle     r4, r4, #8
-    rsble     r12, r4, #8
-    addle     r5, r5, #1
-    orrle     r3, r3, r6, lsl r12
-
-    ;strd       r2, r3, [br, #bool_decoder_range]
-    ;strd       r4, r5, [br, #bool_decoder_count]
-    str         r2, [br, #bool_decoder_range]
-    str         r3, [br, #bool_decoder_value]
-    str         r4, [br, #bool_decoder_count]
-    str         r5, [br, #bool_decoder_pos]
-
-    mov     r0, lr
-
-    ldmia   sp!, {r4 - r6, pc}
-    ENDP    ; |vp8dx_decode_bool_v6|
-
-    END
--- a/vp8/decoder/arm/dboolhuff_arm.h
+++ b/vp8/decoder/arm/dboolhuff_arm.h
@@ -1,43 +0,0 @@
-#ifndef DBOOLHUFF_ARM_H
-#define DBOOLHUFF_ARM_H
-
-/* JLK
- * There are currently no arm-optimized versions of
- * these functions. As they are implemented, they
- * can be uncommented below and added to
- * arm/dsystemdependent.c
- *
- * The existing asm code is likely so different as
- * to be useless. However, its been left (for now)
- * for reference.
- */
-#if 0
-#if HAVE_ARMV6
-#undef vp8_dbool_start
-#define vp8_dbool_start vp8dx_start_decode_v6
-
-#undef vp8_dbool_fill
-#define vp8_dbool_fill vp8_bool_decoder_fill_v6
-
-#undef vp8_dbool_debool
-#define vp8_dbool_debool vp8_decode_bool_v6
-
-#undef vp8_dbool_devalue
-#define vp8_dbool_devalue vp8_decode_value_v6
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-#undef vp8_dbool_start
-#define vp8_dbool_start vp8dx_start_decode_neon
-
-#undef vp8_dbool_fill
-#define vp8_dbool_fill vp8_bool_decoder_fill_neon
-
-#undef vp8_dbool_debool
-#define vp8_dbool_debool vp8_decode_bool_neon
-
-#undef vp8_dbool_devalue
-#define vp8_dbool_devalue vp8_decode_value_neon
-#endif /* HAVE_ARMV7 */
-#endif
-#endif /* DBOOLHUFF_ARM_H */
--- a/vp8/decoder/arm/detokenize.asm
+++ b/vp8/decoder/arm/detokenize.asm
@@ -1,320 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_decode_mb_tokens_v6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-    INCLUDE vpx_asm_offsets.asm
-
-l_qcoeff    EQU     0
-l_i         EQU     4
-l_type      EQU     8
-l_stop      EQU     12
-l_c         EQU     16
-l_l_ptr     EQU     20
-l_a_ptr     EQU     24
-l_bc        EQU     28
-l_coef_ptr  EQU     32
-l_stacksize EQU     64
-
-
-;; constant offsets -- these should be created at build time
-c_block2above_offset         EQU 25
-c_entropy_nodes              EQU 11
-c_dct_eob_token              EQU 11
-
-|vp8_decode_mb_tokens_v6| PROC
-    stmdb       sp!, {r4 - r11, lr}
-    sub         sp, sp, #l_stacksize
-    mov         r7, r1                      ; type
-    mov         r9, r0                      ; detoken
-
-    ldr         r1, [r9, #detok_current_bc]
-    ldr         r0, [r9, #detok_qcoeff_start_ptr]
-    mov         r11, #0                     ; i
-    mov         r3, #16                     ; stop
-
-    cmp         r7, #1                      ; type ?= 1
-    addeq       r11, r11, #24               ; i = 24
-    addeq       r3, r3, #8                  ; stop = 24
-    addeq       r0, r0, #3, 24              ; qcoefptr += 24*16
-
-    str         r0, [sp, #l_qcoeff]
-    str         r11, [sp, #l_i]
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-    str         r1, [sp, #l_bc]
-
-    add         lr, r9, r7, lsl #2          ; detoken + type*4
-
-    ldr         r8, [r1, #bool_decoder_user_buffer]
-
-    ldr         r10, [lr, #detok_coef_probs]
-    ldr         r5, [r1, #bool_decoder_count]
-    ldr         r6, [r1, #bool_decoder_range]
-    ldr         r4, [r1, #bool_decoder_value]
-
-    str         r10, [sp, #l_coef_ptr]
-
-BLOCK_LOOP
-    ldr         r3, [r9, #detok_ptr_block2leftabove]
-    ldr         r1, [r9, #detok_L]
-    ldr         r2, [r9, #detok_A]
-    ldrb        r12, [r3, r11]!             ; block2left[i]
-    ldrb        r3, [r3, #c_block2above_offset]; block2above[i]
-
-    cmp         r7, #0                      ; c = !type
-    moveq       r7, #1
-    movne       r7, #0
-
-    ldrb        r0, [r1, r12]!              ; *(L += block2left[i])
-    ldrb        r3, [r2, r3]!               ; *(A += block2above[i])
-    mov         lr, #c_entropy_nodes        ; ENTROPY_NODES = 11
-
-; VP8_COMBINEENTROPYCONTETEXTS(t, *a, *l) => t = ((*a) != 0) + ((*l) !=0)
-    cmp         r0, #0                      ; *l ?= 0
-    movne       r0, #1
-    cmp         r3, #0                      ; *a ?= 0
-    addne       r0, r0, #1                  ; t
-
-    str         r1, [sp, #l_l_ptr]          ; save &l
-    str         r2, [sp, #l_a_ptr]          ; save &a
-    smlabb      r0, r0, lr, r10             ; Prob = coef_probs + (t * ENTROPY_NODES)
-    mov         r1, #0                      ; t = 0
-    str         r7, [sp, #l_c]
-
-    ;align 4
-COEFF_LOOP
-    ldr         r3, [r9, #detok_ptr_coef_bands_x]
-    ldr         lr, [r9, #detok_coef_tree_ptr]
-    ;STALL
-    ldrb        r3, [r3, r7]                ; coef_bands_x[c]
-    ;STALL
-    ;STALL
-    add         r0, r0, r3                  ; Prob += coef_bands_x[c]
-
-get_token_loop
-    ldrb        r2, [r0, +r1, asr #1]       ; Prob[t >> 1]
-    mov         r3, r6, lsl #8              ; range << 8
-    sub         r3, r3, #256                ; (range << 8) - (1 << 8)
-    mov         r10, #1                     ; 1
-
-    smlawb      r2, r3, r2, r10             ; split = 1 + (((range-1) * probability) >> 8)
-
-    ldrb        r12, [r8]                   ; load cx data byte in stall slot : r8 = bufptr
-    ;++
-
-    subs        r3, r4, r2, lsl #24         ; value-(split<<24): used later to calculate shift for NORMALIZE
-    addhs       r1, r1, #1                  ; t += 1
-    movhs       r4, r3                      ; value -= bigsplit (split << 24)
-    subhs       r2, r6, r2                  ; range -= split
- ;   movlo       r6, r2                      ; range = split
-
-    ldrsb     r1, [lr, r1]                  ; t = onyx_coef_tree_ptr[t]
-
-; NORMALIZE
-    clz         r3, r2                      ; vp8dx_bitreader_norm[range] + 24
-    sub         r3, r3, #24                 ; vp8dx_bitreader_norm[range]
-    subs        r5, r5, r3                  ; count -= shift
-    mov         r6, r2, lsl r3              ; range <<= shift
-    mov         r4, r4, lsl r3              ; value <<= shift
-
-; if count <= 0, += BR_COUNT; value |= *bufptr++ << (BR_COUNT-count); BR_COUNT = 8, but need to upshift values by +16
-    addle         r5, r5, #8                ; count += 8
-    rsble         r3, r5, #24               ; 24 - count
-    addle         r8, r8, #1                ; bufptr++
-    orrle         r4, r4, r12, lsl r3       ; value |= *bufptr << shift + 16
-
-    cmp         r1, #0                      ; t ?= 0
-    bgt         get_token_loop              ; while (t > 0)
-
-    cmn         r1, #c_dct_eob_token        ; if(t == -DCT_EOB_TOKEN)
-    beq         END_OF_BLOCK                ; break
-
-    rsb         lr, r1, #0                  ; v = -t;
-
-    cmp         lr, #4                      ; if(v > FOUR_TOKEN)
-    ble         SKIP_EXTRABITS
-
-    ldr         r3, [r9, #detok_teb_base_ptr]
-    mov         r11, #1                     ; 1 in split = 1 + ... nope, v+= 1 << bits_count
-    add         r7, r3, lr, lsl #4          ; detok_teb_base_ptr + (v << 4)
-
-    ldrsh       lr, [r7, #tokenextrabits_min_val] ; v = teb_ptr->min_val
-    ldrsh       r0, [r7, #tokenextrabits_length] ; bits_count = teb_ptr->Length
-
-extrabits_loop
-    add         r3, r0, r7                  ; &teb_ptr->Probs[bits_count]
-
-    ldrb        r2, [r3, #4]                ; probability. why +4?
-    mov         r3, r6, lsl #8              ; range << 8
-    sub         r3, r3, #256                ; range << 8 + 1 << 8
-
-    smlawb      r2, r3, r2, r11             ; split = 1 +  (((range-1) * probability) >> 8)
-
-    ldrb        r12, [r8]                   ; *bufptr
-    ;++
-
-    subs        r10, r4, r2, lsl #24        ; value - (split<<24)
-    movhs       r4, r10                     ; value = value - (split << 24)
-    subhs       r2, r6, r2                  ; range = range - split
-    addhs       lr, lr, r11, lsl r0         ; v += ((UINT16)1<<bits_count)
-
-; NORMALIZE
-    clz         r3, r2                      ; shift - leading zeros in split
-    sub         r3, r3, #24                 ; don't count first 3 bytes
-    subs        r5, r5, r3                  ; count -= shift
-    mov         r6, r2, lsl r3              ; range = range << shift
-    mov         r4, r4, lsl r3              ; value <<= shift
-
-    addle       r5, r5, #8                  ; count += BR_COUNT
-    addle       r8, r8, #1                  ; bufptr++
-    rsble       r3, r5, #24                 ; BR_COUNT - count
-    orrle       r4, r4, r12, lsl r3         ; value |= *bufptr << (BR_COUNT - count)
-
-    subs        r0, r0, #1                  ; bits_count --
-    bpl         extrabits_loop
-
-
-SKIP_EXTRABITS
-    ldr         r11, [sp, #l_qcoeff]
-    ldr         r0, [sp, #l_coef_ptr]       ; Prob = coef_probs
-
-    cmp         r1, #0                      ; check for nonzero token - if (t)
-    beq         SKIP_EOB_CHECK              ; if t is zero, we will skip the eob table chec
-
-    add         r3, r6, #1                  ; range + 1
-    mov         r2, r3, lsr #1              ; split = (range + 1) >> 1
-
-    subs        r3, r4, r2, lsl #24         ; value - (split<<24)
-    movhs       r4, r3                      ; value -= (split << 24)
-    subhs       r2, r6, r2                  ; range -= split
-    mvnhs       r3, lr                      ; -v
-    addhs       lr, r3, #1                  ; v = (v ^ -1) + 1
-
-; NORMALIZE
-    clz         r3, r2                      ; leading 0s in split
-    sub         r3, r3, #24                 ; shift
-    subs        r5, r5, r3                  ; count -= shift
-    mov         r6, r2, lsl r3              ; range <<= shift
-    mov         r4, r4, lsl r3              ; value <<= shift
-    ldrleb      r2, [r8], #1                ; *(bufptr++)
-    addle       r5, r5, #8                  ; count += 8
-    rsble       r3, r5, #24                 ; BR_COUNT - count
-    orrle       r4, r4, r2, lsl r3          ; value |= *bufptr << (BR_COUNT - count)
-
-    add         r0, r0, #11                 ; Prob += ENTROPY_NODES (11)
-
-    cmn         r1, #1                      ; t < -ONE_TOKEN
-
-    addlt       r0, r0, #11                 ; Prob += ENTROPY_NODES (11)
-
-    mvn         r1, #1                      ; t = -1 ???? C is -2
-
-SKIP_EOB_CHECK
-    ldr         r7, [sp, #l_c]              ; c
-    ldr         r3, [r9, #detok_scan]
-    add         r1, r1, #2                  ; t+= 2
-    cmp         r7, #15                     ; c should will be one higher
-
-    ldr         r3, [r3, +r7, lsl #2]       ; scan[c] this needs pre-inc c value
-    add         r7, r7, #1                  ; c++
-    add         r3, r11, r3, lsl #1         ; qcoeff + scan[c]
-
-    str         r7, [sp, #l_c]              ; store c
-    strh        lr, [r3]                    ; qcoef_ptr[scan[c]] = v
-
-    blt         COEFF_LOOP
-
-    sub         r7, r7, #1                  ; if(t != -DCT_EOB_TOKEN) --c
-
-END_OF_BLOCK
-    ldr         r3, [sp, #l_type]           ; type
-    ldr         r10, [sp, #l_coef_ptr]      ; coef_ptr
-    ldr         r0, [sp, #l_qcoeff]         ; qcoeff
-    ldr         r11, [sp, #l_i]             ; i
-    ldr         r12, [sp, #l_stop]          ; stop
-
-    cmp         r3, #0                      ; type ?= 0
-    moveq       r1, #1
-    movne       r1, #0
-    add         r3, r11, r9                 ; detok + i
-
-    cmp         r7, r1                      ; c ?= !type
-    strb        r7, [r3, #detok_eob]        ; eob[i] = c
-
-    ldr         r7, [sp, #l_l_ptr]          ; l
-    ldr         r2, [sp, #l_a_ptr]          ; a
-    movne       r3, #1                      ; t
-    moveq       r3, #0
-
-    add         r0, r0, #32                 ; qcoeff += 32 (16 * 2?)
-    add         r11, r11, #1                ; i++
-    strb        r3, [r7]                    ; *l = t
-    strb        r3, [r2]                    ; *a = t
-    str         r0, [sp, #l_qcoeff]         ; qcoeff
-    str         r11, [sp, #l_i]             ; i
-
-    cmp         r11, r12                    ; i < stop
-    ldr         r7, [sp, #l_type]           ; type
-
-    blt         BLOCK_LOOP
-
-    cmp         r11, #25                    ; i ?= 25
-    bne         ln2_decode_mb_to
-
-    ldr         r12, [r9, #detok_qcoeff_start_ptr]
-    ldr         r10, [r9, #detok_coef_probs]
-    mov         r7, #0                      ; type/i = 0
-    mov         r3, #16                     ; stop = 16
-    str         r12, [sp, #l_qcoeff]        ; qcoeff_ptr = qcoeff_start_ptr
-    str         r7, [sp, #l_i]
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-
-    str         r10, [sp, #l_coef_ptr]      ; coef_probs = coef_probs[type=0]
-
-    b           BLOCK_LOOP
-
-ln2_decode_mb_to
-    cmp         r11, #16                    ; i ?= 16
-    bne         ln1_decode_mb_to
-
-    mov         r10, #detok_coef_probs
-    add         r10, r10, #2*4              ; coef_probs[type]
-    ldr         r10, [r9, r10]              ; detok + detok_coef_probs[type]
-
-    mov         r7, #2                      ; type = 2
-    mov         r3, #24                     ; stop = 24
-
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-
-    str         r10, [sp, #l_coef_ptr]      ; coef_probs = coef_probs[type]
-    b           BLOCK_LOOP
-
-ln1_decode_mb_to
-    ldr         r2, [sp, #l_bc]
-    mov         r0, #0
-    nop
-
-    str         r8, [r2, #bool_decoder_user_buffer]
-    str         r5, [r2, #bool_decoder_count]
-    str         r4, [r2, #bool_decoder_value]
-    str         r6, [r2, #bool_decoder_range]
-
-    add         sp, sp, #l_stacksize
-    ldmia       sp!, {r4 - r11, pc}
-
-    ENDP  ; |vp8_decode_mb_tokens_v6|
-
-    END
--- a/vp8/decoder/arm/neon/dboolhuff_neon.asm
+++ b/vp8/decoder/arm/neon/dboolhuff_neon.asm
@@ -1,160 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_decode_value_neon|
-    EXPORT  |vp8dx_start_decode_neon|
-    EXPORT  |vp8dx_stop_decode_neon|
-    EXPORT  |vp8dx_decode_bool_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    INCLUDE vpx_asm_offsets.asm
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;   int z = 0;
-;   int bit;
-;   for ( bit=bits-1; bit>=0; bit-- )
-;   {
-;       z |= (vp8dx_decode_bool(br, 0x80)<<bit);
-;   }
-;   return z;
-
-;int vp8_decode_value_neon ( BOOL_DECODER *br, int bits )
-|vp8_decode_value_neon| PROC
-    stmdb   sp!, {r4 - r6, lr}
-    mov     r4, r0
-    mov     r5, r1
-    mov     r6, #0
-
-    subs    r5, r5, #1
-    bmi     decode_value_exit
-
-decode_value_loop
-    mov     r1, #0x80
-    mov     r0, r4
-    bl      vp8dx_decode_bool_neon_internal       ; needed for conversion to s file
-    orr     r6, r6, r0, lsl r5
-    subs    r5, r5, #1
-    bpl     decode_value_loop
-
-decode_value_exit
-    mov     r0, r6
-    ldmia   sp!, {r4 - r6, pc}
-    ENDP    ; |vp8_decode_value_neon|
-
-
-;void vp8dx_start_decode_neon ( BOOL_DECODER *br, unsigned char *source )
-|vp8dx_start_decode_neon| PROC
-    stmdb   sp!, {r4 - r5, lr}
-    mov     r2, #0
-    mov     r3, #255
-
-    str     r2, [r0, #bool_decoder_lowvalue]
-    str     r3, [r0, #bool_decoder_range]
-    str     r1, [r0, #bool_decoder_buffer]
-
-    mov     r3, #8
-    mov     r2, #4
-    str     r3, [r0, #bool_decoder_count]
-    str     r2, [r0, #bool_decoder_pos]
-
-    ldrb    r2, [r1, #3]
-    ldrb    r3, [r1, #2]
-    ldrb    r4, [r1, #1]
-    ldrb    r5, [r1]
-
-    orr     r1, r2, r3, lsl #8
-    orr     r1, r1, r4, lsl #16
-    orr     r1, r1, r5, lsl #24
-
-    str     r1, [r0, #bool_decoder_value]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ; |vp8dx_start_decode_neon|
-
-
-;void vp8dx_stop_decode_neon ( BOOL_DECODER *bc );
-|vp8dx_stop_decode_neon| PROC
-    mov     pc, lr
-    ENDP    ; |vp8dx_stop_decode_neon|
-
-
-; bigsplit  RN  r1
-; buffer_v  RN  r1
-; count_v       RN  r4
-; range_v       RN  r2
-; value_v       RN  r3
-; pos_v     RN  r5
-; split     RN  r6
-; bit           RN  lr
-;int vp8dx_decode_bool_neon ( BOOL_DECODER *br, int probability )
-|vp8dx_decode_bool_neon| PROC
-vp8dx_decode_bool_neon_internal
-;LDRD and STRD doubleword data transfers must be eight-byte aligned. Use ALIGN 8
-;before memory allocation
-    stmdb   sp!, {r4 - r5, lr}
-
-    ldr     r2, [r0, #bool_decoder_range]       ;load range (r2), value(r3)
-    ldr     r3, [r0, #bool_decoder_value]
-    ;ldrd   r2, r3, [r0, #bool_decoder_range]   ;ldrd costs 2 cycles
-    ;
-
-    mov     r4, r2, lsl #8
-    sub     r4, r4, #256
-    mov     r12, #1
-
-    smlawb  r4, r4, r1, r12         ;split = 1 +  (((range-1) * probability) >> 8)
-
-    mov     lr, r0
-    mov     r0, #0                  ;bit = 0
-    ;
-    subs    r5, r3, r4, lsl #24
-
-    subhs   r2, r2, r4              ;range = br->range-split
-    movlo   r2, r4                  ;range = split
-    movhs   r0, #1                  ;bit = 1
-    movhs   r3, r5                  ;value = value-bigsplit
-
-    cmp     r2, #0x80
-    blt     range_less_0x80
-    strd    r2, r3, [lr, #bool_decoder_range]   ;store result
-
-    ldmia   sp!, {r4 - r5, pc}
-
-range_less_0x80
-
-    ldrd    r4, r5, [lr, #bool_decoder_count]   ;load count, pos, buffer
-    ldr     r1, [lr, #bool_decoder_buffer]
-
-    clz     r12, r2
-    add     r1, r1, r5
-
-    sub     r12, r12, #24
-    subs    r4, r4, r12             ;count -= shift
-    mov     r2, r2, lsl r12         ;range <<= shift
-    mov     r3, r3, lsl r12         ;value <<= shift
-    addle   r4, r4, #8              ;count += 8
-    ldrleb  r12, [r1], #1           ;br->buffer[br->pos]
-
-    rsble   r1, r4, #8              ;-count
-    addle   r5, r5, #1              ;br->pos++
-    orrle   r3, r3, r12, lsl r1     ;value |= (br->buffer[br->pos]) << (-count)
-
-    strd    r2, r3, [lr, #bool_decoder_range]   ;store result
-    strd    r4, r5, [lr, #bool_decoder_count]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ; |vp8dx_decode_bool_neon|
-
-    END
--- a/vp8/common/arm/vpx_asm_offsets.c
+++ b/vp8/common/arm/vpx_asm_offsets.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -12,13 +12,7 @@
 #include "vpx_ports/config.h"
 #include <stddef.h>

-#if CONFIG_VP8_ENCODER
-#include "vpx_scale/yv12config.h"
-#endif
-
-#if CONFIG_VP8_DECODER
 #include "onyxd_int.h"
-#endif

 #define DEFINE(sym, val) int sym = val;

@@ -31,29 +25,6 @@
 * {
 */

-#if CONFIG_VP8_DECODER || CONFIG_VP8_ENCODER
-DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
-#endif
-
-#if CONFIG_VP8_DECODER
-DEFINE(mb_diff,                                 offsetof(MACROBLOCKD, diff));
-DEFINE(mb_predictor,                            offsetof(MACROBLOCKD, predictor));
-DEFINE(mb_dst_y_stride,                         offsetof(MACROBLOCKD, dst.y_stride));
-DEFINE(mb_dst_y_buffer,                         offsetof(MACROBLOCKD, dst.y_buffer));
-DEFINE(mb_dst_u_buffer,                         offsetof(MACROBLOCKD, dst.u_buffer));
-DEFINE(mb_dst_v_buffer,                         offsetof(MACROBLOCKD, dst.v_buffer));
-DEFINE(mb_up_available,                         offsetof(MACROBLOCKD, up_available));
-DEFINE(mb_left_available,                       offsetof(MACROBLOCKD, left_available));
-
 DEFINE(detok_scan,                              offsetof(DETOK, scan));
 DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
 DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp8_coef_tree_ptr));
@@ -77,7 +48,6 @@ DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));

 DEFINE(tokenextrabits_min_val,                  offsetof(TOKENEXTRABITS, min_val));
 DEFINE(tokenextrabits_length,                   offsetof(TOKENEXTRABITS, Length));
-#endif

 //add asserts for any offset that is not supported by assembly code
 //add asserts for any size that is not supported by assembly code
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c
@@ -26,8 +26,9 @@ DECLARE_ALIGNED(16, const unsigned char, vp8dx_bitreader_norm[256]) =
 };


-int vp8dx_start_decode_c(BOOL_DECODER *br, const unsigned char *source,
-                        unsigned int source_sz)
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz)
 {
    br->user_buffer_end = source+source_sz;
    br->user_buffer     = source;
@@ -39,13 +40,13 @@ int vp8dx_start_decode_c(BOOL_DECODER *br, const unsigned char *source,
        return 1;

    /* Populate the buffer */
-    vp8dx_bool_decoder_fill_c(br);
+    vp8dx_bool_decoder_fill(br);

    return 0;
 }


-void vp8dx_bool_decoder_fill_c(BOOL_DECODER *br)
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
 {
    const unsigned char *bufptr;
    const unsigned char *bufend;
@@ -62,69 +63,3 @@ void vp8dx_bool_decoder_fill_c(BOOL_DECODER *br)
    br->value = value;
    br->count = count;
 }
-
-#if 0
-/*
- * Until optimized versions of these functions are available, we
- * keep the implementation in the header to allow inlining.
- *
- * The RTCD-style invocations are still in place so this can
- * be switched by just uncommenting these functions here and
- * the DBOOLHUFF_INVOKE calls in the header.
- */
-int vp8dx_decode_bool_c(BOOL_DECODER *br, int probability)
-{
-    unsigned int bit=0;
-    VP8_BD_VALUE value;
-    unsigned int split;
-    VP8_BD_VALUE bigsplit;
-    int count;
-    unsigned int range;
-
-    value = br->value;
-    count = br->count;
-    range = br->range;
-
-    split = 1 + (((range-1) * probability) >> 8);
-    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
-
-    range = split;
-    if(value >= bigsplit)
-    {
-        range = br->range-split;
-        value = value-bigsplit;
-        bit = 1;
-    }
-
-    /*if(range>=0x80)
-    {
-        br->value = value;
-        br->range = range;
-        return bit;
-    }*/
-
-    {
-        register unsigned int shift = vp8dx_bitreader_norm[range];
-        range <<= shift;
-        value <<= shift;
-        count -= shift;
-    }
-    br->value = value;
-    br->count = count;
-    br->range = range;
-    if (count < 0)
-        vp8dx_bool_decoder_fill_c(br);
-    return bit;
-}
-
-int vp8dx_decode_value_c(BOOL_DECODER *br, int bits)
-{
-    int z = 0;
-    int bit;
-    for ( bit=bits-1; bit>=0; bit-- )
-    {
-        z |= (vp8dx_decode_bool(br, 0x80)<<bit);
-    }
-    return z;
-}
-#endif
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -25,10 +25,6 @@ typedef size_t VP8_BD_VALUE;
  Even relatively modest values like 100 would work fine.*/
 # define VP8_LOTS_OF_BITS (0x40000000)

-
-
-struct vp8_dboolhuff_rtcd_vtable;
-
 typedef struct
 {
    const unsigned char *user_buffer_end;
@@ -36,82 +32,15 @@ typedef struct
    VP8_BD_VALUE         value;
    int                  count;
    unsigned int         range;
-#if CONFIG_RUNTIME_CPU_DETECT
-    struct vp8_dboolhuff_rtcd_vtable *rtcd;
-#endif
 } BOOL_DECODER;

-#define prototype_dbool_start(sym) int sym(BOOL_DECODER *br, \
-    const unsigned char *source, unsigned int source_sz)
-#define prototype_dbool_fill(sym) void sym(BOOL_DECODER *br)
-#define prototype_dbool_debool(sym) int sym(BOOL_DECODER *br, int probability)
-#define prototype_dbool_devalue(sym) int sym(BOOL_DECODER *br, int bits)
-
-#if ARCH_ARM
-#include "arm/dboolhuff_arm.h"
-#endif
-
-#ifndef vp8_dbool_start
-#define vp8_dbool_start vp8dx_start_decode_c
-#endif
-
-#ifndef vp8_dbool_fill
-#define vp8_dbool_fill vp8dx_bool_decoder_fill_c
-#endif
-
-#ifndef vp8_dbool_debool
-#define vp8_dbool_debool vp8dx_decode_bool_c
-#endif
-
-#ifndef vp8_dbool_devalue
-#define vp8_dbool_devalue vp8dx_decode_value_c
-#endif
-
-extern prototype_dbool_start(vp8_dbool_start);
-extern prototype_dbool_fill(vp8_dbool_fill);
-extern prototype_dbool_debool(vp8_dbool_debool);
-extern prototype_dbool_devalue(vp8_dbool_devalue);
-
-typedef prototype_dbool_start((*vp8_dbool_start_fn_t));
-typedef prototype_dbool_fill((*vp8_dbool_fill_fn_t));
-typedef prototype_dbool_debool((*vp8_dbool_debool_fn_t));
-typedef prototype_dbool_devalue((*vp8_dbool_devalue_fn_t));
-
-typedef struct vp8_dboolhuff_rtcd_vtable {
-    vp8_dbool_start_fn_t   start;
-    vp8_dbool_fill_fn_t    fill;
-    vp8_dbool_debool_fn_t  debool;
-    vp8_dbool_devalue_fn_t devalue;
-} vp8_dboolhuff_rtcd_vtable_t;
-
-/* There are no processor-specific versions of these
- * functions right now. Disable RTCD to avoid using
- * function pointers which gives a speed boost
- */
-/*#ifdef ENABLE_RUNTIME_CPU_DETECT
-#define DBOOLHUFF_INVOKE(ctx,fn) (ctx)->fn
-#define IF_RTCD(x) (x)
-#else*/
-#define DBOOLHUFF_INVOKE(ctx,fn) vp8_dbool_##fn
-#define IF_RTCD(x) NULL
-/*#endif*/
-
 DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);

-/* wrapper functions to hide RTCD. static means inline means hopefully no
- * penalty
- */
-static int vp8dx_start_decode(BOOL_DECODER *br,
-        struct vp8_dboolhuff_rtcd_vtable *rtcd,
-        const unsigned char *source, unsigned int source_sz) {
-#if CONFIG_RUNTIME_CPU_DETECT
-    br->rtcd = rtcd;
-#endif
-    return DBOOLHUFF_INVOKE(rtcd, start)(br, source, source_sz);
-}
-static void vp8dx_bool_decoder_fill(BOOL_DECODER *br) {
-    DBOOLHUFF_INVOKE(br->rtcd, fill)(br);
-}
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz);
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br);

 /*The refill loop is used in several places, so define it in a macro to make
   sure they're all consistent.
@@ -138,12 +67,6 @@ static void vp8dx_bool_decoder_fill(BOOL_DECODER *br) {


 static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
-  /*
-   * Until optimized versions of this function are available, we
-   * keep the implementation in the header to allow inlining.
-   *
-   *return DBOOLHUFF_INVOKE(br->rtcd, debool)(br, probability);
-   */
    unsigned int bit = 0;
    VP8_BD_VALUE value;
    unsigned int split;
@@ -167,13 +90,6 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
        bit = 1;
    }

-    /*if(range>=0x80)
-    {
-        br->value = value;
-        br->range = range;
-        return bit
-    }*/
-
    {
        register unsigned int shift = vp8dx_bitreader_norm[range];
        range <<= shift;
@@ -190,12 +106,6 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {

 static int vp8_decode_value(BOOL_DECODER *br, int bits)
 {
-  /*
-   * Until optimized versions of this function are available, we
-   * keep the implementation in the header to allow inlining.
-   *
-   *return DBOOLHUFF_INVOKE(br->rtcd, devalue)(br, bits);
-   */
    int z = 0;
    int bit;

--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -475,8 +475,7 @@ static void setup_token_decoder(VP8D_COMP *pbi,
                               "Truncated packet or corrupt partition "
                               "%d length", i + 1);

-        if (vp8dx_start_decode(bool_decoder, IF_RTCD(&pbi->dboolhuff),
-                               partition, partition_size))
+        if (vp8dx_start_decode(bool_decoder, partition, partition_size))
            vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                               "Failed to allocate bool decoder %d", i + 1);

@@ -485,9 +484,11 @@ static void setup_token_decoder(VP8D_COMP *pbi,
        bool_decoder++;
    }

+#if CONFIG_MULTITHREAD
    /* Clamp number of decoder threads */
    if (pbi->decoding_thread_count > num_part - 1)
        pbi->decoding_thread_count = num_part - 1;
+#endif
 }


@@ -651,8 +652,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)

    init_frame(pbi);

-    if (vp8dx_start_decode(bc, IF_RTCD(&pbi->dboolhuff),
-                           data, data_end - data))
+    if (vp8dx_start_decode(bc, data, data_end - data))
        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                           "Failed to allocate bool decoder 0");
    if (pc->frame_type == KEY_FRAME) {
@@ -846,7 +846,9 @@ int vp8_decode_frame(VP8D_COMP *pbi)
    vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));

    /* set up frame new frame for intra coded blocks */
+#if CONFIG_MULTITHREAD
    if (!(pbi->b_multithreaded_rd) || pc->multi_token_partition == ONE_PARTITION || !(pc->filter_level))
+#endif
        vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);

    vp8_setup_block_dptrs(xd);
@@ -866,6 +868,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)

    vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));

+#if CONFIG_MULTITHREAD
    if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
    {
        vp8mt_decode_mb_rows(pbi, xd);
@@ -880,6 +883,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
        vp8_yv12_extend_frame_borders_ptr(&pc->yv12_fb[pc->new_fb_idx]);    /*cm->frame_to_show);*/
    }
    else
+#endif
    {
        int ibc = 0;
        int num_part = 1 << pc->multi_token_partition;
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -74,37 +74,6 @@ void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
    }
 }

-#if CONFIG_ARM_ASM_DETOK
-/* mashup of vp8_block2left and vp8_block2above so we only need one pointer
- * for the assembly version.
- */
-DECLARE_ALIGNED(16, const UINT8, vp8_block2leftabove[25*2]) =
-{
-    /* vp8_block2left */
-    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-    /* vp8_block2above */
-    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
-};
-
-void vp8_init_detokenizer(VP8D_COMP *dx)
-{
-    const VP8_COMMON *const oc = & dx->common;
-    MACROBLOCKD *x = & dx->mb;
-
-    dx->detoken.vp8_coef_tree_ptr = vp8_coef_tree;
-    dx->detoken.ptr_block2leftabove = vp8_block2leftabove;
-    dx->detoken.ptr_coef_bands_x = vp8_coef_bands_x;
-    dx->detoken.scan = vp8_default_zig_zag1d;
-    dx->detoken.teb_base_ptr = vp8d_token_extra_bits2;
-    dx->detoken.qcoeff_start_ptr = &x->qcoeff[0];
-
-    dx->detoken.coef_probs[0] = (oc->fc.coef_probs [0] [ 0 ] [0]);
-    dx->detoken.coef_probs[1] = (oc->fc.coef_probs [1] [ 0 ] [0]);
-    dx->detoken.coef_probs[2] = (oc->fc.coef_probs [2] [ 0 ] [0]);
-    dx->detoken.coef_probs[3] = (oc->fc.coef_probs [3] [ 0 ] [0]);
-}
-#endif
-
 DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
 #define FILL \
    if(count < 0) \
@@ -202,35 +171,6 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
    }\
    NORMALIZE

-#if CONFIG_ARM_ASM_DETOK
-int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
-{
-    int eobtotal = 0;
-    int i, type;
-
-    dx->detoken.current_bc = x->current_bc;
-    dx->detoken.A = x->above_context;
-    dx->detoken.L = x->left_context;
-
-    type = 3;
-
-    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        type = 1;
-        eobtotal -= 16;
-    }
-
-    vp8_decode_mb_tokens_v6(&dx->detoken, type);
-
-    for (i = 0; i < 25; i++)
-    {
-        x->eobs[i] = dx->detoken.eob[i];
-        eobtotal += dx->detoken.eob[i];
-    }
-
-    return eobtotal;
-}
-#else
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
    ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
@@ -423,4 +363,3 @@ BLOCK_FINISHED:
    return eobtotal;

 }
-#endif /*!CONFIG_ASM_DETOK*/
--- a/vp8/decoder/detokenize.h
+++ b/vp8/decoder/detokenize.h
@@ -14,10 +14,6 @@

 #include "onyxd_int.h"

-#if ARCH_ARM
-#include "arm/detokenize_arm.h"
-#endif
-
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
 int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);

--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -27,12 +27,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
    pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
    pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_c;
    pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_c;
-    pbi->dboolhuff.start             = vp8dx_start_decode_c;
-    pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
-#if 0 /*For use with RTCD, when implemented*/
-    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
-    pbi->dboolhuff.devalue = vp8dx_decode_value_c;
-#endif
 #endif

 #if ARCH_X86 || ARCH_X86_64
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -114,8 +114,10 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
    pbi->ready_for_new_data = 1;

    pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/
+#if CONFIG_MULTITHREAD
    pbi->max_threads = oxcf->max_threads;
    vp8_decoder_create_threads(pbi);
+#endif

    /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
     *  unnecessary calling of vp8cx_init_de_quantizer() for every frame.
@@ -131,9 +133,6 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
        cm->last_sharpness_level = cm->sharpness_level;
    }

-#if CONFIG_ARM_ASM_DETOK
-    vp8_init_detokenizer(pbi);
-#endif
    pbi->common.error.setjmp = 0;
    return (VP8D_PTR) pbi;
 }
@@ -149,8 +148,8 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
 #if CONFIG_MULTITHREAD
    if (pbi->b_multithreaded_rd)
        vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
-#endif
    vp8_decoder_remove_threads(pbi);
+#endif
    vp8_remove_common(&pbi->common);
    vpx_free(pbi);
 }
@@ -407,6 +406,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
        return retcode;
    }

+#if CONFIG_MULTITHREAD
    if (pbi->b_multithreaded_rd && cm->multi_token_partition != ONE_PARTITION)
    {
        if (swap_frame_buffers (cm))
@@ -424,6 +424,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
            return -1;
        }
    } else
+#endif
    {
        if (swap_frame_buffers (cm))
        {
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -87,14 +87,15 @@ typedef struct VP8Decompressor
    unsigned int time_decoding;
    unsigned int time_loop_filtering;

+#if CONFIG_MULTITHREAD
+    /* variables for threading */
+
    volatile int b_multithreaded_rd;
    int max_threads;
    int current_mb_col_main;
    int decoding_thread_count;
    int allocated_decoding_thread_count;

-    /* variable for threading */
-#if CONFIG_MULTITHREAD
    int mt_baseline_filter_level[MAX_MB_SEGMENTS];
    int sync_range;
    int *mt_current_mb_col;                  /* Each row remembers its already decoded column. */
@@ -125,7 +126,6 @@ typedef struct VP8Decompressor

 #if CONFIG_RUNTIME_CPU_DETECT
    vp8_dequant_rtcd_vtable_t        dequant;
-    struct vp8_dboolhuff_rtcd_vtable dboolhuff;
 #endif


--- a/vp8/decoder/reconintra_mt.c
+++ b/vp8/decoder/reconintra_mt.c
@@ -21,7 +21,6 @@

 void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */
    unsigned char *yleft_col;
    unsigned char yleft_buf[16];
@@ -146,17 +145,10 @@ void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row
    case MB_MODE_COUNT:
        break;
    }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }

 void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */
    unsigned char *yleft_col;
    unsigned char yleft_buf[16];
@@ -289,17 +281,10 @@ void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_r
    case MB_MODE_COUNT:
        break;
    }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }

 void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    unsigned char *uabove_row;   /* = x->dst.u_buffer - x->dst.uv_stride; */
    unsigned char *uleft_col;    /*[16];*/
    unsigned char uleft_buf[8];
@@ -452,17 +437,10 @@ void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_ro
    case MB_MODE_COUNT:
        break;
    }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }

 void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    unsigned char *uabove_row;  /* = x->dst.u_buffer - x->dst.uv_stride; */
    unsigned char *uleft_col;   /*[16];*/
    unsigned char uleft_buf[8];
@@ -621,12 +599,6 @@ void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_
    case MB_MODE_COUNT:
        break;
    }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }


@@ -638,7 +610,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
                          int mb_col,
                          int num)
 {
-#if CONFIG_MULTITHREAD
    int i, r, c;

    unsigned char *Above;   /* = *(x->base_dst) + x->dst - x->dst_stride; */
@@ -935,15 +906,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,


    }
-#else
-    (void) pbi;
-    (void) xd;
-    (void) b_mode;
-    (void) predictor;
-    (void) mb_row;
-    (void) mb_col;
-    (void) num;
-#endif
 }

 /* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
@@ -951,7 +913,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
 */
 void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    unsigned char *above_right;   /* = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; */
    unsigned int *src_ptr;
    unsigned int *dst_ptr0;
@@ -973,10 +934,4 @@ void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row
    *dst_ptr0 = *src_ptr;
    *dst_ptr1 = *src_ptr;
    *dst_ptr2 = *src_ptr;
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -9,7 +9,7 @@
 */


-#ifndef WIN32
+#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
 # include <unistd.h>
 #endif
 #ifdef __APPLE__
@@ -38,7 +38,6 @@ extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);

 void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
-#if CONFIG_MULTITHREAD
    VP8_COMMON *const pc = & pbi->common;
    int i, j;

@@ -88,18 +87,11 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC

    for (i=0; i< pc->mb_rows; i++)
        pbi->mt_current_mb_col[i]=-1;
-#else
-    (void) pbi;
-    (void) xd;
-    (void) mbrd;
-    (void) count;
-#endif
 }


 void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    int eobtotal = 0;
    int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
    VP8_COMMON *pc = &pbi->common;
@@ -222,18 +214,11 @@ void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb
                    (xd->qcoeff+16*16, xd->block[16].dequant,
                     xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
                     xd->dst.uv_stride, xd->eobs+16);
-#else
-    (void) pbi;
-    (void) xd;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }


 THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
 {
-#if CONFIG_MULTITHREAD
    int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
    VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
    MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
@@ -320,7 +305,7 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
                             * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
                             * Apply any context driven MB level adjustment
                             */
-                            vp8_adjust_mb_lf_value(xd, &filter_level);
+                            filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
                        }

                        /* Distance of Mb to the various image edges.
@@ -438,9 +423,6 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
            sem_post(&pbi->h_event_end_decoding);
        }
    }
-#else
-    (void) p_data;
-#endif

    return 0 ;
 }
@@ -448,7 +430,6 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)

 void vp8_decoder_create_threads(VP8D_COMP *pbi)
 {
-#if CONFIG_MULTITHREAD
    int core_count = 0;
    int ithread;

@@ -482,16 +463,11 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)

        pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
    }
-
-#else
-    (void) pbi;
-#endif
 }


 void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
 {
-#if CONFIG_MULTITHREAD
    VP8_COMMON *const pc = & pbi->common;
    int i;

@@ -589,15 +565,11 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
            pbi->mt_vleft_col = NULL ;
        }
    }
-#else
-    (void) pbi;
-#endif
 }


 void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
 {
-#if CONFIG_MULTITHREAD
    VP8_COMMON *const pc = & pbi->common;
    int i;
    int uv_width;
@@ -646,17 +618,11 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
        for (i=0; i< pc->mb_rows; i++)
            CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
    }
-#else
-    (void) pbi;
-    (void) width;
-#endif
 }


 void vp8_decoder_remove_threads(VP8D_COMP *pbi)
 {
-#if CONFIG_MULTITHREAD
-
    /* shutdown MB Decoding thread; */
    if (pbi->b_multithreaded_rd)
    {
@@ -702,15 +668,11 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
            pbi->de_thread_data = NULL;
        }
    }
-#else
-    (void) pbi;
-#endif
 }


 void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
 {
-#if CONFIG_MULTITHREAD
    VP8_COMMON *cm  = &pbi->common;
    MACROBLOCKD *mbd = &pbi->mb;
    /*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/  /*frame_to_show;*/
@@ -752,16 +714,11 @@ void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
        vp8_init_loop_filter(cm);
    else if (frame_type != cm->last_frame_type)
        vp8_frame_init_loop_filter(lfi, frame_type);
-#else
-    (void) pbi;
-    (void) default_filt_lvl;
-#endif
 }


 void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
-#if CONFIG_MULTITHREAD
    int mb_row;
    VP8_COMMON *pc = &pbi->common;

@@ -863,7 +820,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
                     * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
                     * Apply any context driven MB level adjustment
                     */
-                    vp8_adjust_mb_lf_value(xd, &filter_level);
+                    filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
                }

                /* Distance of Mb to the various image edges.
@@ -981,8 +938,4 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
    }

    sem_wait(&pbi->h_event_end_decoding);   /* add back for each frame */
-#else
-    (void) pbi;
-    (void) xd;
-#endif
 }
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -38,14 +38,14 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
        cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
        cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
-        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
-        cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;*/
+        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;*/
+        cpi->rtcd.variance.var16x16              = vp8_variance16x16_armv6;

        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
        cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
-        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
-        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;*/
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;*/
+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_armv6;

        /*cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
        cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -14,7 +14,7 @@
    EXPORT |vp8_stop_encode|
    EXPORT |vp8_encode_value|

-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm

    ARM
    REQUIRE8
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -11,7 +11,7 @@

    EXPORT |vp8cx_pack_tokens_armv5|

-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm

    ARM
    REQUIRE8
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -11,7 +11,7 @@

    EXPORT |vp8cx_pack_mb_row_tokens_armv5|

-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm

    ARM
    REQUIRE8
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -11,7 +11,7 @@

    EXPORT |vp8cx_pack_tokens_into_partitions_armv5|

-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm

    ARM
    REQUIRE8
@@ -65,6 +65,8 @@
 numparts_loop
    ldr     r10, [sp, #40]              ; ptr
    ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
+    sub     r5, r5, r11                 ; move start point with each partition
+                                        ; mb_rows starts at i
    str     r5,  [sp, #12]

    ; Reset all of the VP8 Writer data for each partition that
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -0,0 +1,147 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_armv6| PROC
+    ; Returns sse - ((sum * sum) >> 8); the raw sse is also stored through the stack argument.
+    stmfd   sp!, {r4-r12, lr}
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0x0]      ; load 4 src pixels
+    ldr     r5, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+    ; usub8 sets the per-byte GE flags; sel then keeps a byte of the difference only where GE is set
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum (flags are not needed here, so plain add/sub as in the other groups)
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #0x4]      ; load 4 src pixels
+    ldr     r5, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #0x8]      ; load 4 src pixels
+    ldr     r5, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #0xc]      ; load 4 src pixels
+    ldr     r5, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+    ; |sum| can reach 16*16*255 = 65280, so sum * sum may exceed 2^31: shift it as unsigned (LSR, not ASR)
+    ; return stuff
+    ldr     r6, [sp, #0x28]     ; get address of sse (above the 10 registers pushed on entry)
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, LSR #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
--- a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
+++ b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
@@ -112,10 +112,7 @@
    ENDP

 ;-----------------
-    AREA    fastfdct_dat, DATA, READONLY
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _ffdct_coeff_
    DCD     ffdct_coeff
 ffdct_coeff
--- a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
+++ b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
@@ -165,10 +165,7 @@
    ENDP

 ;-----------------
-    AREA    fastfdct8x4_dat, DATA, READONLY
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _ffdct8_coeff_
    DCD     ffdct8_coeff
 ffdct8_coeff
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ b/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -122,10 +122,7 @@
    ENDP

 ;-----------------
-    AREA    dct4x4_dat, DATA, READONLY
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _dct_matrix_
    DCD     dct_matrix
 dct_matrix
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -9,7 +9,7 @@
 ;


-    EXPORT  |vp8_sub_pixel_variance16x16_neon|
+    EXPORT  |vp8_sub_pixel_variance16x16_neon_func|
    ARM
    REQUIRE8
    PRESERVE8
@@ -24,7 +24,7 @@
 ; stack(r6) unsigned int *sse
 ;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.

-|vp8_sub_pixel_variance16x16_neon| PROC
+|vp8_sub_pixel_variance16x16_neon_func| PROC
    push            {r4-r6, lr}

    ldr             r12, _BilinearTaps_coeff_
@@ -416,10 +416,7 @@ sub_pixel_variance16x16_neon_loop
    ENDP

 ;-----------------
-    AREA    vp8e_bilinear_taps_dat, DATA, READWRITE          ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _BilinearTaps_coeff_
    DCD     bilinear_taps_coeff
 bilinear_taps_coeff
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -215,10 +215,7 @@ sub_pixel_variance8x8_neon_loop
    ENDP

 ;-----------------
-    AREA    bilinear_taps_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _BilinearTaps_coeff_
    DCD     bilinear_taps_coeff
 bilinear_taps_coeff
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "variance.h"
+#include "filter.h"
+#include "arm/bilinearfilter_arm.h"
+
+#if HAVE_ARMV6
+
+/* Sub-pixel 16x16 variance: run the two ARMv6 bilinear passes on the source
+ * block, then hand the filtered result to vp8_variance16x16_armv6(). */
+unsigned int vp8_sub_pixel_variance16x16_armv6
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    /* Filter taps are looked up by offset: xoffset for pass 1, yoffset for pass 2. */
+    const short *taps_x = vp8_bilinear_filters[xoffset];
+    const short *taps_y = vp8_bilinear_filters[yoffset];
+    unsigned short pass1_out[36*16];
+    unsigned char  pass2_out[20*16];
+
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, pass1_out, src_pixels_per_line,
+                                            17, 16, taps_x);
+    vp8_filter_block2d_bil_second_pass_armv6(pass1_out, pass2_out, 16, 16, 16,
+                                             taps_y);
+
+    return vp8_variance16x16_armv6(pass2_out, 16, dst_ptr, dst_pixels_per_line,
+                                   sse);
+}
+
+#endif
+
+#if HAVE_ARMV7
+
+/* Offsets of 4 are routed to the halfpixvar kernels; everything else goes to the generic asm routine. */
+unsigned int vp8_sub_pixel_variance16x16_neon
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    if (xoffset == 4 && yoffset == 4)
+        return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+    if (xoffset == 4 && yoffset == 0)
+        return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+    if (xoffset == 0 && yoffset == 4)
+        return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+    return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#endif
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -12,6 +12,23 @@
 #ifndef VARIANCE_ARM_H
 #define VARIANCE_ARM_H

+#if HAVE_ARMV6
+
+extern prototype_variance(vp8_variance16x16_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
+
+#undef  vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
 #if HAVE_ARMV7
 extern prototype_sad(vp8_sad4x4_neon);
 extern prototype_sad(vp8_sad8x8_neon);
@@ -30,6 +47,7 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_neon);
 //extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c);
 //extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c);
 extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon_func);
 extern prototype_variance(vp8_variance_halfpixvar16x16_h_neon);
 extern prototype_variance(vp8_variance_halfpixvar16x16_v_neon);
 extern prototype_variance(vp8_variance_halfpixvar16x16_hv_neon);
--- a/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
+++ b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -12,9 +12,9 @@
 #include "vpx_ports/config.h"
 #include <stddef.h>

-#include "../treewriter.h"
-#include "../tokenize.h"
-#include "../onyx_int.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "onyx_int.h"

 #define ct_assert(name,cond) \
    static void assert_##name(void) UNUSED;\
@@ -31,6 +31,7 @@
 * {
 */

+//pack tokens
 DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));
 DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));
 DEFINE(vp8_writer_value,                        offsetof(vp8_writer, value));
@@ -40,19 +41,19 @@ DEFINE(vp8_writer_buffer,                       offsetof(vp8_writer, buffer));

 DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
 DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
-DEFINE(tokenextra_context_tree,                  offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_context_tree,                 offsetof(TOKENEXTRA, context_tree));
 DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
 DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));

-DEFINE(vp8_extra_bit_struct_sz,                   sizeof(vp8_extra_bit_struct));
+DEFINE(vp8_extra_bit_struct_sz,                 sizeof(vp8_extra_bit_struct));

 DEFINE(vp8_token_value,                         offsetof(vp8_token, value));
 DEFINE(vp8_token_len,                           offsetof(vp8_token, Len));

-DEFINE(vp8_extra_bit_struct_tree,                 offsetof(vp8_extra_bit_struct, tree));
-DEFINE(vp8_extra_bit_struct_prob,                 offsetof(vp8_extra_bit_struct, prob));
-DEFINE(vp8_extra_bit_struct_len,                  offsetof(vp8_extra_bit_struct, Len));
-DEFINE(vp8_extra_bit_struct_base_val,              offsetof(vp8_extra_bit_struct, base_val));
+DEFINE(vp8_extra_bit_struct_tree,               offsetof(vp8_extra_bit_struct, tree));
+DEFINE(vp8_extra_bit_struct_prob,               offsetof(vp8_extra_bit_struct, prob));
+DEFINE(vp8_extra_bit_struct_len,                offsetof(vp8_extra_bit_struct, Len));
+DEFINE(vp8_extra_bit_struct_base_val,           offsetof(vp8_extra_bit_struct, base_val));

 DEFINE(vp8_comp_tplist,                         offsetof(VP8_COMP, tplist));
 DEFINE(vp8_comp_common,                         offsetof(VP8_COMP, common));
@@ -62,12 +63,14 @@ DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
 DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
 DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));

-DEFINE(vp8_common_mb_rows,                       offsetof(VP8_COMMON, mb_rows));
+DEFINE(vp8_common_mb_rows,                      offsetof(VP8_COMMON, mb_rows));

-// These two sizes are used in vp7cx_pack_tokens.  They are hard coded
-//  so if the size changes this will have to be adjusted.
+// These two sizes are used in vp8cx_pack_tokens.  They are hard coded
+// so if the size changes this will have to be adjusted.
+#if HAVE_ARMV5TE
 ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
 ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
+#endif

 //add asserts for any offset that is not supported by assembly code
 //add asserts for any size that is not supported by assembly code
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1654,10 +1654,12 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
    {
        vp8_start_encode(&cpi->bc2, cx_data + bc->pos);

-        if (!cpi->b_multi_threaded)
-            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
-        else
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded)
            pack_mb_row_tokens(cpi, &cpi->bc2);
+        else
+#endif
+            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);

        vp8_stop_encode(&cpi->bc2);
        oh.first_partition_length_in_bytes = cpi->bc.pos ;
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -112,6 +112,7 @@ typedef struct

    unsigned int token_costs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];
    int optimize;
+    int q_index;

    void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
    void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -365,6 +365,33 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
    x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
    x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
    x->block[24].zbin_extra = (short)zbin_extra;
+
+    /* save this macroblock QIndex for vp8_update_zbin_extra() */
+    x->q_index = QIndex;
+}
+void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x)
+{
+    /* Recompute zbin_extra for every block from the QIndex cached in
+     * x->q_index: blocks 0-15 use Y1dequant, 16-23 UVdequant, 24 Y2dequant. */
+    const int q = x->q_index;
+    int extra_y, extra_uv, extra_y2;
+    int blk;
+
+    extra_y = (cpi->common.Y1dequant[q][1] *
+               (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
+    extra_uv = (cpi->common.UVdequant[q][1] *
+                (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
+
+    /* Y2 takes only half of zbin_over_quant. */
+    extra_y2 = (cpi->common.Y2dequant[q][1] *
+                ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
+
+    for (blk = 0; blk < 16; blk++)
+        x->block[blk].zbin_extra = (short)extra_y;
+    for (blk = 16; blk < 24; blk++)
+        x->block[blk].zbin_extra = (short)extra_uv;
+
+    x->block[24].zbin_extra = (short)extra_y2;
 }

 void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
@@ -372,13 +399,6 @@ void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
    // Clear Zbin mode boost for default case
    cpi->zbin_mode_boost = 0;

-    // vp8cx_init_quantizer() is first called in vp8_create_compressor(). A check is added here so that vp8cx_init_quantizer() is only called
-    // when these values are not all zero.
-    if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q)
-    {
-        vp8cx_init_quantizer(cpi);
-    }
-
    // MB level quantizer setup
    vp8cx_mb_init_quantizer(cpi, &cpi->mb);
 }
@@ -460,6 +480,16 @@ void encode_mb_row(VP8_COMP *cpi,
    int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
    int seg_map_index = (mb_row * cpi->common.mb_cols);

+#if CONFIG_MULTITHREAD
+    const int nsync = cpi->mt_sync_range;
+    const int rightmost_col = cm->mb_cols - 1;
+    volatile const int *last_row_current_mb_col;
+
+    if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
+        last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
+    else
+        last_row_current_mb_col = &rightmost_col;
+#endif

    // reset above block coeffs
    xd->above_context = cm->above_context;
@@ -505,6 +535,21 @@ void encode_mb_row(VP8_COMP *cpi,
        x->rddiv = cpi->RDDIV;
        x->rdmult = cpi->RDMULT;

+#if CONFIG_MULTITHREAD
+        if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
+        {
+            if ((mb_col & (nsync - 1)) == 0)
+            {
+                while (mb_col > (*last_row_current_mb_col - nsync)
+                        && (*last_row_current_mb_col) != (cm->mb_cols - 1))
+                {
+                    x86_pause_hint();
+                    thread_sleep(0);
+                }
+            }
+        }
+#endif
+
        if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
            activity_sum += vp8_activity_masking(cpi, x);

@@ -601,7 +646,12 @@ void encode_mb_row(VP8_COMP *cpi,
        x->partition_info++;

        xd->above_context++;
-        cpi->current_mb_col_main = mb_col;
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded != 0)
+        {
+            cpi->mt_current_mb_col[mb_row] = mb_col;
+        }
+#endif
    }

    //extend the recon for intra prediction
@@ -615,12 +665,15 @@ void encode_mb_row(VP8_COMP *cpi,
    xd->mode_info_context++;
    x->partition_info++;
    x->activity_sum += activity_sum;
+
+#if CONFIG_MULTITHREAD
+    if ((cpi->b_multi_threaded != 0) && (mb_row == cm->mb_rows - 1))
+    {
+        sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+    }
+#endif
 }

-
-
-
-
 void vp8_encode_frame(VP8_COMP *cpi)
 {
    int mb_row;
@@ -747,7 +800,76 @@ void vp8_encode_frame(VP8_COMP *cpi)
        struct vpx_usec_timer  emr_timer;
        vpx_usec_timer_start(&emr_timer);

-        if (!cpi->b_multi_threaded)
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded)
+        {
+            int i;
+
+            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);
+
+            for (i = 0; i < cm->mb_rows; i++)
+                cpi->mt_current_mb_col[i] = 0;
+
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                sem_post(&cpi->h_event_start_encoding[i]);
+            }
+
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
+            {
+                vp8_zero(cm->left_context)
+
+                tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+
+                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+                // adjust to the next row of mbs
+                x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+                x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+                x->src.v_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+                x->partition_info  += xd->mode_info_stride * cpi->encoding_thread_count;
+
+            }
+
+            sem_wait(&cpi->h_event_end_encoding); /* wait for other threads to finish */
+
+            cpi->tok_count = 0;
+
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+            {
+                cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
+            }
+
+            if (xd->segmentation_enabled)
+            {
+                int i, j;
+
+                if (xd->segmentation_enabled)
+                {
+
+                    for (i = 0; i < cpi->encoding_thread_count; i++)
+                    {
+                        for (j = 0; j < 4; j++)
+                            segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
+                    }
+                }
+            }
+
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                totalrate += cpi->mb_row_ei[i].totalrate;
+            }
+
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;
+            }
+
+        }
+        else
+#endif
        {
            // for each macroblock row in image
            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
@@ -765,100 +887,6 @@ void vp8_encode_frame(VP8_COMP *cpi)

            cpi->tok_count = tp - cpi->tok;

-        }
-        else
-        {
-#if CONFIG_MULTITHREAD
-            int i;
-
-            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);
-
-            for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
-            {
-                cpi->current_mb_col_main = -1;
-
-                for (i = 0; i < cpi->encoding_thread_count; i++)
-                {
-                    if ((mb_row + i + 1) >= cm->mb_rows)
-                        break;
-
-                    cpi->mb_row_ei[i].mb_row = mb_row + i + 1;
-                    cpi->mb_row_ei[i].tp  = cpi->tok + (mb_row + i + 1) * (cm->mb_cols * 16 * 24);
-                    cpi->mb_row_ei[i].current_mb_col = -1;
-                    //SetEvent(cpi->h_event_mbrencoding[i]);
-                    sem_post(&cpi->h_event_mbrencoding[i]);
-                }
-
-                vp8_zero(cm->left_context)
-
-                tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
-
-                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
-
-                // adjust to the next row of mbs
-                x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
-                x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
-                x->src.v_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
-
-                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
-                x->partition_info  += xd->mode_info_stride * cpi->encoding_thread_count;
-
-                if (mb_row < cm->mb_rows - 1)
-                    //WaitForSingleObject(cpi->h_event_main, INFINITE);
-                    sem_wait(&cpi->h_event_main);
-            }
-
-            /*
-            for( ;mb_row<cm->mb_rows; mb_row ++)
-            {
-            vp8_zero( cm->left_context)
-
-            tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
-
-            encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
-            // adjust to the next row of mbs
-            x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-            x->src.u_buffer +=  8 * x->src.uv_stride - 8 * cm->mb_cols;
-            x->src.v_buffer +=  8 * x->src.uv_stride - 8 * cm->mb_cols;
-
-            }
-            */
-            cpi->tok_count = 0;
-
-            for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
-            {
-                cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
-            }
-
-            if (xd->segmentation_enabled)
-            {
-
-                int i, j;
-
-                if (xd->segmentation_enabled)
-                {
-
-                    for (i = 0; i < cpi->encoding_thread_count; i++)
-                    {
-                        for (j = 0; j < 4; j++)
-                            segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
-                    }
-                }
-
-            }
-
-            for (i = 0; i < cpi->encoding_thread_count; i++)
-            {
-                totalrate += cpi->mb_row_ei[i].totalrate;
-            }
-
-            for (i = 0; i < cpi->encoding_thread_count; i++)
-            {
-                x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;
-            }
-
-#endif
-
        }

        vpx_usec_timer_mark(&emr_timer);
@@ -1120,77 +1148,41 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
 int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 {
    int Error4x4, Error16x16, error_uv;
-    B_PREDICTION_MODE intra_bmodes[16];
    int rate4x4, rate16x16, rateuv;
    int dist4x4, dist16x16, distuv;
    int rate = 0;
    int rate4x4_tokenonly = 0;
    int rate16x16_tokenonly = 0;
    int rateuv_tokenonly = 0;
-    int i;

    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

 #if !(CONFIG_REALTIME_ONLY)
-
-    if (cpi->sf.RD || cpi->compressor_speed != 2)
+    if (cpi->sf.RD && cpi->compressor_speed != 2)
    {
-        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4);
-
-        //save the b modes for possible later use
-        for (i = 0; i < 16; i++)
-            intra_bmodes[i] = x->e_mbd.block[i].bmi.mode;
+        error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+        rate += rateuv;

        Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16);

-        error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4, Error16x16);

-        vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-        rate += rateuv;
-
-        if (Error4x4 < Error16x16)
-        {
-            rate += rate4x4;
-            x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
-
-            // get back the intra block modes
-            for (i = 0; i < 16; i++)
-                x->e_mbd.block[i].bmi.mode = intra_bmodes[i];
-
-            vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-            cpi->prediction_error += Error4x4 ;
-#if 0
-            // Experimental RD code
-            cpi->frame_distortion += dist4x4;
-#endif
-        }
-        else
-        {
-            vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-            rate += rate16x16;
-
-#if 0
-            // Experimental RD code
-            cpi->prediction_error += Error16x16;
-            cpi->frame_distortion += dist16x16;
-#endif
-        }
-
-        sum_intra_stats(cpi, x);
-
-        vp8_tokenize_mb(cpi, &x->e_mbd, t);
+        rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16;
    }
    else
 #endif
    {
-
-        int rate2, distortion2;
+        int rate2, best_distortion;
        MB_PREDICTION_MODE mode, best_mode = DC_PRED;
        int this_rd;
        Error16x16 = INT_MAX;

+        vp8_pick_intra_mbuv_mode(x);
+
        for (mode = DC_PRED; mode <= TM_PRED; mode ++)
        {
+            int distortion2;
+
            x->e_mbd.mode_info_context->mbmi.mode = mode;
            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
@@ -1201,35 +1193,28 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
            {
                Error16x16 = this_rd;
                best_mode = mode;
+                best_distortion = distortion2;
            }
        }
+        x->e_mbd.mode_info_context->mbmi.mode = best_mode;

-        vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &distortion2);
-
-        if (distortion2 == INT_MAX)
-            Error4x4 = INT_MAX;
-        else
-            Error4x4 = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
-
-        if (Error4x4 < Error16x16)
-        {
-            x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
-            vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-            cpi->prediction_error += Error4x4;
-        }
-        else
-        {
-            x->e_mbd.mode_info_context->mbmi.mode = best_mode;
-            vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-            cpi->prediction_error += Error16x16;
-        }
-
-        vp8_pick_intra_mbuv_mode(x);
-        vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-        sum_intra_stats(cpi, x);
-        vp8_tokenize_mb(cpi, &x->e_mbd, t);
+        Error4x4 = vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &best_distortion);
    }

+    if (Error4x4 < Error16x16)
+    {
+        x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
+        vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+    }
+    else
+    {
+        vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+    }
+
+    vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+    sum_intra_stats(cpi, x);
+    vp8_tokenize_mb(cpi, &x->e_mbd, t);
+
    return rate;
 }
 #ifdef SPEEDSTATS
@@ -1261,10 +1246,17 @@ int vp8cx_encode_inter_macroblock

    if (cpi->sf.RD)
    {
+        int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
+
        /* Are we using the fast quantizer for the mode selection? */
        if(cpi->sf.use_fastquant_for_pick)
+        {
            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);

+            /* the fast quantizer does not use zbin_extra, so
+             * do not recalculate */
+            cpi->zbin_mode_boost_enabled = 0;
+        }
        inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);

        /* switch back to the regular quantizer for the encode */
@@ -1273,6 +1265,9 @@ int vp8cx_encode_inter_macroblock
            cpi->mb.quantize_b    = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
        }

+        /* restore cpi->zbin_mode_boost_enabled */
+        cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
+
    }
    else
 #endif
@@ -1289,7 +1284,7 @@ int vp8cx_encode_inter_macroblock
 #endif

    // MB level adjutment to quantizer setup
-    if (xd->segmentation_enabled || cpi->zbin_mode_boost_enabled)
+    if (xd->segmentation_enabled)
    {
        // If cyclic update enabled
        if (cpi->cyclic_refresh_mode_enabled)
@@ -1299,9 +1294,14 @@ int vp8cx_encode_inter_macroblock
                ((xd->mode_info_context->mbmi.ref_frame != LAST_FRAME) || (xd->mode_info_context->mbmi.mode != ZEROMV)))
            {
                xd->mode_info_context->mbmi.segment_id = 0;
+
+                /* segment_id changed, so update */
+                vp8cx_mb_init_quantizer(cpi, x);
            }
        }
+    }

+    {
        // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to supress noise
        if (cpi->zbin_mode_boost_enabled)
        {
@@ -1325,7 +1325,7 @@ int vp8cx_encode_inter_macroblock
        else
            cpi->zbin_mode_boost = 0;

-        vp8cx_mb_init_quantizer(cpi,  x);
+        vp8_update_zbin_extra(cpi, x);
    }

    cpi->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -58,21 +58,6 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK
    RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
 }

-void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
-{
-    vp8_predict_intra4x4(b, best_mode, b->predictor);
-
-    ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
-
-    x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
-
-    x->quantize_b(be, b);
-
-    IDCT_INVOKE(&rtcd->common->idct, idct16)(b->dqcoeff, b->diff, 32);
-
-    RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-
 void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
 {
    int i;
@@ -144,51 +129,6 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    }
 }

-void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
-{
-    int b;
-
-    vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
-
-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
-
-    vp8_transform_intra_mby(x);
-
-    vp8_quantize_mby(x);
-
-    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-
-    RECON_INVOKE(&rtcd->common->recon, recon_mby)
-        (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
-
-    // make sure block modes are set the way we want them for context updates
-    for (b = 0; b < 16; b++)
-    {
-        BLOCKD *d = &x->e_mbd.block[b];
-
-        switch (x->e_mbd.mode_info_context->mbmi.mode)
-        {
-
-        case DC_PRED:
-            d->bmi.mode = B_DC_PRED;
-            break;
-        case V_PRED:
-            d->bmi.mode = B_VE_PRED;
-            break;
-        case H_PRED:
-            d->bmi.mode = B_HE_PRED;
-            break;
-        case TM_PRED:
-            d->bmi.mode = B_TM_PRED;
-            break;
-        default:
-            d->bmi.mode = B_DC_PRED;
-            break;
-
-        }
-    }
-}
-
 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
    vp8_build_intra_predictors_mbuv(&x->e_mbd);
@@ -213,17 +153,3 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
 }

-void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
-{
-    vp8_build_intra_predictors_mbuv(&x->e_mbd);
-
-    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
-
-    vp8_transform_mbuv(x);
-
-    vp8_quantize_mbuv(x);
-
-    vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-
-    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
-}
--- a/vp8/encoder/encodeintra.h
+++ b/vp8/encoder/encodeintra.h
@@ -19,7 +19,5 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb);
 void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
 void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode);
 void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
-void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
-void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);

 #endif
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -128,7 +128,7 @@ static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)

        while (--i > 3);

-        if (x & 240)
+        if (x & 0xFFF0)
            cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
    }

--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -8,15 +8,18 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
 #include "onyx_int.h"
 #include "threading.h"
 #include "common.h"
 #include "extend.h"

+#if CONFIG_MULTITHREAD

-extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);
-extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+                                         TOKENEXTRA **t, int recon_yoffset,
+                                         int recon_uvoffset);
+extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x,
+                                          TOKENEXTRA **t);
 extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
 extern void vp8_build_block_offsets(MACROBLOCK *x);
 extern void vp8_setup_block_ptrs(MACROBLOCK *x);
@@ -24,12 +27,12 @@ extern void vp8_setup_block_ptrs(MACROBLOCK *x);
 static
 THREAD_FUNCTION thread_encoding_proc(void *p_data)
 {
-#if CONFIG_MULTITHREAD
    int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
-    VP8_COMP *cpi   = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
+    VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
    MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
    ENTROPY_CONTEXT_PLANES mb_row_left_context;

+    const int nsync = cpi->mt_sync_range;
    //printf("Started thread %d\n", ithread);

    while (1)
@@ -38,218 +41,213 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
            break;

        //if(WaitForSingleObject(cpi->h_event_mbrencoding[ithread], INFINITE) == WAIT_OBJECT_0)
-        if (sem_wait(&cpi->h_event_mbrencoding[ithread]) == 0)
+        if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0)
        {
+            VP8_COMMON *cm = &cpi->common;
+            int mb_row;
+            MACROBLOCK *x = &mbri->mb;
+            MACROBLOCKD *xd = &x->e_mbd;
+            TOKENEXTRA *tp ;
+
+            int *segment_counts = mbri->segment_counts;
+            int *totalrate = &mbri->totalrate;
+
            if (cpi->b_multi_threaded == FALSE) // we're shutting down
                break;
-            else
+
+            for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
            {
-                VP8_COMMON *cm      = &cpi->common;
-                int mb_row           = mbri->mb_row;
-                MACROBLOCK  *x      = &mbri->mb;
-                MACROBLOCKD *xd     = &x->e_mbd;
-                TOKENEXTRA **tp     = &mbri->tp;
-                int *segment_counts  = mbri->segment_counts;
-                int *totalrate      = &mbri->totalrate;

+                int i;
+                int recon_yoffset, recon_uvoffset;
+                int mb_col;
+                int ref_fb_idx = cm->lst_fb_idx;
+                int dst_fb_idx = cm->new_fb_idx;
+                int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+                int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+                volatile int *last_row_current_mb_col;
+                INT64 activity_sum = 0;
+
+                tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24));
+
+                last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
+
+                // reset above block coeffs
+                xd->above_context = cm->above_context;
+                xd->left_context = &mb_row_left_context;
+
+                vp8_zero(mb_row_left_context);
+
+                xd->up_available = (mb_row != 0);
+                recon_yoffset = (mb_row * recon_y_stride * 16);
+                recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+                cpi->tplist[mb_row].start = tp;
+
+                //printf("Thread mb_row = %d\n", mb_row);
+
+                // for each macroblock col in image
+                for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
                {
-                    int i;
-                    int recon_yoffset, recon_uvoffset;
-                    int mb_col;
-                    int ref_fb_idx = cm->lst_fb_idx;
-                    int dst_fb_idx = cm->new_fb_idx;
-                    int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-                    int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-                    volatile int *last_row_current_mb_col;
-                    INT64 activity_sum = 0;
+                    int seg_map_index = (mb_row * cm->mb_cols);

-                    if (ithread > 0)
-                        last_row_current_mb_col = &cpi->mb_row_ei[ithread-1].current_mb_col;
-                    else
-                        last_row_current_mb_col = &cpi->current_mb_col_main;
-
-                    // reset above block coeffs
-                    xd->above_context = cm->above_context;
-                    xd->left_context = &mb_row_left_context;
-
-                    vp8_zero(mb_row_left_context);
-
-                    xd->up_available = (mb_row != 0);
-                    recon_yoffset = (mb_row * recon_y_stride * 16);
-                    recon_uvoffset = (mb_row * recon_uv_stride * 8);
-
-
-                    cpi->tplist[mb_row].start = *tp;
-
-                    //printf("Thread mb_row = %d\n", mb_row);
-
-                    // for each macroblock col in image
-                    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+                    if ((mb_col & (nsync - 1)) == 0)
                    {
-                        int seg_map_index = (mb_row * cm->mb_cols);
-
-                        while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != cm->mb_cols - 1)
+                        while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != cm->mb_cols - 1)
                        {
                            x86_pause_hint();
                            thread_sleep(0);
                        }
-
-                        // Distance of Mb to the various image edges.
-                        // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
-                        xd->mb_to_left_edge = -((mb_col * 16) << 3);
-                        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-                        xd->mb_to_top_edge = -((mb_row * 16) << 3);
-                        xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-
-                        // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
-                        x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
-                        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
-                        x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-                        x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
-
-                        xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-                        xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-                        xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-                        xd->left_available = (mb_col != 0);
-
-                        x->rddiv = cpi->RDDIV;
-                        x->rdmult = cpi->RDMULT;
-
-                        if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
-                            activity_sum += vp8_activity_masking(cpi, x);
-
-                        // Is segmentation enabled
-                        // MB level adjutment to quantizer
-                        if (xd->segmentation_enabled)
-                        {
-                            // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
-                            if (cpi->segmentation_map[seg_map_index+mb_col] <= 3)
-                                xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[seg_map_index+mb_col];
-                            else
-                                xd->mode_info_context->mbmi.segment_id = 0;
-
-                            vp8cx_mb_init_quantizer(cpi, x);
-                        }
-                        else
-                            xd->mode_info_context->mbmi.segment_id = 0;         // Set to Segment 0 by default
-
-                        x->active_ptr = cpi->active_map + seg_map_index + mb_col;
-
-                        if (cm->frame_type == KEY_FRAME)
-                        {
-                            *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp);
-#ifdef MODE_STATS
-                            y_modes[xd->mbmi.mode] ++;
-#endif
-                        }
-                        else
-                        {
-                            *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset);
-
-#ifdef MODE_STATS
-                            inter_y_modes[xd->mbmi.mode] ++;
-
-                            if (xd->mbmi.mode == SPLITMV)
-                            {
-                                int b;
-
-                                for (b = 0; b < xd->mbmi.partition_count; b++)
-                                {
-                                    inter_b_modes[x->partition->bmi[b].mode] ++;
-                                }
-                            }
-
-#endif
-
-                            // Count of last ref frame 0,0 useage
-                            if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
-                                cpi->inter_zz_count ++;
-
-                            // Special case code for cyclic refresh
-                            // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
-                            // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map
-                            if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
-                            {
-                                cpi->segmentation_map[seg_map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;
-
-                                // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
-                                // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
-                                // else mark it as dirty (1).
-                                if (xd->mode_info_context->mbmi.segment_id)
-                                    cpi->cyclic_refresh_map[seg_map_index+mb_col] = -1;
-                                else if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
-                                {
-                                    if (cpi->cyclic_refresh_map[seg_map_index+mb_col] == 1)
-                                        cpi->cyclic_refresh_map[seg_map_index+mb_col] = 0;
-                                }
-                                else
-                                    cpi->cyclic_refresh_map[seg_map_index+mb_col] = 1;
-
-                            }
-                        }
-                        cpi->tplist[mb_row].stop = *tp;
-
-                        x->gf_active_ptr++;      // Increment pointer into gf useage flags structure for next mb
-
-                        for (i = 0; i < 16; i++)
-                            vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi));
-
-                        // adjust to the next column of macroblocks
-                        x->src.y_buffer += 16;
-                        x->src.u_buffer += 8;
-                        x->src.v_buffer += 8;
-
-                        recon_yoffset += 16;
-                        recon_uvoffset += 8;
-
-                        // Keep track of segment useage
-                        segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
-
-                        // skip to next mb
-                        xd->mode_info_context++;
-                        x->partition_info++;
-
-                        xd->above_context++;
-
-                        cpi->mb_row_ei[ithread].current_mb_col = mb_col;
-
                    }

-                    //extend the recon for intra prediction
-                    vp8_extend_mb_row(
-                        &cm->yv12_fb[dst_fb_idx],
-                        xd->dst.y_buffer + 16,
-                        xd->dst.u_buffer + 8,
-                        xd->dst.v_buffer + 8);
+                    // Distance of Mb to the various image edges.
+                    // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                    xd->mb_to_left_edge = -((mb_col * 16) << 3);
+                    xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+                    xd->mb_to_top_edge = -((mb_row * 16) << 3);
+                    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;

-                    // this is to account for the border
+                    // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
+                    x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+                    x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+                    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+                    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+                    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+                    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+                    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+                    xd->left_available = (mb_col != 0);
+
+                    x->rddiv = cpi->RDDIV;
+                    x->rdmult = cpi->RDMULT;
+
+                    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+                        activity_sum += vp8_activity_masking(cpi, x);
+
+                    // Is segmentation enabled
+                    // MB level adjustment to quantizer
+                    if (xd->segmentation_enabled)
+                    {
+                        // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
+                        if (cpi->segmentation_map[seg_map_index + mb_col] <= 3)
+                            xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[seg_map_index + mb_col];
+                        else
+                            xd->mode_info_context->mbmi.segment_id = 0;
+
+                        vp8cx_mb_init_quantizer(cpi, x);
+                    }
+                    else
+                        xd->mode_info_context->mbmi.segment_id = 0; // Set to Segment 0 by default
+
+                    x->active_ptr = cpi->active_map + seg_map_index + mb_col;
+
+                    if (cm->frame_type == KEY_FRAME)
+                    {
+                        *totalrate += vp8cx_encode_intra_macro_block(cpi, x, &tp);
+#ifdef MODE_STATS
+                        y_modes[xd->mbmi.mode] ++;
+#endif
+                    }
+                    else
+                    {
+                        *totalrate += vp8cx_encode_inter_macroblock(cpi, x, &tp, recon_yoffset, recon_uvoffset);
+
+#ifdef MODE_STATS
+                        inter_y_modes[xd->mbmi.mode] ++;
+
+                        if (xd->mbmi.mode == SPLITMV)
+                        {
+                            int b;
+
+                            for (b = 0; b < xd->mbmi.partition_count; b++)
+                            {
+                                inter_b_modes[x->partition->bmi[b].mode] ++;
+                            }
+                        }
+
+#endif
+
+                        // Count of last ref frame 0,0 usage
+                        if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
+                            cpi->inter_zz_count++;
+
+                        // Special case code for cyclic refresh
+                        // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
+                        // during vp8cx_encode_inter_macroblock()) back into the global segmentation map
+                        if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
+                        {
+                            const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+                            cpi->segmentation_map[seg_map_index + mb_col] = mbmi->segment_id;
+
+                            // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
+                            // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
+                            // else mark it as dirty (1).
+                            if (mbmi->segment_id)
+                                cpi->cyclic_refresh_map[seg_map_index + mb_col] = -1;
+                            else if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
+                            {
+                                if (cpi->cyclic_refresh_map[seg_map_index + mb_col] == 1)
+                                    cpi->cyclic_refresh_map[seg_map_index + mb_col] = 0;
+                            }
+                            else
+                                cpi->cyclic_refresh_map[seg_map_index + mb_col] = 1;
+
+                        }
+                    }
+                    cpi->tplist[mb_row].stop = tp;
+
+                    x->gf_active_ptr++; // Increment pointer into gf useage flags structure for next mb
+
+                    for (i = 0; i < 16; i++)
+                        vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi));
+
+                    // adjust to the next column of macroblocks
+                    x->src.y_buffer += 16;
+                    x->src.u_buffer += 8;
+                    x->src.v_buffer += 8;
+
+                    recon_yoffset += 16;
+                    recon_uvoffset += 8;
+
+                    // Keep track of segment usage
+                    segment_counts[xd->mode_info_context->mbmi.segment_id]++;
+
+                    // skip to next mb
                    xd->mode_info_context++;
                    x->partition_info++;
-                    x->activity_sum += activity_sum;
-
-                    x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
-                    x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
-                    x->src.v_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
-
-                    xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
-                    x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
-
-                    if (ithread == (cpi->encoding_thread_count - 1) || mb_row == cm->mb_rows - 1)
-                    {
-                        //SetEvent(cpi->h_event_main);
-                        sem_post(&cpi->h_event_main);
-                    }
+                    xd->above_context++;

+                    cpi->mt_current_mb_col[mb_row] = mb_col;
                }

+                //extend the recon for intra prediction
+                vp8_extend_mb_row(
+                    &cm->yv12_fb[dst_fb_idx],
+                    xd->dst.y_buffer + 16,
+                    xd->dst.u_buffer + 8,
+                    xd->dst.v_buffer + 8);
+
+                // this is to account for the border
+                xd->mode_info_context++;
+                x->partition_info++;
+                x->activity_sum += activity_sum;
+
+                x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+                x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+                x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+                x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
+
+                if (mb_row == cm->mb_rows - 1)
+                {
+                    //SetEvent(cpi->h_event_main);
+                    sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+                }
            }
        }
    }

-#else
-    (void) p_data;
-#endif
-
    //printf("exit thread %d\n", ithread);
    return 0;
 }
@@ -363,7 +361,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    }
 }

-
 void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
                               MACROBLOCK *x,
                               MB_ROW_COMP *mbr_ei,
@@ -414,7 +411,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
        mb->src.u_buffer +=  8 * x->src.uv_stride * (i + 1);
        mb->src.v_buffer +=  8 * x->src.uv_stride * (i + 1);

-
        vp8_build_block_offsets(mb);

        vp8_setup_block_dptrs(mbd);
@@ -431,17 +427,12 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
    }
 }

-
 void vp8cx_create_encoder_threads(VP8_COMP *cpi)
 {
    cpi->b_multi_threaded = 0;

    cpi->processor_core_count = 32; //vp8_get_proc_core_count();

-    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
-
-#if CONFIG_MULTITHREAD
-
    if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
    {
        int ithread;
@@ -451,14 +442,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
        else
            cpi->encoding_thread_count = cpi->oxcf.multi_threaded - 1;

-
        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->h_event_mbrencoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
+        CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count));
        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count);
        CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * cpi->encoding_thread_count));
+        CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cpi->common.mb_rows));
+
        //cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL);
-        sem_init(&cpi->h_event_main, 0, 0);
+        sem_init(&cpi->h_event_end_encoding, 0, 0);

        cpi->b_multi_threaded = 1;

@@ -466,11 +458,13 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)

        for (ithread = 0; ithread < cpi->encoding_thread_count; ithread++)
        {
+            ENCODETHREAD_DATA * ethd = &cpi->en_thread_data[ithread];
+
            //cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL);
-            sem_init(&cpi->h_event_mbrencoding[ithread], 0, 0);
-            cpi->en_thread_data[ithread].ithread = ithread;
-            cpi->en_thread_data[ithread].ptr1 = (void *)cpi;
-            cpi->en_thread_data[ithread].ptr2 = (void *)&cpi->mb_row_ei[ithread];
+            sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+            ethd->ithread = ithread;
+            ethd->ptr1 = (void *)cpi;
+            ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];

            //printf(" call begin thread %d \n", ithread);

@@ -482,19 +476,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
            //  0,
            //  NULL);

-            pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, (&cpi->en_thread_data[ithread]));
-
+            pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);
        }

    }

-#endif
 }

 void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
 {
-#if CONFIG_MULTITHREAD
-
    if (cpi->b_multi_threaded)
    {
        //shutdown other threads
@@ -505,20 +495,21 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
            for (i = 0; i < cpi->encoding_thread_count; i++)
            {
                //SetEvent(cpi->h_event_mbrencoding[i]);
-                sem_post(&cpi->h_event_mbrencoding[i]);
+                sem_post(&cpi->h_event_start_encoding[i]);
                pthread_join(cpi->h_encoding_thread[i], 0);
-            }

-            for (i = 0; i < cpi->encoding_thread_count; i++)
-                sem_destroy(&cpi->h_event_mbrencoding[i]);
+                sem_destroy(&cpi->h_event_start_encoding[i]);
+            }
        }
+
+        sem_destroy(&cpi->h_event_end_encoding);
+
        //free thread related resources
-        vpx_free(cpi->h_event_mbrencoding);
+        vpx_free(cpi->h_event_start_encoding);
        vpx_free(cpi->h_encoding_thread);
        vpx_free(cpi->mb_row_ei);
        vpx_free(cpi->en_thread_data);
+        vpx_free(cpi->mt_current_mb_col);
    }
-
-#endif
-    vpx_free(cpi->tplist);
 }
+#endif
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -8,7 +8,6 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
 #include "math.h"
 #include "limits.h"
 #include "block.h"
@@ -178,40 +177,68 @@ static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    return modified_err;
 }

+static const double weight_table[256] = {
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
+0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
+0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+};
+
 double vp8_simple_weight(YV12_BUFFER_CONFIG *source)
 {
    int i, j;

    unsigned char *src = source->y_buffer;
-    unsigned char value;
    double sum_weights = 0.0;
-    double Weight;

    // Loop throught the Y plane raw examining levels and creating a weight for the image
-    for (i = 0; i < source->y_height; i++)
+    i = source->y_height;
+    do
    {
-        for (j = 0; j < source->y_width; j++)
+        j = source->y_width;
+        do
        {
-            value = src[j];
-
-            if (value >= 64)
-                Weight = 1.0;
-            else if (value > 32)
-                Weight = (value - 32.0f) / 32.0f;
-            else
-                Weight = 0.02;
-
-            sum_weights += Weight;
-        }
-
+            sum_weights += weight_table[ *src];
+            src++;
+        }while(--j);
+        src -= source->y_width;
        src += source->y_stride;
-    }
+    }while(--i);

    sum_weights /= (source->y_height * source->y_width);

    return sum_weights;
 }

+
 // This function returns the current per frame maximum bitrate target
 int frame_max_bits(VP8_COMP *cpi)
 {
@@ -440,7 +467,6 @@ void vp8_end_first_pass(VP8_COMP *cpi)
    vp8_output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
 }

-
 void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
 {
    MACROBLOCKD * const xd = & x->e_mbd;
@@ -460,7 +486,6 @@ void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * r
    VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
 }

-
 void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
 {
    MACROBLOCKD *const xd = & x->e_mbd;
@@ -548,7 +573,6 @@ void vp8_first_pass(VP8_COMP *cpi)

    int sum_in_vectors = 0;

-    MV best_ref_mv = {0, 0};
    MV zero_ref_mv = {0, 0};

    unsigned char *fp_motion_map_ptr = cpi->fp_motion_map;
@@ -586,13 +610,20 @@ void vp8_first_pass(VP8_COMP *cpi)
    // for each macroblock row in image
    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
    {
-        MV best_ref_mv = {0, 0};
+        int_mv best_ref_mv;
+
+        best_ref_mv.as_int = 0;

        // reset above block coeffs
        xd->up_available = (mb_row != 0);
        recon_yoffset = (mb_row * recon_y_stride * 16);
        recon_uvoffset = (mb_row * recon_uv_stride * 8);

+        // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+        x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+        x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+
        // for each macroblock col in image
        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
        {
@@ -625,8 +656,6 @@ void vp8_first_pass(VP8_COMP *cpi)
            // Set up limit values for motion vectors to prevent them extending outside the UMV borders
            x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
            x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
-            x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-            x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);

            // Other than for the first frame do a motion search
            if (cm->current_video_frame > 0)
@@ -647,12 +676,12 @@ void vp8_first_pass(VP8_COMP *cpi)

                // Test last reference frame using the previous best mv as the
                // starting point (best reference) for the search
-                vp8_first_pass_motion_search(cpi, x, &best_ref_mv,
+                vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
                                        &d->bmi.mv.as_mv, lst_yv12,
                                        &motion_error, recon_yoffset);

                // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
-                if ((best_ref_mv.col != 0) || (best_ref_mv.row != 0))
+                if (best_ref_mv.as_int)
                {
                   tmp_err = INT_MAX;
                   vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
@@ -664,7 +693,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                        d->bmi.mv.as_mv.row = tmp_mv.row;
                        d->bmi.mv.as_mv.col = tmp_mv.col;
                   }
-
                }

                // Experimental search in a second reference frame ((0,0) based only)
@@ -693,6 +721,9 @@ void vp8_first_pass(VP8_COMP *cpi)
                    xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
                }

+                /* Intra assumed best */
+                best_ref_mv.as_int = 0;
+
                if (motion_error <= this_error)
                {
                    d->bmi.mv.as_mv.row <<= 3;
@@ -708,13 +739,10 @@ void vp8_first_pass(VP8_COMP *cpi)
                    sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col;
                    intercount++;

-                    best_ref_mv.row = d->bmi.mv.as_mv.row;
-                    best_ref_mv.col = d->bmi.mv.as_mv.col;
-                    //best_ref_mv.row = 0;
-                    //best_ref_mv.col = 0;
+                    best_ref_mv.as_int = d->bmi.mv.as_int;

                    // Was the vector non-zero
-                    if (d->bmi.mv.as_mv.row || d->bmi.mv.as_mv.col)
+                    if (d->bmi.mv.as_int)
                    {
                        mvcount++;

@@ -770,12 +798,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                            *fp_motion_map_ptr = 1;
                    }
                }
-                else
-                {
-                    // Intra was best
-                    best_ref_mv.row = 0;
-                    best_ref_mv.col = 0;
-                }
            }

            coded_error += this_error;
@@ -813,6 +835,7 @@ void vp8_first_pass(VP8_COMP *cpi)
        fps.coded_error = coded_error >> 8;
        weight = vp8_simple_weight(cpi->Source);

+
        if (weight < 0.1)
            weight = 0.1;

@@ -1316,6 +1339,43 @@ void vp8_end_second_pass(VP8_COMP *cpi)
 {
 }

+// Estimate how well prediction quality holds up from frame to frame, using
+// next_frame's first pass stats (cpi is unused). Lower result = faster decay.
+double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+    double prediction_decay_rate;
+    double motion_decay;
+    double motion_pct = next_frame->pcnt_motion;
+
+
+    // Initial basis is next_frame's pcnt_inter (the % of mbs inter coded)
+    prediction_decay_rate = next_frame->pcnt_inter;
+
+    // High % motion -> faster decay, i.e. cap the result at this lower value
+    motion_decay = (1.0 - (motion_pct / 20.0));
+    if (motion_decay < prediction_decay_rate)
+        prediction_decay_rate = motion_decay;
+
+    // Cap by speed of motion: 1.0 - (weighted |mv| / 250.0), floored at 0.0
+    {
+        double this_mv_rabs;
+        double this_mv_cabs;
+        double distance_factor;
+
+        this_mv_rabs = fabs(next_frame->mvr_abs * motion_pct);
+        this_mv_cabs = fabs(next_frame->mvc_abs * motion_pct);
+
+        distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
+                               (this_mv_cabs * this_mv_cabs)) / 250.0;
+        distance_factor = ((distance_factor > 1.0)
+                                ? 0.0 : (1.0 - distance_factor));
+        if (distance_factor < prediction_decay_rate)
+            prediction_decay_rate = distance_factor;
+    }
+
+    return prediction_decay_rate;
+}
+
 // Analyse and define a gf/arf group .
 static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
@@ -1337,17 +1397,20 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    double decay_accumulator = 1.0;

    double boost_factor = IIFACTOR;
-    double loop_decay_rate = 1.00;        // Starting decay rate
+    double loop_decay_rate = 1.00;          // Starting decay rate

    double this_frame_mv_in_out = 0.0;
    double mv_in_out_accumulator = 0.0;
    double abs_mv_in_out_accumulator = 0.0;
    double mod_err_per_mb_accumulator = 0.0;

-    int max_bits = frame_max_bits(cpi);    // Max for a single frame
+    int max_bits = frame_max_bits(cpi);     // Max for a single frame

    unsigned char *fpmm_pos;

+    unsigned int allow_alt_ref =
+                    cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+
    cpi->gf_group_bits = 0;
    cpi->gf_decay_rate = 0;

@@ -1362,47 +1425,57 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    // Preload the stats for the next frame.
    mod_frame_err = calculate_modified_err(cpi, this_frame);

-    // Note the error of the frame at the start of the group (this will be the GF frame error if we code a normal gf
+    // Note the error of the frame at the start of the group (this will be
+    // the GF frame error if we code a normal gf)
    gf_first_frame_err = mod_frame_err;

-    // Special treatment if the current frame is a key frame (which is also a gf).
-    // If it is then its error score (and hence bit allocation) need to be subtracted out
-    // from the calculation for the GF group
+    // Special treatment if the current frame is a key frame (which is also
+    // a gf). If it is then its error score (and hence bit allocation) need
+    // to be subtracted out from the calculation for the GF group
    if (cpi->common.frame_type == KEY_FRAME)
        gf_group_err -= gf_first_frame_err;

-    // Scan forward to try and work out how many frames the next gf group should contain and
-    // what level of boost is appropriate for the GF or ARF that will be coded with the group
+    // Scan forward to try and work out how many frames the next gf group
+    // should contain and what level of boost is appropriate for the GF
+    // or ARF that will be coded with the group
    i = 0;

-    while (((i < cpi->static_scene_max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key))
+    while (((i < cpi->static_scene_max_gf_interval) ||
+            ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) &&
+           (i < cpi->frames_to_key))
    {
        double r;
        double this_frame_mvr_ratio;
        double this_frame_mvc_ratio;
        double motion_decay;
-        double motion_pct = next_frame.pcnt_motion;
+        // motion_pct is set below, once next_frame's stats have been read
+        double motion_pct;

-        i++;                                                    // Increment the loop counter
+        i++;    // Increment the loop counter

        // Accumulate error score of frames in this gf group
        mod_frame_err = calculate_modified_err(cpi, this_frame);

        gf_group_err += mod_frame_err;

-        mod_err_per_mb_accumulator += mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
+        mod_err_per_mb_accumulator +=
+            mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);

        if (EOF == vp8_input_stats(cpi, &next_frame))
            break;

        // Accumulate motion stats.
+        motion_pct = next_frame.pcnt_motion;
        mv_accumulator_rabs += fabs(next_frame.mvr_abs * motion_pct);
        mv_accumulator_cabs += fabs(next_frame.mvc_abs * motion_pct);

        //Accumulate Motion In/Out of frame stats
-        this_frame_mv_in_out = next_frame.mv_in_out_count * next_frame.pcnt_motion;
-        mv_in_out_accumulator += next_frame.mv_in_out_count * next_frame.pcnt_motion;
-        abs_mv_in_out_accumulator += fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
+        this_frame_mv_in_out =
+            next_frame.mv_in_out_count * motion_pct;
+        mv_in_out_accumulator +=
+            next_frame.mv_in_out_count * motion_pct;
+        abs_mv_in_out_accumulator +=
+            fabs(next_frame.mv_in_out_count * motion_pct);

        // If there is a significant amount of motion
        if (motion_pct > 0.05)
@@ -1431,7 +1504,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        }

        // Underlying boost factor is based on inter intra error ratio
-        r = (boost_factor * (next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)));
+        r = ( boost_factor *
+              ( next_frame.intra_error /
+                DOUBLE_DIVIDE_CHECK(next_frame.coded_error)));

        if (next_frame.intra_error > cpi->gf_intra_err_min)
            r = (IIKFACTOR2 * next_frame.intra_error /
@@ -1440,54 +1515,76 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            r = (IIKFACTOR2 * cpi->gf_intra_err_min /
                     DOUBLE_DIVIDE_CHECK(next_frame.coded_error));

-        // Increase boost for frames where new data coming into frame (eg zoom out)
-        // Slightly reduce boost if there is a net balance of motion out of the frame (zoom in)
+        // Increase boost for frames where new data coming into frame
+        // (eg zoom out). Slightly reduce boost if there is a net balance
+        // of motion out of the frame (zoom in).
        // The range for this_frame_mv_in_out is -1.0 to +1.0
        if (this_frame_mv_in_out > 0.0)
            r += r * (this_frame_mv_in_out * 2.0);
+        // In extreme case boost is halved
        else
-            r += r * (this_frame_mv_in_out / 2.0);  // In extreme case boost is halved
+            r += r * (this_frame_mv_in_out / 2.0);

        if (r > GF_RMAX)
            r = GF_RMAX;

-        // Adjust loop decay rate
-        //if ( next_frame.pcnt_inter < loop_decay_rate )
-        loop_decay_rate = next_frame.pcnt_inter;
-
-        // High % motion -> somewhat higher decay rate
-        motion_decay = (1.0 - (motion_pct / 20.0));
-        if (motion_decay < loop_decay_rate)
-            loop_decay_rate = motion_decay;
-
-        // Adjustment to decay rate based on speed of motion
-        {
-            double this_mv_rabs;
-            double this_mv_cabs;
-            double distance_factor;
-
-            this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
-            this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
-
-            distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
-                                   (this_mv_cabs * this_mv_cabs)) / 250.0;
-            distance_factor = ((distance_factor > 1.0)
-                                    ? 0.0 : (1.0 - distance_factor));
-            if (distance_factor < loop_decay_rate)
-                loop_decay_rate = distance_factor;
-        }
+        loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);

        // Cumulative effect of decay
        decay_accumulator = decay_accumulator * loop_decay_rate;
        decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
-        //decay_accumulator = ( loop_decay_rate < decay_accumulator ) ? loop_decay_rate : decay_accumulator;

        boost_score += (decay_accumulator * r);

+        // Break clause to detect very still sections after motion
+        // For example a static image after a fade or other transition
+        // instead of a clean key frame.
+        if ( (i > MIN_GF_INTERVAL) &&
+             (loop_decay_rate >= 0.999) &&
+             (decay_accumulator < 0.9) )
+        {
+            int j;
+            FIRSTPASS_STATS * position = cpi->stats_in;
+            FIRSTPASS_STATS tmp_next_frame;
+            double decay_rate;
+
+            // Look ahead a few frames to see if static condition
+            // persists...
+            for ( j = 0; j < 4; j++ )
+            {
+                if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
+                    break;
+
+                decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
+                if ( decay_rate < 0.999 )
+                    break;
+            }
+            reset_fpf_position(cpi, position);            // Reset file position
+
+            // Force GF not alt ref
+            if ( j == 4 )
+            {
+                if (0)
+                {
+                    FILE *f = fopen("fadegf.stt", "a");
+                    fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
+                         cpi->common.current_video_frame+i, i,
+                         loop_decay_rate, decay_accumulator,
+                         boost_score );
+                    fclose(f);
+                }
+
+                allow_alt_ref = FALSE;
+
+                boost_score = old_boost_score;
+                break;
+            }
+        }
+
        // Break out conditions.
        if  (   /* i>4 || */
            // Break at cpi->max_gf_interval unless almost totally static
-            (i >= cpi->max_gf_interval && (loop_decay_rate < 0.99)) ||
+            (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) ||
            (
                // Dont break out with a very short interval
                (i > MIN_GF_INTERVAL) &&
@@ -1509,7 +1606,8 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        old_boost_score = boost_score;
    }

-    cpi->gf_decay_rate = (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;
+    cpi->gf_decay_rate =
+        (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;

    // When using CBR apply additional buffer related upper limits
    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
@@ -1519,7 +1617,8 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        // For cbr apply buffer related limits
        if (cpi->drop_frames_allowed)
        {
-            int df_buffer_level = cpi->oxcf.drop_frames_water_mark * (cpi->oxcf.optimal_buffer_level / 100);
+            int df_buffer_level = cpi->oxcf.drop_frames_water_mark *
+                                  (cpi->oxcf.optimal_buffer_level / 100);

            if (cpi->buffer_level > df_buffer_level)
                max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
@@ -1542,10 +1641,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    cpi->gfu_boost = (int)(boost_score * 100.0) >> 4;

    // Should we use the alternate refernce frame
-    if (cpi->oxcf.play_alternate &&
-        cpi->oxcf.lag_in_frames &&
+    if (allow_alt_ref &&
        (i >= MIN_GF_INTERVAL) &&
-        (i <= (cpi->frames_to_key - MIN_GF_INTERVAL)) &&          // dont use ARF very near next kf
+        // dont use ARF very near next kf
+        (i <= (cpi->frames_to_key - MIN_GF_INTERVAL)) &&
        (((next_frame.pcnt_inter > 0.75) &&
          ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) &&
          //(cpi->gfu_boost>150) &&
@@ -2347,12 +2446,35 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    if (cpi->oxcf.auto_key
        && cpi->frames_to_key > (int)cpi->key_frame_frequency )
    {
+        FIRSTPASS_STATS *current_pos = cpi->stats_in;
+        FIRSTPASS_STATS tmp_frame;
+
        cpi->frames_to_key /= 2;

-        // Estimate corrected kf group error
-        kf_group_err /= 2.0;
-        kf_group_intra_err /= 2.0;
-        kf_group_coded_err /= 2.0;
+        // Copy first frame details
+        vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+
+        // Reset to the start of the group
+        reset_fpf_position(cpi, start_position);
+
+        kf_group_err = 0;
+        kf_group_intra_err = 0;
+        kf_group_coded_err = 0;
+
+        // Rescan to get the correct error data for the forced kf group
+        for( i = 0; i < cpi->frames_to_key; i++ )
+        {
+            // Accumulate kf group errors
+            kf_group_err += calculate_modified_err(cpi, &tmp_frame);
+            kf_group_intra_err += tmp_frame.intra_error;
+            kf_group_coded_err += tmp_frame.coded_error;
+
+            // Load the next frame's stats
+            vp8_input_stats(cpi, &tmp_frame);
+        }
+
+        // Restore the stats position saved in current_pos above
+        reset_fpf_position(cpi, current_pos);

        cpi->next_key_frame_forced = TRUE;
    }
@@ -2451,7 +2573,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    {
        double r;
        double motion_decay;
-        double motion_pct = next_frame.pcnt_motion;
+        double motion_pct;

        if (EOF == vp8_input_stats(cpi, &next_frame))
            break;
@@ -2471,6 +2593,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        loop_decay_rate = next_frame.pcnt_inter;

        // High % motion -> somewhat higher decay rate
+        motion_pct = next_frame.pcnt_motion;
        motion_decay = (1.0 - (motion_pct / 20.0));
        if (motion_decay < loop_decay_rate)
            loop_decay_rate = motion_decay;
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -779,15 +779,17 @@ int vp8_hex_search
    int *num00,
    const vp8_variance_fn_ptr_t *vfp,
    int *mvsadcost[2],
-    int *mvcost[2]
+    int *mvcost[2],
+    MV *center_mv
 )
 {
    MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
-    MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
+    MV neighbors[8] = { { -1, -1}, {0, -1}, {1, -1}, { -1, 0}, {1, 0}, { -1, 1}, {0, 1}, {1, 1} } ;
    int i, j;
    unsigned char *src = (*(b->base_src) + b->src);
    int src_stride = b->src_stride;
-    int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
+    int rr = center_mv->row, rc = center_mv->col;
+    int br = ref_mv->row >> 3, bc = ref_mv->col >> 3, tr, tc;
    unsigned int besterr, thiserr = 0x7fffffff;
    int k = -1, tk;

@@ -892,7 +894,7 @@ cal_neighbors:
    best_mv->row = br;
    best_mv->col = bc;

-    return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
+    return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + vp8_mv_err_cost(best_mv, center_mv, mvcost, error_per_bit) ;
 }
 #undef MVC
 #undef PRE
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -43,8 +43,8 @@ extern int vp8_hex_search
    int *num00,
    const vp8_variance_fn_ptr_t *vf,
    int *mvsadcost[2],
-    int *mvcost[2]
-
+    int *mvcost[2],
+    MV *center_mv
 );

 typedef int (fractional_mv_step_fp)
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -262,6 +262,10 @@ static void setup_features(VP8_COMP *cpi)

 void vp8_dealloc_compressor_data(VP8_COMP *cpi)
 {
+    if(cpi->tplist!=0)
+        vpx_free(cpi->tplist);
+    cpi->tplist = NULL;
+
    // Delete last frame MV storage buffers
    if (cpi->lfmv != 0)
        vpx_free(cpi->lfmv);
@@ -598,6 +602,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)

    sf->first_step = 0;
    sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+    sf->improved_mv_pred = 1;

    cpi->do_full[0] = 0;
    cpi->do_full[1] = 0;
@@ -640,34 +645,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)

        sf->first_step = 0;
        sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-
-        if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
-        {
-            sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-            sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-            sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-        }
-
        break;
    case 1:
    case 3:
@@ -725,41 +702,22 @@ void vp8_set_speed_features(VP8_COMP *cpi)
        sf->full_freq[0] = 15;
        sf->full_freq[1] = 31;

-        sf->first_step = 0;
-        sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-
-        if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
-        {
-            sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-            sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-            sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-        }
-
        if (Speed > 0)
        {
-            // Disable coefficient optimization above speed 0
+            /* Disable coefficient optimization above speed 0 */
            sf->optimize_coefficients = 0;
+            sf->use_fastquant_for_pick = 1;
+            sf->no_skip_block4x4_search = 0;

+            sf->first_step = 1;
+
+            cpi->mode_check_freq[THR_SPLITG] = 2;
+            cpi->mode_check_freq[THR_SPLITA] = 2;
+            cpi->mode_check_freq[THR_SPLITMV] = 0;
+        }
+
+        if (Speed > 1)
+        {
            cpi->mode_check_freq[THR_SPLITG] = 4;
            cpi->mode_check_freq[THR_SPLITA] = 4;
            cpi->mode_check_freq[THR_SPLITMV] = 2;
@@ -792,18 +750,10 @@ void vp8_set_speed_features(VP8_COMP *cpi)
                sf->thresh_mult[THR_NEWA     ] = 2000;
                sf->thresh_mult[THR_SPLITA   ] = 20000;
            }
-
-            sf->use_fastquant_for_pick = 1;
-
-            sf->first_step = 1;
-            sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-            sf->no_skip_block4x4_search = 0;
        }

-        if (Speed > 1)
+        if (Speed > 2)
        {
-            sf->use_fastquant_for_pick = 0;
-
            cpi->mode_check_freq[THR_SPLITG] = 15;
            cpi->mode_check_freq[THR_SPLITA] = 15;
            cpi->mode_check_freq[THR_SPLITMV] = 7;
@@ -837,8 +787,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
                sf->thresh_mult[THR_SPLITA   ] = 50000;
            }

-            sf->first_step = 1;
-
            sf->improved_quant = 0;
            sf->improved_dct = 0;

@@ -848,38 +796,14 @@ void vp8_set_speed_features(VP8_COMP *cpi)

            sf->full_freq[0] = 31;
            sf->full_freq[1] = 63;
-
-        }
-
-        if (Speed > 2)
-        {
-            sf->auto_filter = 0;                     // Faster selection of loop filter
-            cpi->mode_check_freq[THR_V_PRED] = 2;
-            cpi->mode_check_freq[THR_H_PRED] = 2;
-            cpi->mode_check_freq[THR_B_PRED] = 2;
-
-            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
-            {
-                cpi->mode_check_freq[THR_NEARG] = 2;
-                cpi->mode_check_freq[THR_NEWG] = 4;
-            }
-
-            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
-            {
-                cpi->mode_check_freq[THR_NEARA] = 2;
-                cpi->mode_check_freq[THR_NEWA] = 4;
-            }
-
-            sf->thresh_mult[THR_SPLITA  ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITG  ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-
-            sf->full_freq[0] = 63;
-            sf->full_freq[1] = 127;
        }

        if (Speed > 3)
        {
+            sf->thresh_mult[THR_SPLITA  ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITG  ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+
            cpi->mode_check_freq[THR_V_PRED] = 0;
            cpi->mode_check_freq[THR_H_PRED] = 0;
            cpi->mode_check_freq[THR_B_PRED] = 0;
@@ -891,13 +815,16 @@ void vp8_set_speed_features(VP8_COMP *cpi)
            sf->auto_filter = 1;
            sf->recode_loop = 0; // recode loop off
            sf->RD = 0;         // Turn rd off
-            sf->full_freq[0] = INT_MAX;
-            sf->full_freq[1] = INT_MAX;
+
+            sf->full_freq[0] = 63;
+            sf->full_freq[1] = 127;
        }

        if (Speed > 4)
        {
            sf->auto_filter = 0;                     // Faster selection of loop filter
+            sf->full_freq[0] = INT_MAX;
+            sf->full_freq[1] = INT_MAX;

            cpi->mode_check_freq[THR_V_PRED] = 2;
            cpi->mode_check_freq[THR_H_PRED] = 2;
@@ -963,33 +890,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
        sf->full_freq[1] = 31;
        sf->search_method = NSTEP;

-        if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
-        {
-            sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-            sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-            sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-        }
-
        if (Speed > 0)
        {
            cpi->mode_check_freq[THR_SPLITG] = 4;
@@ -1118,6 +1018,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 #else
            sf->search_method = DIAMOND;
 #endif
+            sf->iterative_sub_pixel = 0;

            cpi->mode_check_freq[THR_V_PRED] = 4;
            cpi->mode_check_freq[THR_H_PRED] = 4;
@@ -1169,7 +1070,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
            int total_skip;

            int min = 2000;
-            sf->iterative_sub_pixel = 0;

            if (cpi->oxcf.encode_breakout > 2000)
                min = cpi->oxcf.encode_breakout;
@@ -1225,6 +1125,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
            sf->thresh_mult[THR_V_PRED] = INT_MAX;
            sf->thresh_mult[THR_H_PRED] = INT_MAX;

+            sf->improved_mv_pred = 0;
        }

        if (Speed > 8)
@@ -1270,7 +1171,36 @@ void vp8_set_speed_features(VP8_COMP *cpi)

        vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins));

-    };
+    }; /* switch */
+
+    /* disable frame modes if flags not set */
+    if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
+    {
+        sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
+        sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+        sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
+        sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
+        sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+    }
+
+    if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
+    {
+        sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+        sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
+        sf->thresh_mult[THR_NEARG    ] = INT_MAX;
+        sf->thresh_mult[THR_NEWG     ] = INT_MAX;
+        sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
+    }
+
+    if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
+    {
+        sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+        sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
+        sf->thresh_mult[THR_NEARA    ] = INT_MAX;
+        sf->thresh_mult[THR_NEWA     ] = INT_MAX;
+        sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
+    }
+

    // Slow quant, dct and trellis not worthwhile for first pass
    // so make sure they are always turned off.
@@ -1465,6 +1395,22 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                           "Failed to allocate firstpass stats");
 #endif
+
+#if CONFIG_MULTITHREAD
+    if (width < 640)
+        cpi->mt_sync_range = 1;
+    else if (width <= 1280)
+        cpi->mt_sync_range = 4;
+    else if (width <= 2560)
+        cpi->mt_sync_range = 8;
+    else
+        cpi->mt_sync_range = 16;
+#endif
+
+    if(cpi->tplist)  /* release the previous list before allocating a new one */
+        vpx_free(cpi->tplist);
+
+    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
 }


@@ -2187,7 +2133,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)

    cpi->common.error.setjmp = 1;

-    CHECK_MEM_ERROR(cpi->rdtok, vpx_calloc(256 * 3 / 2, sizeof(TOKENEXTRA)));
    CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));

    vp8_create_common(&cpi->common);
@@ -2224,9 +2169,9 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->gold_is_alt  = 0 ;

    // allocate memory for storing last frame's MVs for MV prediction.
-    CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int_mv)));
-    CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int)));
-    CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int)));
+    CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows+2) * (cpi->common.mb_cols+2), sizeof(int_mv)));
+    CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows+2) * (cpi->common.mb_cols+2), sizeof(int)));
+    CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows+2) * (cpi->common.mb_cols+2), sizeof(int)));

    // Create the encoder segmentation map and set all entries to 0
    CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
@@ -2417,7 +2362,9 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
    init_mv_ref_counts();
 #endif

+#if CONFIG_MULTITHREAD
    vp8cx_create_encoder_threads(cpi);
+#endif

    cpi->fn_ptr[BLOCK_16X16].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
    cpi->fn_ptr[BLOCK_16X16].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
@@ -2692,12 +2639,13 @@ void vp8_remove_compressor(VP8_PTR *ptr)

    }

+#if CONFIG_MULTITHREAD
    vp8cx_remove_encoder_threads(cpi);
+#endif

    vp8_dealloc_compressor_data(cpi);
    vpx_free(cpi->mb.ss);
    vpx_free(cpi->tok);
-    vpx_free(cpi->rdtok);
    vpx_free(cpi->cyclic_refresh_map);

    vp8_remove_common(&cpi->common);
@@ -3114,11 +3062,14 @@ static int pick_frame_size(VP8_COMP *cpi)

    return 1;
 }
+
 static void set_quantizer(VP8_COMP *cpi, int Q)
 {
    VP8_COMMON *cm = &cpi->common;
    MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+    int update = 0;

+    update |= cm->base_qindex != Q;
    cm->base_qindex = Q;

    cm->y1dc_delta_q = 0;
@@ -3127,11 +3078,21 @@ static void set_quantizer(VP8_COMP *cpi, int Q)
    cm->uvdc_delta_q = 0;
    cm->uvac_delta_q = 0;

+    if(Q<4)
+    {
+        update |= cm->y2dc_delta_q != 4-Q;
+        cm->y2dc_delta_q = 4-Q;
+    }
+
    // Set Segment specific quatizers
    mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
    mbd->segment_feature_data[MB_LVL_ALT_Q][1] = cpi->segment_feature_data[MB_LVL_ALT_Q][1];
    mbd->segment_feature_data[MB_LVL_ALT_Q][2] = cpi->segment_feature_data[MB_LVL_ALT_Q][2];
    mbd->segment_feature_data[MB_LVL_ALT_Q][3] = cpi->segment_feature_data[MB_LVL_ALT_Q][3];
+
+    if(update)
+        vp8cx_init_quantizer(cpi);
+
 }

 static void update_alt_ref_frame_and_stats(VP8_COMP *cpi)
@@ -3601,6 +3562,17 @@ static void encode_frame_to_data_rate
    // Test code for segmentation of gf/arf (0,0)
    //segmentation_test_function((VP8_PTR) cpi);

+#if CONFIG_REALTIME_ONLY
+    if(cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
+    {
+        if(cpi->force_next_frame_intra)
+        {
+            cm->frame_type = KEY_FRAME;  /* delayed intra frame */
+        }
+    }
+    cpi->force_next_frame_intra = 0;
+#endif
+
    // For an alt ref frame in 2 pass we skip the call to the second pass function that sets the target bandwidth
 #if !(CONFIG_REALTIME_ONLY)

@@ -3853,7 +3825,7 @@ static void encode_frame_to_data_rate
            // One pass more conservative
            else
               cpi->active_best_quality = kf_high_motion_minq[Q];
-         }
+        }

        else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
        {
@@ -4111,6 +4083,14 @@ static void encode_frame_to_data_rate
        // (assuming that we didn't)!
        if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
        {
+
+#if CONFIG_REALTIME_ONLY
+            {
+                /* we don't do re-encoding in realtime mode
+                 * if a key frame is decided on, then we force it on the next frame */
+                cpi->force_next_frame_intra = decide_key_frame(cpi);
+            }
+#else
            if (decide_key_frame(cpi))
            {
                vp8_calc_auto_iframe_target_size(cpi);
@@ -4149,6 +4129,7 @@ static void encode_frame_to_data_rate
                resize_key_frame(cpi);
                continue;
            }
+#endif
        }

        vp8_clear_system_state();
@@ -4188,7 +4169,7 @@ static void encode_frame_to_data_rate
                                         IF_RTCD(&cpi->rtcd.variance));

            // The key frame is not good enough
-            if ( kf_err > ((cpi->ambient_err * 3) >> 2) )
+            if ( kf_err > ((cpi->ambient_err * 7) >> 3) )
            {
                // Lower q_high
                q_high = (Q > q_low) ? (Q - 1) : q_low;
@@ -4386,32 +4367,33 @@ static void encode_frame_to_data_rate
    }

    // This frame's MVs are saved and will be used in next frame's MV prediction.
+    // Last frame has one more line (added at the bottom) and one more column (added at the right) than cm->mip. The edge elements are initialized to 0.
    if(cm->show_frame)   //do not save for altref frame
    {
-      int mb_row;
-      int mb_col;
-      MODE_INFO *tmp = cm->mip; //point to beginning of allocated MODE_INFO arrays.
-      //static int last_video_frame = 0;
+        int mb_row;
+        int mb_col;
+        MODE_INFO *tmp = cm->mip; //point to beginning of allocated MODE_INFO arrays.

-      if(cm->frame_type != KEY_FRAME)
-      {
-        for (mb_row = 0; mb_row < cm->mb_rows+1; mb_row ++)
+        if(cm->frame_type != KEY_FRAME)
        {
-          for (mb_col = 0; mb_col < cm->mb_cols+1; mb_col ++)
-          {
-              if(tmp->mbmi.ref_frame != INTRA_FRAME)
-                cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride)].as_int = tmp->mbmi.mv.as_int;
+            for (mb_row = 0; mb_row < cm->mb_rows+1; mb_row ++)
+            {
+                for (mb_col = 0; mb_col < cm->mb_cols+1; mb_col ++)
+                {
+                    if(tmp->mbmi.ref_frame != INTRA_FRAME)
+                        cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride+1)].as_int = tmp->mbmi.mv.as_int;

-              cpi->lf_ref_frame_sign_bias[mb_col + mb_row*(cm->mode_info_stride)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
-              cpi->lf_ref_frame[mb_col + mb_row*(cm->mode_info_stride)] = tmp->mbmi.ref_frame;
-              tmp++;
-          }
+                    cpi->lf_ref_frame_sign_bias[mb_col + mb_row*(cm->mode_info_stride+1)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
+                    cpi->lf_ref_frame[mb_col + mb_row*(cm->mode_info_stride+1)] = tmp->mbmi.ref_frame;
+                    tmp++;
+                }
+            }
        }
-      }
    }

    // Update the GF useage maps.
    // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
+    // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
    vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);

    if (cm->frame_type == KEY_FRAME)
@@ -4614,7 +4596,8 @@ static void encode_frame_to_data_rate
    }

    // Update the buffer level variable.
-    if (cpi->common.refresh_alt_ref_frame)
+    // Non-viewable frames are a special case and are treated as pure overhead.
+    if ( !cm->show_frame )
        cpi->bits_off_target -= cpi->projected_frame_size;
    else
        cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -28,6 +28,7 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "mcomp.h"
 #include "temporal_filter.h"
+#include "findnearmv.h"

 //#define SPEEDSTATS 1
 #define MIN_GF_INTERVAL             4
@@ -184,17 +185,15 @@ typedef struct

    int use_fastquant_for_pick;
    int no_skip_block4x4_search;
+    int improved_mv_pred;

 } SPEED_FEATURES;

 typedef struct
 {
    MACROBLOCK  mb;
-    int mb_row;
-    TOKENEXTRA *tp;
    int segment_counts[MAX_MB_SEGMENTS];
    int totalrate;
-    int current_mb_col;
 } MB_ROW_COMP;

 typedef struct
@@ -245,12 +244,6 @@ enum
    BLOCK_MAX_SEGMENTS
 };

-typedef union
-{
-    unsigned int as_int;
-    MV           as_mv;
-} int_mv;        /* facilitates rapid equality tests */
-
 typedef struct
 {

@@ -309,8 +302,6 @@ typedef struct

    YV12_BUFFER_CONFIG last_frame_uf;

-    char *Dest;
-
    TOKENEXTRA *tok;
    unsigned int tok_count;

@@ -343,11 +334,6 @@ typedef struct
    int RDMULT;
    int RDDIV ;

-    TOKENEXTRA *rdtok;
-    vp8_writer rdbc;
-    int intra_mode_costs[10];
-
-
    CODING_CONTEXT coding_context;

    // Rate targetting variables
@@ -355,7 +341,6 @@ typedef struct
    long long last_prediction_error;
    long long intra_error;
    long long last_intra_error;
-    long long last_auto_filter_prediction_error;

 #if 0
    // Experimental RD code
@@ -560,8 +545,6 @@ typedef struct

    int ref_frame_flags;

-    int exp[512];
-
    SPEED_FEATURES sf;
    int error_bins[1024];

@@ -607,22 +590,21 @@ typedef struct
    int cyclic_refresh_q;
    signed char *cyclic_refresh_map;

+#if CONFIG_MULTITHREAD
    // multithread data
-    int current_mb_col_main;
+    int * mt_current_mb_col;
+    int mt_sync_range;
    int processor_core_count;
    int b_multi_threaded;
    int encoding_thread_count;

-#if CONFIG_MULTITHREAD
    pthread_t *h_encoding_thread;
-#endif
    MB_ROW_COMP *mb_row_ei;
    ENCODETHREAD_DATA *en_thread_data;

-#if CONFIG_MULTITHREAD
    //events
-    sem_t *h_event_mbrencoding;
-    sem_t h_event_main;
+    sem_t *h_event_start_encoding;
+    sem_t h_event_end_encoding;
 #endif

    TOKENLIST *tplist;
@@ -694,6 +676,9 @@ typedef struct
    int *lf_ref_frame_sign_bias;
    int *lf_ref_frame;

+#if CONFIG_REALTIME_ONLY
+    int force_next_frame_intra; /* force next frame to intra when kf_auto says so */
+#endif
 } VP8_COMP;

 void control_data_rate(VP8_COMP *cpi);
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -24,7 +24,7 @@
 #include "g_common.h"
 #include "variance.h"
 #include "mcomp.h"
-
+#include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"

 #if CONFIG_RUNTIME_CPU_DETECT
@@ -168,8 +168,6 @@ static int pick_intra4x4block(
    B_PREDICTION_MODE *best_mode,
    B_PREDICTION_MODE above,
    B_PREDICTION_MODE left,
-    ENTROPY_CONTEXT *a,
-    ENTROPY_CONTEXT *l,

    int *bestrate,
    int *bestdistortion)
@@ -179,8 +177,6 @@ static int pick_intra4x4block(
    int rate;
    int distortion;
    unsigned int *mode_costs;
-    (void) l;
-    (void) a;

    if (x->e_mbd.frame_type == KEY_FRAME)
    {
@@ -211,6 +207,7 @@ static int pick_intra4x4block(

    b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
    vp8_encode_intra4x4block(rtcd, x, be, b, b->bmi.mode);
+
    return best_rd;
 }

@@ -220,17 +217,8 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
    MACROBLOCKD *const xd = &mb->e_mbd;
    int i;
    int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
-    int error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, 0); // Rd estimate for the cost of the block prediction mode
+    int error;
    int distortion = 0;
-    ENTROPY_CONTEXT_PLANES t_above, t_left;
-    ENTROPY_CONTEXT *ta;
-    ENTROPY_CONTEXT *tl;
-
-    vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;

    vp8_intra_prediction_down_copy(xd);

@@ -243,10 +231,8 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
        B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
        int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d);

-        error += pick_intra4x4block(rtcd,
-                                    mb, mb->block + i, xd->block + i, &best_mode, A, L,
-                                    ta + vp8_block2above[i],
-                                    tl + vp8_block2left[i], &r, &d);
+        pick_intra4x4block(rtcd, mb, mb->block + i, xd->block + i,
+                               &best_mode, A, L, &r, &d);

        cost += r;
        distortion += d;
@@ -264,10 +250,15 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
    *Rate = cost;

    if (i == 16)
+    {
        *best_dist = distortion;
+        error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, distortion);
+    }
    else
+    {
        *best_dist = INT_MAX;
-
+        error = INT_MAX;
+    }

    return error;
 }
@@ -421,7 +412,6 @@ int vp8_pick_intra_mbuv_mode(MACROBLOCK *mb)

 }

-
 int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
 {
    BLOCK *b = &x->block[0];
@@ -430,7 +420,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
    B_MODE_INFO best_bmodes[16];
    MB_MODE_INFO best_mbmode;
    PARTITION_INFO best_partition;
-    MV best_ref_mv1;
+    MV best_ref_mv;
    MV mode_mv[MB_MODE_COUNT];
    MB_PREDICTION_MODE this_mode;
    int num00;
@@ -448,9 +438,14 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
    int best_mode_index = 0;
    int sse = INT_MAX;

+    MV mvp;
+    int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    int saddone=0;
+    int sr=0;    // search range obtained from vp8_mv_pred(), expressed in step_param levels (0-7)
+
    MV nearest_mv[4];
    MV near_mv[4];
-    MV best_ref_mv[4];
+    MV frame_best_ref_mv[4];
    int MDCounts[4][4];
    unsigned char *y_buffer[4];
    unsigned char *u_buffer[4];
@@ -470,7 +465,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];

        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[LAST_FRAME], &near_mv[LAST_FRAME],
-                          &best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);
+                          &frame_best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);

        y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset;
        u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
@@ -484,7 +479,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
        YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx];

        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[GOLDEN_FRAME], &near_mv[GOLDEN_FRAME],
-                          &best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);
+                          &frame_best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);

        y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset;
        u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
@@ -498,7 +493,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
        YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];

        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[ALTREF_FRAME], &near_mv[ALTREF_FRAME],
-                          &best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);
+                          &frame_best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);

        y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
        u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
@@ -538,10 +533,6 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                                        + vp8_cost_one(cpi->prob_gf_coded);
    }

-
-
-    best_rd = INT_MAX;
-
    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

    // if we encode a new mv this is important
@@ -604,7 +595,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
            x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
            mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
            mode_mv[NEARMV] = near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
-            best_ref_mv1 = best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
+            best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
            memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
        }

@@ -617,6 +608,28 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                continue;
        }

+        if(cpi->sf.improved_mv_pred && x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
+        {
+            if(!saddone)
+            {
+                vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+                saddone = 1;
+            }
+
+            vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+                        x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
+            /* adjust mvp to make sure it is within MV range */
+            if(mvp.row > best_ref_mv.row + MAX_FULL_PEL_VAL)
+                mvp.row = best_ref_mv.row + MAX_FULL_PEL_VAL;
+            else if(mvp.row < best_ref_mv.row - MAX_FULL_PEL_VAL)
+                mvp.row = best_ref_mv.row - MAX_FULL_PEL_VAL;
+            if(mvp.col > best_ref_mv.col + MAX_FULL_PEL_VAL)
+                mvp.col = best_ref_mv.col + MAX_FULL_PEL_VAL;
+            else if(mvp.col < best_ref_mv.col - MAX_FULL_PEL_VAL)
+                mvp.col = best_ref_mv.col - MAX_FULL_PEL_VAL;
+        }
+
        switch (this_mode)
        {
        case B_PRED:
@@ -672,61 +685,59 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
            int n = 0;
            int sadpb = x->sadperbit16;

+            int col_min;
+            int col_max;
+            int row_min;
+            int row_max;
+
+            int tmp_col_min = x->mv_col_min;
+            int tmp_col_max = x->mv_col_max;
+            int tmp_row_min = x->mv_row_min;
+            int tmp_row_max = x->mv_row_max;
+
+            int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8)? 3 : 2) : 1;
+
            // Further step/diamond searches as necessary
-            if (cpi->Speed < 8)
+            step_param = cpi->sf.first_step + speed_adjust;
+
+            if(cpi->sf.improved_mv_pred)
            {
-                step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
-                further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-            }
-            else
+                sr += speed_adjust;
+                //adjust search range according to sr from mv prediction
+                if(sr > step_param)
+                    step_param = sr;
+
+                col_min = (best_ref_mv.col - MAX_FULL_PEL_VAL) >>3;
+                col_max = (best_ref_mv.col + MAX_FULL_PEL_VAL) >>3;
+                row_min = (best_ref_mv.row - MAX_FULL_PEL_VAL) >>3;
+                row_max = (best_ref_mv.row + MAX_FULL_PEL_VAL) >>3;
+
+                // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search.
+                if (x->mv_col_min < col_min )
+                    x->mv_col_min = col_min;
+                if (x->mv_col_max > col_max )
+                    x->mv_col_max = col_max;
+                if (x->mv_row_min < row_min )
+                    x->mv_row_min = row_min;
+                if (x->mv_row_max > row_max )
+                    x->mv_row_max = row_max;
+            }else
            {
-                step_param = cpi->sf.first_step + 2;
-                further_steps = 0;
+                mvp.row = best_ref_mv.row;
+                mvp.col = best_ref_mv.col;
            }

-#if 0
-
-            // Initial step Search
-            bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost, &best_ref_mv1);
-            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-
-            // Further step searches
-            while (n < further_steps)
-            {
-                n++;
-
-                if (num00)
-                    num00--;
-                else
-                {
-                    thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost, &best_ref_mv1);
-
-                    if (thissme < bestsme)
-                    {
-                        bestsme = thissme;
-                        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-                        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-                    }
-                    else
-                    {
-                        d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
-                        d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
-                    }
-                }
-            }
-
-#else
+            further_steps = (cpi->Speed >= 8)? 0: (cpi->sf.max_step_search_steps - 1 - step_param);

            if (cpi->sf.search_method == HEX)
            {
-                bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost);
+                bestsme = vp8_hex_search(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv);
                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
            }
            else
            {
-                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb < 9
+                bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb < 9
                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;

@@ -745,7 +756,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                        num00--;
                    else
                    {
-                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb = 9
+                        thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb = 9

                        if (thissme < bestsme)
                        {
@@ -762,19 +773,24 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                }
            }

-#endif
+            if(cpi->sf.improved_mv_pred)
+            {
+                x->mv_col_min = tmp_col_min;
+                x->mv_col_max = tmp_col_max;
+                x->mv_row_min = tmp_row_min;
+                x->mv_row_max = tmp_row_max;
+            }
+
+            if (bestsme < INT_MAX)
+                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
+
+            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+            // mv cost;
+            rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128);
        }

-        if (bestsme < INT_MAX)
-            cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
-
-        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-
-        // mv cost;
-        rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv1, cpi->mb.mvcost, 128);
-
-
        case NEARESTMV:
        case NEARMV:

--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -296,7 +296,6 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
    int filt_err = 0;
    int min_filter_level;
    int max_filter_level;
-    int prediction_difference = (int)(100 * abs((int)(cpi->last_auto_filter_prediction_error - cpi->prediction_error)) / (1 + cpi->prediction_error));

    int filter_step;
    int filt_high = 0;
@@ -478,6 +477,5 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
    cpi->last_auto_filt_val = filt_best;
    cpi->last_auto_filt_q  = cm->base_qindex;

-    cpi->last_auto_filter_prediction_error = cpi->prediction_error;
    cpi->frames_since_auto_filter = 0;
 }
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -129,9 +129,6 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
        rc   = vp8_default_zig_zag1d[i];
        z    = coeff_ptr[rc];

-        //if ( i == 0 )
-        //    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value/2;
-        //else
        zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;

        zbin_boost_ptr ++;
@@ -144,13 +141,13 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
            y  = (((x * quant_ptr[rc]) >> 16) + x)
                 >> quant_shift_ptr[rc];                // quantize (x)
            x  = (y ^ sz) - sz;                         // get the sign back
-            qcoeff_ptr[rc]  = x;                         // write to destination
-            dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+            qcoeff_ptr[rc]  = x;                        // write to destination
+            dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value

            if (y)
            {
                eob = i;                                // last nonzero coeffs
-                zbin_boost_ptr = &b->zrun_zbin_boost[0];    // reset zero runlength
+                zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
            }
        }
    }
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -43,7 +43,9 @@
 #endif


-void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
+extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
+extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
+

 #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )

@@ -241,10 +243,9 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
            cpi->RDMULT += (cpi->RDMULT * rd_iifactor[cpi->next_iiratio]) >> 4;
    }

-    if (cpi->RDMULT < 125)
-        cpi->RDMULT = 125;
-
    cpi->mb.errorperbit = (cpi->RDMULT / 100);
+    cpi->mb.errorperbit += (cpi->mb.errorperbit==0);
+
    vp8_set_speed_features(cpi);

    if (cpi->common.simpler_lpf)
@@ -537,15 +538,79 @@ static int vp8_rdcost_mby(MACROBLOCK *mb)
    return cost;
 }

+static void macro_block_yrd( MACROBLOCK *mb,
+                             int *Rate,
+                             int *Distortion,
+                             const vp8_encodemb_rtcd_vtable_t *rtcd)
+{
+    int b;
+    MACROBLOCKD *const x = &mb->e_mbd;
+    BLOCK   *const mb_y2 = mb->block + 24;
+    BLOCKD *const x_y2  = x->block + 24;
+    short *Y2DCPtr = mb_y2->src_diff;
+    BLOCK *beptr;
+    int d;

-static void rd_pick_intra4x4block(
+    ENCODEMB_INVOKE(rtcd, submby)( mb->src_diff, mb->src.y_buffer,
+                                   mb->e_mbd.predictor, mb->src.y_stride );
+
+    // Fdct and building the 2nd order block
+    for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
+    {
+        mb->vp8_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
+        *Y2DCPtr++ = beptr->coeff[0];
+        *Y2DCPtr++ = beptr->coeff[16];
+    }
+
+    // 2nd order fdct
+    mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
+
+    // Quantization
+    for (b = 0; b < 16; b++)
+    {
+        mb->quantize_b(&mb->block[b], &mb->e_mbd.block[b]);
+    }
+
+    // DC prediction and Quantization of 2nd Order block
+    mb->quantize_b(mb_y2, x_y2);
+
+    // Distortion
+    d = ENCODEMB_INVOKE(rtcd, mberr)(mb, 1) << 2;
+    d += ENCODEMB_INVOKE(rtcd, berr)(mb_y2->coeff, x_y2->dqcoeff);
+
+    *Distortion = (d >> 4);
+
+    // rate
+    *Rate = vp8_rdcost_mby(mb);
+}
+
+static void save_predictor(unsigned char *predictor, unsigned char *dst)
+{   // Back up a 4x4 block (rows 16 bytes apart in predictor) as 16 contiguous bytes in dst.
+    int r;
+    for (r = 0; r < 4; r++)
+    {
+        memcpy(dst, predictor, 4);  // one 4-byte row per iteration
+        dst += 4;
+        predictor += 16;
+    }
+}
+static void restore_predictor(unsigned char *predictor, unsigned char *dst)
+{   // Copy the 16 contiguous bytes at dst back into the 4x4 block at predictor (rows 16 bytes apart); dst is the source here.
+    int r;
+    for (r = 0; r < 4; r++)
+    {
+        memcpy(predictor, dst, 4);  // one 4-byte row per iteration
+        dst += 4;
+        predictor += 16;
+    }
+}
+static int rd_pick_intra4x4block(
    VP8_COMP *cpi,
    MACROBLOCK *x,
    BLOCK *be,
    BLOCKD *b,
    B_PREDICTION_MODE *best_mode,
-    B_PREDICTION_MODE above,
-    B_PREDICTION_MODE left,
+    unsigned int *bmode_costs,
    ENTROPY_CONTEXT *a,
    ENTROPY_CONTEXT *l,

@@ -554,31 +619,27 @@ static void rd_pick_intra4x4block(
    int *bestdistortion)
 {
    B_PREDICTION_MODE mode;
-    int best_rd = INT_MAX;       // 1<<30
+    int best_rd = INT_MAX;
    int rate = 0;
    int distortion;
-    unsigned int *mode_costs;

    ENTROPY_CONTEXT ta = *a, tempa = *a;
    ENTROPY_CONTEXT tl = *l, templ = *l;

-
-    if (x->e_mbd.frame_type == KEY_FRAME)
-    {
-        mode_costs  = x->bmode_costs[above][left];
-    }
-    else
-    {
-        mode_costs = x->inter_bmode_costs;
-    }
+    DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16);
+    DECLARE_ALIGNED_ARRAY(16, short, dqcoeff, 16);

    for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++)
    {
        int this_rd;
        int ratey;

-        rate = mode_costs[mode];
-        vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, mode);
+        rate = bmode_costs[mode];
+
+        vp8_predict_intra4x4(b, mode, b->predictor);
+        ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16);
+        x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
+        x->quantize_b(be, b);

        tempa = ta;
        templ = tl;
@@ -598,25 +659,36 @@ static void rd_pick_intra4x4block(
            *best_mode = mode;
            *a = tempa;
            *l = templ;
+            save_predictor(b->predictor, predictor);
+            vpx_memcpy(dqcoeff, b->dqcoeff, 32);
        }
    }

    b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
-    vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, b->bmi.mode);
+
+    restore_predictor(b->predictor, predictor);
+    vpx_memcpy(b->dqcoeff, dqcoeff, 32);
+
+    IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(b->dqcoeff, b->diff, 32);
+    RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+    return best_rd;

 }

-
-int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, int *Distortion)
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
+                                  int *rate_y, int *Distortion, int best_rd)
 {
    MACROBLOCKD *const xd = &mb->e_mbd;
    int i;
    int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
    int distortion = 0;
    int tot_rate_y = 0;
+    int total_rd = 0;
    ENTROPY_CONTEXT_PLANES t_above, t_left;
    ENTROPY_CONTEXT *ta;
    ENTROPY_CONTEXT *tl;
+    unsigned int *bmode_costs;

    vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
    vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
@@ -626,17 +698,25 @@ int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int

    vp8_intra_prediction_down_copy(xd);

+    bmode_costs = mb->inter_bmode_costs;
+
    for (i = 0; i < 16; i++)
    {
        MODE_INFO *const mic = xd->mode_info_context;
        const int mis = xd->mode_info_stride;
-        const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode;
-        const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode;
        B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
        int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);

-        rd_pick_intra4x4block(
-            cpi, mb, mb->block + i, xd->block + i, &best_mode, A, L,
+        if (mb->e_mbd.frame_type == KEY_FRAME)
+        {
+            const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode;
+            const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode;
+
+            bmode_costs  = mb->bmode_costs[A][L];
+        }
+
+        total_rd += rd_pick_intra4x4block(
+            cpi, mb, mb->block + i, xd->block + i, &best_mode, bmode_costs,
            ta + vp8_block2above[i],
            tl + vp8_block2left[i], &r, &ry, &d);

@@ -644,42 +724,43 @@ int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int
        distortion += d;
        tot_rate_y += ry;
        mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode;
+
+        if(total_rd >= best_rd)
+          break;
    }

+    if(total_rd >= best_rd)
+      return INT_MAX;
+
    *Rate = cost;
    *rate_y += tot_rate_y;
    *Distortion = distortion;

    return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
-
-int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *Rate, int *rate_y, int *Distortion)
+int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
+                                   MACROBLOCK *x,
+                                   int *Rate,
+                                   int *rate_y,
+                                   int *Distortion)
 {
-
    MB_PREDICTION_MODE mode;
    MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
    int rate, ratey;
-    unsigned int distortion;
+    int distortion;
    int best_rd = INT_MAX;
+    int this_rd;

    //Y Search for 16x16 intra prediction mode
    for (mode = DC_PRED; mode <= TM_PRED; mode++)
    {
-        int this_rd;
-        int dummy;
-        rate = 0;
-
        x->e_mbd.mode_info_context->mbmi.mode = mode;

-        rate += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+        vp8_build_intra_predictors_mby_ptr(&x->e_mbd);

-        vp8_encode_intra16x16mbyrd(IF_RTCD(&cpi->rtcd), x);
-
-        ratey = vp8_rdcost_mby(x);
-
-        rate += ratey;
-
-        VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer, x->src.y_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride, &distortion, &dummy);
+        macro_block_yrd(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd.encodemb));
+        rate = ratey + x->mbmode_cost[x->e_mbd.frame_type]
+                                     [x->e_mbd.mode_info_context->mbmi.mode];

        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);

@@ -689,7 +770,7 @@ int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *Rate, int
            best_rd = this_rd;
            *Rate = rate;
            *rate_y = ratey;
-            *Distortion = (int)distortion;
+            *Distortion = distortion;
        }
    }

@@ -697,7 +778,6 @@ int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *Rate, int
    return best_rd;
 }

-
 static int rd_cost_mbuv(MACROBLOCK *mb)
 {
    int b;
@@ -725,15 +805,6 @@ static int rd_cost_mbuv(MACROBLOCK *mb)
 }


-unsigned int vp8_get_mbuvrecon_error(const vp8_variance_rtcd_vtable_t *rtcd, const MACROBLOCK *x) // sum of squares
-{
-    unsigned int sse0, sse1;
-    int sum0, sum1;
-    VARIANCE_INVOKE(rtcd, get8x8var)(x->src.u_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer, x->e_mbd.dst.uv_stride, &sse0, &sum0);
-    VARIANCE_INVOKE(rtcd, get8x8var)(x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride, &sse1, &sum1);
-    return (sse0 + sse1);
-}
-
 static int vp8_rd_inter_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int fullpixel)
 {
    vp8_build_uvmvs(&x->e_mbd, fullpixel);
@@ -761,12 +832,17 @@ int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *ra
        int this_rd;

        x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-        vp8_encode_intra16x16mbuvrd(IF_RTCD(&cpi->rtcd), x);
+        vp8_build_intra_predictors_mbuv(&x->e_mbd);
+        ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
+                      x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor,
+                      x->src.uv_stride);
+        vp8_transform_mbuv(x);
+        vp8_quantize_mbuv(x);

        rate_to = rd_cost_mbuv(x);
        rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.uv_mode];

-        distortion = vp8_get_mbuvrecon_error(IF_RTCD(&cpi->rtcd.variance), x);
+        distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;

        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);

@@ -938,48 +1014,6 @@ static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels
    return distortion;
 }

-static void macro_block_yrd(MACROBLOCK *mb, int *Rate, int *Distortion, const vp8_encodemb_rtcd_vtable_t *rtcd)
-{
-    int b;
-    MACROBLOCKD *const x = &mb->e_mbd;
-    BLOCK   *const mb_y2 = mb->block + 24;
-    BLOCKD *const x_y2  = x->block + 24;
-    short *Y2DCPtr = mb_y2->src_diff;
-    BLOCK *beptr;
-    int d;
-
-    ENCODEMB_INVOKE(rtcd, submby)(mb->src_diff, mb->src.y_buffer, mb->e_mbd.predictor, mb->src.y_stride);
-
-    // Fdct and building the 2nd order block
-    for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
-    {
-        mb->vp8_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
-        *Y2DCPtr++ = beptr->coeff[0];
-        *Y2DCPtr++ = beptr->coeff[16];
-    }
-
-    // 2nd order fdct
-    mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
-
-    // Quantization
-    for (b = 0; b < 16; b++)
-    {
-        mb->quantize_b(&mb->block[b], &mb->e_mbd.block[b]);
-    }
-
-    // DC predication and Quantization of 2nd Order block
-    mb->quantize_b(mb_y2, x_y2);
-
-    // Distortion
-    d = ENCODEMB_INVOKE(rtcd, mberr)(mb, 1) << 2;
-    d += ENCODEMB_INVOKE(rtcd, berr)(mb_y2->coeff, x_y2->dqcoeff);
-
-    *Distortion = (d >> 4);
-
-    // rate
-    *Rate = vp8_rdcost_mby(mb);
-}
-
 unsigned char vp8_mbsplit_offset2[4][16] = {
    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
@@ -1140,7 +1174,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,

                    if (cpi->sf.search_method == HEX)
                        bestsme = vp8_hex_search(x, c, e, bsi->ref_mv,
-                                                 &mode_mv[NEW4X4], step_param, sadpb, &num00, v_fn_ptr, x->mvsadcost, x->mvcost);
+                                                 &mode_mv[NEW4X4], step_param, sadpb, &num00, v_fn_ptr, x->mvsadcost, x->mvcost, bsi->ref_mv);

                    else
                    {
@@ -1420,48 +1454,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,

    return bsi.segment_rd;
 }
-
-
-static void mv_bias(const MODE_INFO *x, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
-{
-    MV xmv;
-    xmv = x->mbmi.mv.as_mv;
-
-    if (ref_frame_sign_bias[x->mbmi.ref_frame] != ref_frame_sign_bias[refframe])
-    {
-        xmv.row *= -1;
-        xmv.col *= -1;
-    }
-
-    mvp->as_mv = xmv;
-}
-
-static void lf_mv_bias(const int lf_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
-{
-    MV xmv;
-    xmv = mvp->as_mv;
-
-    if (lf_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
-    {
-        xmv.row *= -1;
-        xmv.col *= -1;
-    }
-
-    mvp->as_mv = xmv;
-}
-
-static void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
-{
-    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
-        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
-    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
-        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
-
-    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
-        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
-    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
-        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
-}
+#endif

 static void swap(int *x,int *y)
 {
@@ -1546,7 +1539,7 @@ static void quicksortsad(int arr[],int idx[], int left, int right)
 }

 //The improved MV prediction
-static void vp8_mv_pred
+void vp8_mv_pred
 (
    VP8_COMP *cpi,
    MACROBLOCKD *xd,
@@ -1561,67 +1554,67 @@ static void vp8_mv_pred
    const MODE_INFO *above = here - xd->mode_info_stride;
    const MODE_INFO *left = here - 1;
    const MODE_INFO *aboveleft = above - 1;
-    int_mv           near_mvs[7];
-    int              near_ref[7];
+    int_mv           near_mvs[8];
+    int              near_ref[8];
    int_mv           mv;
    int              vcnt=0;
    int              find=0;
    int              mb_offset;

-    int              mvx[7];
-    int              mvy[7];
+    int              mvx[8];
+    int              mvy[8];
    int              i;

    mv.as_int = 0;

    if(here->mbmi.ref_frame != INTRA_FRAME)
    {
-        near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int = near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int = near_mvs[6].as_int = 0;
-        near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] = near_ref[5] = near_ref[6] = 0;
+        near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int = near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int = near_mvs[6].as_int = near_mvs[7].as_int = 0;
+        near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] = near_ref[5] = near_ref[6] = near_ref[7] = 0;

        // read in 3 nearby block's MVs from current frame as prediction candidates.
        if (above->mbmi.ref_frame != INTRA_FRAME)
        {
            near_mvs[vcnt].as_int = above->mbmi.mv.as_int;
-            mv_bias(above, refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
            near_ref[vcnt] =  above->mbmi.ref_frame;
        }
        vcnt++;
        if (left->mbmi.ref_frame != INTRA_FRAME)
        {
            near_mvs[vcnt].as_int = left->mbmi.mv.as_int;
-            mv_bias(left, refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
            near_ref[vcnt] =  left->mbmi.ref_frame;
        }
        vcnt++;
        if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
        {
            near_mvs[vcnt].as_int = aboveleft->mbmi.mv.as_int;
-            mv_bias(aboveleft, refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
            near_ref[vcnt] =  aboveleft->mbmi.ref_frame;
        }
        vcnt++;

-        // read in 4 nearby block's MVs from last frame.
+        // read in 5 nearby block's MVs from last frame.
        if(cpi->common.last_frame_type != KEY_FRAME)
        {
-            mb_offset = (-xd->mb_to_top_edge/128 + 1) * (xd->mode_info_stride) + (-xd->mb_to_left_edge/128 +1) ;
+            mb_offset = (-xd->mb_to_top_edge/128 + 1) * (xd->mode_info_stride +1) + (-xd->mb_to_left_edge/128 +1) ;

            // current in last frame
            if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME)
            {
                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
-                lf_mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset];
            }
            vcnt++;

            // above in last frame
-            if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride] != INTRA_FRAME)
+            if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride-1] != INTRA_FRAME)
            {
-                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride].as_int;
-                lf_mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
-                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset - xd->mode_info_stride];
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride-1].as_int;
+                mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride-1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset - xd->mode_info_stride-1];
            }
            vcnt++;

@@ -1629,17 +1622,26 @@ static void vp8_mv_pred
            if (cpi->lf_ref_frame[mb_offset-1] != INTRA_FRAME)
            {
                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset -1].as_int;
-                lf_mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset -1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset -1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset - 1];
            }
            vcnt++;

-            // aboveleft in last frame
-            if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride -1] != INTRA_FRAME)
+            // right in last frame
+            if (cpi->lf_ref_frame[mb_offset +1] != INTRA_FRAME)
            {
-                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride -1].as_int;
-                lf_mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride -1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
-                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset - xd->mode_info_stride -1];
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset +1].as_int;
+                mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset +1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset +1];
+            }
+            vcnt++;
+
+            // below in last frame
+            if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride +1] != INTRA_FRAME)
+            {
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + xd->mode_info_stride +1].as_int;
+                mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride +1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset + xd->mode_info_stride +1];
            }
            vcnt++;
        }
@@ -1652,9 +1654,7 @@ static void vp8_mv_pred
                {
                    mv.as_int = near_mvs[near_sadidx[i]].as_int;
                    find = 1;
-                    if(vcnt<2)
-                        *sr = 4;
-                    else if (vcnt<4)
+                    if (i < 3)
                        *sr = 3;
                    else
                        *sr = 2;
@@ -1687,6 +1687,62 @@ static void vp8_mv_pred
    vp8_clamp_mv(mvp, xd);
 }

+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[])
+{
+
+    int near_sad[8] = {0}; // 0-cf above, 1-cf left, 2-cf aboveleft, 3-lf current, 4-lf above, 5-lf left, 6-lf right, 7-lf below
+
+    //calculate sad for current frame 3 nearby MBs.
+    if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
+    {
+        near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
+    }else if(xd->mb_to_top_edge==0)
+    {   //only has left MB for sad calculation.
+        near_sad[0] = near_sad[2] = INT_MAX;
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
+    }else if(xd->mb_to_left_edge ==0)
+    {   //only has above MB for sad calculation.
+        near_sad[1] = near_sad[2] = INT_MAX;
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
+    }else
+    {
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, 0x7fffffff);
+    }
+
+    if(cpi->common.last_frame_type != KEY_FRAME)
+    {
+        //calculate sad for last frame 5 nearby MBs.
+        unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
+        int pre_y_stride = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
+
+        if(xd->mb_to_top_edge==0) near_sad[4] = INT_MAX;
+        if(xd->mb_to_left_edge ==0) near_sad[5] = INT_MAX;
+        if(xd->mb_to_right_edge ==0) near_sad[6] = INT_MAX;
+        if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
+
+        if(near_sad[4] != INT_MAX)
+            near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, 0x7fffffff);
+        if(near_sad[5] != INT_MAX)
+            near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - 16, pre_y_stride, 0x7fffffff);
+        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
+        if(near_sad[6] != INT_MAX)
+            near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer + 16, pre_y_stride, 0x7fffffff);
+        if(near_sad[7] != INT_MAX)
+            near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, 0x7fffffff);
+    }
+
+    if(cpi->common.last_frame_type != KEY_FRAME)
+    {
+        quicksortsad(near_sad, near_sadidx, 0, 7);
+    }else
+    {
+        quicksortsad(near_sad, near_sadidx, 0, 2);
+    }
+}
+
+#if !(CONFIG_REALTIME_ONLY)
 int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
 {
    BLOCK *b = &x->block[0];
@@ -1724,8 +1780,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
    int force_no_skip = 0;

    MV mvp;
-    int near_sad[7]; // 0-cf above, 1-cf left, 2-cf aboveleft, 3-lf current, 4-lf above, 5-lf left, 6-lf aboveleft
-    int near_sadidx[7] = {0, 1, 2, 3, 4, 5, 6};
+    int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    int saddone=0;
    int sr=0;    //search range got from mv_pred(). It uses step_param levels. (0-7)

@@ -1871,67 +1926,11 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
            lf_or_gf = frame_lf_or_gf[x->e_mbd.mode_info_context->mbmi.ref_frame];
        }

-
        if(x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
        {
            if(!saddone)
            {
-                //calculate sad for current frame 3 nearby MBs.
-                if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
-                {
-                    near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
-                }else if(xd->mb_to_top_edge==0)
-                {   //only has left MB for sad calculation.
-                    near_sad[0] = near_sad[2] = INT_MAX;
-                    near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
-                }else if(xd->mb_to_left_edge ==0)
-                {   //only has left MB for sad calculation.
-                    near_sad[1] = near_sad[2] = INT_MAX;
-                    near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
-                }else
-                {
-                    near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
-                    near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
-                    near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, 0x7fffffff);
-                }
-
-                if(cpi->common.last_frame_type != KEY_FRAME)
-                {
-                    //calculate sad for last frame 4 nearby MBs.
-                    unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
-                    int pre_y_stride = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
-
-                    if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
-                    {
-                        near_sad[4] = near_sad[5] = near_sad[6] = INT_MAX;
-                        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
-                    }else if(xd->mb_to_top_edge==0)
-                    {   //only has left MB for sad calculation.
-                        near_sad[4] = near_sad[6] = INT_MAX;
-                        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
-                        near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - 16, pre_y_stride, 0x7fffffff);
-                    }else if(xd->mb_to_left_edge ==0)
-                    {   //only has left MB for sad calculation.
-                        near_sad[5] = near_sad[6] = INT_MAX;
-                        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
-                        near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, 0x7fffffff);
-                    }else
-                    {
-                        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
-                        near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, 0x7fffffff);
-                        near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - 16, pre_y_stride, 0x7fffffff);
-                        near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - pre_y_stride *16 -16, pre_y_stride, 0x7fffffff);
-                    }
-                }
-
-                if(cpi->common.last_frame_type != KEY_FRAME)
-                {
-                    quicksortsad(near_sad, near_sadidx, 0, 6);
-                }else
-                {
-                    quicksortsad(near_sad, near_sadidx, 0, 2);
-                }
-
+                vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
                saddone = 1;
            }

@@ -1990,27 +1989,34 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                    cpi->zbin_mode_boost = MV_ZBIN_BOOST;
            }

-            vp8cx_mb_init_quantizer(cpi, x);
+            vp8_update_zbin_extra(cpi, x);
        }

        switch (this_mode)
        {
        case B_PRED:
+        {
+            int tmp_rd;

-            for (i = 0; i < 16; i++)
-            {
-                vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO));
-            }
            // Note the rate value returned here includes the cost of coding the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED];
-            vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion);
+            tmp_rd = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd);
            rate2 += rate;
-
            distortion2 += distortion;
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-            break;
+
+            if(tmp_rd < best_yrd)
+            {
+                rate2 += uv_intra_rate;
+                rate_uv = uv_intra_rate_tokenonly;
+                distortion2 += uv_intra_distortion;
+                distortion_uv = uv_intra_distortion;
+            }
+            else
+            {
+                this_rd = INT_MAX;
+                disable_skip = 1;
+            }
+        }
+        break;

        case SPLITMV:
        {
@@ -2046,22 +2052,16 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
        case V_PRED:
        case H_PRED:
        case TM_PRED:
-            for (i = 0; i < 16; i++)
-            {
-                vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO));
-            }
            x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
-            {
-                macro_block_yrd(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ;
-                rate2 += rate_y;
-                distortion2 += distortion;
-                rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
-                rate2 += uv_intra_rate;
-                rate_uv = uv_intra_rate_tokenonly;
-                distortion2 += uv_intra_distortion;
-                distortion_uv = uv_intra_distortion;
-            }
+            macro_block_yrd(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ;
+            rate2 += rate_y;
+            distortion2 += distortion;
+            rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+            rate2 += uv_intra_rate;
+            rate_uv = uv_intra_rate_tokenonly;
+            distortion2 += uv_intra_distortion;
+            distortion_uv = uv_intra_distortion;
            break;

        case NEWMV:
@@ -2116,7 +2116,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int

                    if (cpi->sf.search_method == HEX)
                    {
-                        bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost);
+                        bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv);
                        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
                    }
@@ -2266,22 +2266,28 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
            else if (x->encode_breakout)
            {
                int sum, sse;
+                int threshold = (xd->block[0].dequant[1]
+                            * xd->block[0].dequant[1] >>4);
+
+                if(threshold < x->encode_breakout)
+                    threshold = x->encode_breakout;

                VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)
                    (x->src.y_buffer, x->src.y_stride,
                     x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum);

-                if (sse < x->encode_breakout)
+                if (sse < threshold)
                {
                    // Check u and v to make sure skip is ok
                    int sse2 = 0;
-
-                    // add dc check
-                    if (abs(sum) < (cpi->common.Y2dequant[0][0] << 2))
+                    /* If there is no codeable 2nd order dc
+                       or a very small uniform pixel change */
+                    if (abs(sum) < (xd->block[24].dequant[0]<<2)||
+                        ((sum * sum>>8) > sse && abs(sum) <128))
                    {
                        sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));

-                        if (sse2 * 2 < x->encode_breakout)
+                        if (sse2 * 2 < threshold)
                        {
                            x->skip = 1;
                            distortion2 = sse + sse2;
@@ -2427,6 +2433,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int

        if (x->skip)
            break;
+
    }

    // Reduce the activation RD thresholds for the best choice mode
@@ -2497,6 +2504,15 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
    }


+    if(best_mbmode.mode <= B_PRED)
+    {
+        int i;
+        for (i = 0; i < 16; i++)
+        {
+            best_bmodes[i].mv.as_int = 0;
+        }
+    }
+
    // macroblock modes
    vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
@@ -2511,4 +2527,3 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
    return best_rd;
 }
 #endif
-
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -12,10 +12,22 @@
 #ifndef __INC_RDOPT_H
 #define __INC_RDOPT_H
 void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
-int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion);
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion, int best_rd);
 int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion);
 int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_to, int *distortion);
 extern int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);

+extern void vp8_mv_pred
+(
+    VP8_COMP *cpi,
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    MV *mvp,
+    int refframe,
+    int *ref_frame_sign_bias,
+    int *sr,
+    int near_sadidx[]
+);
+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]);

 #endif
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -70,7 +70,7 @@ static void vp8_temporal_filter_predictors_mb_c
    // U & V
    mv_row >>= 1;
    mv_col >>= 1;
-    stride >>= 1;
+    stride = (stride + 1) >> 1;
    offset = (mv_row >> 3) * stride + (mv_col >> 3);
    uptr = u_mb_ptr + offset;
    vptr = v_mb_ptr + offset;
@@ -204,7 +204,7 @@ static int vp8_temporal_filter_find_matching_mb_c
            step_param,
            sadpb/*x->errorperbit*/,
            &num00, &cpi->fn_ptr[BLOCK_16X16],
-            mvsadcost, mvcost);
+            mvsadcost, mvcost, &best_ref_mv1);
    }
    else
    {
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -11,220 +11,169 @@
 %include "vpx_ports/x86_abi_support.asm"


-;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
-;               short *qcoeff_ptr,short *dequant_ptr,
-;               const int *default_zig_zag, short *round_ptr,
-;               short *quant_ptr, short *dqcoeff_ptr,
+;int vp8_regular_quantize_b_impl_sse2(
+;               short *coeff_ptr,
+;               short *zbin_ptr,
+;               short *qcoeff_ptr,
+;               short *dequant_ptr,
+;               const int *default_zig_zag,
+;               short *round_ptr,
+;               short *quant_ptr,
+;               short *dqcoeff_ptr,
 ;               unsigned short zbin_oq_value,
-;               short *zbin_boost_ptr);
+;               short *zbin_boost_ptr,
+;               short *quant_shift);
 ;
 global sym(vp8_regular_quantize_b_impl_sse2)
 sym(vp8_regular_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 10
+    SHADOW_ARGS_TO_STACK 11
+    SAVE_XMM
    push        rsi
    push        rdi
    push        rbx
+    ALIGN_STACK 16, rax
+    %define abs_minus_zbin    0
+    %define temp_qcoeff       32
+    %define qcoeff            64
+    %define eob_tmp           96
+    %define stack_size        112
+    sub         rsp, stack_size
    ; end prolog

-    ALIGN_STACK 16, rax
-
-    %define abs_minus_zbin_lo 0
-    %define abs_minus_zbin_hi 16
-    %define temp_qcoeff_lo 32
-    %define temp_qcoeff_hi 48
-    %define save_xmm6 64
-    %define save_xmm7 80
-    %define eob 96
-
-    %define vp8_regularquantizeb_stack_size eob + 16
-
-    sub         rsp, vp8_regularquantizeb_stack_size
-
-    movdqa      OWORD PTR[rsp + save_xmm6], xmm6
-    movdqa      OWORD PTR[rsp + save_xmm7], xmm7
-
-    mov         rdx, arg(0)                 ;coeff_ptr
-    mov         eax, arg(8)                 ;zbin_oq_value
-
-    mov         rcx, arg(1)                 ;zbin_ptr
-    movd        xmm7, eax
+    mov         rdx, arg(0)                 ; coeff_ptr
+    mov         rcx, arg(1)                 ; zbin_ptr
+    movd        xmm7, arg(8)                ; zbin_oq_value
+    mov         rdi, arg(5)                 ; round_ptr
+    mov         rsi, arg(6)                 ; quant_ptr

+    ; z
    movdqa      xmm0, OWORD PTR[rdx]
    movdqa      xmm4, OWORD PTR[rdx + 16]

+    pshuflw     xmm7, xmm7, 0
+    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
+
    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

-    psraw       xmm0, 15                    ;sign of z (aka sz)
-    psraw       xmm4, 15                    ;sign of z (aka sz)
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    movdqa      xmm2, OWORD PTR[rcx]        ;load zbin_ptr
-    movdqa      xmm3, OWORD PTR[rcx + 16]   ;load zbin_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    psubw       xmm1, xmm0                  ;x = abs(z)
-
-    punpcklwd   xmm7, xmm7                  ;duplicated zbin_oq_value
-    psubw       xmm5, xmm4                  ;x = abs(z)
-
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    psubw       xmm1, xmm2                  ;sub (zbin_ptr + zbin_oq_value)
-    psubw       xmm5, xmm3                  ;sub (zbin_ptr + zbin_oq_value)
-
-    mov         rdi, arg(5)                 ;round_ptr
-    mov         rsi, arg(6)                 ;quant_ptr
-
-    movdqa      OWORD PTR[rsp + abs_minus_zbin_lo], xmm1
-    movdqa      OWORD PTR[rsp + abs_minus_zbin_hi], xmm5
-
-    paddw       xmm1, xmm2                  ;add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm5, xmm3                  ;add (zbin_ptr + zbin_oq_value) back
-
-    movdqa      xmm2, OWORD PTR[rdi]
-    movdqa      xmm3, OWORD PTR[rsi]
-
-    movdqa      xmm6, OWORD PTR[rdi + 16]
-    movdqa      xmm7, OWORD PTR[rsi + 16]
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    pmulhw      xmm1, xmm3
-    pmulhw      xmm5, xmm7
-
-    mov         rsi, arg(2)                 ;qcoeff_ptr
-    pxor        xmm6, xmm6
+    ; sz
+    psraw       xmm0, 15
+    psraw       xmm4, 15

+    ; (z ^ sz)
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

+    ; x = abs(z)
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

-    movdqa      OWORD PTR[rsp + temp_qcoeff_lo], xmm1
-    movdqa      OWORD PTR[rsp + temp_qcoeff_hi], xmm5
+    movdqa      xmm2, OWORD PTR[rcx]
+    movdqa      xmm3, OWORD PTR[rcx + 16]

-    movdqa      OWORD PTR[rsi], xmm6        ;zero qcoeff
-    movdqa      OWORD PTR[rsi + 16], xmm6   ;zero qcoeff
+    ; *zbin_ptr + zbin_oq_value
+    paddw       xmm2, xmm7
+    paddw       xmm3, xmm7

-    xor         rax, rax
-    mov         rcx, -1
+    ; x - (*zbin_ptr + zbin_oq_value)
+    psubw       xmm1, xmm2
+    psubw       xmm5, xmm3
+    movdqa      OWORD PTR[rsp + abs_minus_zbin], xmm1
+    movdqa      OWORD PTR[rsp + abs_minus_zbin + 16], xmm5

-    mov         [rsp + eob], rcx
-    mov         rsi, arg(9)                 ;zbin_boost_ptr
-
-    mov         rbx, arg(4)                 ;default_zig_zag
-
-rq_zigzag_loop:
-    movsxd      rcx, DWORD PTR[rbx + rax*4] ;now we have rc
-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
-
-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
-
-    sub         edx, edi                    ;x - zbin
-    jl          rq_zigzag_1
-
-    mov         rdi, arg(2)                 ;qcoeff_ptr
-
-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
-
-    cmp         edx, 0
-    je          rq_zigzag_1
-
-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-
-    mov         rsi, arg(9)                 ;zbin_boost_ptr
-    mov         [rsp + eob], rax            ;eob = i
-
-rq_zigzag_1:
-    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
-
-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
-    lea         rax, [rax + 1]
-
-    sub         edx, edi                    ;x - zbin
-    jl          rq_zigzag_1a
-
-    mov         rdi, arg(2)                 ;qcoeff_ptr
-
-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
-
-    cmp         edx, 0
-    je          rq_zigzag_1a
-
-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-
-    mov         rsi, arg(9)                 ;zbin_boost_ptr
-    mov         [rsp + eob], rax            ;eob = i
-
-rq_zigzag_1a:
-    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
-
-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
-    lea         rax, [rax + 1]
-
-    sub         edx, edi                    ;x - zbin
-    jl          rq_zigzag_1b
-
-    mov         rdi, arg(2)                 ;qcoeff_ptr
-
-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
-
-    cmp         edx, 0
-    je          rq_zigzag_1b
-
-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-
-    mov         rsi, arg(9)                 ;zbin_boost_ptr
-    mov         [rsp + eob], rax            ;eob = i
-
-rq_zigzag_1b:
-    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
-
-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
-    lea         rax, [rax + 1]
-
-    sub         edx, edi                    ;x - zbin
-    jl          rq_zigzag_1c
-
-    mov         rdi, arg(2)                 ;qcoeff_ptr
-
-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
-
-    cmp         edx, 0
-    je          rq_zigzag_1c
-
-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-
-    mov         rsi, arg(9)                 ;zbin_boost_ptr
-    mov         [rsp + eob], rax            ;eob = i
-
-rq_zigzag_1c:
-    lea         rax, [rax + 1]
-
-    cmp         rax, 16
-    jl          rq_zigzag_loop
-
-    mov         rdi, arg(2)                 ;qcoeff_ptr
-    mov         rcx, arg(3)                 ;dequant_ptr
-    mov         rsi, arg(7)                 ;dqcoeff_ptr
+    ; add (zbin_ptr + zbin_oq_value) back
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm3

    movdqa      xmm2, OWORD PTR[rdi]
-    movdqa      xmm3, OWORD PTR[rdi + 16]
+    movdqa      xmm6, OWORD PTR[rdi + 16]
+
+    movdqa      xmm3, OWORD PTR[rsi]
+    movdqa      xmm7, OWORD PTR[rsi + 16]
+
+    ; x + round
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm6
+
+    ; y = x * quant_ptr >> 16
+    pmulhw      xmm3, xmm1
+    pmulhw      xmm7, xmm5
+
+    ; y += x
+    paddw       xmm1, xmm3
+    paddw       xmm5, xmm7
+
+    movdqa      OWORD PTR[rsp + temp_qcoeff], xmm1
+    movdqa      OWORD PTR[rsp + temp_qcoeff + 16], xmm5
+
+    pxor        xmm6, xmm6
+    ; zero qcoeff
+    movdqa      OWORD PTR[rsp + qcoeff], xmm6
+    movdqa      OWORD PTR[rsp + qcoeff + 16], xmm6
+
+    mov         [rsp + eob_tmp], DWORD -1   ; eob
+    mov         rsi, arg(9)                 ; zbin_boost_ptr
+    mov         rdi, arg(4)                 ; default_zig_zag
+    mov         rax, arg(10)                ; quant_shift_ptr
+
+%macro ZIGZAG_LOOP 2
+rq_zigzag_loop_%1:                          ; one scan step: macro parameter 1 = scan position i, parameter 2 = label suffix of the next step
+    movsxd      rdx, DWORD PTR[rdi + (%1 * 4)] ; rc = default_zig_zag[i]
+    movsx       ebx, WORD PTR [rsi]         ; zbin = *zbin_boost_ptr
+    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++
+
+    ; x - (zbin_ptr[rc] + zbin_oq_value), precomputed on the stack before the scan
+    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]
+
+    ; keep going only if that value >= zbin, otherwise jump to the next step
+    sub         ecx, ebx                    ; x - zbin
+    jl          rq_zigzag_loop_%2           ; x < zbin
+
+    movsx       ebx, WORD PTR[rsp + temp_qcoeff + rdx *2]
+
+    ; downshift by quant_shift[rc]
+    movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc]
+    sar         ebx, cl                     ; sets ZF from y; NOTE(review): flags are left untouched when cl == 0 -- confirm quant_shift is never 0
+    je          rq_zigzag_loop_%2           ; !y
+    mov         WORD PTR[rsp + qcoeff + rdx * 2], bx ; stack qcoeff[rc] = temp_qcoeff[rc] >> quant_shift[rc]
+
+    mov         rsi, arg(9)                 ; rewind zbin_boost_ptr to its start (b->zrun_zbin_boost)
+    mov         [rsp + eob_tmp], DWORD %1   ; eob = i
+%endmacro
+ZIGZAG_LOOP 0, 1
+ZIGZAG_LOOP 1, 2
+ZIGZAG_LOOP 2, 3
+ZIGZAG_LOOP 3, 4
+ZIGZAG_LOOP 4, 5
+ZIGZAG_LOOP 5, 6
+ZIGZAG_LOOP 6, 7
+ZIGZAG_LOOP 7, 8
+ZIGZAG_LOOP 8, 9
+ZIGZAG_LOOP 9, 10
+ZIGZAG_LOOP 10, 11
+ZIGZAG_LOOP 11, 12
+ZIGZAG_LOOP 12, 13
+ZIGZAG_LOOP 13, 14
+ZIGZAG_LOOP 14, 15
+ZIGZAG_LOOP 15, end
+rq_zigzag_loop_end:
+
+    mov         rbx, arg(2)                 ; qcoeff_ptr
+    mov         rcx, arg(3)                 ; dequant_ptr
+    mov         rsi, arg(7)                 ; dqcoeff_ptr
+    mov         rax, [rsp + eob_tmp]        ; eob
+
+    movdqa      xmm2, OWORD PTR[rsp + qcoeff]
+    movdqa      xmm3, OWORD PTR[rsp + qcoeff + 16]
+
+    ; y ^ sz
+    pxor        xmm2, xmm0
+    pxor        xmm3, xmm4
+    ; x = (y ^ sz) - sz
+    psubw       xmm2, xmm0
+    psubw       xmm3, xmm4

    movdqa      xmm0, OWORD PTR[rcx]
    movdqa      xmm1, OWORD PTR[rcx + 16]
@@ -232,23 +181,20 @@ rq_zigzag_1c:
    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

-    movdqa      OWORD PTR[rsi], xmm0        ;store dqcoeff
-    movdqa      OWORD PTR[rsi + 16], xmm1   ;store dqcoeff
-
-    mov         rax, [rsp + eob]
-
-    movdqa      xmm6, OWORD PTR[rsp + save_xmm6]
-    movdqa      xmm7, OWORD PTR[rsp + save_xmm7]
+    movdqa      OWORD PTR[rbx], xmm2
+    movdqa      OWORD PTR[rbx + 16], xmm3
+    movdqa      OWORD PTR[rsi], xmm0        ; store dqcoeff
+    movdqa      OWORD PTR[rsi + 16], xmm1   ; store dqcoeff

    add         rax, 1

-    add         rsp, vp8_regularquantizeb_stack_size
-    pop         rsp
-
    ; begin epilog
+    add         rsp, stack_size
+    pop         rsp
    pop         rbx
    pop         rdi
    pop         rsi
+    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
--- a/vp8/encoder/x86/quantize_x86.h
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -27,11 +27,11 @@ extern prototype_quantize_block(vp8_regular_quantize_b_sse2);

 #if !CONFIG_RUNTIME_CPU_DETECT

-/* The sse2 quantizer has not been updated to match the new exact
- * quantizer introduced in commit e04e2935
- *#undef vp8_quantize_quantb
- *#define vp8_quantize_quantb vp8_regular_quantize_b_sse2
- */
+// Currently, this function realizes a gain on x86 and a loss on x86_64
+#if ARCH_X86
+#undef vp8_quantize_quantb
+#define vp8_quantize_quantb vp8_regular_quantize_b_sse2
+#endif

 #endif

--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -586,52 +586,45 @@ sym(vp8_sad16x16_sse3):

    STACK_FRAME_CREATE_X3

-        lea             end_ptr,    [src_ptr+src_stride*8]
-
-        lea             end_ptr,    [end_ptr+src_stride*8]
-        pxor            mm7,        mm7
+        mov             end_ptr,    4
+        pxor            xmm7,        xmm7

 .vp8_sad16x16_sse3_loop:
-
-        movq            ret_var,    mm7
-        cmp             ret_var,    max_err
-        jg              .vp8_sad16x16_early_exit
-
-        movq            mm0,        QWORD PTR [src_ptr]
-        movq            mm2,        QWORD PTR [src_ptr+8]
-
-        movq            mm1,        QWORD PTR [ref_ptr]
-        movq            mm3,        QWORD PTR [ref_ptr+8]
-
-        movq            mm4,        QWORD PTR [src_ptr+src_stride]
-        movq            mm5,        QWORD PTR [ref_ptr+ref_stride]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        movq            mm1,        QWORD PTR [src_ptr+src_stride+8]
-        movq            mm3,        QWORD PTR [ref_ptr+ref_stride+8]
-
-        psadbw          mm4,        mm5
-        psadbw          mm1,        mm3
+        movdqa          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
+        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

-        paddw           mm0,        mm2
-        paddw           mm4,        mm1
+        movdqa          xmm4,       XMMWORD PTR [src_ptr]
+        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]

-        paddw           mm7,        mm0
-        paddw           mm7,        mm4
+        psadbw          xmm0,       xmm1

-        cmp             src_ptr,    end_ptr
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
+
+        psadbw          xmm2,       xmm3
+        psadbw          xmm4,       xmm5
+        psadbw          xmm6,       xmm1
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+
+        paddw           xmm7,        xmm0
+        paddw           xmm7,        xmm2
+        paddw           xmm7,        xmm4
+        paddw           xmm7,        xmm6
+
+        sub             end_ptr,     1
        jne             .vp8_sad16x16_sse3_loop

-        movq            ret_var,    mm7
-
-.vp8_sad16x16_early_exit:
-
-        mov             rax,        ret_var
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+        paddw           xmm0,       xmm7
+        movq            rax,        xmm0

    STACK_FRAME_DESTROY_X3

--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -493,8 +493,8 @@ sym(vp8_get8x8var_sse2):
 ;    unsigned char *src_ptr,
 ;    int src_pixels_per_line,
 ;    unsigned int Height,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
+;    int  xoffset,
+;    int  yoffset,
 ;    int *sum,
 ;    unsigned int *sumsquared;;
 ;
@@ -504,68 +504,80 @@ sym(vp8_filter_block2d_bil_var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM
    GET_GOT     rbx
    push rsi
    push rdi
-    sub         rsp, 16
+    push rbx
    ; end prolog

        pxor            xmm6,           xmm6                 ;
        pxor            xmm7,           xmm7                 ;
-        mov             rax,            arg(5) ;HFilter             ;

-        mov             rdx,            arg(6) ;VFilter             ;
-        mov             rsi,            arg(0) ;ref_ptr              ;
+        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
+        movdqa          xmm4,           XMMWORD PTR [rsi]

-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        lea             rcx,            [GLOBAL(vp8_bilinear_filters_sse2)]
+        movsxd          rax,            dword ptr arg(5)     ; xoffset
+
+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
+        je              filter_block2d_bil_var_sse2_sp_only
+
+        shl             rax,            5                    ; point to filter coeff with xoffset
+        lea             rax,            [rax + rcx]          ; HFilter
+
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
+        je              filter_block2d_bil_var_sse2_fp_only
+
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height

        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]               ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        movq            xmm3,           QWORD PTR [rsi+1]    ;

-        movq            xmm3,           QWORD PTR [rsi+1]        ;
        punpcklbw       xmm1,           xmm0                 ;
-
-        pmullw          xmm1,           [rax]               ;
+        pmullw          xmm1,           [rax]                ;
        punpcklbw       xmm3,           xmm0
-            ;
        pmullw          xmm3,           [rax+16]             ;
+
        paddw           xmm1,           xmm3                 ;
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift     ;
        movdqa          xmm5,           xmm1
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rsi,            r8
-%endif
-filter_block2d_bil_var_sse2_loop:

+        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
+        lea             rsi,            [rsi + rbx]
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_sse2_loop:
        movq            xmm1,           QWORD PTR [rsi]               ;
        movq            xmm3,           QWORD PTR [rsi+1]             ;

        punpcklbw       xmm1,           xmm0                 ;
        pmullw          xmm1,           [rax]               ;
-
        punpcklbw       xmm3,           xmm0                 ;
        pmullw          xmm3,           [rax+16]             ;

        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
-
+        paddw           xmm1,           xmm4               ;
        psraw           xmm1,           xmm_filter_shift    ;
+
        movdqa          xmm3,           xmm5                 ;
-
        movdqa          xmm5,           xmm1                 ;
-        pmullw          xmm3,           [rdx]               ;

+        pmullw          xmm3,           [rdx]               ;
        pmullw          xmm1,           [rdx+16]             ;
        paddw           xmm1,           xmm3                 ;
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
+        paddw           xmm1,           xmm4                 ;
        psraw           xmm1,           xmm_filter_shift    ;

        movq            xmm3,           QWORD PTR [rdi]               ;
@@ -577,20 +589,103 @@ filter_block2d_bil_var_sse2_loop:
        pmaddwd         xmm1,           xmm1                 ;
        paddd           xmm7,           xmm1                 ;

+        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
 %if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
 %else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
-        add             rsi,            r8
-        add             rdi,            r9
+        lea             rdi,            [rdi + r9]
 %endif

        sub             rcx,            1                   ;
        jnz             filter_block2d_bil_var_sse2_loop       ;

+        jmp             filter_block2d_bil_variance

+filter_block2d_bil_var_sse2_sp_only:
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        lea             rsi,            [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+        movq            xmm3,           QWORD PTR [rsi]             ;
+        punpcklbw       xmm3,           xmm0                 ;
+        movdqa          xmm5,           xmm3
+
+        pmullw          xmm1,           [rdx]               ;
+        pmullw          xmm3,           [rdx+16]             ;
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        movdqa          xmm1,           xmm5                 ;
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_sp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_fp_only:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+        movq            xmm1,           QWORD PTR [rsi]       ;
+        movq            xmm3,           QWORD PTR [rsi+1]     ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4  ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]     ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+        lea             rsi,            [rsi + rdx]
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_fp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
        movdq2q         mm6,            xmm6                ;
        movdq2q         mm7,            xmm7                ;

@@ -627,12 +722,12 @@ filter_block2d_bil_var_sse2_loop:
        movd            [rsi],          mm2    ; xsum
        movd            [rdi],          mm4    ; xxsum

-
    ; begin epilog
-    add rsp, 16
+    pop rbx
    pop rdi
    pop rsi
    RESTORE_GOT
+    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
@@ -974,3 +1069,13 @@ SECTION_RODATA
 align 16
 xmm_bi_rd:
    times 8 dw 64
+align 16
+vp8_bilinear_filters_sse2:          ; 8 rows of 16 words: first tap repeated 8x, then second tap repeated 8x; each tap pair sums to 128
+    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0                  ; row 0: taps 128, 0
+    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16                  ; row 1: taps 112, 16
+    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32                  ; row 2: taps 96, 32
+    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48                  ; row 3: taps 80, 48
+    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64                  ; row 4: taps 64, 64
+    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80                  ; row 5: taps 48, 80
+    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96                  ; row 6: taps 32, 96
+    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112          ; row 7: taps 16, 112
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -76,8 +76,8 @@ void vp8_filter_block2d_bil_var_sse2
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
-    const short *HFilter,
-    const short *VFilter,
+    int  xoffset,
+    int  yoffset,
    int *sum,
    unsigned int *sumsquared
 );
@@ -222,21 +222,6 @@ unsigned int vp8_variance8x16_wmt

 }

-///////////////////////////////////////////////////////////////////////////
-// the mmx function that does the bilinear filtering and var calculation //
-// int one pass                                                          //
-///////////////////////////////////////////////////////////////////////////
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
-{
-    { 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0 },
-    { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
-    {  96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
-    {  80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
-    {  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    {  48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
-    {  32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
-    {  16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
-};
 unsigned int vp8_sub_pixel_variance4x4_wmt
 (
    const unsigned char  *src_ptr,
@@ -272,15 +257,38 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
    unsigned int *sse
 )
 {
-
    int xsum;
    unsigned int xxsum;
-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum, &xxsum
-    );
+
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum, &xxsum);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum, &xxsum);
+    }

    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 6));
@@ -344,7 +352,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
-            vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+            xoffset, yoffset,
            &xsum0, &xxsum0
        );

@@ -352,7 +360,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
        vp8_filter_block2d_bil_var_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
-            vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+            xoffset, yoffset,
            &xsum1, &xxsum1
        );
    }
@@ -392,21 +400,56 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);

-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum0, &xxsum0
-    );
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            &xsum1, &xxsum1);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);

+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            &xsum1, &xxsum1);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);

-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum1, &xxsum1
-    );
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            &xsum1, &xxsum1);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
+
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum1, &xxsum1);
+    }

    xsum0 += xsum1;
    xxsum0 += xxsum1;
@@ -428,12 +471,36 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
 {
    int xsum;
    unsigned int xxsum;
-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum, &xxsum
-    );
+
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum, &xxsum);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            xoffset, yoffset,
+            &xsum, &xxsum);
+    }

    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 7));
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -108,37 +108,26 @@ void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)


 int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
-                               short *qcoeff_ptr,short *dequant_ptr,
-                               const int *default_zig_zag, short *round_ptr,
-                               short *quant_ptr, short *dqcoeff_ptr,
-                               unsigned short zbin_oq_value,
-                               short *zbin_boost_ptr);
+                                     short *qcoeff_ptr,short *dequant_ptr,
+                                     const int *default_zig_zag, short *round_ptr,
+                                     short *quant_ptr, short *dqcoeff_ptr,
+                                     unsigned short zbin_oq_value,
+                                     short *zbin_boost_ptr,
+                                     short *quant_shift_ptr);

 void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d)
 {
-    short *zbin_boost_ptr = b->zrun_zbin_boost;
-    short *coeff_ptr      = b->coeff;
-    short *zbin_ptr       = b->zbin;
-    short *round_ptr      = b->round;
-    short *quant_ptr      = b->quant;
-    short *qcoeff_ptr     = d->qcoeff;
-    short *dqcoeff_ptr    = d->dqcoeff;
-    short *dequant_ptr    = d->dequant;
-    short zbin_oq_value   = b->zbin_extra;
-
-    d->eob = vp8_regular_quantize_b_impl_sse2(
-        coeff_ptr,
-        zbin_ptr,
-        qcoeff_ptr,
-        dequant_ptr,
-        vp8_default_zig_zag1d,
-
-        round_ptr,
-        quant_ptr,
-        dqcoeff_ptr,
-        zbin_oq_value,
-        zbin_boost_ptr
-        );
+    d->eob = vp8_regular_quantize_b_impl_sse2(b->coeff,              /* coeff_ptr; the impl's return value is stored in d->eob */
+                                              b->zbin,               /* zbin_ptr */
+                                              d->qcoeff,             /* qcoeff_ptr */
+                                              d->dequant,            /* dequant_ptr */
+                                              vp8_default_zig_zag1d, /* default_zig_zag */
+                                              b->round,              /* round_ptr */
+                                              b->quant,              /* quant_ptr */
+                                              d->dqcoeff,            /* dqcoeff_ptr */
+                                              b->zbin_extra,         /* zbin_oq_value */
+                                              b->zrun_zbin_boost,    /* zbin_boost_ptr */
+                                              b->quant_shift);       /* quant_shift_ptr (parameter added by this change) */
 }

 int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
@@ -307,7 +296,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_sse2;
        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_sse2;

-        /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
+#if ARCH_X86
+        cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b_sse2;
+#endif
        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;

        cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_sse2;
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -35,7 +35,7 @@ VP8_COMMON_SRCS-yes += common/entropy.c
 VP8_COMMON_SRCS-yes += common/entropymode.c
 VP8_COMMON_SRCS-yes += common/entropymv.c
 VP8_COMMON_SRCS-yes += common/extend.c
-VP8_COMMON_SRCS-yes += common/filter_c.c
+VP8_COMMON_SRCS-yes += common/filter.c
 VP8_COMMON_SRCS-yes += common/findnearmv.c
 VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
 VP8_COMMON_SRCS-yes += common/idctllm.c
@@ -111,14 +111,15 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
 endif

+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/asm_com_offsets.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c

 # common (c)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.h
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/vpx_asm_offsets.c

 # common (armv6)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/bilinearfilter_v6$(ASM)
@@ -161,16 +162,3 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon16x16mb_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/save_neon_reg$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon_neon.c
-
-
-#
-# Rule to extract assembly constants from C sources
-#
-ifeq ($(ARCH_ARM),yes)
-vpx_asm_offsets.asm: obj_int_extract
-vpx_asm_offsets.asm: $(VP8_PREFIX)common/arm/vpx_asm_offsets.c.o
-	./obj_int_extract rvds $< $(ADS2GAS) > $@
-OBJS-yes += $(VP8_PREFIX)common/arm/vpx_asm_offsets.c.o
-CLEAN-OBJS += vpx_asm_offsets.asm
-$(filter %$(ASM).o,$(OBJS-yes)): vpx_asm_offsets.asm
-endif
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -142,8 +142,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
    RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
    RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
    RANGE_CHECK_HI(cfg, g_profile,          3);
-    RANGE_CHECK_HI(cfg, rc_min_quantizer,   63);
    RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
+    RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
    RANGE_CHECK_HI(cfg, g_threads,          64);
 #if !(CONFIG_REALTIME_ONLY)
    RANGE_CHECK_HI(cfg, g_lag_in_frames,    25);
@@ -912,8 +912,8 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
        ctx->preview_img.x_chroma_shift = 1;
        ctx->preview_img.y_chroma_shift = 1;

-        ctx->preview_img.d_w = ctx->cfg.g_w;
-        ctx->preview_img.d_h = ctx->cfg.g_h;
+        ctx->preview_img.d_w = sd.y_width;
+        ctx->preview_img.d_h = sd.y_height;
        ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
        ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
        ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -42,7 +42,7 @@ VP8_CX_SRCS-yes += encoder/encodeframe.c
 VP8_CX_SRCS-yes += encoder/encodeintra.c
 VP8_CX_SRCS-yes += encoder/encodemb.c
 VP8_CX_SRCS-yes += encoder/encodemv.c
-VP8_CX_SRCS-yes += encoder/ethreading.c
+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
 VP8_CX_SRCS-yes += encoder/firstpass.c
 VP8_CX_SRCS-yes += encoder/generic/csystemdependent.c
 VP8_CX_SRCS-yes += encoder/block.h
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -14,10 +14,13 @@
 #File list for arm
 # encoder
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/arm_csystemdependent.c
+VP8_CX_SRCS-$(ARCH_ARM)  += encoder/asm_enc_offsets.c

 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/encodemb_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/variance_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/variance_arm.h
 VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c

 VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE)  += encoder/boolhuff.c
@@ -31,6 +34,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_partitions_ar

 #File list for armv6
 # encoder
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)

 #File list for neon
@@ -49,17 +53,3 @@ VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(A
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
-
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/vpx_vp8_enc_asm_offsets.c
-
-#
-# Rule to extract assembly constants from C sources
-#
-ifeq ($(ARCH_ARM),yes)
-vpx_vp8_enc_asm_offsets.asm: obj_int_extract
-vpx_vp8_enc_asm_offsets.asm: $(VP8_PREFIX)encoder/arm/vpx_vp8_enc_asm_offsets.c.o
-	./obj_int_extract rvds $< $(ADS2GAS) > $@
-OBJS-yes += $(VP8_PREFIX)encoder/arm/vpx_vp8_enc_asm_offsets.c.o
-CLEAN-OBJS += vpx_vp8_enc_asm_offsets.asm
-$(filter %$(ASM).o,$(OBJS-yes)): vpx_vp8_enc_asm_offsets.asm
-endif
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -65,7 +65,7 @@ VP8_DX_SRCS-yes += decoder/detokenize.h
 VP8_DX_SRCS-yes += decoder/onyxd_int.h
 VP8_DX_SRCS-yes += decoder/treereader.h
 VP8_DX_SRCS-yes += decoder/onyxd_if.c
-VP8_DX_SRCS-yes += decoder/threading.c
+VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
 VP8_DX_SRCS-yes += decoder/idct_blk.c
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -12,9 +12,9 @@
 #VP8_DX_SRCS list is modified according to different platforms.

 VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/arm_dsystemdependent.c
+VP8_DX_SRCS-$(ARCH_ARM)  += decoder/asm_dec_offsets.c

 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/dequantize_arm.c
-VP8_DX_SRCS-$(CONFIG_ARM_ASM_DETOK)  += decoder/arm/detokenize$(ASM)

 #File list for armv6
 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -321,7 +321,7 @@ typedef struct vpx_codec_priv_cb_pair
    {
        vpx_codec_put_frame_cb_fn_t    put_frame;
        vpx_codec_put_slice_cb_fn_t    put_slice;
-    };
+    } u;
    void                            *user_priv;
 } vpx_codec_priv_cb_pair_t;

--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c
@@ -160,7 +160,7 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t             *ctx
        res = VPX_CODEC_ERROR;
    else
    {
-        ctx->priv->dec.put_frame_cb.put_frame = cb;
+        ctx->priv->dec.put_frame_cb.u.put_frame = cb;
        ctx->priv->dec.put_frame_cb.user_priv = user_priv;
        res = VPX_CODEC_OK;
    }
@@ -182,7 +182,7 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t             *ctx
        res = VPX_CODEC_ERROR;
    else
    {
-        ctx->priv->dec.put_slice_cb.put_slice = cb;
+        ctx->priv->dec.put_slice_cb.u.put_slice = cb;
        ctx->priv->dec.put_slice_cb.user_priv = user_priv;
        res = VPX_CODEC_OK;
    }
--- a/Show More
+++ b/Show More