Use memcpy for save/restore_predictor

The save_predictor and restore_predictor functions perform a 1D backup of the 2D predictor block. Use memcpy to get a faster copy operation than 4 individual load/stores.

Change-Id: Ia609ed71fbff1ade6fa677186efce9ee29167fd6
Merge "Improve vp8_sad16x16_sse3 function"
2011-02-15 10:22:21 -05:00 · 2011-02-14 14:09:25 -08:00 · 2011-02-14 13:58:12 -08:00 · 2011-02-14 16:34:33 -05:00 · 2011-02-14 16:23:49 -05:00 · 2011-02-14 11:29:22 -08:00
122 changed files with 3331 additions and 3520 deletions
--- a/build/make/armlink_adapter.sh
+++ b/build/make/armlink_adapter.sh
@@ -17,15 +17,17 @@ for i; do
        on_of=1
    elif [ "$i" == "-v" ]; then
        verbose=1
+    elif [ "$i" == "-g" ]; then
+        args="${args} --debug"
    elif [ "$on_of" == "1" ]; then
        outfile=$i
-    on_of=0
+        on_of=0
    elif [ -f "$i" ]; then
        infiles="$infiles $i"
    elif [ "${i:0:2}" == "-l" ]; then
        libs="$libs ${i#-l}"
    elif [ "${i:0:2}" == "-L" ]; then
-    libpaths="${libpaths} ${i#-L}"
+        libpaths="${libpaths} ${i#-L}"
    else
        args="${args} ${i}"
    fi
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -78,6 +78,7 @@ Build options:
  --log=yes|no|FILE           file configure log is written to [config.err]
  --target=TARGET             target platform tuple [generic-gnu]
  --cpu=CPU                   optimize for a specific cpu rather than a family
+  --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
  ${toggle_extra_warnings}    emit harmless warnings (always non-fatal)
  ${toggle_werror}            treat warnings as errors, if possible
                              (not available with all compilers)
@@ -442,6 +443,9 @@ process_common_cmdline() {
        ;;
        --cpu=*) tune_cpu="$optval"
        ;;
+        --extra-cflags=*)
+        extra_cflags="${optval}"
+        ;;
        --enable-?*|--disable-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
        echo "${CMDLINE_SELECT} ${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null || die_unknown $opt
@@ -660,12 +664,12 @@ process_common_toolchain() {
            elif enabled armv7
            then
                check_add_cflags -march=armv7-a -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-ftree-vectorize
-        check_add_asflags -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-march=armv7-a
+                check_add_asflags -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp  #-march=armv7-a
            else
                check_add_cflags -march=${tgt_isa}
                check_add_asflags -march=${tgt_isa}
            fi
-
+            enabled debug && add_asflags -g
            asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
            ;;
        rvct)
@@ -690,16 +694,24 @@ process_common_toolchain() {
            arch_int=${tgt_isa##armv}
            arch_int=${arch_int%%te}
            check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
+            enabled debug && add_asflags -g
+            add_cflags --gnu
+            add_cflags --enum_is_int
+            add_cflags --wchar32
        ;;
        esac

        case ${tgt_os} in
+        none*)
+            disable multithread
+            disable os_support
+            ;;
        darwin*)
            SDK_PATH=/Developer/Platforms/iPhoneOS.platform/Developer
            TOOLCHAIN_PATH=${SDK_PATH}/usr/bin
            CC=${TOOLCHAIN_PATH}/gcc
            AR=${TOOLCHAIN_PATH}/ar
-            LD=${TOOLCHAIN_PATH}/arm-apple-darwin9-gcc-4.2.1
+            LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-gcc-4.2.1
            AS=${TOOLCHAIN_PATH}/as
            STRIP=${TOOLCHAIN_PATH}/strip
            NM=${TOOLCHAIN_PATH}/nm
@@ -713,14 +725,14 @@ process_common_toolchain() {
            add_cflags -arch ${tgt_isa}
            add_ldflags -arch_only ${tgt_isa}

-            add_cflags  "-isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS3.1.sdk"
+            add_cflags  "-isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.sdk"

            # This should be overridable
-            alt_libc=${SDK_PATH}/SDKs/iPhoneOS3.1.sdk
+            alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.2.sdk

            # Add the paths for the alternate libc
 #            for d in usr/include usr/include/gcc/darwin/4.0/; do
-            for d in usr/include usr/include/gcc/darwin/4.0/ usr/lib/gcc/arm-apple-darwin9/4.0.1/include/; do
+            for d in usr/include usr/include/gcc/darwin/4.0/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do
                try_dir="${alt_libc}/${d}"
                [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
            done
@@ -742,13 +754,9 @@ process_common_toolchain() {
                    || die "Must supply --libc when targetting *-linux-rvct"

                # Set up compiler
-                add_cflags --gnu
-                add_cflags --enum_is_int
                add_cflags --library_interface=aeabi_glibc
                add_cflags --no_hide_all
-                add_cflags --wchar32
                add_cflags --dwarf2
-                add_cflags --gnu

                # Set up linker
                add_ldflags --sysv --no_startup --no_ref_cpp_init
@@ -972,6 +980,12 @@ EOF
        add_cflags -D_LARGEFILE_SOURCE
        add_cflags -D_FILE_OFFSET_BITS=64
    fi
+
+    # append any user defined extra cflags
+    if [ -n "${extra_cflags}" ] ; then
+        check_add_cflags ${extra_cflags} || \
+        die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
+    fi
 }

 process_toolchain() {
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -590,7 +590,7 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
    //log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
    //log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);

-    fp = fopen("vpx_asm_offsets.asm", "w");
+    fp = fopen("assembly_offsets.asm", "w");

    if (fp == NULL)
    {
--- a/configure
+++ b/configure
@@ -40,7 +40,6 @@ Advanced options:
  ${toggle_runtime_cpu_detect}    runtime cpu detection
  ${toggle_shared}                shared library support
  ${toggle_small}                 favor smaller size over speed
-  ${toggle_arm_asm_detok}         assembly version of the detokenizer (ARM platforms only)
  ${toggle_postproc_visualizer}   macro block / block level visualizers

 Codecs:
@@ -79,11 +78,13 @@ EOF
 # alphabetically by architecture, generic-gnu last.
 all_platforms="${all_platforms} armv5te-linux-rvct"
 all_platforms="${all_platforms} armv5te-linux-gcc"
+all_platforms="${all_platforms} armv5te-none-rvct"
 all_platforms="${all_platforms} armv5te-symbian-gcc"
 all_platforms="${all_platforms} armv5te-wince-vs8"
 all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
+all_platforms="${all_platforms} armv6-none-rvct"
 all_platforms="${all_platforms} armv6-symbian-gcc"
 all_platforms="${all_platforms} armv6-wince-vs8"
 all_platforms="${all_platforms} iwmmxt-linux-rvct"
@@ -95,6 +96,7 @@ all_platforms="${all_platforms} iwmmxt2-wince-vs8"
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
+all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} ppc32-darwin8-gcc"
 all_platforms="${all_platforms} ppc32-darwin9-gcc"
@@ -159,6 +161,7 @@ enable fast_unaligned #allow unaligned accesses, if supported by hw
 enable md5
 enable spatial_resampling
 enable multithread
+enable os_support

 [ -d ${source_path}/../include ] && enable alt_tree_layout
 for d in vp8; do
@@ -251,8 +254,8 @@ CONFIG_LIST="
    realtime_only
    shared
    small
-    arm_asm_detok
    postproc_visualizer
+    os_support
 "
 CMDLINE_SELECT="
    extra_warnings
@@ -291,7 +294,6 @@ CMDLINE_SELECT="
    realtime_only
    shared
    small
-    arm_asm_detok
    postproc_visualizer
 "

@@ -300,7 +302,7 @@ process_cmdline() {
        optval="${opt#*=}"
        case "$opt" in
        --disable-codecs) for c in ${CODECS}; do disable $c; done ;;
-        *) process_common_cmdline $opt
+        *) process_common_cmdline "$opt"
        ;;
        esac
    done
--- a/examples.mk
+++ b/examples.mk
@@ -93,8 +93,16 @@ vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame


 # Handle extra library flags depending on codec configuration
-CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m

+# We should not link to math library (libm) on RVCT
+# when building for bare-metal targets
+ifeq ($(CONFIG_OS_SUPPORT), yes)
+CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
+else
+    ifeq ($(CONFIG_GCC), yes)
+    CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
+    endif
+endif
 #
 # End of specified files. The rest of the build rules should happen
 # automagically from here.
--- a/libs.mk
+++ b/libs.mk
@@ -230,10 +230,39 @@ endif
 #
 # Add assembler dependencies for configuration and offsets
 #
-#$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(BUILD_PFX)vpx_asm_offsets.asm
 $(filter %.s.o,$(OBJS-yes)):   $(BUILD_PFX)vpx_config.asm
 $(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm

+#
+# Calculate platform- and compiler-specific offsets for hand coded assembly
+#
+ifeq ($(ARCH_ARM), yes)
+  asm_com_offsets.asm: obj_int_extract
+  asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+  OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
+  CLEAN-OBJS += asm_com_offsets.asm
+  $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
+
+  ifeq ($(CONFIG_VP8_ENCODER), yes)
+    asm_enc_offsets.asm: obj_int_extract
+    asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+    OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+    CLEAN-OBJS += asm_enc_offsets.asm
+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
+  endif
+
+  ifeq ($(CONFIG_VP8_DECODER), yes)
+    asm_dec_offsets.asm: obj_int_extract
+    asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+    OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+    CLEAN-OBJS += asm_dec_offsets.asm
+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
+  endif
+endif
+
 $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
 CLEAN-OBJS += $(BUILD_PFX)vpx_version.h

--- a/md5_utils.c
+++ b/md5_utils.c
@@ -20,8 +20,6 @@
 * Still in the public domain.
 */

-#include <sys/types.h>    /* for stupid systems */
-
 #include <string.h>   /* for memcpy() */

 #include "md5_utils.h"
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -15,19 +15,19 @@
    AREA    |.text|, CODE, READONLY  ; name this block of code

 ;-------------------------------------
-; r0    unsigned char *src_ptr,
-; r1    unsigned short *output_ptr,
-; r2    unsigned int src_pixels_per_line,
-; r3    unsigned int output_height,
-; stack    unsigned int output_width,
-; stack    const short *vp8_filter
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
 ;-------------------------------------
 ; The output is transposed stroed in output array to make it easy for second pass filtering.
 |vp8_filter_block2d_bil_first_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}

    ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width

    mov     r12, r3                         ; outer-loop counter
    sub     r2, r2, r4                      ; src increment for height loop
@@ -38,10 +38,10 @@

    ldr     r5, [r11]                       ; load up filter coefficients

-    mov     r3, r3, lsl #1                  ; output_height*2
+    mov     r3, r3, lsl #1                  ; height*2
    add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)

-    mov     r11, r1                         ; save output_ptr for each row
+    mov     r11, r1                         ; save dst_ptr for each row

    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
    beq     bil_null_1st_filter
@@ -140,17 +140,17 @@

 ;---------------------------------
 ; r0    unsigned short *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    int output_pitch,
-; r3    unsigned int  output_height,
-; stack unsigned int  output_width,
-; stack const short *vp8_filter
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
 ;---------------------------------
 |vp8_filter_block2d_bil_second_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}

    ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width

    ldr     r5, [r11]                       ; load up filter coefficients
    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -243,8 +243,6 @@ skip_secondpass_hloop
    ENDP

 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
 ;One word each is reserved. Label filter_coeff can be used to access the data.
 ;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
 _filter8_coeff_
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -10,128 +10,29 @@


 #include <math.h>
+#include "filter.h"
 #include "subpixel.h"
-
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-static const short bilinear_filters[8][2] =
-{
-    { 128,   0 },
-    { 112,  16 },
-    {  96,  32 },
-    {  80,  48 },
-    {  64,  64 },
-    {  48,  80 },
-    {  32,  96 },
-    {  16, 112 }
-};
-
-
-extern void vp8_filter_block2d_bil_first_pass_armv6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-);
-
-extern void vp8_filter_block2d_bil_second_pass_armv6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-);
-
-#if 0
-void vp8_filter_block2d_bil_first_pass_6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int i, j;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply bilinear filter */
-            output_ptr[j] = ( ( (int)src_ptr[0]          * vp8_filter[0]) +
-                               ((int)src_ptr[1] * vp8_filter[1]) +
-                                (VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-void vp8_filter_block2d_bil_second_pass_6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int  i,j;
-    int  Temp;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply filter */
-            Temp =  ((int)src_ptr[0]         * vp8_filter[0]) +
-                    ((int)src_ptr[output_width] * vp8_filter[1]) +
-                    (VP8_FILTER_WEIGHT/2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
-            src_ptr++;
-        }
-
-        /* Next row... */
-        /*src_ptr    += src_pixels_per_line - output_width;*/
-        output_ptr += output_pitch;
-    }
-}
-#endif
+#include "arm/bilinearfilter_arm.h"

 void vp8_filter_block2d_bil_armv6
 (
    unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
    unsigned int   dst_pitch,
-    const short      *HFilter,
-    const short      *VFilter,
+    const short   *HFilter,
+    const short   *VFilter,
    int            Width,
    int            Height
 )
 {
-
-    unsigned short FData[36*16]; /* Temp data bufffer used in filtering */
+    unsigned short FData[36*16]; /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
-    /* pixel_step = 1; */
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

    /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }


@@ -148,8 +49,8 @@ void vp8_bilinear_predict4x4_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
 }
@@ -167,8 +68,8 @@ void vp8_bilinear_predict8x8_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
 }
@@ -186,8 +87,8 @@ void vp8_bilinear_predict8x4_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
 }
@@ -205,8 +106,8 @@ void vp8_bilinear_predict16x16_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
--- a/vp8/common/arm/bilinearfilter_arm.h
+++ b/vp8/common/arm/bilinearfilter_arm.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+    const unsigned char  *src_ptr,
+    unsigned short       *dst_ptr,
+    unsigned int          src_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+    const unsigned short *src_ptr,
+    unsigned char        *dst_ptr,
+    int                   dst_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short         *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -11,26 +11,10 @@

 #include "vpx_ports/config.h"
 #include <math.h>
+#include "filter.h"
 #include "subpixel.h"
 #include "vpx_ports/mem.h"

-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
-{
-    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
-    { 0, -6,  123,   12,  -1,  0 },
-    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
-    { 0, -9,   93,   50,  -6,  0 },
-    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
-    { 0, -6,   50,   93,  -9,  0 },
-    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
-    { 0, -1,   12,  123,  -6,  0 },
-};
-
-
 extern void vp8_filter_block2d_first_pass_armv6
 (
    unsigned char *src_ptr,
@@ -93,11 +77,11 @@ void vp8_sixtap_predict_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */


-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* Vfilter is null. First pass only */
    if (xoffset && !yoffset)
@@ -129,47 +113,6 @@ void vp8_sixtap_predict_armv6
    }
 }

-#if 0
-void vp8_sixtap_predict8x4_armv6
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const short  *HFilter;
-    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
-
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
-
-
-    /*if (xoffset && !yoffset)
-    {
-        vp8_filter_block2d_first_pass_only_armv6 (  src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
-    }*/
-    /* Hfilter is null. Second pass only */
-    /*else if (!xoffset && yoffset)
-    {
-        vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
-    }
-    else
-    {
-        if (yoffset & 0x1)
-            vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
-        else*/
-
-        vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
-
-        vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
-    /*}*/
-}
-#endif
-
 void vp8_sixtap_predict8x8_armv6
 (
    unsigned char  *src_ptr,
@@ -182,10 +125,10 @@ void vp8_sixtap_predict8x8_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    if (xoffset && !yoffset)
    {
@@ -224,10 +167,10 @@ void vp8_sixtap_predict16x16_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    if (xoffset && !yoffset)
    {
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -41,13 +41,13 @@ void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -57,7 +57,7 @@ void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Vertical MB Filtering */
@@ -65,13 +65,13 @@ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -81,7 +81,7 @@ void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Horizontal B Filtering */
@@ -94,10 +94,10 @@ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -122,10 +122,10 @@ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
    vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -148,10 +148,10 @@ void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
 }

 void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -161,7 +161,7 @@ void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsig
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Vertical MB Filtering */
@@ -169,10 +169,10 @@ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
 }

 void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -182,7 +182,7 @@ void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsig
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Horizontal B Filtering */
@@ -195,7 +195,7 @@ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
+        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
 }

 void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -220,7 +220,7 @@ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
+        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
 }

 void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
--- a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
@@ -350,10 +350,7 @@ filt_blk2d_spo16x16_loop_neon
    ENDP

 ;-----------------
-    AREA    bifilters16_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _bifilter16_coeff_
    DCD     bifilter16_coeff
 bifilter16_coeff
--- a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
@@ -123,10 +123,7 @@ skip_secondpass_filter
    ENDP

 ;-----------------
-    AREA    bilinearfilters4_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _bifilter4_coeff_
    DCD     bifilter4_coeff
 bifilter4_coeff
--- a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -128,10 +128,7 @@ skip_secondpass_filter
    ENDP

 ;-----------------
-    AREA    bifilters8x4_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _bifilter8x4_coeff_
    DCD     bifilter8x4_coeff
 bifilter8x4_coeff
--- a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -176,10 +176,7 @@ skip_secondpass_filter
    ENDP

 ;-----------------
-    AREA    bifilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _bifilter8_coeff_
    DCD     bifilter8_coeff
 bifilter8_coeff
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -397,7 +397,8 @@
    bx          lr
    ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|

-    AREA    loopfilter_dat, DATA, READONLY
+;-----------------
+
 _lf_coeff_
    DCD     lf_coeff
 lf_coeff
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -104,10 +104,7 @@
    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|

 ;-----------------
-    AREA    hloopfiltery_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _lfhy_coeff_
    DCD     lfhy_coeff
 lfhy_coeff
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -145,10 +145,7 @@
    ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|

 ;-----------------
-    AREA    vloopfiltery_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _vlfy_coeff_
    DCD     vlfy_coeff
 vlfy_coeff
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -505,7 +505,8 @@
    bx          lr
    ENDP        ; |vp8_mbloop_filter_neon|

-    AREA    mbloopfilter_dat, DATA, READONLY
+;-----------------
+
 _mblf_coeff_
    DCD     mblf_coeff
 mblf_coeff
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -113,10 +113,7 @@
    ENDP

 ;-----------------
-    AREA    idct4x4_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _idct_coeff_
    DCD     idct_coeff
 idct_coeff
--- a/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -476,10 +476,7 @@ secondpass_only_inner_loop_neon
    ENDP

 ;-----------------
-    AREA    subpelfilters16_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _filter16_coeff_
    DCD     filter16_coeff
 filter16_coeff
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -407,10 +407,7 @@ secondpass_filter4x4_only
    ENDP

 ;-----------------
-    AREA    subpelfilters4_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _filter4_coeff_
    DCD     filter4_coeff
 filter4_coeff
--- a/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
@@ -458,10 +458,7 @@ secondpass_filter8x4_only
    ENDP

 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _filter8_coeff_
    DCD     filter8_coeff
 filter8_coeff
--- a/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
@@ -509,10 +509,7 @@ filt_blk2d_spo8x8_loop_neon
    ENDP

 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _filter8_coeff_
    DCD     filter8_coeff
 filter8_coeff
--- a/vp8/common/asm_com_offsets.c
+++ b/vp8/common/asm_com_offsets.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <stddef.h>
+
+#include "vpx_scale/yv12config.h"
+
+#define ct_assert(name,cond) \
+    static void assert_##name(void) UNUSED;\
+    static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
+
+#define DEFINE(sym, val) int sym = val;
+
+/*
+#define BLANK() asm volatile("\n->" : : )
+*/
+
+/*
+ * int main(void)
+ * {
+ */
+
+/* vpx_scale */
+DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
+/*
+ * return 0;
+ * }
+ */
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -282,6 +282,8 @@ typedef struct

    void *current_bc;

+    int corrupted;
+
 #if CONFIG_RUNTIME_CPU_DETECT
    struct VP8_COMMON_RTCD  *rtcd;
 #endif
--- a/vp8/common/filter_c.c
+++ b/vp8/common/filter_c.c
@@ -10,13 +10,10 @@


 #include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"

-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-
-static const int bilinear_filters[8][2] =
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
 {
    { 128,   0 },
    { 112,  16 },
@@ -28,8 +25,7 @@ static const int bilinear_filters[8][2] =
    {  16, 112 }
 };

-
-static const short sub_pel_filters[8][6] =
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
 {

    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
@@ -40,9 +36,6 @@ static const short sub_pel_filters[8][6] =
    { 0, -6,   50,   93,  -9,  0 },
    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
    { 0, -1,   12,  123,  -6,  0 },
-
-
-
 };

 void vp8_filter_block2d_first_pass
@@ -146,7 +139,7 @@ void vp8_filter_block2d
    const short  *VFilter
 )
 {
-    int FData[9*4]; /* Temp data bufffer used in filtering */
+    int FData[9*4]; /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
@@ -195,8 +188,8 @@ void vp8_sixtap_predict_c
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
 }
@@ -212,10 +205,10 @@ void vp8_sixtap_predict8x8_c
 {
    const short  *HFilter;
    const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
+    int FData[13*16];   /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
@@ -238,10 +231,10 @@ void vp8_sixtap_predict8x4_c
 {
    const short  *HFilter;
    const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
+    int FData[13*16];   /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
@@ -264,11 +257,11 @@ void vp8_sixtap_predict16x16_c
 {
    const short  *HFilter;
    const short  *VFilter;
-    int FData[21*24];   /* Temp data bufffer used in filtering */
+    int FData[21*24];   /* Temp data buffer used in filtering */


-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
@@ -283,57 +276,50 @@ void vp8_sixtap_predict16x16_c
 *
 *  ROUTINE       : filter_block2d_bil_first_pass
 *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp8_filter : Array of 2 bi-linear filter taps.
 *
- *  OUTPUTS       : INT32 *output_ptr        : Pointer to filtered block.
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement first-pass of 2-D separable filter.
 *
 *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
 *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
 *
 ****************************************************************************/
 void vp8_filter_block2d_bil_first_pass
 (
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const int *vp8_filter
+    unsigned char  *src_ptr,
+    unsigned short *dst_ptr,
+    unsigned int    src_stride,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 )
 {
    unsigned int i, j;

-    for (i = 0; i < output_height; i++)
+    for (i = 0; i < height; i++)
    {
-        for (j = 0; j < output_width; j++)
+        for (j = 0; j < width; j++)
        {
            /* Apply bilinear filter */
-            output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +
-                             ((int)src_ptr[pixel_step] * vp8_filter[1]) +
-                             (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+            dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+                          ((int)src_ptr[1] * vp8_filter[1]) +
+                          (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
            src_ptr++;
        }

        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
+        src_ptr += src_stride - width;
+        dst_ptr += width;
    }
 }

@@ -341,60 +327,51 @@ void vp8_filter_block2d_bil_first_pass
 *
 *  ROUTINE       : filter_block2d_bil_second_pass
 *
- *  INPUTS        : INT32  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *  INPUTS        : UINT16 *src_ptr    : Pointer to source block.
+ *                  INT32   dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp8_filter : Array of 2 bi-linear filter taps.
 *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : UINT8  *dst_ptr    : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement second-pass of 2-D separable filter.
 *
 *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
 *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
 *
 ****************************************************************************/
 void vp8_filter_block2d_bil_second_pass
 (
    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  src_pixels_per_line,
-    unsigned int  pixel_step,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const int *vp8_filter
+    unsigned char  *dst_ptr,
+    int             dst_pitch,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 )
 {
    unsigned int  i, j;
    int  Temp;

-    for (i = 0; i < output_height; i++)
+    for (i = 0; i < height; i++)
    {
-        for (j = 0; j < output_width; j++)
+        for (j = 0; j < width; j++)
        {
            /* Apply filter */
-            Temp = ((int)src_ptr[0]         * vp8_filter[0]) +
-                   ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+            Temp = ((int)src_ptr[0]     * vp8_filter[0]) +
+                   ((int)src_ptr[width] * vp8_filter[1]) +
                   (VP8_FILTER_WEIGHT / 2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+            dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
            src_ptr++;
        }

        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_pitch;
+        dst_ptr += dst_pitch;
    }
 }

@@ -404,11 +381,14 @@ void vp8_filter_block2d_bil_second_pass
 *  ROUTINE       : filter_block2d_bil
 *
 *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  INT32  *HFilter         : Array of 2 horizontal filter taps.
- *                  INT32  *VFilter         : Array of 2 vertical filter taps.
+ *                  UINT32  src_pitch        : Stride of source block.
+ *                  UINT32  dst_pitch        : Stride of destination block.
+ *                  INT16  *HFilter          : Array of 2 horizontal filter taps.
+ *                  INT16  *VFilter          : Array of 2 vertical filter taps.
+ *                  INT32  Width             : Block width
+ *                  INT32  Height            : Block height
 *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : UINT8  *dst_ptr          : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
@@ -422,23 +402,23 @@ void vp8_filter_block2d_bil_second_pass
 void vp8_filter_block2d_bil
 (
    unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
    unsigned int   dst_pitch,
-    const int      *HFilter,
-    const int      *VFilter,
+    const short   *HFilter,
+    const short   *VFilter,
    int            Width,
    int            Height
 )
 {

-    unsigned short FData[17*16];    /* Temp data bufffer used in filtering */
+    unsigned short FData[17*16];    /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter);
+    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

    /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter);
+    vp8_filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }


@@ -452,11 +432,11 @@ void vp8_bilinear_predict4x4_c
    int dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 #if 0
    {
        int i;
@@ -490,11 +470,11 @@ void vp8_bilinear_predict8x8_c
    int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);

@@ -510,11 +490,11 @@ void vp8_bilinear_predict8x4_c
    int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);

@@ -530,11 +510,11 @@ void vp8_bilinear_predict16x16_c
    int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
--- a/vp8/common/filter.h
+++ b/vp8/common/filter.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef FILTER_H
+#define FILTER_H
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT  7
+
+extern const short vp8_bilinear_filters[8][2];
+extern const short vp8_sub_pel_filters[8][6];
+
+#endif /* FILTER_H */
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -11,47 +11,9 @@

 #include "findnearmv.h"

-#define FINDNEAR_SEARCH_SITES   3
-
 /* Predict motion vectors using those from already-decoded nearby blocks.
   Note that we only consider one 4x4 subblock from each candidate 16x16
   macroblock.   */
-
-typedef union
-{
-    unsigned int as_int;
-    MV           as_mv;
-} int_mv;        /* facilitates rapid equality tests */
-
-static void mv_bias(const MODE_INFO *x, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
-{
-    MV xmv;
-    xmv = x->mbmi.mv.as_mv;
-
-    if (ref_frame_sign_bias[x->mbmi.ref_frame] != ref_frame_sign_bias[refframe])
-    {
-        xmv.row *= -1;
-        xmv.col *= -1;
-    }
-
-    mvp->as_mv = xmv;
-}
-
-
-void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
-{
-    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
-        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
-    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
-        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
-
-    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
-        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
-    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
-        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
-}
-
-
 void vp8_find_near_mvs
 (
    MACROBLOCKD *xd,
@@ -82,7 +44,7 @@ void vp8_find_near_mvs
        if (above->mbmi.mv.as_int)
        {
            (++mv)->as_int = above->mbmi.mv.as_int;
-            mv_bias(above, refframe, mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
            ++cntx;
        }

@@ -97,7 +59,7 @@ void vp8_find_near_mvs
            int_mv this_mv;

            this_mv.as_int = left->mbmi.mv.as_int;
-            mv_bias(left, refframe, &this_mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);

            if (this_mv.as_int != mv->as_int)
            {
@@ -119,7 +81,7 @@ void vp8_find_near_mvs
            int_mv this_mv;

            this_mv.as_int = aboveleft->mbmi.mv.as_int;
-            mv_bias(aboveleft, refframe, &this_mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);

            if (this_mv.as_int != mv->as_int)
            {
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -17,6 +17,41 @@
 #include "modecont.h"
 #include "treecoder.h"

+typedef union
+{
+    unsigned int as_int;
+    MV           as_mv;
+} int_mv;        /* facilitates rapid equality tests */
+
+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
+{
+    MV xmv;
+    xmv = mvp->as_mv;
+
+    if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
+    {
+        xmv.row *= -1;
+        xmv.col *= -1;
+    }
+
+    mvp->as_mv = xmv;
+}
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+static void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
+{
+    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
 void vp8_find_near_mvs
 (
    MACROBLOCKD *xd,
@@ -35,8 +70,4 @@ const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b);

 const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride);

-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
-
-
 #endif
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -28,13 +28,13 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -44,7 +44,7 @@ void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Vertical MB Filtering */
@@ -52,13 +52,13 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -68,7 +68,7 @@ void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }

 /* Horizontal B Filtering */
@@ -81,10 +81,10 @@ void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -109,10 +109,10 @@ void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }

 void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -137,8 +137,6 @@ void vp8_init_loop_filter(VP8_COMMON *cm)

    int block_inside_limit = 0;
    int HEVThresh;
-    const int yhedge_boost  = 2;
-    const int uvhedge_boost = 2;

    /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
    for (i = 0; i <= MAX_LOOP_FILTER; i++)
@@ -182,15 +180,9 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
        for (j = 0; j < 16; j++)
        {
            lfi[i].lim[j] = block_inside_limit;
-            lfi[i].mbflim[j] = filt_lvl + yhedge_boost;
-            lfi[i].mbthr[j] = HEVThresh;
+            lfi[i].mbflim[j] = filt_lvl + 2;
            lfi[i].flim[j] = filt_lvl;
            lfi[i].thr[j] = HEVThresh;
-            lfi[i].uvlim[j] = block_inside_limit;
-            lfi[i].uvmbflim[j] = filt_lvl + uvhedge_boost;
-            lfi[i].uvmbthr[j] = HEVThresh;
-            lfi[i].uvflim[j] = filt_lvl;
-            lfi[i].uvthr[j] = HEVThresh;
        }

    }
@@ -249,57 +241,52 @@ void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
        for (j = 0; j < 16; j++)
        {
            /*lfi[i].lim[j] = block_inside_limit;
-            lfi[i].mbflim[j] = filt_lvl+yhedge_boost;*/
-            lfi[i].mbthr[j] = HEVThresh;
+            lfi[i].mbflim[j] = filt_lvl+2;*/
            /*lfi[i].flim[j] = filt_lvl;*/
            lfi[i].thr[j] = HEVThresh;
-            /*lfi[i].uvlim[j] = block_inside_limit;
-            lfi[i].uvmbflim[j] = filt_lvl+uvhedge_boost;*/
-            lfi[i].uvmbthr[j] = HEVThresh;
-            /*lfi[i].uvflim[j] = filt_lvl;*/
-            lfi[i].uvthr[j] = HEVThresh;
        }
    }
 }


-void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level)
+int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level)
 {
    MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi;

    if (mbd->mode_ref_lf_delta_enabled)
    {
        /* Apply delta for reference frame */
-        *filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
+        filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];

        /* Apply delta for mode */
        if (mbmi->ref_frame == INTRA_FRAME)
        {
            /* Only the split mode BPRED has a further special case */
            if (mbmi->mode == B_PRED)
-                *filter_level +=  mbd->mode_lf_deltas[0];
+                filter_level +=  mbd->mode_lf_deltas[0];
        }
        else
        {
            /* Zero motion mode */
            if (mbmi->mode == ZEROMV)
-                *filter_level +=  mbd->mode_lf_deltas[1];
+                filter_level +=  mbd->mode_lf_deltas[1];

            /* Split MB motion mode */
            else if (mbmi->mode == SPLITMV)
-                *filter_level +=  mbd->mode_lf_deltas[3];
+                filter_level +=  mbd->mode_lf_deltas[3];

            /* All other inter motion modes (Nearest, Near, New) */
            else
-                *filter_level +=  mbd->mode_lf_deltas[2];
+                filter_level +=  mbd->mode_lf_deltas[2];
        }

        /* Range check */
-        if (*filter_level > MAX_LOOP_FILTER)
-            *filter_level = MAX_LOOP_FILTER;
-        else if (*filter_level < 0)
-            *filter_level = 0;
+        if (filter_level > MAX_LOOP_FILTER)
+            filter_level = MAX_LOOP_FILTER;
+        else if (filter_level < 0)
+            filter_level = 0;
    }
+    return filter_level;
 }


@@ -373,7 +360,7 @@ void vp8_loop_filter_frame
             * These specified to 8th pel as they are always compared to values that are in 1/8th pel units
             * Apply any context driven MB level adjustment
             */
-            vp8_adjust_mb_lf_value(mbd, &filter_level);
+            filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);

            if (filter_level)
            {
@@ -473,7 +460,7 @@ void vp8_loop_filter_frame_yonly
            filter_level = baseline_filter_level[Segment];

            /* Apply any context driven MB level adjustment */
-            vp8_adjust_mb_lf_value(mbd, &filter_level);
+            filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);

            if (filter_level)
            {
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -32,12 +32,6 @@ typedef struct
    DECLARE_ALIGNED(16, signed char, flim[16]);
    DECLARE_ALIGNED(16, signed char, thr[16]);
    DECLARE_ALIGNED(16, signed char, mbflim[16]);
-    DECLARE_ALIGNED(16, signed char, mbthr[16]);
-    DECLARE_ALIGNED(16, signed char, uvlim[16]);
-    DECLARE_ALIGNED(16, signed char, uvflim[16]);
-    DECLARE_ALIGNED(16, signed char, uvthr[16]);
-    DECLARE_ALIGNED(16, signed char, uvmbflim[16]);
-    DECLARE_ALIGNED(16, signed char, uvmbthr[16]);
 } loop_filter_info;


--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -46,7 +46,8 @@ extern "C"
    typedef enum
    {
        USAGE_STREAM_FROM_SERVER    = 0x0,
-        USAGE_LOCAL_FILE_PLAYBACK   = 0x1
+        USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
+        USAGE_CONSTRAINED_QUALITY   = 0x2
    } END_USAGE;


@@ -150,6 +151,7 @@ extern "C"
        int fixed_q;
        int worst_allowed_q;
        int best_allowed_q;
+        int cq_level;

        // allow internal resizing ( currently disabled in the build !!!!!)
        int allow_spatial_resampling;
@@ -187,7 +189,6 @@ extern "C"
        int arnr_strength ;
        int arnr_type     ;

-
        struct vpx_fixed_buf         two_pass_stats_in;
        struct vpx_codec_pkt_list  *output_pkt_list;

--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -200,7 +200,7 @@ typedef struct VP8Common
 } VP8_COMMON;


-void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level);
+int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level);
 void vp8_init_loop_filter(VP8_COMMON *cm);
 void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type);
 extern void vp8_loop_filter_frame(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val);
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -680,7 +680,6 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei

 int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
 {
-    char message[512];
    int q = oci->filter_level * 10 / 6;
    int flags = ppflags->post_proc_flag;
    int deblock_level = ppflags->deblocking_level;
@@ -744,6 +743,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
 #if CONFIG_POSTPROC_VISUALIZER
    if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
    {
+        char message[512];
        sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
                (oci->frame_type == KEY_FRAME),
                oci->refresh_golden_frame,
@@ -823,6 +823,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t

    if (flags & VP8D_DEBUG_TXT_RATE_INFO)
    {
+        char message[512];
        sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
    }
--- a/vp8/common/ppc/loopfilter_altivec.c
+++ b/vp8/common/ppc/loopfilter_altivec.c
@@ -56,10 +56,10 @@ void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void)simpler_lpf;
-    mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr);
+    mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);

    if (u_ptr)
-        mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr);
+        mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
 }

 void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -77,10 +77,10 @@ void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
                         int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void)simpler_lpf;
-    mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr);
+    mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);

    if (u_ptr)
-        mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr);
+        mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
 }

 void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -104,7 +104,7 @@ void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha
    loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);

    if (u_ptr)
-        loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr);
+        loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
 }

 void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -127,7 +127,7 @@ void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha
    loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);

    if (u_ptr)
-        loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr);
+        loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
 }

 void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -14,6 +14,8 @@

 #define VPXINFINITE 10000       /* 10second. */

+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
 /* Thread management macros */
 #ifdef _WIN32
 /* Win32 */
@@ -88,4 +90,6 @@
 #define x86_pause_hint()
 #endif

+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
 #endif
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -45,13 +45,13 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }


@@ -62,7 +62,7 @@ void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }


@@ -71,13 +71,13 @@ void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
 }


@@ -88,7 +88,7 @@ void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }


@@ -102,10 +102,10 @@ void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }


@@ -132,10 +132,10 @@ void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
    vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);

    if (v_ptr)
-        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
 }


@@ -159,10 +159,10 @@ void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
 }


@@ -173,7 +173,7 @@ void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }


@@ -182,10 +182,10 @@ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
 {
    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
+        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
 }


@@ -196,7 +196,7 @@ void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig
    (void) v_ptr;
    (void) uv_stride;
    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
 }


@@ -210,7 +210,7 @@ void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
+        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
 }


@@ -237,7 +237,7 @@ void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
    vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
+        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
 }


--- a/vp8/decoder/arm/arm_dsystemdependent.c
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -14,7 +14,6 @@
 #include "blockd.h"
 #include "pragmas.h"
 #include "postproc.h"
-#include "dboolhuff.h"
 #include "dequantize.h"
 #include "onyxd_int.h"

@@ -35,12 +34,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;
        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;
-#if 0 /*For use with RTCD, when implemented*/
-        pbi->dboolhuff.start             = vp8dx_start_decode_c;
-        pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
-        pbi->dboolhuff.debool            = vp8dx_decode_bool_c;
-        pbi->dboolhuff.devalue           = vp8dx_decode_value_c;
-#endif
    }
 #endif

@@ -54,12 +47,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;
        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;
-#if 0 /*For use with RTCD, when implemented*/
-        pbi->dboolhuff.start             = vp8dx_start_decode_c;
-        pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
-        pbi->dboolhuff.debool            = vp8dx_decode_bool_c;
-        pbi->dboolhuff.devalue           = vp8dx_decode_value_c;
-#endif
    }
 #endif
 #endif
--- a/vp8/decoder/arm/armv6/dboolhuff_v6.asm
+++ b/vp8/decoder/arm/armv6/dboolhuff_v6.asm
@@ -1,163 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_decode_value_v6|
-    EXPORT  |vp8dx_start_decode_v6|
-    EXPORT  |vp8dx_stop_decode_v6|
-    EXPORT  |vp8dx_decode_bool_v6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    INCLUDE vpx_asm_offsets.asm
-
-br      RN  r0
-prob    RN  r1
-bits    RN  r1
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;   int z = 0;
-;   int bit;
-;   for ( bit=bits-1; bit>=0; bit-- )
-;   {
-;       z |= (vp8dx_decode_bool(br, 0x80)<<bit);
-;   }
-;   return z;
-
-;int vp8_decode_value_v6 ( BOOL_DECODER *br, int bits )
-|vp8_decode_value_v6| PROC
-    stmdb   sp!, {r4 - r6, lr}
-    mov     r4, br
-    mov     r5, bits
-    mov     r6, #0
-
-    subs    r5, r5, #1
-    bmi     decode_value_exit
-
-decode_value_loop
-    mov     prob, #0x80
-    mov     br, r4
-    bl      vp8dx_decode_bool_v6_internal     ; needed for conversion to s file
-    orr     r6, r6, r0, lsl r5
-    subs    r5, r5, #1
-    bpl     decode_value_loop
-
-decode_value_exit
-    mov     r0, r6
-    ldmia   sp!, {r4 - r6, pc}
-    ENDP    ; |vp8_decode_value_v6|
-
-
-;void vp8dx_start_decode_v6 ( BOOL_DECODER *br, unsigned char *source )
-|vp8dx_start_decode_v6| PROC
-    stmdb   sp!, {r4 - r5, lr}
-    mov     r2, #0
-    mov     r3, #255
-
-    str     r2, [br, #bool_decoder_lowvalue]
-    str     r3, [br, #bool_decoder_range]
-    str     r1, [br, #bool_decoder_buffer]
-
-    mov     r3, #8
-    mov     r2, #4
-    str     r3, [br, #bool_decoder_count]
-    str     r2, [br, #bool_decoder_pos]
-
-    ldrb    r2, [r1, #3]
-    ldrb    r3, [r1, #2]
-    ldrb    r4, [r1, #1]
-    ldrb    r5, [r1]
-
-    orr     r1, r2, r3, lsl #8
-    orr     r1, r1, r4, lsl #16
-    orr     r1, r1, r5, lsl #24
-
-    str     r1, [br, #bool_decoder_value]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ; |vp8dx_start_decode_v6|
-
-
-;void vp8dx_stop_decode_v6 ( BOOL_DECODER *bc );
-|vp8dx_stop_decode_v6| PROC
-    mov     pc, lr
-    ENDP    ; |vp8dx_stop_decode_v6|
-
-
-; bigsplit  RN  r1
-; buffer_v  RN  r1
-; count_v       RN  r4
-; range_v       RN  r2
-; value_v       RN  r3
-; pos_v     RN  r5
-; split     RN  r6
-; bit           RN  lr
-;int vp8dx_decode_bool_v6 ( BOOL_DECODER *br, int probability )
-|vp8dx_decode_bool_v6| PROC
-vp8dx_decode_bool_v6_internal
-    stmdb   sp!, {r4 - r6, lr}
-
-    ldr     r2, [br, #bool_decoder_range]
-    ldr     r3, [br, #bool_decoder_value]
-
-    mov     r6, r2, lsl #8
-    sub     r6, r6, #256                ;   split = 1 +  (((range-1) * probability) >> 8)
-    mov     r12, #1
-    smlawb  r6, r6, prob, r12
-
-    mov     lr, #0
-    subs    r5, r3, r6, lsl #24
-
-    ;cmp        r3, r1
-    movhs   lr, #1
-    movhs   r3, r5
-    subhs   r2, r2, r6
-    movlo   r2, r6
-
-    cmp     r2, #0x80
-    blt     range_less_0x80
-    ;strd   r2, r3, [br, #bool_decoder_range]
-    str     r2, [br, #bool_decoder_range]
-    str     r3, [br, #bool_decoder_value]
-    mov     r0, lr
-    ldmia   sp!, {r4 - r6, pc}
-
-range_less_0x80
-    ldr     r5, [br, #bool_decoder_pos]
-    ldr     r1, [br, #bool_decoder_buffer]
-    ldr     r4, [br, #bool_decoder_count]
-    add     r1, r1, r5
-
-    clz       r12, r2
-    sub       r12, r12, #24
-    subs      r4, r4, r12
-    ldrleb    r6, [r1], #1
-    mov       r2, r2, lsl r12
-    mov       r3, r3, lsl r12
-    addle     r4, r4, #8
-    rsble     r12, r4, #8
-    addle     r5, r5, #1
-    orrle     r3, r3, r6, lsl r12
-
-    ;strd       r2, r3, [br, #bool_decoder_range]
-    ;strd       r4, r5, [br, #bool_decoder_count]
-    str         r2, [br, #bool_decoder_range]
-    str         r3, [br, #bool_decoder_value]
-    str         r4, [br, #bool_decoder_count]
-    str         r5, [br, #bool_decoder_pos]
-
-    mov     r0, lr
-
-    ldmia   sp!, {r4 - r6, pc}
-    ENDP    ; |vp8dx_decode_bool_v6|
-
-    END
--- a/vp8/decoder/arm/dboolhuff_arm.h
+++ b/vp8/decoder/arm/dboolhuff_arm.h
@@ -1,43 +0,0 @@
-#ifndef DBOOLHUFF_ARM_H
-#define DBOOLHUFF_ARM_H
-
-/* JLK
- * There are currently no arm-optimized versions of
- * these functions. As they are implemented, they
- * can be uncommented below and added to
- * arm/dsystemdependent.c
- *
- * The existing asm code is likely so different as
- * to be useless. However, its been left (for now)
- * for reference.
- */
-#if 0
-#if HAVE_ARMV6
-#undef vp8_dbool_start
-#define vp8_dbool_start vp8dx_start_decode_v6
-
-#undef vp8_dbool_fill
-#define vp8_dbool_fill vp8_bool_decoder_fill_v6
-
-#undef vp8_dbool_debool
-#define vp8_dbool_debool vp8_decode_bool_v6
-
-#undef vp8_dbool_devalue
-#define vp8_dbool_devalue vp8_decode_value_v6
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-#undef vp8_dbool_start
-#define vp8_dbool_start vp8dx_start_decode_neon
-
-#undef vp8_dbool_fill
-#define vp8_dbool_fill vp8_bool_decoder_fill_neon
-
-#undef vp8_dbool_debool
-#define vp8_dbool_debool vp8_decode_bool_neon
-
-#undef vp8_dbool_devalue
-#define vp8_dbool_devalue vp8_decode_value_neon
-#endif /* HAVE_ARMV7 */
-#endif
-#endif /* DBOOLHUFF_ARM_H */
--- a/vp8/decoder/arm/detokenize.asm
+++ b/vp8/decoder/arm/detokenize.asm
@@ -1,320 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_decode_mb_tokens_v6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-    INCLUDE vpx_asm_offsets.asm
-
-l_qcoeff    EQU     0
-l_i         EQU     4
-l_type      EQU     8
-l_stop      EQU     12
-l_c         EQU     16
-l_l_ptr     EQU     20
-l_a_ptr     EQU     24
-l_bc        EQU     28
-l_coef_ptr  EQU     32
-l_stacksize EQU     64
-
-
-;; constant offsets -- these should be created at build time
-c_block2above_offset         EQU 25
-c_entropy_nodes              EQU 11
-c_dct_eob_token              EQU 11
-
-|vp8_decode_mb_tokens_v6| PROC
-    stmdb       sp!, {r4 - r11, lr}
-    sub         sp, sp, #l_stacksize
-    mov         r7, r1                      ; type
-    mov         r9, r0                      ; detoken
-
-    ldr         r1, [r9, #detok_current_bc]
-    ldr         r0, [r9, #detok_qcoeff_start_ptr]
-    mov         r11, #0                     ; i
-    mov         r3, #16                     ; stop
-
-    cmp         r7, #1                      ; type ?= 1
-    addeq       r11, r11, #24               ; i = 24
-    addeq       r3, r3, #8                  ; stop = 24
-    addeq       r0, r0, #3, 24              ; qcoefptr += 24*16
-
-    str         r0, [sp, #l_qcoeff]
-    str         r11, [sp, #l_i]
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-    str         r1, [sp, #l_bc]
-
-    add         lr, r9, r7, lsl #2          ; detoken + type*4
-
-    ldr         r8, [r1, #bool_decoder_user_buffer]
-
-    ldr         r10, [lr, #detok_coef_probs]
-    ldr         r5, [r1, #bool_decoder_count]
-    ldr         r6, [r1, #bool_decoder_range]
-    ldr         r4, [r1, #bool_decoder_value]
-
-    str         r10, [sp, #l_coef_ptr]
-
-BLOCK_LOOP
-    ldr         r3, [r9, #detok_ptr_block2leftabove]
-    ldr         r1, [r9, #detok_L]
-    ldr         r2, [r9, #detok_A]
-    ldrb        r12, [r3, r11]!             ; block2left[i]
-    ldrb        r3, [r3, #c_block2above_offset]; block2above[i]
-
-    cmp         r7, #0                      ; c = !type
-    moveq       r7, #1
-    movne       r7, #0
-
-    ldrb        r0, [r1, r12]!              ; *(L += block2left[i])
-    ldrb        r3, [r2, r3]!               ; *(A += block2above[i])
-    mov         lr, #c_entropy_nodes        ; ENTROPY_NODES = 11
-
-; VP8_COMBINEENTROPYCONTETEXTS(t, *a, *l) => t = ((*a) != 0) + ((*l) !=0)
-    cmp         r0, #0                      ; *l ?= 0
-    movne       r0, #1
-    cmp         r3, #0                      ; *a ?= 0
-    addne       r0, r0, #1                  ; t
-
-    str         r1, [sp, #l_l_ptr]          ; save &l
-    str         r2, [sp, #l_a_ptr]          ; save &a
-    smlabb      r0, r0, lr, r10             ; Prob = coef_probs + (t * ENTROPY_NODES)
-    mov         r1, #0                      ; t = 0
-    str         r7, [sp, #l_c]
-
-    ;align 4
-COEFF_LOOP
-    ldr         r3, [r9, #detok_ptr_coef_bands_x]
-    ldr         lr, [r9, #detok_coef_tree_ptr]
-    ;STALL
-    ldrb        r3, [r3, r7]                ; coef_bands_x[c]
-    ;STALL
-    ;STALL
-    add         r0, r0, r3                  ; Prob += coef_bands_x[c]
-
-get_token_loop
-    ldrb        r2, [r0, +r1, asr #1]       ; Prob[t >> 1]
-    mov         r3, r6, lsl #8              ; range << 8
-    sub         r3, r3, #256                ; (range << 8) - (1 << 8)
-    mov         r10, #1                     ; 1
-
-    smlawb      r2, r3, r2, r10             ; split = 1 + (((range-1) * probability) >> 8)
-
-    ldrb        r12, [r8]                   ; load cx data byte in stall slot : r8 = bufptr
-    ;++
-
-    subs        r3, r4, r2, lsl #24         ; value-(split<<24): used later to calculate shift for NORMALIZE
-    addhs       r1, r1, #1                  ; t += 1
-    movhs       r4, r3                      ; value -= bigsplit (split << 24)
-    subhs       r2, r6, r2                  ; range -= split
- ;   movlo       r6, r2                      ; range = split
-
-    ldrsb     r1, [lr, r1]                  ; t = onyx_coef_tree_ptr[t]
-
-; NORMALIZE
-    clz         r3, r2                      ; vp8dx_bitreader_norm[range] + 24
-    sub         r3, r3, #24                 ; vp8dx_bitreader_norm[range]
-    subs        r5, r5, r3                  ; count -= shift
-    mov         r6, r2, lsl r3              ; range <<= shift
-    mov         r4, r4, lsl r3              ; value <<= shift
-
-; if count <= 0, += BR_COUNT; value |= *bufptr++ << (BR_COUNT-count); BR_COUNT = 8, but need to upshift values by +16
-    addle         r5, r5, #8                ; count += 8
-    rsble         r3, r5, #24               ; 24 - count
-    addle         r8, r8, #1                ; bufptr++
-    orrle         r4, r4, r12, lsl r3       ; value |= *bufptr << shift + 16
-
-    cmp         r1, #0                      ; t ?= 0
-    bgt         get_token_loop              ; while (t > 0)
-
-    cmn         r1, #c_dct_eob_token        ; if(t == -DCT_EOB_TOKEN)
-    beq         END_OF_BLOCK                ; break
-
-    rsb         lr, r1, #0                  ; v = -t;
-
-    cmp         lr, #4                      ; if(v > FOUR_TOKEN)
-    ble         SKIP_EXTRABITS
-
-    ldr         r3, [r9, #detok_teb_base_ptr]
-    mov         r11, #1                     ; 1 in split = 1 + ... nope, v+= 1 << bits_count
-    add         r7, r3, lr, lsl #4          ; detok_teb_base_ptr + (v << 4)
-
-    ldrsh       lr, [r7, #tokenextrabits_min_val] ; v = teb_ptr->min_val
-    ldrsh       r0, [r7, #tokenextrabits_length] ; bits_count = teb_ptr->Length
-
-extrabits_loop
-    add         r3, r0, r7                  ; &teb_ptr->Probs[bits_count]
-
-    ldrb        r2, [r3, #4]                ; probability. why +4?
-    mov         r3, r6, lsl #8              ; range << 8
-    sub         r3, r3, #256                ; range << 8 + 1 << 8
-
-    smlawb      r2, r3, r2, r11             ; split = 1 +  (((range-1) * probability) >> 8)
-
-    ldrb        r12, [r8]                   ; *bufptr
-    ;++
-
-    subs        r10, r4, r2, lsl #24        ; value - (split<<24)
-    movhs       r4, r10                     ; value = value - (split << 24)
-    subhs       r2, r6, r2                  ; range = range - split
-    addhs       lr, lr, r11, lsl r0         ; v += ((UINT16)1<<bits_count)
-
-; NORMALIZE
-    clz         r3, r2                      ; shift - leading zeros in split
-    sub         r3, r3, #24                 ; don't count first 3 bytes
-    subs        r5, r5, r3                  ; count -= shift
-    mov         r6, r2, lsl r3              ; range = range << shift
-    mov         r4, r4, lsl r3              ; value <<= shift
-
-    addle       r5, r5, #8                  ; count += BR_COUNT
-    addle       r8, r8, #1                  ; bufptr++
-    rsble       r3, r5, #24                 ; BR_COUNT - count
-    orrle       r4, r4, r12, lsl r3         ; value |= *bufptr << (BR_COUNT - count)
-
-    subs        r0, r0, #1                  ; bits_count --
-    bpl         extrabits_loop
-
-
-SKIP_EXTRABITS
-    ldr         r11, [sp, #l_qcoeff]
-    ldr         r0, [sp, #l_coef_ptr]       ; Prob = coef_probs
-
-    cmp         r1, #0                      ; check for nonzero token - if (t)
-    beq         SKIP_EOB_CHECK              ; if t is zero, we will skip the eob table chec
-
-    add         r3, r6, #1                  ; range + 1
-    mov         r2, r3, lsr #1              ; split = (range + 1) >> 1
-
-    subs        r3, r4, r2, lsl #24         ; value - (split<<24)
-    movhs       r4, r3                      ; value -= (split << 24)
-    subhs       r2, r6, r2                  ; range -= split
-    mvnhs       r3, lr                      ; -v
-    addhs       lr, r3, #1                  ; v = (v ^ -1) + 1
-
-; NORMALIZE
-    clz         r3, r2                      ; leading 0s in split
-    sub         r3, r3, #24                 ; shift
-    subs        r5, r5, r3                  ; count -= shift
-    mov         r6, r2, lsl r3              ; range <<= shift
-    mov         r4, r4, lsl r3              ; value <<= shift
-    ldrleb      r2, [r8], #1                ; *(bufptr++)
-    addle       r5, r5, #8                  ; count += 8
-    rsble       r3, r5, #24                 ; BR_COUNT - count
-    orrle       r4, r4, r2, lsl r3          ; value |= *bufptr << (BR_COUNT - count)
-
-    add         r0, r0, #11                 ; Prob += ENTROPY_NODES (11)
-
-    cmn         r1, #1                      ; t < -ONE_TOKEN
-
-    addlt       r0, r0, #11                 ; Prob += ENTROPY_NODES (11)
-
-    mvn         r1, #1                      ; t = -1 ???? C is -2
-
-SKIP_EOB_CHECK
-    ldr         r7, [sp, #l_c]              ; c
-    ldr         r3, [r9, #detok_scan]
-    add         r1, r1, #2                  ; t+= 2
-    cmp         r7, #15                     ; c should will be one higher
-
-    ldr         r3, [r3, +r7, lsl #2]       ; scan[c] this needs pre-inc c value
-    add         r7, r7, #1                  ; c++
-    add         r3, r11, r3, lsl #1         ; qcoeff + scan[c]
-
-    str         r7, [sp, #l_c]              ; store c
-    strh        lr, [r3]                    ; qcoef_ptr[scan[c]] = v
-
-    blt         COEFF_LOOP
-
-    sub         r7, r7, #1                  ; if(t != -DCT_EOB_TOKEN) --c
-
-END_OF_BLOCK
-    ldr         r3, [sp, #l_type]           ; type
-    ldr         r10, [sp, #l_coef_ptr]      ; coef_ptr
-    ldr         r0, [sp, #l_qcoeff]         ; qcoeff
-    ldr         r11, [sp, #l_i]             ; i
-    ldr         r12, [sp, #l_stop]          ; stop
-
-    cmp         r3, #0                      ; type ?= 0
-    moveq       r1, #1
-    movne       r1, #0
-    add         r3, r11, r9                 ; detok + i
-
-    cmp         r7, r1                      ; c ?= !type
-    strb        r7, [r3, #detok_eob]        ; eob[i] = c
-
-    ldr         r7, [sp, #l_l_ptr]          ; l
-    ldr         r2, [sp, #l_a_ptr]          ; a
-    movne       r3, #1                      ; t
-    moveq       r3, #0
-
-    add         r0, r0, #32                 ; qcoeff += 32 (16 * 2?)
-    add         r11, r11, #1                ; i++
-    strb        r3, [r7]                    ; *l = t
-    strb        r3, [r2]                    ; *a = t
-    str         r0, [sp, #l_qcoeff]         ; qcoeff
-    str         r11, [sp, #l_i]             ; i
-
-    cmp         r11, r12                    ; i < stop
-    ldr         r7, [sp, #l_type]           ; type
-
-    blt         BLOCK_LOOP
-
-    cmp         r11, #25                    ; i ?= 25
-    bne         ln2_decode_mb_to
-
-    ldr         r12, [r9, #detok_qcoeff_start_ptr]
-    ldr         r10, [r9, #detok_coef_probs]
-    mov         r7, #0                      ; type/i = 0
-    mov         r3, #16                     ; stop = 16
-    str         r12, [sp, #l_qcoeff]        ; qcoeff_ptr = qcoeff_start_ptr
-    str         r7, [sp, #l_i]
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-
-    str         r10, [sp, #l_coef_ptr]      ; coef_probs = coef_probs[type=0]
-
-    b           BLOCK_LOOP
-
-ln2_decode_mb_to
-    cmp         r11, #16                    ; i ?= 16
-    bne         ln1_decode_mb_to
-
-    mov         r10, #detok_coef_probs
-    add         r10, r10, #2*4              ; coef_probs[type]
-    ldr         r10, [r9, r10]              ; detok + detok_coef_probs[type]
-
-    mov         r7, #2                      ; type = 2
-    mov         r3, #24                     ; stop = 24
-
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-
-    str         r10, [sp, #l_coef_ptr]      ; coef_probs = coef_probs[type]
-    b           BLOCK_LOOP
-
-ln1_decode_mb_to
-    ldr         r2, [sp, #l_bc]
-    mov         r0, #0
-    nop
-
-    str         r8, [r2, #bool_decoder_user_buffer]
-    str         r5, [r2, #bool_decoder_count]
-    str         r4, [r2, #bool_decoder_value]
-    str         r6, [r2, #bool_decoder_range]
-
-    add         sp, sp, #l_stacksize
-    ldmia       sp!, {r4 - r11, pc}
-
-    ENDP  ; |vp8_decode_mb_tokens_v6|
-
-    END
--- a/vp8/decoder/arm/neon/dboolhuff_neon.asm
+++ b/vp8/decoder/arm/neon/dboolhuff_neon.asm
@@ -1,160 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_decode_value_neon|
-    EXPORT  |vp8dx_start_decode_neon|
-    EXPORT  |vp8dx_stop_decode_neon|
-    EXPORT  |vp8dx_decode_bool_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    INCLUDE vpx_asm_offsets.asm
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;   int z = 0;
-;   int bit;
-;   for ( bit=bits-1; bit>=0; bit-- )
-;   {
-;       z |= (vp8dx_decode_bool(br, 0x80)<<bit);
-;   }
-;   return z;
-
-;int vp8_decode_value_neon ( BOOL_DECODER *br, int bits )
-|vp8_decode_value_neon| PROC
-    stmdb   sp!, {r4 - r6, lr}
-    mov     r4, r0
-    mov     r5, r1
-    mov     r6, #0
-
-    subs    r5, r5, #1
-    bmi     decode_value_exit
-
-decode_value_loop
-    mov     r1, #0x80
-    mov     r0, r4
-    bl      vp8dx_decode_bool_neon_internal       ; needed for conversion to s file
-    orr     r6, r6, r0, lsl r5
-    subs    r5, r5, #1
-    bpl     decode_value_loop
-
-decode_value_exit
-    mov     r0, r6
-    ldmia   sp!, {r4 - r6, pc}
-    ENDP    ; |vp8_decode_value_neon|
-
-
-;void vp8dx_start_decode_neon ( BOOL_DECODER *br, unsigned char *source )
-|vp8dx_start_decode_neon| PROC
-    stmdb   sp!, {r4 - r5, lr}
-    mov     r2, #0
-    mov     r3, #255
-
-    str     r2, [r0, #bool_decoder_lowvalue]
-    str     r3, [r0, #bool_decoder_range]
-    str     r1, [r0, #bool_decoder_buffer]
-
-    mov     r3, #8
-    mov     r2, #4
-    str     r3, [r0, #bool_decoder_count]
-    str     r2, [r0, #bool_decoder_pos]
-
-    ldrb    r2, [r1, #3]
-    ldrb    r3, [r1, #2]
-    ldrb    r4, [r1, #1]
-    ldrb    r5, [r1]
-
-    orr     r1, r2, r3, lsl #8
-    orr     r1, r1, r4, lsl #16
-    orr     r1, r1, r5, lsl #24
-
-    str     r1, [r0, #bool_decoder_value]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ; |vp8dx_start_decode_neon|
-
-
-;void vp8dx_stop_decode_neon ( BOOL_DECODER *bc );
-|vp8dx_stop_decode_neon| PROC
-    mov     pc, lr
-    ENDP    ; |vp8dx_stop_decode_neon|
-
-
-; bigsplit  RN  r1
-; buffer_v  RN  r1
-; count_v       RN  r4
-; range_v       RN  r2
-; value_v       RN  r3
-; pos_v     RN  r5
-; split     RN  r6
-; bit           RN  lr
-;int vp8dx_decode_bool_neon ( BOOL_DECODER *br, int probability )
-|vp8dx_decode_bool_neon| PROC
-vp8dx_decode_bool_neon_internal
-;LDRD and STRD doubleword data transfers must be eight-byte aligned. Use ALIGN 8
-;before memory allocation
-    stmdb   sp!, {r4 - r5, lr}
-
-    ldr     r2, [r0, #bool_decoder_range]       ;load range (r2), value(r3)
-    ldr     r3, [r0, #bool_decoder_value]
-    ;ldrd   r2, r3, [r0, #bool_decoder_range]   ;ldrd costs 2 cycles
-    ;
-
-    mov     r4, r2, lsl #8
-    sub     r4, r4, #256
-    mov     r12, #1
-
-    smlawb  r4, r4, r1, r12         ;split = 1 +  (((range-1) * probability) >> 8)
-
-    mov     lr, r0
-    mov     r0, #0                  ;bit = 0
-    ;
-    subs    r5, r3, r4, lsl #24
-
-    subhs   r2, r2, r4              ;range = br->range-split
-    movlo   r2, r4                  ;range = split
-    movhs   r0, #1                  ;bit = 1
-    movhs   r3, r5                  ;value = value-bigsplit
-
-    cmp     r2, #0x80
-    blt     range_less_0x80
-    strd    r2, r3, [lr, #bool_decoder_range]   ;store result
-
-    ldmia   sp!, {r4 - r5, pc}
-
-range_less_0x80
-
-    ldrd    r4, r5, [lr, #bool_decoder_count]   ;load count, pos, buffer
-    ldr     r1, [lr, #bool_decoder_buffer]
-
-    clz     r12, r2
-    add     r1, r1, r5
-
-    sub     r12, r12, #24
-    subs    r4, r4, r12             ;count -= shift
-    mov     r2, r2, lsl r12         ;range <<= shift
-    mov     r3, r3, lsl r12         ;value <<= shift
-    addle   r4, r4, #8              ;count += 8
-    ldrleb  r12, [r1], #1           ;br->buffer[br->pos]
-
-    rsble   r1, r4, #8              ;-count
-    addle   r5, r5, #1              ;br->pos++
-    orrle   r3, r3, r12, lsl r1     ;value |= (br->buffer[br->pos]) << (-count)
-
-    strd    r2, r3, [lr, #bool_decoder_range]   ;store result
-    strd    r4, r5, [lr, #bool_decoder_count]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ; |vp8dx_decode_bool_neon|
-
-    END
--- a/vp8/common/arm/vpx_asm_offsets.c
+++ b/vp8/common/arm/vpx_asm_offsets.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -12,13 +12,7 @@
 #include "vpx_ports/config.h"
 #include <stddef.h>

-#if CONFIG_VP8_ENCODER
-#include "vpx_scale/yv12config.h"
-#endif
-
-#if CONFIG_VP8_DECODER
 #include "onyxd_int.h"
-#endif

 #define DEFINE(sym, val) int sym = val;

@@ -31,29 +25,6 @@
 * {
 */

-#if CONFIG_VP8_DECODER || CONFIG_VP8_ENCODER
-DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
-#endif
-
-#if CONFIG_VP8_DECODER
-DEFINE(mb_diff,                                 offsetof(MACROBLOCKD, diff));
-DEFINE(mb_predictor,                            offsetof(MACROBLOCKD, predictor));
-DEFINE(mb_dst_y_stride,                         offsetof(MACROBLOCKD, dst.y_stride));
-DEFINE(mb_dst_y_buffer,                         offsetof(MACROBLOCKD, dst.y_buffer));
-DEFINE(mb_dst_u_buffer,                         offsetof(MACROBLOCKD, dst.u_buffer));
-DEFINE(mb_dst_v_buffer,                         offsetof(MACROBLOCKD, dst.v_buffer));
-DEFINE(mb_up_available,                         offsetof(MACROBLOCKD, up_available));
-DEFINE(mb_left_available,                       offsetof(MACROBLOCKD, left_available));
-
 DEFINE(detok_scan,                              offsetof(DETOK, scan));
 DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
 DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp8_coef_tree_ptr));
@@ -77,7 +48,6 @@ DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));

 DEFINE(tokenextrabits_min_val,                  offsetof(TOKENEXTRABITS, min_val));
 DEFINE(tokenextrabits_length,                   offsetof(TOKENEXTRABITS, Length));
-#endif

 //add asserts for any offset that is not supported by assembly code
 //add asserts for any size that is not supported by assembly code
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c
@@ -26,8 +26,9 @@ DECLARE_ALIGNED(16, const unsigned char, vp8dx_bitreader_norm[256]) =
 };


-int vp8dx_start_decode_c(BOOL_DECODER *br, const unsigned char *source,
-                        unsigned int source_sz)
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz)
 {
    br->user_buffer_end = source+source_sz;
    br->user_buffer     = source;
@@ -39,13 +40,13 @@ int vp8dx_start_decode_c(BOOL_DECODER *br, const unsigned char *source,
        return 1;

    /* Populate the buffer */
-    vp8dx_bool_decoder_fill_c(br);
+    vp8dx_bool_decoder_fill(br);

    return 0;
 }


-void vp8dx_bool_decoder_fill_c(BOOL_DECODER *br)
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
 {
    const unsigned char *bufptr;
    const unsigned char *bufend;
@@ -62,69 +63,3 @@ void vp8dx_bool_decoder_fill_c(BOOL_DECODER *br)
    br->value = value;
    br->count = count;
 }
-
-#if 0
-/*
- * Until optimized versions of these functions are available, we
- * keep the implementation in the header to allow inlining.
- *
- * The RTCD-style invocations are still in place so this can
- * be switched by just uncommenting these functions here and
- * the DBOOLHUFF_INVOKE calls in the header.
- */
-int vp8dx_decode_bool_c(BOOL_DECODER *br, int probability)
-{
-    unsigned int bit=0;
-    VP8_BD_VALUE value;
-    unsigned int split;
-    VP8_BD_VALUE bigsplit;
-    int count;
-    unsigned int range;
-
-    value = br->value;
-    count = br->count;
-    range = br->range;
-
-    split = 1 + (((range-1) * probability) >> 8);
-    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
-
-    range = split;
-    if(value >= bigsplit)
-    {
-        range = br->range-split;
-        value = value-bigsplit;
-        bit = 1;
-    }
-
-    /*if(range>=0x80)
-    {
-        br->value = value;
-        br->range = range;
-        return bit;
-    }*/
-
-    {
-        register unsigned int shift = vp8dx_bitreader_norm[range];
-        range <<= shift;
-        value <<= shift;
-        count -= shift;
-    }
-    br->value = value;
-    br->count = count;
-    br->range = range;
-    if (count < 0)
-        vp8dx_bool_decoder_fill_c(br);
-    return bit;
-}
-
-int vp8dx_decode_value_c(BOOL_DECODER *br, int bits)
-{
-    int z = 0;
-    int bit;
-    for ( bit=bits-1; bit>=0; bit-- )
-    {
-        z |= (vp8dx_decode_bool(br, 0x80)<<bit);
-    }
-    return z;
-}
-#endif
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -25,10 +25,6 @@ typedef size_t VP8_BD_VALUE;
  Even relatively modest values like 100 would work fine.*/
 # define VP8_LOTS_OF_BITS (0x40000000)

-
-
-struct vp8_dboolhuff_rtcd_vtable;
-
 typedef struct
 {
    const unsigned char *user_buffer_end;
@@ -36,82 +32,15 @@ typedef struct
    VP8_BD_VALUE         value;
    int                  count;
    unsigned int         range;
-#if CONFIG_RUNTIME_CPU_DETECT
-    struct vp8_dboolhuff_rtcd_vtable *rtcd;
-#endif
 } BOOL_DECODER;

-#define prototype_dbool_start(sym) int sym(BOOL_DECODER *br, \
-    const unsigned char *source, unsigned int source_sz)
-#define prototype_dbool_fill(sym) void sym(BOOL_DECODER *br)
-#define prototype_dbool_debool(sym) int sym(BOOL_DECODER *br, int probability)
-#define prototype_dbool_devalue(sym) int sym(BOOL_DECODER *br, int bits)
-
-#if ARCH_ARM
-#include "arm/dboolhuff_arm.h"
-#endif
-
-#ifndef vp8_dbool_start
-#define vp8_dbool_start vp8dx_start_decode_c
-#endif
-
-#ifndef vp8_dbool_fill
-#define vp8_dbool_fill vp8dx_bool_decoder_fill_c
-#endif
-
-#ifndef vp8_dbool_debool
-#define vp8_dbool_debool vp8dx_decode_bool_c
-#endif
-
-#ifndef vp8_dbool_devalue
-#define vp8_dbool_devalue vp8dx_decode_value_c
-#endif
-
-extern prototype_dbool_start(vp8_dbool_start);
-extern prototype_dbool_fill(vp8_dbool_fill);
-extern prototype_dbool_debool(vp8_dbool_debool);
-extern prototype_dbool_devalue(vp8_dbool_devalue);
-
-typedef prototype_dbool_start((*vp8_dbool_start_fn_t));
-typedef prototype_dbool_fill((*vp8_dbool_fill_fn_t));
-typedef prototype_dbool_debool((*vp8_dbool_debool_fn_t));
-typedef prototype_dbool_devalue((*vp8_dbool_devalue_fn_t));
-
-typedef struct vp8_dboolhuff_rtcd_vtable {
-    vp8_dbool_start_fn_t   start;
-    vp8_dbool_fill_fn_t    fill;
-    vp8_dbool_debool_fn_t  debool;
-    vp8_dbool_devalue_fn_t devalue;
-} vp8_dboolhuff_rtcd_vtable_t;
-
-/* There are no processor-specific versions of these
- * functions right now. Disable RTCD to avoid using
- * function pointers which gives a speed boost
- */
-/*#ifdef ENABLE_RUNTIME_CPU_DETECT
-#define DBOOLHUFF_INVOKE(ctx,fn) (ctx)->fn
-#define IF_RTCD(x) (x)
-#else*/
-#define DBOOLHUFF_INVOKE(ctx,fn) vp8_dbool_##fn
-#define IF_RTCD(x) NULL
-/*#endif*/
-
 DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);

-/* wrapper functions to hide RTCD. static means inline means hopefully no
- * penalty
- */
-static int vp8dx_start_decode(BOOL_DECODER *br,
-        struct vp8_dboolhuff_rtcd_vtable *rtcd,
-        const unsigned char *source, unsigned int source_sz) {
-#if CONFIG_RUNTIME_CPU_DETECT
-    br->rtcd = rtcd;
-#endif
-    return DBOOLHUFF_INVOKE(rtcd, start)(br, source, source_sz);
-}
-static void vp8dx_bool_decoder_fill(BOOL_DECODER *br) {
-    DBOOLHUFF_INVOKE(br->rtcd, fill)(br);
-}
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz);
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br);

 /*The refill loop is used in several places, so define it in a macro to make
   sure they're all consistent.
@@ -138,12 +67,6 @@ static void vp8dx_bool_decoder_fill(BOOL_DECODER *br) {


 static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
-  /*
-   * Until optimized versions of this function are available, we
-   * keep the implementation in the header to allow inlining.
-   *
-   *return DBOOLHUFF_INVOKE(br->rtcd, debool)(br, probability);
-   */
    unsigned int bit = 0;
    VP8_BD_VALUE value;
    unsigned int split;
@@ -167,13 +90,6 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
        bit = 1;
    }

-    /*if(range>=0x80)
-    {
-        br->value = value;
-        br->range = range;
-        return bit
-    }*/
-
    {
        register unsigned int shift = vp8dx_bitreader_norm[range];
        range <<= shift;
@@ -190,12 +106,6 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {

 static int vp8_decode_value(BOOL_DECODER *br, int bits)
 {
-  /*
-   * Until optimized versions of this function are available, we
-   * keep the implementation in the header to allow inlining.
-   *
-   *return DBOOLHUFF_INVOKE(br->rtcd, devalue)(br, bits);
-   */
    int z = 0;
    int bit;

@@ -206,4 +116,29 @@ static int vp8_decode_value(BOOL_DECODER *br, int bits)

    return z;
 }
+
+static int vp8dx_bool_error(BOOL_DECODER *br)
+{
+  /* Return 1 if decoding has run past the end of the buffer, else 0.
+   *
+   * Variable 'count' stores the number of bits in the 'value' buffer,
+   * minus 8. So if count == 8, there are 16 bits available to be read.
+   * Normally, count is filled with 8 and one byte is filled into the
+   * value buffer. When we reach the end of the buffer, count is instead
+   * filled with VP8_LOTS_OF_BITS, 8 of which represent the last 8 real
+   * bits from the bitstream. So the last bit in the bitstream will be
+   * represented by count == VP8_LOTS_OF_BITS - 16.
+   */
+    if ((br->count > VP8_BD_VALUE_SIZE)
+        && (br->count <= VP8_LOTS_OF_BITS - 16))
+    {
+       /* We have tried to decode bits after the end of
+        * the stream was encountered.
+        */
+        return 1;
+    }
+
+    /* No error. */
+    return 0;
+}
 #endif
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -381,6 +381,12 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
        xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
        xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;

+        if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
+        {
+            /* propagate errors from reference frames */
+            xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
+        }
+
        vp8_build_uvmvs(xd, pc->full_pixel);

        /*
@@ -391,6 +397,8 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
        */
        vp8_decode_macroblock(pbi, xd);

+        /* check if the boolean decoder has suffered an error */
+        xd->corrupted |= vp8dx_bool_error(xd->current_bc);

        recon_yoffset += 16;
        recon_uvoffset += 8;
@@ -467,8 +475,7 @@ static void setup_token_decoder(VP8D_COMP *pbi,
                               "Truncated packet or corrupt partition "
                               "%d length", i + 1);

-        if (vp8dx_start_decode(bool_decoder, IF_RTCD(&pbi->dboolhuff),
-                               partition, partition_size))
+        if (vp8dx_start_decode(bool_decoder, partition, partition_size))
            vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                               "Failed to allocate bool decoder %d", i + 1);

@@ -477,15 +484,16 @@ static void setup_token_decoder(VP8D_COMP *pbi,
        bool_decoder++;
    }

+#if CONFIG_MULTITHREAD
    /* Clamp number of decoder threads */
    if (pbi->decoding_thread_count > num_part - 1)
        pbi->decoding_thread_count = num_part - 1;
+#endif
 }


 static void stop_token_decoder(VP8D_COMP *pbi)
 {
-    int i;
    VP8_COMMON *pc = &pbi->common;

    if (pc->multi_token_partition != ONE_PARTITION)
@@ -556,6 +564,7 @@ static void init_frame(VP8D_COMP *pbi)
    xd->frame_type = pc->frame_type;
    xd->mode_info_context->mbmi.mode = DC_PRED;
    xd->mode_info_stride = pc->mode_info_stride;
+    xd->corrupted = 0; /* init without corruption */
 }

 int vp8_decode_frame(VP8D_COMP *pbi)
@@ -571,6 +580,10 @@ int vp8_decode_frame(VP8D_COMP *pbi)
    int i, j, k, l;
    const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;

+    /* start with no corruption of current frame */
+    xd->corrupted = 0;
+    pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
+
    if (data_end - data < 3)
        vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                           "Truncated packet");
@@ -639,8 +652,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)

    init_frame(pbi);

-    if (vp8dx_start_decode(bc, IF_RTCD(&pbi->dboolhuff),
-                           data, data_end - data))
+    if (vp8dx_start_decode(bc, data, data_end - data))
        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                           "Failed to allocate bool decoder 0");
    if (pc->frame_type == KEY_FRAME) {
@@ -834,7 +846,9 @@ int vp8_decode_frame(VP8D_COMP *pbi)
    vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));

    /* set up frame new frame for intra coded blocks */
+#if CONFIG_MULTITHREAD
    if (!(pbi->b_multithreaded_rd) || pc->multi_token_partition == ONE_PARTITION || !(pc->filter_level))
+#endif
        vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);

    vp8_setup_block_dptrs(xd);
@@ -854,6 +868,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)

    vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));

+#if CONFIG_MULTITHREAD
    if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
    {
        vp8mt_decode_mb_rows(pbi, xd);
@@ -868,6 +883,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
        vp8_yv12_extend_frame_borders_ptr(&pc->yv12_fb[pc->new_fb_idx]);    /*cm->frame_to_show);*/
    }
    else
+#endif
    {
        int ibc = 0;
        int num_part = 1 << pc->multi_token_partition;
@@ -892,6 +908,14 @@ int vp8_decode_frame(VP8D_COMP *pbi)

    stop_token_decoder(pbi);

+    /* Collect information about decoder corruption. */
+    /* 1. Check first boolean decoder for errors. */
+    pc->yv12_fb[pc->new_fb_idx].corrupted =
+        vp8dx_bool_error(bc);
+    /* 2. Check the macroblock information */
+    pc->yv12_fb[pc->new_fb_idx].corrupted |=
+        xd->corrupted;
+
    /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes  \n",bc->pos+pbi->bc2.pos); */

    /* If this was a kf or Gf note the Q used */
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -74,37 +74,6 @@ void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
    }
 }

-#if CONFIG_ARM_ASM_DETOK
-/* mashup of vp8_block2left and vp8_block2above so we only need one pointer
- * for the assembly version.
- */
-DECLARE_ALIGNED(16, const UINT8, vp8_block2leftabove[25*2]) =
-{
-    /* vp8_block2left */
-    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-    /* vp8_block2above */
-    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
-};
-
-void vp8_init_detokenizer(VP8D_COMP *dx)
-{
-    const VP8_COMMON *const oc = & dx->common;
-    MACROBLOCKD *x = & dx->mb;
-
-    dx->detoken.vp8_coef_tree_ptr = vp8_coef_tree;
-    dx->detoken.ptr_block2leftabove = vp8_block2leftabove;
-    dx->detoken.ptr_coef_bands_x = vp8_coef_bands_x;
-    dx->detoken.scan = vp8_default_zig_zag1d;
-    dx->detoken.teb_base_ptr = vp8d_token_extra_bits2;
-    dx->detoken.qcoeff_start_ptr = &x->qcoeff[0];
-
-    dx->detoken.coef_probs[0] = (oc->fc.coef_probs [0] [ 0 ] [0]);
-    dx->detoken.coef_probs[1] = (oc->fc.coef_probs [1] [ 0 ] [0]);
-    dx->detoken.coef_probs[2] = (oc->fc.coef_probs [2] [ 0 ] [0]);
-    dx->detoken.coef_probs[3] = (oc->fc.coef_probs [3] [ 0 ] [0]);
-}
-#endif
-
 DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
 #define FILL \
    if(count < 0) \
@@ -202,35 +171,6 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
    }\
    NORMALIZE

-#if CONFIG_ARM_ASM_DETOK
-int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
-{
-    int eobtotal = 0;
-    int i, type;
-
-    dx->detoken.current_bc = x->current_bc;
-    dx->detoken.A = x->above_context;
-    dx->detoken.L = x->left_context;
-
-    type = 3;
-
-    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        type = 1;
-        eobtotal -= 16;
-    }
-
-    vp8_decode_mb_tokens_v6(&dx->detoken, type);
-
-    for (i = 0; i < 25; i++)
-    {
-        x->eobs[i] = dx->detoken.eob[i];
-        eobtotal += dx->detoken.eob[i];
-    }
-
-    return eobtotal;
-}
-#else
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
    ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
@@ -423,4 +363,3 @@ BLOCK_FINISHED:
    return eobtotal;

 }
-#endif /*!CONFIG_ASM_DETOK*/
--- a/vp8/decoder/detokenize.h
+++ b/vp8/decoder/detokenize.h
@@ -14,10 +14,6 @@

 #include "onyxd_int.h"

-#if ARCH_ARM
-#include "arm/detokenize_arm.h"
-#endif
-
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
 int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);

--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -27,12 +27,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
    pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
    pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_c;
    pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_c;
-    pbi->dboolhuff.start             = vp8dx_start_decode_c;
-    pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
-#if 0 /*For use with RTCD, when implemented*/
-    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
-    pbi->dboolhuff.devalue = vp8dx_decode_value_c;
-#endif
 #endif

 #if ARCH_X86 || ARCH_X86_64
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -114,8 +114,10 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
    pbi->ready_for_new_data = 1;

    pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/
+#if CONFIG_MULTITHREAD
    pbi->max_threads = oxcf->max_threads;
    vp8_decoder_create_threads(pbi);
+#endif

    /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
     *  unnecessary calling of vp8cx_init_de_quantizer() for every frame.
@@ -131,9 +133,6 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
        cm->last_sharpness_level = cm->sharpness_level;
    }

-#if CONFIG_ARM_ASM_DETOK
-    vp8_init_detokenizer(pbi);
-#endif
    pbi->common.error.setjmp = 0;
    return (VP8D_PTR) pbi;
 }
@@ -149,8 +148,8 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
 #if CONFIG_MULTITHREAD
    if (pbi->b_multithreaded_rd)
        vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
-#endif
    vp8_decoder_remove_threads(pbi);
+#endif
    vp8_remove_common(&pbi->common);
    vpx_free(pbi);
 }
@@ -254,12 +253,7 @@ static void ref_cnt_fb (int *buf, int *idx, int new_idx)
 /* If any buffer copy / swapping is signalled it should be done here. */
 static int swap_frame_buffers (VP8_COMMON *cm)
 {
-    int fb_to_update_with, err = 0;
-
-    if (cm->refresh_last_frame)
-        fb_to_update_with = cm->lst_fb_idx;
-    else
-        fb_to_update_with = cm->new_fb_idx;
+    int err = 0;

    /* The alternate reference frame or golden frame can be updated
     *  using the new, last, or golden/alt ref frame.  If it
@@ -271,7 +265,7 @@ static int swap_frame_buffers (VP8_COMMON *cm)
        int new_fb = 0;

        if (cm->copy_buffer_to_arf == 1)
-            new_fb = fb_to_update_with;
+            new_fb = cm->lst_fb_idx;
        else if (cm->copy_buffer_to_arf == 2)
            new_fb = cm->gld_fb_idx;
        else
@@ -285,7 +279,7 @@ static int swap_frame_buffers (VP8_COMMON *cm)
        int new_fb = 0;

        if (cm->copy_buffer_to_gf == 1)
-            new_fb = fb_to_update_with;
+            new_fb = cm->lst_fb_idx;
        else if (cm->copy_buffer_to_gf == 2)
            new_fb = cm->alt_fb_idx;
        else
@@ -334,6 +328,23 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign

    pbi->common.error.error_code = VPX_CODEC_OK;

+    if (size == 0)
+    {
+       /* This is used to signal that we are missing frames.
+        * We do not know if the missing frame(s) was supposed to update
+        * any of the reference buffers, but we act conservatively and
+        * mark only the last buffer as corrupted.
+        */
+        cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+        /* Signal that we have no frame to show. */
+        cm->show_frame = 0;
+
+        /* Nothing more to do. */
+        return 0;
+    }
+
+
 #if HAVE_ARMV7
 #if CONFIG_RUNTIME_CPU_DETECT
    if (cm->rtcd.flags & HAS_NEON)
@@ -356,6 +367,13 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
        }
 #endif
        pbi->common.error.setjmp = 0;
+
+       /* We do not know if the missing frame(s) was supposed to update
+        * any of the reference buffers, but we act conservative and
+        * mark only the last buffer as corrupted.
+        */
+        cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
        if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
          cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
        return -1;
@@ -388,6 +406,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
        return retcode;
    }

+#if CONFIG_MULTITHREAD
    if (pbi->b_multithreaded_rd && cm->multi_token_partition != ONE_PARTITION)
    {
        if (swap_frame_buffers (cm))
@@ -405,6 +424,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
            return -1;
        }
    } else
+#endif
    {
        if (swap_frame_buffers (cm))
        {
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -87,14 +87,15 @@ typedef struct VP8Decompressor
    unsigned int time_decoding;
    unsigned int time_loop_filtering;

+#if CONFIG_MULTITHREAD
+    /* variable for threading */
+
    volatile int b_multithreaded_rd;
    int max_threads;
    int current_mb_col_main;
    int decoding_thread_count;
    int allocated_decoding_thread_count;

-    /* variable for threading */
-#if CONFIG_MULTITHREAD
    int mt_baseline_filter_level[MAX_MB_SEGMENTS];
    int sync_range;
    int *mt_current_mb_col;                  /* Each row remembers its already decoded column. */
@@ -125,7 +126,6 @@ typedef struct VP8Decompressor

 #if CONFIG_RUNTIME_CPU_DETECT
    vp8_dequant_rtcd_vtable_t        dequant;
-    struct vp8_dboolhuff_rtcd_vtable dboolhuff;
 #endif


--- a/vp8/decoder/reconintra_mt.c
+++ b/vp8/decoder/reconintra_mt.c
@@ -21,7 +21,6 @@

 void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */
    unsigned char *yleft_col;
    unsigned char yleft_buf[16];
@@ -146,17 +145,10 @@ void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row
    case MB_MODE_COUNT:
        break;
    }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }

 void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */
    unsigned char *yleft_col;
    unsigned char yleft_buf[16];
@@ -289,17 +281,10 @@ void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_r
    case MB_MODE_COUNT:
        break;
    }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }

 void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    unsigned char *uabove_row;   /* = x->dst.u_buffer - x->dst.uv_stride; */
    unsigned char *uleft_col;    /*[16];*/
    unsigned char uleft_buf[8];
@@ -452,17 +437,10 @@ void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_ro
    case MB_MODE_COUNT:
        break;
    }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }

 void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    unsigned char *uabove_row;  /* = x->dst.u_buffer - x->dst.uv_stride; */
    unsigned char *uleft_col;   /*[16];*/
    unsigned char uleft_buf[8];
@@ -621,12 +599,6 @@ void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_
    case MB_MODE_COUNT:
        break;
    }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }


@@ -638,7 +610,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
                          int mb_col,
                          int num)
 {
-#if CONFIG_MULTITHREAD
    int i, r, c;

    unsigned char *Above;   /* = *(x->base_dst) + x->dst - x->dst_stride; */
@@ -935,15 +906,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,


    }
-#else
-    (void) pbi;
-    (void) xd;
-    (void) b_mode;
-    (void) predictor;
-    (void) mb_row;
-    (void) mb_col;
-    (void) num;
-#endif
 }

 /* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
@@ -951,7 +913,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
 */
 void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    unsigned char *above_right;   /* = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; */
    unsigned int *src_ptr;
    unsigned int *dst_ptr0;
@@ -973,10 +934,4 @@ void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row
    *dst_ptr0 = *src_ptr;
    *dst_ptr1 = *src_ptr;
    *dst_ptr2 = *src_ptr;
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -9,7 +9,7 @@
 */


-#ifndef WIN32
+#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
 # include <unistd.h>
 #endif
 #ifdef __APPLE__
@@ -38,7 +38,6 @@ extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);

 void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
-#if CONFIG_MULTITHREAD
    VP8_COMMON *const pc = & pbi->common;
    int i, j;

@@ -88,18 +87,11 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC

    for (i=0; i< pc->mb_rows; i++)
        pbi->mt_current_mb_col[i]=-1;
-#else
-    (void) pbi;
-    (void) xd;
-    (void) mbrd;
-    (void) count;
-#endif
 }


 void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
    int eobtotal = 0;
    int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
    VP8_COMMON *pc = &pbi->common;
@@ -222,18 +214,11 @@ void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb
                    (xd->qcoeff+16*16, xd->block[16].dequant,
                     xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
                     xd->dst.uv_stride, xd->eobs+16);
-#else
-    (void) pbi;
-    (void) xd;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }


 THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
 {
-#if CONFIG_MULTITHREAD
    int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
    VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
    MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
@@ -320,7 +305,7 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
                             * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
                             * Apply any context driven MB level adjustment
                             */
-                            vp8_adjust_mb_lf_value(xd, &filter_level);
+                            filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
                        }

                        /* Distance of Mb to the various image edges.
@@ -438,9 +423,6 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
            sem_post(&pbi->h_event_end_decoding);
        }
    }
-#else
-    (void) p_data;
-#endif

    return 0 ;
 }
@@ -448,10 +430,8 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)

 void vp8_decoder_create_threads(VP8D_COMP *pbi)
 {
-#if CONFIG_MULTITHREAD
    int core_count = 0;
    int ithread;
-    int i;

    pbi->b_multithreaded_rd = 0;
    pbi->allocated_decoding_thread_count = 0;
@@ -483,16 +463,11 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)

        pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
    }
-
-#else
-    (void) pbi;
-#endif
 }


 void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
 {
-#if CONFIG_MULTITHREAD
    VP8_COMMON *const pc = & pbi->common;
    int i;

@@ -590,15 +565,11 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
            pbi->mt_vleft_col = NULL ;
        }
    }
-#else
-    (void) pbi;
-#endif
 }


 void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
 {
-#if CONFIG_MULTITHREAD
    VP8_COMMON *const pc = & pbi->common;
    int i;
    int uv_width;
@@ -647,17 +618,11 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
        for (i=0; i< pc->mb_rows; i++)
            CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
    }
-#else
-    (void) pbi;
-    (void) width;
-#endif
 }


 void vp8_decoder_remove_threads(VP8D_COMP *pbi)
 {
-#if CONFIG_MULTITHREAD
-
    /* shutdown MB Decoding thread; */
    if (pbi->b_multithreaded_rd)
    {
@@ -703,15 +668,11 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
            pbi->de_thread_data = NULL;
        }
    }
-#else
-    (void) pbi;
-#endif
 }


 void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
 {
-#if CONFIG_MULTITHREAD
    VP8_COMMON *cm  = &pbi->common;
    MACROBLOCKD *mbd = &pbi->mb;
    /*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/  /*frame_to_show;*/
@@ -721,7 +682,6 @@ void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
    /*int mb_row;
    int mb_col;
    int baseline_filter_level[MAX_MB_SEGMENTS];*/
-    int filter_level;
    int alt_flt_enabled = mbd->segmentation_enabled;

    int i;
@@ -754,22 +714,17 @@ void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
        vp8_init_loop_filter(cm);
    else if (frame_type != cm->last_frame_type)
        vp8_frame_init_loop_filter(lfi, frame_type);
-#else
-    (void) pbi;
-    (void) default_filt_lvl;
-#endif
 }


 void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
-#if CONFIG_MULTITHREAD
    int mb_row;
    VP8_COMMON *pc = &pbi->common;

    int ibc = 0;
    int num_part = 1 << pbi->common.multi_token_partition;
-    int i, j;
+    int i;
    volatile int *last_row_current_mb_col = NULL;
    int nsync = pbi->sync_range;

@@ -809,7 +764,6 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)

    for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
    {
-        int i;

        xd->current_bc = &pbi->mbc[mb_row%num_part];

@@ -866,7 +820,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
                     * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
                     * Apply any context driven MB level adjustment
                     */
-                    vp8_adjust_mb_lf_value(xd, &filter_level);
+                    filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
                }

                /* Distance of Mb to the various image edges.
@@ -893,9 +847,18 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
                xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
                xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;

+                if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
+                {
+                    /* propagate errors from reference frames */
+                    xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
+                }
+
                vp8_build_uvmvs(xd, pc->full_pixel);
                vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);

+                /* check if the boolean decoder has suffered an error */
+                xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
                if (pbi->common.filter_level)
                {
                    /* Save decoded MB last row data for next-row decoding */
@@ -975,8 +938,4 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
    }

    sem_wait(&pbi->h_event_end_decoding);   /* add back for each frame */
-#else
-    (void) pbi;
-    (void) xd;
-#endif
 }
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -38,14 +38,14 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
        cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
        cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
-        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
-        cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;*/
+        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;*/
+        cpi->rtcd.variance.var16x16              = vp8_variance16x16_armv6;

        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
        cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
-        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
-        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;*/
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;*/
+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_armv6;

        /*cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
        cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -14,7 +14,7 @@
    EXPORT |vp8_stop_encode|
    EXPORT |vp8_encode_value|

-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm

    ARM
    REQUIRE8
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -11,7 +11,7 @@

    EXPORT |vp8cx_pack_tokens_armv5|

-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm

    ARM
    REQUIRE8
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -11,7 +11,7 @@

    EXPORT |vp8cx_pack_mb_row_tokens_armv5|

-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm

    ARM
    REQUIRE8
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -11,7 +11,7 @@

    EXPORT |vp8cx_pack_tokens_into_partitions_armv5|

-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm

    ARM
    REQUIRE8
@@ -65,6 +65,8 @@
 numparts_loop
    ldr     r10, [sp, #40]              ; ptr
    ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
+    sub     r5, r5, r11                 ; move start point with each partition
+                                        ; mb_rows starts at i
    str     r5,  [sp, #12]

    ; Reset all of the VP8 Writer data for each partition that
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -0,0 +1,147 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr     (16x16 block, rows source_stride apart)
+; r1    int source_stride
+; r2    unsigned char *ref_ptr     (16x16 block, rows recon_stride apart)
+; r3    int  recon_stride
+; stack unsigned int *sse          (out: sum of squared differences)
+|vp8_variance16x16_armv6| PROC
+    ; returns r0 = sse - ((sum * sum) >> 8), where sum = sum of (src - ref)
+    stmfd   sp!, {r4-r12, lr}
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0x0]      ; load 4 src pixels
+    ldr     r5, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+    ; calculate total sum (flags are not needed here, so plain add/sub)
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #0x4]      ; load 4 src pixels
+    ldr     r5, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #0x8]      ; load 4 src pixels
+    ldr     r5, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #0xc]      ; load 4 src pixels
+    ldr     r5, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return: sum * sum can exceed 2^31, so it must be shifted as unsigned
+    ldr     r6, [sp, #0x28]     ; get address of sse (above the 10 saved regs)
+    mul     r0, r8, r8          ; sum * sum (a square, so never negative)
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, LSR #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
--- a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
+++ b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
@@ -112,10 +112,7 @@
    ENDP

 ;-----------------
-    AREA    fastfdct_dat, DATA, READONLY
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _ffdct_coeff_
    DCD     ffdct_coeff
 ffdct_coeff
--- a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
+++ b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
@@ -165,10 +165,7 @@
    ENDP

 ;-----------------
-    AREA    fastfdct8x4_dat, DATA, READONLY
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _ffdct8_coeff_
    DCD     ffdct8_coeff
 ffdct8_coeff
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ b/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -122,10 +122,7 @@
    ENDP

 ;-----------------
-    AREA    dct4x4_dat, DATA, READONLY
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _dct_matrix_
    DCD     dct_matrix
 dct_matrix
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -9,7 +9,7 @@
 ;


-    EXPORT  |vp8_sub_pixel_variance16x16_neon|
+    EXPORT  |vp8_sub_pixel_variance16x16_neon_func|
    ARM
    REQUIRE8
    PRESERVE8
@@ -24,7 +24,7 @@
 ; stack(r6) unsigned int *sse
 ;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.

-|vp8_sub_pixel_variance16x16_neon| PROC
+|vp8_sub_pixel_variance16x16_neon_func| PROC
    push            {r4-r6, lr}

    ldr             r12, _BilinearTaps_coeff_
@@ -416,10 +416,7 @@ sub_pixel_variance16x16_neon_loop
    ENDP

 ;-----------------
-    AREA    vp8e_bilinear_taps_dat, DATA, READWRITE          ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _BilinearTaps_coeff_
    DCD     bilinear_taps_coeff
 bilinear_taps_coeff
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -215,10 +215,7 @@ sub_pixel_variance8x8_neon_loop
    ENDP

 ;-----------------
-    AREA    bilinear_taps_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 _BilinearTaps_coeff_
    DCD     bilinear_taps_coeff
 bilinear_taps_coeff
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "variance.h"
+#include "filter.h"
+#include "arm/bilinearfilter_arm.h"
+
+#if HAVE_ARMV6
+
+/* 16x16 sub-pixel variance: run src_ptr through the two bilinear filter
+ * passes selected by xoffset/yoffset, then hand the filtered block and
+ * dst_ptr to vp8_variance16x16_armv6, whose result is returned.
+ */
+unsigned int vp8_sub_pixel_variance16x16_armv6(
+    const unsigned char *src_ptr, int src_pixels_per_line,
+    int xoffset, int yoffset,
+    const unsigned char *dst_ptr, int dst_pixels_per_line,
+    unsigned int *sse)
+{
+    unsigned short horiz_out[36*16];
+    unsigned char  filtered[20*16];
+    const short *const horiz_taps = vp8_bilinear_filters[xoffset];
+    const short *const vert_taps  = vp8_bilinear_filters[yoffset];
+
+    /* First pass filters with the xoffset taps into horiz_out
+     * (presumably 17 rows x 16 columns -- confirm against the helper). */
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, horiz_out,
+                                            src_pixels_per_line,
+                                            17, 16, horiz_taps);
+
+    /* Second pass filters with the yoffset taps into filtered. */
+    vp8_filter_block2d_bil_second_pass_armv6(horiz_out, filtered,
+                                             16, 16, 16, vert_taps);
+
+    return vp8_variance16x16_armv6(filtered, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#endif
+
+#if HAVE_ARMV7
+
+/* 16x16 sub-pixel variance dispatcher: the offset pairs (4,0), (0,4) and
+ * (4,4) go to the dedicated half-pixel routines, everything else goes to
+ * the general vp8_sub_pixel_variance16x16_neon_func.
+ */
+unsigned int vp8_sub_pixel_variance16x16_neon(
+    const unsigned char *src_ptr, int src_pixels_per_line,
+    int xoffset, int yoffset,
+    const unsigned char *dst_ptr, int dst_pixels_per_line,
+    unsigned int *sse)
+{
+    const int half_x = (xoffset == 4);
+    const int half_y = (yoffset == 4);
+    if (half_x && half_y)
+        return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+    if (half_x && yoffset == 0)
+        return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+    if (xoffset == 0 && half_y)
+        return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+    return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#endif
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -12,6 +12,23 @@
 #ifndef VARIANCE_ARM_H
 #define VARIANCE_ARM_H

+#if HAVE_ARMV6
+
+extern prototype_variance(vp8_variance16x16_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
+
+#undef  vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
 #if HAVE_ARMV7
 extern prototype_sad(vp8_sad4x4_neon);
 extern prototype_sad(vp8_sad8x8_neon);
@@ -30,6 +47,7 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_neon);
 //extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c);
 //extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c);
 extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon_func);
 extern prototype_variance(vp8_variance_halfpixvar16x16_h_neon);
 extern prototype_variance(vp8_variance_halfpixvar16x16_v_neon);
 extern prototype_variance(vp8_variance_halfpixvar16x16_hv_neon);
--- a/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
+++ b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -12,9 +12,9 @@
 #include "vpx_ports/config.h"
 #include <stddef.h>

-#include "../treewriter.h"
-#include "../tokenize.h"
-#include "../onyx_int.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "onyx_int.h"

 #define ct_assert(name,cond) \
    static void assert_##name(void) UNUSED;\
@@ -31,6 +31,7 @@
 * {
 */

+//pack tokens
 DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));
 DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));
 DEFINE(vp8_writer_value,                        offsetof(vp8_writer, value));
@@ -40,19 +41,19 @@ DEFINE(vp8_writer_buffer,                       offsetof(vp8_writer, buffer));

 DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
 DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
-DEFINE(tokenextra_context_tree,                  offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_context_tree,                 offsetof(TOKENEXTRA, context_tree));
 DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
 DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));

-DEFINE(vp8_extra_bit_struct_sz,                   sizeof(vp8_extra_bit_struct));
+DEFINE(vp8_extra_bit_struct_sz,                 sizeof(vp8_extra_bit_struct));

 DEFINE(vp8_token_value,                         offsetof(vp8_token, value));
 DEFINE(vp8_token_len,                           offsetof(vp8_token, Len));

-DEFINE(vp8_extra_bit_struct_tree,                 offsetof(vp8_extra_bit_struct, tree));
-DEFINE(vp8_extra_bit_struct_prob,                 offsetof(vp8_extra_bit_struct, prob));
-DEFINE(vp8_extra_bit_struct_len,                  offsetof(vp8_extra_bit_struct, Len));
-DEFINE(vp8_extra_bit_struct_base_val,              offsetof(vp8_extra_bit_struct, base_val));
+DEFINE(vp8_extra_bit_struct_tree,               offsetof(vp8_extra_bit_struct, tree));
+DEFINE(vp8_extra_bit_struct_prob,               offsetof(vp8_extra_bit_struct, prob));
+DEFINE(vp8_extra_bit_struct_len,                offsetof(vp8_extra_bit_struct, Len));
+DEFINE(vp8_extra_bit_struct_base_val,           offsetof(vp8_extra_bit_struct, base_val));

 DEFINE(vp8_comp_tplist,                         offsetof(VP8_COMP, tplist));
 DEFINE(vp8_comp_common,                         offsetof(VP8_COMP, common));
@@ -62,12 +63,14 @@ DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
 DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
 DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));

-DEFINE(vp8_common_mb_rows,                       offsetof(VP8_COMMON, mb_rows));
+DEFINE(vp8_common_mb_rows,                      offsetof(VP8_COMMON, mb_rows));

-// These two sizes are used in vp7cx_pack_tokens.  They are hard coded
-//  so if the size changes this will have to be adjusted.
+// These two sizes are used in vp8cx_pack_tokens.  They are hard coded
+// so if the size changes this will have to be adjusted.
+#if HAVE_ARMV5TE
 ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
 ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
+#endif

 //add asserts for any offset that is not supported by assembly code
 //add asserts for any size that is not supported by assembly code
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1654,10 +1654,12 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
    {
        vp8_start_encode(&cpi->bc2, cx_data + bc->pos);

-        if (!cpi->b_multi_threaded)
-            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
-        else
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded)
            pack_mb_row_tokens(cpi, &cpi->bc2);
+        else
+#endif
+            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);

        vp8_stop_encode(&cpi->bc2);
        oh.first_partition_length_in_bytes = cpi->bc.pos ;
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -112,6 +112,7 @@ typedef struct

    unsigned int token_costs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];
    int optimize;
+    int q_index;

    void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
    void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -365,6 +365,33 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
    x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
    x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
    x->block[24].zbin_extra = (short)zbin_extra;
+
+    /* save this macroblock QIndex for vp8_update_zbin_extra() */
+    x->q_index = QIndex;
+}
+void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x)
+{
+    int i;
+    int QIndex = x->q_index;   /* index stored by vp8cx_mb_init_quantizer() */
+    int zbin_extra;
+    /* Recompute only zbin_extra for blocks 0-24; no other block field is written here. */
+    // Y: blocks 0-15, from Y1dequant[QIndex][1] and the full zbin_over_quant
+    zbin_extra = (cpi->common.Y1dequant[QIndex][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
+    for (i = 0; i < 16; i++)
+    {
+        x->block[i].zbin_extra = (short)zbin_extra;
+    }
+
+    // UV: blocks 16-23, same formula with UVdequant[QIndex][1]
+    zbin_extra = (cpi->common.UVdequant[QIndex][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
+    for (i = 16; i < 24; i++)
+    {
+        x->block[i].zbin_extra = (short)zbin_extra;
+    }
+
+    // Y2: block 24 only; zbin_over_quant is halved before zbin_mode_boost is added
+    zbin_extra = (cpi->common.Y2dequant[QIndex][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
+    x->block[24].zbin_extra = (short)zbin_extra;
 }

 void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
@@ -372,13 +399,6 @@ void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
    // Clear Zbin mode boost for default case
    cpi->zbin_mode_boost = 0;

-    // vp8cx_init_quantizer() is first called in vp8_create_compressor(). A check is added here so that vp8cx_init_quantizer() is only called
-    // when these values are not all zero.
-    if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q)
-    {
-        vp8cx_init_quantizer(cpi);
-    }
-
    // MB level quantizer setup
    vp8cx_mb_init_quantizer(cpi, &cpi->mb);
 }
@@ -408,7 +428,6 @@ unsigned int vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
    int sum;
    unsigned int a;
    unsigned int b;
-    unsigned int d;
    /* TODO: This could also be done over smaller areas (8x8), but that would
     *  require extensive changes elsewhere, as lambda is assumed to be fixed
     *  over an entire MB in most of the code.
@@ -461,6 +480,16 @@ void encode_mb_row(VP8_COMP *cpi,
    int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
    int seg_map_index = (mb_row * cpi->common.mb_cols);

+#if CONFIG_MULTITHREAD
+    const int nsync = cpi->mt_sync_range;        /* sync granularity, in macroblock columns */
+    const int rightmost_col = cm->mb_cols - 1;   /* index of the last macroblock column */
+    volatile const int *last_row_current_mb_col; /* column progress recorded for the row above */
+
+    if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
+        last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
+    else  /* row 0 or single-threaded: no row above to track, so point at the final column index */
+        last_row_current_mb_col = &rightmost_col;
+#endif

    // reset above block coeffs
    xd->above_context = cm->above_context;
@@ -506,6 +535,21 @@ void encode_mb_row(VP8_COMP *cpi,
        x->rddiv = cpi->RDDIV;
        x->rdmult = cpi->RDMULT;

+#if CONFIG_MULTITHREAD
+        if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
+        {
+            /* Check only when mb_col is a multiple of nsync. NOTE(review): the mask test presumes nsync is a power of two - confirm. */
+            if ((mb_col & (nsync - 1)) == 0)
+            {
+                /* Wait until the row above is at least nsync columns ahead, or has reached its last column. */
+                while (mb_col > (*last_row_current_mb_col - nsync)
+                        && (*last_row_current_mb_col) != (cm->mb_cols - 1))
+                {
+                    x86_pause_hint();
+                    thread_sleep(0);
+                }
+            }
+        }
+#endif
+
        if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
            activity_sum += vp8_activity_masking(cpi, x);

@@ -602,7 +646,12 @@ void encode_mb_row(VP8_COMP *cpi,
        x->partition_info++;

        xd->above_context++;
-        cpi->current_mb_col_main = mb_col;
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded != 0)
+        {
+            /* Record the column just completed in this row; the wait loop for row mb_row + 1 reads this entry. */
+            cpi->mt_current_mb_col[mb_row] = mb_col;
+        }
+#endif
    }

    //extend the recon for intra prediction
@@ -616,12 +665,15 @@ void encode_mb_row(VP8_COMP *cpi,
    xd->mode_info_context++;
    x->partition_info++;
    x->activity_sum += activity_sum;
+
+#if CONFIG_MULTITHREAD
+    if ((cpi->b_multi_threaded != 0) && (mb_row == cm->mb_rows - 1))
+    {
+        sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+    }
+#endif
 }

-
-
-
-
 void vp8_encode_frame(VP8_COMP *cpi)
 {
    int mb_row;
@@ -629,7 +681,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
    VP8_COMMON *const cm = & cpi->common;
    MACROBLOCKD *const xd = & x->e_mbd;

-    int i;
    TOKENEXTRA *tp = cpi->tok;
    int segment_counts[MAX_MB_SEGMENTS];
    int totalrate;
@@ -712,9 +763,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
    }

    vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
-    //vp8_initialize_rd_consts( cpi, vp8_dc_quant(cpi->avg_frame_qindex, cm->y1dc_delta_q) );
    vp8cx_initialize_me_consts(cpi, cm->base_qindex);
-    //vp8cx_initialize_me_consts( cpi, cpi->avg_frame_qindex);

    // Copy data over into macro block data sturctures.

@@ -734,20 +783,6 @@ void vp8_encode_frame(VP8_COMP *cpi)

    x->activity_sum = 0;

-#if 0
-    // Experimental rd code
-    // 2 Pass - Possibly set Rdmult based on last frame distortion + this frame target bits or other metrics
-    // such as cpi->rate_correction_factor that indicate relative complexity.
-    /*if ( cpi->pass == 2 && (cpi->last_frame_distortion > 0) && (cpi->target_bits_per_mb > 0) )
-    {
-        //x->rdmult = ((cpi->last_frame_distortion * 256)/cpi->common.MBs)/ cpi->target_bits_per_mb;
-        x->rdmult = (int)(cpi->RDMULT * cpi->rate_correction_factor);
-    }
-    else
-        x->rdmult = cpi->RDMULT; */
-    //x->rdmult = (int)(cpi->RDMULT * pow( (cpi->rate_correction_factor * 2.0), 0.75 ));
-#endif
-
    xd->mode_info_context->mbmi.mode = DC_PRED;
    xd->mode_info_context->mbmi.uv_mode = DC_PRED;

@@ -765,7 +800,76 @@ void vp8_encode_frame(VP8_COMP *cpi)
        struct vpx_usec_timer  emr_timer;
        vpx_usec_timer_start(&emr_timer);

-        if (!cpi->b_multi_threaded)
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded)
+        {
+            int i;
+
+            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);
+            /* Reset every row's column progress counter before any row is encoded. */
+            for (i = 0; i < cm->mb_rows; i++)
+                cpi->mt_current_mb_col[i] = 0;
+            /* Post the start semaphore of each encoding thread. */
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                sem_post(&cpi->h_event_start_encoding[i]);
+            }
+            /* This thread encodes rows 0, N+1, 2(N+1), ... where N is encoding_thread_count. */
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
+            {
+                vp8_zero(cm->left_context)
+
+                tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+
+                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+                // adjust to the next row of mbs handled here, i.e. (encoding_thread_count + 1) rows further on
+                x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+                x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+                x->src.v_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+                x->partition_info  += xd->mode_info_stride * cpi->encoding_thread_count;
+
+            }
+
+            sem_wait(&cpi->h_event_end_encoding); /* wait for other threads to finish */
+
+            cpi->tok_count = 0;
+            /* Total token count is the sum of (stop - start) over every row's tplist entry. */
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+            {
+                cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
+            }
+
+            if (xd->segmentation_enabled)
+            {
+                int i, j;   /* this i shadows the i declared at the top of the branch */
+
+                if (xd->segmentation_enabled)   /* NOTE(review): repeats the enclosing test; redundant */
+                {
+
+                    for (i = 0; i < cpi->encoding_thread_count; i++)
+                    {
+                        for (j = 0; j < 4; j++)
+                            segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
+                    }
+                }
+            }
+            /* Add each thread's rate and activity totals to the values accumulated by this thread. */
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                totalrate += cpi->mb_row_ei[i].totalrate;
+            }
+
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;
+            }
+
+        }
+        else
+#endif
        {
            // for each macroblock row in image
            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
@@ -783,100 +887,6 @@ void vp8_encode_frame(VP8_COMP *cpi)

            cpi->tok_count = tp - cpi->tok;

-        }
-        else
-        {
-#if CONFIG_MULTITHREAD
-            int i;
-
-            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);
-
-            for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
-            {
-                cpi->current_mb_col_main = -1;
-
-                for (i = 0; i < cpi->encoding_thread_count; i++)
-                {
-                    if ((mb_row + i + 1) >= cm->mb_rows)
-                        break;
-
-                    cpi->mb_row_ei[i].mb_row = mb_row + i + 1;
-                    cpi->mb_row_ei[i].tp  = cpi->tok + (mb_row + i + 1) * (cm->mb_cols * 16 * 24);
-                    cpi->mb_row_ei[i].current_mb_col = -1;
-                    //SetEvent(cpi->h_event_mbrencoding[i]);
-                    sem_post(&cpi->h_event_mbrencoding[i]);
-                }
-
-                vp8_zero(cm->left_context)
-
-                tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
-
-                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
-
-                // adjust to the next row of mbs
-                x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
-                x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
-                x->src.v_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
-
-                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
-                x->partition_info  += xd->mode_info_stride * cpi->encoding_thread_count;
-
-                if (mb_row < cm->mb_rows - 1)
-                    //WaitForSingleObject(cpi->h_event_main, INFINITE);
-                    sem_wait(&cpi->h_event_main);
-            }
-
-            /*
-            for( ;mb_row<cm->mb_rows; mb_row ++)
-            {
-            vp8_zero( cm->left_context)
-
-            tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
-
-            encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
-            // adjust to the next row of mbs
-            x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-            x->src.u_buffer +=  8 * x->src.uv_stride - 8 * cm->mb_cols;
-            x->src.v_buffer +=  8 * x->src.uv_stride - 8 * cm->mb_cols;
-
-            }
-            */
-            cpi->tok_count = 0;
-
-            for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
-            {
-                cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
-            }
-
-            if (xd->segmentation_enabled)
-            {
-
-                int i, j;
-
-                if (xd->segmentation_enabled)
-                {
-
-                    for (i = 0; i < cpi->encoding_thread_count; i++)
-                    {
-                        for (j = 0; j < 4; j++)
-                            segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
-                    }
-                }
-
-            }
-
-            for (i = 0; i < cpi->encoding_thread_count; i++)
-            {
-                totalrate += cpi->mb_row_ei[i].totalrate;
-            }
-
-            for (i = 0; i < cpi->encoding_thread_count; i++)
-            {
-                x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;
-            }
-
-#endif
-
        }

        vpx_usec_timer_mark(&emr_timer);
@@ -1138,77 +1148,41 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
 int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 {
    int Error4x4, Error16x16, error_uv;
-    B_PREDICTION_MODE intra_bmodes[16];
    int rate4x4, rate16x16, rateuv;
    int dist4x4, dist16x16, distuv;
    int rate = 0;
    int rate4x4_tokenonly = 0;
    int rate16x16_tokenonly = 0;
    int rateuv_tokenonly = 0;
-    int i;

    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

 #if !(CONFIG_REALTIME_ONLY)
-
-    if (cpi->sf.RD || cpi->compressor_speed != 2)
+    if (cpi->sf.RD && cpi->compressor_speed != 2)
    {
-        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4);
-
-        //save the b modes for possible later use
-        for (i = 0; i < 16; i++)
-            intra_bmodes[i] = x->e_mbd.block[i].bmi.mode;
+        error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+        rate += rateuv;

        Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16);

-        error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4, Error16x16);

-        vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-        rate += rateuv;
-
-        if (Error4x4 < Error16x16)
-        {
-            rate += rate4x4;
-            x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
-
-            // get back the intra block modes
-            for (i = 0; i < 16; i++)
-                x->e_mbd.block[i].bmi.mode = intra_bmodes[i];
-
-            vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-            cpi->prediction_error += Error4x4 ;
-#if 0
-            // Experimental RD code
-            cpi->frame_distortion += dist4x4;
-#endif
-        }
-        else
-        {
-            vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-            rate += rate16x16;
-
-#if 0
-            // Experimental RD code
-            cpi->prediction_error += Error16x16;
-            cpi->frame_distortion += dist16x16;
-#endif
-        }
-
-        sum_intra_stats(cpi, x);
-
-        vp8_tokenize_mb(cpi, &x->e_mbd, t);
+        rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16;
    }
    else
 #endif
    {
-
-        int rate2, distortion2;
+        int rate2, best_distortion;
        MB_PREDICTION_MODE mode, best_mode = DC_PRED;
        int this_rd;
        Error16x16 = INT_MAX;

+        vp8_pick_intra_mbuv_mode(x);
+
        for (mode = DC_PRED; mode <= TM_PRED; mode ++)
        {
+            int distortion2;
+
            x->e_mbd.mode_info_context->mbmi.mode = mode;
            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
@@ -1219,35 +1193,28 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
            {
                Error16x16 = this_rd;
                best_mode = mode;
+                best_distortion = distortion2;
            }
        }
+        x->e_mbd.mode_info_context->mbmi.mode = best_mode;

-        vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &distortion2);
-
-        if (distortion2 == INT_MAX)
-            Error4x4 = INT_MAX;
-        else
-            Error4x4 = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
-
-        if (Error4x4 < Error16x16)
-        {
-            x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
-            vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-            cpi->prediction_error += Error4x4;
-        }
-        else
-        {
-            x->e_mbd.mode_info_context->mbmi.mode = best_mode;
-            vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-            cpi->prediction_error += Error16x16;
-        }
-
-        vp8_pick_intra_mbuv_mode(x);
-        vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-        sum_intra_stats(cpi, x);
-        vp8_tokenize_mb(cpi, &x->e_mbd, t);
+        Error4x4 = vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &best_distortion);
    }

+    if (Error4x4 < Error16x16)
+    {
+        x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
+        vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+    }
+    else
+    {
+        vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+    }
+
+    vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+    sum_intra_stats(cpi, x);
+    vp8_tokenize_mb(cpi, &x->e_mbd, t);
+
    return rate;
 }
 #ifdef SPEEDSTATS
@@ -1279,10 +1246,17 @@ int vp8cx_encode_inter_macroblock

    if (cpi->sf.RD)
    {
+        int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
+
        /* Are we using the fast quantizer for the mode selection? */
        if(cpi->sf.use_fastquant_for_pick)
+        {
            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);

+            /* the fast quantizer does not use zbin_extra, so
+             * do not recalculate */
+            cpi->zbin_mode_boost_enabled = 0;
+        }
        inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);

        /* switch back to the regular quantizer for the encode */
@@ -1291,6 +1265,9 @@ int vp8cx_encode_inter_macroblock
            cpi->mb.quantize_b    = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
        }

+        /* restore cpi->zbin_mode_boost_enabled */
+        cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
+
    }
    else
 #endif
@@ -1307,7 +1284,7 @@ int vp8cx_encode_inter_macroblock
 #endif

    // MB level adjutment to quantizer setup
-    if (xd->segmentation_enabled || cpi->zbin_mode_boost_enabled)
+    if (xd->segmentation_enabled)
    {
        // If cyclic update enabled
        if (cpi->cyclic_refresh_mode_enabled)
@@ -1317,9 +1294,14 @@ int vp8cx_encode_inter_macroblock
                ((xd->mode_info_context->mbmi.ref_frame != LAST_FRAME) || (xd->mode_info_context->mbmi.mode != ZEROMV)))
            {
                xd->mode_info_context->mbmi.segment_id = 0;
+
+                /* segment_id changed, so update */
+                vp8cx_mb_init_quantizer(cpi, x);
            }
        }
+    }

+    {
        // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to supress noise
        if (cpi->zbin_mode_boost_enabled)
        {
@@ -1343,7 +1325,7 @@ int vp8cx_encode_inter_macroblock
        else
            cpi->zbin_mode_boost = 0;

-        vp8cx_mb_init_quantizer(cpi,  x);
+        vp8_update_zbin_extra(cpi, x);
    }

    cpi->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -58,21 +58,6 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK
    RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
 }

-void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
-{
-    vp8_predict_intra4x4(b, best_mode, b->predictor);
-
-    ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
-
-    x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
-
-    x->quantize_b(be, b);
-
-    IDCT_INVOKE(&rtcd->common->idct, idct16)(b->dqcoeff, b->diff, 32);
-
-    RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-
 void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
 {
    int i;
@@ -144,51 +129,6 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    }
 }

-void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
-{
-    int b;
-
-    vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
-
-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
-
-    vp8_transform_intra_mby(x);
-
-    vp8_quantize_mby(x);
-
-    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-
-    RECON_INVOKE(&rtcd->common->recon, recon_mby)
-        (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
-
-    // make sure block modes are set the way we want them for context updates
-    for (b = 0; b < 16; b++)
-    {
-        BLOCKD *d = &x->e_mbd.block[b];
-
-        switch (x->e_mbd.mode_info_context->mbmi.mode)
-        {
-
-        case DC_PRED:
-            d->bmi.mode = B_DC_PRED;
-            break;
-        case V_PRED:
-            d->bmi.mode = B_VE_PRED;
-            break;
-        case H_PRED:
-            d->bmi.mode = B_HE_PRED;
-            break;
-        case TM_PRED:
-            d->bmi.mode = B_TM_PRED;
-            break;
-        default:
-            d->bmi.mode = B_DC_PRED;
-            break;
-
-        }
-    }
-}
-
 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
    vp8_build_intra_predictors_mbuv(&x->e_mbd);
@@ -213,17 +153,3 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
 }

-void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
-{
-    vp8_build_intra_predictors_mbuv(&x->e_mbd);
-
-    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
-
-    vp8_transform_mbuv(x);
-
-    vp8_quantize_mbuv(x);
-
-    vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-
-    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
-}
--- a/vp8/encoder/encodeintra.h
+++ b/vp8/encoder/encodeintra.h
@@ -19,7 +19,5 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb);
 void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
 void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode);
 void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
-void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
-void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);

 #endif
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -273,7 +273,6 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
    int x;
    int sz;
    int next;
-    int path;
    int rdmult;
    int rddiv;
    int final_eob;
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -128,7 +128,7 @@ static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)

        while (--i > 3);

-        if (x & 240)
+        if (x & 0xFFF0)
            cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
    }

--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -8,15 +8,18 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
 #include "onyx_int.h"
 #include "threading.h"
 #include "common.h"
 #include "extend.h"

+#if CONFIG_MULTITHREAD

-extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);
-extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+                                         TOKENEXTRA **t, int recon_yoffset,
+                                         int recon_uvoffset);
+extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x,
+                                          TOKENEXTRA **t);
 extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
 extern void vp8_build_block_offsets(MACROBLOCK *x);
 extern void vp8_setup_block_ptrs(MACROBLOCK *x);
@@ -24,12 +27,12 @@ extern void vp8_setup_block_ptrs(MACROBLOCK *x);
 static
 THREAD_FUNCTION thread_encoding_proc(void *p_data)
 {
-#if CONFIG_MULTITHREAD
    int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
-    VP8_COMP *cpi   = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
+    VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
    MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
    ENTROPY_CONTEXT_PLANES mb_row_left_context;

+    const int nsync = cpi->mt_sync_range;
    //printf("Started thread %d\n", ithread);

    while (1)
@@ -38,218 +41,213 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
            break;

        //if(WaitForSingleObject(cpi->h_event_mbrencoding[ithread], INFINITE) == WAIT_OBJECT_0)
-        if (sem_wait(&cpi->h_event_mbrencoding[ithread]) == 0)
+        if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0)
        {
+            VP8_COMMON *cm = &cpi->common;
+            int mb_row;
+            MACROBLOCK *x = &mbri->mb;
+            MACROBLOCKD *xd = &x->e_mbd;
+            TOKENEXTRA *tp ;
+
+            int *segment_counts = mbri->segment_counts;
+            int *totalrate = &mbri->totalrate;
+
            if (cpi->b_multi_threaded == FALSE) // we're shutting down
                break;
-            else
+
+            for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
            {
-                VP8_COMMON *cm      = &cpi->common;
-                int mb_row           = mbri->mb_row;
-                MACROBLOCK  *x      = &mbri->mb;
-                MACROBLOCKD *xd     = &x->e_mbd;
-                TOKENEXTRA **tp     = &mbri->tp;
-                int *segment_counts  = mbri->segment_counts;
-                int *totalrate      = &mbri->totalrate;

+                int i;
+                int recon_yoffset, recon_uvoffset;
+                int mb_col;
+                int ref_fb_idx = cm->lst_fb_idx;
+                int dst_fb_idx = cm->new_fb_idx;
+                int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+                int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+                volatile int *last_row_current_mb_col;
+                INT64 activity_sum = 0;
+
+                tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24));
+
+                last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
+
+                // reset above block coeffs
+                xd->above_context = cm->above_context;
+                xd->left_context = &mb_row_left_context;
+
+                vp8_zero(mb_row_left_context);
+
+                xd->up_available = (mb_row != 0);
+                recon_yoffset = (mb_row * recon_y_stride * 16);
+                recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+                cpi->tplist[mb_row].start = tp;
+
+                //printf("Thread mb_row = %d\n", mb_row);
+
+                // for each macroblock col in image
+                for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
                {
-                    int i;
-                    int recon_yoffset, recon_uvoffset;
-                    int mb_col;
-                    int ref_fb_idx = cm->lst_fb_idx;
-                    int dst_fb_idx = cm->new_fb_idx;
-                    int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-                    int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-                    volatile int *last_row_current_mb_col;
-                    INT64 activity_sum = 0;
+                    int seg_map_index = (mb_row * cm->mb_cols);

-                    if (ithread > 0)
-                        last_row_current_mb_col = &cpi->mb_row_ei[ithread-1].current_mb_col;
-                    else
-                        last_row_current_mb_col = &cpi->current_mb_col_main;
-
-                    // reset above block coeffs
-                    xd->above_context = cm->above_context;
-                    xd->left_context = &mb_row_left_context;
-
-                    vp8_zero(mb_row_left_context);
-
-                    xd->up_available = (mb_row != 0);
-                    recon_yoffset = (mb_row * recon_y_stride * 16);
-                    recon_uvoffset = (mb_row * recon_uv_stride * 8);
-
-
-                    cpi->tplist[mb_row].start = *tp;
-
-                    //printf("Thread mb_row = %d\n", mb_row);
-
-                    // for each macroblock col in image
-                    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+                    if ((mb_col & (nsync - 1)) == 0)
                    {
-                        int seg_map_index = (mb_row * cm->mb_cols);
-
-                        while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != cm->mb_cols - 1)
+                        while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != cm->mb_cols - 1)
                        {
                            x86_pause_hint();
                            thread_sleep(0);
                        }
-
-                        // Distance of Mb to the various image edges.
-                        // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
-                        xd->mb_to_left_edge = -((mb_col * 16) << 3);
-                        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-                        xd->mb_to_top_edge = -((mb_row * 16) << 3);
-                        xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-
-                        // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
-                        x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
-                        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
-                        x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-                        x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
-
-                        xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-                        xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-                        xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-                        xd->left_available = (mb_col != 0);
-
-                        x->rddiv = cpi->RDDIV;
-                        x->rdmult = cpi->RDMULT;
-
-                        if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
-                            activity_sum += vp8_activity_masking(cpi, x);
-
-                        // Is segmentation enabled
-                        // MB level adjutment to quantizer
-                        if (xd->segmentation_enabled)
-                        {
-                            // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
-                            if (cpi->segmentation_map[seg_map_index+mb_col] <= 3)
-                                xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[seg_map_index+mb_col];
-                            else
-                                xd->mode_info_context->mbmi.segment_id = 0;
-
-                            vp8cx_mb_init_quantizer(cpi, x);
-                        }
-                        else
-                            xd->mode_info_context->mbmi.segment_id = 0;         // Set to Segment 0 by default
-
-                        x->active_ptr = cpi->active_map + seg_map_index + mb_col;
-
-                        if (cm->frame_type == KEY_FRAME)
-                        {
-                            *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp);
-#ifdef MODE_STATS
-                            y_modes[xd->mbmi.mode] ++;
-#endif
-                        }
-                        else
-                        {
-                            *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset);
-
-#ifdef MODE_STATS
-                            inter_y_modes[xd->mbmi.mode] ++;
-
-                            if (xd->mbmi.mode == SPLITMV)
-                            {
-                                int b;
-
-                                for (b = 0; b < xd->mbmi.partition_count; b++)
-                                {
-                                    inter_b_modes[x->partition->bmi[b].mode] ++;
-                                }
-                            }
-
-#endif
-
-                            // Count of last ref frame 0,0 useage
-                            if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
-                                cpi->inter_zz_count ++;
-
-                            // Special case code for cyclic refresh
-                            // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
-                            // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map
-                            if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
-                            {
-                                cpi->segmentation_map[seg_map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;
-
-                                // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
-                                // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
-                                // else mark it as dirty (1).
-                                if (xd->mode_info_context->mbmi.segment_id)
-                                    cpi->cyclic_refresh_map[seg_map_index+mb_col] = -1;
-                                else if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
-                                {
-                                    if (cpi->cyclic_refresh_map[seg_map_index+mb_col] == 1)
-                                        cpi->cyclic_refresh_map[seg_map_index+mb_col] = 0;
-                                }
-                                else
-                                    cpi->cyclic_refresh_map[seg_map_index+mb_col] = 1;
-
-                            }
-                        }
-                        cpi->tplist[mb_row].stop = *tp;
-
-                        x->gf_active_ptr++;      // Increment pointer into gf useage flags structure for next mb
-
-                        for (i = 0; i < 16; i++)
-                            vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi));
-
-                        // adjust to the next column of macroblocks
-                        x->src.y_buffer += 16;
-                        x->src.u_buffer += 8;
-                        x->src.v_buffer += 8;
-
-                        recon_yoffset += 16;
-                        recon_uvoffset += 8;
-
-                        // Keep track of segment useage
-                        segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
-
-                        // skip to next mb
-                        xd->mode_info_context++;
-                        x->partition_info++;
-
-                        xd->above_context++;
-
-                        cpi->mb_row_ei[ithread].current_mb_col = mb_col;
-
                    }

-                    //extend the recon for intra prediction
-                    vp8_extend_mb_row(
-                        &cm->yv12_fb[dst_fb_idx],
-                        xd->dst.y_buffer + 16,
-                        xd->dst.u_buffer + 8,
-                        xd->dst.v_buffer + 8);
+                    // Distance of Mb to the various image edges.
+                    // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                    xd->mb_to_left_edge = -((mb_col * 16) << 3);
+                    xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+                    xd->mb_to_top_edge = -((mb_row * 16) << 3);
+                    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;

-                    // this is to account for the border
+                    // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
+                    x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+                    x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+                    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+                    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+                    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+                    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+                    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+                    xd->left_available = (mb_col != 0);
+
+                    x->rddiv = cpi->RDDIV;
+                    x->rdmult = cpi->RDMULT;
+
+                    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+                        activity_sum += vp8_activity_masking(cpi, x);
+
+                    // Is segmentation enabled
+                    // MB level adjutment to quantizer
+                    if (xd->segmentation_enabled)
+                    {
+                        // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
+                        if (cpi->segmentation_map[seg_map_index + mb_col] <= 3)
+                            xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[seg_map_index + mb_col];
+                        else
+                            xd->mode_info_context->mbmi.segment_id = 0;
+
+                        vp8cx_mb_init_quantizer(cpi, x);
+                    }
+                    else
+                        xd->mode_info_context->mbmi.segment_id = 0; // Set to Segment 0 by default
+
+                    x->active_ptr = cpi->active_map + seg_map_index + mb_col;
+
+                    if (cm->frame_type == KEY_FRAME)
+                    {
+                        *totalrate += vp8cx_encode_intra_macro_block(cpi, x, &tp);
+#ifdef MODE_STATS
+                        y_modes[xd->mbmi.mode] ++;
+#endif
+                    }
+                    else
+                    {
+                        *totalrate += vp8cx_encode_inter_macroblock(cpi, x, &tp, recon_yoffset, recon_uvoffset);
+
+#ifdef MODE_STATS
+                        inter_y_modes[xd->mbmi.mode] ++;
+
+                        if (xd->mbmi.mode == SPLITMV)
+                        {
+                            int b;
+
+                            for (b = 0; b < xd->mbmi.partition_count; b++)
+                            {
+                                inter_b_modes[x->partition->bmi[b].mode] ++;
+                            }
+                        }
+
+#endif
+
+                        // Count of last ref frame 0,0 useage
+                        if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
+                            cpi->inter_zz_count++;
+
+                        // Special case code for cyclic refresh
+                        // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
+                        // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map
+                        if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
+                        {
+                            const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+                            cpi->segmentation_map[seg_map_index + mb_col] = mbmi->segment_id;
+
+                            // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
+                            // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
+                            // else mark it as dirty (1).
+                            if (mbmi->segment_id)
+                                cpi->cyclic_refresh_map[seg_map_index + mb_col] = -1;
+                            else if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
+                            {
+                                if (cpi->cyclic_refresh_map[seg_map_index + mb_col] == 1)
+                                    cpi->cyclic_refresh_map[seg_map_index + mb_col] = 0;
+                            }
+                            else
+                                cpi->cyclic_refresh_map[seg_map_index + mb_col] = 1;
+
+                        }
+                    }
+                    cpi->tplist[mb_row].stop = tp;
+
+                    x->gf_active_ptr++; // Increment pointer into gf useage flags structure for next mb
+
+                    for (i = 0; i < 16; i++)
+                        vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi));
+
+                    // adjust to the next column of macroblocks
+                    x->src.y_buffer += 16;
+                    x->src.u_buffer += 8;
+                    x->src.v_buffer += 8;
+
+                    recon_yoffset += 16;
+                    recon_uvoffset += 8;
+
+                    // Keep track of segment useage
+                    segment_counts[xd->mode_info_context->mbmi.segment_id]++;
+
+                    // skip to next mb
                    xd->mode_info_context++;
                    x->partition_info++;
-                    x->activity_sum += activity_sum;
-
-                    x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
-                    x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
-                    x->src.v_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
-
-                    xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
-                    x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
-
-                    if (ithread == (cpi->encoding_thread_count - 1) || mb_row == cm->mb_rows - 1)
-                    {
-                        //SetEvent(cpi->h_event_main);
-                        sem_post(&cpi->h_event_main);
-                    }
+                    xd->above_context++;

+                    cpi->mt_current_mb_col[mb_row] = mb_col;
                }

+                //extend the recon for intra prediction
+                vp8_extend_mb_row(
+                    &cm->yv12_fb[dst_fb_idx],
+                    xd->dst.y_buffer + 16,
+                    xd->dst.u_buffer + 8,
+                    xd->dst.v_buffer + 8);
+
+                // this is to account for the border
+                xd->mode_info_context++;
+                x->partition_info++;
+                x->activity_sum += activity_sum;
+
+                x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+                x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+                x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+                x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
+
+                if (mb_row == cm->mb_rows - 1)
+                {
+                    //SetEvent(cpi->h_event_main);
+                    sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+                }
            }
        }
    }

-#else
-    (void) p_data;
-#endif
-
    //printf("exit thread %d\n", ithread);
    return 0;
 }
@@ -363,7 +361,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    }
 }

-
 void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
                               MACROBLOCK *x,
                               MB_ROW_COMP *mbr_ei,
@@ -414,7 +411,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
        mb->src.u_buffer +=  8 * x->src.uv_stride * (i + 1);
        mb->src.v_buffer +=  8 * x->src.uv_stride * (i + 1);

-
        vp8_build_block_offsets(mb);

        vp8_setup_block_dptrs(mbd);
@@ -431,17 +427,12 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
    }
 }

-
 void vp8cx_create_encoder_threads(VP8_COMP *cpi)
 {
    cpi->b_multi_threaded = 0;

    cpi->processor_core_count = 32; //vp8_get_proc_core_count();

-    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
-
-#if CONFIG_MULTITHREAD
-
    if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
    {
        int ithread;
@@ -451,14 +442,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
        else
            cpi->encoding_thread_count = cpi->oxcf.multi_threaded - 1;

-
        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->h_event_mbrencoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
+        CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count));
        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count);
        CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * cpi->encoding_thread_count));
+        CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cpi->common.mb_rows));
+
        //cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL);
-        sem_init(&cpi->h_event_main, 0, 0);
+        sem_init(&cpi->h_event_end_encoding, 0, 0);

        cpi->b_multi_threaded = 1;

@@ -466,11 +458,13 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)

        for (ithread = 0; ithread < cpi->encoding_thread_count; ithread++)
        {
+            ENCODETHREAD_DATA * ethd = &cpi->en_thread_data[ithread];
+
            //cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL);
-            sem_init(&cpi->h_event_mbrencoding[ithread], 0, 0);
-            cpi->en_thread_data[ithread].ithread = ithread;
-            cpi->en_thread_data[ithread].ptr1 = (void *)cpi;
-            cpi->en_thread_data[ithread].ptr2 = (void *)&cpi->mb_row_ei[ithread];
+            sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+            ethd->ithread = ithread;
+            ethd->ptr1 = (void *)cpi;
+            ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];

            //printf(" call begin thread %d \n", ithread);

@@ -482,19 +476,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
            //  0,
            //  NULL);

-            pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, (&cpi->en_thread_data[ithread]));
-
+            pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);
        }

    }

-#endif
 }

 void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
 {
-#if CONFIG_MULTITHREAD
-
    if (cpi->b_multi_threaded)
    {
        //shutdown other threads
@@ -505,20 +495,21 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
            for (i = 0; i < cpi->encoding_thread_count; i++)
            {
                //SetEvent(cpi->h_event_mbrencoding[i]);
-                sem_post(&cpi->h_event_mbrencoding[i]);
+                sem_post(&cpi->h_event_start_encoding[i]);
                pthread_join(cpi->h_encoding_thread[i], 0);
-            }

-            for (i = 0; i < cpi->encoding_thread_count; i++)
-                sem_destroy(&cpi->h_event_mbrencoding[i]);
+                sem_destroy(&cpi->h_event_start_encoding[i]);
+            }
        }
+
+        sem_destroy(&cpi->h_event_end_encoding);
+
        //free thread related resources
-        vpx_free(cpi->h_event_mbrencoding);
+        vpx_free(cpi->h_event_start_encoding);
        vpx_free(cpi->h_encoding_thread);
        vpx_free(cpi->mb_row_ei);
        vpx_free(cpi->en_thread_data);
+        vpx_free(cpi->mt_current_mb_col);
    }
-
-#endif
-    vpx_free(cpi->tplist);
 }
+#endif
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -8,7 +8,6 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-
 #include "math.h"
 #include "limits.h"
 #include "block.h"
@@ -58,6 +57,7 @@ extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE];

 #define KF_MB_INTRA_MIN 300
 #define GF_MB_INTRA_MIN 200
+
 #define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)

 #define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
@@ -67,6 +67,18 @@ static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
 static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};


+const int cq_level[QINDEX_RANGE] =
+{
+    0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
+    9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
+    20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
+    32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
+    44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
+    57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
+    71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
+    86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
+};
+
 void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
 int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps);

@@ -165,40 +177,68 @@ static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    return modified_err;
 }

+static const double weight_table[256] = {
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
+0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
+0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+};
+
 double vp8_simple_weight(YV12_BUFFER_CONFIG *source)
 {
    int i, j;

    unsigned char *src = source->y_buffer;
-    unsigned char value;
    double sum_weights = 0.0;
-    double Weight;

    // Loop throught the Y plane raw examining levels and creating a weight for the image
-    for (i = 0; i < source->y_height; i++)
+    i = source->y_height;
+    do
    {
-        for (j = 0; j < source->y_width; j++)
+        j = source->y_width;
+        do
        {
-            value = src[j];
-
-            if (value >= 64)
-                Weight = 1.0;
-            else if (value > 32)
-                Weight = (value - 32.0f) / 32.0f;
-            else
-                Weight = 0.02;
-
-            sum_weights += Weight;
-        }
-
+            sum_weights += weight_table[ *src];
+            src++;
+        }while(--j);
+        src -= source->y_width;
        src += source->y_stride;
-    }
+    }while(--i);

    sum_weights /= (source->y_height * source->y_width);

    return sum_weights;
 }

+
 // This function returns the current per frame maximum bitrate target
 int frame_max_bits(VP8_COMP *cpi)
 {
@@ -249,7 +289,6 @@ extern size_t vp8_firstpass_stats_sz(unsigned int mb_count)
     * macroblock.
     */
    size_t stats_sz;
-    FIRSTPASS_STATS stats;

    stats_sz = sizeof(FIRSTPASS_STATS) + mb_count;
    stats_sz = (stats_sz + 7) & ~7;
@@ -376,8 +415,6 @@ unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi)
 }
 void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos)
 {
-    int Offset;
-
    cpi->fp_motion_map_stats = target_pos;
 }

@@ -430,7 +467,6 @@ void vp8_end_first_pass(VP8_COMP *cpi)
    vp8_output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
 }

-
 void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
 {
    MACROBLOCKD * const xd = & x->e_mbd;
@@ -450,7 +486,6 @@ void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * r
    VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
 }

-
 void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
 {
    MACROBLOCKD *const xd = & x->e_mbd;
@@ -538,7 +573,6 @@ void vp8_first_pass(VP8_COMP *cpi)

    int sum_in_vectors = 0;

-    MV best_ref_mv = {0, 0};
    MV zero_ref_mv = {0, 0};

    unsigned char *fp_motion_map_ptr = cpi->fp_motion_map;
@@ -576,13 +610,20 @@ void vp8_first_pass(VP8_COMP *cpi)
    // for each macroblock row in image
    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
    {
-        MV best_ref_mv = {0, 0};
+        int_mv best_ref_mv;
+
+        best_ref_mv.as_int = 0;

        // reset above block coeffs
        xd->up_available = (mb_row != 0);
        recon_yoffset = (mb_row * recon_y_stride * 16);
        recon_uvoffset = (mb_row * recon_uv_stride * 8);

+        // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+        x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+        x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+
        // for each macroblock col in image
        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
        {
@@ -615,8 +656,6 @@ void vp8_first_pass(VP8_COMP *cpi)
            // Set up limit values for motion vectors to prevent them extending outside the UMV borders
            x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
            x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
-            x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-            x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);

            // Other than for the first frame do a motion search
            if (cm->current_video_frame > 0)
@@ -637,12 +676,12 @@ void vp8_first_pass(VP8_COMP *cpi)

                // Test last reference frame using the previous best mv as the
                // starting point (best reference) for the search
-                vp8_first_pass_motion_search(cpi, x, &best_ref_mv,
+                vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
                                        &d->bmi.mv.as_mv, lst_yv12,
                                        &motion_error, recon_yoffset);

                // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
-                if ((best_ref_mv.col != 0) || (best_ref_mv.row != 0))
+                if (best_ref_mv.as_int)
                {
                   tmp_err = INT_MAX;
                   vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
@@ -654,7 +693,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                        d->bmi.mv.as_mv.row = tmp_mv.row;
                        d->bmi.mv.as_mv.col = tmp_mv.col;
                   }
-
                }

                // Experimental search in a second reference frame ((0,0) based only)
@@ -683,6 +721,9 @@ void vp8_first_pass(VP8_COMP *cpi)
                    xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
                }

+                /* Intra assumed best */
+                best_ref_mv.as_int = 0;
+
                if (motion_error <= this_error)
                {
                    d->bmi.mv.as_mv.row <<= 3;
@@ -698,13 +739,10 @@ void vp8_first_pass(VP8_COMP *cpi)
                    sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col;
                    intercount++;

-                    best_ref_mv.row = d->bmi.mv.as_mv.row;
-                    best_ref_mv.col = d->bmi.mv.as_mv.col;
-                    //best_ref_mv.row = 0;
-                    //best_ref_mv.col = 0;
+                    best_ref_mv.as_int = d->bmi.mv.as_int;

                    // Was the vector non-zero
-                    if (d->bmi.mv.as_mv.row || d->bmi.mv.as_mv.col)
+                    if (d->bmi.mv.as_int)
                    {
                        mvcount++;

@@ -760,12 +798,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                            *fp_motion_map_ptr = 1;
                    }
                }
-                else
-                {
-                    // Intra was best
-                    best_ref_mv.row = 0;
-                    best_ref_mv.col = 0;
-                }
            }

            coded_error += this_error;
@@ -803,6 +835,7 @@ void vp8_first_pass(VP8_COMP *cpi)
        fps.coded_error = coded_error >> 8;
        weight = vp8_simple_weight(cpi->Source);

+
        if (weight < 0.1)
            weight = 0.1;

@@ -907,7 +940,7 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
    double pow_lowq = 0.40;

    if (section_target_bandwitdh <= 0)
-        return MAXQ;
+        return cpi->maxq_max_limit;          // Highest value allowed

    target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs);

@@ -943,10 +976,12 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_

    // Correction factor used for Q values >= 20
    corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
-    corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+    corr_high = (corr_high < 0.05)
+                    ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;

-    // Try and pick a Q that should be high enough to encode the content at the given rate.
-    for (Q = 0; Q < MAXQ; Q++)
+    // Try and pick a max Q that will be high enough to encode the
+    // content at the given rate.
+    for (Q = cpi->maxq_min_limit; Q < cpi->maxq_max_limit; Q++)
    {
        int bits_per_mb_at_this_q;

@@ -965,6 +1000,28 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
            break;
    }

+    // Restriction on active max q for constrained quality mode.
+    if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+         (Q < cpi->cq_target_quality) )
+         //(Q < cpi->oxcf.cq_level;) )
+    {
+        Q = cpi->cq_target_quality;
+        //Q = cpi->oxcf.cq_level;
+    }
+
+    // Adjust maxq_min_limit and maxq_max_limit limits based on
+    // average q observed in clip for non kf/gf/arf frames
+    // Give average a chance to settle though.
+    if ( (cpi->ni_frames >
+                  ((unsigned int)cpi->total_stats->count >> 8)) &&
+         (cpi->ni_frames > 150) )
+    {
+        cpi->maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
+                                  ? (cpi->ni_av_qi + 32) : cpi->worst_quality;
+        cpi->maxq_min_limit = ((cpi->ni_av_qi - 32) > cpi->best_quality)
+                                  ? (cpi->ni_av_qi - 32) : cpi->best_quality;
+    }
+
    return Q;
 }
 static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
@@ -1113,6 +1170,79 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta

    return Q;
 }
+
+// For cq mode estimate a cq level that matches the observed complexity and
+// data rate: returns cq_level[Q] for the first Q whose estimated bits fit.
+static int estimate_cq(VP8_COMP *cpi, double section_err,
+                       int section_target_bandwitdh, int Height, int Width)
+{
+    int Q;
+    int num_mbs = ((Height * Width) / (16 * 16));
+    int target_norm_bits_per_mb;
+
+    double err_per_mb = section_err / num_mbs;
+    double correction_factor;
+    double corr_high;
+    double speed_correction = 1.0;
+    double pow_highq = 0.90;
+    double pow_lowq = 0.40;
+    double clip_iiratio;
+    double clip_iifactor;
+
+    target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
+                              ? (512 * section_target_bandwitdh) / num_mbs
+                              : 512 * (section_target_bandwitdh / num_mbs);
+
+    // Corrections for higher compression speed settings (reduced compression
+    // expected); only applied when compressor_speed is 1 or 3.
+    if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+    {
+        if (cpi->oxcf.cpu_used <= 5)
+            speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+        else
+            speed_correction = 1.25;
+    }
+    // Intra/coded error ratio correction for the clip as a whole (floor 0.80)
+    clip_iiratio = cpi->total_stats->intra_error /
+                   DOUBLE_DIVIDE_CHECK(cpi->total_stats->coded_error);
+    clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
+    if (clip_iifactor < 0.80)
+        clip_iifactor = 0.80;
+
+    // Correction factor used for Q values >= 50, clamped to [0.05, 5.0]
+    corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
+    corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+
+    // Try and pick the lowest Q that can encode the content at the given rate.
+    for (Q = 0; Q < MAXQ; Q++)
+    {
+        int bits_per_mb_at_this_q;
+
+        if (Q < 50)
+        {
+            correction_factor =
+                pow( err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01));
+
+            correction_factor = (correction_factor < 0.05) ? 0.05
+                                    : (correction_factor > 5.0) ? 5.0
+                                        : correction_factor;
+        }
+        else
+            correction_factor = corr_high;
+
+        bits_per_mb_at_this_q =
+            (int)( .5 + correction_factor *
+                        speed_correction *
+                        clip_iifactor *
+                        (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0);
+
+        if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+            break;
+    }
+
+    return cq_level[Q];
+}
+
 extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);

 void vp8_init_second_pass(VP8_COMP *cpi)
@@ -1209,6 +1339,43 @@ void vp8_end_second_pass(VP8_COMP *cpi)
 {
 }

+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame (min of three factors; cpi unused).
+double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+    double prediction_decay_rate;
+    double motion_decay;
+    double motion_pct = next_frame->pcnt_motion;
+
+
+    // Initial basis is the % mbs inter coded
+    prediction_decay_rate = next_frame->pcnt_inter;
+
+    // High % motion -> lower motion_decay value, which caps the result
+    motion_decay = (1.0 - (motion_pct / 20.0));
+    if (motion_decay < prediction_decay_rate)
+        prediction_decay_rate = motion_decay;
+
+    // Speed of motion: factor is 1 - (mv magnitude / 250), or 0 if ratio > 1
+    {
+        double this_mv_rabs;
+        double this_mv_cabs;
+        double distance_factor;
+
+        this_mv_rabs = fabs(next_frame->mvr_abs * motion_pct);
+        this_mv_cabs = fabs(next_frame->mvc_abs * motion_pct);
+
+        distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
+                               (this_mv_cabs * this_mv_cabs)) / 250.0;
+        distance_factor = ((distance_factor > 1.0)
+                                ? 0.0 : (1.0 - distance_factor));
+        if (distance_factor < prediction_decay_rate)
+            prediction_decay_rate = distance_factor;
+    }
+
+    return prediction_decay_rate;
+}
+
 // Analyse and define a gf/arf group .
 static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
@@ -1230,17 +1397,20 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    double decay_accumulator = 1.0;

    double boost_factor = IIFACTOR;
-    double loop_decay_rate = 1.00;        // Starting decay rate
+    double loop_decay_rate = 1.00;          // Starting decay rate

    double this_frame_mv_in_out = 0.0;
    double mv_in_out_accumulator = 0.0;
    double abs_mv_in_out_accumulator = 0.0;
    double mod_err_per_mb_accumulator = 0.0;

-    int max_bits = frame_max_bits(cpi);    // Max for a single frame
+    int max_bits = frame_max_bits(cpi);     // Max for a single frame

    unsigned char *fpmm_pos;

+    unsigned int allow_alt_ref =
+                    cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+
    cpi->gf_group_bits = 0;
    cpi->gf_decay_rate = 0;

@@ -1255,47 +1425,57 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    // Preload the stats for the next frame.
    mod_frame_err = calculate_modified_err(cpi, this_frame);

-    // Note the error of the frame at the start of the group (this will be the GF frame error if we code a normal gf
+    // Note the error of the frame at the start of the group (this will be
+    // the GF frame error if we code a normal gf
    gf_first_frame_err = mod_frame_err;

-    // Special treatment if the current frame is a key frame (which is also a gf).
-    // If it is then its error score (and hence bit allocation) need to be subtracted out
-    // from the calculation for the GF group
+    // Special treatment if the current frame is a key frame (which is also
+    // a gf). If it is then its error score (and hence bit allocation) need
+    // to be subtracted out from the calculation for the GF group
    if (cpi->common.frame_type == KEY_FRAME)
        gf_group_err -= gf_first_frame_err;

-    // Scan forward to try and work out how many frames the next gf group should contain and
-    // what level of boost is appropriate for the GF or ARF that will be coded with the group
+    // Scan forward to try and work out how many frames the next gf group
+    // should contain and what level of boost is appropriate for the GF
+    // or ARF that will be coded with the group
    i = 0;

-    while (((i < cpi->max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key))
+    while (((i < cpi->static_scene_max_gf_interval) ||
+            ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) &&
+           (i < cpi->frames_to_key))
    {
        double r;
        double this_frame_mvr_ratio;
        double this_frame_mvc_ratio;
        double motion_decay;
-        double motion_pct = next_frame.pcnt_motion;
+        //double motion_pct = next_frame.pcnt_motion;
+        double motion_pct;

-        i++;                                                    // Increment the loop counter
+        i++;    // Increment the loop counter

        // Accumulate error score of frames in this gf group
        mod_frame_err = calculate_modified_err(cpi, this_frame);

        gf_group_err += mod_frame_err;

-        mod_err_per_mb_accumulator += mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
+        mod_err_per_mb_accumulator +=
+            mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);

        if (EOF == vp8_input_stats(cpi, &next_frame))
            break;

        // Accumulate motion stats.
+        motion_pct = next_frame.pcnt_motion;
        mv_accumulator_rabs += fabs(next_frame.mvr_abs * motion_pct);
        mv_accumulator_cabs += fabs(next_frame.mvc_abs * motion_pct);

        //Accumulate Motion In/Out of frame stats
-        this_frame_mv_in_out = next_frame.mv_in_out_count * next_frame.pcnt_motion;
-        mv_in_out_accumulator += next_frame.mv_in_out_count * next_frame.pcnt_motion;
-        abs_mv_in_out_accumulator += fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
+        this_frame_mv_in_out =
+            next_frame.mv_in_out_count * motion_pct;
+        mv_in_out_accumulator +=
+            next_frame.mv_in_out_count * motion_pct;
+        abs_mv_in_out_accumulator +=
+            fabs(next_frame.mv_in_out_count * motion_pct);

        // If there is a significant amount of motion
        if (motion_pct > 0.05)
@@ -1324,7 +1504,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        }

        // Underlying boost factor is based on inter intra error ratio
-        r = (boost_factor * (next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)));
+        r = ( boost_factor *
+              ( next_frame.intra_error /
+                DOUBLE_DIVIDE_CHECK(next_frame.coded_error)));

        if (next_frame.intra_error > cpi->gf_intra_err_min)
            r = (IIKFACTOR2 * next_frame.intra_error /
@@ -1333,63 +1515,87 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            r = (IIKFACTOR2 * cpi->gf_intra_err_min /
                     DOUBLE_DIVIDE_CHECK(next_frame.coded_error));

-        // Increase boost for frames where new data coming into frame (eg zoom out)
-        // Slightly reduce boost if there is a net balance of motion out of the frame (zoom in)
+        // Increase boost for frames where new data coming into frame
+        // (eg zoom out). Slightly reduce boost if there is a net balance
+        // of motion out of the frame (zoom in).
        // The range for this_frame_mv_in_out is -1.0 to +1.0
        if (this_frame_mv_in_out > 0.0)
            r += r * (this_frame_mv_in_out * 2.0);
+        // In extreme case boost is halved
        else
-            r += r * (this_frame_mv_in_out / 2.0);  // In extreme case boost is halved
+            r += r * (this_frame_mv_in_out / 2.0);

        if (r > GF_RMAX)
            r = GF_RMAX;

-        // Adjust loop decay rate
-        //if ( next_frame.pcnt_inter < loop_decay_rate )
-        loop_decay_rate = next_frame.pcnt_inter;
-
-        // High % motion -> somewhat higher decay rate
-        motion_decay = (1.0 - (motion_pct / 20.0));
-        if (motion_decay < loop_decay_rate)
-            loop_decay_rate = motion_decay;
-
-        // Adjustment to decay rate based on speed of motion
-        {
-            double this_mv_rabs;
-            double this_mv_cabs;
-            double distance_factor;
-
-            this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
-            this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
-
-            distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
-                                   (this_mv_cabs * this_mv_cabs)) / 250.0;
-            distance_factor = ((distance_factor > 1.0)
-                                    ? 0.0 : (1.0 - distance_factor));
-            if (distance_factor < loop_decay_rate)
-                loop_decay_rate = distance_factor;
-        }
+        loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);

        // Cumulative effect of decay
        decay_accumulator = decay_accumulator * loop_decay_rate;
        decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
-        //decay_accumulator = ( loop_decay_rate < decay_accumulator ) ? loop_decay_rate : decay_accumulator;

        boost_score += (decay_accumulator * r);

+        // Break clause to detect very still sections after motion
+        // For example a static image after a fade or other transition
+        // instead of a clean key frame.
+        if ( (i > MIN_GF_INTERVAL) &&
+             (loop_decay_rate >= 0.999) &&
+             (decay_accumulator < 0.9) )
+        {
+            int j;
+            FIRSTPASS_STATS * position = cpi->stats_in;
+            FIRSTPASS_STATS tmp_next_frame;
+            double decay_rate;
+
+            // Look ahead a few frames to see if static condition
+            // persists...
+            for ( j = 0; j < 4; j++ )
+            {
+                if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
+                    break;
+
+                decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
+                if ( decay_rate < 0.999 )
+                    break;
+            }
+            reset_fpf_position(cpi, position);            // Reset file position
+
+            // Force GF not alt ref
+            if ( j == 4 )
+            {
+                if (0)
+                {
+                    FILE *f = fopen("fadegf.stt", "a");
+                    fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
+                         cpi->common.current_video_frame+i, i,
+                         loop_decay_rate, decay_accumulator,
+                         boost_score );
+                    fclose(f);
+                }
+
+                allow_alt_ref = FALSE;
+
+                boost_score = old_boost_score;
+                break;
+            }
+        }
+
        // Break out conditions.
-        if (   /* i>4 || */
+        if  (   /* i>4 || */
+            // Break at cpi->max_gf_interval unless almost totally static
+            (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) ||
            (
-                (i > MIN_GF_INTERVAL) &&                            // Dont break out with a very short interval
-                ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) &&      // Dont break out very close to a key frame
+                // Dont break out with a very short interval
+                (i > MIN_GF_INTERVAL) &&
+                // Dont break out very close to a key frame
+                ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) &&
                ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
                ((mv_ratio_accumulator > 100.0) ||
                 (abs_mv_in_out_accumulator > 3.0) ||
                 (mv_in_out_accumulator < -2.0) ||
-                 ((boost_score - old_boost_score) < 2.0)
-                )
-            )
-        )
+                 ((boost_score - old_boost_score) < 2.0))
+            ) )
        {
            boost_score = old_boost_score;
            break;
@@ -1400,7 +1606,8 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        old_boost_score = boost_score;
    }

-    cpi->gf_decay_rate = (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;
+    cpi->gf_decay_rate =
+        (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;

    // When using CBR apply additional buffer related upper limits
    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
@@ -1410,7 +1617,8 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        // For cbr apply buffer related limits
        if (cpi->drop_frames_allowed)
        {
-            int df_buffer_level = cpi->oxcf.drop_frames_water_mark * (cpi->oxcf.optimal_buffer_level / 100);
+            int df_buffer_level = cpi->oxcf.drop_frames_water_mark *
+                                  (cpi->oxcf.optimal_buffer_level / 100);

            if (cpi->buffer_level > df_buffer_level)
                max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
@@ -1433,10 +1641,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    cpi->gfu_boost = (int)(boost_score * 100.0) >> 4;

    // Should we use the alternate refernce frame
-    if (cpi->oxcf.play_alternate &&
-        cpi->oxcf.lag_in_frames &&
+    if (allow_alt_ref &&
        (i >= MIN_GF_INTERVAL) &&
-        (i <= (cpi->frames_to_key - MIN_GF_INTERVAL)) &&          // dont use ARF very near next kf
+        // dont use ARF very near next kf
+        (i <= (cpi->frames_to_key - MIN_GF_INTERVAL)) &&
        (((next_frame.pcnt_inter > 0.75) &&
          ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) &&
          //(cpi->gfu_boost>150) &&
@@ -1766,7 +1974,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

        vp8_avg_stats(&sectionstats);

-        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+        cpi->section_intra_rating =
+            sectionstats.intra_error /
+            DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);

        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
        //if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
@@ -1993,21 +2203,48 @@ void vp8_second_pass(VP8_COMP *cpi)

    if (cpi->common.current_video_frame == 0)
    {
-        // guess at 2nd pass q
        cpi->est_max_qcorrection_factor = 1.0;
-        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);

-        if (tmp_q < cpi->worst_quality)
+        // Experimental code to try and set a cq_level in constrained
+        // quality mode.
+        if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
        {
-            cpi->active_worst_quality         = tmp_q;
-            cpi->ni_av_qi                     = tmp_q;
-        }
-        else
-        {
-            cpi->active_worst_quality         = cpi->worst_quality;
-            cpi->ni_av_qi                     = cpi->worst_quality;
+            int est_cq;
+
+            est_cq =
+                estimate_cq( cpi,
+                             (cpi->total_coded_error_left / frames_left),
+                             (int)(cpi->bits_left / frames_left),
+                             cpi->common.Height, cpi->common.Width);
+
+            cpi->cq_target_quality = cpi->oxcf.cq_level;
+            if ( est_cq > cpi->cq_target_quality )
+                cpi->cq_target_quality = est_cq;
        }
+
+        // guess at maxq needed in 2nd pass
+        cpi->maxq_max_limit = cpi->worst_quality;
+        cpi->maxq_min_limit = cpi->best_quality;
+        tmp_q = estimate_max_q( cpi,
+                                (cpi->total_coded_error_left / frames_left),
+                                (int)(cpi->bits_left / frames_left),
+                                cpi->common.Height,
+                                cpi->common.Width);
+
+        // Limit the maxq value returned subsequently.
+        // This increases the risk of overspend or underspend if the initial
+        // estimate for the clip is bad, but helps prevent excessive
+        // variation in Q, especially near the end of a clip
+        // where for example a small overspend may cause Q to crash
+        cpi->maxq_max_limit = ((tmp_q + 32) < cpi->worst_quality)
+                                  ? (tmp_q + 32) : cpi->worst_quality;
+        cpi->maxq_min_limit = ((tmp_q - 32) > cpi->best_quality)
+                                  ? (tmp_q - 32) : cpi->best_quality;
+
+        cpi->active_worst_quality         = tmp_q;
+        cpi->ni_av_qi                     = tmp_q;
    }
+
    // The last few frames of a clip almost always have to few or too many
    // bits and for the sake of over exact rate control we dont want to make
    // radical adjustments to the allowed quantizer range just to use up a
@@ -2029,13 +2266,6 @@ void vp8_second_pass(VP8_COMP *cpi)
            cpi->active_worst_quality --;

        cpi->active_worst_quality = ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4;
-
-        // Clamp to user set limits
-        if (cpi->active_worst_quality > cpi->worst_quality)
-            cpi->active_worst_quality = cpi->worst_quality;
-        else if (cpi->active_worst_quality < cpi->best_quality)
-            cpi->active_worst_quality = cpi->best_quality;
-
    }

    cpi->frames_to_key --;
@@ -2157,6 +2387,9 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

    cpi->common.frame_type = KEY_FRAME;

+    // is this a forced key frame by interval
+    cpi->this_key_frame_forced = cpi->next_key_frame_forced;
+
    // Clear the alt ref active flag as this can never be active on a key frame
    cpi->source_alt_ref_active = FALSE;

@@ -2213,13 +2446,40 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    if (cpi->oxcf.auto_key
        && cpi->frames_to_key > (int)cpi->key_frame_frequency )
    {
+        FIRSTPASS_STATS *current_pos = cpi->stats_in;
+        FIRSTPASS_STATS tmp_frame;
+
        cpi->frames_to_key /= 2;

-        // Estimate corrected kf group error
-        kf_group_err /= 2.0;
-        kf_group_intra_err /= 2.0;
-        kf_group_coded_err /= 2.0;
+        // Copy first frame details
+        vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+
+        // Reset to the start of the group
+        reset_fpf_position(cpi, start_position);
+
+        kf_group_err = 0;
+        kf_group_intra_err = 0;
+        kf_group_coded_err = 0;
+
+        // Rescan to get the correct error data for the forced kf group
+        for( i = 0; i < cpi->frames_to_key; i++ )
+        {
+            // Accumulate kf group errors
+            kf_group_err += calculate_modified_err(cpi, &tmp_frame);
+            kf_group_intra_err += tmp_frame.intra_error;
+            kf_group_coded_err += tmp_frame.coded_error;
+
+            // Load the next frame's stats
+            vp8_input_stats(cpi, &tmp_frame);
+        }
+
+        // Restore the stats position saved before the rescan
+        reset_fpf_position(cpi, current_pos);
+
+        cpi->next_key_frame_forced = TRUE;
    }
+    else
+        cpi->next_key_frame_forced = FALSE;

    // Special case for the last frame of the file
    if (cpi->stats_in >= cpi->stats_in_end)
@@ -2313,7 +2573,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
    {
        double r;
        double motion_decay;
-        double motion_pct = next_frame.pcnt_motion;
+        double motion_pct;

        if (EOF == vp8_input_stats(cpi, &next_frame))
            break;
@@ -2333,6 +2593,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        loop_decay_rate = next_frame.pcnt_inter;

        // High % motion -> somewhat higher decay rate
+        motion_pct = next_frame.pcnt_motion;
        motion_decay = (1.0 - (motion_pct / 20.0));
        if (motion_decay < loop_decay_rate)
            loop_decay_rate = motion_decay;
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -91,8 +91,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)

    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
-
+#if !(CONFIG_REALTIME_ONLY)
    cpi->rtcd.search.full_search             = vp8_full_search_sad;
+#endif
    cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;

    cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_c;
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -408,6 +408,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
        diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
        break;
    case 3:
+    default:
        this_mv.col += 4;
        this_mv.row += 4;
        diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
@@ -778,15 +779,17 @@ int vp8_hex_search
    int *num00,
    const vp8_variance_fn_ptr_t *vfp,
    int *mvsadcost[2],
-    int *mvcost[2]
+    int *mvcost[2],
+    MV *center_mv
 )
 {
    MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
-    MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
+    MV neighbors[8] = { { -1, -1}, {0, -1}, {1, -1}, { -1, 0}, {1, 0}, { -1, 1}, {0, 1}, {1, 1} } ;
    int i, j;
    unsigned char *src = (*(b->base_src) + b->src);
    int src_stride = b->src_stride;
-    int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
+    int rr = center_mv->row, rc = center_mv->col;
+    int br = ref_mv->row >> 3, bc = ref_mv->col >> 3, tr, tc;
    unsigned int besterr, thiserr = 0x7fffffff;
    int k = -1, tk;

@@ -891,7 +894,7 @@ cal_neighbors:
    best_mv->row = br;
    best_mv->col = bc;

-    return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
+    return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + vp8_mv_err_cost(best_mv, center_mv, mvcost, error_per_bit) ;
 }
 #undef MVC
 #undef PRE
@@ -1387,8 +1390,6 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
    else
        return INT_MAX;
 }
-#endif
-

 int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
 {
@@ -1541,6 +1542,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
    else
        return INT_MAX;
 }
+#endif /* !(CONFIG_REALTIME_ONLY) */

 #ifdef ENTROPY_STATS
 void print_mode_context(void)
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -24,7 +24,6 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
 #define MAX_MVSEARCH_STEPS 8                                    // The maximum number of steps in a step search given the largest allowed initial step
 #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS+3)) - 8)    // Max full pel mv specified in 1/8 pel units
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units
-#define MAX_POSSIBLE_MV (1 << 11)                               // Maximum MV in 1/8 pel units

 extern void print_mode_context(void);
 extern int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight);
@@ -44,8 +43,8 @@ extern int vp8_hex_search
    int *num00,
    const vp8_variance_fn_ptr_t *vf,
    int *mvsadcost[2],
-    int *mvcost[2]
-
+    int *mvcost[2],
+    MV *center_mv
 );

 typedef int (fractional_mv_step_fp)
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -28,6 +28,7 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "mcomp.h"
 #include "temporal_filter.h"
+#include "findnearmv.h"

 //#define SPEEDSTATS 1
 #define MIN_GF_INTERVAL             4
@@ -183,17 +184,16 @@ typedef struct
    int optimize_coefficients;

    int use_fastquant_for_pick;
+    int no_skip_block4x4_search;
+    int improved_mv_pred;

 } SPEED_FEATURES;

 typedef struct
 {
    MACROBLOCK  mb;
-    int mb_row;
-    TOKENEXTRA *tp;
    int segment_counts[MAX_MB_SEGMENTS];
    int totalrate;
-    int current_mb_col;
 } MB_ROW_COMP;

 typedef struct
@@ -244,12 +244,6 @@ enum
    BLOCK_MAX_SEGMENTS
 };

-typedef union
-{
-    unsigned int as_int;
-    MV           as_mv;
-} int_mv;        /* facilitates rapid equality tests */
-
 typedef struct
 {

@@ -308,15 +302,17 @@ typedef struct

    YV12_BUFFER_CONFIG last_frame_uf;

-    char *Dest;
-
    TOKENEXTRA *tok;
    unsigned int tok_count;


    unsigned int frames_since_key;
    unsigned int key_frame_frequency;
-    unsigned int next_key;
+    unsigned int this_key_frame_forced;
+    unsigned int next_key_frame_forced;
+
+    // Ambient reconstruction error target for forced key frames
+    int ambient_err;

    unsigned int mode_check_freq[MAX_MODES];
    unsigned int mode_test_hit_counts[MAX_MODES];
@@ -338,11 +334,6 @@ typedef struct
    int RDMULT;
    int RDDIV ;

-    TOKENEXTRA *rdtok;
-    vp8_writer rdbc;
-    int intra_mode_costs[10];
-
-
    CODING_CONTEXT coding_context;

    // Rate targetting variables
@@ -350,7 +341,6 @@ typedef struct
    long long last_prediction_error;
    long long intra_error;
    long long last_intra_error;
-    long long last_auto_filter_prediction_error;

 #if 0
    // Experimental RD code
@@ -365,7 +355,6 @@ typedef struct
    int this_frame_target;
    int projected_frame_size;
    int last_q[2];                   // Separate values for Intra/Inter
-    int target_bits_per_mb;

    double rate_correction_factor;
    double key_frame_rate_correction_factor;
@@ -398,6 +387,7 @@ typedef struct
    int kf_overspend_bits;            // Extra bits spent on key frames that need to be recovered on inter frames
    int kf_bitrate_adjustment;        // Current number of bit s to try and recover on each inter frame.
    int max_gf_interval;
+    int static_scene_max_gf_interval;
    int baseline_gf_interval;
    int gf_decay_rate;
    int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
@@ -447,6 +437,10 @@ typedef struct
    int best_quality;
    int active_best_quality;

+    int cq_target_quality;
+    int maxq_max_limit;
+    int maxq_min_limit;
+
    int drop_frames_allowed;          // Are we permitted to drop frames?
    int drop_frame;                  // Drop this frame?
    int drop_count;                  // How many frames have we dropped?
@@ -551,8 +545,6 @@ typedef struct

    int ref_frame_flags;

-    int exp[512];
-
    SPEED_FEATURES sf;
    int error_bins[1024];

@@ -598,22 +590,21 @@ typedef struct
    int cyclic_refresh_q;
    signed char *cyclic_refresh_map;

+#if CONFIG_MULTITHREAD
    // multithread data
-    int current_mb_col_main;
+    int * mt_current_mb_col;
+    int mt_sync_range;
    int processor_core_count;
    int b_multi_threaded;
    int encoding_thread_count;

-#if CONFIG_MULTITHREAD
    pthread_t *h_encoding_thread;
-#endif
    MB_ROW_COMP *mb_row_ei;
    ENCODETHREAD_DATA *en_thread_data;

-#if CONFIG_MULTITHREAD
    //events
-    sem_t *h_event_mbrencoding;
-    sem_t h_event_main;
+    sem_t *h_event_start_encoding;
+    sem_t h_event_end_encoding;
 #endif

    TOKENLIST *tplist;
@@ -685,6 +676,9 @@ typedef struct
    int *lf_ref_frame_sign_bias;
    int *lf_ref_frame;

+#if CONFIG_REALTIME_ONLY
+    int force_next_frame_intra; /* force next frame to intra when kf_auto says so */
+#endif
 } VP8_COMP;

 void control_data_rate(VP8_COMP *cpi);
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -24,7 +24,7 @@
 #include "g_common.h"
 #include "variance.h"
 #include "mcomp.h"
-
+#include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"

 #if CONFIG_RUNTIME_CPU_DETECT
@@ -168,8 +168,6 @@ static int pick_intra4x4block(
    B_PREDICTION_MODE *best_mode,
    B_PREDICTION_MODE above,
    B_PREDICTION_MODE left,
-    ENTROPY_CONTEXT *a,
-    ENTROPY_CONTEXT *l,

    int *bestrate,
    int *bestdistortion)
@@ -179,8 +177,6 @@ static int pick_intra4x4block(
    int rate;
    int distortion;
    unsigned int *mode_costs;
-    (void) l;
-    (void) a;

    if (x->e_mbd.frame_type == KEY_FRAME)
    {
@@ -211,6 +207,7 @@ static int pick_intra4x4block(

    b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
    vp8_encode_intra4x4block(rtcd, x, be, b, b->bmi.mode);
+
    return best_rd;
 }

@@ -220,17 +217,8 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
    MACROBLOCKD *const xd = &mb->e_mbd;
    int i;
    int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
-    int error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, 0); // Rd estimate for the cost of the block prediction mode
+    int error;
    int distortion = 0;
-    ENTROPY_CONTEXT_PLANES t_above, t_left;
-    ENTROPY_CONTEXT *ta;
-    ENTROPY_CONTEXT *tl;
-
-    vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;

    vp8_intra_prediction_down_copy(xd);

@@ -243,10 +231,8 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
        B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
        int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d);

-        error += pick_intra4x4block(rtcd,
-                                    mb, mb->block + i, xd->block + i, &best_mode, A, L,
-                                    ta + vp8_block2above[i],
-                                    tl + vp8_block2left[i], &r, &d);
+        pick_intra4x4block(rtcd, mb, mb->block + i, xd->block + i,
+                               &best_mode, A, L, &r, &d);

        cost += r;
        distortion += d;
@@ -264,10 +250,15 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int
    *Rate = cost;

    if (i == 16)
+    {
        *best_dist = distortion;
+        error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, distortion);
+    }
    else
+    {
        *best_dist = INT_MAX;
-
+        error = INT_MAX;
+    }

    return error;
 }
@@ -421,7 +412,6 @@ int vp8_pick_intra_mbuv_mode(MACROBLOCK *mb)

 }

-
 int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
 {
    BLOCK *b = &x->block[0];
@@ -430,7 +420,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
    B_MODE_INFO best_bmodes[16];
    MB_MODE_INFO best_mbmode;
    PARTITION_INFO best_partition;
-    MV best_ref_mv1;
+    MV best_ref_mv;
    MV mode_mv[MB_MODE_COUNT];
    MB_PREDICTION_MODE this_mode;
    int num00;
@@ -448,9 +438,14 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
    int best_mode_index = 0;
    int sse = INT_MAX;

+    MV mvp;
+    int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    int saddone=0;
+    int sr=0;    // search range obtained from vp8_mv_pred(), in step_param levels (0-7)
+
    MV nearest_mv[4];
    MV near_mv[4];
-    MV best_ref_mv[4];
+    MV frame_best_ref_mv[4];
    int MDCounts[4][4];
    unsigned char *y_buffer[4];
    unsigned char *u_buffer[4];
@@ -470,7 +465,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];

        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[LAST_FRAME], &near_mv[LAST_FRAME],
-                          &best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);
+                          &frame_best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);

        y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset;
        u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
@@ -484,7 +479,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
        YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx];

        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[GOLDEN_FRAME], &near_mv[GOLDEN_FRAME],
-                          &best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);
+                          &frame_best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);

        y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset;
        u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
@@ -498,7 +493,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
        YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];

        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[ALTREF_FRAME], &near_mv[ALTREF_FRAME],
-                          &best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);
+                          &frame_best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);

        y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
        u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
@@ -538,10 +533,6 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                                        + vp8_cost_one(cpi->prob_gf_coded);
    }

-
-
-    best_rd = INT_MAX;
-
    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

    // if we encode a new mv this is important
@@ -604,17 +595,41 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
            x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
            mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
            mode_mv[NEARMV] = near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
-            best_ref_mv1 = best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
+            best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
            memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
        }

-        //Only consider ZEROMV/ALTREF_FRAME for alt ref frame.
-        if (cpi->is_src_frame_alt_ref)
+        // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+        // unless ARNR filtering is enabled in which case we want
+        // an unfiltered alternative
+        if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
        {
            if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
                continue;
        }

+        if(cpi->sf.improved_mv_pred && x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
+        {
+            if(!saddone)
+            {
+                vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+                saddone = 1;
+            }
+
+            vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+                        x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
+            /* adjust mvp to make sure it is within MV range */
+            if(mvp.row > best_ref_mv.row + MAX_FULL_PEL_VAL)
+                mvp.row = best_ref_mv.row + MAX_FULL_PEL_VAL;
+            else if(mvp.row < best_ref_mv.row - MAX_FULL_PEL_VAL)
+                mvp.row = best_ref_mv.row - MAX_FULL_PEL_VAL;
+            if(mvp.col > best_ref_mv.col + MAX_FULL_PEL_VAL)
+                mvp.col = best_ref_mv.col + MAX_FULL_PEL_VAL;
+            else if(mvp.col < best_ref_mv.col - MAX_FULL_PEL_VAL)
+                mvp.col = best_ref_mv.col - MAX_FULL_PEL_VAL;
+        }
+
        switch (this_mode)
        {
        case B_PRED:
@@ -670,61 +685,59 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
            int n = 0;
            int sadpb = x->sadperbit16;

+            int col_min;
+            int col_max;
+            int row_min;
+            int row_max;
+
+            int tmp_col_min = x->mv_col_min;
+            int tmp_col_max = x->mv_col_max;
+            int tmp_row_min = x->mv_row_min;
+            int tmp_row_max = x->mv_row_max;
+
+            int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8)? 3 : 2) : 1;
+
            // Further step/diamond searches as necessary
-            if (cpi->Speed < 8)
+            step_param = cpi->sf.first_step + speed_adjust;
+
+            if(cpi->sf.improved_mv_pred)
            {
-                step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
-                further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-            }
-            else
+                sr += speed_adjust;
+                //adjust search range according to sr from mv prediction
+                if(sr > step_param)
+                    step_param = sr;
+
+                col_min = (best_ref_mv.col - MAX_FULL_PEL_VAL) >>3;
+                col_max = (best_ref_mv.col + MAX_FULL_PEL_VAL) >>3;
+                row_min = (best_ref_mv.row - MAX_FULL_PEL_VAL) >>3;
+                row_max = (best_ref_mv.row + MAX_FULL_PEL_VAL) >>3;
+
+                // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search.
+                if (x->mv_col_min < col_min )
+                    x->mv_col_min = col_min;
+                if (x->mv_col_max > col_max )
+                    x->mv_col_max = col_max;
+                if (x->mv_row_min < row_min )
+                    x->mv_row_min = row_min;
+                if (x->mv_row_max > row_max )
+                    x->mv_row_max = row_max;
+            }else
            {
-                step_param = cpi->sf.first_step + 2;
-                further_steps = 0;
+                mvp.row = best_ref_mv.row;
+                mvp.col = best_ref_mv.col;
            }

-#if 0
-
-            // Initial step Search
-            bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost, &best_ref_mv1);
-            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-
-            // Further step searches
-            while (n < further_steps)
-            {
-                n++;
-
-                if (num00)
-                    num00--;
-                else
-                {
-                    thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost, &best_ref_mv1);
-
-                    if (thissme < bestsme)
-                    {
-                        bestsme = thissme;
-                        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-                        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-                    }
-                    else
-                    {
-                        d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
-                        d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
-                    }
-                }
-            }
-
-#else
+            further_steps = (cpi->Speed >= 8)? 0: (cpi->sf.max_step_search_steps - 1 - step_param);

            if (cpi->sf.search_method == HEX)
            {
-                bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost);
+                bestsme = vp8_hex_search(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv);
                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
            }
            else
            {
-                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb < 9
+                bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb < 9
                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;

@@ -743,7 +756,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                        num00--;
                    else
                    {
-                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb = 9
+                        thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb = 9

                        if (thissme < bestsme)
                        {
@@ -760,19 +773,24 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
                }
            }

-#endif
+            if(cpi->sf.improved_mv_pred)
+            {
+                x->mv_col_min = tmp_col_min;
+                x->mv_col_max = tmp_col_max;
+                x->mv_row_min = tmp_row_min;
+                x->mv_row_max = tmp_row_max;
+            }
+
+            if (bestsme < INT_MAX)
+                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
+
+            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+            // mv cost;
+            rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128);
        }

-        if (bestsme < INT_MAX)
-            cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
-
-        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-
-        // mv cost;
-        rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv1, cpi->mb.mvcost, 128);
-
-
        case NEARESTMV:
        case NEARMV:

--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -296,7 +296,6 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
    int filt_err = 0;
    int min_filter_level;
    int max_filter_level;
-    int prediction_difference = (int)(100 * abs((int)(cpi->last_auto_filter_prediction_error - cpi->prediction_error)) / (1 + cpi->prediction_error));

    int filter_step;
    int filt_high = 0;
@@ -478,6 +477,5 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
    cpi->last_auto_filt_val = filt_best;
    cpi->last_auto_filt_q  = cm->base_qindex;

-    cpi->last_auto_filter_prediction_error = cpi->prediction_error;
    cpi->frames_since_auto_filter = 0;
 }
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -70,7 +70,6 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 {
    int i, rc, eob;
-    int zbin;
    int x, y, z, sz;
    short *coeff_ptr   = b->coeff;
    short *round_ptr   = b->round;
@@ -130,9 +129,6 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
        rc   = vp8_default_zig_zag1d[i];
        z    = coeff_ptr[rc];

-        //if ( i == 0 )
-        //    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value/2;
-        //else
        zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;

        zbin_boost_ptr ++;
@@ -145,13 +141,13 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
            y  = (((x * quant_ptr[rc]) >> 16) + x)
                 >> quant_shift_ptr[rc];                // quantize (x)
            x  = (y ^ sz) - sz;                         // get the sign back
-            qcoeff_ptr[rc]  = x;                         // write to destination
-            dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+            qcoeff_ptr[rc]  = x;                        // write to destination
+            dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value

            if (y)
            {
                eob = i;                                // last nonzero coeffs
-                zbin_boost_ptr = &b->zrun_zbin_boost[0];    // reset zero runlength
+                zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
            }
        }
    }
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1550,12 +1550,21 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit,
                        *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
                    }
                }
-                // VBR
+                // VBR and CQ mode
                // Note that tighter restrictions here can help quality but hurt encode speed
                else
                {
-                    *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
-                    *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+                    // Strong overshoot limit for constrained quality
+                    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+                    {
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+                    }
+                    else
+                    {
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+                    }
                }
            }
        }
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -12,10 +12,22 @@
 #ifndef __INC_RDOPT_H
 #define __INC_RDOPT_H
 void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
-int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion);
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion, int best_rd);
 int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion);
 int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_to, int *distortion);
 extern int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);

+extern void vp8_mv_pred
+(
+    VP8_COMP *cpi,
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    MV *mvp,
+    int refframe,
+    int *ref_frame_sign_bias,
+    int *sr,
+    int near_sadidx[]
+);
+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]);

 #endif
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -36,36 +36,9 @@

 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
 #define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
-#define USE_FILTER_LUT 0         // use lookup table to improve filter

 #if VP8_TEMPORAL_ALT_REF

-#if USE_FILTER_LUT
-// for (strength = 0; strength <= 6; strength++) {
-//   for (delta = 0; delta <= 18; delta++) {
-//     float coeff = (3.0 * delta * delta) / pow(2, strength);
-//     printf("%3d", (int)roundf(coeff > 16 ? 0 : 16-coeff));
-//   }
-//   printf("\n");
-// }
-static int modifier_lut[7][19] =
-{
-    // Strength=0
-    {16, 13,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=1
-    {16, 15, 10,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=2
-    {16, 15, 13,  9,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=3
-    {16, 16, 15, 13, 10,  7,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=4
-    {16, 16, 15, 14, 13, 11,  9,  7,  4,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=5
-    {16, 16, 16, 15, 15, 14, 13, 11, 10,  8,  7,  5,  3,  0,  0,  0,  0,  0,  0},
-    // Strength=6
-    {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10,  9,  8,  7,  5,  4,  2,  1}
-};
-#endif
 static void vp8_temporal_filter_predictors_mb_c
 (
    MACROBLOCKD *x,
@@ -86,21 +59,18 @@ static void vp8_temporal_filter_predictors_mb_c

    if ((mv_row | mv_col) & 7)
    {
-//        vp8_sixtap_predict16x16_c(yptr, stride,
-//                                    mv_col & 7, mv_row & 7, &pred[0], 16);
        x->subpixel_predict16x16(yptr, stride,
                                    mv_col & 7, mv_row & 7, &pred[0], 16);
    }
    else
    {
-        //vp8_copy_mem16x16_c (yptr, stride, &pred[0], 16);
        RECON_INVOKE(&x->rtcd->recon, copy16x16)(yptr, stride, &pred[0], 16);
    }

    // U & V
    mv_row >>= 1;
    mv_col >>= 1;
-    stride >>= 1;
+    stride = (stride + 1) >> 1;
    offset = (mv_row >> 3) * stride + (mv_col >> 3);
    uptr = u_mb_ptr + offset;
    vptr = v_mb_ptr + offset;
@@ -127,17 +97,13 @@ void vp8_temporal_filter_apply_c
    int strength,
    int filter_weight,
    unsigned int *accumulator,
-    unsigned int *count
+    unsigned short *count
 )
 {
    int i, j, k;
    int modifier;
    int byte = 0;

-#if USE_FILTER_LUT
-    int *lut = modifier_lut[strength];
-#endif
-
    for (i = 0,k = 0; i < block_size; i++)
    {
        for (j = 0; j < block_size; j++, k++)
@@ -146,11 +112,10 @@ void vp8_temporal_filter_apply_c
            int src_byte = frame1[byte];
            int pixel_value = *frame2++;

-#if USE_FILTER_LUT
-            modifier = abs(src_byte-pixel_value);
-            modifier = modifier>18 ? 0 : lut[modifier];
-#else
            modifier   = src_byte - pixel_value;
+            // This is an integer approximation of:
+            // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+            // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
            modifier  *= modifier;
            modifier  *= 3;
            modifier  += 1 << (strength - 1);
@@ -160,7 +125,6 @@ void vp8_temporal_filter_apply_c
                modifier = 16;

            modifier = 16 - modifier;
-#endif
            modifier *= filter_weight;

            count[k] += modifier;
@@ -240,7 +204,7 @@ static int vp8_temporal_filter_find_matching_mb_c
            step_param,
            sadpb/*x->errorperbit*/,
            &num00, &cpi->fn_ptr[BLOCK_16X16],
-            mvsadcost, mvcost);
+            mvsadcost, mvcost, &best_ref_mv1);
    }
    else
    {
@@ -326,17 +290,17 @@ static void vp8_temporal_filter_iterate_c
    int mb_col, mb_row;
    unsigned int filter_weight[MAX_LAG_BUFFERS];
    unsigned char *mm_ptr = cpi->fp_motion_map;
-    int cols = cpi->common.mb_cols;
-    int rows = cpi->common.mb_rows;
+    int mb_cols = cpi->common.mb_cols;
+    int mb_rows = cpi->common.mb_rows;
    int MBs  = cpi->common.MBs;
    int mb_y_offset = 0;
    int mb_uv_offset = 0;
-    unsigned int accumulator[384];
-    unsigned int count[384];
+    DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
    MACROBLOCKD *mbd = &cpi->mb.e_mbd;
    YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
    unsigned char *dst1, *dst2;
-    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
+    DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16*16 + 8*8 + 8*8);

    // Save input state
    unsigned char *y_buffer = mbd->pre.y_buffer;
@@ -350,7 +314,7 @@ static void vp8_temporal_filter_iterate_c
            filter_weight[frame] = 1;
    }

-    for (mb_row = 0; mb_row < rows; mb_row++)
+    for (mb_row = 0; mb_row < mb_rows; mb_row++)
    {
 #if ALT_REF_MC_ENABLED
        // Reduced search extent by 3 for 6-tap filter & smaller UMV border
@@ -359,14 +323,14 @@ static void vp8_temporal_filter_iterate_c
                                + (VP8BORDERINPIXELS - 19);
 #endif

-        for (mb_col = 0; mb_col < cols; mb_col++)
+        for (mb_col = 0; mb_col < mb_cols; mb_col++)
        {
            int i, j, k, w;
            int weight_cap;
            int stride;

            vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
-            vpx_memset(count, 0, 384*sizeof(unsigned int));
+            vpx_memset(count, 0, 384*sizeof(unsigned short));

 #if ALT_REF_MC_ENABLED
            // Reduced search extent by 3 for 6-tap filter & smaller UMV border
@@ -533,8 +497,8 @@ static void vp8_temporal_filter_iterate_c
            mb_uv_offset += 8;
        }

-        mb_y_offset += 16*f->y_stride-f->y_width;
-        mb_uv_offset += 8*f->uv_stride-f->uv_width;
+        mb_y_offset += 16*(f->y_stride-mb_cols);
+        mb_uv_offset += 8*(f->uv_stride-mb_cols);
    }

    // Restore input state
--- a/vp8/encoder/temporal_filter.h
+++ b/vp8/encoder/temporal_filter.h
@@ -22,9 +22,13 @@
     int strength, \
     int filter_weight, \
     unsigned int *accumulator, \
-     unsigned int *count \
+     unsigned short *count \
    )

+#if ARCH_X86 || ARCH_X86_64
+#include "x86/temporal_filter_x86.h"
+#endif
+
 #ifndef vp8_temporal_filter_apply
 #define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
 #endif
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -11,220 +11,169 @@
 %include "vpx_ports/x86_abi_support.asm"


-;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
-;               short *qcoeff_ptr,short *dequant_ptr,
-;               const int *default_zig_zag, short *round_ptr,
-;               short *quant_ptr, short *dqcoeff_ptr,
+;int vp8_regular_quantize_b_impl_sse2(
+;               short *coeff_ptr,
+;               short *zbin_ptr,
+;               short *qcoeff_ptr,
+;               short *dequant_ptr,
+;               const int *default_zig_zag,
+;               short *round_ptr,
+;               short *quant_ptr,
+;               short *dqcoeff_ptr,
 ;               unsigned short zbin_oq_value,
-;               short *zbin_boost_ptr);
+;               short *zbin_boost_ptr,
+;               short *quant_shift);
 ;
 global sym(vp8_regular_quantize_b_impl_sse2)
 sym(vp8_regular_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 10
+    SHADOW_ARGS_TO_STACK 11
+    SAVE_XMM
    push        rsi
    push        rdi
    push        rbx
+    ALIGN_STACK 16, rax
+    %define abs_minus_zbin    0
+    %define temp_qcoeff       32
+    %define qcoeff            64
+    %define eob_tmp           96
+    %define stack_size        112
+    sub         rsp, stack_size
    ; end prolog

-    ALIGN_STACK 16, rax
-
-    %define abs_minus_zbin_lo 0
-    %define abs_minus_zbin_hi 16
-    %define temp_qcoeff_lo 32
-    %define temp_qcoeff_hi 48
-    %define save_xmm6 64
-    %define save_xmm7 80
-    %define eob 96
-
-    %define vp8_regularquantizeb_stack_size eob + 16
-
-    sub         rsp, vp8_regularquantizeb_stack_size
-
-    movdqa      OWORD PTR[rsp + save_xmm6], xmm6
-    movdqa      OWORD PTR[rsp + save_xmm7], xmm7
-
-    mov         rdx, arg(0)                 ;coeff_ptr
-    mov         eax, arg(8)                 ;zbin_oq_value
-
-    mov         rcx, arg(1)                 ;zbin_ptr
-    movd        xmm7, eax
+    mov         rdx, arg(0)                 ; coeff_ptr
+    mov         rcx, arg(1)                 ; zbin_ptr
+    movd        xmm7, arg(8)                ; zbin_oq_value
+    mov         rdi, arg(5)                 ; round_ptr
+    mov         rsi, arg(6)                 ; quant_ptr

+    ; z
    movdqa      xmm0, OWORD PTR[rdx]
    movdqa      xmm4, OWORD PTR[rdx + 16]

+    pshuflw     xmm7, xmm7, 0
+    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
+
    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

-    psraw       xmm0, 15                    ;sign of z (aka sz)
-    psraw       xmm4, 15                    ;sign of z (aka sz)
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    movdqa      xmm2, OWORD PTR[rcx]        ;load zbin_ptr
-    movdqa      xmm3, OWORD PTR[rcx + 16]   ;load zbin_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    psubw       xmm1, xmm0                  ;x = abs(z)
-
-    punpcklwd   xmm7, xmm7                  ;duplicated zbin_oq_value
-    psubw       xmm5, xmm4                  ;x = abs(z)
-
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    psubw       xmm1, xmm2                  ;sub (zbin_ptr + zbin_oq_value)
-    psubw       xmm5, xmm3                  ;sub (zbin_ptr + zbin_oq_value)
-
-    mov         rdi, arg(5)                 ;round_ptr
-    mov         rsi, arg(6)                 ;quant_ptr
-
-    movdqa      OWORD PTR[rsp + abs_minus_zbin_lo], xmm1
-    movdqa      OWORD PTR[rsp + abs_minus_zbin_hi], xmm5
-
-    paddw       xmm1, xmm2                  ;add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm5, xmm3                  ;add (zbin_ptr + zbin_oq_value) back
-
-    movdqa      xmm2, OWORD PTR[rdi]
-    movdqa      xmm3, OWORD PTR[rsi]
-
-    movdqa      xmm6, OWORD PTR[rdi + 16]
-    movdqa      xmm7, OWORD PTR[rsi + 16]
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    pmulhw      xmm1, xmm3
-    pmulhw      xmm5, xmm7
-
-    mov         rsi, arg(2)                 ;qcoeff_ptr
-    pxor        xmm6, xmm6
+    ; sz
+    psraw       xmm0, 15
+    psraw       xmm4, 15

+    ; (z ^ sz)
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

+    ; x = abs(z)
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

-    movdqa      OWORD PTR[rsp + temp_qcoeff_lo], xmm1
-    movdqa      OWORD PTR[rsp + temp_qcoeff_hi], xmm5
+    movdqa      xmm2, OWORD PTR[rcx]
+    movdqa      xmm3, OWORD PTR[rcx + 16]

-    movdqa      OWORD PTR[rsi], xmm6        ;zero qcoeff
-    movdqa      OWORD PTR[rsi + 16], xmm6   ;zero qcoeff
+    ; *zbin_ptr + zbin_oq_value
+    paddw       xmm2, xmm7
+    paddw       xmm3, xmm7

-    xor         rax, rax
-    mov         rcx, -1
+    ; x - (*zbin_ptr + zbin_oq_value)
+    psubw       xmm1, xmm2
+    psubw       xmm5, xmm3
+    movdqa      OWORD PTR[rsp + abs_minus_zbin], xmm1
+    movdqa      OWORD PTR[rsp + abs_minus_zbin + 16], xmm5

-    mov         [rsp + eob], rcx
-    mov         rsi, arg(9)                 ;zbin_boost_ptr
-
-    mov         rbx, arg(4)                 ;default_zig_zag
-
-rq_zigzag_loop:
-    movsxd      rcx, DWORD PTR[rbx + rax*4] ;now we have rc
-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
-
-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
-
-    sub         edx, edi                    ;x - zbin
-    jl          rq_zigzag_1
-
-    mov         rdi, arg(2)                 ;qcoeff_ptr
-
-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
-
-    cmp         edx, 0
-    je          rq_zigzag_1
-
-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-
-    mov         rsi, arg(9)                 ;zbin_boost_ptr
-    mov         [rsp + eob], rax            ;eob = i
-
-rq_zigzag_1:
-    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
-
-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
-    lea         rax, [rax + 1]
-
-    sub         edx, edi                    ;x - zbin
-    jl          rq_zigzag_1a
-
-    mov         rdi, arg(2)                 ;qcoeff_ptr
-
-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
-
-    cmp         edx, 0
-    je          rq_zigzag_1a
-
-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-
-    mov         rsi, arg(9)                 ;zbin_boost_ptr
-    mov         [rsp + eob], rax            ;eob = i
-
-rq_zigzag_1a:
-    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
-
-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
-    lea         rax, [rax + 1]
-
-    sub         edx, edi                    ;x - zbin
-    jl          rq_zigzag_1b
-
-    mov         rdi, arg(2)                 ;qcoeff_ptr
-
-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
-
-    cmp         edx, 0
-    je          rq_zigzag_1b
-
-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-
-    mov         rsi, arg(9)                 ;zbin_boost_ptr
-    mov         [rsp + eob], rax            ;eob = i
-
-rq_zigzag_1b:
-    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++
-
-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
-    lea         rax, [rax + 1]
-
-    sub         edx, edi                    ;x - zbin
-    jl          rq_zigzag_1c
-
-    mov         rdi, arg(2)                 ;qcoeff_ptr
-
-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
-
-    cmp         edx, 0
-    je          rq_zigzag_1c
-
-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-
-    mov         rsi, arg(9)                 ;zbin_boost_ptr
-    mov         [rsp + eob], rax            ;eob = i
-
-rq_zigzag_1c:
-    lea         rax, [rax + 1]
-
-    cmp         rax, 16
-    jl          rq_zigzag_loop
-
-    mov         rdi, arg(2)                 ;qcoeff_ptr
-    mov         rcx, arg(3)                 ;dequant_ptr
-    mov         rsi, arg(7)                 ;dqcoeff_ptr
+    ; add (zbin_ptr + zbin_oq_value) back
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm3

    movdqa      xmm2, OWORD PTR[rdi]
-    movdqa      xmm3, OWORD PTR[rdi + 16]
+    movdqa      xmm6, OWORD PTR[rdi + 16]
+
+    movdqa      xmm3, OWORD PTR[rsi]
+    movdqa      xmm7, OWORD PTR[rsi + 16]
+
+    ; x + round
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm6
+
+    ; y = x * quant_ptr >> 16
+    pmulhw      xmm3, xmm1
+    pmulhw      xmm7, xmm5
+
+    ; y += x
+    paddw       xmm1, xmm3
+    paddw       xmm5, xmm7
+
+    movdqa      OWORD PTR[rsp + temp_qcoeff], xmm1
+    movdqa      OWORD PTR[rsp + temp_qcoeff + 16], xmm5
+
+    pxor        xmm6, xmm6
+    ; zero qcoeff
+    movdqa      OWORD PTR[rsp + qcoeff], xmm6
+    movdqa      OWORD PTR[rsp + qcoeff + 16], xmm6
+
+    mov         [rsp + eob_tmp], DWORD -1   ; eob
+    mov         rsi, arg(9)                 ; zbin_boost_ptr
+    mov         rdi, arg(4)                 ; default_zig_zag
+    mov         rax, arg(10)                ; quant_shift_ptr
+
+%macro ZIGZAG_LOOP 2
+rq_zigzag_loop_%1:                          ; one unrolled zig-zag step: first arg = scan position i, second arg = label suffix of the next step
+    movsxd      rdx, DWORD PTR[rdi + (%1 * 4)] ; rc = default_zig_zag[i]
+    movsx       ebx, WORD PTR [rsi]         ; *zbin_boost_ptr
+    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++
+
+    ; x (stored earlier as abs(z) - (zbin_ptr[rc] + zbin_oq_value))
+    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]
+
+    ; if (x >= zbin)
+    sub         ecx, ebx                    ; x - zbin
+    jl          rq_zigzag_loop_%2           ; x < zbin: coefficient stays zero, boost pointer keeps advancing
+
+    movsx       ebx, WORD PTR[rsp + temp_qcoeff + rdx *2]
+
+    ; downshift by quant_shift[rc]
+    movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc]
+    sar         ebx, cl                     ; also sets Z bit (x86 leaves the flags untouched when the count in cl is 0)
+    je          rq_zigzag_loop_%2           ; !y  NOTE(review): relies on quant_shift[rc] != 0, otherwise this tests the flags of the sub above - confirm
+    mov         WORD PTR[rsp + qcoeff + rdx * 2], bx ; qcoeff[rc] (stack copy) = y >> quant_shift[rc]
+
+    mov         rsi, arg(9)                 ; reset to b->zrun_zbin_boost
+    mov         [rsp + eob_tmp], DWORD %1   ; eob = i
+%endmacro
+ZIGZAG_LOOP 0, 1
+ZIGZAG_LOOP 1, 2
+ZIGZAG_LOOP 2, 3
+ZIGZAG_LOOP 3, 4
+ZIGZAG_LOOP 4, 5
+ZIGZAG_LOOP 5, 6
+ZIGZAG_LOOP 6, 7
+ZIGZAG_LOOP 7, 8
+ZIGZAG_LOOP 8, 9
+ZIGZAG_LOOP 9, 10
+ZIGZAG_LOOP 10, 11
+ZIGZAG_LOOP 11, 12
+ZIGZAG_LOOP 12, 13
+ZIGZAG_LOOP 13, 14
+ZIGZAG_LOOP 14, 15
+ZIGZAG_LOOP 15, end
+rq_zigzag_loop_end:
+
+    mov         rbx, arg(2)                 ; qcoeff_ptr
+    mov         rcx, arg(3)                 ; dequant_ptr
+    mov         rsi, arg(7)                 ; dqcoeff_ptr
+    mov         rax, [rsp + eob_tmp]        ; eob
+
+    movdqa      xmm2, OWORD PTR[rsp + qcoeff]
+    movdqa      xmm3, OWORD PTR[rsp + qcoeff + 16]
+
+    ; y ^ sz
+    pxor        xmm2, xmm0
+    pxor        xmm3, xmm4
+    ; x = (y ^ sz) - sz
+    psubw       xmm2, xmm0
+    psubw       xmm3, xmm4

    movdqa      xmm0, OWORD PTR[rcx]
    movdqa      xmm1, OWORD PTR[rcx + 16]
@@ -232,23 +181,20 @@ rq_zigzag_1c:
    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

-    movdqa      OWORD PTR[rsi], xmm0        ;store dqcoeff
-    movdqa      OWORD PTR[rsi + 16], xmm1   ;store dqcoeff
-
-    mov         rax, [rsp + eob]
-
-    movdqa      xmm6, OWORD PTR[rsp + save_xmm6]
-    movdqa      xmm7, OWORD PTR[rsp + save_xmm7]
+    movdqa      OWORD PTR[rbx], xmm2
+    movdqa      OWORD PTR[rbx + 16], xmm3
+    movdqa      OWORD PTR[rsi], xmm0        ; store dqcoeff
+    movdqa      OWORD PTR[rsi + 16], xmm1   ; store dqcoeff

    add         rax, 1

-    add         rsp, vp8_regularquantizeb_stack_size
-    pop         rsp
-
    ; begin epilog
+    add         rsp, stack_size
+    pop         rsp
    pop         rbx
    pop         rdi
    pop         rsi
+    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
--- a/vp8/encoder/x86/quantize_x86.h
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -27,11 +27,11 @@ extern prototype_quantize_block(vp8_regular_quantize_b_sse2);

 #if !CONFIG_RUNTIME_CPU_DETECT

-/* The sse2 quantizer has not been updated to match the new exact
- * quantizer introduced in commit e04e2935
- *#undef vp8_quantize_quantb
- *#define vp8_quantize_quantb vp8_regular_quantize_b_sse2
- */
+// The SSE2 quantizer currently realizes a gain on x86 and a loss on x86_64, so it replaces the default only under ARCH_X86
+#if ARCH_X86
+#undef vp8_quantize_quantb
+#define vp8_quantize_quantb vp8_regular_quantize_b_sse2
+#endif

 #endif

--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -586,52 +586,45 @@ sym(vp8_sad16x16_sse3):

    STACK_FRAME_CREATE_X3

-        lea             end_ptr,    [src_ptr+src_stride*8]
-
-        lea             end_ptr,    [end_ptr+src_stride*8]
-        pxor            mm7,        mm7
+        mov             end_ptr,    4
+        pxor            xmm7,        xmm7

 .vp8_sad16x16_sse3_loop:
-
-        movq            ret_var,    mm7
-        cmp             ret_var,    max_err
-        jg              .vp8_sad16x16_early_exit
-
-        movq            mm0,        QWORD PTR [src_ptr]
-        movq            mm2,        QWORD PTR [src_ptr+8]
-
-        movq            mm1,        QWORD PTR [ref_ptr]
-        movq            mm3,        QWORD PTR [ref_ptr+8]
-
-        movq            mm4,        QWORD PTR [src_ptr+src_stride]
-        movq            mm5,        QWORD PTR [ref_ptr+ref_stride]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        movq            mm1,        QWORD PTR [src_ptr+src_stride+8]
-        movq            mm3,        QWORD PTR [ref_ptr+ref_stride+8]
-
-        psadbw          mm4,        mm5
-        psadbw          mm1,        mm3
+        movdqa          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
+        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

-        paddw           mm0,        mm2
-        paddw           mm4,        mm1
+        movdqa          xmm4,       XMMWORD PTR [src_ptr]
+        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]

-        paddw           mm7,        mm0
-        paddw           mm7,        mm4
+        psadbw          xmm0,       xmm1

-        cmp             src_ptr,    end_ptr
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
+
+        psadbw          xmm2,       xmm3
+        psadbw          xmm4,       xmm5
+        psadbw          xmm6,       xmm1
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+
+        paddw           xmm7,        xmm0
+        paddw           xmm7,        xmm2
+        paddw           xmm7,        xmm4
+        paddw           xmm7,        xmm6
+
+        sub             end_ptr,     1
        jne             .vp8_sad16x16_sse3_loop

-        movq            ret_var,    mm7
-
-.vp8_sad16x16_early_exit:
-
-        mov             rax,        ret_var
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+        paddw           xmm0,       xmm7
+        movq            rax,        xmm0

    STACK_FRAME_DESTROY_X3

--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp8_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2
+;   unsigned int    block_size,       |  3
+;   int             strength,         |  4
+;   int             filter_weight,    |  5
+;   unsigned int   *accumulator,      |  6
+;   unsigned short *count)            |  7
+global sym(vp8_temporal_filter_apply_sse2)
+sym(vp8_temporal_filter_apply_sse2):
+    ; per pixel (16-bit lanes): m = max(0, 16 - ((3*(src-pred)^2 + round) >> strength)) * filter_weight; count += m; accumulator += m * pred
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    %define block_size    0
+    %define strength      16
+    %define filter_weight 32
+    %define rounding_bit  48
+    %define rbp_backup    64
+    %define stack_size    80
+    sub         rsp,           stack_size
+    mov         [rsp + rbp_backup], rbp     ; rbp is reused for the stride below and restored in the epilog
+    ; end prolog
+
+        mov         rdx,            arg(3)
+        mov         [rsp + block_size], rdx
+        movd        xmm6,            arg(4)
+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+        ; calculate the rounding bit outside the loop
+        ; 0x8000 >> (16 - strength)
+        mov         rdx,            16
+        sub         rdx,            arg(4) ; 16 - strength
+        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
+        psrlw       xmm5,           xmm4
+        movdqa      [rsp + rounding_bit], xmm5
+
+        mov         rsi,            arg(0) ; src/frame1
+        mov         rdx,            arg(2) ; predictor frame
+        mov         rdi,            arg(6) ; accumulator
+        mov         rax,            arg(7) ; count
+
+        ; dup the filter weight and store for later
+        movd        xmm0,           arg(5) ; filter_weight
+        pshuflw     xmm0,           xmm0, 0
+        punpcklwd   xmm0,           xmm0
+        movdqa      [rsp + filter_weight], xmm0
+
+        mov         rbp,            arg(1) ; stride
+        pxor        xmm7,           xmm7   ; zero for extraction
+
+        lea         rcx,            [rdx + 16*16*1] ; end of the predictor for a 16x16 block
+        cmp         dword ptr [rsp + block_size], 8
+        jne         temporal_filter_apply_load_16
+        lea         rcx,            [rdx + 8*8*1] ; end of the predictor for an 8x8 block
+
+temporal_filter_apply_load_8:
+        movq        xmm0,           [rsi]  ; first row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        movq        xmm1,           [rsi]  ; second row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
+        jmp         temporal_filter_apply_load_finished
+
+temporal_filter_apply_load_16:
+        movdqu      xmm0,           [rsi]  ; src (frame1)
+        lea         rsi,            [rsi + rbp] ; += stride
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
+
+temporal_filter_apply_load_finished:
+        movdqa      xmm2,           [rdx]  ; predictor (frame2)
+        movdqa      xmm3,           xmm2
+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
+
+        ; modifier = src_byte - pixel_value
+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
+
+        ; modifier *= modifier
+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
+
+        ; modifier *= 3
+        pmullw      xmm0,           [GLOBAL(_const_3w)]
+        pmullw      xmm1,           [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+        paddw       xmm0,           [rsp + rounding_bit]
+        paddw       xmm1,           [rsp + rounding_bit]
+
+        ; modifier >>= strength
+        psrlw       xmm0,           [rsp + strength]
+        psrlw       xmm1,           [rsp + strength]
+
+        ; modifier = 16 - modifier
+        ; saturation takes care of modifier > 16
+        movdqa      xmm3,           [GLOBAL(_const_16w)]
+        movdqa      xmm2,           [GLOBAL(_const_16w)]
+        psubusw     xmm3,           xmm1
+        psubusw     xmm2,           xmm0
+
+        ; modifier *= filter_weight
+        pmullw      xmm2,           [rsp + filter_weight]
+        pmullw      xmm3,           [rsp + filter_weight]
+
+        ; count
+        movdqa      xmm4,           [rax]
+        movdqa      xmm5,           [rax+16]
+        ; += modifier
+        paddw       xmm4,           xmm2
+        paddw       xmm5,           xmm3
+        ; write back
+        movdqa      [rax],          xmm4
+        movdqa      [rax+16],       xmm5
+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
+
+        ; load and extract the predictor up to shorts
+        pxor        xmm7,           xmm7
+        movdqa      xmm0,           [rdx]
+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
+
+        ; modifier *= pixel_value
+        pmullw      xmm0,           xmm2
+        pmullw      xmm1,           xmm3
+
+        ; expand to double words
+        movdqa      xmm2,           xmm0
+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
+        movdqa      xmm3,           xmm1
+        punpcklwd   xmm1,           xmm7   ; [ 8-11]
+        punpckhwd   xmm3,           xmm7   ; [12-15]
+
+        ; accumulator
+        movdqa      xmm4,           [rdi]
+        movdqa      xmm5,           [rdi+16]
+        movdqa      xmm6,           [rdi+32]
+        movdqa      xmm7,           [rdi+48]
+        ; += modifier * pixel_value, as 32-bit adds: a 16-bit add would drop the carry into the upper half of each int
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
+        ; write back
+        movdqa      [rdi],          xmm4
+        movdqa      [rdi+16],       xmm5
+        movdqa      [rdi+32],       xmm6
+        movdqa      [rdi+48],       xmm7
+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+        cmp         rdx,            rcx    ; reached the end of the predictor?
+        je          temporal_filter_apply_epilog
+        pxor        xmm7,           xmm7   ; zero for extraction
+        cmp         dword ptr [rsp + block_size], 16
+        je          temporal_filter_apply_load_16
+        jmp         temporal_filter_apply_load_8
+
+temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,            [rsp + rbp_backup]
+    add         rsp,            stack_size
+    pop         rsp
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+    times 8 dw 3
+align 16
+_const_top_bit:
+    times 8 dw 1<<15
+align 16
+_const_16w:
+    times 8 dw 16
--- a/vp8/encoder/x86/temporal_filter_x86.h
+++ b/vp8/encoder/x86/temporal_filter_x86.h
@@ -9,14 +9,19 @@
 */


-#ifndef DETOKENIZE_ARM_H
-#define DETOKENIZE_ARM_H
+#ifndef __INC_VP8_TEMPORAL_FILTER_X86_H
+#define __INC_VP8_TEMPORAL_FILTER_X86_H
+
+#if HAVE_SSE2
+extern prototype_apply(vp8_temporal_filter_apply_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_temporal_filter_apply
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2

-#if HAVE_ARMV6
-#if CONFIG_ARM_ASM_DETOK
-void vp8_init_detokenizer(VP8D_COMP *dx);
-void vp8_decode_mb_tokens_v6(DETOK *detoken, int type);
-#endif
 #endif

 #endif
+
+#endif // __INC_VP8_TEMPORAL_FILTER_X86_H
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -493,8 +493,8 @@ sym(vp8_get8x8var_sse2):
 ;    unsigned char *src_ptr,
 ;    int src_pixels_per_line,
 ;    unsigned int Height,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
+;    int  xoffset,
+;    int  yoffset,
 ;    int *sum,
 ;    unsigned int *sumsquared;;
 ;
@@ -504,68 +504,80 @@ sym(vp8_filter_block2d_bil_var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM
    GET_GOT     rbx
    push rsi
    push rdi
-    sub         rsp, 16
+    push rbx
    ; end prolog

        pxor            xmm6,           xmm6                 ;
        pxor            xmm7,           xmm7                 ;
-        mov             rax,            arg(5) ;HFilter             ;

-        mov             rdx,            arg(6) ;VFilter             ;
-        mov             rsi,            arg(0) ;ref_ptr              ;
+        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
+        movdqa          xmm4,           XMMWORD PTR [rsi]

-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        lea             rcx,            [GLOBAL(vp8_bilinear_filters_sse2)]
+        movsxd          rax,            dword ptr arg(5)     ; xoffset
+
+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
+        je              filter_block2d_bil_var_sse2_sp_only
+
+        shl             rax,            5                    ; point to filter coeff with xoffset
+        lea             rax,            [rax + rcx]          ; HFilter
+
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
+        je              filter_block2d_bil_var_sse2_fp_only
+
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height

        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]               ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        movq            xmm3,           QWORD PTR [rsi+1]    ;

-        movq            xmm3,           QWORD PTR [rsi+1]        ;
        punpcklbw       xmm1,           xmm0                 ;
-
-        pmullw          xmm1,           [rax]               ;
+        pmullw          xmm1,           [rax]                ;
        punpcklbw       xmm3,           xmm0
-            ;
        pmullw          xmm3,           [rax+16]             ;
+
        paddw           xmm1,           xmm3                 ;
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift     ;
        movdqa          xmm5,           xmm1
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rsi,            r8
-%endif
-filter_block2d_bil_var_sse2_loop:

+        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
+        lea             rsi,            [rsi + rbx]
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_sse2_loop:
        movq            xmm1,           QWORD PTR [rsi]               ;
        movq            xmm3,           QWORD PTR [rsi+1]             ;

        punpcklbw       xmm1,           xmm0                 ;
        pmullw          xmm1,           [rax]               ;
-
        punpcklbw       xmm3,           xmm0                 ;
        pmullw          xmm3,           [rax+16]             ;

        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
-
+        paddw           xmm1,           xmm4               ;
        psraw           xmm1,           xmm_filter_shift    ;
+
        movdqa          xmm3,           xmm5                 ;
-
        movdqa          xmm5,           xmm1                 ;
-        pmullw          xmm3,           [rdx]               ;

+        pmullw          xmm3,           [rdx]               ;
        pmullw          xmm1,           [rdx+16]             ;
        paddw           xmm1,           xmm3                 ;
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
+        paddw           xmm1,           xmm4                 ;
        psraw           xmm1,           xmm_filter_shift    ;

        movq            xmm3,           QWORD PTR [rdi]               ;
@@ -577,20 +589,103 @@ filter_block2d_bil_var_sse2_loop:
        pmaddwd         xmm1,           xmm1                 ;
        paddd           xmm7,           xmm1                 ;

+        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
 %if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
 %else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
-        add             rsi,            r8
-        add             rdi,            r9
+        lea             rdi,            [rdi + r9]
 %endif

        sub             rcx,            1                   ;
        jnz             filter_block2d_bil_var_sse2_loop       ;

+        jmp             filter_block2d_bil_variance

+filter_block2d_bil_var_sse2_sp_only:
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        lea             rsi,            [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+        movq            xmm3,           QWORD PTR [rsi]             ;
+        punpcklbw       xmm3,           xmm0                 ;
+        movdqa          xmm5,           xmm3
+
+        pmullw          xmm1,           [rdx]               ;
+        pmullw          xmm3,           [rdx+16]             ;
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        movdqa          xmm1,           xmm5                 ;
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_sp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_fp_only:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+        movq            xmm1,           QWORD PTR [rsi]       ;
+        movq            xmm3,           QWORD PTR [rsi+1]     ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4  ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]     ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+        lea             rsi,            [rsi + rdx]
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_fp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
        movdq2q         mm6,            xmm6                ;
        movdq2q         mm7,            xmm7                ;

@@ -627,12 +722,12 @@ filter_block2d_bil_var_sse2_loop:
        movd            [rsi],          mm2    ; xsum
        movd            [rdi],          mm4    ; xxsum

-
    ; begin epilog
-    add rsp, 16
+    pop rbx
    pop rdi
    pop rsi
    RESTORE_GOT
+    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
@@ -974,3 +1069,13 @@ SECTION_RODATA
 align 16
 xmm_bi_rd:
    times 8 dw 64
+align 16
+vp8_bilinear_filters_sse2:
+    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
+    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -76,8 +76,8 @@ void vp8_filter_block2d_bil_var_sse2
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
-    const short *HFilter,
-    const short *VFilter,
+    int  xoffset,
+    int  yoffset,
    int *sum,
    unsigned int *sumsquared
 );
@@ -222,21 +222,6 @@ unsigned int vp8_variance8x16_wmt

 }

-///////////////////////////////////////////////////////////////////////////
-// the mmx function that does the bilinear filtering and var calculation //
-// int one pass                                                          //
-///////////////////////////////////////////////////////////////////////////
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
-{
-    { 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0 },
-    { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
-    {  96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
-    {  80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
-    {  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    {  48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
-    {  32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
-    {  16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
-};
 unsigned int vp8_sub_pixel_variance4x4_wmt
 (
    const unsigned char  *src_ptr,
@@ -272,15 +257,38 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
    unsigned int *sse
 )
 {
-
    int xsum;
    unsigned int xxsum;
-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum, &xxsum
-    );
+
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum, &xxsum);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum, &xxsum);
+    }

    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 6));
@@ -344,7 +352,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
-            vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+            xoffset, yoffset,
            &xsum0, &xxsum0
        );

@@ -352,7 +360,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
        vp8_filter_block2d_bil_var_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
-            vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+            xoffset, yoffset,
            &xsum1, &xxsum1
        );
    }
@@ -392,21 +400,56 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);

-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum0, &xxsum0
-    );
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            &xsum1, &xxsum1);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);

+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            &xsum1, &xxsum1);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);

-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum1, &xxsum1
-    );
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            &xsum1, &xxsum1);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
+
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum1, &xxsum1);
+    }

    xsum0 += xsum1;
    xxsum0 += xxsum1;
@@ -428,12 +471,36 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
 {
    int xsum;
    unsigned int xxsum;
-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum, &xxsum
-    );
+
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum, &xxsum);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            xoffset, yoffset,
+            &xsum, &xxsum);
+    }

    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 7));
--- a/Show More
+++ b/Show More