Update CHANGELOG for v0.9.2 release

Change-Id: I184e927987544e9f34f890249b589ea13a93a330
Update AUTHORS
2010-09-02 14:56:47 -04:00 · 2010-09-02 13:41:03 -04:00 · 2010-09-02 13:33:01 -04:00 · 2010-09-02 12:03:51 -04:00 · 2010-09-02 11:52:39 -04:00 · 2010-09-02 11:52:38 -04:00
147 changed files with 8240 additions and 11592 deletions
--- a/.mailmap
+++ b/.mailmap
@@ -0,0 +1,2 @@
 Adrian Grange <agrange@google.com>
 Johann Koenig <johannkoenig@google.com>
--- a/10
+++ b/10
@@ -1,16 +1,26 @@
 # This file is automatically generated from the git commit history
 # by tools/gen_authors.sh.
 Adrian Grange <agrange@google.com>
 Alex Converse <alex.converse@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
 Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
 Guillermo Ballester Valor <gbvalor@gmail.com>
 James Zern <jzern@google.com>
 Jan Kratochvil <jan.kratochvil@redhat.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 Luca Barbato <lu_zero@gentoo.org>
 Makoto Kato <makoto.kt@gmail.com>
 Michael Kohler <michaelkohler@live.com>
 Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Philip Jägenstedt <philipj@opera.com>
--- a/50
+++ b/50
@@ -1,3 +1,53 @@
 2010-09-02 v0.9.2
  - Enhancements:
      Disable frame dropping by default
      Improved multithreaded performance
      Improved Force Key Frame Behaviour
      Increased rate control buffer level precision
      Fix bug in 1st pass motion compensation
      ivfenc: correct fixed kf interval, --disable-kf
  - Speed:
      Changed above and left context data layout
      Rework idct calling structure.
      Removed unnecessary MB_MODE_INFO copies
      x86: SSSE3 sixtap prediction
      Reworked IDCT to include reconstruction (add) step
      Swap alt/gold/new/last frame buffer ptrs instead of copying.
      Improve SSE2 loopfilter functions
      Change bitreader to use a larger window.
      Avoid loopfilter reinitialization when possible
  - Quality:
      Normalize quantizer's zero bin and rounding factors
      Add trellis quantization.
      Make the quantizer exact.
      Updates to ARNR filtering algorithm
      Fix breakout thresh computation for golden & AltRef frames
      Redo the forward 4x4 dct
      Improve the accuracy of forward walsh-hadamard transform
      Further adjustment of RD behaviour with Q and Zbin.
  - Build System:
      Allow linking of libs built with MinGW to MSVC
      Fix target auto-detection on mingw32
      Allow --cpu= to work for x86.
      configure: pass original arguments through to make dist
      Fix builds without runtime CPU detection
      msvs: fix install of codec sources
      msvs: Change devenv.com command line for better msys support
      msvs: Add vs9 targets.
      Add x86_64-linux-icc target
  - Bugs:
      Potential crashes on older MinGW builds
      Fix two-pass framerate for Y4M input.
      Fixed simple loop filter, other crashes on ARM v6
      arm: fix missing dependency with --enable-shared
      configure: support directories containing .o
      Replace pinsrw (SSE) with MMX instructions
      apple: include proper mach primitives
      Fixed rate control bug with long key frame interval.
      Fix DSO link errors on x86-64 when not using a version script
      Fixed buffer selection for UV in AltRef filtering
 2010-06-17 v0.9.1
  - Enhancements:
      * ivfenc/ivfdec now support YUV4MPEG2 input and pipe I/O
--- a/build/arm-wince-vs8/vpx_decoder.sln
+++ b/build/arm-wince-vs8/vpx_decoder.sln
@@ -8,7 +8,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example", "example.vcproj",
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "obj_int_extract", "obj_int_extract.vcproj", "{E1360C65-D375-4335-8057-7ED99CC3F9B2}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vpx_decoder", "vpx_decoder.vcproj", "{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vpx", "vpx.vcproj", "{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}"
 	ProjectSection(ProjectDependencies) = postProject
 		{E1360C65-D375-4335-8057-7ED99CC3F9B2} = {E1360C65-D375-4335-8057-7ED99CC3F9B2}
 	EndProjectSection
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -39,13 +39,8 @@ dist:
 	@if [ -d "$(DIST_DIR)/src" ]; then \
            mkdir -p "$(DIST_DIR)/build"; \
            cd "$(DIST_DIR)/build"; \
-            if [ "$(TGT_CC)" = "rvct" ] ; then \
+            echo "Rerunning configure $(CONFIGURE_ARGS)"; \
-				echo "../src/configure --target=$(TOOLCHAIN) --libc=$(ALT_LIBC)"; \
+            ../src/configure $(CONFIGURE_ARGS); \
 				../src/configure --target=$(TOOLCHAIN) --libc=$(ALT_LIBC); \
 			else \
 				echo "../src/configure --target=$(TOOLCHAIN)"; \
 				../src/configure --target=$(TOOLCHAIN); \
 			fi; \
            $(if $(filter vs%,$(TGT_CC)),make NO_LAUNCH_DEVENV=1;) \
        fi
 	@if [ -d "$(DIST_DIR)" ]; then \
@@ -334,6 +329,7 @@ ifneq ($(call enabled,DIST-SRCS),)
    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_def.sh
    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_proj.sh
    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
    DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/yasm.rules
    DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
    #
    # This isn't really ARCH_ARM dependent, it's dependent on whether we're
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -378,7 +378,7 @@ EOF
 fmt_deps = sed -e 's;^__image.axf;\$(dir \$@)\$(notdir \$<).o \$@;' #hide
 EOF
    else cat >> $1 << EOF
-fmt_deps = sed -e 's;^\(.*\)\.o;\$(dir \$@)\1\$(suffix \$<).o \$@;' #hide
+fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\$(dir \$@)\1\$(suffix \$<).o \$@;'
 EOF
    fi
@@ -395,8 +395,6 @@ EOF
 write_common_target_config_h() {
    cat > ${TMP_H} << EOF
 /* This file automatically generated by configure. Do not edit! */
 #define INLINE      ${INLINE}
 #define FORCEINLINE ${FORCEINLINE:-${INLINE}}
 #define RESTRICT    ${RESTRICT}
 EOF
    print_config_h ARCH   "${TMP_H}" ${ARCH_LIST}
@@ -520,6 +518,7 @@ process_common_toolchain() {
                tgt_os=darwin9
                ;;
            *mingw32*|*cygwin*)
                [ -z "$tgt_isa" ] && tgt_isa=x86
                tgt_os=win32
                ;;
            *linux*|*bsd*)
@@ -783,6 +782,9 @@ process_common_toolchain() {
        soft_enable ssse3
        case  ${tgt_os} in
            win*)
                enabled gcc && add_cflags -fno-common
                ;;
            solaris*)
                CC=${CC:-${CROSS}gcc}
                LD=${LD:-${CROSS}gcc}
@@ -799,11 +801,21 @@ process_common_toolchain() {
                add_ldflags -i-static
                enabled x86_64 && add_cflags -ipo -no-prec-div -static -xSSE3 -axSSE3
                enabled x86_64 && AR=xiar
                case ${tune_cpu} in
                    atom*)
                        tune_cflags="-x"
                        tune_cpu="SSE3_ATOM"
                    ;;
                    *)
                        tune_cflags="-march="
                    ;;
                esac
                ;;
            gcc*)
                add_cflags  -m${bits}
                add_ldflags -m${bits}
                link_with_cc=gcc
                tune_cflags="-march="
            setup_gnu_toolchain
                ;;
        esac
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -206,7 +206,7 @@ for opt in "$@"; do
    ;;
    --ver=*) vs_ver="$optval"
             case $optval in
-             [78])
+             [789])
             ;;
             *) die Unrecognized Visual Studio Version in $opt
             ;;
@@ -248,6 +248,8 @@ case "${vs_ver:-8}" in
    ;;
    8) vs_ver_id="8.00"
    ;;
    9) vs_ver_id="9.00"
    ;;
 esac
 [ -n "$name" ] || die "Project name (--name) must be specified!"
@@ -347,7 +349,7 @@ generate_vcproj() {
        x86*) $uses_asm && tag ToolFile RelativePath="$self_dirname/../x86-msvs/yasm.rules"
        ;;
        arm*|iwmmx*)
-            if [ "$name" == "vpx_decoder" ];then
+            if [ "$name" == "vpx" ];then
            case "$target" in
                armv5*)
                    tag ToolFile RelativePath="$self_dirname/../arm-wince-vs8/armasmv5.rules"
@@ -376,7 +378,7 @@ generate_vcproj() {
        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
            case "$name" in
-                vpx_decoder) tag Tool \
+                vpx)         tag Tool \
                             Name="VCPreBuildEventTool" \
                             CommandLine="call obj_int_extract.bat \$(ConfigurationName)"
                             tag Tool \
@@ -437,7 +439,7 @@ generate_vcproj() {
                Name="VCCLCompilerTool" \
                Optimization="0" \
                AdditionalIncludeDirectories="$incs" \
-                PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;$defines" \
+                PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
                RuntimeLibrary="$debug_runtime" \
                UsePrecompiledHeader="0" \
                WarningLevel="3" \
@@ -510,7 +512,7 @@ generate_vcproj() {
        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
            case "$name" in
-                vpx_decoder) tag DeploymentTool \
+                vpx)         tag DeploymentTool \
                             ForceDirty="-1" \
                             RegisterOutput="0"
                                ;;
@@ -534,7 +536,7 @@ generate_vcproj() {
        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
            case "$name" in
-                vpx_decoder) tag Tool \
+                vpx)         tag Tool \
                                     Name="VCPreBuildEventTool" \
                                     CommandLine="call obj_int_extract.bat \$(ConfigurationName)"
                             tag Tool \
@@ -595,7 +597,7 @@ generate_vcproj() {
        x86*) tag       Tool \
                      Name="VCCLCompilerTool" \
                      AdditionalIncludeDirectories="$incs" \
-                      PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;$defines" \
+                      PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
                      RuntimeLibrary="$release_runtime" \
                      UsePrecompiledHeader="0" \
                      WarningLevel="3" \
@@ -672,7 +674,7 @@ generate_vcproj() {
        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
            case "$name" in
-                vpx_decoder) tag DeploymentTool \
+                vpx)         tag DeploymentTool \
                             ForceDirty="-1" \
                             RegisterOutput="0"
                ;;
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh
@@ -25,7 +25,7 @@ files.
 Options:
    --help                      Print this message
    --out=outfile               Redirect output to a file
-    --ver=version               Version (7,8) of visual studio to generate for
+    --ver=version               Version (7,8,9) of visual studio to generate for
    --target=isa-os-cc          Target specifier
 EOF
    exit 1
@@ -193,11 +193,11 @@ ${TAB}rm -rf "$platform"/"$config"
 ifneq (\$(found_devenv),)
  ifeq (\$(CONFIG_VS_VERSION),7)
 $nows_sln_config: $outfile
-${TAB}devenv.com $outfile /build "$config"
+${TAB}devenv.com $outfile -build "$config"
  else
 $nows_sln_config: $outfile
-${TAB}devenv.com $outfile /build "$sln_config"
+${TAB}devenv.com $outfile -build "$sln_config"
  endif
 else
@@ -224,7 +224,7 @@ for opt in "$@"; do
    ;;
    --ver=*) vs_ver="$optval"
             case $optval in
-             [78])
+             [789])
             ;;
             *) die Unrecognized Visual Studio Version in $opt
             ;;
@@ -235,7 +235,7 @@ for opt in "$@"; do
             7) sln_vers="8.00"
                sln_vers_str="Visual Studio .NET 2003"
             ;;
-             8)
+             [89])
             ;;
             *) die "Unrecognized Visual Studio Version '$optval' in $opt"
             ;;
@@ -257,6 +257,9 @@ case "${vs_ver:-8}" in
    8) sln_vers="9.00"
       sln_vers_str="Visual Studio 2005"
    ;;
    9) sln_vers="10.00"
       sln_vers_str="Visual Studio 2008"
    ;;
 esac
 for f in "${file_list[@]}"; do
--- a/14
+++ b/14
@@ -38,6 +38,7 @@ Advanced options:
  ${toggle_realtime_only}         enable this option while building for real-time encoding
  ${toggle_runtime_cpu_detect}    runtime cpu detection
  ${toggle_shared}                shared library support
  ${toggle_arm_asm_detok}         assembly version of the detokenizer (ARM platforms only)
 Codecs:
  Codecs can be selectively enabled or disabled individually, or by family:
@@ -107,11 +108,13 @@ all_platforms="${all_platforms} x86-solaris-gcc"
 all_platforms="${all_platforms} x86-win32-gcc"
 all_platforms="${all_platforms} x86-win32-vs7"
 all_platforms="${all_platforms} x86-win32-vs8"
 all_platforms="${all_platforms} x86-win32-vs9"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
 all_platforms="${all_platforms} x86_64-win64-vs8"
 all_platforms="${all_platforms} x86_64-win64-vs9"
 all_platforms="${all_platforms} universal-darwin8-gcc"
 all_platforms="${all_platforms} universal-darwin9-gcc"
 all_platforms="${all_platforms} generic-gnu"
@@ -228,10 +231,8 @@ CONFIG_LIST="
    dequant_tokens
    dc_recon
    new_tokens
    runtime_cpu_detect
    postproc
    postproc_generic
    multithread
    psnr
    ${CODECS}
@@ -242,7 +243,7 @@ CONFIG_LIST="
    spatial_resampling
    realtime_only
    shared
-    dixie
+    arm_asm_detok
 "
 CMDLINE_SELECT="
    extra_warnings
@@ -269,9 +270,7 @@ CMDLINE_SELECT="
    dequant_tokens
    dc_recon
    new_tokens
    postproc
    postproc_generic
    multithread
    psnr
    ${CODECS}
@@ -281,7 +280,7 @@ CMDLINE_SELECT="
    spatial_resampling
    realtime_only
    shared
-    dixie
+    arm_asm_detok
 "
 process_cmdline() {
@@ -389,6 +388,7 @@ VERSION_MAJOR=${VERSION_MAJOR}
 VERSION_MINOR=${VERSION_MINOR}
 VERSION_PATCH=${VERSION_PATCH}
 CONFIGURE_ARGS=${CONFIGURE_ARGS}
 EOF
    enabled child || echo "CONFIGURE_ARGS?=${CONFIGURE_ARGS}" >> config.mk
@@ -523,8 +523,6 @@ process_toolchain() {
             enable solution
             vs_version=${tgt_cc##vs}
             all_targets="${all_targets} solution"
             INLINE=__inline
             FORCEINLINE=__forceinline
        ;;
    esac
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -29,10 +29,6 @@
 static const char *exec_name;
 #if CONFIG_DIXIE
 extern vpx_codec_iface_t vpx_codec_vp8_dixie_algo;
 #endif
 static const struct
 {
    char const *name;
@@ -44,9 +40,6 @@ static const struct
 #if CONFIG_VP8_DECODER
    {"vp8",  &vpx_codec_vp8_dx_algo,   0x00385056, 0x00FFFFFF},
 #endif
 #if CONFIG_DIXIE
    {"dixie", &vpx_codec_vp8_dixie_algo, 0x00385056, 0x00FFFFFF},
 #endif
 };
 #include "args.h"
--- a/ivfenc.c
+++ b/ivfenc.c
@@ -505,9 +505,11 @@ static const arg_def_t kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1,
                                     "Minimum keyframe interval (frames)");
 static const arg_def_t kf_max_dist = ARG_DEF(NULL, "kf-max-dist", 1,
                                     "Maximum keyframe interval (frames)");
 static const arg_def_t kf_disabled = ARG_DEF(NULL, "disable-kf", 0,
                                     "Disable keyframe placement");
 static const arg_def_t *kf_args[] =
 {
-    &kf_min_dist, &kf_max_dist, NULL
+    &kf_min_dist, &kf_max_dist, &kf_disabled, NULL
 };
@@ -800,6 +802,8 @@ int main(int argc, const char **argv_)
            cfg.kf_min_dist = arg_parse_uint(&arg);
        else if (arg_match(&arg, &kf_max_dist, argi))
            cfg.kf_max_dist = arg_parse_uint(&arg);
        else if (arg_match(&arg, &kf_disabled, argi))
            cfg.kf_mode = VPX_KF_DISABLED;
        else
            argj++;
    }
@@ -886,6 +890,8 @@ int main(int argc, const char **argv_)
                {
                    cfg.g_timebase.num = y4m.fps_d;
                    cfg.g_timebase.den = y4m.fps_n;
                    /* And don't reset it in the second pass.*/
                    arg_have_timebase = 1;
                }
                arg_use_i420 = 0;
            }
@@ -1016,9 +1022,6 @@ int main(int argc, const char **argv_)
        /* Construct Encoder Context */
        if (cfg.kf_min_dist == cfg.kf_max_dist)
            cfg.kf_mode = VPX_KF_FIXED;
        vpx_codec_enc_init(&encoder, codec->iface, &cfg,
                           show_psnr ? VPX_CODEC_USE_PSNR : 0);
        ctx_exit_on_error(&encoder, "Failed to initialize encoder");
--- a/libs.mk
+++ b/libs.mk
@@ -11,6 +11,8 @@
 ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
 CODEC_SRCS-yes += libs.mk
 include $(SRC_PATH_BARE)/vpx/vpx_codec.mk
 CODEC_SRCS-yes += $(addprefix vpx/,$(call enabled,API_SRCS))
@@ -59,7 +61,6 @@ CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd)
 # This variable uses deferred expansion intentionally, since the results of
 # $(wildcard) may change during the course of the Make.
 VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d))))
 CODEC_SRCS-yes += $(SRC_PATH_BARE)/libs.mk # to show up in the msvs workspace
 endif
 # The following pairs define a mapping of locations in the distribution
@@ -144,7 +145,6 @@ obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c
 PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.vcproj
 PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.bat
 PROJECTS-$(BUILD_LIBVPX) += armasm$(ARM_ARCH).rules
 endif
 vpx.def: $(call enabled,CODEC_EXPORTS)
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -24,43 +24,36 @@ extern  void vp8_init_scan_order_mask();
 void vp8_update_mode_info_border(MODE_INFO *mi, int rows, int cols)
 {
    int i;
-    vpx_memset(mi - cols - 1, 0, sizeof(MODE_INFO) * cols + 1);
+    vpx_memset(mi - cols - 2, 0, sizeof(MODE_INFO) * (cols + 1));
    for (i = 0; i < rows; i++)
    {
        vpx_memset(&mi[i*cols-1], 0, sizeof(MODE_INFO));
    }
 }
 void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
 {
    int i;
    for (i = 0; i < NUM_YV12_BUFFERS; i++)
        vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
    vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
    vp8_yv12_de_alloc_frame_buffer(&oci->new_frame);
    vp8_yv12_de_alloc_frame_buffer(&oci->last_frame);
    vp8_yv12_de_alloc_frame_buffer(&oci->golden_frame);
    vp8_yv12_de_alloc_frame_buffer(&oci->alt_ref_frame);
    vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
-    vpx_free(oci->above_context[Y1CONTEXT]);
+    vpx_free(oci->above_context);
    vpx_free(oci->above_context[UCONTEXT]);
    vpx_free(oci->above_context[VCONTEXT]);
    vpx_free(oci->above_context[Y2CONTEXT]);
    vpx_free(oci->mip);
-    oci->above_context[Y1CONTEXT] = 0;
+    oci->above_context = 0;
    oci->above_context[UCONTEXT]  = 0;
    oci->above_context[VCONTEXT]  = 0;
    oci->above_context[Y2CONTEXT] = 0;
    oci->mip = 0;
    // Structure used to monitor GF usage
    if (oci->gf_active_flags != 0)
        vpx_free(oci->gf_active_flags);
    oci->gf_active_flags = 0;
 }
 int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
 {
    int i;
    vp8_de_alloc_frame_buffers(oci);
    // our internal buffers are always multiples of 16
@@ -71,37 +64,33 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
        height += 16 - (height & 0xf);
    for (i = 0; i < NUM_YV12_BUFFERS; i++)
    {
      oci->fb_idx_ref_cnt[0] = 0;
      if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i],  width, height, VP8BORDERINPIXELS) < 0)
        {
            vp8_de_alloc_frame_buffers(oci);
            return ALLOC_FAILURE;
        }
    }
    oci->new_fb_idx = 0;
    oci->lst_fb_idx = 1;
    oci->gld_fb_idx = 2;
    oci->alt_fb_idx = 3;
    oci->fb_idx_ref_cnt[0] = 1;
    oci->fb_idx_ref_cnt[1] = 1;
    oci->fb_idx_ref_cnt[2] = 1;
    oci->fb_idx_ref_cnt[3] = 1;
    if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0)
    {
        vp8_de_alloc_frame_buffers(oci);
        return ALLOC_FAILURE;
    }
    if (vp8_yv12_alloc_frame_buffer(&oci->new_frame,   width, height, VP8BORDERINPIXELS) < 0)
    {
        vp8_de_alloc_frame_buffers(oci);
        return ALLOC_FAILURE;
    }
    if (vp8_yv12_alloc_frame_buffer(&oci->last_frame,  width, height, VP8BORDERINPIXELS) < 0)
    {
        vp8_de_alloc_frame_buffers(oci);
        return ALLOC_FAILURE;
    }
    if (vp8_yv12_alloc_frame_buffer(&oci->golden_frame, width, height, VP8BORDERINPIXELS) < 0)
    {
        vp8_de_alloc_frame_buffers(oci);
        return ALLOC_FAILURE;
    }
    if (vp8_yv12_alloc_frame_buffer(&oci->alt_ref_frame, width, height, VP8BORDERINPIXELS) < 0)
    {
        vp8_de_alloc_frame_buffers(oci);
        return ALLOC_FAILURE;
    }
    if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
    {
        vp8_de_alloc_frame_buffers(oci);
@@ -123,33 +112,9 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
    oci->mi = oci->mip + oci->mode_info_stride + 1;
-    oci->above_context[Y1CONTEXT] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols * 4 , 1);
+    oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
-    if (!oci->above_context[Y1CONTEXT])
+    if (!oci->above_context)
    {
        vp8_de_alloc_frame_buffers(oci);
        return ALLOC_FAILURE;
    }
    oci->above_context[UCONTEXT]  = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols * 2 , 1);
    if (!oci->above_context[UCONTEXT])
    {
        vp8_de_alloc_frame_buffers(oci);
        return ALLOC_FAILURE;
    }
    oci->above_context[VCONTEXT]  = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols * 2 , 1);
    if (!oci->above_context[VCONTEXT])
    {
        vp8_de_alloc_frame_buffers(oci);
        return ALLOC_FAILURE;
    }
    oci->above_context[Y2CONTEXT] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * oci->mb_cols     , 1);
    if (!oci->above_context[Y2CONTEXT])
    {
        vp8_de_alloc_frame_buffers(oci);
        return ALLOC_FAILURE;
@@ -157,20 +122,6 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
    vp8_update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
    // Structures used to monitor GF usage
    if (oci->gf_active_flags != 0)
        vpx_free(oci->gf_active_flags);
    oci->gf_active_flags = (unsigned char *)vpx_calloc(oci->mb_rows * oci->mb_cols, 1);
    if (!oci->gf_active_flags)
    {
        vp8_de_alloc_frame_buffers(oci);
        return ALLOC_FAILURE;
    }
    oci->gf_active_count = oci->mb_rows * oci->mb_cols;
    return 0;
 }
 void vp8_setup_version(VP8_COMMON *cm)
--- a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
+++ b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
@@ -0,0 +1,67 @@
 ;
 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license and patent
 ;  grant that can be found in the LICENSE file in the root of the source
 ;  tree. All contributing project authors may be found in the AUTHORS
 ;  file in the root of the source tree.
 ;
    EXPORT  |vp8_dc_only_idct_add_v6|
    AREA    |.text|, CODE, READONLY
 ;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
 ;                             unsigned char *dst_ptr, int pitch, int stride)
 ; r0  input_dc
 ; r1  pred_ptr
 ; r2  dest_ptr
 ; r3  pitch
 ; sp  stride
 |vp8_dc_only_idct_add_v6| PROC
    stmdb       sp!, {r4 - r7, lr}
    add         r0, r0, #4                ; input_dc += 4
    ldr         r12, c0x0000FFFF
    ldr         r4, [r1], r3
    ldr         r6, [r1], r3
    and         r0, r12, r0, asr #3       ; input_dc >> 3 + mask
    ldr         lr, [sp, #20]
    orr         r0, r0, r0, lsl #16       ; a1 | a1
    uxtab16     r5, r0, r4                ; a1+2 | a1+0
    uxtab16     r4, r0, r4, ror #8        ; a1+3 | a1+1
    uxtab16     r7, r0, r6
    uxtab16     r6, r0, r6, ror #8
    usat16      r5, #8, r5
    usat16      r4, #8, r4
    usat16      r7, #8, r7
    usat16      r6, #8, r6
    orr         r5, r5, r4, lsl #8
    orr         r7, r7, r6, lsl #8
    ldr         r4, [r1], r3
    ldr         r6, [r1]
    str         r5, [r2], lr
    str         r7, [r2], lr
    uxtab16     r5, r0, r4
    uxtab16     r4, r0, r4, ror #8
    uxtab16     r7, r0, r6
    uxtab16     r6, r0, r6, ror #8
    usat16      r5, #8, r5
    usat16      r4, #8, r4
    usat16      r7, #8, r7
    usat16      r6, #8, r6
    orr         r5, r5, r4, lsl #8
    orr         r7, r7, r6, lsl #8
    str         r5, [r2], lr
    str         r7, [r2]
    ldmia       sp!, {r4 - r7, pc}
    ENDP  ; |vp8_dc_only_idct_add_v6|
 ; Constant Pool
 c0x0000FFFF DCD 0x0000FFFF
    END
--- a/vp8/common/arm/armv6/idct_v6.asm
+++ b/vp8/common/arm/armv6/idct_v6.asm
@@ -15,8 +15,6 @@
    EXPORT  |vp8_short_idct4x4llm_v6_scott|
    EXPORT  |vp8_short_idct4x4llm_v6_dual|
    EXPORT  |vp8_dc_only_idct_armv6|
    AREA    |.text|, CODE, READONLY
 ;********************************************************************************
@@ -344,34 +342,4 @@ loop2_dual
    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
    ENDP
 ; sjl added 10/17/08
 ;void dc_only_idct_armv6(short input_dc, short *output, int pitch)
 |vp8_dc_only_idct_armv6| PROC
    stmdb       sp!, {r4 - r6, lr}
    add         r0, r0, #0x4
    add         r4, r1, r2                      ; output + shortpitch
    mov         r0, r0, ASR #0x3    ;aka a1
    add         r5, r1, r2, LSL #1              ; output + shortpitch * 2
    pkhbt       r0, r0, r0, lsl #16             ; a1 | a1
    add         r6, r5, r2                      ; output + shortpitch * 3
    str         r0, [r1, #0]
    str         r0, [r1, #4]
    str         r0, [r4, #0]
    str         r0, [r4, #4]
    str         r0, [r5, #0]
    str         r0, [r5, #4]
    str         r0, [r6, #0]
    str         r0, [r6, #4]
    ldmia       sp!, {r4 - r6, pc}
    ENDP  ; |vp8_dc_only_idct_armv6|
    END
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -8,8 +8,8 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
-    EXPORT |vp8_short_inv_walsh4x4_armv6|
+    EXPORT |vp8_short_inv_walsh4x4_v6|
-    EXPORT |vp8_short_inv_walsh4x4_1_armv6|
+    EXPORT |vp8_short_inv_walsh4x4_1_v6|
    ARM
    REQUIRE8
@@ -17,8 +17,8 @@
    AREA    |.text|, CODE, READONLY  ; name this block of code
-;short vp8_short_inv_walsh4x4_armv6(short *input, short *output)
+;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_armv6| PROC
+|vp8_short_inv_walsh4x4_v6| PROC
    stmdb       sp!, {r4 - r11, lr}
@@ -123,11 +123,11 @@
    str         r5, [r1]
    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp8_short_inv_walsh4x4_armv6|
+    ENDP        ; |vp8_short_inv_walsh4x4_v6|
-;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output)
+;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_armv6| PROC
+|vp8_short_inv_walsh4x4_1_v6| PROC
    ldrsh       r2, [r0]             ; [0]
    add         r2, r2, #3           ; [0] + 3
@@ -145,7 +145,7 @@
    str         r2, [r1]
    bx          lr
-    ENDP        ; |vp8_short_inv_walsh4x4_1_armv6|
+    ENDP        ; |vp8_short_inv_walsh4x4_1_v6|
 ; Constant Pool
 c0x00030003 DCD 0x00030003
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -55,113 +55,87 @@ pstep       RN  r1
 ;stack  const char *thresh,
 ;stack  int  count
-;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
+; All 16 elements in flimit are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
+; for flimit. Same applies to limit. thresh is not used in simple loopfilter
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_simple_horizontal_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}
-    sub         src, src, pstep, lsl #1     ; move src pointer down by 2 lines
+    ldr         r12, [r3]                   ; limit
-
+    ldr         r3, [src, -pstep, lsl #1]   ; p1
-    ldr         r12, [r3], #4               ; limit
+    ldr         r4, [src, -pstep]           ; p0
-    ldr         r3, [src], pstep            ; p1
+    ldr         r5, [src]                   ; q0
-
+    ldr         r6, [src, pstep]            ; q1
-    ldr         r9, [sp, #36]               ; count for 8-in-parallel
+    ldr         r7, [r2]                    ; flimit
    ldr         r4, [src], pstep            ; p0
    ldr         r7, [r2], #4                ; flimit
    ldr         r5, [src], pstep            ; q0
    ldr         r2, c0x80808080
-
+    ldr         r9, [sp, #40]               ; count for 8-in-parallel
    ldr         r6, [src]                   ; q1
    uadd8       r7, r7, r7                  ; flimit * 2
-    mov         r9, r9, lsl #1              ; 4-in-parallel
+    mov         r9, r9, lsl #1              ; double the count. we're doing 4 at a time
    uadd8       r12, r7, r12                ; flimit * 2 + limit
    mov         lr, #0                      ; need 0 in a couple places
 |simple_hnext8|
-    ; vp8_simple_filter_mask() function
+    ; vp8_simple_filter_mask()
    uqsub8      r7, r3, r6                  ; p1 - q1
    uqsub8      r8, r6, r3                  ; q1 - p1
    uqsub8      r10, r4, r5                 ; p0 - q0
    uqsub8      r11, r5, r4                 ; q0 - p0
    orr         r8, r8, r7                  ; abs(p1 - q1)
    ldr         lr, c0x7F7F7F7F             ; 01111111 mask
    orr         r10, r10, r11               ; abs(p0 - q0)
    and         r8, lr, r8, lsr #1          ; abs(p1 - q1) / 2
    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
-    mvn         lr, #0                      ; r10 == -1
+    uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
-    ; STALL waiting on r10 :(
+    mvn         r8, #0
-    uqsub8      r10, r10, r12               ; compare to flimit
+    usub8       r10, r12, r10               ; compare to flimit. usub8 sets GE flags
-    mov         r8, #0
+    sel         r10, r8, lr                 ; filter mask: F or 0
    usub8       r10, r8, r10                ; use usub8 instead of ssub8
    ; STALL (maybe?) when are flags set? :/
    sel         r10, lr, r8                 ; filter mask: lr
    cmp         r10, #0
-    beq         simple_hskip_filter         ; skip filtering
+    beq         simple_hskip_filter         ; skip filtering if all masks are 0x00
-    ;vp8_simple_filter() function
+    ;vp8_simple_filter()
    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
-    qsub8       r3, r3, r6                  ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
+    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
-    qsub8       r6, r5, r4                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
+    qsub8       r6, r5, r4                  ; q0 - p0
-
+    qadd8       r3, r3, r6                  ; += q0 - p0
    qadd8       r3, r3, r6
    ldr         r8, c0x03030303             ; r8 = 3
    qadd8       r3, r3, r6
    ldr         r7, c0x04040404
    qadd8       r3, r3, r6                  ; += q0 - p0
    ldr         r8, c0x03030303
    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
    ;STALL
    and         r3, r3, r10                 ; vp8_filter &= mask
-    qadd8       r3, r3, r6
+    qadd8       r7 , r3 , r7                ; Filter1 = vp8_filter + 4
-    and         r3, r3, lr                  ; vp8_filter &= mask;
+    qadd8       r8 , r3 , r8                ; Filter2 = vp8_filter + 3
-    ;save bottom 3 bits so that we round one side +4 and the other +3
+    shadd8      r7 , r7 , lr
-    qadd8       r8 , r3 , r8                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+    shadd8      r8 , r8 , lr
-    qadd8       r3 , r3 , r7                ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
+    shadd8      r7 , r7 , lr
    shadd8      r8 , r8 , lr
    shadd8      r7 , r7 , lr                ; Filter1 >>= 3
    shadd8      r8 , r8 , lr                ; Filter2 >>= 3
-    mov         r7, #0
+    qsub8       r5 ,r5, r7                  ; u = q0 - Filter1
-    shadd8      r8 , r8 , r7                ; Filter2 >>= 3
+    qadd8       r4, r4, r8                  ; u = p0 + Filter2
    shadd8      r3 , r3 , r7                ; Filter1 >>= 3
    shadd8      r8 , r8 , r7
    shadd8      r3 , r3 , r7
    shadd8      r8 , r8 , r7                ; r8: Filter2
    shadd8      r3 , r3 , r7                ; r7: filter1
    ;calculate output
    sub         src, src, pstep, lsl #1
    qadd8       r4, r4, r8                  ; u = vp8_signed_char_clamp(p0 + Filter2)
    qsub8       r5 ,r5, r3                  ; u = vp8_signed_char_clamp(q0 - Filter1)
    eor         r4, r4, r2                  ; *op0 = u^0x80
    str         r4, [src], pstep            ; store op0 result
    eor         r5, r5, r2                  ; *oq0 = u^0x80
-    str         r5, [src], pstep            ; store oq0 result
+    str         r5, [src]                   ; store oq0 result
    eor         r4, r4, r2                  ; *op0 = u^0x80
    str         r4, [src, -pstep]           ; store op0 result
 |simple_hskip_filter|
    add         src, src, #4
    sub         src, src, pstep
    sub         src, src, pstep, lsl #1
    subs        r9, r9, #1
    addne       src, src, #4                ; next row
-    ;pld            [src]
+    ldrne       r3, [src, -pstep, lsl #1]   ; p1
-    ;pld            [src, pstep]
+    ldrne       r4, [src, -pstep]           ; p0
-    ;pld            [src, pstep, lsl #1]
+    ldrne       r5, [src]                   ; q0
-
+    ldrne       r6, [src, pstep]            ; q1
    ldrne           r3, [src], pstep            ; p1
    ldrne           r4, [src], pstep            ; p0
    ldrne           r5, [src], pstep            ; q0
    ldrne           r6, [src]                   ; q1
    bne         simple_hnext8
@@ -174,9 +148,9 @@ pstep       RN  r1
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}
-    ldr         r12, [r2], #4               ; r12: flimit
+    ldr         r12, [r2]                   ; r12: flimit
    ldr         r2, c0x80808080
-    ldr         r7, [r3], #4                ; limit
+    ldr         r7, [r3]                    ; limit
    ; load source data to r7, r8, r9, r10
    ldrh        r3, [src, #-2]
@@ -213,16 +187,14 @@ pstep       RN  r1
    uqsub8      r10, r5, r4                 ; q0 - p0
    orr         r7, r7, r8                  ; abs(p1 - q1)
    orr         r9, r9, r10                 ; abs(p0 - q0)
    ldr         lr, c0x7F7F7F7F             ; 0111 1111 mask
    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
    and         r7, lr, r7, lsr #1          ; abs(p1 - q1) / 2
    mov         r8, #0
    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
    mvn         r10, #0                     ; r10 == -1
    uqsub8      r7, r7, r12                 ; compare to flimit
-    usub8       r7, r8, r7
+    usub8       r7, r12, r7                 ; compare to flimit
-    sel         r7, r10, r8                 ; filter mask: lr
+    sel         lr, r10, r8                 ; filter mask
    cmp         lr, #0
    beq         simple_vskip_filter         ; skip filtering
@@ -233,35 +205,34 @@ pstep       RN  r1
    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
-    qsub8       r3, r3, r6                  ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
+    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
-    qsub8       r6, r5, r4                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
+    qsub8       r6, r5, r4                  ; q0 - p0
-    qadd8       r3, r3, r6
+    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
-    ldr         r8, c0x03030303             ; r8 = 3
+    ldr         r9, c0x03030303             ; r9 = 3
-    qadd8       r3, r3, r6
+    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
    ldr         r7, c0x04040404
-    qadd8       r3, r3, r6
+    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
    ;STALL
    and         r3, r3, lr                  ; vp8_filter &= mask
-    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8       r9 , r3 , r9                ; Filter2 = vp8_filter + 3
-    qadd8       r8 , r3 , r8                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+    qadd8       r3 , r3 , r7                ; Filter1 = vp8_filter + 4
    qadd8       r3 , r3 , r7                ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
-    mov         r7, #0
+    shadd8      r9 , r9 , r8
-    shadd8      r8 , r8 , r7                ; Filter2 >>= 3
+    shadd8      r3 , r3 , r8
-    shadd8      r3 , r3 , r7                ; Filter1 >>= 3
+    shadd8      r9 , r9 , r8
-    shadd8      r8 , r8 , r7
+    shadd8      r3 , r3 , r8
-    shadd8      r3 , r3 , r7
+    shadd8      r9 , r9 , r8                ; Filter2 >>= 3
-    shadd8      r8 , r8 , r7                ; r8: filter2
+    shadd8      r3 , r3 , r8                ; Filter1 >>= 3
    shadd8      r3 , r3 , r7                ; r7: filter1
    ;calculate output
    sub         src, src, pstep, lsl #2
-    qadd8       r4, r4, r8                  ; u = vp8_signed_char_clamp(p0 + Filter2)
+    qadd8       r4, r4, r9                  ; u = p0 + Filter2
-    qsub8       r5, r5, r3                  ; u = vp8_signed_char_clamp(q0 - Filter1)
+    qsub8       r5, r5, r3                  ; u = q0 - Filter1
    eor         r4, r4, r2                  ; *op0 = u^0x80
    eor         r5, r5, r2                  ; *oq0 = u^0x80
@@ -286,10 +257,6 @@ pstep       RN  r1
 |simple_vskip_filter|
    subs        r11, r11, #1
    ;pld            [src]
    ;pld            [src, pstep]
    ;pld            [src, pstep, lsl #1]
    ; load source data to r7, r8, r9, r10
    ldrneh      r3, [src, #-2]
    ldrneh      r4, [src], pstep
@@ -309,14 +276,12 @@ pstep       RN  r1
    bne         simple_vnext8
-    ldmia       sp!, {r4 - r12, pc}
+    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|
 ; Constant Pool
 c0x80808080 DCD     0x80808080
 c0x03030303 DCD     0x03030303
 c0x04040404 DCD     0x04040404
 c0x01010101 DCD     0x01010101
 c0x7F7F7F7F DCD     0x7F7F7F7F
    END
--- a/vp8/common/arm/idct_arm.h
+++ b/vp8/common/arm/idct_arm.h
@@ -15,9 +15,9 @@
 #if HAVE_ARMV6
 extern prototype_idct(vp8_short_idct4x4llm_1_v6);
 extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
-extern prototype_idct_scalar(vp8_dc_only_idct_armv6);
+extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
-extern prototype_second_order(vp8_short_inv_walsh4x4_1_armv6);
+extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
-extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
+extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
 #undef  vp8_idct_idct1
 #define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
@@ -25,20 +25,20 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
 #undef  vp8_idct_idct16
 #define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
-#undef  vp8_idct_idct1_scalar
+#undef  vp8_idct_idct1_scalar_add
-#define vp8_idct_idct1_scalar vp8_dc_only_idct_armv6
+#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
 #undef  vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6
+#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
 #undef  vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_armv6
+#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
 #endif
 #if HAVE_ARMV7
 extern prototype_idct(vp8_short_idct4x4llm_1_neon);
 extern prototype_idct(vp8_short_idct4x4llm_neon);
-extern prototype_idct_scalar(vp8_dc_only_idct_neon);
+extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
 extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
 extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
@@ -48,8 +48,8 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
 #undef  vp8_idct_idct16
 #define vp8_idct_idct16 vp8_short_idct4x4llm_neon
-#undef  vp8_idct_idct1_scalar
+#undef  vp8_idct_idct1_scalar_add
-#define vp8_idct_idct1_scalar vp8_dc_only_idct_neon
+#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
 #undef  vp8_idct_iwalsh1
 #define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -14,16 +14,6 @@
 #include "loopfilter.h"
 #include "onyxc_int.h"
 typedef void loop_filter_uvfunction
 (
    unsigned char *u,   // source pointer
    int p,              // pitch
    const signed char *flimit,
    const signed char *limit,
    const signed char *thresh,
    unsigned char *v
 );
 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
--- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm
+++ b/vp8/common/arm/neon/dc_only_idct_add_neon.asm
@@ -0,0 +1,49 @@
 ;
 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license and patent
 ;  grant that can be found in the LICENSE file in the root of the source
 ;  tree. All contributing project authors may be found in the AUTHORS
 ;  file in the root of the source tree.
 ;
    EXPORT  |vp8_dc_only_idct_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
    AREA ||.text||, CODE, READONLY, ALIGN=2
 ;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
 ;                               unsigned char *dst_ptr, int pitch, int stride)
 ; r0  input_dc
 ; r1  pred_ptr
 ; r2  dst_ptr
 ; r3  pitch
 ; sp  stride
 |vp8_dc_only_idct_add_neon| PROC       ; 4x4 block: dst = saturate_u8(pred + ((input_dc + 4) >> 3))
    add             r0, r0, #4          ; input_dc + 4 (rounding term)
    asr             r0, r0, #3          ; a1 = (input_dc + 4) >> 3, arithmetic shift keeps the sign
    ldr             r12, [sp]           ; r12 = stride (fifth argument, passed on the stack)
    vdup.16         q0, r0              ; replicate a1 into all eight 16-bit lanes of q0
    vld1.32         {d2[0]}, [r1], r3   ; load 4 predictor bytes (row 0), then pred_ptr += pitch
    vld1.32         {d2[1]}, [r1], r3   ; row 1 into the upper half of d2
    vld1.32         {d4[0]}, [r1], r3   ; row 2 into the lower half of d4
    vld1.32         {d4[1]}, [r1]       ; row 3; last row, no post-increment
    vaddw.u8        q1, q0, d2          ; widen rows 0-1 from u8 to 16 bits and add a1
    vaddw.u8        q2, q0, d4          ; widen rows 2-3 from u8 to 16 bits and add a1
    vqmovun.s16     d2, q1              ; narrow signed 16-bit sums to u8 with saturation (0..255)
    vqmovun.s16     d4, q2              ; same for rows 2-3
    vst1.32         {d2[0]}, [r2], r12  ; store row 0 (4 bytes), then dst_ptr += stride
    vst1.32         {d2[1]}, [r2], r12  ; store row 1
    vst1.32         {d4[0]}, [r2], r12  ; store row 2
    vst1.32         {d4[1]}, [r2]       ; store row 3; last row, no post-increment
    bx             lr                   ; return; only caller-saved registers (r0, r12, q0-q2) were written
    ENDP
    END
--- a/vp8/common/arm/reconintra_arm.c
+++ b/vp8/common/arm/reconintra_arm.c
@@ -29,7 +29,7 @@ void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
    unsigned char *y_buffer = x->dst.y_buffer;
    unsigned char *ypred_ptr = x->predictor;
    int y_stride = x->dst.y_stride;
-    int mode = x->mbmi.mode;
+    int mode = x->mode_info_context->mbmi.mode;
    int Up = x->up_available;
    int Left = x->left_available;
@@ -52,7 +52,7 @@ void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
    unsigned char *y_buffer = x->dst.y_buffer;
    unsigned char *ypred_ptr = x->predictor;
    int y_stride = x->dst.y_stride;
-    int mode = x->mbmi.mode;
+    int mode = x->mode_info_context->mbmi.mode;
    int Up = x->up_available;
    int Left = x->left_available;
--- a/vp8/common/arm/systemdependent.c
+++ b/vp8/common/arm/systemdependent.c
@@ -43,7 +43,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    rtcd->idct.idct1        = vp8_short_idct4x4llm_1_neon;
    rtcd->idct.idct16       = vp8_short_idct4x4llm_neon;
    rtcd->idct.idct1_scalar = vp8_dc_only_idct_neon;
    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_neon;
    rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_neon;
@@ -75,7 +74,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    rtcd->idct.idct1        = vp8_short_idct4x4llm_1_v6;
    rtcd->idct.idct16       = vp8_short_idct4x4llm_v6_dual;
    rtcd->idct.idct1_scalar = vp8_dc_only_idct_armv6;
    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_armv6;
    rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_armv6;
@@ -128,11 +126,13 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_c;
 #endif
 #if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
    rtcd->postproc.down        = vp8_mbpost_proc_down_c;
    rtcd->postproc.across      = vp8_mbpost_proc_across_ip_c;
    rtcd->postproc.downacross  = vp8_post_proc_down_and_across_c;
    rtcd->postproc.addnoise    = vp8_plane_add_noise_c;
 #endif
 #endif
 #if HAVE_ARMV7
    vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby_neon;
--- a/vp8/common/arm/vpx_asm_offsets.c
+++ b/vp8/common/arm/vpx_asm_offsets.c
@@ -51,19 +51,18 @@ DEFINE(mb_dst_y_stride,                          offsetof(MACROBLOCKD, dst.y_str
 DEFINE(mb_dst_y_buffer,                         offsetof(MACROBLOCKD, dst.y_buffer));
 DEFINE(mb_dst_u_buffer,                         offsetof(MACROBLOCKD, dst.u_buffer));
 DEFINE(mb_dst_v_buffer,                         offsetof(MACROBLOCKD, dst.v_buffer));
 DEFINE(mb_mbmi_mode,                            offsetof(MACROBLOCKD, mbmi.mode));
 DEFINE(mb_up_available,                         offsetof(MACROBLOCKD, up_available));
 DEFINE(mb_left_available,                       offsetof(MACROBLOCKD, left_available));
 DEFINE(detok_scan,                              offsetof(DETOK, scan));
-DEFINE(detok_ptr_onyxblock2context_leftabove,    offsetof(DETOK, ptr_onyxblock2context_leftabove));
+DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
-DEFINE(detok_onyx_coef_tree_ptr,                  offsetof(DETOK, vp8_coef_tree_ptr));
+DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp8_coef_tree_ptr));
 DEFINE(detok_teb_base_ptr,                      offsetof(DETOK, teb_base_ptr));
 DEFINE(detok_norm_ptr,                          offsetof(DETOK, norm_ptr));
-DEFINE(detok_ptr_onyx_coef_bands_x,                offsetof(DETOK, ptr_onyx_coef_bands_x));
+DEFINE(detok_ptr_coef_bands_x,                  offsetof(DETOK, ptr_coef_bands_x));
-DEFINE(DETOK_A,                                 offsetof(DETOK, A));
+DEFINE(detok_A,                                 offsetof(DETOK, A));
-DEFINE(DETOK_L,                                 offsetof(DETOK, L));
+DEFINE(detok_L,                                 offsetof(DETOK, L));
 DEFINE(detok_qcoeff_start_ptr,                  offsetof(DETOK, qcoeff_start_ptr));
 DEFINE(detok_current_bc,                        offsetof(DETOK, current_bc));
--- a/vp8/common/blockd.c
+++ b/vp8/common/blockd.c
@@ -12,13 +12,13 @@
 #include "blockd.h"
 #include "vpx_mem/vpx_mem.h"
 void vp8_setup_temp_context(TEMP_CONTEXT *t, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int count)
 {
    vpx_memcpy(t->l, l, sizeof(ENTROPY_CONTEXT) * count);
    vpx_memcpy(t->a, a, sizeof(ENTROPY_CONTEXT) * count);
 }
 const int vp8_block2left[25] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0};
 const int vp8_block2above[25] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0};
 const int vp8_block2type[25] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1};
-const int vp8_block2context[25] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3};
+
 const unsigned char vp8_block2left[25] =
 {
    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
 };
 const unsigned char vp8_block2above[25] =
 {
    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
 };
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -49,19 +49,19 @@ typedef struct
 } POS;
-typedef int ENTROPY_CONTEXT;
+typedef char ENTROPY_CONTEXT;
 typedef struct
 {
-    ENTROPY_CONTEXT l[4];
+    ENTROPY_CONTEXT y1[4];
-    ENTROPY_CONTEXT a[4];
+    ENTROPY_CONTEXT u[2];
-} TEMP_CONTEXT;
+    ENTROPY_CONTEXT v[2];
    ENTROPY_CONTEXT y2;
 } ENTROPY_CONTEXT_PLANES;
 extern void vp8_setup_temp_context(TEMP_CONTEXT *t, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int count);
 extern const int vp8_block2left[25];
 extern const int vp8_block2above[25];
 extern const int vp8_block2type[25];
-extern const int vp8_block2context[25];
+
 extern const unsigned char vp8_block2left[25];
 extern const unsigned char vp8_block2above[25];
 #define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
    Dest = ((A)!=0) + ((B)!=0);
@@ -215,9 +215,10 @@ typedef struct
 {
    DECLARE_ALIGNED(16, short, diff[400]);      // from idct diff
    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
-    DECLARE_ALIGNED(16, short, reference[384]);
+//not used    DECLARE_ALIGNED(16, short, reference[384]);
    DECLARE_ALIGNED(16, short, qcoeff[400]);
    DECLARE_ALIGNED(16, short, dqcoeff[400]);
    DECLARE_ALIGNED(16, char,  eobs[25]);
    // 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries.
    BLOCKD block[25];
@@ -232,14 +233,12 @@ typedef struct
    FRAME_TYPE frame_type;
    MB_MODE_INFO mbmi;
    int up_available;
    int left_available;
    // Y,U,V,Y2
-    ENTROPY_CONTEXT *above_context[4];   // row of context for each plane
+    ENTROPY_CONTEXT_PLANES *above_context;
-    ENTROPY_CONTEXT(*left_context)[4];   // (up to) 4 contexts ""
+    ENTROPY_CONTEXT_PLANES *left_context;
    // 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active.
    unsigned char segmentation_enabled;
@@ -275,9 +274,6 @@ typedef struct
    int mb_to_top_edge;
    int mb_to_bottom_edge;
    //char * gf_active_ptr;
    signed char *gf_active_ptr;
    unsigned int frames_since_golden;
    unsigned int frames_till_alt_ref_frame;
    vp8_subpix_fn_t  subpixel_predict;
--- a/vp8/common/entropymode.c
+++ b/vp8/common/entropymode.c
@@ -264,8 +264,10 @@ void vp8_entropy_mode_init()
    vp8_tokens_from_tree(vp8_uv_mode_encodings,  vp8_uv_mode_tree);
    vp8_tokens_from_tree(vp8_mbsplit_encodings, vp8_mbsplit_tree);
-    vp8_tokens_from_tree(VP8_MVREFENCODINGS,   vp8_mv_ref_tree);
+    vp8_tokens_from_tree_offset(vp8_mv_ref_encoding_array,
-    vp8_tokens_from_tree(VP8_SUBMVREFENCODINGS, vp8_sub_mv_ref_tree);
+                                vp8_mv_ref_tree, NEARESTMV);
    vp8_tokens_from_tree_offset(vp8_sub_mv_ref_encoding_array,
                                vp8_sub_mv_ref_tree, LEFT4X4);
    vp8_tokens_from_tree(vp8_small_mvencodings, vp8_small_mvtree);
 }
--- a/vp8/common/entropymode.h
+++ b/vp8/common/entropymode.h
@@ -54,10 +54,6 @@ extern struct vp8_token_struct vp8_mbsplit_encodings  [VP8_NUMMBSPLITS];
 extern struct vp8_token_struct vp8_mv_ref_encoding_array    [VP8_MVREFS];
 extern struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS];
 #define VP8_MVREFENCODINGS      (vp8_mv_ref_encoding_array - NEARESTMV)
 #define VP8_SUBMVREFENCODINGS   (vp8_sub_mv_ref_encoding_array - LEFT4X4)
 extern const vp8_tree_index vp8_small_mvtree[];
 extern struct vp8_token_struct vp8_small_mvencodings [8];
--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -39,6 +39,9 @@ static void extend_plane_borders
    for (i = 0; i < h - 0 + 1; i++)
    {
        // Some linkers will complain if we call vpx_memset with el set to a
        // constant 0.
        if (el)
            vpx_memset(dest_ptr1, src_ptr1[0], el);
        vpx_memset(dest_ptr2, src_ptr2[0], er);
        src_ptr1  += sp;
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -32,7 +32,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    rtcd->idct.idct1        = vp8_short_idct4x4llm_1_c;
    rtcd->idct.idct16       = vp8_short_idct4x4llm_c;
-    rtcd->idct.idct1_scalar = vp8_dc_only_idct_c;
+    rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_c;
    rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_c;
@@ -61,7 +61,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c;
    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_c;
-#if CONFIG_POSTPROC || CONFIG_VP8_ENCODER
+#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
    rtcd->postproc.down        = vp8_mbpost_proc_down_c;
    rtcd->postproc.across      = vp8_mbpost_proc_across_ip_c;
    rtcd->postproc.downacross  = vp8_post_proc_down_and_across_c;
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -18,8 +18,10 @@
 #define prototype_idct(sym) \
    void sym(short *input, short *output, int pitch)
-#define prototype_idct_scalar(sym) \
+#define prototype_idct_scalar_add(sym) \
-    void sym(short input, short *output, int pitch)
+    void sym(short input, \
             unsigned char *pred, unsigned char *output, \
             int pitch, int stride)
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/idct_x86.h"
@@ -39,10 +41,10 @@ extern prototype_idct(vp8_idct_idct1);
 #endif
 extern prototype_idct(vp8_idct_idct16);
-#ifndef vp8_idct_idct1_scalar
+#ifndef vp8_idct_idct1_scalar_add
-#define vp8_idct_idct1_scalar vp8_dc_only_idct_c
+#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c
 #endif
-extern prototype_idct_scalar(vp8_idct_idct1_scalar);
+extern prototype_idct_scalar_add(vp8_idct_idct1_scalar_add);
 #ifndef vp8_idct_iwalsh1
@@ -56,14 +58,14 @@ extern prototype_second_order(vp8_idct_iwalsh1);
 extern prototype_second_order(vp8_idct_iwalsh16);
 typedef prototype_idct((*vp8_idct_fn_t));
-typedef prototype_idct_scalar((*vp8_idct_scalar_fn_t));
+typedef prototype_idct_scalar_add((*vp8_idct_scalar_add_fn_t));
 typedef prototype_second_order((*vp8_second_order_fn_t));
 typedef struct
 {
    vp8_idct_fn_t            idct1;
    vp8_idct_fn_t            idct16;
-    vp8_idct_scalar_fn_t  idct1_scalar;
+    vp8_idct_scalar_add_fn_t idct1_scalar_add;
    vp8_second_order_fn_t iwalsh1;
    vp8_second_order_fn_t iwalsh16;
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -104,23 +104,30 @@ void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch)
    }
 }
-
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
 void vp8_dc_only_idct_c(short input_dc, short *output, int pitch)
 {
-    int i;
+    int a1 = ((input_dc + 4) >> 3);
-    int a1;
+    int r, c;
    short *op = output;
    int shortpitch = pitch >> 1;
    a1 = ((input_dc + 4) >> 3);
-    for (i = 0; i < 4; i++)
+    for (r = 0; r < 4; r++)
    {
-        op[0] = a1;
+        for (c = 0; c < 4; c++)
-        op[1] = a1;
+        {
-        op[2] = a1;
+            int a = a1 + pred_ptr[c] ;
-        op[3] = a1;
+
-        op += shortpitch;
+            if (a < 0)
                a = 0;
            if (a > 255)
                a = 255;
            dst_ptr[c] = (unsigned char) a ;
        }
        dst_ptr += stride;
        pred_ptr += pitch;
    }
 }
 void vp8_short_inv_walsh4x4_c(short *input, short *output)
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c
@@ -65,7 +65,8 @@ void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x
 {
    int i;
-    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+    if (x->mode_info_context->mbmi.mode != B_PRED &&
        x->mode_info_context->mbmi.mode != SPLITMV)
    {
        // do 2nd order transform on the dc block
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -117,5 +117,14 @@ typedef struct
 #define LF_INVOKE(ctx,fn) vp8_lf_##fn
 #endif
 typedef void loop_filter_uvfunction
 (
    unsigned char *u,   // source pointer
    int p,              // pitch
    const signed char *flimit,
    const signed char *limit,
    const signed char *thresh,
    unsigned char *v
 );
 #endif
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -18,7 +18,7 @@
 typedef unsigned char uc;
-__inline signed char vp8_signed_char_clamp(int t)
+static __inline signed char vp8_signed_char_clamp(int t)
 {
    t = (t < -128 ? -128 : t);
    t = (t > 127 ? 127 : t);
@@ -27,7 +27,7 @@ __inline signed char vp8_signed_char_clamp(int t)
 // should we apply any filter at all ( 11111111 yes, 00000000 no)
-__inline signed char vp8_filter_mask(signed char limit, signed char flimit,
+static __inline signed char vp8_filter_mask(signed char limit, signed char flimit,
                                     uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)
 {
    signed char mask = 0;
@@ -47,7 +47,7 @@ __inline signed char vp8_filter_mask(signed char limit, signed char flimit,
 }
 // is there high variance internal edge ( 11111111 yes, 00000000 no)
-__inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
+static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
 {
    signed char hev = 0;
    hev  |= (abs(p1 - p0) > thresh) * -1;
@@ -55,7 +55,7 @@ __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
    return hev;
 }
-__inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1)
+static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1)
 {
    signed char ps0, qs0;
@@ -161,7 +161,7 @@ void vp8_loop_filter_vertical_edge_c
    while (++i < count * 8);
 }
-__inline void vp8_mbfilter(signed char mask, signed char hev,
+static __inline void vp8_mbfilter(signed char mask, signed char hev,
                           uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
 {
    signed char s, u;
@@ -281,7 +281,7 @@ void vp8_mbloop_filter_vertical_edge_c
 }
 // should we apply any filter at all ( 11111111 yes, 00000000 no)
-__inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1)
+static __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1)
 {
 // Why does this cause problems for win32?
 // error C2143: syntax error : missing ';' before 'type'
@@ -294,7 +294,7 @@ __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimi
    return mask;
 }
-__inline void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
+static __inline void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
 {
    signed char vp8_filter, Filter1, Filter2;
    signed char p1 = (signed char) * op1 ^ 0x80;
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -33,6 +33,7 @@ void vp8_initialize_common(void);
 #define MAXQ 127
 #define QINDEX_RANGE (MAXQ + 1)
 #define NUM_YV12_BUFFERS 4
 typedef struct frame_contexts
 {
@@ -94,15 +95,16 @@ typedef struct VP8Common
    YUV_TYPE clr_type;
    CLAMP_TYPE  clamp_type;
    YV12_BUFFER_CONFIG last_frame;
    YV12_BUFFER_CONFIG golden_frame;
    YV12_BUFFER_CONFIG alt_ref_frame;
    YV12_BUFFER_CONFIG new_frame;
    YV12_BUFFER_CONFIG *frame_to_show;
    YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
    int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
    int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
    YV12_BUFFER_CONFIG post_proc_buffer;
    YV12_BUFFER_CONFIG temp_scale_frame;
-    FRAME_TYPE last_frame_type;  //Add to check if vp8_frame_init_loop_filter() can be skiped.
+    FRAME_TYPE last_frame_type;  //Add to check if vp8_frame_init_loop_filter() can be skipped.
    FRAME_TYPE frame_type;
    int show_frame;
@@ -131,8 +133,6 @@ typedef struct VP8Common
    unsigned int frames_since_golden;
    unsigned int frames_till_alt_ref_frame;
    unsigned char *gf_active_flags;   // Record of which MBs still refer to last golden frame either directly or through 0,0
    int gf_active_count;
    /* We allocate a MODE_INFO struct for each macroblock, together with
       an extra row on top and column on the left to simplify prediction. */
@@ -165,8 +165,8 @@ typedef struct VP8Common
    int ref_frame_sign_bias[MAX_REF_FRAMES];    // Two state 0, 1
    // Y,U,V,Y2
-    ENTROPY_CONTEXT *above_context[4];   // row of context for each plane
+    ENTROPY_CONTEXT_PLANES *above_context;   // row of context for each plane
-    ENTROPY_CONTEXT left_context[4][4];  // (up to) 4 contexts ""
+    ENTROPY_CONTEXT_PLANES left_context;  // (up to) 4 contexts ""
    // keyframe block modes are predicted by their above, left neighbors
@@ -201,6 +201,7 @@ typedef struct VP8Common
 void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level);
 void vp8_init_loop_filter(VP8_COMMON *cm);
 void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type);
 extern void vp8_loop_filter_frame(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val);
 #endif
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -330,13 +330,6 @@ void vp8_de_noise(YV12_BUFFER_CONFIG         *source,
 }
 //Notes: It is better to change CHAR to unsigned or signed to
 //avoid error on ARM platform.
 char vp8_an[8][64][3072];
 int vp8_cd[8][64];
 double vp8_gaussian(double sigma, double mu, double x)
 {
    return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -210,7 +210,8 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
 {
    int i;
-    if (x->mbmi.ref_frame != INTRA_FRAME && x->mbmi.mode != SPLITMV)
+    if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
        x->mode_info_context->mbmi.mode != SPLITMV)
    {
        unsigned char *uptr, *vptr;
        unsigned char *upred_ptr = &x->predictor[256];
@@ -254,16 +255,18 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
    }
 }
-
+//encoder only
 void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
 {
-    if (x->mbmi.ref_frame != INTRA_FRAME && x->mbmi.mode != SPLITMV)
+
  if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
      x->mode_info_context->mbmi.mode != SPLITMV)
    {
        unsigned char *ptr_base;
        unsigned char *ptr;
        unsigned char *pred_ptr = x->predictor;
-        int mv_row = x->mbmi.mv.as_mv.row;
+        int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
-        int mv_col = x->mbmi.mv.as_mv.col;
+        int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
        int pre_stride = x->block[0].pre_stride;
        ptr_base = x->pre.y_buffer;
@@ -282,7 +285,7 @@ void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
    {
        int i;
-        if (x->mbmi.partitioning < 3)
+        if (x->mode_info_context->mbmi.partitioning < 3)
        {
            for (i = 0; i < 4; i++)
            {
@@ -313,7 +316,9 @@ void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
 void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
 {
-    if (x->mbmi.ref_frame != INTRA_FRAME && x->mbmi.mode != SPLITMV)
+
    if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
        x->mode_info_context->mbmi.mode != SPLITMV)
    {
        int offset;
        unsigned char *ptr_base;
@@ -323,8 +328,8 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
        unsigned char *upred_ptr = &x->predictor[256];
        unsigned char *vpred_ptr = &x->predictor[320];
-        int mv_row = x->mbmi.mv.as_mv.row;
+        int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
-        int mv_col = x->mbmi.mv.as_mv.col;
+        int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
        int pre_stride = x->block[0].pre_stride;
        ptr_base = x->pre.y_buffer;
@@ -361,7 +366,7 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
    {
        int i;
-        if (x->mbmi.partitioning < 3)
+        if (x->mode_info_context->mbmi.partitioning < 3)
        {
            for (i = 0; i < 4; i++)
            {
@@ -410,7 +415,7 @@ void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
 {
    int i, j;
-    if (x->mbmi.mode == SPLITMV)
+    if (x->mode_info_context->mbmi.mode == SPLITMV)
    {
        for (i = 0; i < 2; i++)
        {
@@ -455,8 +460,8 @@ void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
    }
    else
    {
-        int mvrow = x->mbmi.mv.as_mv.row;
+        int mvrow = x->mode_info_context->mbmi.mv.as_mv.row;
-        int mvcol = x->mbmi.mv.as_mv.col;
+        int mvcol = x->mode_info_context->mbmi.mv.as_mv.col;
        if (mvrow < 0)
            mvrow -= 1;
@@ -535,7 +540,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
    unsigned char *pred_ptr = x->predictor;
    unsigned char *dst_ptr = x->dst.y_buffer;
-    if (x->mbmi.mode != SPLITMV)
+    if (x->mode_info_context->mbmi.mode != SPLITMV)
    {
        int offset;
        unsigned char *ptr_base;
@@ -547,8 +552,8 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
        unsigned char *udst_ptr = x->dst.u_buffer;
        unsigned char *vdst_ptr = x->dst.v_buffer;
-        int mv_row = x->mbmi.mv.as_mv.row;
+        int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
-        int mv_col = x->mbmi.mv.as_mv.col;
+        int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
        int pre_stride = x->dst.y_stride; //x->block[0].pre_stride;
        ptr_base = x->pre.y_buffer;
@@ -587,7 +592,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
        //if sth is wrong, go back to what it is in build_inter_predictors_mb.
        int i;
-        if (x->mbmi.partitioning < 3)
+        if (x->mode_info_context->mbmi.partitioning < 3)
        {
            for (i = 0; i < 4; i++)
            {
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -43,7 +43,7 @@ void vp8_build_intra_predictors_mby(MACROBLOCKD *x)
    }
    // for Y
-    switch (x->mbmi.mode)
+    switch (x->mode_info_context->mbmi.mode)
    {
    case DC_PRED:
    {
@@ -164,7 +164,7 @@ void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x)
    }
    // for Y
-    switch (x->mbmi.mode)
+    switch (x->mode_info_context->mbmi.mode)
    {
    case DC_PRED:
    {
@@ -290,7 +290,7 @@ void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x)
        vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
    }
-    switch (x->mbmi.uv_mode)
+    switch (x->mode_info_context->mbmi.uv_mode)
    {
    case DC_PRED:
    {
@@ -430,7 +430,7 @@ void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x)
        vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
    }
-    switch (x->mbmi.uv_mode)
+    switch (x->mode_info_context->mbmi.uv_mode)
    {
    case DC_PRED:
    {
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -75,7 +75,8 @@
 #define thread_sleep(nms) // { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);}
 #else
 #include <unistd.h>
-#define thread_sleep(nms) usleep(nms*1000);// {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);}
+#include <sched.h>
 #define thread_sleep(nms) sched_yield();// {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);}
 #endif
 /* Not Windows. Assume pthreads */
--- a/vp8/common/treecoder.c
+++ b/vp8/common/treecoder.c
@@ -47,6 +47,12 @@ void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t)
    tree2tok(p, t, 0, 0, 0);
 }
 /* Same tree walk as vp8_tokens_from_tree(), except that the token table
  * pointer handed to tree2tok() is p moved back by `offset` entries. */
 void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t,
                                 int offset)
 {
    struct vp8_token_struct *const shifted_table = p - offset;
    tree2tok(shifted_table, t, 0, 0, 0);
 }
 static void branch_counts(
    int n,                      /* n = size of alphabet */
    vp8_token tok               [ /* n */ ],
--- a/vp8/common/treecoder.h
+++ b/vp8/common/treecoder.h
@@ -54,6 +54,8 @@ typedef const struct vp8_token_struct
 /* Construct encoding array from tree. */
 void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree);
 void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree,
                                 int offset);
 /* Convert array of token occurrence counts into a table of probabilities
--- a/vp8/common/x86/idct_x86.h
+++ b/vp8/common/x86/idct_x86.h
@@ -22,7 +22,7 @@
 #if HAVE_MMX
 extern prototype_idct(vp8_short_idct4x4llm_1_mmx);
 extern prototype_idct(vp8_short_idct4x4llm_mmx);
-extern prototype_idct_scalar(vp8_dc_only_idct_mmx);
+extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
 extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
 extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
@@ -34,8 +34,8 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
 #undef  vp8_idct_idct16
 #define vp8_idct_idct16 vp8_short_idct4x4llm_mmx
-#undef  vp8_idct_idct1_scalar
+#undef  vp8_idct_idct1_scalar_add
-#define vp8_idct_idct1_scalar vp8_dc_only_idct_mmx
+#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_mmx
 #undef vp8_idct_iwalsh16
 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx
--- a/vp8/common/x86/idctllm_mmx.asm
+++ b/vp8/common/x86/idctllm_mmx.asm
@@ -220,35 +220,61 @@ sym(vp8_short_idct4x4llm_1_mmx):
    pop         rbp
    ret
-;void dc_only_idct_mmx(short input_dc, short *output, int pitch)
+;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
-global sym(vp8_dc_only_idct_mmx)
+global sym(vp8_dc_only_idct_add_mmx)
-sym(vp8_dc_only_idct_mmx):
+sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
+    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
-        movd        mm0,            arg(0) ;input_dc
+        mov         rsi,            arg(1) ;s -- prediction
        mov         rdi,            arg(2) ;d -- destination
        movsxd      rax,            dword ptr arg(4) ;stride
        movsxd      rdx,            dword ptr arg(3) ;pitch
        pxor        mm0,            mm0
-        paddw       mm0,            [fours GLOBAL]
+        movd        mm5,            arg(0) ;input_dc
        mov         rdx,            arg(1) ;output
-        psraw       mm0,            3
+        paddw       mm5,            [fours GLOBAL]
        movsxd      rax,            dword ptr arg(2) ;pitch
-        punpcklwd   mm0,            mm0
+        psraw       mm5,            3
        punpckldq   mm0,            mm0
-        movq        [rdx],          mm0
+        punpcklwd   mm5,            mm5
-        movq        [rdx+rax],      mm0
+        punpckldq   mm5,            mm5
-        movq        [rdx+rax*2],    mm0
+        movd        mm1,            [rsi]
-        add         rdx,            rax
+        punpcklbw   mm1,            mm0
        paddsw      mm1,            mm5
        packuswb    mm1,            mm0              ; pack and unpack to saturate
        movd        [rdi],          mm1
-        movq        [rdx+rax*2],    mm0
+        movd        mm2,            [rsi+rdx]
        punpcklbw   mm2,            mm0
        paddsw      mm2,            mm5
        packuswb    mm2,            mm0              ; pack and unpack to saturate
        movd        [rdi+rax],      mm2
        movd        mm3,            [rsi+2*rdx]
        punpcklbw   mm3,            mm0
        paddsw      mm3,            mm5
        packuswb    mm3,            mm0              ; pack and unpack to saturate
        movd        [rdi+2*rax],    mm3
        add         rdi,            rax
        add         rsi,            rdx
        movd        mm4,            [rsi+2*rdx]
        punpcklbw   mm4,            mm0
        paddsw      mm4,            mm5
        packuswb    mm4,            mm0              ; pack and unpack to saturate
        movd        [rdi+2*rax],    mm4
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -0,0 +1,708 @@
 ;
 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 %include "vpx_ports/x86_abi_support.asm"
 ;void idct_dequant_0_2x_sse2
 ; (
 ;   short *qcoeff       - 0
 ;   short *dequant      - 1
 ;   unsigned char *pre  - 2
 ;   unsigned char *dst  - 3
 ;   int dst_stride      - 4
 ;   int blk_stride      - 5
 ; )
 ;
 ; DC-only reconstruction for two blocks at once.
 ;   * The word at qcoeff and the word 32 bytes further on (presumably the
 ;     first coefficient of the second block) are each multiplied by the
 ;     first word of dequant.
 ;   * Each product has the `fours` constant added (defined outside this
 ;     routine; presumably 4 in every word) and is shifted right by 3.
 ;   * The result is added to four 8-byte rows of pre (rows blk_stride bytes
 ;     apart): the low four pixels of a row get the first value, the high
 ;     four pixels get the second.
 ;   * The sums are packed to bytes with unsigned saturation and written as
 ;     four 8-byte rows to dst (rows dst_stride bytes apart).
 ;   * The first 4 bytes at qcoeff and at qcoeff+32 are set to zero.
 ;
 ; NOTE(review): xmm7 is overwritten and not restored here; it is a
 ; callee-saved register under the Win64 ABI -- confirm the build's ABI
 ; macros or the callers account for that.
 global sym(idct_dequant_0_2x_sse2)
 sym(idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    ; end prolog
        mov         rdx,            arg(1) ; dequant
        mov         rax,            arg(0) ; qcoeff
    ; Zero out xmm7, for use unpacking
        pxor        xmm7,           xmm7
    ; word lane 0 <- [qcoeff], word lane 4 <- [qcoeff+32];
    ; the first dequant word is placed in the same two lanes
        movd        xmm4,           [rax]
        movd        xmm5,           [rdx]
        pinsrw      xmm4,           [rax+32],   4
        pinsrw      xmm5,           [rdx],      4
        pmullw      xmm4,           xmm5
    ; clear coeffs
        movd        [rax],          xmm7
        movd        [rax+32],       xmm7
 ;pshufb
    ; broadcast lane 0 over the low four words and lane 4 over the high four
        pshuflw     xmm4,           xmm4,       00000000b
        pshufhw     xmm4,           xmm4,       00000000b
        mov         rax,            arg(2) ; pre
        paddw       xmm4,           [fours GLOBAL]
        movsxd      rcx,            dword ptr arg(5) ; blk_stride
        psraw       xmm4,           3
    ; load four 8-byte prediction rows
        movq        xmm0,           [rax]
        movq        xmm1,           [rax+rcx]
        movq        xmm2,           [rax+2*rcx]
        lea         rcx,            [3*rcx]
        movq        xmm3,           [rax+rcx]
    ; widen prediction bytes to words
        punpcklbw   xmm0,           xmm7
        punpcklbw   xmm1,           xmm7
        punpcklbw   xmm2,           xmm7
        punpcklbw   xmm3,           xmm7
        mov         rax,            arg(3) ; dst
        movsxd      rdx,            dword ptr arg(4) ; dst_stride
    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4
    ; pack up before storing
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7
    ; store blocks back out
        movq        [rax],          xmm0
        movq        [rax + rdx],    xmm1
        lea         rax,            [rax + 2*rdx]
        movq        [rax],          xmm2
        movq        [rax + rdx],    xmm3
    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; idct_dequant_full_2x_sse2
 ;   Arguments, as read by the arg() loads in the body:
 ;     0 - qcoeff, 1 - dequant, 2 - pre, 3 - dst,
 ;     4 - dst_stride, 5 - blk_stride
 ;
 ; Full dequantize + two-pass inverse transform for two blocks at once.
 ;   * 64 bytes are read from qcoeff and multiplied word-wise by the 32 bytes
 ;     at dequant (the same dequant words are applied to both 32-byte halves).
 ;   * The 64 bytes at qcoeff are then set to zero.
 ;   * After the two transform passes the result is added to four 8-byte rows
 ;     of pre (rows blk_stride bytes apart), packed to bytes with unsigned
 ;     saturation, and written as four 8-byte rows to dst (rows dst_stride
 ;     bytes apart).
 ;   * qcoeff and dequant are accessed with movdqa / pmullw memory operands,
 ;     so both must be 16-byte aligned.
 ;
 ; NOTE(review): SHADOW_ARGS_TO_STACK is given 7 although only arg(0)..arg(5)
 ; are read in this routine -- confirm the count.
 ; NOTE(review): xmm6 and xmm7 are overwritten and not restored; they are
 ; callee-saved registers under the Win64 ABI -- confirm the build's ABI
 ; macros or the callers account for that.
 global sym(idct_dequant_full_2x_sse2)
 sym(idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    ; general case: every coefficient of both blocks is loaded from
    ; qcoeff, dequantized and run through the two transform passes
        mov         rax,            arg(0) ; qcoeff
        mov         rsi,            arg(2) ; pre
        mov         rdi,            arg(3) ; dst
        movsxd      rcx,            dword ptr arg(5) ; blk_stride
    ; Zero out xmm7; it is used to clear the coefficients below and is
    ;   zeroed again before the final unpack
        pxor        xmm7,           xmm7
        mov         rdx,            arg(1)  ; dequant
    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
        movdqa      xmm0,           [rax]
        movdqa      xmm2,           [rax+16]
        movdqa      xmm1,           [rax+32]
        movdqa      xmm3,           [rax+48]
    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7
    ; dequantize qcoeff buffer
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]
    ; repack so block 0 row x and block 1 row x are together
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1
        pshufd      xmm0,           xmm0,       11011000b
        pshufd      xmm1,           xmm4,       11011000b
        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3
        pshufd      xmm2,           xmm2,       11011000b
        pshufd      xmm3,           xmm4,       11011000b
    ; first pass
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2        ;
        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2
        pmulhw      xmm5,           [x_s1sqr2 GLOBAL]
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [x_c1sqr2less1 GLOBAL]
        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; c1
        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3
        pmulhw      xmm5,           [x_c1sqr2less1 GLOBAL]
        paddw       xmm5,           xmm1
        pmulhw      xmm3,           [x_s1sqr2 GLOBAL]
        paddw       xmm3,           xmm4
        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1
        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ;0
        paddw       xmm4,           xmm7        ;1
        psubw       xmm0,           xmm7        ;2
        psubw       xmm6,           xmm3        ;3
    ; transpose for the second pass
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b
        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b
    ; second pass
        psubw       xmm0,           xmm2            ; b1 = 0-2
        paddw       xmm2,           xmm2
        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0            ; a1 = 0+2
        pmulhw      xmm5,           [x_s1sqr2 GLOBAL]
        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [x_c1sqr2less1 GLOBAL]
        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5            ; c1
        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3
        pmulhw      xmm5,           [x_c1sqr2less1 GLOBAL]
        paddw       xmm5,           xmm1
        pmulhw      xmm3,           [x_s1sqr2 GLOBAL]
        paddw       xmm3,           xmm4
        paddw       xmm3,           xmm5            ; d1
    ; the `fours` constant is added to a1 and b1 before the butterflies, so
    ;   all four outputs carry it ahead of the shift right by 3
        paddw       xmm0,           [fours GLOBAL]
        paddw       xmm2,           [fours GLOBAL]
        movdqa      xmm6,           xmm2            ; a1
        movdqa      xmm4,           xmm0            ; b1
        paddw       xmm2,           xmm3            ;0
        paddw       xmm4,           xmm7            ;1
        psubw       xmm0,           xmm7            ;2
        psubw       xmm6,           xmm3            ;3
        psraw       xmm2,           3
        psraw       xmm0,           3
        psraw       xmm4,           3
        psraw       xmm6,           3
    ; transpose to save
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b
        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b
    ; xmm7 was used as scratch above; zero it again for the byte->word unpack
        pxor        xmm7,           xmm7
    ; Load up predict blocks
        movq        xmm4,           [rsi]
        movq        xmm5,           [rsi+rcx]
        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5
        movq        xmm4,           [rsi+2*rcx]
        lea         rcx,            [3*rcx]
        movq        xmm5,           [rsi+rcx]
        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5
    ; (no jump to this label appears within this routine)
 .finish:
    ; pack up before storing
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7
    ; Load destination stride before writing out,
    ;   doesn't need to persist
        movsxd      rdx,            dword ptr arg(4) ; dst_stride
    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        lea         rdi,            [rdi + 2*rdx]
        movq        [rdi],          xmm2
        movq        [rdi + rdx],    xmm3
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void idct_dequant_dc_0_2x_sse2
 ; (
 ;   short *qcoeff       - 0
 ;   short *dequant      - 1
 ;   unsigned char *pre  - 2
 ;   unsigned char *dst  - 3
 ;   int dst_stride      - 4
 ;   short *dc           - 5
 ; )
 ;
 ; DC-only reconstruction for two blocks at once, with the two DC values
 ; taken from the dc argument instead of from qcoeff.
 ;   * The two words at dc are spread so that the low four word lanes hold
 ;     dc[0] and the high four hold dc[1]; the `fours` constant (defined
 ;     outside this routine; presumably 4 in every word) is added and the
 ;     lanes are shifted right by 3.
 ;   * The result is added to four 8-byte rows of pre, read at a fixed
 ;     spacing of 16 bytes (there is no stride argument for pre here).
 ;   * The sums are packed to bytes with unsigned saturation and written as
 ;     four 8-byte rows to dst (rows dst_stride bytes apart).
 ;   * Nothing is read from or written to qcoeff or dequant in this routine.
 ;
 ; NOTE(review): SHADOW_ARGS_TO_STACK is given 7 although only arg(0)..arg(5)
 ; are referenced -- confirm the count.
 ; NOTE(review): xmm7 is overwritten and not restored; it is a callee-saved
 ; register under the Win64 ABI -- confirm the build's ABI macros or the
 ; callers account for that.
 global sym(idct_dequant_dc_0_2x_sse2)
 sym(idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is set as first coeff, so no need to load qcoeff
    ; (rax is still loaded with qcoeff below, but is not read again
    ;  anywhere in this routine)
        mov         rax,            arg(0) ; qcoeff
        mov         rsi,            arg(2) ; pre
        mov         rdi,            arg(3) ; dst
        mov         rdx,            arg(5) ; dc
    ; Zero out xmm7, for use unpacking
        pxor        xmm7,           xmm7
    ; load up 2 dc words here == 2*16 = doubleword
        movd        xmm4,           [rdx]
    ; Load up predict blocks
        movq        xmm0,           [rsi]
        movq        xmm1,           [rsi+16]
        movq        xmm2,           [rsi+32]
        movq        xmm3,           [rsi+48]
    ; Duplicate and expand dc across
        punpcklwd   xmm4,           xmm4
        punpckldq   xmm4,           xmm4
    ; Round and downshift: add the `fours` constant, then >> 3
        paddw       xmm4,           [fours GLOBAL]
        psraw       xmm4,           3
    ; Predict buffer needs to be expanded from bytes to words
        punpcklbw   xmm0,           xmm7
        punpcklbw   xmm1,           xmm7
        punpcklbw   xmm2,           xmm7
        punpcklbw   xmm3,           xmm7
    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4
    ; pack up before storing
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7
    ; Load destination stride before writing out,
    ;   doesn't need to persist
        movsxd      rdx,            dword ptr arg(4) ; dst_stride
    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        lea         rdi,            [rdi + 2*rdx]
        movq        [rdi],          xmm2
        movq        [rdi + rdx],    xmm3
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 global sym(idct_dequant_dc_full_2x_sse2)
 sym(idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is set as first coeff, so no need to load qcoeff
        mov         rax,            arg(0) ; qcoeff
        mov         rsi,            arg(2) ; pre
        mov         rdi,            arg(3) ; dst
    ; Zero out xmm7, for use unpacking
        pxor        xmm7,           xmm7
        mov         rdx,            arg(1)  ; dequant
    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensicle data
        movdqa      xmm0,           [rax]
        movdqa      xmm2,           [rax+16]
        movdqa      xmm1,           [rax+32]
        movdqa      xmm3,           [rax+48]
    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7
    ; dequantize qcoeff buffer
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]
    ; DC component
        mov         rdx,            arg(5)
    ; repack so block 0 row x and block 1 row x are together
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1
        pshufd      xmm0,           xmm0,       11011000b
        pshufd      xmm1,           xmm4,       11011000b
        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3
        pshufd      xmm2,           xmm2,       11011000b
        pshufd      xmm3,           xmm4,       11011000b
    ; insert DC component
        pinsrw      xmm0,           [rdx],      0
        pinsrw      xmm0,           [rdx+2],    4
    ; first pass
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2        ;
        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2
        pmulhw      xmm5,           [x_s1sqr2 GLOBAL]
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [x_c1sqr2less1 GLOBAL]
        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; c1
        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3
        pmulhw      xmm5,           [x_c1sqr2less1 GLOBAL]
        paddw       xmm5,           xmm1
        pmulhw      xmm3,           [x_s1sqr2 GLOBAL]
        paddw       xmm3,           xmm4
        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1
        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ;0
        paddw       xmm4,           xmm7        ;1
        psubw       xmm0,           xmm7        ;2
        psubw       xmm6,           xmm3        ;3
    ; transpose for the second pass
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b
        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b
    ; second pass
        psubw       xmm0,           xmm2            ; b1 = 0-2
        paddw       xmm2,           xmm2
        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0            ; a1 = 0+2
        pmulhw      xmm5,           [x_s1sqr2 GLOBAL]
        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [x_c1sqr2less1 GLOBAL]
        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5            ; c1
        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3
        pmulhw      xmm5,           [x_c1sqr2less1 GLOBAL]
        paddw       xmm5,           xmm1
        pmulhw      xmm3,           [x_s1sqr2 GLOBAL]
        paddw       xmm3,           xmm4
        paddw       xmm3,           xmm5            ; d1
        paddw       xmm0,           [fours GLOBAL]
        paddw       xmm2,           [fours GLOBAL]
        movdqa      xmm6,           xmm2            ; a1
        movdqa      xmm4,           xmm0            ; b1
        paddw       xmm2,           xmm3            ;0
        paddw       xmm4,           xmm7            ;1
        psubw       xmm0,           xmm7            ;2
        psubw       xmm6,           xmm3            ;3
        psraw       xmm2,           3
        psraw       xmm0,           3
        psraw       xmm4,           3
        psraw       xmm6,           3
    ; transpose to save
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b
        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b
        pxor        xmm7,           xmm7
    ; Load up predict blocks
        movq        xmm4,           [rsi]
        movq        xmm5,           [rsi+16]
        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5
        movq        xmm4,           [rsi+32]
        movq        xmm5,           [rsi+48]
        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5
 .finish:
    ; pack up before storing
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7
    ; Load destination stride before writing out,
    ;   doesn't need to persist
        movsxd      rdx,            dword ptr arg(4) ; dst_stride
    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        lea         rdi,            [rdi + 2*rdx]
        movq        [rdi],          xmm2
        movq        [rdi + rdx],    xmm3
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Read-only constants used by the transform code above. Each one is a
 ; 16-byte-aligned vector holding the same 16-bit word in all eight lanes.
 SECTION_RODATA
 align 16
 ; Rounding term: added to the a1/b1 terms of the second pass before the
 ; results are arithmetically shifted right by 3.
 fours:
    times 8 dw 0x0004
 align 16
 ; Multiplier for the "sin(pi/8) * sqrt(2)" products. 0x8A8C is negative when
 ; read as a signed word by pmulhw, which is why each pmulhw by this constant
 ; is followed by a paddw of the original operand.
 x_s1sqr2:
    times 8 dw 0x8A8C
 align 16
 ; Multiplier for the "cos(pi/8) * sqrt(2)" products, stored "less 1" as the
 ; name says; the operand is added back with paddw after the pmulhw.
 x_c1sqr2less1:
    times 8 dw 0x4E7B
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -34,6 +34,11 @@ prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2);
 prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
 prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2);
 extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
 extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
 extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
 extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
 #if HAVE_MMX
 // Horizontal MB filtering
 void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -157,10 +162,7 @@ void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
    if (v_ptr)
        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
 }
@@ -183,10 +185,7 @@ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
    if (v_ptr)
        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
 }
@@ -211,10 +210,7 @@ void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
    if (v_ptr)
        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
 }
@@ -241,10 +237,7 @@ void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
    vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
    if (u_ptr)
-        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
    if (v_ptr)
        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
 }
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -731,7 +731,7 @@ rd:
    times 4 dw 0x40
 align 16
-global sym(vp8_six_tap_mmx) HIDDEN_DATA
+global HIDDEN_DATA(sym(vp8_six_tap_mmx))
 sym(vp8_six_tap_mmx):
    times 8 dw 0
    times 8 dw 0
@@ -791,7 +791,7 @@ sym(vp8_six_tap_mmx):
 align 16
-global sym(vp8_bilinear_filters_mmx) HIDDEN_DATA
+global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
 sym(vp8_bilinear_filters_mmx):
    times 8 dw 128
    times 8 dw 0
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -0,0 +1,931 @@
 ;
 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 %include "vpx_ports/x86_abi_support.asm"
 %define BLOCK_HEIGHT_WIDTH 4
 %define VP8_FILTER_WEIGHT 128
 %define VP8_FILTER_SHIFT  7
 ;/************************************************************************************
 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
 ; input pixel array has output_height rows. This routine assumes that output_height is an
 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
 ; row each iteration to take advantage of the 128-bit operations.
 ;
 ; This is an implementation of some of the SSE optimizations first seen in ffvp8
 ;
 ;*************************************************************************************/
 ;void vp8_filter_block1d8_h6_ssse3
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
 ;    unsigned char *output_ptr,
 ;    unsigned int    output_pitch,
 ;    unsigned int    output_height,
 ;    unsigned int    vp8_filter_index
 ;)
 ; Horizontal six-tap filter producing 8 output pixels per row, one row per
 ; loop iteration.
 ;   arg(0) src_ptr              arg(1) src_pixels_per_line
 ;   arg(2) output_ptr           arg(3) output_pitch
 ;   arg(4) output_height        arg(5) vp8_filter_index
 ; The filter index selects one 16-byte entry in each of the k0_k5, k1_k3
 ; (+128) and k2_k4 (+256) tables. If the k0/k5 entry is zero the cheaper
 ; four-tap path at vp8_filter_block1d8_h4_ssse3 is used instead.
 ; Every row loads 16 bytes starting at src - 2 and stores 8 bytes.
 ; The loops are bottom-tested, so output_height must be non-zero.
 ; xmm6/xmm7 are written here, so they are preserved with SAVE_XMM /
 ; RESTORE_XMM (they are callee-saved under the Win64 calling convention).
 global sym(vp8_filter_block1d8_h6_ssse3)
 sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;zero, compared against k0_k5 below
    shl         rdx, 4                  ;16 bytes per table entry
    movdqa      xmm7, [rd GLOBAL]       ;rounding constant
    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx
    mov         rdi, arg(2)             ;output_ptr
    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> four-tap path
    je          vp8_filter_block1d8_h4_ssse3
    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch
    sub         rdi, rdx                ;pre-decrement: loop adds pitch before storing
 ;xmm3 free
 filter_block1d8_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
    movdqa      xmm1, xmm0
    pshufb      xmm0, [shuf1b GLOBAL]
    movdqa      xmm2, xmm1
    pshufb      xmm1, [shuf2b GLOBAL]
    pmaddubsw   xmm0, xmm4
    pmaddubsw   xmm1, xmm5
    pshufb      xmm2, [shuf3b GLOBAL]
    add         rdi, rdx
    pmaddubsw   xmm2, xmm6
    lea         rsi,    [rsi + rax]
    dec         rcx                     ;flags consumed by the jnz below
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    movq        MMWORD Ptr [rdi], xmm0
    jnz         filter_block1d8_h6_rowloop_ssse3
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Four-tap variant: entered from above with rax -> k0_k5 entry,
 ; rdi = output_ptr and xmm7 = rounding constant already set up.
 vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm3, XMMWORD PTR [shuf2b GLOBAL]
    movdqa      xmm4, XMMWORD PTR [shuf3b GLOBAL]
    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch
    sub         rdi, rdx                ;pre-decrement: loop adds pitch before storing
 ;xmm3 = shuf2b, xmm4 = shuf3b (kept in registers for the loop); xmm1 free
 filter_block1d8_h4_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
    movdqa      xmm2, xmm0
    pshufb      xmm0, xmm3 ;[shuf2b GLOBAL]
    pshufb      xmm2, xmm4 ;[shuf3b GLOBAL]
    pmaddubsw   xmm0, xmm5
    add         rdi, rdx
    pmaddubsw   xmm2, xmm6
    lea         rsi,    [rsi + rax]
    dec         rcx                     ;flags consumed by the jnz below
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    movq        MMWORD Ptr [rdi], xmm0
    jnz         filter_block1d8_h4_rowloop_ssse3
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp8_filter_block1d16_h6_ssse3
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
 ;    unsigned char  *output_ptr,
 ;    unsigned int    output_pitch,
 ;    unsigned int    output_height,
 ;    unsigned int    vp8_filter_index
 ;)
 ; Horizontal six-tap filter producing 16 output pixels per row, one row per
 ; loop iteration.
 ;   arg(0) src_ptr              arg(1) src_pixels_per_line
 ;   arg(2) output_ptr           arg(3) output_pitch
 ;   arg(4) output_height        arg(5) vp8_filter_index
 ; Each row is computed as two 8-pixel halves from 16-byte loads at src - 2
 ; and src + 6, then written with one aligned 16-byte store, so output_ptr
 ; and output_pitch must keep every row 16-byte aligned.
 ; The loop is bottom-tested, so output_height must be non-zero.
 ; The prolog's SAVE_XMM is now paired with RESTORE_XMM in every epilog;
 ; previously the saved registers were never restored.
 global sym(vp8_filter_block1d16_h6_ssse3)
 sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4      ;           16 bytes per table entry
    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx
    mov         rdi, arg(2)             ;output_ptr
    movdqa      xmm7, [rd GLOBAL]       ;rounding constant
 ;; The four-tap shortcut is disabled for this block size: the branch below
 ;; is commented out, so the six-tap loop is always used.
 ;;
 ;;    cmp         esi, DWORD PTR [rax]
 ;;    je          vp8_filter_block1d16_h4_ssse3
    mov         rsi, arg(0)             ;src_ptr
    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch
 filter_block1d16_h6_rowloop_ssse3:
    ; left 8 pixels accumulate in xmm0
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
    movdqa      xmm1, xmm0
    pshufb      xmm0, [shuf1b GLOBAL]
    movdqa      xmm2, xmm1
    pmaddubsw   xmm0, xmm4
    pshufb      xmm1, [shuf2b GLOBAL]
    pshufb      xmm2, [shuf3b GLOBAL]
    pmaddubsw   xmm1, xmm5
    ; right 8 pixels accumulate in xmm3
    movdqu      xmm3,   XMMWORD PTR [rsi + 6]
    pmaddubsw   xmm2, xmm6
    paddsw      xmm0, xmm1
    movdqa      xmm1, xmm3
    pshufb      xmm3, [shuf1b GLOBAL]
    paddsw      xmm0, xmm7
    pmaddubsw   xmm3, xmm4
    paddsw      xmm0, xmm2
    movdqa      xmm2, xmm1
    pshufb      xmm1, [shuf2b GLOBAL]
    pshufb      xmm2, [shuf3b GLOBAL]
    pmaddubsw   xmm1, xmm5
    pmaddubsw   xmm2, xmm6
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    lea         rsi,    [rsi + rax]
    paddsw      xmm3, xmm1
    paddsw      xmm3, xmm7
    paddsw      xmm3, xmm2
    psraw       xmm3, 7
    packuswb    xmm3, xmm3
    punpcklqdq  xmm0, xmm3              ;join the two 8-byte halves
    movdqa      XMMWORD Ptr [rdi], xmm0
    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d16_h6_rowloop_ssse3
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Four-tap variant. No jump to this label is active in this file (see the
 ; commented-out branch above); it expects rax -> k0_k5 entry, rdi =
 ; output_ptr and xmm7 = rounding constant, and shares the prolog's frame.
 vp8_filter_block1d16_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch
 filter_block1d16_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
    movdqa      xmm2, xmm1
    pshufb      xmm1, [shuf2b GLOBAL]
    pshufb      xmm2, [shuf3b GLOBAL]
    pmaddubsw   xmm1, xmm5
    movdqu      xmm3,   XMMWORD PTR [rsi + 6]
    pmaddubsw   xmm2, xmm6
    movdqa      xmm0, xmm3
    pshufb      xmm3, [shuf3b GLOBAL]
    pshufb      xmm0, [shuf2b GLOBAL]
    paddsw      xmm1, xmm7
    paddsw      xmm1, xmm2
    pmaddubsw   xmm0, xmm5
    pmaddubsw   xmm3, xmm6
    psraw       xmm1, 7
    packuswb    xmm1, xmm1
    lea         rsi,    [rsi + rax]
    paddsw      xmm3, xmm0
    paddsw      xmm3, xmm7
    psraw       xmm3, 7
    packuswb    xmm3, xmm3
    punpcklqdq  xmm1, xmm3              ;join the two 8-byte halves
    movdqa      XMMWORD Ptr [rdi], xmm1
    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d16_h4_rowloop_ssse3
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp8_filter_block1d4_h6_ssse3
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
 ;    unsigned char  *output_ptr,
 ;    unsigned int    output_pitch,
 ;    unsigned int    output_height,
 ;    unsigned int    vp8_filter_index
 ;)
 ; Horizontal six-tap filter producing 4 output pixels per row, one row per
 ; loop iteration.
 ;   arg(0) src_ptr              arg(1) src_pixels_per_line
 ;   arg(2) output_ptr           arg(3) output_pitch
 ;   arg(4) output_height        arg(5) vp8_filter_index
 ; Uses the same tables and shuffles as the 8-wide routine (16 bytes are
 ; loaded from src - 2) but only the low 4 result bytes are stored.
 ; If the k0/k5 entry for the filter index is zero the four-tap path at
 ; vp8_filter_block1d4_h4_ssse3 is used.
 ; The loops are bottom-tested, so output_height must be non-zero.
 ; xmm6/xmm7 are written here, so they are preserved with SAVE_XMM /
 ; RESTORE_XMM (they are callee-saved under the Win64 calling convention).
 global sym(vp8_filter_block1d4_h6_ssse3)
 sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;zero, compared against k0_k5 below
    shl         rdx, 4      ;           16 bytes per table entry
    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx
    movdqa      xmm7, [rd GLOBAL]       ;rounding constant
    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> four-tap path
    je          vp8_filter_block1d4_h4_ssse3
    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch
 ;xmm3 free
 filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
    movdqa      xmm1, xmm0
    pshufb      xmm0, [shuf1b GLOBAL]
    movdqa      xmm2, xmm1
    pshufb      xmm1, [shuf2b GLOBAL]
    pmaddubsw   xmm0, xmm4
    pshufb      xmm2, [shuf3b GLOBAL]
    pmaddubsw   xmm1, xmm5
 ;--
    pmaddubsw   xmm2, xmm6
    lea         rsi,    [rsi + rax]
 ;--
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7
    ; (a dead "pxor xmm1, xmm1" used to sit here; xmm1 is reloaded at the
    ;  top of the loop and never read in between)
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    movd        DWORD PTR [rdi], xmm0
    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d4_h6_rowloop_ssse3
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Four-tap variant: entered from above with rax -> k0_k5 entry and
 ; xmm7 = rounding constant already set up.
 vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm0, XMMWORD PTR [shuf2b GLOBAL]
    movdqa      xmm3, XMMWORD PTR [shuf3b GLOBAL]
    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch
 filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0 ;;[shuf2b GLOBAL]
    pshufb      xmm2, xmm3 ;;[shuf3b GLOBAL]
    pmaddubsw   xmm1, xmm5
 ;--
    pmaddubsw   xmm2, xmm6
    lea         rsi,    [rsi + rax]
 ;--
    paddsw      xmm1, xmm7
    paddsw      xmm1, xmm2
    psraw       xmm1, 7
    packuswb    xmm1, xmm1
    movd        DWORD PTR [rdi], xmm1
    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d4_h4_rowloop_ssse3
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp8_filter_block1d16_v6_ssse3
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
 ;    unsigned char *output_ptr,
 ;    unsigned int   out_pitch,
 ;    unsigned int   output_height,
 ;    unsigned int   vp8_filter_index
 ;)
 ; Vertical six-tap filter producing 16 output pixels per row, one row per
 ; loop iteration.
 ;   arg(0) src_ptr              arg(1) src_pitch
 ;   arg(2) output_ptr           arg(3) out_pitch
 ;   arg(4) output_height        arg(5) vp8_filter_index
 ; For each output row the six source rows A..F at src_ptr + {0..5} * pitch
 ; are read (rax = rsi + pitch is kept so that rows D and F can be addressed
 ; as rax + 2*pitch and rax + 4*pitch). Rows are interleaved in pairs
 ; (A,F) (B,D) (C,E) to match the k0_k5 / k1_k3 / k2_k4 coefficient pairs.
 ; If the k0/k5 entry for the filter index is zero the four-tap path at
 ; vp8_filter_block1d16_v4_ssse3 is used; that path writes each row with an
 ; aligned 16-byte store, whereas the six-tap path uses two 8-byte stores.
 ; The loops are bottom-tested, so output_height must be non-zero.
 ; xmm6/xmm7 are written here, so they are preserved with SAVE_XMM /
 ; RESTORE_XMM (they are callee-saved under the Win64 calling convention).
 global sym(vp8_filter_block1d16_v6_ssse3)
 sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;zero, compared against k0_k5 below
    shl         rdx, 4      ;           16 bytes per table entry
    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx
    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> four-tap path
    je          vp8_filter_block1d16_v4_ssse3
    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
 %if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
 %endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx                ;rax = src + pitch
 vp8_filter_block1d16_v6_ssse3_loop:
    ; left 8 columns
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E
    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [rd GLOBAL]
    psraw       xmm2, 7
    packuswb    xmm2, xmm2
    movq        MMWORD PTR [rdi], xmm2          ;store the results
    ; right 8 columns
    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E
    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5
    add         rsi,  rdx
    add         rax,  rdx
 ;--
 ;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [rd GLOBAL]
    psraw       xmm2, 7
    packuswb    xmm2, xmm2
    movq        MMWORD PTR [rdi+8], xmm2
 %if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
 %else
    add         rdi,        r8
 %endif
    dec         rcx
    jnz         vp8_filter_block1d16_v6_ssse3_loop
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Four-tap variant: entered from above with rax -> k0_k5 entry.
 ; Only rows B..E are read.
 vp8_filter_block1d16_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
 %if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
 %endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx                ;rax = src + pitch
 vp8_filter_block1d16_v4_ssse3_loop:
    ; left 8 columns accumulate in xmm2
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E
    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    ; right 8 columns accumulate in xmm5
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
    paddsw      xmm2, [rd GLOBAL]
    paddsw      xmm2, xmm3
    psraw       xmm2, 7
    packuswb    xmm2, xmm2
    punpcklbw   xmm5, xmm4                  ;B D
    punpcklbw   xmm1, xmm0                  ;C E
    pmaddubsw   xmm1, xmm6
    pmaddubsw   xmm5, xmm7
    movdqa      xmm4, [rd GLOBAL]
    add         rsi,  rdx
    add         rax,  rdx
 ;--
 ;--
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4
    psraw       xmm5, 7
    packuswb    xmm5, xmm5
    punpcklqdq  xmm2, xmm5              ;join the two 8-byte halves
    movdqa       XMMWORD PTR [rdi], xmm2
 %if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
 %else
    add         rdi,        r8
 %endif
    dec         rcx
    jnz         vp8_filter_block1d16_v4_ssse3_loop
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp8_filter_block1d8_v6_ssse3
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
 ;    unsigned char *output_ptr,
 ;    unsigned int   out_pitch,
 ;    unsigned int   output_height,
 ;    unsigned int   vp8_filter_index
 ;)
 ; Vertical six-tap filter producing 8 output pixels per row, one row per
 ; loop iteration.
 ;   arg(0) src_ptr              arg(1) src_pitch
 ;   arg(2) output_ptr           arg(3) out_pitch
 ;   arg(4) output_height        arg(5) vp8_filter_index
 ; For each output row the six source rows A..F at src_ptr + {0..5} * pitch
 ; are read (rax = rsi + pitch addresses rows D and F) and interleaved in
 ; pairs (A,F) (B,D) (C,E) to match the k0_k5 / k1_k3 / k2_k4 tables.
 ; If the k0/k5 entry for the filter index is zero the four-tap path at
 ; vp8_filter_block1d8_v4_ssse3 is used; rdx, rdi, rcx (and r8 on 64-bit)
 ; are loaded before the branch so both paths share them.
 ; The loops are bottom-tested, so output_height must be non-zero.
 ; xmm6/xmm7 are written here, so they are preserved with SAVE_XMM /
 ; RESTORE_XMM (they are callee-saved under the Win64 calling convention).
 global sym(vp8_filter_block1d8_v6_ssse3)
 sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;zero, compared against k0_k5 below
    shl         rdx, 4      ;           16 bytes per table entry
    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
 %if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
 %endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> four-tap path
    je          vp8_filter_block1d8_v4_ssse3
    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    mov         rsi, arg(0)             ;src_ptr
    mov         rax, rsi
    add         rax, rdx                ;rax = src + pitch
 vp8_filter_block1d8_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E
    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    movdqa      xmm4, [rd GLOBAL]           ;rounding constant
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5
    add         rsi,  rdx
    add         rax,  rdx
 ;--
 ;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    psraw       xmm2, 7
    packuswb    xmm2, xmm2
    movq        MMWORD PTR [rdi], xmm2
 %if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
 %else
    add         rdi,        r8
 %endif
    dec         rcx
    jnz         vp8_filter_block1d8_v6_ssse3_loop
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Four-tap variant: entered from above with rax -> k0_k5 entry and
 ; rdx/rdi/rcx (r8) already loaded. Only rows B..E are read.
 vp8_filter_block1d8_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm5, [rd GLOBAL]               ;rounding constant
    mov         rsi, arg(0)             ;src_ptr
    mov         rax, rsi
    add         rax, rdx                ;rax = src + pitch
 vp8_filter_block1d8_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E
    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    add         rsi,  rdx
    add         rax,  rdx
 ;--
 ;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5
    psraw       xmm2, 7
    packuswb    xmm2, xmm2
    movq        MMWORD PTR [rdi], xmm2
 %if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
 %else
    add         rdi,        r8
 %endif
    dec         rcx
    jnz         vp8_filter_block1d8_v4_ssse3_loop
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vp8_filter_block1d4_v6_ssse3
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
 ;    unsigned char *output_ptr,
 ;    unsigned int   out_pitch,
 ;    unsigned int   output_height,
 ;    unsigned int   vp8_filter_index
 ;)
 ; Vertical six-tap filter producing 4 output pixels per row, one row per
 ; loop iteration.
 ;   arg(0) src_ptr              arg(1) src_pitch
 ;   arg(2) output_ptr           arg(3) out_pitch
 ;   arg(4) output_height        arg(5) vp8_filter_index
 ; Unlike the wider routines this one works entirely in the 64-bit MMX
 ; registers (mm0-mm7), loading only the low 8 bytes of each table entry.
 ; No emms is executed before returning.
 ; For each output row the six source rows A..F at src_ptr + {0..5} * pitch
 ; are read (rax = rsi + pitch addresses rows D and F) and interleaved in
 ; pairs (A,F) (B,D) (C,E) to match the k0_k5 / k1_k3 / k2_k4 tables.
 ; If the k0/k5 entry for the filter index is zero the four-tap path at
 ; vp8_filter_block1d4_v4_ssse3 is used.
 ; The loops are bottom-tested, so output_height must be non-zero.
 global sym(vp8_filter_block1d4_v6_ssse3)
 sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    movsxd      rdx, DWORD PTR arg(5)   ;table index
    ; rsi = 0, compared against the k0_k5 entry below
    xor         rsi, rsi
    ; 16 bytes per table entry
    shl         rdx, 4      ;
    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
 %if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
 %endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
    ; outer taps zero -> four-tap path
    cmp         esi, DWORD PTR [rax]
    je          vp8_filter_block1d4_v4_ssse3
    movq        mm5, MMWORD PTR [rax]         ;k0_k5
    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
    mov         rsi, arg(0)             ;src_ptr
    ; rax = src + pitch
    mov         rax, rsi
    add         rax, rdx
 vp8_filter_block1d4_v6_ssse3_loop:
    movd        mm1, DWORD PTR [rsi]                  ;A
    movd        mm2, DWORD PTR [rsi + rdx]            ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
    punpcklbw   mm2, mm4                  ;B D
    punpcklbw   mm3, mm0                  ;C E
    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
    ; rounding constant (low 8 bytes of rd)
    movq        mm4, [rd GLOBAL]
    pmaddubsw   mm3, mm6
    punpcklbw   mm1, mm0                  ;A F
    pmaddubsw   mm2, mm7
    pmaddubsw   mm1, mm5
    add         rsi,  rdx
    add         rax,  rdx
 ;--
 ;--
    ; sum the three partial products, round, shift by 7 and saturate to bytes
    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4
    psraw       mm2, 7
    packuswb    mm2, mm2
    movd        DWORD PTR [rdi], mm2
 %if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
 %else
    add         rdi,        r8
 %endif
    dec         rcx
    jnz         vp8_filter_block1d4_v6_ssse3_loop
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Four-tap variant: entered from above with rax -> k0_k5 entry and
 ; rdx/rdi/rcx (r8) already loaded. Only rows B..E are read.
 vp8_filter_block1d4_v4_ssse3:
    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
    ; rounding constant, hoisted out of the loop on this path
    movq        mm5, MMWORD PTR [rd GLOBAL]
    mov         rsi, arg(0)             ;src_ptr
    ; rax = src + pitch
    mov         rax, rsi
    add         rax, rdx
 vp8_filter_block1d4_v4_ssse3_loop:
    movd        mm2, DWORD PTR [rsi + rdx]            ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
    punpcklbw   mm2, mm4                  ;B D
    punpcklbw   mm3, mm0                  ;C E
    pmaddubsw   mm3, mm6
    pmaddubsw   mm2, mm7
    add         rsi,  rdx
    add         rax,  rdx
 ;--
 ;--
    paddsw      mm2, mm3
    paddsw      mm2, mm5
    psraw       mm2, 7
    packuswb    mm2, mm2
    movd        DWORD PTR [rdi], mm2
 %if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
 %else
    add         rdi,        r8
 %endif
    dec         rcx
    jnz         vp8_filter_block1d4_v4_ssse3_loop
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ; Constant tables shared by the SSSE3 six-tap routines above.
 SECTION_RODATA
 align 16
 ; pshufb masks applied by the horizontal routines to the 16 source bytes
 ; loaded from src - 2. Each mask pairs, for every output pixel n, the two
 ; source bytes that share one coefficient pair:
 ;   shuf1b: bytes (n,   n+5)  -> multiplied by k0_k5
 ;   shuf2b: bytes (n+2, n+4)  -> multiplied by k2_k4
 ;   shuf3b: bytes (n+1, n+3)  -> multiplied by k1_k3
 ; Only shuf1b carries an explicit align; each mask is 16 bytes long, so the
 ; following two land on 16-byte boundaries as well.
 shuf1b:
    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
 shuf2b:
    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
 shuf3b:
    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
 align 16
 ; Rounding constant (0x40) added to every sum before the shift right by 7.
 rd:
    times 8 dw 0x40
 align 16
 ; Filter coefficient tables. Each table has 8 entries of 16 bytes (one
 ; coefficient byte pair repeated 8 times), i.e. 128 bytes per table. The
 ; routines compute k0_k5 + (filter_index << 4) and then read the matching
 ; k1_k3 and k2_k4 entries at +128 and +256, so the three tables must stay
 ; contiguous and in this order.
 ; An all-zero k0_k5 entry is what selects the four-tap code paths.
 k0_k5:
    times 8 db 0, 0             ;placeholder
    times 8 db 0, 0
    times 8 db 2, 1
    times 8 db 0, 0
    times 8 db 3, 3
    times 8 db 0, 0
    times 8 db 1, 2
    times 8 db 0, 0
 k1_k3:
    times 8 db  0,    0         ;placeholder
    times 8 db  -6,  12
    times 8 db -11,  36
    times 8 db  -9,  50
    times 8 db -16,  77
    times 8 db  -6,  93
    times 8 db  -8, 108
    times 8 db  -1, 123
 ; NOTE(review): pmaddubsw reads these coefficient bytes as signed, so the
 ; 128 in the placeholder entry would act as -128 if index 0 were ever
 ; used - confirm callers never pass filter index 0.
 k2_k4:
    times 8 db 128,    0        ;placeholder
    times 8 db 123,   -1
    times 8 db 108,   -8
    times 8 db  93,   -6
    times 8 db  77,  -16
    times 8 db  50,   -9
    times 8 db  36,  -11
    times 8 db  12,   -6
--- a/vp8/common/x86/subpixel_x86.h
+++ b/vp8/common/x86/subpixel_x86.h
@@ -86,4 +86,37 @@ extern prototype_subpixel_predict(vp8_bilinear_predict8x8_sse2);
 #endif
 #endif
 #if HAVE_SSSE3
 /* SSSE3 versions of the six-tap sub-pixel predictors, one per block size. */
 extern prototype_subpixel_predict(vp8_sixtap_predict16x16_ssse3);
 extern prototype_subpixel_predict(vp8_sixtap_predict8x8_ssse3);
 extern prototype_subpixel_predict(vp8_sixtap_predict8x4_ssse3);
 extern prototype_subpixel_predict(vp8_sixtap_predict4x4_ssse3);
 /* No bilinear predictors are declared in this section; the disabled lines
  * below still name the SSE2 routines. */
 //extern prototype_subpixel_predict(vp8_bilinear_predict16x16_sse2);
 //extern prototype_subpixel_predict(vp8_bilinear_predict8x8_sse2);
 #if !CONFIG_RUNTIME_CPU_DETECT
 /* Without runtime CPU detection, rebind the six-tap dispatch macros so they
  * name the SSSE3 routines directly (replacing any earlier definition). */
 #undef  vp8_subpix_sixtap16x16
 #define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_ssse3
 #undef  vp8_subpix_sixtap8x8
 #define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_ssse3
 #undef  vp8_subpix_sixtap8x4
 #define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_ssse3
 #undef  vp8_subpix_sixtap4x4
 #define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_ssse3
 /* The bilinear macros are left untouched here (rebinding is disabled). */
 //#undef  vp8_subpix_bilinear16x16
 //#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_sse2
 //#undef  vp8_subpix_bilinear8x8
 //#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_sse2
 #endif
 #endif
 #endif
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -359,3 +359,195 @@ void vp8_sixtap_predict8x4_sse2
 }
 #endif
 #if HAVE_SSSE3
 /*
  * Prototypes of the one-dimensional six-tap filter kernels used by the
  * predictors below (they are not defined in this file).  All six share one
  * shape: source pointer and source row step, destination pointer and
  * destination row step, number of output rows, and the filter index.
  *
  * The *_h6_* kernels are used for the horizontal pass and the *_v6_* kernels
  * for the vertical pass; the number in the name is the block width.
  */
 extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);

 extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
                                           unsigned int src_pixels_per_line,
                                           unsigned char *output_ptr,
                                           unsigned int output_pitch,
                                           unsigned int output_height,
                                           unsigned int vp8_filter_index);

 extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
                                           unsigned int src_pitch,
                                           unsigned char *output_ptr,
                                           unsigned int out_pitch,
                                           unsigned int output_height,
                                           unsigned int vp8_filter_index);

 extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);

 extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);

 extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);
 /*
  * 16x16 six-tap sub-pixel prediction built from the SSSE3 1-D kernels.
  *
  * xoffset and yoffset are passed on as the filter index of the horizontal
  * and the vertical pass respectively; a zero xoffset skips the horizontal
  * pass and a zero yoffset (with non-zero xoffset) skips the vertical pass.
  * The vertical pass always starts two rows above the first output row.  When
  * both passes run, the horizontal pass therefore produces 21 rows into a
  * scratch buffer with a 16-byte row step, which the vertical pass reduces to
  * the 16 output rows.
  */
 void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
                                    int src_pixels_per_line,
                                    int xoffset,
                                    int yoffset,
                                    unsigned char *dst_ptr,
                                    int dst_pitch)
 {
     DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);

     if (!xoffset)
     {
         /* Vertical pass only, straight from source to destination. */
         vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                       src_pixels_per_line,
                                       dst_ptr, dst_pitch, 16, yoffset);
         return;
     }

     if (!yoffset)
     {
         /* Horizontal pass only, straight from source to destination. */
         vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
                                       dst_ptr, dst_pitch, 16, xoffset);
         return;
     }

     /* Both passes: horizontal into the scratch buffer, then vertical. */
     vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                   src_pixels_per_line,
                                   FData2, 16, 21, xoffset);
     vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16, yoffset);
 }
 /*
  * 8x8 six-tap sub-pixel prediction built from the SSSE3 1-D kernels.
  *
  * xoffset / yoffset are passed on as the filter index of the horizontal /
  * vertical pass; a zero xoffset skips the horizontal pass and a zero yoffset
  * (with non-zero xoffset) skips the vertical pass.  With both passes active
  * the horizontal pass writes 13 rows, starting two rows above src_ptr, into a
  * scratch buffer with an 8-byte row step.
  */
 void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
                                  int src_pixels_per_line,
                                  int xoffset,
                                  int yoffset,
                                  unsigned char *dst_ptr,
                                  int dst_pitch)
 {
     DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);

     if (!xoffset)
     {
         /* Vertical pass only. */
         vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                      src_pixels_per_line,
                                      dst_ptr, dst_pitch, 8, yoffset);
         return;
     }

     if (!yoffset)
     {
         /* Horizontal pass only. */
         vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
                                      dst_ptr, dst_pitch, 8, xoffset);
         return;
     }

     /* Both passes: horizontal into the scratch buffer, then vertical. */
     vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                  src_pixels_per_line,
                                  FData2, 8, 13, xoffset);
     vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
 }
 /*
  * 8x4 (8 wide, 4 rows) six-tap sub-pixel prediction built from the SSSE3
  * 1-D kernels.
  *
  * xoffset / yoffset are passed on as the filter index of the horizontal /
  * vertical pass; a zero xoffset skips the horizontal pass and a zero yoffset
  * (with non-zero xoffset) skips the vertical pass.  With both passes active
  * the horizontal pass writes 9 rows, starting two rows above src_ptr, into a
  * scratch buffer with an 8-byte row step.
  */
 void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
                                  int src_pixels_per_line,
                                  int xoffset,
                                  int yoffset,
                                  unsigned char *dst_ptr,
                                  int dst_pitch)
 {
     DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);

     if (!xoffset)
     {
         /* Vertical pass only. */
         vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                      src_pixels_per_line,
                                      dst_ptr, dst_pitch, 4, yoffset);
         return;
     }

     if (!yoffset)
     {
         /* Horizontal pass only. */
         vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
                                      dst_ptr, dst_pitch, 4, xoffset);
         return;
     }

     /* Both passes: horizontal into the scratch buffer, then vertical. */
     vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                  src_pixels_per_line,
                                  FData2, 8, 9, xoffset);
     vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
 }
 /*
  * 4x4 six-tap sub-pixel prediction built from the SSSE3 1-D kernels.
  *
  * xoffset / yoffset are passed on as the filter index of the horizontal /
  * vertical pass; a zero xoffset skips the horizontal pass and a zero yoffset
  * (with non-zero xoffset) skips the vertical pass.  With both passes active
  * the horizontal pass writes 9 rows, starting two rows above src_ptr, into a
  * 4x9-byte scratch buffer.
  */
 void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
                                  int src_pixels_per_line,
                                  int xoffset,
                                  int yoffset,
                                  unsigned char *dst_ptr,
                                  int dst_pitch)
 {
     DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);

     if (!xoffset)
     {
         /* Vertical pass only. */
         vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                      src_pixels_per_line,
                                      dst_ptr, dst_pitch, 4, yoffset);
         return;
     }

     if (!yoffset)
     {
         /* Horizontal pass only. */
         vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
                                      dst_ptr, dst_pitch, 4, xoffset);
         return;
     }

     /* Both passes: horizontal into the scratch buffer, then vertical. */
     vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
                                  src_pixels_per_line,
                                  FData2, 4, 9, xoffset);
     vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
 }
 #endif
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -27,6 +27,7 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
    int mmx_enabled = flags & HAS_MMX;
    int xmm_enabled = flags & HAS_SSE;
    int wmt_enabled = flags & HAS_SSE2;
    int SSSE3Enabled = flags & HAS_SSSE3;
    /* Note:
     *
@@ -42,7 +43,7 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
    {
        rtcd->idct.idct1        = vp8_short_idct4x4llm_1_mmx;
        rtcd->idct.idct16       = vp8_short_idct4x4llm_mmx;
-        rtcd->idct.idct1_scalar = vp8_dc_only_idct_mmx;
+        rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_mmx;
        rtcd->idct.iwalsh1     = vp8_short_inv_walsh4x4_1_mmx;
@@ -114,5 +115,17 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
    }
 #endif
 #if HAVE_SSSE3
    if (SSSE3Enabled)
    {
        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_ssse3;
        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_ssse3;
        rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_ssse3;
        rtcd->subpix.sixtap4x4     = vp8_sixtap_predict4x4_ssse3;
    }
 #endif
 #endif
 }
--- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
+++ b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
@@ -0,0 +1,218 @@
 ;
 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license and patent
 ;  grant that can be found in the LICENSE file in the root of the source
 ;  tree. All contributing project authors may be found in the AUTHORS
 ;  file in the root of the source tree.
 ;
    EXPORT |vp8_dequant_dc_idct_add_v6|
    AREA |.text|, CODE, READONLY
 ;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred,
 ; unsigned char *dest, int pitch, int stride, int Dc)
 ;
 ; Dequantizes the 16 coefficients of a 4x4 block in place (each input value
 ; is multiplied by the matching dq value, except that input[0] is replaced
 ; by Dc), runs the two-pass 4x4 inverse transform on them, adds the rounded
 ; result to the predictor, saturates to 8 bits and writes four 4-byte rows
 ; to dest.  The 16 coefficients in input are zeroed before returning.
 ;
 ; r0 = input
 ; r1 = dq
 ; r2 = pred
 ; r3 = dest
 ; Stack arguments, as offsets from sp after the register push below; the
 ; second figure is the offset once the 4-byte dest slot has been reserved.
 ; sp + 36 = pitch  ; +4 = 40   (row step used for pred)
 ; sp + 40 = stride  ; +4 = 44  (row step used for dest)
 ; sp + 44 = Dc  ; +4 = 48
 |vp8_dequant_dc_idct_add_v6| PROC
    stmdb   sp!, {r4-r11, lr}       ; push 9 registers (36 bytes)
    ldr     r6, [sp, #44]           ; r6 = Dc
    ldr     r4, [r0]                ;input
    ldr     r5, [r1], #4            ;dq
    sub     sp, sp, #4              ; reserve a stack slot for dest
    str     r3, [sp]                ; [sp] = dest (r3 is reused below)
    smultt  r7, r4, r5              ; product of the top halfwords only;
                                    ; the bottom one is replaced by Dc
    ldr     r4, [r0, #4]            ;input
    ldr     r5, [r1], #4            ;dq
    strh    r6, [r0], #2            ; input[0] = Dc
    strh    r7, [r0], #2
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    ldr     r4, [r0, #4]            ;input
    ldr     r5, [r1], #4            ;dq
    strh    r6, [r0], #2
    strh    r7, [r0], #2
 ; Four coefficients are done; each of the 3 loop passes handles four more.
    mov     r12, #3
 vp8_dequant_dc_add_loop
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    ldr     r4, [r0, #4]            ;input
    ldr     r5, [r1], #4            ;dq
    strh    r6, [r0], #2
    strh    r7, [r0], #2
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    subs    r12, r12, #1
    ldrne   r4, [r0, #4]            ; skip the next loads on the last pass
    ldrne   r5, [r1], #4
    strh    r6, [r0], #2
    strh    r7, [r0], #2
    bne     vp8_dequant_dc_add_loop
    sub     r0, r0, #32             ; rewind r0 to the start of input
    mov     r1, r0                  ; first pass writes back over input
 ; short_idct4x4llm_v6_dual
 ; First pass: two iterations.  Every word holds two adjacent 16-bit values
 ; of one 8-byte row, so one iteration processes two columns of all four rows
 ; (byte offsets 0, 8, 16 and 24) with the packed 16-bit add/sub
 ; instructions.
    ldr     r3, cospi8sqrt2minus1
    ldr     r4, sinpi8sqrt2
    ldr     r6, [r0, #8]
    mov     r5, #2
 vp8_dequant_dc_idct_loop1_v6
    ldr     r12, [r0, #24]
    ldr     r14, [r0, #16]
    smulwt  r9, r3, r6
    smulwb  r7, r3, r6
    smulwt  r10, r4, r6
    smulwb  r8, r4, r6
    pkhbt   r7, r7, r9, lsl #16
    smulwt  r11, r3, r12
    pkhbt   r8, r8, r10, lsl #16
    uadd16  r6, r6, r7
    smulwt  r7, r4, r12
    smulwb  r9, r3, r12
    smulwb  r10, r4, r12
    subs    r5, r5, #1              ; flags are consumed by ldrne / bne below
    pkhbt   r9, r9, r11, lsl #16
    ldr     r11, [r0], #4
    pkhbt   r10, r10, r7, lsl #16
    uadd16  r7, r12, r9
    usub16  r7, r8, r7
    uadd16  r6, r6, r10
    uadd16  r10, r11, r14
    usub16  r8, r11, r14
    uadd16  r9, r10, r6
    usub16  r10, r10, r6
    uadd16  r6, r8, r7
    usub16  r7, r8, r7
    str     r6, [r1, #8]
    ldrne   r6, [r0, #8]            ; preload for the next iteration
    str     r7, [r1, #16]
    str     r10, [r1, #24]
    str     r9, [r1], #4
    bne     vp8_dequant_dc_idct_loop1_v6
 ; Second pass plus reconstruction: two iterations, each reading 16 bytes of
 ; first-pass output and writing two 4-byte rows to dest.
    mov     r5, #2
    sub     r0, r1, #8              ; r0 = start of the first-pass output
 vp8_dequant_dc_idct_loop2_v6
    ldr     r6, [r0], #4
    ldr     r7, [r0], #4
    ldr     r8, [r0], #4
    ldr     r9, [r0], #4
    smulwt  r1, r3, r6
    smulwt  r12, r4, r6
    smulwt  lr, r3, r8
    smulwt  r10, r4, r8
    pkhbt   r11, r8, r6, lsl #16
    pkhbt   r1, lr, r1, lsl #16
    pkhbt   r12, r10, r12, lsl #16
    pkhtb   r6, r6, r8, asr #16
    uadd16  r6, r1, r6
    pkhbt   lr, r9, r7, lsl #16
    uadd16  r10, r11, lr
    usub16  lr, r11, lr
    pkhtb   r8, r7, r9, asr #16
    subs    r5, r5, #1              ; flags are consumed by the final bne
    smulwt  r1, r3, r8
    smulwb  r7, r3, r8
    smulwt  r11, r4, r8
    smulwb  r9, r4, r8
    pkhbt   r1, r7, r1, lsl #16
    uadd16  r8, r1, r8
    pkhbt   r11, r9, r11, lsl #16
    usub16  r1, r12, r8
    uadd16  r8, r11, r6
    ldr     r9, c0x00040004         ; 4 in each halfword: rounding for >> 3
    ldr     r12, [sp, #40]          ; r12 = pitch
    uadd16  r6, r10, r8
    usub16  r7, r10, r8
    uadd16  r7, r7, r9
    uadd16  r6, r6, r9
    uadd16  r10, r14, r1
    usub16  r1, r14, r1
    uadd16  r10, r10, r9
    uadd16  r1, r1, r9
 ; First row of the pair: take the top halfwords, shift right by 3, add the
 ; four predictor bytes, saturate to 8 bits and pack into one word (r9).
    ldr     r11, [r2], r12          ; 4 predictor bytes; pred += pitch
    mov     r8, r7, asr #3
    pkhtb   r9, r8, r10, asr #19
    mov     r8, r1, asr #3
    pkhtb   r8, r8, r6, asr #19
    uxtb16  lr, r11, ror #8
    qadd16  r9, r9, lr
    uxtb16  lr, r11
    qadd16  r8, r8, lr
    usat16  r9, #8, r9
    usat16  r8, #8, r8
    orr     r9, r8, r9, lsl #8
 ; Second row of the pair: same steps on the bottom halfwords, which are
 ; first moved up by 16 bits; the packed result ends up in r1.
    ldr     r11, [r2], r12          ; next 4 predictor bytes; pred += pitch
    ldr     lr, [sp]                ; lr = current dest row
    ldr     r12, [sp, #44]          ; r12 = stride
    mov     r7, r7, lsl #16
    mov     r1, r1, lsl #16
    mov     r10, r10, lsl #16
    mov     r6, r6, lsl #16
    mov     r7, r7, asr #3
    pkhtb   r7, r7, r10, asr #19
    mov     r1, r1, asr #3
    pkhtb   r1, r1, r6, asr #19
    uxtb16  r8, r11, ror #8
    qadd16  r7, r7, r8
    uxtb16  r8, r11
    qadd16  r1, r1, r8
    usat16  r7, #8, r7
    usat16  r1, #8, r1
    orr     r1, r1, r7, lsl #8
    str     r9, [lr], r12           ; store both rows; dest += stride each
    str     r1, [lr], r12
    str     lr, [sp]                ; keep the advanced dest for the next pass
    bne     vp8_dequant_dc_idct_loop2_v6
 ; vpx_memset
 ; Zero the 32 bytes of coefficients so the buffer is left cleared.
    sub     r0, r0, #32             ; rewind r0 to the start of input
    add     sp, sp, #4              ; release the dest slot
    mov     r12, #0
    str     r12, [r0]
    str     r12, [r0, #4]
    str     r12, [r0, #8]
    str     r12, [r0, #12]
    str     r12, [r0, #16]
    str     r12, [r0, #20]
    str     r12, [r0, #24]
    str     r12, [r0, #28]
    ldmia   sp!, {r4 - r11, pc}     ; restore registers and return
    ENDP    ; |vp8_dequant_dc_idct_add_v6|
 ; Constant Pool
 ; Transform multipliers 20091 and 35468, used as the 32-bit operand of the
 ; smulw* instructions above, and the packed rounding constant (4 | 4).
 cospi8sqrt2minus1 DCD 0x00004E7B
 sinpi8sqrt2       DCD 0x00008A8C
 c0x00040004       DCD 0x00040004
    END
--- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm
+++ b/vp8/decoder/arm/armv6/dequant_idct_v6.asm
@@ -0,0 +1,196 @@
 ;
 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license and patent
 ;  grant that can be found in the LICENSE file in the root of the source
 ;  tree. All contributing project authors may be found in the AUTHORS
 ;  file in the root of the source tree.
 ;
    EXPORT |vp8_dequant_idct_add_v6|
    AREA |.text|, CODE, READONLY
 ;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
 ; unsigned char *dest, int pitch, int stride)
 ;
 ; Dequantizes the 16 coefficients of a 4x4 block in place (each input value
 ; is multiplied by the matching dq value), runs the two-pass 4x4 inverse
 ; transform on them, adds the rounded result to the predictor, saturates to
 ; 8 bits and writes four 4-byte rows to dest.  The 16 coefficients in input
 ; are zeroed before returning.
 ;
 ; r0 = input
 ; r1 = dq
 ; r2 = pred
 ; r3 = dest
 ; Stack arguments, as offsets from sp after the register push below; the
 ; second figure is the offset once the 4-byte dest slot has been reserved.
 ; sp + 36 = pitch  ; +4 = 40   (row step used for pred)
 ; sp + 40 = stride  ; +4 = 44  (row step used for dest)
 |vp8_dequant_idct_add_v6| PROC
    stmdb   sp!, {r4-r11, lr}       ; push 9 registers (36 bytes)
    ldr     r4, [r0]                ;input
    ldr     r5, [r1], #4            ;dq
    sub     sp, sp, #4              ; reserve a stack slot for dest
    str     r3, [sp]                ; [sp] = dest (r3 is reused below)
 ; Each of the 4 loop passes dequantizes four coefficients in place.
    mov     r12, #4
 vp8_dequant_add_loop
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    ldr     r4, [r0, #4]            ;input
    ldr     r5, [r1], #4            ;dq
    strh    r6, [r0], #2
    strh    r7, [r0], #2
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    subs    r12, r12, #1
    ldrne   r4, [r0, #4]            ; skip the next loads on the last pass
    ldrne   r5, [r1], #4
    strh    r6, [r0], #2
    strh    r7, [r0], #2
    bne     vp8_dequant_add_loop
    sub     r0, r0, #32             ; rewind r0 to the start of input
    mov     r1, r0                  ; first pass writes back over input
 ; short_idct4x4llm_v6_dual
 ; First pass: two iterations.  Every word holds two adjacent 16-bit values
 ; of one 8-byte row, so one iteration processes two columns of all four rows
 ; (byte offsets 0, 8, 16 and 24) with the packed 16-bit add/sub
 ; instructions.
    ldr     r3, cospi8sqrt2minus1
    ldr     r4, sinpi8sqrt2
    ldr     r6, [r0, #8]
    mov     r5, #2
 vp8_dequant_idct_loop1_v6
    ldr     r12, [r0, #24]
    ldr     r14, [r0, #16]
    smulwt  r9, r3, r6
    smulwb  r7, r3, r6
    smulwt  r10, r4, r6
    smulwb  r8, r4, r6
    pkhbt   r7, r7, r9, lsl #16
    smulwt  r11, r3, r12
    pkhbt   r8, r8, r10, lsl #16
    uadd16  r6, r6, r7
    smulwt  r7, r4, r12
    smulwb  r9, r3, r12
    smulwb  r10, r4, r12
    subs    r5, r5, #1              ; flags are consumed by ldrne / bne below
    pkhbt   r9, r9, r11, lsl #16
    ldr     r11, [r0], #4
    pkhbt   r10, r10, r7, lsl #16
    uadd16  r7, r12, r9
    usub16  r7, r8, r7
    uadd16  r6, r6, r10
    uadd16  r10, r11, r14
    usub16  r8, r11, r14
    uadd16  r9, r10, r6
    usub16  r10, r10, r6
    uadd16  r6, r8, r7
    usub16  r7, r8, r7
    str     r6, [r1, #8]
    ldrne   r6, [r0, #8]            ; preload for the next iteration
    str     r7, [r1, #16]
    str     r10, [r1, #24]
    str     r9, [r1], #4
    bne     vp8_dequant_idct_loop1_v6
 ; Second pass plus reconstruction: two iterations, each reading 16 bytes of
 ; first-pass output and writing two 4-byte rows to dest.
    mov     r5, #2
    sub     r0, r1, #8              ; r0 = start of the first-pass output
 vp8_dequant_idct_loop2_v6
    ldr     r6, [r0], #4
    ldr     r7, [r0], #4
    ldr     r8, [r0], #4
    ldr     r9, [r0], #4
    smulwt  r1, r3, r6
    smulwt  r12, r4, r6
    smulwt  lr, r3, r8
    smulwt  r10, r4, r8
    pkhbt   r11, r8, r6, lsl #16
    pkhbt   r1, lr, r1, lsl #16
    pkhbt   r12, r10, r12, lsl #16
    pkhtb   r6, r6, r8, asr #16
    uadd16  r6, r1, r6
    pkhbt   lr, r9, r7, lsl #16
    uadd16  r10, r11, lr
    usub16  lr, r11, lr
    pkhtb   r8, r7, r9, asr #16
    subs    r5, r5, #1              ; flags are consumed by the final bne
    smulwt  r1, r3, r8
    smulwb  r7, r3, r8
    smulwt  r11, r4, r8
    smulwb  r9, r4, r8
    pkhbt   r1, r7, r1, lsl #16
    uadd16  r8, r1, r8
    pkhbt   r11, r9, r11, lsl #16
    usub16  r1, r12, r8
    uadd16  r8, r11, r6
    ldr     r9, c0x00040004         ; 4 in each halfword: rounding for >> 3
    ldr     r12, [sp, #40]          ; r12 = pitch
    uadd16  r6, r10, r8
    usub16  r7, r10, r8
    uadd16  r7, r7, r9
    uadd16  r6, r6, r9
    uadd16  r10, r14, r1
    usub16  r1, r14, r1
    uadd16  r10, r10, r9
    uadd16  r1, r1, r9
 ; First row of the pair: take the top halfwords, shift right by 3, add the
 ; four predictor bytes, saturate to 8 bits and pack into one word (r9).
    ldr     r11, [r2], r12          ; 4 predictor bytes; pred += pitch
    mov     r8, r7, asr #3
    pkhtb   r9, r8, r10, asr #19
    mov     r8, r1, asr #3
    pkhtb   r8, r8, r6, asr #19
    uxtb16  lr, r11, ror #8
    qadd16  r9, r9, lr
    uxtb16  lr, r11
    qadd16  r8, r8, lr
    usat16  r9, #8, r9
    usat16  r8, #8, r8
    orr     r9, r8, r9, lsl #8
 ; Second row of the pair: same steps on the bottom halfwords, which are
 ; first moved up by 16 bits; the packed result ends up in r1.
    ldr     r11, [r2], r12          ; next 4 predictor bytes; pred += pitch
    ldr     lr, [sp]                ; lr = current dest row
    ldr     r12, [sp, #44]          ; r12 = stride
    mov     r7, r7, lsl #16
    mov     r1, r1, lsl #16
    mov     r10, r10, lsl #16
    mov     r6, r6, lsl #16
    mov     r7, r7, asr #3
    pkhtb   r7, r7, r10, asr #19
    mov     r1, r1, asr #3
    pkhtb   r1, r1, r6, asr #19
    uxtb16  r8, r11, ror #8
    qadd16  r7, r7, r8
    uxtb16  r8, r11
    qadd16  r1, r1, r8
    usat16  r7, #8, r7
    usat16  r1, #8, r1
    orr     r1, r1, r7, lsl #8
    str     r9, [lr], r12           ; store both rows; dest += stride each
    str     r1, [lr], r12
    str     lr, [sp]                ; keep the advanced dest for the next pass
    bne     vp8_dequant_idct_loop2_v6
 ; vpx_memset
 ; Zero the 32 bytes of coefficients so the buffer is left cleared.
    sub     r0, r0, #32             ; rewind r0 to the start of input
    add     sp, sp, #4              ; release the dest slot
    mov     r12, #0
    str     r12, [r0]
    str     r12, [r0, #4]
    str     r12, [r0, #8]
    str     r12, [r0, #12]
    str     r12, [r0, #16]
    str     r12, [r0, #20]
    str     r12, [r0, #24]
    str     r12, [r0, #28]
    ldmia   sp!, {r4 - r11, pc}     ; restore registers and return
    ENDP    ; |vp8_dequant_idct_add_v6|
 ; Constant Pool
 ; Transform multipliers 20091 and 35468, used as the 32-bit operand of the
 ; smulw* instructions above, and the packed rounding constant (4 | 4).
 cospi8sqrt2minus1 DCD 0x00004E7B
 sinpi8sqrt2       DCD 0x00008A8C
 c0x00040004       DCD 0x00040004
    END
--- a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm
+++ b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm
@@ -1,203 +0,0 @@
 ;
 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_dequant_dc_idct_v6|
    ; ARM
    ; REQUIRE8
    ; PRESERVE8
    AREA    |.text|, CODE, READONLY  ; name this block of code
 ;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc)
 ;
 ; Dequantizes the 16 coefficients of a 4x4 block in place (input[0] is
 ; replaced by Dc), runs the first transform pass from input into output
 ; (rows are "pitch" bytes apart), runs the second pass in place on output
 ; with (x + 4) >> 3 rounding, and finally zeroes the 16 input coefficients.
 ;
 ; r0 = input, r1 = dq, r2 = output, r3 = pitch, [sp, #36] after the push = Dc
 |vp8_dequant_dc_idct_v6| PROC
    stmdb   sp!, {r4-r11, lr}
    ldr     r6, [sp, #36]           ;load Dc
    ldr     r4, [r0]                ;input
    ldr     r5, [r1], #4            ;dq
    sub     sp, sp, #4
    str     r0, [sp]                ; keep the input pointer for the final clear
    smultt  r7, r4, r5              ; top halfwords only; bottom one becomes Dc
    ldr     r4, [r0, #4]            ;input
    ldr     r5, [r1], #4            ;dq
    strh    r6, [r0], #2            ; input[0] = Dc
    strh    r7, [r0], #2
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    ldr     r4, [r0, #4]            ;input
    ldr     r5, [r1], #4            ;dq
    strh    r6, [r0], #2
    strh    r7, [r0], #2
 ; Four coefficients are done; each of the 3 loop passes handles four more.
    mov     r12, #3
 dequant_dc_idct_loop
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    ldr     r4, [r0, #4]            ;input
    ldr     r5, [r1], #4            ;dq
    strh    r6, [r0], #2
    strh    r7, [r0], #2
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    subs    r12, r12, #1
    ldrne   r4, [r0, #4]
    ldrne   r5, [r1], #4
    strh    r6, [r0], #2
    strh    r7, [r0], #2
    bne     dequant_dc_idct_loop
    sub     r0, r0, #32             ; rewind r0 to the start of input
    mov     r1, r2                  ; r1 = output
    mov     r2, r3                  ; r2 = pitch
 ; short_idct4x4llm_v6_dual
    mov r3, #0x00004E00 ; build cospi8sqrt2minus1 (0x4E7B) in two steps
    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
    mov r4, #0x00008A00 ; build sinpi8sqrt2 (0x8A8C) in two steps
    orr r4, r4, #0x0000008C ; sinpi8sqrt2
    mov r5, #0x2    ; i=2
 ; First pass: two columns per iteration, read from input, written to output.
 loop1_dual_11
    ldr r6, [r0, #(4*2)]    ; i5 | i4
    ldr r12, [r0, #(12*2)]  ; i13 | i12
    ldr r14, [r0, #(8*2)]   ; i9 | i8
    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16
    smulwb  r7, r3, r6  ; (ip[4] * cospi8sqrt2minus1) >> 16
    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
    smulwb  r8, r4, r6  ; (ip[4] * sinpi8sqrt2) >> 16
    pkhbt   r7, r7, r9, lsl #16 ; 5c | 4c
    smulwt  r11, r3, r12    ; (ip[13] * cospi8sqrt2minus1) >> 16
    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
    uadd16  r6, r6, r7  ; 5c+5 | 4c+4
    smulwt  r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
    smulwb  r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
    smulwb  r10, r4, r12    ; (ip[12] * sinpi8sqrt2) >> 16
    subs    r5, r5, #0x1    ; i--
    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
    ldr r11, [r0], #0x4 ; i1 | i0, then advance input by two shorts
    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
    uadd16  r7, r12, r9 ; 13c+13 | 12c+12
    usub16  r7, r8, r7  ; c
    uadd16  r6, r6, r10 ; d
    uadd16  r10, r11, r14   ; a
    usub16  r8, r11, r14    ; b
    uadd16  r9, r10, r6 ; a+d
    usub16  r10, r10, r6    ; a-d
    uadd16  r6, r8, r7  ; b+c
    usub16  r7, r8, r7  ; b-c
    str r6, [r1, r2]    ; o5 | o4
    add r6, r2, r2  ; pitch * 2
    str r7, [r1, r6]    ; o9 | o8
    add r6,  r6, r2 ; pitch * 3
    str r10, [r1, r6]   ; o13 | o12
    str r9, [r1], #0x4  ; o1 | o0, then advance output by two shorts
    bne loop1_dual_11   ;
    mov r5, #0x2    ; i=2
    sub r0, r1, #8  ; reset input/output: r0 = start of output
 ; Second pass: two rows per iteration, in place on output.
 loop2_dual_22
    ldr r6, [r0, r2]    ; i5 | i4
    ldr r1, [r0]    ; i1 | i0
    ldr r12, [r0, #0x4] ; i3 | i2
    add r14, r2, #0x4   ; pitch + 4 bytes (two shorts)
    ldr r14, [r0, r14]  ; i7 | i6
    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16
    smulwt  r7, r3, r1  ; (ip[1] * cospi8sqrt2minus1) >> 16
    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
    smulwt  r8, r4, r1  ; (ip[1] * sinpi8sqrt2) >> 16
    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4
    pkhbt   r7, r9, r7, lsl #16 ; 1c | 5c
    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1 (c)
    pkhtb   r1, r1, r6, asr #16 ; i1 | i5
    uadd16  r1, r7, r1  ; 1c+1 | 5c+5 = temp2 (d)
    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6
    uadd16  r10, r11, r9    ; a
    usub16  r9, r11, r9 ; b
    pkhtb   r6, r12, r14, asr #16   ; i3 | i7
    subs    r5, r5, #0x1    ; i-- (flags are consumed by the final bne)
    smulwt  r7, r3, r6  ; (ip[3] * cospi8sqrt2minus1) >> 16
    smulwt  r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
    smulwb  r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16
    smulwb  r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)
    uadd16  r6, r7, r6  ; 3c+3 | 7c+7 = temp2  (c)
    usub16  r12, r8, r6 ; c (o1 | o5)
    uadd16  r6, r11, r1 ; d (o3 | o7)
    uadd16  r7, r10, r6 ; a+d
    mov r8, #0x4    ; set up 4's
    orr r8, r8, #0x40000    ; 4 | 4
    usub16  r6, r10, r6 ; a-d
    uadd16  r6, r6, r8  ; a-d+4 (o3 | o7)
    uadd16  r7, r7, r8  ; a+d+4 (o0 | o4)
    uadd16  r10, r9, r12    ; b+c
    usub16  r1, r9, r12 ; b-c
    uadd16  r10, r10, r8    ; b+c+4 (o1 | o5)
    uadd16  r1, r1, r8  ; b-c+4 (o2 | o6)
    mov r8, r10, asr #19    ; o1 >> 3
    strh    r8, [r0, #2]    ; o1
    mov r8, r1, asr #19 ; o2 >> 3
    strh    r8, [r0, #4]    ; o2
    mov r8, r6, asr #19 ; o3 >> 3
    strh    r8, [r0, #6]    ; o3
    mov r8, r7, asr #19 ; o0 >> 3
    strh    r8, [r0], r2    ; o0, then advance by pitch
    sxth    r10, r10    ; sign-extend the bottom halfword
    mov r8, r10, asr #3 ; o5 >> 3
    strh    r8, [r0, #2]    ; o5
    sxth    r1, r1  ; sign-extend the bottom halfword
    mov r8, r1, asr #3  ; o6 >> 3
    strh    r8, [r0, #4]    ; o6
    sxth    r6, r6  ; sign-extend the bottom halfword
    mov r8, r6, asr #3  ; o7 >> 3
    strh    r8, [r0, #6]    ; o7
    sxth    r7, r7  ; sign-extend the bottom halfword
    mov r8, r7, asr #3  ; o4 >> 3
    strh    r8, [r0], r2    ; o4, then advance by pitch
 ; The loop counter was already decremented by the subs earlier in the loop.
    bne loop2_dual_22   ;
 ;vpx_memset
 ; Zero the 32 bytes of input coefficients.
    ldr     r0, [sp]                ; r0 = input pointer saved at entry
    add     sp, sp, #4
    mov     r12, #0
    str     r12, [r0]
    str     r12, [r0, #4]
    str     r12, [r0, #8]
    str     r12, [r0, #12]
    str     r12, [r0, #16]
    str     r12, [r0, #20]
    str     r12, [r0, #24]
    str     r12, [r0, #28]
    ldmia   sp!, {r4 - r11, pc} ; restore registers and return
    ENDP    ;|vp8_dequant_dc_idct_v6|
    END
--- a/vp8/decoder/arm/armv6/dequantidct_v6.asm
+++ b/vp8/decoder/arm/armv6/dequantidct_v6.asm
@@ -1,184 +0,0 @@
 ;
 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_dequant_idct_v6|
    ; ARM
    ; REQUIRE8
    ; PRESERVE8
    AREA    |.text|, CODE, READONLY  ; name this block of code
 ;void vp8_dequant_idct_v6(short *input, short *dq, short *output, int pitch)
 ;
 ; Dequantizes the 16 coefficients of a 4x4 block in place, runs the first
 ; transform pass from input into output (rows are "pitch" bytes apart), runs
 ; the second pass in place on output with (x + 4) >> 3 rounding, and finally
 ; zeroes the 16 input coefficients.
 ;
 ; r0 = input, r1 = dq, r2 = output, r3 = pitch
 |vp8_dequant_idct_v6| PROC
    stmdb   sp!, {r4-r11, lr}
    ldr     r4, [r0]            ;input
    ldr     r5, [r1], #4            ;dq
    sub     sp, sp, #4
    str     r0, [sp]                ; keep the input pointer for the final clear
 ; Each of the 4 loop passes dequantizes four coefficients in place.
    mov     r12, #4
 dequant_idct_loop
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    ldr     r4, [r0, #4]            ;input
    ldr     r5, [r1], #4        ;dq
    strh    r6, [r0], #2
    strh    r7, [r0], #2
    smulbb  r6, r4, r5
    smultt  r7, r4, r5
    subs    r12, r12, #1
    ldrne   r4, [r0, #4]
    ldrne   r5, [r1], #4
    strh    r6, [r0], #2
    strh    r7, [r0], #2
    bne     dequant_idct_loop
    sub     r0, r0, #32             ; rewind r0 to the start of input
    mov     r1, r2                  ; r1 = output
    mov     r2, r3                  ; r2 = pitch
 ; short_idct4x4llm_v6_dual
    mov r3, #0x00004E00 ; build cospi8sqrt2minus1 (0x4E7B) in two steps
    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
    mov r4, #0x00008A00 ; build sinpi8sqrt2 (0x8A8C) in two steps
    orr r4, r4, #0x0000008C ; sinpi8sqrt2
    mov r5, #0x2    ; i=2
 ; First pass: two columns per iteration, read from input, written to output.
 loop1_dual_1
    ldr r6, [r0, #(4*2)]    ; i5 | i4
    ldr r12, [r0, #(12*2)]  ; i13 | i12
    ldr r14, [r0, #(8*2)]   ; i9 | i8
    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16
    smulwb  r7, r3, r6  ; (ip[4] * cospi8sqrt2minus1) >> 16
    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
    smulwb  r8, r4, r6  ; (ip[4] * sinpi8sqrt2) >> 16
    pkhbt   r7, r7, r9, lsl #16 ; 5c | 4c
    smulwt  r11, r3, r12    ; (ip[13] * cospi8sqrt2minus1) >> 16
    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
    uadd16  r6, r6, r7  ; 5c+5 | 4c+4
    smulwt  r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
    smulwb  r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
    smulwb  r10, r4, r12    ; (ip[12] * sinpi8sqrt2) >> 16
    subs    r5, r5, #0x1    ; i--
    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
    ldr r11, [r0], #0x4 ; i1 | i0, then advance input by two shorts
    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
    uadd16  r7, r12, r9 ; 13c+13 | 12c+12
    usub16  r7, r8, r7  ; c
    uadd16  r6, r6, r10 ; d
    uadd16  r10, r11, r14   ; a
    usub16  r8, r11, r14    ; b
    uadd16  r9, r10, r6 ; a+d
    usub16  r10, r10, r6    ; a-d
    uadd16  r6, r8, r7  ; b+c
    usub16  r7, r8, r7  ; b-c
    str r6, [r1, r2]    ; o5 | o4
    add r6, r2, r2  ; pitch * 2
    str r7, [r1, r6]    ; o9 | o8
    add r6,  r6, r2 ; pitch * 3
    str r10, [r1, r6]   ; o13 | o12
    str r9, [r1], #0x4  ; o1 | o0, then advance output by two shorts
    bne loop1_dual_1    ;
    mov r5, #0x2    ; i=2
    sub r0, r1, #8  ; reset input/output: r0 = start of output
 ; Second pass: two rows per iteration, in place on output.
 loop2_dual_2
    ldr r6, [r0, r2]    ; i5 | i4
    ldr r1, [r0]    ; i1 | i0
    ldr r12, [r0, #0x4] ; i3 | i2
    add r14, r2, #0x4   ; pitch + 4 bytes (two shorts)
    ldr r14, [r0, r14]  ; i7 | i6
    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16
    smulwt  r7, r3, r1  ; (ip[1] * cospi8sqrt2minus1) >> 16
    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
    smulwt  r8, r4, r1  ; (ip[1] * sinpi8sqrt2) >> 16
    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4
    pkhbt   r7, r9, r7, lsl #16 ; 1c | 5c
    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1 (c)
    pkhtb   r1, r1, r6, asr #16 ; i1 | i5
    uadd16  r1, r7, r1  ; 1c+1 | 5c+5 = temp2 (d)
    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6
    uadd16  r10, r11, r9    ; a
    usub16  r9, r11, r9 ; b
    pkhtb   r6, r12, r14, asr #16   ; i3 | i7
    subs    r5, r5, #0x1    ; i-- (flags are consumed by the final bne)
    smulwt  r7, r3, r6  ; (ip[3] * cospi8sqrt2minus1) >> 16
    smulwt  r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
    smulwb  r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16
    smulwb  r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)
    uadd16  r6, r7, r6  ; 3c+3 | 7c+7 = temp2  (c)
    usub16  r12, r8, r6 ; c (o1 | o5)
    uadd16  r6, r11, r1 ; d (o3 | o7)
    uadd16  r7, r10, r6 ; a+d
    mov r8, #0x4    ; set up 4's
    orr r8, r8, #0x40000    ; 4 | 4
    usub16  r6, r10, r6 ; a-d
    uadd16  r6, r6, r8  ; a-d+4 (o3 | o7)
    uadd16  r7, r7, r8  ; a+d+4 (o0 | o4)
    uadd16  r10, r9, r12    ; b+c
    usub16  r1, r9, r12 ; b-c
    uadd16  r10, r10, r8    ; b+c+4 (o1 | o5)
    uadd16  r1, r1, r8  ; b-c+4 (o2 | o6)
    mov r8, r10, asr #19    ; o1 >> 3
    strh    r8, [r0, #2]    ; o1
    mov r8, r1, asr #19 ; o2 >> 3
    strh    r8, [r0, #4]    ; o2
    mov r8, r6, asr #19 ; o3 >> 3
    strh    r8, [r0, #6]    ; o3
    mov r8, r7, asr #19 ; o0 >> 3
    strh    r8, [r0], r2    ; o0, then advance by pitch
    sxth    r10, r10    ; sign-extend the bottom halfword
    mov r8, r10, asr #3 ; o5 >> 3
    strh    r8, [r0, #2]    ; o5
    sxth    r1, r1  ; sign-extend the bottom halfword
    mov r8, r1, asr #3  ; o6 >> 3
    strh    r8, [r0, #4]    ; o6
    sxth    r6, r6  ; sign-extend the bottom halfword
    mov r8, r6, asr #3  ; o7 >> 3
    strh    r8, [r0, #6]    ; o7
    sxth    r7, r7  ; sign-extend the bottom halfword
    mov r8, r7, asr #3  ; o4 >> 3
    strh    r8, [r0], r2    ; o4, then advance by pitch
 ; The loop counter was already decremented by the subs earlier in the loop.
    bne loop2_dual_2    ;
 ;vpx_memset
 ; Zero the 32 bytes of input coefficients.
    ldr     r0, [sp]                ; r0 = input pointer saved at entry
    add     sp, sp, #4
    mov     r12, #0
    str     r12, [r0]
    str     r12, [r0, #4]
    str     r12, [r0, #8]
    str     r12, [r0, #12]
    str     r12, [r0, #16]
    str     r12, [r0, #20]
    str     r12, [r0, #24]
    str     r12, [r0, #28]
    ldmia   sp!, {r4 - r11, pc} ; restore registers and return
    ENDP    ;|vp8_dequant_idct_v6|
    END
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ b/vp8/decoder/arm/armv6/idct_blk_v6.c
@@ -0,0 +1,151 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_ports/config.h"
 #include "idct.h"
 #include "dequantize.h"
 /*
  * Reconstruct the 16 luma 4x4 blocks of a macroblock when the DC terms are
  * supplied separately in dc[].
  *
  * Blocks are visited as 4 rows of 4.  For each block eobs[] picks the
  * routine: a value above 1 runs vp8_dequant_dc_idct_add_v6 on the block's
  * 16 coefficients, anything else runs vp8_dc_only_idct_add_v6 on dc[] alone.
  * After each row q advances by 64 shorts, pre by 64 bytes, dst by 4*stride
  * bytes and dc/eobs by 4 entries.  The literal 16 handed to the callees is
  * presumably the pitch of the prediction buffer -- confirm against idct.h.
  */
 void vp8_dequant_dc_idct_add_y_block_v6
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
 {
    int row;
    int col;
    for (row = 0; row < 4; row++)
    {
        for (col = 0; col < 4; col++)
        {
            /* Each block is 4 pixels wide and owns 16 coefficients. */
            unsigned char *pred_blk = pre + 4 * col;
            unsigned char *dst_blk  = dst + 4 * col;
            if (eobs[col] > 1)
                vp8_dequant_dc_idct_add_v6 (q + 16 * col, dq, pred_blk, dst_blk, 16, stride, dc[col]);
            else
                vp8_dc_only_idct_add_v6 (dc[col], pred_blk, dst_blk, 16, stride);
        }
        /* Step every cursor to the next row of four blocks. */
        eobs += 4;
        dc   += 4;
        q    += 64;
        pre  += 64;
        dst  += 4*stride;
    }
 }
 /*
  * Reconstruct the 16 luma 4x4 blocks of a macroblock, taking each block's DC
  * term from its own coefficient array.
  *
  * Blocks are visited as 4 rows of 4.  For each block eobs[] picks the
  * routine: a value above 1 runs vp8_dequant_idct_add_v6 on the block's 16
  * coefficients; otherwise only the dequantised DC (coef[0]*dq[0]) is applied
  * through vp8_dc_only_idct_add_v6 and the leading coefficients are cleared
  * here.  After each row q advances by 64 shorts, pre by 64 bytes, dst by
  * 4*stride bytes and eobs by 4 entries.  The literal 16 handed to the callees
  * is presumably the pitch of the prediction buffer -- confirm against idct.h.
  */
 void vp8_dequant_idct_add_y_block_v6
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
 {
    int row;
    int col;
    for (row = 0; row < 4; row++)
    {
        for (col = 0; col < 4; col++)
        {
            short *coef = q + 16 * col;
            unsigned char *pred_blk = pre + 4 * col;
            unsigned char *dst_blk  = dst + 4 * col;
            if (eobs[col] > 1)
                vp8_dequant_idct_add_v6 (coef, dq, pred_blk, dst_blk, 16, stride);
            else
            {
                vp8_dc_only_idct_add_v6 (coef[0]*dq[0], pred_blk, dst_blk, 16, stride);
                /* Clear the first two coefficients with plain short stores.
                 * The previous ((int *)coef)[0] = 0 cleared the same 32 bits
                 * but accessed short objects through an int lvalue, which
                 * violates C's aliasing rules and assumes int alignment. */
                coef[0] = 0;
                coef[1] = 0;
            }
        }
        /* Step every cursor to the next row of four blocks. */
        eobs += 4;
        q    += 64;
        pre  += 64;
        dst  += 4*stride;
    }
 }
 /*
  * Reconstruct the chroma 4x4 blocks of a macroblock: first the plane written
  * through dstu, then the plane written through dstv.
  *
  * Each plane is visited as 2 rows of 2 blocks.  q, pre and eobs run
  * continuously across both planes (32 shorts / 32 bytes / 2 entries per
  * row).  For each block eobs[] picks the routine: a value above 1 runs
  * vp8_dequant_idct_add_v6; otherwise only the dequantised DC (coef[0]*dq[0])
  * is applied through vp8_dc_only_idct_add_v6 and the leading coefficients
  * are cleared here.  The literal 8 handed to the callees is presumably the
  * pitch of the chroma prediction buffer -- confirm against idct.h.
  */
 void vp8_dequant_idct_add_uv_block_v6
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
 {
    unsigned char *plane_dst[2];
    int plane;
    int row;
    int col;
    plane_dst[0] = dstu;
    plane_dst[1] = dstv;
    for (plane = 0; plane < 2; plane++)
    {
        unsigned char *dst = plane_dst[plane];
        for (row = 0; row < 2; row++)
        {
            for (col = 0; col < 2; col++)
            {
                short *coef = q + 16 * col;
                unsigned char *pred_blk = pre + 4 * col;
                unsigned char *dst_blk  = dst + 4 * col;
                if (eobs[col] > 1)
                    vp8_dequant_idct_add_v6 (coef, dq, pred_blk, dst_blk, 8, stride);
                else
                {
                    vp8_dc_only_idct_add_v6 (coef[0]*dq[0], pred_blk, dst_blk, 8, stride);
                    /* Clear the first two coefficients with plain short
                     * stores.  The previous ((int *)coef)[0] = 0 cleared the
                     * same 32 bits but accessed short objects through an int
                     * lvalue, which violates C's aliasing rules. */
                    coef[0] = 0;
                    coef[1] = 0;
                }
            }
            /* Step every cursor to the next row of two blocks. */
            eobs += 2;
            q    += 32;
            pre  += 32;
            dst  += 4*stride;
        }
    }
 }
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -14,32 +14,56 @@
 #if HAVE_ARMV6
 extern prototype_dequant_block(vp8_dequantize_b_v6);
-extern prototype_dequant_idct(vp8_dequant_idct_v6);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
-extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_v6);
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
 extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6);
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_v6
-#undef  vp8_dequant_idct
+#undef vp8_dequant_idct_add
-#define vp8_dequant_idct vp8_dequant_idct_v6
+#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
-#undef  vp8_dequant_idct_dc
+#undef vp8_dequant_dc_idct_add
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_v6
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
 #undef vp8_dequant_dc_idct_add_y_block
 #define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6
 #undef vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
 #undef vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
 #endif
 #if HAVE_ARMV7
 extern prototype_dequant_block(vp8_dequantize_b_neon);
-extern prototype_dequant_idct(vp8_dequant_idct_neon);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
-extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_neon);
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
 extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon);
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_neon
-#undef  vp8_dequant_idct
+#undef vp8_dequant_idct_add
-#define vp8_dequant_idct vp8_dequant_idct_neon
+#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
-#undef  vp8_dequant_idct_dc
+#undef vp8_dequant_dc_idct_add
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_neon
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
 #undef vp8_dequant_dc_idct_add_y_block
 #define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon
 #undef vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
 #undef vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
 #endif
 #endif
--- a/vp8/decoder/arm/detokenize.asm
+++ b/vp8/decoder/arm/detokenize.asm
@@ -0,0 +1,320 @@
 ;
 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_decode_mb_tokens_v6|
    AREA    |.text|, CODE, READONLY  ; name this block of code
    INCLUDE vpx_asm_offsets.asm
 l_qcoeff    EQU     0
 l_i         EQU     4
 l_type      EQU     8
 l_stop      EQU     12
 l_c         EQU     16
 l_l_ptr     EQU     20
 l_a_ptr     EQU     24
 l_bc        EQU     28
 l_coef_ptr  EQU     32
 l_stacksize EQU     64
 ;; constant offsets -- these should be created at build time
 c_block2above_offset         EQU 25
 c_entropy_nodes              EQU 11
 c_dct_eob_token              EQU 11
 ;
 ; vp8_decode_mb_tokens_v6
 ;   entry: r0 = detoken, r1 = type;  exit: r0 = 0.
 ;
 ; A 64-byte scratch frame (l_stacksize) holds the l_* locals defined above.
 ; Bool-decoder state is loaded from detoken->current_bc on entry and stored
 ; back on exit:  r4 = value, r5 = count, r6 = range, r8 = bufptr.
 ; r9 holds detoken throughout.  r7 is type on entry to BLOCK_LOOP and is then
 ; reused as c (the coefficient position); r11 is i on entry to BLOCK_LOOP and
 ; is reused as a scratch register inside the coefficient loop.
 ;
 |vp8_decode_mb_tokens_v6| PROC
    stmdb       sp!, {r4 - r11, lr}
    sub         sp, sp, #l_stacksize
    mov         r7, r1                      ; type
    mov         r9, r0                      ; detoken
    ldr         r1, [r9, #detok_current_bc]
    ldr         r0, [r9, #detok_qcoeff_start_ptr]
    mov         r11, #0                     ; i
    mov         r3, #16                     ; stop
    cmp         r7, #1                      ; type ?= 1
    addeq       r11, r11, #24               ; i = 24
    addeq       r3, r3, #8                  ; stop = 24
    addeq       r0, r0, #3, 24              ; qcoefptr += 24*16 shorts (#3 ror 24 = 768 bytes)
    str         r0, [sp, #l_qcoeff]
    str         r11, [sp, #l_i]
    str         r7, [sp, #l_type]
    str         r3, [sp, #l_stop]
    str         r1, [sp, #l_bc]
    add         lr, r9, r7, lsl #2          ; detoken + type*4
    ldr         r8, [r1, #bool_decoder_user_buffer]
    ldr         r10, [lr, #detok_coef_probs]
    ldr         r5, [r1, #bool_decoder_count]
    ldr         r6, [r1, #bool_decoder_range]
    ldr         r4, [r1, #bool_decoder_value]
    str         r10, [sp, #l_coef_ptr]
 BLOCK_LOOP
    ldr         r3, [r9, #detok_ptr_block2leftabove]
    ldr         r1, [r9, #detok_L]
    ldr         r2, [r9, #detok_A]
    ldrb        r12, [r3, r11]!             ; block2left[i]
    ldrb        r3, [r3, #c_block2above_offset]; block2above[i]
    cmp         r7, #0                      ; c = !type
    moveq       r7, #1
    movne       r7, #0
    ldrb        r0, [r1, r12]!              ; *(L += block2left[i])
    ldrb        r3, [r2, r3]!               ; *(A += block2above[i])
    mov         lr, #c_entropy_nodes        ; ENTROPY_NODES = 11
 ; VP8_COMBINEENTROPYCONTEXTS(t, *a, *l) => t = ((*a) != 0) + ((*l) != 0)
    cmp         r0, #0                      ; *l ?= 0
    movne       r0, #1
    cmp         r3, #0                      ; *a ?= 0
    addne       r0, r0, #1                  ; t
    str         r1, [sp, #l_l_ptr]          ; save &l
    str         r2, [sp, #l_a_ptr]          ; save &a
    smlabb      r0, r0, lr, r10             ; Prob = coef_probs + (t * ENTROPY_NODES)
    mov         r1, #0                      ; t = 0
    str         r7, [sp, #l_c]
    ;align 4
 COEFF_LOOP
    ldr         r3, [r9, #detok_ptr_coef_bands_x]
    ldr         lr, [r9, #detok_coef_tree_ptr]
    ;STALL
    ldrb        r3, [r3, r7]                ; coef_bands_x[c]
    ;STALL
    ;STALL
    add         r0, r0, r3                  ; Prob += coef_bands_x[c]
 get_token_loop
    ldrb        r2, [r0, +r1, asr #1]       ; Prob[t >> 1]
    mov         r3, r6, lsl #8              ; range << 8
    sub         r3, r3, #256                ; (range << 8) - (1 << 8)
    mov         r10, #1                     ; 1
    smlawb      r2, r3, r2, r10             ; split = 1 + (((range-1) * probability) >> 8)
    ldrb        r12, [r8]                   ; load cx data byte in stall slot : r8 = bufptr
    ;++
    subs        r3, r4, r2, lsl #24         ; value-(split<<24): used later to calculate shift for NORMALIZE
    addhs       r1, r1, #1                  ; t += 1
    movhs       r4, r3                      ; value -= bigsplit (split << 24)
    subhs       r2, r6, r2                  ; range -= split
 ;   movlo       r6, r2                      ; range = split
    ldrsb     r1, [lr, r1]                  ; t = onyx_coef_tree_ptr[t]
 ; NORMALIZE
    clz         r3, r2                      ; vp8dx_bitreader_norm[range] + 24
    sub         r3, r3, #24                 ; vp8dx_bitreader_norm[range]
    subs        r5, r5, r3                  ; count -= shift
    mov         r6, r2, lsl r3              ; range <<= shift
    mov         r4, r4, lsl r3              ; value <<= shift
 ; if count <= 0, += BR_COUNT; value |= *bufptr++ << (BR_COUNT-count); BR_COUNT = 8, but need to upshift values by +16
    addle         r5, r5, #8                ; count += 8
    rsble         r3, r5, #24               ; 24 - count
    addle         r8, r8, #1                ; bufptr++
    orrle         r4, r4, r12, lsl r3       ; value |= *bufptr << shift + 16
    cmp         r1, #0                      ; t ?= 0
    bgt         get_token_loop              ; while (t > 0)
    cmn         r1, #c_dct_eob_token        ; if(t == -DCT_EOB_TOKEN)
    beq         END_OF_BLOCK                ; break
    rsb         lr, r1, #0                  ; v = -t;
    cmp         lr, #4                      ; if(v > FOUR_TOKEN)
    ble         SKIP_EXTRABITS
    ldr         r3, [r9, #detok_teb_base_ptr]
    mov         r11, #1                     ; constant 1: the "1 +" of split and the 1 in v += 1 << bits_count
    add         r7, r3, lr, lsl #4          ; detok_teb_base_ptr + (v << 4)
    ldrsh       lr, [r7, #tokenextrabits_min_val] ; v = teb_ptr->min_val
    ldrsh       r0, [r7, #tokenextrabits_length] ; bits_count = teb_ptr->Length
 extrabits_loop
    add         r3, r0, r7                  ; &teb_ptr->Probs[bits_count]
    ldrb        r2, [r3, #4]                ; probability; +4 presumably skips min_val/Length ahead of Probs - TODO confirm
    mov         r3, r6, lsl #8              ; range << 8
    sub         r3, r3, #256                ; (range << 8) - (1 << 8)
    smlawb      r2, r3, r2, r11             ; split = 1 +  (((range-1) * probability) >> 8)
    ldrb        r12, [r8]                   ; *bufptr
    ;++
    subs        r10, r4, r2, lsl #24        ; value - (split<<24)
    movhs       r4, r10                     ; value = value - (split << 24)
    subhs       r2, r6, r2                  ; range = range - split
    addhs       lr, lr, r11, lsl r0         ; v += ((UINT16)1<<bits_count)
 ; NORMALIZE
    clz         r3, r2                      ; shift - leading zeros in split
    sub         r3, r3, #24                 ; don't count first 3 bytes
    subs        r5, r5, r3                  ; count -= shift
    mov         r6, r2, lsl r3              ; range = range << shift
    mov         r4, r4, lsl r3              ; value <<= shift
    addle       r5, r5, #8                  ; count += BR_COUNT
    addle       r8, r8, #1                  ; bufptr++
    rsble       r3, r5, #24                 ; BR_COUNT - count
    orrle       r4, r4, r12, lsl r3         ; value |= *bufptr << (BR_COUNT - count)
    subs        r0, r0, #1                  ; bits_count --
    bpl         extrabits_loop
 SKIP_EXTRABITS
    ldr         r11, [sp, #l_qcoeff]
    ldr         r0, [sp, #l_coef_ptr]       ; Prob = coef_probs
    cmp         r1, #0                      ; check for nonzero token - if (t)
    beq         SKIP_EOB_CHECK              ; if t is zero, we will skip the eob table check
    add         r3, r6, #1                  ; range + 1
    mov         r2, r3, lsr #1              ; split = (range + 1) >> 1
    subs        r3, r4, r2, lsl #24         ; value - (split<<24)
    movhs       r4, r3                      ; value -= (split << 24)
    subhs       r2, r6, r2                  ; range -= split
    mvnhs       r3, lr                      ; ~v
    addhs       lr, r3, #1                  ; v = (v ^ -1) + 1, i.e. v = -v
 ; NORMALIZE
    clz         r3, r2                      ; leading 0s in split
    sub         r3, r3, #24                 ; shift
    subs        r5, r5, r3                  ; count -= shift
    mov         r6, r2, lsl r3              ; range <<= shift
    mov         r4, r4, lsl r3              ; value <<= shift
    ldrleb      r2, [r8], #1                ; *(bufptr++)
    addle       r5, r5, #8                  ; count += 8
    rsble       r3, r5, #24                 ; BR_COUNT - count
    orrle       r4, r4, r2, lsl r3          ; value |= *bufptr << (BR_COUNT - count)
    add         r0, r0, #11                 ; Prob += ENTROPY_NODES (11)
    cmn         r1, #1                      ; t < -ONE_TOKEN
    addlt       r0, r0, #11                 ; Prob += ENTROPY_NODES (11)
    mvn         r1, #1                      ; t = -2 (mvn yields ~1); becomes 0 after the t += 2 below
 SKIP_EOB_CHECK
    ldr         r7, [sp, #l_c]              ; c
    ldr         r3, [r9, #detok_scan]
    add         r1, r1, #2                  ; t+= 2
    cmp         r7, #15                     ; compare pre-increment c with 15; flags are consumed by the blt below
    ldr         r3, [r3, +r7, lsl #2]       ; scan[c] this needs pre-inc c value
    add         r7, r7, #1                  ; c++
    add         r3, r11, r3, lsl #1         ; qcoeff + scan[c]
    str         r7, [sp, #l_c]              ; store c
    strh        lr, [r3]                    ; qcoef_ptr[scan[c]] = v
    blt         COEFF_LOOP
    sub         r7, r7, #1                  ; if(t != -DCT_EOB_TOKEN) --c
 END_OF_BLOCK
    ldr         r3, [sp, #l_type]           ; type
    ldr         r10, [sp, #l_coef_ptr]      ; coef_ptr
    ldr         r0, [sp, #l_qcoeff]         ; qcoeff
    ldr         r11, [sp, #l_i]             ; i
    ldr         r12, [sp, #l_stop]          ; stop
    cmp         r3, #0                      ; type ?= 0
    moveq       r1, #1
    movne       r1, #0
    add         r3, r11, r9                 ; detok + i
    cmp         r7, r1                      ; c ?= !type
    strb        r7, [r3, #detok_eob]        ; eob[i] = c
    ldr         r7, [sp, #l_l_ptr]          ; l
    ldr         r2, [sp, #l_a_ptr]          ; a
    movne       r3, #1                      ; t
    moveq       r3, #0
    add         r0, r0, #32                 ; qcoeff += 32 bytes (16 halfword coefficients)
    add         r11, r11, #1                ; i++
    strb        r3, [r7]                    ; *l = t
    strb        r3, [r2]                    ; *a = t
    str         r0, [sp, #l_qcoeff]         ; qcoeff
    str         r11, [sp, #l_i]             ; i
    cmp         r11, r12                    ; i < stop
    ldr         r7, [sp, #l_type]           ; type
    blt         BLOCK_LOOP
    cmp         r11, #25                    ; i ?= 25
    bne         ln2_decode_mb_to
    ldr         r12, [r9, #detok_qcoeff_start_ptr]
    ldr         r10, [r9, #detok_coef_probs]
    mov         r7, #0                      ; type/i = 0
    mov         r3, #16                     ; stop = 16
    str         r12, [sp, #l_qcoeff]        ; qcoeff_ptr = qcoeff_start_ptr
    str         r7, [sp, #l_i]
    str         r7, [sp, #l_type]
    str         r3, [sp, #l_stop]
    str         r10, [sp, #l_coef_ptr]      ; coef_probs = coef_probs[type=0]
    b           BLOCK_LOOP
 ln2_decode_mb_to
    cmp         r11, #16                    ; i ?= 16
    bne         ln1_decode_mb_to
    mov         r10, #detok_coef_probs
    add         r10, r10, #2*4              ; coef_probs[type]
    ldr         r10, [r9, r10]              ; detok + detok_coef_probs[type]
    mov         r7, #2                      ; type = 2
    mov         r3, #24                     ; stop = 24
    str         r7, [sp, #l_type]
    str         r3, [sp, #l_stop]
    str         r10, [sp, #l_coef_ptr]      ; coef_probs = coef_probs[type]
    b           BLOCK_LOOP
 ln1_decode_mb_to
    ldr         r2, [sp, #l_bc]
    mov         r0, #0
    nop
    str         r8, [r2, #bool_decoder_user_buffer]
    str         r5, [r2, #bool_decoder_count]
    str         r4, [r2, #bool_decoder_value]
    str         r6, [r2, #bool_decoder_range]
    add         sp, sp, #l_stacksize
    ldmia       sp!, {r4 - r11, pc}
    ENDP  ; |vp8_decode_mb_tokens_v6|
    END
--- a/vp8/decoder/arm/detokenize_arm.h
+++ b/vp8/decoder/arm/detokenize_arm.h
@@ -7,13 +7,16 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef BIT_OPS_H
 #define BIT_OPS_H
 /* Evaluates to a mask with n bits set */
 #define BITS_MASK(n) ((1<<(n))-1)
-/* Returns len bits, with the LSB at position bit */
+#ifndef DETOKENIZE_ARM_H
-#define BITS_GET(val, bit, len) (((val)>>(bit))&BITS_MASK(len))
+#define DETOKENIZE_ARM_H
 #if HAVE_ARMV6
 #if CONFIG_ARM_ASM_DETOK
 void vp8_init_detokenizer(VP8D_COMP *dx);
 void vp8_decode_mb_tokens_v6(DETOK *detoken, int type);
 #endif
 #endif
 #endif
--- a/vp8/decoder/arm/detokenizearm_sjl.c
+++ b/vp8/decoder/arm/detokenizearm_sjl.c
@@ -1,731 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "type_aliases.h"
 #include "blockd.h"
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #define BR_COUNT 8
 #define BOOL_DATA UINT8
 #define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
 /*
  * Offset added to the probability pointer for coefficient position c
  * (see "Prob += vp8_coef_bands_x[c]").  Each entry is a multiple of OCB_X,
  * i.e. of PREV_COEF_CONTEXTS * ENTROPY_NODES.  The line below is an earlier
  * UINT16 declaration of the same table, kept commented out.
  */
 //ALIGN16 UINT16 onyx_coef_bands_x[16] = { 0, 1*OCB_X, 2*OCB_X, 3*OCB_X, 6*OCB_X, 4*OCB_X, 5*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 7*OCB_X};
 DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
 #define ONE_CONTEXT_NODE            2
 #define LOW_VAL_CONTEXT_NODE        3
 #define TWO_CONTEXT_NODE            4
 #define THREE_CONTEXT_NODE          5
 #define HIGH_LOW_CONTEXT_NODE       6
 #define CAT_ONE_CONTEXT_NODE        7
 #define CAT_THREEFOUR_CONTEXT_NODE  8
 #define CAT_THREE_CONTEXT_NODE      9
 #define CAT_FIVE_CONTEXT_NODE       10
 /*
  * Per-token extra-bit description, indexed by token number.  The decoder
  * reads .min_val as the starting value, .Length as the index of the first
  * extra bit to decode (it counts down to 0), and .Probs[bit] as the
  * probability for each extra bit.  The initialisers below appear to be in the
  * order { min_val, Length, Probs[] } -- confirm against the TOKENEXTRABITS
  * declaration.
  */
 DECLARE_ALIGNED(16, static const TOKENEXTRABITS, vp8d_token_extra_bits2[MAX_ENTROPY_TOKENS]) =
 {
    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  //ZERO_TOKEN
    {  1, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //ONE_TOKEN
    {  2, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //TWO_TOKEN
    {  3, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //THREE_TOKEN
    {  4, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //FOUR_TOKEN
    {  5, 0, { 159, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  //DCT_VAL_CATEGORY1
    {  7, 1, { 145, 165, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY2
    { 11, 2, { 140, 148, 173, 0,  0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY3
    { 19, 3, { 135, 140, 155, 176, 0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY4
    { 35, 4, { 130, 134, 141, 157, 180, 0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY5
    { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0   } }, //DCT_VAL_CATEGORY6
    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  // EOB TOKEN
 };
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 /*
  * Three 25-entry lookup tables packed back to back, one entry per block
  * index 0..24: block-to-context, block-to-left and block-to-above, in that
  * order.  The ONYXBLOCK2*_OFFSET macros (0, 25, 50) give the start of each
  * sub-table.
  */
 DECLARE_ALIGNED(16, const UINT8, vp8_block2context_leftabove[25*3]) =
 {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, //end of vp8_block2context
    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0, //end of vp8_block2left
    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0 //end of vp8_block2above
 };
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 /*
  * Zero the above/left entropy-context entries of a macroblock.
  *
  * Blocks 0..23 are always cleared.  The entry for block 24 (looked up in the
  * Y2CONTEXT plane) is cleared only when the macroblock mode is neither
  * B_PRED nor SPLITMV.
  */
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
 {
    ENTROPY_CONTEXT **const above = x->above_context;
    ENTROPY_CONTEXT(* const left)[4] = x->left_context;
    int blk = 0;
    while (blk < 24)
    {
        above[ vp8_block2context[blk] ][ vp8_block2above[blk] ] = 0;
        left[ vp8_block2context[blk] ][ vp8_block2left[blk] ] = 0;
        blk++;
    }
    /* Modes B_PRED and SPLITMV leave the block-24 context untouched. */
    if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
        return;
    above[Y2CONTEXT][ vp8_block2above[24] ] = 0;
    left[Y2CONTEXT][ vp8_block2left[24] ] = 0;
 }
 #define ONYXBLOCK2CONTEXT_OFFSET    0
 #define ONYXBLOCK2LEFT_OFFSET       25
 #define ONYXBLOCK2ABOVE_OFFSET 50
 /*
  * Normalisation shift indexed by range (0..127); installed as
  * detoken.norm_ptr and read by the NORMALIZE macro.  For a non-zero index r
  * the entry is the left shift that moves r's top set bit to bit 7 (e.g. 1 -> 7,
  * 2..3 -> 6, 64..127 -> 1); entry 0 is 0.
  */
 DECLARE_ALIGNED(16, const static unsigned char, norm[128]) =
 {
    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
 };
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 /*
  * Fill dx->detoken with pointers to the static lookup tables in this file,
  * the start of the macroblock's coefficient buffer, and the four
  * per-block-type coefficient probability tables of the frame context.
  */
 void init_detokenizer(VP8D_COMP *dx)
 {
    const VP8_COMMON *const cm = & dx->common;
    MACROBLOCKD *mb = & dx->mb;
    int type;
    /* Per-block-type probability tables. */
    for (type = 0; type < 4; type++)
        dx->detoken.coef_probs[type] = (unsigned char *)(cm->fc.coef_probs [type] [ 0 ] [0]);
    /* Coefficient storage of the decoder's macroblock. */
    dx->detoken.qcoeff_start_ptr = &mb->qcoeff[0];
    /* Static lookup tables. */
    dx->detoken.teb_base_ptr = (TOKENEXTRABITS *)vp8d_token_extra_bits2;
    dx->detoken.scan = (int *)vp8_default_zig_zag1d;
    dx->detoken.ptr_onyx_coef_bands_x = vp8_coef_bands_x;
    dx->detoken.ptr_onyxblock2context_leftabove = (UINT8 *)vp8_block2context_leftabove;
    dx->detoken.vp8_coef_tree_ptr = (vp8_tree_index *)vp8_coef_tree;
    dx->detoken.norm_ptr = (unsigned char *)norm;
 }
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 /*
  * NORMALIZE: renormalise the bool-decoder state after a symbol is decoded.
  *
  * Looks up shift = detoken->norm_ptr[range], shifts range and value left by
  * it and subtracts it from count.  When count drops to zero or below, count
  * is increased by BR_COUNT, the byte at *bufptr is OR-ed into value shifted
  * left by (BR_COUNT - count), and bufptr advances by one.
  *
  * Expects detoken, range, value, count, shift and bufptr in the expanding
  * scope.  It expands to a brace block and is used without a trailing
  * semicolon.
  *
  * Earlier variants of the lookup, kept for reference:
  *     shift = norm[range];
  *     shift = norm_ptr[range];
  * These used to be "//" comments ending in a backslash, which spliced the
  * #define line into the comment and left NORMALIZE undefined.
  */
 #define NORMALIZE \
    /*if(range < 0x80)*/                            \
    { \
        shift = detoken->norm_ptr[range]; \
        range <<= shift; \
        value <<= shift; \
        count -= shift; \
        if(count <= 0) \
        { \
            count += BR_COUNT ; \
            value |= (*bufptr) << (BR_COUNT-count); \
            bufptr++; \
        } \
    }
 #if 1
 /*
  * Bool-decoder helper macros.  All of them operate on variables in the
  * expanding scope (split, range, value, count, bufptr, v, val, c, Prob,
  * coef_probs, qcoeff_ptr, scan) and several jump to labels there.
  * Expression arguments are parenthesised so that compound expressions
  * expand with the intended precedence.
  */
 /*
  * Decode one equiprobable bit (split = (range + 1) >> 1) and set v to
  * value_to_sign or to its negation, then double range and value and refill
  * value from *bufptr once count reaches zero.
  */
 #define DECODE_AND_APPLYSIGN(value_to_sign) \
    split = (range + 1) >> 1; \
    if ( (value >> 24) < split ) \
    { \
        range = split; \
        v = (value_to_sign); \
    } \
    else \
    { \
        range = range-split; \
        value = value-(split<<24); \
        v = -(value_to_sign); \
    } \
    range +=range;                   \
    value +=value;                   \
    if (!--count) \
    { \
        count = BR_COUNT; \
        value |= *bufptr; \
        bufptr++; \
    }
 /* Decode one bit with the given probability; on a zero bit jump to branch. */
 #define DECODE_AND_BRANCH_IF_ZERO(probability,branch) \
    { \
        split = 1 +  ((( (probability)*(range-1) ) )>> 8); \
        if ( (value >> 24) < split ) \
        { \
            range = split; \
            NORMALIZE \
            goto branch; \
        } \
        value -= (split<<24); \
        range = range - split; \
        NORMALIZE \
    }
 /*
  * As above, but on a zero bit also advance c and reset Prob to
  * coef_probs + vp8_coef_bands_x[c] before jumping to branch.
  */
 #define DECODE_AND_LOOP_IF_ZERO(probability,branch) \
    { \
        split = 1 + ((( (probability)*(range-1) ) ) >> 8); \
        if ( (value >> 24) < split ) \
        { \
            range = split; \
            NORMALIZE \
            Prob = coef_probs; \
            ++c; \
            Prob += vp8_coef_bands_x[c]; \
            goto branch; \
        } \
        value -= (split<<24); \
        range = range - split; \
        NORMALIZE \
    }
 /*
  * Decode the sign of val, store the signed coefficient at scan position c and
  * either continue at DO_WHILE (c < 15) or finish the block.
  */
 #define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) \
    DECODE_AND_APPLYSIGN(val) \
    Prob = coef_probs + (ENTROPY_NODES*2); \
    if(c < 15){\
        qcoeff_ptr [ scan[c] ] = (INT16) v; \
        ++c; \
        goto DO_WHILE; }\
    qcoeff_ptr [ scan[15] ] = (INT16) v; \
    goto BLOCK_FINISHED;
 /*
  * Decode extra bit number bits_count of token t and, when it is set, add
  * 1 << bits_count to val.
  */
 #define DECODE_EXTRABIT_AND_ADJUST_VAL(t,bits_count)\
    split = 1 +  (((range-1) * vp8d_token_extra_bits2[(t)].Probs[(bits_count)]) >> 8); \
    if(value >= (split<<24))\
    {\
        range = range-split;\
        value = value-(split<<24);\
        val += ((UINT16)1<<(bits_count));\
    }\
    else\
    {\
        range = split;\
    }\
    NORMALIZE
 #endif
 #if 0
 /*
  * C version of the macroblock token decoder (this listing places it under
  * "#if 0", so it is not compiled).
  *
  * Decodes the coefficient tokens of one macroblock from x->current_bc into
  * x->qcoeff, records each block's end-of-block position in x->Block[i].eob
  * and updates the above/left entropy contexts.
  *
  * Block order: when the mode is neither B_PRED nor SPLITMV, block 24 is
  * decoded first with type 1, then blocks 0..15 with type 0; otherwise blocks
  * 0..15 are decoded with type 3.  Blocks 16..23 follow with type 2.
  *
  * Returns the sum of the per-block eob values, reduced by 16 when block 24
  * was decoded.
  */
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
    ENTROPY_CONTEXT **const A = x->above_context;
    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
    const VP8_COMMON *const oc = & dx->common;
    BOOL_DECODER *bc = x->current_bc;
    ENTROPY_CONTEXT *a;
    ENTROPY_CONTEXT *l;
    int i;
    int eobtotal = 0;
    register int count;
    BOOL_DATA *bufptr;
    register unsigned int range;
    register unsigned int value;
    const int *scan;
    register unsigned int shift;
    UINT32 split;
    INT16 *qcoeff_ptr;
    UINT8 *coef_probs;
    int type;
    int stop;
    INT16 val, bits_count;
    INT16 c;
    INT16 t;
    INT16 v;
    vp8_prob *Prob;
    //int *scan;
    type = 3;
    i = 0;
    stop = 16;
    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
    {
        /* Start with block 24 alone (i == stop), using type 1. */
        i = 24;
        stop = 24;
        type = 1;
        qcoeff_ptr = &x->qcoeff[24*16];
        scan = vp8_default_zig_zag1d;
        eobtotal -= 16;
    }
    else
    {
        scan = vp8_default_zig_zag1d;
        qcoeff_ptr = &x->qcoeff[0];
    }
    /* Work on local copies of the bool-decoder state; written back at exit. */
    count   = bc->count;
    range   = bc->range;
    value   = bc->value;
    bufptr  = &bc->buffer[bc->pos];
    coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
 BLOCK_LOOP:
    a = A[ vp8_block2context[i] ] + vp8_block2above[i];
    l = L[ vp8_block2context[i] ] + vp8_block2left[i];
    /* Type 0 blocks start at coefficient 1, all other types at 0. */
    c = (INT16)(!type);
    VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
    Prob = coef_probs;
    Prob += t * ENTROPY_NODES;
 DO_WHILE:
    Prob += vp8_coef_bands_x[c];
    DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
 CHECK_0_:
    DECODE_AND_LOOP_IF_ZERO(Prob[ZERO_CONTEXT_NODE], CHECK_0_);
    DECODE_AND_BRANCH_IF_ZERO(Prob[ONE_CONTEXT_NODE], ONE_CONTEXT_NODE_0_);
    DECODE_AND_BRANCH_IF_ZERO(Prob[LOW_VAL_CONTEXT_NODE], LOW_VAL_CONTEXT_NODE_0_);
    DECODE_AND_BRANCH_IF_ZERO(Prob[HIGH_LOW_CONTEXT_NODE], HIGH_LOW_CONTEXT_NODE_0_);
    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREEFOUR_CONTEXT_NODE], CAT_THREEFOUR_CONTEXT_NODE_0_);
    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_FIVE_CONTEXT_NODE], CAT_FIVE_CONTEXT_NODE_0_);
    /* Category 6: extra bits are read in a loop from bit Length down to 0. */
    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].min_val;
    bits_count = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].Length;
    do
    {
        DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY6, bits_count);
        bits_count -- ;
    }
    while (bits_count >= 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 CAT_FIVE_CONTEXT_NODE_0_:
    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY5].min_val;
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 4);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 3);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 2);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 1);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 CAT_THREEFOUR_CONTEXT_NODE_0_:
    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREE_CONTEXT_NODE], CAT_THREE_CONTEXT_NODE_0_);
    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY4].min_val;
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 3);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 2);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 1);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 CAT_THREE_CONTEXT_NODE_0_:
    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY3].min_val;
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 2);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 1);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 HIGH_LOW_CONTEXT_NODE_0_:
    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_ONE_CONTEXT_NODE], CAT_ONE_CONTEXT_NODE_0_);
    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY2].min_val;
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 1);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 CAT_ONE_CONTEXT_NODE_0_:
    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY1].min_val;
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY1, 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 LOW_VAL_CONTEXT_NODE_0_:
    DECODE_AND_BRANCH_IF_ZERO(Prob[TWO_CONTEXT_NODE], TWO_CONTEXT_NODE_0_);
    DECODE_AND_BRANCH_IF_ZERO(Prob[THREE_CONTEXT_NODE], THREE_CONTEXT_NODE_0_);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(4);
 THREE_CONTEXT_NODE_0_:
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(3);
 TWO_CONTEXT_NODE_0_:
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(2);
 ONE_CONTEXT_NODE_0_:
    DECODE_AND_APPLYSIGN(1);
    /* After a ONE token the next probabilities start one ENTROPY_NODES row in. */
    Prob = coef_probs + ENTROPY_NODES;
    if (c < 15)
    {
        qcoeff_ptr [ scan[c] ] = (INT16) v;
        ++c;
        goto DO_WHILE;
    }
    qcoeff_ptr [ scan[15] ] = (INT16) v;
 BLOCK_FINISHED:
    t = ((x->Block[i].eob = c) != !type);   // any nonzero data?
    eobtotal += x->Block[i].eob;
    *a = *l = t;
    qcoeff_ptr += 16;
    i++;
    if (i < stop)
        goto BLOCK_LOOP;
    /* Block 24 just finished: restart at block 0 with type 0. */
    if (i == 25)
    {
        scan = vp8_default_zig_zag1d;//x->scan_order1d;
        type = 0;
        i = 0;
        stop = 16;
        coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
        qcoeff_ptr = &x->qcoeff[0];
        goto BLOCK_LOOP;
    }
    /* Blocks 0..15 finished: continue with blocks 16..23 using type 2. */
    if (i == 16)
    {
        type = 2;
        coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
        stop = 24;
        goto BLOCK_LOOP;
    }
    /* Publish the updated bool-decoder state. */
    bc->count = count;
    bc->value = value;
    bc->range = range;
    bc->pos  = bufptr - bc->buffer;
    return eobtotal;
 }
 //#endif
 #else
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 #if 0
 /*
  * Disabled alternative layout of the coefficient token tree: 22 entries,
  * i.e. two per node for the 11 nodes named in the right-hand comments.
  * Negated *_TOKEN / DCT_VAL_CATEGORY* entries name a token; the small
  * positive entries are, per the original note below, offsets relative to
  * the current position rather than absolute indices.
  * NOTE(review): nothing in this view references vp8_coef_tree_x -- confirm
  * before re-enabling.
  */
 //uses relative offsets
 const vp8_tree_index vp8_coef_tree_x[ 22] =   /* corresponding _CONTEXT_NODEs */
 {
    -DCT_EOB_TOKEN, 1,                             /* 0 = EOB */
    -ZERO_TOKEN, 1,                               /* 1 = ZERO */
    -ONE_TOKEN, 1,                               /* 2 = ONE */
    2, 5,                                       /* 3 = LOW_VAL */
    -TWO_TOKEN, 1,                         /* 4 = TWO */
    -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
    2, 3,                                  /* 6 = HIGH_LOW */
    -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
    2, 3,                                 /* 8 = CAT_THREEFOUR */
    -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
    -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
 };
 #endif
 /* Right shift applied to (range - 1) * probability when computing the split
  * in the extra-bits loop below; the trailing comment lists other values
  * that were tried. */
 #define _SCALEDOWN 8 //16 //8
 /* Entry point called by vp8_decode_mb_tokens() further down. */
 int vp8_decode_mb_tokens_v5(DETOK *detoken, int type);
 /*
  * C implementation of the macroblock coefficient token decoder.
  *
  * Reads tokens using the bool decoder state in detoken->current_bc and, for
  * every block visited, writes the decoded coefficients into the qcoeff
  * buffer (indexed through detoken->scan), stores the block's end position
  * in detoken->eob[i] and overwrites the block's above/left entropy context
  * entries with a 0/1 "has data" flag.
  *
  * Block order:
  *   type == 1 : block 24 first, then blocks 0..15 decoded as type 0,
  *               then blocks 16..23 decoded as type 2.
  *   otherwise : blocks 0..15 decoded with the given type, then blocks
  *               16..23 decoded as type 2.
  *
  * detoken  tables, contexts, bool decoder and output arrays used below.
  * type     index of the coefficient probability set to start with.
  *
  * Always returns 0; the per-block results are left in detoken->eob[].
  */
 int vp8_decode_mb_tokens_v5_c(DETOK *detoken, int type)
 {
    BOOL_DECODER *bc = detoken->current_bc;
    ENTROPY_CONTEXT *a;
    ENTROPY_CONTEXT *l;
    int i;
    /* Local copies of the bool decoder state; written back to *bc on exit. */
    register int count;
    BOOL_DATA *bufptr;
    register unsigned int range;
    register unsigned int value;
    /* Not referenced directly in this function; presumably used by the
     * NORMALIZE macro (defined outside this view) -- confirm. */
    register unsigned int shift;
    UINT32 split;
    INT16 *qcoeff_ptr;
    UINT8 *coef_probs;
 //  int type;
    int stop;
    INT16 c;
    INT16 t;
    INT16 v;
    vp8_prob *Prob;
 //  type = 3;
    i = 0;
    stop = 16;
    qcoeff_ptr = detoken->qcoeff_start_ptr;
 //  if( detoken->mode != B_PRED && detoken->mode != SPLITMV)
    if (type == 1)
    {
        /* Start with block 24 only: i = 24, stop = 24, output at block 24. */
        i += 24;
        stop += 8; //24;
 //      type = 1;
        qcoeff_ptr += 24 * 16;
 //      eobtotal-=16;
    }
    count   = bc->count;
    range   = bc->range;
    value   = bc->value;
    bufptr  = &bc->buffer[bc->pos];
    coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
 BLOCK_LOOP:
    /* Locate this block's above/left context entries through the lookup
     * table: a base row plus separate above and left offsets. */
    a = detoken->A[ detoken->ptr_onyxblock2context_leftabove[i] ];
    l = detoken->L[ detoken->ptr_onyxblock2context_leftabove[i] ];
    /* First coefficient index: 1 when type == 0, otherwise 0. */
    c = !type;
    a += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2ABOVE_OFFSET];
    l += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2LEFT_OFFSET];
    //#define ONYX_COMBINEENTROPYCONTEXTS( Dest, A, B) \
    //Dest = ((A)!=0) + ((B)!=0);
    VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
    /* Select the probability set for the combined context, then start each
     * token at tree index 0. */
    Prob = coef_probs;
    Prob += t * ENTROPY_NODES;
    t = 0;
    do
    {
        {
 //                  onyx_tree_index * onyx_coef_tree_ptr = onyx_coef_tree_x;
            Prob += detoken->ptr_onyx_coef_bands_x[c];
        GET_TOKEN_START:
            /* Walk the token tree one decoded bit at a time; the walk stops
             * once the tree yields a value <= 0 (a negated token). */
            do
            {
                split = 1 + (((range - 1) * (Prob[t>>1])) >> 8);
                if (value >> 24 >= split)
                {
                    range = range - split;
                    value = value - (split << 24);
                    t += 1;
                    //used to eliminate else branch
                    split = range;
                }
                range = split;
                t = detoken->vp8_coef_tree_ptr[ t ];
                /* NORMALIZE is defined outside this view; presumably it
                 * renormalizes range/value and refills from bufptr -- confirm. */
                NORMALIZE
            }
            while (t  > 0) ;
        }
    GET_TOKEN_STOP:
        if (t == -DCT_EOB_TOKEN)
        {
            break;
        }
        v = -t;
        if (v > FOUR_TOKEN)
        {
            /* Tokens above FOUR_TOKEN carry extra bits: start from the
             * table's min_val and add 1 << bits_count for every set bit,
             * from bit index Length down to 0. */
            INT16 bits_count;
            TOKENEXTRABITS *teb_ptr;
 //                      teb_ptr = &onyxd_token_extra_bits2[t];
 //                  teb_ptr = &onyxd_token_extra_bits2[v];
            teb_ptr = &detoken->teb_base_ptr[v];
            v = teb_ptr->min_val;
            bits_count = teb_ptr->Length;
            do
            {
                split = 1 + (((range - 1) * teb_ptr->Probs[bits_count]) >> _SCALEDOWN);
                if ((value >> 24) >= split)
                {
                    range = range - split;
                    value = value - (split << 24);
                    v += ((UINT16)1 << bits_count);
                    //used to eliminate else branch
                    split = range;
                }
                range = split;
                NORMALIZE
                bits_count -- ;
            }
            while (bits_count >= 0);
        }
        Prob = coef_probs;
        if (t)
        {
            /* Nonzero token: decode the sign bit with vp8_prob_half and
             * negate v when it is set. */
            split = 1 + (((range - 1) * vp8_prob_half) >> 8);
            if ((value >> 24) >= split)
            {
                range = range - split;
                value = value - (split << 24);
                v = (v ^ -1) + 1;           /* negate w/out conditionals */
                //used to eliminate else branch
                split = range;
            }
            range = split;
            NORMALIZE
            /* Probability set for the next token: one ENTROPY_NODES step
             * after a ONE_TOKEN, two steps after any larger token. */
            Prob += ENTROPY_NODES;
            if (t < -ONE_TOKEN)
                Prob += ENTROPY_NODES;
            /* Together with the += 2 below this restarts the next tree walk
             * at index 0. */
            t = -2;
        }
        //if t is zero, we will skip the eob table check
        t += 2;
        qcoeff_ptr [detoken->scan [c] ] = (INT16) v;
    }
    while (++c < 16);
    /* When the loop ends without an EOB token, c is 16 and is stepped back
     * to 15, so a completely filled block records eob == 15.
     * NOTE(review): confirm that callers expect 15 rather than 16 here. */
    if (t != -DCT_EOB_TOKEN)
    {
        --c;
    }
    /* Record eob and set both context entries to 1 if the block holds any
     * coefficient beyond its starting index, else 0. */
    t = ((detoken->eob[i] = c) != !type);   // any nonzero data?
 //  eobtotal += detoken->eob[i];
    *a = *l = t;
    qcoeff_ptr += 16;
    i++;
    if (i < stop)
        goto BLOCK_LOOP;
    if (i == 25)
    {
        /* Block 24 done (type == 1 path): restart at block 0 as type 0. */
        type = 0;
        i = 0;
        stop = 16;
 //      coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
        coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
        qcoeff_ptr = detoken->qcoeff_start_ptr;
        goto BLOCK_LOOP;
    }
    if (i == 16)
    {
        /* Blocks 0..15 done: continue with blocks 16..23 as type 2. */
        type = 2;
 //      coef_probs =(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
        coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
        stop = 24;
        goto BLOCK_LOOP;
    }
    /* Write the local bool decoder state back. */
    bc->count = count;
    bc->value = value;
    bc->range = range;
    bc->pos  = bufptr - bc->buffer;
    return 0;
 }
 //#if 0
 /*
  * Decode the coefficient tokens of one macroblock.
  *
  * Points the detokenizer state in dx at the macroblock's bool decoder and
  * above/left contexts, runs vp8_decode_mb_tokens_v5(), then copies the 25
  * per-block eob values into x->Block[] and returns their sum.
  *
  * For every prediction mode other than B_PRED and SPLITMV the decoder is
  * started with type 1 and the returned sum is reduced by 16; otherwise it
  * is started with type 3.
  *
  * The remaining dx->detoken fields (tables, scan order, qcoeff pointer,
  * probability pointers) are not set here; the per-call setup that used to
  * live in this function is disabled, so they are presumably initialised
  * elsewhere -- confirm.
  *
  * NOTE(review): all 25 eob entries are copied and summed even when the
  * decoder is started with type 3; whether entry 24 is refreshed in that
  * case cannot be seen from this function -- confirm.
  */
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
    const int start_with_type1 =
        (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV);
    int total = start_with_type1 ? -16 : 0;
    int blk;

    dx->detoken.current_bc = x->current_bc;
    dx->detoken.A = x->above_context;
    dx->detoken.L = x->left_context;

    vp8_decode_mb_tokens_v5(&dx->detoken, start_with_type1 ? 1 : 3);

    for (blk = 0; blk < 25; ++blk)
    {
        x->Block[blk].eob = dx->detoken.eob[blk];
        total += dx->detoken.eob[blk];
    }

    return total;
 }
 #endif
--- a/vp8/decoder/arm/detokenizearm_v6.asm
+++ b/vp8/decoder/arm/detokenizearm_v6.asm
@@ -1,365 +0,0 @@
 ;
 ;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT  |vp8_decode_mb_tokens_v5|
    AREA    |.text|, CODE, READONLY  ; name this block of code
    INCLUDE vpx_asm_offsets.asm
 ; Byte offsets of the local variables kept in the stack frame that the
 ; routine below reserves with "sub sp, sp, #l_stacksize".
 l_qcoeff    EQU     0
 l_i         EQU     4
 l_type      EQU     8
 l_stop      EQU     12
 l_c         EQU     16
 l_l_ptr      EQU     20
 l_a_ptr      EQU     24
 l_bc        EQU     28
 l_coef_ptr   EQU     32
 l_stacksize EQU     64
 ;; constant offsets -- these should be created at build time
 ; NOTE(review): these presumably have to match ONYXBLOCK2LEFT_OFFSET,
 ; ONYXBLOCK2ABOVE_OFFSET, ENTROPY_NODES and DCT_EOB_TOKEN on the C side
 ; -- confirm.
 c_onyxblock2left_offset      EQU 25
 c_onyxblock2above_offset     EQU 50
 c_entropy_nodes              EQU 11
 c_dct_eob_token              EQU 11
 ; vp8_decode_mb_tokens_v5
 ;   In:  r0 = DETOK *detoken, r1 = type
 ;   Out: r0 = 0
 ; Register use established below:
 ;   r4 = value, r5 = count, r6 = range, r8 = address of the next input byte,
 ;   r9 = detoken, r7 = type on entry to BLOCK_LOOP and coefficient index c
 ;   inside a block (r7 is borrowed as the extra-bits table pointer and then
 ;   reloaded from the stack).
 |vp8_decode_mb_tokens_v5| PROC
    stmdb       sp!, {r4 - r11, lr}
    sub         sp, sp, #l_stacksize
    mov         r7, r1
    mov         r9, r0                      ;DETOK *detoken
    ldr         r1, [r9, #detok_current_bc]
    ldr         r0, [r9, #detok_qcoeff_start_ptr]
    mov         r11, #0
    mov         r3, #0x10
    ; type == 1: i = 24, stop = 24, qcoeff += 0x300 bytes ("#3, 24" is 3
    ; rotated right by 24)
    cmp         r7, #1
    addeq       r11, r11, #24
    addeq       r3, r3, #8
    addeq       r0, r0, #3, 24
    str         r0, [sp, #l_qcoeff]
    str         r11, [sp, #l_i]
    str         r7, [sp, #l_type]
    str         r3, [sp, #l_stop]
    str         r1, [sp, #l_bc]
    ; lr = detoken + type * 4, used to index the coef_probs pointer array
    add         lr, r9, r7, lsl #2
    ldr         r2, [r1, #bool_decoder_buffer]
    ldr         r3, [r1, #bool_decoder_pos]
    ldr         r10, [lr, #detok_coef_probs]
    ldr         r5, [r1, #bool_decoder_count]
    ldr         r6, [r1, #bool_decoder_range]
    ldr         r4, [r1, #bool_decoder_value]
    add         r8, r2, r3
    str         r10, [sp, #l_coef_ptr]
    ;align 4
 BLOCK_LOOP
    ldr         r3, [r9, #detok_ptr_onyxblock2context_leftabove]
    ldr         r2, [r9, #DETOK_A]
    ldr         r1, [r9, #DETOK_L]
    ldrb        r12, [r3, +r11]                                 ; detoken->ptr_onyxblock2context_leftabove[i]
    cmp         r7, #0                                          ; check type
    ; r7 = !type, the first coefficient index c
    moveq       r7, #1
    movne       r7, #0
    ldr         r0, [r2, +r12, lsl #2]                          ; a
    add         r1, r1, r12, lsl #4
    add         r3, r3, r11
    ldrb        r2, [r3, #c_onyxblock2above_offset]
    ldrb        r3, [r3, #c_onyxblock2left_offset]
    mov         lr, #c_entropy_nodes
 ;;  ;++
    ; load *a with writeback, so r0 ends up holding the final a pointer
    ldr         r2, [r0, +r2, lsl #2]!
    add         r3, r1, r3, lsl #2
    str         r3, [sp, #l_l_ptr]
    ldr         r3, [r3]
    ; r2 = (*a != 0) + (*l != 0)
    cmp         r2, #0
    movne       r2, #1
    cmp         r3, #0
    addne       r2, r2, #1
    str         r0, [sp, #l_a_ptr]
    ; r0 = Prob = coef_probs + context * c_entropy_nodes
    smlabb      r0, r2, lr, r10
    mov         r1, #0                                          ; t = 0
    str         r7, [sp, #l_c]
    ;align 4
 COEFF_LOOP
    ldr         r3, [r9, #detok_ptr_onyx_coef_bands_x]
    ldr         lr, [r9, #detok_onyx_coef_tree_ptr]
 ;;the following two lines are used if onyx_coef_bands_x is UINT16
 ;;  add         r3, r3, r7, lsl #1
 ;;  ldrh        r3, [r3]
 ;;the following line is used if onyx_coef_bands_x is UINT8
    ldrb        r3, [r7, +r3]
 ;;  ;++
 ;;  pld         [r8]
    ;++
    add         r0, r0, r3
    ;align 4
 get_token_loop
    ldrb        r2, [r0, +r1, asr #1]
    mov         r3, r6, lsl #8
    sub         r3, r3, #256                    ;split = 1 +  (((range-1) * probability) >> 8)
    ; r10 is used as the constant 1 here; coef_probs is reloaded from the
    ; stack at END_OF_BLOCK
    mov         r10, #1
    smlawb      r2, r3, r2, r10
    ldrb        r12, [r8]                       ;load cx data byte in stall slot
    ;++
    subs        r3, r4, r2, lsl #24             ;x = value-(split<<24)
    addhs       r1, r1, #1                      ;t += 1
    movhs       r4, r3                          ;update value
    subhs       r2, r6, r2                      ;range = range - split
    movlo       r6, r2
 ;;; ldrsbhs     r1, [r1, +lr]
    ldrsb     r1, [r1, +lr]
 ;; use branch for short pipelines ???
 ;;  cmp         r2, #0x80
 ;;  bcs         |$LN22@decode_mb_to|
    ; renormalize: shift = clz(range) - 24; count -= shift; range and value
    ; are shifted left; when count drops to <= 0 the byte preloaded in r12
    ; is merged into value and the input pointer advances
    clz         r3, r2
    sub         r3, r3, #24
    subs        r5, r5, r3
    mov         r6, r2, lsl r3
    mov         r4, r4, lsl r3
 ;; use branch for short pipelines ???
 ;;  bgt         |$LN22@decode_mb_to|
    addle         r5, r5, #8
    rsble         r3, r5, #8
    addle         r8, r8, #1
    orrle         r4, r4, r12, lsl r3
 ;;|$LN22@decode_mb_to|
    cmp         r1, #0
    bgt         get_token_loop
    cmn         r1, #c_dct_eob_token             ;if(t == -DCT_EOB_TOKEN)
    beq         END_OF_BLOCK
    rsb         lr, r1, #0                      ;v = -t;
    cmp         lr, #4                          ;if(v > FOUR_TOKEN)
    ble         SKIP_EXTRABITS
    ldr         r3, [r9, #detok_teb_base_ptr]
    mov         r11, #1
    ; r7 = teb_base_ptr + v * 16 (r7 stops being c until it is reloaded at
    ; SKIP_EOB_CHECK)
    add         r7, r3, lr, lsl #4
    ldrsh       lr, [r7, #tokenextrabits_min_val];v = teb_ptr->min_val
    ldrsh       r0, [r7, #tokenextrabits_length];bits_count = teb_ptr->Length
 extrabits_loop
    ; NOTE(review): the probability byte is read at teb_ptr + bits_count + 4;
    ; the 4 is presumably the offset of Probs[] inside TOKENEXTRABITS -- confirm
    add         r3, r0, r7
    ldrb        r2, [r3, #4]
    mov         r3, r6, lsl #8
    sub         r3, r3, #256                    ;split = 1 +  (((range-1) * probability) >> 8)
    mov         r10, #1
    smlawb      r2, r3, r2, r10
    ldrb        r12, [r8]
    ;++
    subs        r10, r4, r2, lsl #24            ;x = value-(split<<24)
    movhs       r4, r10                         ;update value
    subhs       r2, r6, r2                      ;range = range - split
    addhs       lr, lr, r11, lsl r0             ;v += ((UINT16)1<<bits_count)
    movlo       r6, r2                          ;range = split
 ;; use branch for short pipelines ???
 ;;  cmp         r2, #0x80
 ;;  bcs         |$LN10@decode_mb_to|
    clz         r3, r2
    sub         r3, r3, #24
    subs        r5, r5, r3
    mov         r6, r2, lsl r3                  ;range
    mov         r4, r4, lsl r3                  ;value
    addle       r5, r5, #8
    addle       r8, r8, #1
    rsble       r3, r5, #8
    orrle       r4, r4, r12, lsl r3
 ;;|$LN10@decode_mb_to|
    subs         r0, r0, #1
    bpl         extrabits_loop
 SKIP_EXTRABITS
    ldr         r11, [sp, #l_qcoeff]
    ldr         r0, [sp, #l_coef_ptr]
    cmp         r1, #0                          ;check for nonzero token
    beq         SKIP_EOB_CHECK              ;if t is zero, we will skip the eob table check
    ; sign bit: split = 1 + (((range - 1) * 128) >> 8)
    sub         r3, r6, #1                      ;range - 1
    ;++
    mov         r3, r3, lsl #7                  ; *= onyx_prob_half  (128)
    ;++
    mov         r3, r3, lsr #8
    add         r2, r3, #1                      ;split
    subs        r3, r4, r2, lsl #24             ;x = value-(split<<24)
    movhs       r4, r3                          ;update value
    subhs       r2, r6, r2                      ;range = range - split
    mvnhs       r3, lr
    addhs       lr, r3, #1                      ;v = (v ^ -1) + 1
    movlo       r6, r2                          ;range = split
 ;; use branch for short pipelines ???
 ;;  cmp         r2, #0x80
 ;;  bcs         |$LN6@decode_mb_to|
    clz         r3, r2
    sub         r3, r3, #24
    subs        r5, r5, r3
    mov         r6, r2, lsl r3
    mov         r4, r4, lsl r3
    ldrleb      r2, [r8], #1
    addle       r5, r5, #8
    rsble       r3, r5, #8
    orrle       r4, r4, r2, lsl r3
 ;;|$LN6@decode_mb_to|
    ; Prob for the next token: coef_probs + 11, plus another 11 when t < -1;
    ; then t = -2 so that the "+ 2" below restarts the tree walk at 0
    add         r0, r0, #0xB
    cmn         r1, #1
    addlt       r0, r0, #0xB
    mvn         r1, #1
 SKIP_EOB_CHECK
    ldr         r7, [sp, #l_c]
    ldr         r3, [r9, #detok_scan]
    add         r1, r1, #2
    cmp         r7, #(0x10 - 1)                     ;assume one less for now.... increment below
    ldr         r3, [r3, +r7, lsl #2]
    add         r7, r7, #1
    ; store v as a halfword at qcoeff + scan[c] * 2
    add         r3, r11, r3, lsl #1
    str         r7, [sp, #l_c]
    strh        lr, [r3]
    blt         COEFF_LOOP
    sub         r7, r7, #1                          ;if(t != -DCT_EOB_TOKEN) --c
 END_OF_BLOCK
    ldr         r3, [sp, #l_type]
    ldr         r10, [sp, #l_coef_ptr]
    ldr         r0, [sp, #l_qcoeff]
    ldr         r11, [sp, #l_i]
    ldr         r12, [sp, #l_stop]
    ; r1 = !type; eob[i] = c is stored as a byte; r3 = (c != !type) is then
    ; written to both the left and above context entries
    cmp         r3, #0
    moveq       r1, #1
    movne       r1, #0
    add         r3, r11, r9
    cmp         r7, r1
    strb        r7, [r3, #detok_eob]
    ldr         r7, [sp, #l_l_ptr]
    ldr         r2, [sp, #l_a_ptr]
    movne       r3, #1
    moveq       r3, #0
    ; advance qcoeff by 0x20 bytes and i by one
    add         r0, r0, #0x20
    add         r11, r11, #1
    str         r3, [r7]
    str         r3, [r2]
    str         r0, [sp, #l_qcoeff]
    str         r11, [sp, #l_i]
    cmp         r11, r12                            ;i >= stop ?
    ldr         r7, [sp, #l_type]
    mov         lr, #0xB
    blt         BLOCK_LOOP
    ; i == 25: restart at block 0 with type 0, stop = 16 and the qcoeff
    ; start pointer
    cmp         r11, #0x19
    bne         ln2_decode_mb_to
    ldr         r12, [r9, #detok_qcoeff_start_ptr]
    ldr         r10, [r9, #detok_coef_probs]
    mov         r7, #0
    mov         r3, #0x10
    str         r12, [sp, #l_qcoeff]
    str         r7, [sp, #l_i]
    str         r7, [sp, #l_type]
    str         r3, [sp, #l_stop]
    str         r10, [sp, #l_coef_ptr]
    b           BLOCK_LOOP
 ln2_decode_mb_to
    ; i == 16: continue with type 2 and stop = 24
    cmp         r11, #0x10
    bne         ln1_decode_mb_to
    ; NOTE(review): hard-coded offset 0x30; presumably detok_coef_probs + 8,
    ; i.e. the coef_probs entry for type 2 -- confirm against
    ; vpx_asm_offsets.asm
    ldr         r10, [r9, #0x30]
    mov         r7, #2
    mov         r3, #0x18
    str         r7, [sp, #l_type]
    str         r3, [sp, #l_stop]
    str         r10, [sp, #l_coef_ptr]
    b           BLOCK_LOOP
 ln1_decode_mb_to
    ; write count, value, pos (input pointer - buffer) and range back to the
    ; bool decoder, return 0
    ldr         r2, [sp, #l_bc]
    mov         r0, #0
    nop
    ldr         r3, [r2, #bool_decoder_buffer]
    str         r5, [r2, #bool_decoder_count]
    str         r4, [r2, #bool_decoder_value]
    sub         r3, r8, r3
    str         r3, [r2, #bool_decoder_pos]
    str         r6, [r2, #bool_decoder_range]
    add         sp, sp, #l_stacksize
    ldmia       sp!, {r4 - r11, pc}
    ENDP  ; |vp8_decode_mb_tokens_v5|
    END
--- a/vp8/decoder/arm/dsystemdependent.c
+++ b/vp8/decoder/arm/dsystemdependent.c
@@ -23,8 +23,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
    pbi->mb.rtcd         = &pbi->common.rtcd;
 #if HAVE_ARMV7
    pbi->dequant.block   = vp8_dequantize_b_neon;
    pbi->dequant.idct    = vp8_dequant_idct_neon;
    pbi->dequant.idct_dc = vp8_dequant_dc_idct_neon;
    pbi->dboolhuff.start = vp8dx_start_decode_c;
    pbi->dboolhuff.fill  = vp8dx_bool_decoder_fill_c;
    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
@@ -32,8 +30,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
 #elif HAVE_ARMV6
    pbi->dequant.block   = vp8_dequantize_b_v6;
    pbi->dequant.idct    = vp8_dequant_idct_v6;
    pbi->dequant.idct_dc = vp8_dequant_dc_idct_v6;
    pbi->dboolhuff.start = vp8dx_start_decode_c;
    pbi->dboolhuff.fill  = vp8dx_bool_decoder_fill_c;
    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
--- a/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
+++ b/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
@@ -9,31 +9,43 @@
 ;
-    EXPORT  |vp8_dequant_dc_idct_neon|
+    EXPORT  |vp8_dequant_dc_idct_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc);
+;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred,
 ;                                  unsigned char *dest, int pitch, int stride,
 ;                                  int Dc);
 ; r0    short *input,
 ; r1    short *dq,
-; r2    short *output,
+; r2    unsigned char *pred
-; r3    int pitch,
+; r3    unsigned char *dest
-; (stack)   int Dc
+; sp    int pitch
-|vp8_dequant_dc_idct_neon| PROC
+; sp+4  int stride
 ; sp+8  int Dc
 |vp8_dequant_dc_idct_add_neon| PROC
    vld1.16         {q3, q4}, [r0]
    vld1.16         {q5, q6}, [r1]
-    ldr             r1, [sp]                ;load Dc from stack
+    ldr             r1, [sp, #8]            ;load Dc from stack
-    ldr             r12, _dcidct_coeff_
+    ldr             r12, _CONSTANTS_
    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
    vmul.i16        q2, q4, q6
    vmov.16         d2[0], r1
    ldr             r1, [sp]                ; pitch
    vld1.32         {d14[0]}, [r2], r1
    vld1.32         {d14[1]}, [r2], r1
    vld1.32         {d15[0]}, [r2], r1
    vld1.32         {d15[1]}, [r2]
    ldr             r1, [sp, #4]            ; stride
 ;|short_idct4x4llm_neon| PROC
    vld1.16         {d0}, [r12]
    vswp            d3, d4                  ;q2(vp[4] vp[12])
@@ -47,14 +59,9 @@
    vshr.s16        q3, q3, #1
    vshr.s16        q4, q4, #1
-    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
+    vqadd.s16       q3, q3, q2
    vqadd.s16       q4, q4, q2
    ;d6 - c1:temp1
    ;d7 - d1:temp2
    ;d8 - d1:temp1
    ;d9 - c1:temp2
    vqsub.s16       d10, d6, d9             ;c1
    vqadd.s16       d11, d7, d8             ;d1
@@ -83,7 +90,7 @@
    vshr.s16        q3, q3, #1
    vshr.s16        q4, q4, #1
-    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
+    vqadd.s16       q3, q3, q2
    vqadd.s16       q4, q4, q2
    vqsub.s16       d10, d6, d9             ;c1
@@ -101,34 +108,29 @@
    vrshr.s16       d4, d4, #3
    vrshr.s16       d5, d5, #3
    add             r1, r2, r3
    add             r12, r1, r3
    add             r0, r12, r3
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5
-    vst1.16         {d2}, [r2]
+    vaddw.u8        q1, q1, d14
-    vst1.16         {d3}, [r1]
+    vaddw.u8        q2, q2, d15
-    vst1.16         {d4}, [r12]
+
-    vst1.16         {d5}, [r0]
+    vqmovun.s16     d0, q1
    vqmovun.s16     d1, q2
    vst1.32         {d0[0]}, [r3], r1
    vst1.32         {d0[1]}, [r3], r1
    vst1.32         {d1[0]}, [r3], r1
    vst1.32         {d1[1]}, [r3]
    bx             lr
-    ENDP
+    ENDP           ; |vp8_dequant_dc_idct_add_neon|
-;-----------------
+; Constant Pool
-    AREA    dcidct4x4_dat, DATA, READWRITE          ;read/write by default
+_CONSTANTS_       DCD cospi8sqrt2minus1
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
-;One word each is reserved. Label filter_coeff can be used to access the data.
+sinpi8sqrt2       DCD 0x8a8c8a8c
 ;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
 _dcidct_coeff_
    DCD     dcidct_coeff
 dcidct_coeff
    DCD     0x4e7b4e7b, 0x8a8c8a8c
 ;20091, 20091, 35468, 35468
    END
--- a/vp8/decoder/arm/neon/dequant_idct_neon.asm
+++ b/vp8/decoder/arm/neon/dequant_idct_neon.asm
@@ -9,22 +9,33 @@
 ;
-    EXPORT  |vp8_dequant_idct_neon|
+    EXPORT  |vp8_dequant_idct_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch);
+;void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *pred,
 ;                           unsigned char *dest, int pitch, int stride)
 ; r0    short *input,
 ; r1    short *dq,
-; r2    short *output,
+; r2    unsigned char *pred
-; r3    int pitch,
+; r3    unsigned char *dest
-|vp8_dequant_idct_neon| PROC
+; sp    int pitch
 ; sp+4  int stride
 |vp8_dequant_idct_add_neon| PROC
    vld1.16         {q3, q4}, [r0]
    vld1.16         {q5, q6}, [r1]
    ldr             r1, [sp]                ; pitch
    vld1.32         {d14[0]}, [r2], r1
    vld1.32         {d14[1]}, [r2], r1
    vld1.32         {d15[0]}, [r2], r1
    vld1.32         {d15[1]}, [r2]
-    ldr             r12, _didct_coeff_
+    ldr             r1, [sp, #4]            ; stride
    ldr             r12, _CONSTANTS_
    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
    vmul.i16        q2, q4, q6
@@ -42,14 +53,9 @@
    vshr.s16        q3, q3, #1
    vshr.s16        q4, q4, #1
-    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
+    vqadd.s16       q3, q3, q2
    vqadd.s16       q4, q4, q2
    ;d6 - c1:temp1
    ;d7 - d1:temp2
    ;d8 - d1:temp1
    ;d9 - c1:temp2
    vqsub.s16       d10, d6, d9             ;c1
    vqadd.s16       d11, d7, d8             ;d1
@@ -78,7 +84,7 @@
    vshr.s16        q3, q3, #1
    vshr.s16        q4, q4, #1
-    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
+    vqadd.s16       q3, q3, q2
    vqadd.s16       q4, q4, q2
    vqsub.s16       d10, d6, d9             ;c1
@@ -96,34 +102,29 @@
    vrshr.s16       d4, d4, #3
    vrshr.s16       d5, d5, #3
    add             r1, r2, r3
    add             r12, r1, r3
    add             r0, r12, r3
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5
-    vst1.16         {d2}, [r2]
+    vaddw.u8        q1, q1, d14
-    vst1.16         {d3}, [r1]
+    vaddw.u8        q2, q2, d15
-    vst1.16         {d4}, [r12]
+
-    vst1.16         {d5}, [r0]
+    vqmovun.s16     d0, q1
    vqmovun.s16     d1, q2
    vst1.32         {d0[0]}, [r3], r1
    vst1.32         {d0[1]}, [r3], r1
    vst1.32         {d1[0]}, [r3], r1
    vst1.32         {d1[1]}, [r3]
    bx             lr
-    ENDP
+    ENDP           ; |vp8_dequant_idct_add_neon|
-;-----------------
+; Constant Pool
-    AREA    didct4x4_dat, DATA, READWRITE           ;read/write by default
+_CONSTANTS_       DCD cospi8sqrt2minus1
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
-;One word each is reserved. Label filter_coeff can be used to access the data.
+sinpi8sqrt2       DCD 0x8a8c8a8c
 ;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
 _didct_coeff_
    DCD     didct_coeff
 didct_coeff
    DCD     0x4e7b4e7b, 0x8a8c8a8c
 ;20091, 20091, 35468, 35468
    END
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@@ -0,0 +1,151 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_ports/config.h"
 #include "idct.h"
 #include "dequantize.h"
 /*
  * Process a 4x4 grid of 4x4 blocks (16 blocks, visited row by row).
  *
  * For each block, eobs[] selects the routine: a value greater than 1 runs
  * vp8_dequant_dc_idct_add_neon() on the block's 16 coefficients with the
  * block's entry of dc[]; otherwise only that dc[] entry is handed to
  * vp8_dc_only_idct_add_neon().
  *
  * q      coefficients, 16 shorts per block, 64 per row of blocks
  * dq     dequantisation factors, shared by all blocks
  * pre    prediction buffer, addressed with a pitch of 16 bytes
  * dst    destination, addressed with `stride`
  * eobs   one entry per block
  * dc     one entry per block
  */
 void vp8_dequant_dc_idct_add_y_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
 {
    int row;

    for (row = 0; row < 4; row++)
    {
        int col;

        for (col = 0; col < 4; col++)
        {
            unsigned char *pred_blk = pre + 4 * col;
            unsigned char *dest_blk = dst + 4 * col;

            if (eobs[col] > 1)
                vp8_dequant_dc_idct_add_neon (q + 16 * col, dq, pred_blk, dest_blk, 16, stride, dc[col]);
            else
                vp8_dc_only_idct_add_neon (dc[col], pred_blk, dest_blk, 16, stride);
        }

        /* step to the next row of four blocks */
        q    += 64;
        dc   += 4;
        pre  += 64;
        dst  += 4*stride;
        eobs += 4;
    }
 }
 /*
  * Process a 4x4 grid of 4x4 blocks (16 blocks, visited row by row).
  *
  * For each block, eobs[] selects the routine: a value greater than 1 runs
  * vp8_dequant_idct_add_neon() on the block's 16 coefficients; otherwise
  * only the product of the first coefficient and dq[0] is handed to
  * vp8_dc_only_idct_add_neon(), and the first two coefficients of the block
  * are then cleared.
  *
  * q      coefficients, 16 shorts per block, 64 per row of blocks
  * dq     dequantisation factors, shared by all blocks
  * pre    prediction buffer, addressed with a pitch of 16 bytes
  * dst    destination, addressed with `stride`
  * eobs   one entry per block
  */
 void vp8_dequant_idct_add_y_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
 {
    int row;

    for (row = 0; row < 4; row++)
    {
        int col;

        for (col = 0; col < 4; col++)
        {
            short *blk = q + 16 * col;
            unsigned char *pred_blk = pre + 4 * col;
            unsigned char *dest_blk = dst + 4 * col;

            if (eobs[col] > 1)
                vp8_dequant_idct_add_neon (blk, dq, pred_blk, dest_blk, 16, stride);
            else
            {
                vp8_dc_only_idct_add_neon (blk[0]*dq[0], pred_blk, dest_blk, 16, stride);
                /* Clear the same two coefficients the former 32-bit store
                 * covered, but through the array's own type: writing shorts
                 * through an int lvalue breaks the aliasing rules. */
                blk[0] = 0;
                blk[1] = 0;
            }
        }

        /* step to the next row of four blocks */
        q    += 64;
        pre  += 64;
        dst  += 4*stride;
        eobs += 4;
    }
 }
 /*
  * Process two 2x2 grids of 4x4 blocks, the first written to dstu and the
  * second to dstv. q, pre and eobs run on continuously from the first grid
  * into the second.
  *
  * For each block, eobs[] selects the routine: a value greater than 1 runs
  * vp8_dequant_idct_add_neon() on the block's 16 coefficients; otherwise
  * only the product of the first coefficient and dq[0] is handed to
  * vp8_dc_only_idct_add_neon(), and the first two coefficients of the block
  * are then cleared.
  *
  * q      coefficients, 16 shorts per block, 32 per row of blocks
  * dq     dequantisation factors, shared by all blocks
  * pre    prediction buffer, addressed with a pitch of 8 bytes
  * dstu   destination of the first grid, addressed with `stride`
  * dstv   destination of the second grid, addressed with `stride`
  * eobs   one entry per block
  */
 void vp8_dequant_idct_add_uv_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
 {
    unsigned char *plane_dst[2];
    int plane;

    plane_dst[0] = dstu;
    plane_dst[1] = dstv;

    for (plane = 0; plane < 2; plane++)
    {
        unsigned char *dst = plane_dst[plane];
        int row;

        for (row = 0; row < 2; row++)
        {
            int col;

            for (col = 0; col < 2; col++)
            {
                short *blk = q + 16 * col;
                unsigned char *pred_blk = pre + 4 * col;
                unsigned char *dest_blk = dst + 4 * col;

                if (eobs[col] > 1)
                    vp8_dequant_idct_add_neon (blk, dq, pred_blk, dest_blk, 8, stride);
                else
                {
                    vp8_dc_only_idct_add_neon (blk[0]*dq[0], pred_blk, dest_blk, 8, stride);
                    /* Clear the same two coefficients the former 32-bit
                     * store covered, but through the array's own type:
                     * writing shorts through an int lvalue breaks the
                     * aliasing rules. */
                    blk[0] = 0;
                    blk[1] = 0;
                }
            }

            /* step to the next row of two blocks */
            q    += 32;
            pre  += 32;
            dst  += 4*stride;
            eobs += 2;
        }
    }
 }
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c
@@ -13,7 +13,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx_mem/vpx_mem.h"
-DECLARE_ALIGNED(16, const unsigned int, vp8dx_bitreader_norm[256]) =
+DECLARE_ALIGNED(16, const unsigned char, vp8dx_bitreader_norm[256]) =
 {
    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -95,7 +95,7 @@ typedef struct vp8_dboolhuff_rtcd_vtable {
 #define IF_RTCD(x) NULL
 //#endif
-DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
+DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
 /* wrapper functions to hide RTCD. static means inline means hopefully no
 * penalty
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -226,13 +226,14 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)
                        int mv_contz;
                        while (j != L[++k])
-                            if (k >= 16)
+                        {
 #if CONFIG_DEBUG
                            if (k >= 16)
                            {
                                assert(0);
-
+                            }
 #else
                                ;
 #endif
                        }
                        mv_contz = vp8_mv_cont(&(vp8_left_bmi(mi, k)->mv.as_mv), &(vp8_above_bmi(mi, k, mis)->mv.as_mv));
--- a/vp8/decoder/decoderthreading.h
+++ b/vp8/decoder/decoderthreading.h
@@ -18,6 +18,7 @@
 extern void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
                                 MACROBLOCKD *xd);
 extern void vp8_mt_loop_filter_frame(VP8D_COMP *pbi);
 extern void vp8_stop_lfthread(VP8D_COMP *pbi);
 extern void vp8_start_lfthread(VP8D_COMP *pbi);
 extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -21,7 +21,7 @@
 #include "alloccommon.h"
 #include "entropymode.h"
 #include "quant_common.h"
-#include "segmentation_common.h"
+
 #include "setupintrarecon.h"
 #include "demode.h"
 #include "decodemv.h"
@@ -113,7 +113,7 @@ static void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
 // to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
 static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
-    if (xd->frame_type == KEY_FRAME  ||  xd->mbmi.ref_frame == INTRA_FRAME)
+    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
    {
        vp8_build_intra_predictors_mbuv_s(xd);
@@ -126,7 +126,6 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
    }
 }
 static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
 {
    /* If the MV points so far into the UMV border that no visible pixels
@@ -165,7 +164,7 @@ static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
 static void clamp_mvs(MACROBLOCKD *xd)
 {
-    if (xd->mbmi.mode == SPLITMV)
+    if (xd->mode_info_context->mbmi.mode == SPLITMV)
    {
        int i;
@@ -176,48 +175,69 @@ static void clamp_mvs(MACROBLOCKD *xd)
    }
    else
    {
-        clamp_mv_to_umv_border(&xd->mbmi.mv.as_mv, xd);
+        clamp_mv_to_umv_border(&xd->mode_info_context->mbmi.mv.as_mv, xd);
        clamp_uvmv_to_umv_border(&xd->block[16].bmi.mv.as_mv, xd);
    }
 }
-static void reconstruct_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
+void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
-    if (xd->frame_type == KEY_FRAME  ||  xd->mbmi.ref_frame == INTRA_FRAME)
+    int eobtotal = 0;
-    {
+    int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
        vp8_build_intra_predictors_mbuv(xd);
-        if (xd->mbmi.mode != B_PRED)
+    if (xd->mode_info_context->mbmi.mb_skip_coeff)
    {
-            vp8_build_intra_predictors_mby_ptr(xd);
+        vp8_reset_mb_tokens_context(xd);
            vp8_recon16x16mb(RTCD_VTABLE(recon), xd);
    }
    else
    {
-            vp8_recon_intra4x4mb(RTCD_VTABLE(recon), xd);
+        eobtotal = vp8_decode_mb_tokens(pbi, xd);
    }
    /* Perform temporary clamping of the MV to be used for prediction */
    if (do_clamp)
    {
        clamp_mvs(xd);
    }
    xd->mode_info_context->mbmi.dc_diff = 1;
    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
    {
        xd->mode_info_context->mbmi.dc_diff = 0;
        skip_recon_mb(pbi, xd);
        return;
    }
    if (xd->segmentation_enabled)
        mb_init_dequantizer(pbi, xd);
    // do prediction
    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
    {
        vp8_build_intra_predictors_mbuv(xd);
        if (xd->mode_info_context->mbmi.mode != B_PRED)
        {
            vp8_build_intra_predictors_mby_ptr(xd);
        } else {
            vp8_intra_prediction_down_copy(xd);
        }
    }
    else
    {
        vp8_build_inter_predictors_mb(xd);
        vp8_recon16x16mb(RTCD_VTABLE(recon), xd);
    }
    }
-
+    // dequantization and idct
-static void de_quantand_idct(VP8D_COMP *pbi, MACROBLOCKD *xd)
+    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
    {
    int i;
        BLOCKD *b = &xd->block[24];
    if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV)
    {
        DEQUANT_INVOKE(&pbi->dequant, block)(b);
        // do 2nd order transform on the dc block
-        if (b->eob > 1)
+        if (xd->eobs[24] > 1)
        {
            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
            ((int *)b->qcoeff)[0] = 0;
@@ -235,115 +255,47 @@ static void de_quantand_idct(VP8D_COMP *pbi, MACROBLOCKD *xd)
            ((int *)b->qcoeff)[0] = 0;
        }
-
+        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
                        (xd->qcoeff, &xd->block[0].dequant[0][0],
                         xd->predictor, xd->dst.y_buffer,
                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);
    }
    else if ((xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
    {
        for (i = 0; i < 16; i++)
        {
-            b = &xd->block[i];
+            BLOCKD *b = &xd->block[i];
            vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-            if (b->eob > 1)
+            if (xd->eobs[i] > 1)
            {
-                DEQUANT_INVOKE(&pbi->dequant, idct_dc)(b->qcoeff, &b->dequant[0][0], b->diff, 32, xd->block[24].diff[i]);
+                DEQUANT_INVOKE(&pbi->dequant, idct_add)
                    (b->qcoeff, &b->dequant[0][0],  b->predictor,
                    *(b->base_dst) + b->dst, 16, b->dst_stride);
            }
            else
            {
-                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(xd->block[24].diff[i], b->diff, 32);
+                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
-            }
+                    (b->qcoeff[0] * b->dequant[0][0], b->predictor,
-        }
+                    *(b->base_dst) + b->dst, 16, b->dst_stride);
        for (i = 16; i < 24; i++)
        {
            b = &xd->block[i];
            if (b->eob > 1)
            {
                DEQUANT_INVOKE(&pbi->dequant, idct)(b->qcoeff, &b->dequant[0][0], b->diff, 16);
            }
            else
            {
                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(b->qcoeff[0] * b->dequant[0][0], b->diff, 16);
                ((int *)b->qcoeff)[0] = 0;
            }
        }
    }
    else
    {
-        for (i = 0; i < 24; i++)
+        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
-        {
+                        (xd->qcoeff, &xd->block[0].dequant[0][0],
-
+                         xd->predictor, xd->dst.y_buffer,
-            b = &xd->block[i];
+                         xd->dst.y_stride, xd->eobs);
            if (b->eob > 1)
            {
                DEQUANT_INVOKE(&pbi->dequant, idct)(b->qcoeff, &b->dequant[0][0], b->diff, (32 - (i & 16)));
            }
            else
            {
                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(b->qcoeff[0] * b->dequant[0][0], b->diff, (32 - (i & 16)));
                ((int *)b->qcoeff)[0] = 0;
            }
        }
    }
    }
-void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
+    DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
-{
+                    (xd->qcoeff+16*16, &xd->block[16].dequant[0][0],
-    int eobtotal = 0;
+                     xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
-    MV  orig_mvs[24];
+                     xd->dst.uv_stride, xd->eobs+16);
    int i, do_clamp = xd->mbmi.need_to_clamp_mvs;
    if (xd->mbmi.mb_skip_coeff)
    {
        vp8_reset_mb_tokens_context(xd);
    }
    else
    {
        eobtotal = vp8_decode_mb_tokens(pbi, xd);
    }
    /* Perform temporary clamping of the MV to be used for prediction */
    if (do_clamp)
    {
        if (xd->mbmi.mode == SPLITMV)
            for (i=0; i<24; i++)
                orig_mvs[i] = xd->block[i].bmi.mv.as_mv;
        else
        {
            orig_mvs[0] = xd->mbmi.mv.as_mv;
            orig_mvs[1] = xd->block[16].bmi.mv.as_mv;
        }
        clamp_mvs(xd);
    }
    xd->mode_info_context->mbmi.dc_diff = 1;
    if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV && eobtotal == 0)
    {
        xd->mode_info_context->mbmi.dc_diff = 0;
        skip_recon_mb(pbi, xd);
    }
    else
    {
        if (xd->segmentation_enabled)
            mb_init_dequantizer(pbi, xd);
        de_quantand_idct(pbi, xd);
        reconstruct_mb(pbi, xd);
    }
    /* Restore the original MV so as not to affect the entropy context. */
    if (do_clamp)
    {
        if (xd->mbmi.mode == SPLITMV)
            for (i=0; i<24; i++)
                xd->block[i].bmi.mv.as_mv = orig_mvs[i];
        else
        {
            xd->mbmi.mv.as_mv = orig_mvs[0];
            xd->block[16].bmi.mv.as_mv = orig_mvs[1];
        }
    }
 }
 static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
@@ -381,18 +333,17 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
    int i;
    int recon_yoffset, recon_uvoffset;
    int mb_col;
-    int recon_y_stride = pc->last_frame.y_stride;
+    int ref_fb_idx = pc->lst_fb_idx;
-    int recon_uv_stride = pc->last_frame.uv_stride;
+    int dst_fb_idx = pc->new_fb_idx;
    int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
    int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
-    vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
+    vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
    recon_yoffset = mb_row * recon_y_stride * 16;
    recon_uvoffset = mb_row * recon_uv_stride * 8;
    // reset above block coeffs
-    xd->above_context[Y1CONTEXT] = pc->above_context[Y1CONTEXT];
+    xd->above_context = pc->above_context;
    xd->above_context[UCONTEXT ] = pc->above_context[UCONTEXT];
    xd->above_context[VCONTEXT ] = pc->above_context[VCONTEXT];
    xd->above_context[Y2CONTEXT] = pc->above_context[Y2CONTEXT];
    xd->up_available = (mb_row != 0);
    xd->mb_to_top_edge = -((mb_row * 16)) << 3;
@@ -400,12 +351,8 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
    for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
    {
        // Take a copy of the mode and Mv information for this macroblock into the xd->mbmi
        // the partition_bmi array is unused in the decoder, so don't copy it.
        vpx_memcpy(&xd->mbmi, &xd->mode_info_context->mbmi,
                   sizeof(MB_MODE_INFO) - sizeof(xd->mbmi.partition_bmi));
-        if (xd->mbmi.mode == SPLITMV || xd->mbmi.mode == B_PRED)
+        if (xd->mode_info_context->mbmi.mode == SPLITMV || xd->mode_info_context->mbmi.mode == B_PRED)
        {
            for (i = 0; i < 16; i++)
            {
@@ -419,33 +366,23 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
        xd->mb_to_left_edge = -((mb_col * 16) << 3);
        xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-        xd->dst.y_buffer = pc->new_frame.y_buffer + recon_yoffset;
+        xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-        xd->dst.u_buffer = pc->new_frame.u_buffer + recon_uvoffset;
+        xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-        xd->dst.v_buffer = pc->new_frame.v_buffer + recon_uvoffset;
+        xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
        xd->left_available = (mb_col != 0);
        // Select the appropriate reference frame for this MB
-        if (xd->mbmi.ref_frame == LAST_FRAME)
+        if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-        {
+            ref_fb_idx = pc->lst_fb_idx;
-            xd->pre.y_buffer = pc->last_frame.y_buffer + recon_yoffset;
+        else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-            xd->pre.u_buffer = pc->last_frame.u_buffer + recon_uvoffset;
+            ref_fb_idx = pc->gld_fb_idx;
            xd->pre.v_buffer = pc->last_frame.v_buffer + recon_uvoffset;
        }
        else if (xd->mbmi.ref_frame == GOLDEN_FRAME)
        {
            // Golden frame reconstruction buffer
            xd->pre.y_buffer = pc->golden_frame.y_buffer + recon_yoffset;
            xd->pre.u_buffer = pc->golden_frame.u_buffer + recon_uvoffset;
            xd->pre.v_buffer = pc->golden_frame.v_buffer + recon_uvoffset;
        }
        else
-        {
+            ref_fb_idx = pc->alt_fb_idx;
-            // Alternate reference frame reconstruction buffer
+
-            xd->pre.y_buffer = pc->alt_ref_frame.y_buffer + recon_yoffset;
+        xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-            xd->pre.u_buffer = pc->alt_ref_frame.u_buffer + recon_uvoffset;
+        xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-            xd->pre.v_buffer = pc->alt_ref_frame.v_buffer + recon_uvoffset;
+        xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
        }
        vp8_build_uvmvs(xd, pc->full_pixel);
@@ -463,19 +400,14 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
        ++xd->mode_info_context;  /* next mb */
-        xd->gf_active_ptr++;      // GF useage flag for next MB
+        xd->above_context++;
        xd->above_context[Y1CONTEXT] += 4;
        xd->above_context[UCONTEXT ] += 2;
        xd->above_context[VCONTEXT ] += 2;
        xd->above_context[Y2CONTEXT] ++;
        pbi->current_mb_col_main = mb_col;
    }
    // adjust to the next row of mbs
    vp8_extend_mb_row(
-        &pc->new_frame,
+        &pc->yv12_fb[dst_fb_idx],
        xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
    );
@@ -623,10 +555,10 @@ static void init_frame(VP8D_COMP *pbi)
        }
    }
-    xd->left_context = pc->left_context;
+    xd->left_context = &pc->left_context;
    xd->mode_info_context = pc->mi;
    xd->frame_type = pc->frame_type;
-    xd->mbmi.mode = DC_PRED;
+    xd->mode_info_context->mbmi.mode = DC_PRED;
    xd->mode_info_stride = pc->mode_info_stride;
 }
@@ -890,11 +822,11 @@ int vp8_decode_frame(VP8D_COMP *pbi)
                    }
    }
-    vpx_memcpy(&xd->pre, &pc->last_frame, sizeof(YV12_BUFFER_CONFIG));
+    vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
-    vpx_memcpy(&xd->dst, &pc->new_frame, sizeof(YV12_BUFFER_CONFIG));
+    vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
    // set up frame new frame for intra coded blocks
-    vp8_setup_intra_recon(&pc->new_frame);
+    vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
    vp8_setup_block_dptrs(xd);
@@ -911,14 +843,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
    else
        vp8_decode_mode_mvs(pbi);
-    // reset since these guys are used as iterators
+    vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
    vpx_memset(pc->above_context[Y1CONTEXT], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols * 4);
    vpx_memset(pc->above_context[UCONTEXT ], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols * 2);
    vpx_memset(pc->above_context[VCONTEXT ], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols * 2);
    vpx_memset(pc->above_context[Y2CONTEXT], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols);
    xd->gf_active_ptr = (signed char *)pc->gf_active_flags;     // Point to base of GF active flags data structure
    vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -32,8 +32,12 @@ void vp8_dequantize_b_c(BLOCKD *d)
    }
 }
-void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch)
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
                            unsigned char *dest, int pitch, int stride)
 {
    short output[16];
    short *diff_ptr = output;
    int r, c;
    int i;
    for (i = 0; i < 16; i++)
@@ -41,13 +45,40 @@ void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch)
        input[i] = dq[i] * input[i];
    }
-    vp8_short_idct4x4llm_c(input, output, pitch);
+    // the idct halves ( >> 1) the pitch
    vp8_short_idct4x4llm_c(input, output, 4 << 1);
    vpx_memset(input, 0, 32);
    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            int a = diff_ptr[c] + pred[c];
            if (a < 0)
                a = 0;
            if (a > 255)
                a = 255;
            dest[c] = (unsigned char) a;
        }
-void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc)
+        dest += stride;
        diff_ptr += 4;
        pred += pitch;
    }
 }
 void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
                               unsigned char *dest, int pitch, int stride,
                               int Dc)
 {
    int i;
    short output[16];
    short *diff_ptr = output;
    int r, c;
    input[0] = (short)Dc;
@@ -56,6 +87,28 @@ void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, in
        input[i] = dq[i] * input[i];
    }
-    vp8_short_idct4x4llm_c(input, output, pitch);
+    // the idct halves ( >> 1) the pitch
    vp8_short_idct4x4llm_c(input, output, 4 << 1);
    vpx_memset(input, 0, 32);
    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            int a = diff_ptr[c] + pred[c];
            if (a < 0)
                a = 0;
            if (a > 255)
                a = 255;
            dest[c] = (unsigned char) a;
        }
        dest += stride;
        diff_ptr += 4;
        pred += pitch;
    }
 }
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
@@ -16,11 +16,31 @@
 #define prototype_dequant_block(sym) \
    void sym(BLOCKD *x)
-#define prototype_dequant_idct(sym) \
+#define prototype_dequant_idct_add(sym) \
-    void sym(short *input, short *dq, short *output, int pitch)
+    void sym(short *input, short *dq, \
             unsigned char *pred, unsigned char *output, \
             int pitch, int stride)
-#define prototype_dequant_idct_dc(sym) \
+#define prototype_dequant_dc_idct_add(sym) \
-    void sym(short *input, short *dq, short *output, int pitch, int dc)
+    void sym(short *input, short *dq, \
             unsigned char *pred, unsigned char *output, \
             int pitch, int stride, \
             int dc)
 #define prototype_dequant_dc_idct_add_y_block(sym) \
    void sym(short *q, short *dq, \
             unsigned char *pre, unsigned char *dst, \
             int stride, char *eobs, short *dc)
 #define prototype_dequant_idct_add_y_block(sym) \
    void sym(short *q, short *dq, \
             unsigned char *pre, unsigned char *dst, \
             int stride, char *eobs)
 #define prototype_dequant_idct_add_uv_block(sym) \
    void sym(short *q, short *dq, \
             unsigned char *pre, unsigned char *dst_u, \
             unsigned char *dst_v, int stride, char *eobs)
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/dequantize_x86.h"
@@ -35,25 +55,52 @@
 #endif
 extern prototype_dequant_block(vp8_dequant_block);
-#ifndef vp8_dequant_idct
+#ifndef vp8_dequant_idct_add
-#define vp8_dequant_idct vp8_dequant_idct_c
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
 #endif
-extern prototype_dequant_idct(vp8_dequant_idct);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add);
-#ifndef vp8_dequant_idct_dc
+#ifndef vp8_dequant_dc_idct_add
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_c
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c
 #endif
-extern prototype_dequant_idct_dc(vp8_dequant_idct_dc);
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
 #ifndef vp8_dequant_dc_idct_add_y_block
 #define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c
 #endif
 extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block);
 #ifndef vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
 #endif
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block);
 #ifndef vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
 #endif
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block);
 typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
-typedef prototype_dequant_idct((*vp8_dequant_idct_fn_t));
+
-typedef prototype_dequant_idct_dc((*vp8_dequant_idct_dc_fn_t));
+typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
 typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
 typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t));
 typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));
 typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
 typedef struct
 {
    vp8_dequant_block_fn_t               block;
-    vp8_dequant_idct_fn_t     idct;
+    vp8_dequant_idct_add_fn_t            idct_add;
-    vp8_dequant_idct_dc_fn_t  idct_dc;
+    vp8_dequant_dc_idct_add_fn_t         dc_idct_add;
    vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
    vp8_dequant_idct_add_y_block_fn_t    idct_add_y_block;
    vp8_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
 } vp8_dequant_rtcd_vtable_t;
 #if CONFIG_RUNTIME_CPU_DETECT
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -14,11 +14,12 @@
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "detokenize.h"
 #define BOOL_DATA UINT8
 #define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
-DECLARE_ALIGNED(16, UINT16, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
+DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
 #define ONE_CONTEXT_NODE            2
@@ -60,50 +61,50 @@ DECLARE_ALIGNED(16, static const TOKENEXTRABITS, vp8d_token_extra_bits2[MAX_ENTR
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
 {
    ENTROPY_CONTEXT **const A = x->above_context;
    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
    ENTROPY_CONTEXT *a;
    ENTROPY_CONTEXT *l;
    /* Clear entropy contexts for Y blocks */
    a = A[Y1CONTEXT];
    l = L[Y1CONTEXT];
    *a = 0;
    *(a+1) = 0;
    *(a+2) = 0;
    *(a+3) = 0;
    *l = 0;
    *(l+1) = 0;
    *(l+2) = 0;
    *(l+3) = 0;
    /* Clear entropy contexts for U blocks */
    a = A[UCONTEXT];
    l = L[UCONTEXT];
    *a = 0;
    *(a+1) = 0;
    *l = 0;
    *(l+1) = 0;
    /* Clear entropy contexts for V blocks */
    a = A[VCONTEXT];
    l = L[VCONTEXT];
    *a = 0;
    *(a+1) = 0;
    *l = 0;
    *(l+1) = 0;
    /* Clear entropy contexts for Y2 blocks */
-    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
    {
-        a = A[Y2CONTEXT];
+        vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-        l = L[Y2CONTEXT];
+        vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-        *a = 0;
+    }
-        *l = 0;
+    else
    {
        vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
        vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
    }
 }
-DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
+
 #if CONFIG_ARM_ASM_DETOK
 // Concatenation of the vp8_block2left and vp8_block2above lookup tables
 // (25 entries each, one per block index) so that the assembly version
 // only needs a single table pointer.
 DECLARE_ALIGNED(16, const UINT8, vp8_block2leftabove[25*2]) =
 {
    // entries 0..24: vp8_block2left
    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    // entries 25..49: vp8_block2above
    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
 };
 /*
  * Fill in dx->detoken with pointers to the lookup tables, the macroblock
  * coefficient buffer and the per-type coefficient probability tables.
  */
 void vp8_init_detokenizer(VP8D_COMP *dx)
 {
    const VP8_COMMON *const common = &dx->common;
    int type;
    /* static lookup tables */
    dx->detoken.vp8_coef_tree_ptr = vp8_coef_tree;
    dx->detoken.ptr_block2leftabove = vp8_block2leftabove;
    dx->detoken.ptr_coef_bands_x = vp8_coef_bands_x;
    dx->detoken.scan = vp8_default_zig_zag1d;
    dx->detoken.teb_base_ptr = vp8d_token_extra_bits2;
    /* start of the decoder macroblock's coefficient buffer */
    dx->detoken.qcoeff_start_ptr = &dx->mb.qcoeff[0];
    /* one probability table base per first index (0..3) of fc.coef_probs */
    for (type = 0; type < 4; type++)
        dx->detoken.coef_probs[type] = (common->fc.coef_probs [type] [ 0 ] [0]);
 }
 #endif
 DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
 #define FILL \
    if(count < 0) \
        VP8DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
@@ -200,14 +201,45 @@ DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
    }\
    NORMALIZE
 #if CONFIG_ARM_ASM_DETOK
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
-    ENTROPY_CONTEXT **const A = x->above_context;
+    int eobtotal = 0;
-    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+    int i, type;
    dx->detoken.current_bc = x->current_bc;
    dx->detoken.A = x->above_context;
    dx->detoken.L = x->left_context;
    type = 3;
    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
    {
        type = 1;
        eobtotal -= 16;
    }
    vp8_decode_mb_tokens_v6(&dx->detoken, type);
    for (i = 0; i < 25; i++)
    {
        x->eobs[i] = dx->detoken.eob[i];
        eobtotal += dx->detoken.eob[i];
    }
    return eobtotal;
 }
 #else
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
    ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
    ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
    const VP8_COMMON *const oc = & dx->common;
    BOOL_DECODER *bc = x->current_bc;
    char *eobs = x->eobs;
    ENTROPY_CONTEXT *a;
    ENTROPY_CONTEXT *l;
    int i;
@@ -231,29 +263,24 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
    int stop;
    INT16 val, bits_count;
    INT16 c;
    INT16 t;
    INT16 v;
    const vp8_prob *Prob;
    //int *scan;
    type = 3;
    i = 0;
    stop = 16;
-    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+    scan = vp8_default_zig_zag1d;
    qcoeff_ptr = &x->qcoeff[0];
    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
    {
        i = 24;
        stop = 24;
        type = 1;
-        qcoeff_ptr = &x->qcoeff[24*16];
+        qcoeff_ptr += 24*16;
        scan = vp8_default_zig_zag1d;
        eobtotal -= 16;
    }
    else
    {
        scan = vp8_default_zig_zag1d;
        qcoeff_ptr = &x->qcoeff[0];
    }
    bufend  = bc->user_buffer_end;
    bufptr  = bc->user_buffer;
@@ -265,13 +292,15 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
    coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
 BLOCK_LOOP:
-    a = A[ vp8_block2context[i] ] + vp8_block2above[i];
+    a = A + vp8_block2above[i];
-    l = L[ vp8_block2context[i] ] + vp8_block2left[i];
+    l = L + vp8_block2left[i];
    c = (INT16)(!type);
-    VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
+//    Dest = ((A)!=0) + ((B)!=0);
    VP8_COMBINEENTROPYCONTEXTS(v, *a, *l);
    Prob = coef_probs;
-    Prob += t * ENTROPY_NODES;
+    Prob += v * ENTROPY_NODES;
 DO_WHILE:
    Prob += vp8_coef_bands_x[c];
@@ -358,9 +387,8 @@ ONE_CONTEXT_NODE_0_:
    qcoeff_ptr [ scan[15] ] = (INT16) v;
 BLOCK_FINISHED:
-    t = ((x->block[i].eob = c) != !type);   // any nonzero data?
+    *a = *l = ((eobs[i] = c) != !type);   // any nonzero data?
-    eobtotal += x->block[i].eob;
+    eobtotal += c;
    *a = *l = t;
    qcoeff_ptr += 16;
    i++;
@@ -370,12 +398,11 @@ BLOCK_FINISHED:
    if (i == 25)
    {
        scan = vp8_default_zig_zag1d;//x->scan_order1d;
        type = 0;
        i = 0;
        stop = 16;
        coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
-        qcoeff_ptr = &x->qcoeff[0];
+        qcoeff_ptr -= (24*16 + 16);
        goto BLOCK_LOOP;
    }
@@ -395,3 +422,4 @@ BLOCK_FINISHED:
    return eobtotal;
 }
 #endif //!CONFIG_ASM_DETOK
--- a/vp8/decoder/detokenize.h
+++ b/vp8/decoder/detokenize.h
@@ -9,12 +9,16 @@
 */
-#ifndef detokenize_h
+#ifndef DETOKENIZE_H
-#define detokenize_h 1
+#define DETOKENIZE_H
 #include "onyxd_int.h"
 #if ARCH_ARM
 #include "arm/detokenize_arm.h"
 #endif
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
 int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
-#endif /* detokenize_h */
+#endif /* DETOKENIZE_H */
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -21,8 +21,11 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
 #if CONFIG_RUNTIME_CPU_DETECT
    pbi->mb.rtcd                     = &pbi->common.rtcd;
    pbi->dequant.block               = vp8_dequantize_b_c;
-    pbi->dequant.idct    = vp8_dequant_idct_c;
+    pbi->dequant.idct_add            = vp8_dequant_idct_add_c;
-    pbi->dequant.idct_dc = vp8_dequant_dc_idct_c;
+    pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_c;
    pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
    pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_c;
    pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_c;
    pbi->dboolhuff.start             = vp8dx_start_decode_c;
    pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
 #if 0 //For use with RTCD, when implemented
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c
@@ -0,0 +1,116 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_ports/config.h"
 #include "idct.h"
 #include "dequantize.h"
 /*
  * Reconstruct the sixteen 4x4 luma blocks of one macroblock, supplying each
  * block's DC term from a separate array.
  *
  * q      coefficients, 16 shorts per block, blocks in raster order
  * dq     dequantization factors, forwarded to the per-block routine
  * pre    input samples handed to the add routines with a pitch of 16
  *        (presumably the prediction buffer)
  * dst    output samples, addressed with the caller-supplied stride
  * eobs   one entry per block; a value above 1 selects the full
  *        dequant + IDCT routine, anything else the DC-only routine
  * dc     one DC value per block
  */
 void vp8_dequant_dc_idct_add_y_block_c
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
 {
    int blk;

    for (blk = 0; blk < 16; blk++)
    {
        // Blocks are visited in raster order: four per row, four rows.
        const int row = blk >> 2;
        const int col = blk & 3;
        short *coeffs = q + 16 * blk;
        unsigned char *src = pre + 64 * row + 4 * col;
        unsigned char *out = dst + 4 * stride * row + 4 * col;

        if (eobs[blk] > 1)
            vp8_dequant_dc_idct_add_c (coeffs, dq, src, out, 16, stride, dc[blk]);
        else
            vp8_dc_only_idct_add_c (dc[blk], src, out, 16, stride);
    }
 }
 /*
  * Reconstruct the sixteen 4x4 luma blocks of one macroblock whose DC terms
  * live in the coefficient data itself.
  *
  * q      coefficients, 16 shorts per block, blocks in raster order
  * dq     dequantization factors; dq[0] scales the DC term on the
  *        DC-only path
  * pre    input samples handed to the add routines with a pitch of 16
  *        (presumably the prediction buffer)
  * dst    output samples, addressed with the caller-supplied stride
  * eobs   one entry per block; a value above 1 selects the full
  *        dequant + IDCT routine, anything else the DC-only routine
  */
 void vp8_dequant_idct_add_y_block_c
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
 {
    int i, j;

    for (i = 0; i < 4; i++)
    {
        for (j = 0; j < 4; j++)
        {
            if (*eobs++ > 1)
                vp8_dequant_idct_add_c (q, dq, pre, dst, 16, stride);
            else
            {
                vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dst, 16, stride);

                // Clear the first two coefficients with plain short stores.
                // Writing them through an int lvalue would break C's
                // effective-type (strict aliasing) rules and would depend
                // on q being int-aligned and on int spanning two shorts.
                q[0] = 0;
                q[1] = 0;
            }

            q   += 16;
            pre += 4;
            dst += 4;
        }

        // Step from the end of this row of blocks to the start of the next.
        pre += 64 - 16;
        dst += 4*stride - 16;
    }
 }
 /*
  * Reconstruct the chroma of one macroblock: four 4x4 blocks written to
  * dstu followed by four written to dstv.
  *
  * q      coefficients, 16 shorts per block; the four blocks for dstu are
  *        followed directly by the four for dstv
  * dq     dequantization factors; dq[0] scales the DC term on the
  *        DC-only path
  * pre    input samples handed to the add routines with a pitch of 8;
  *        the data for the second plane follows the first contiguously
  *        (presumably the prediction buffer)
  * dstu   output samples of the first plane, addressed with `stride`
  * dstv   output samples of the second plane, addressed with `stride`
  * eobs   one entry per block, in the same order as q; a value above 1
  *        selects the full dequant + IDCT routine, anything else the
  *        DC-only routine
  */
 void vp8_dequant_idct_add_uv_block_c
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
 {
    unsigned char *planes[2];
    int p, i, j;

    planes[0] = dstu;
    planes[1] = dstv;

    // q, pre and eobs run on continuously from the first plane into the
    // second; only the destination pointer is switched between planes.
    for (p = 0; p < 2; p++)
    {
        unsigned char *dst = planes[p];

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (*eobs++ > 1)
                    vp8_dequant_idct_add_c (q, dq, pre, dst, 8, stride);
                else
                {
                    vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dst, 8, stride);

                    // Clear the first two coefficients with plain short
                    // stores; an int-typed store through a short pointer
                    // would break C's effective-type (strict aliasing)
                    // rules and assume int alignment.
                    q[0] = 0;
                    q[1] = 0;
                }

                q   += 16;
                pre += 4;
                dst += 4;
            }

            // Step from the end of this row of blocks to the next row.
            pre += 32 - 8;
            dst += 4*stride - 8;
        }
    }
 }
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -24,18 +24,16 @@
 #include "threading.h"
 #include "decoderthreading.h"
 #include <stdio.h>
-#include "segmentation_common.h"
+
 #include "quant_common.h"
 #include "vpx_scale/vpxscale.h"
 #include "systemdependent.h"
 #include "vpx_ports/vpx_timer.h"
-
+#include "detokenize.h"
 extern void vp8_init_loop_filter(VP8_COMMON *cm);
 extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
 // DEBUG code
 #if CONFIG_DEBUG
 void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s)
 {
@@ -129,6 +127,9 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
        cm->last_sharpness_level = cm->sharpness_level;
    }
 #if CONFIG_ARM_ASM_DETOK
    vp8_init_detokenizer(pbi);
 #endif
    pbi->common.error.setjmp = 0;
    return (VP8D_PTR) pbi;
 }
@@ -180,38 +181,38 @@ int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_C
 {
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    VP8_COMMON *cm = &pbi->common;
    int ref_fb_idx;
    if (ref_frame_flag == VP8_LAST_FLAG)
-        vp8_yv12_copy_frame_ptr(&cm->last_frame, sd);
+        ref_fb_idx = cm->lst_fb_idx;
    else if (ref_frame_flag == VP8_GOLD_FLAG)
-        vp8_yv12_copy_frame_ptr(&cm->golden_frame, sd);
+        ref_fb_idx = cm->gld_fb_idx;
    else if (ref_frame_flag == VP8_ALT_FLAG)
-        vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, sd);
+        ref_fb_idx = cm->alt_fb_idx;
    else
        return -1;
    vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
    return 0;
 }
 int vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    VP8_COMMON *cm = &pbi->common;
    int ref_fb_idx;
    if (ref_frame_flag == VP8_LAST_FLAG)
-        vp8_yv12_copy_frame_ptr(sd, &cm->last_frame);
+        ref_fb_idx = cm->lst_fb_idx;
    else if (ref_frame_flag == VP8_GOLD_FLAG)
-        vp8_yv12_copy_frame_ptr(sd, &cm->golden_frame);
+        ref_fb_idx = cm->gld_fb_idx;
    else if (ref_frame_flag == VP8_ALT_FLAG)
-        vp8_yv12_copy_frame_ptr(sd, &cm->alt_ref_frame);
+        ref_fb_idx = cm->alt_fb_idx;
    else
        return -1;
    vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[ref_fb_idx]);
    return 0;
 }
@@ -221,12 +222,95 @@ extern void vp8_push_neon(INT64 *store);
 extern void vp8_pop_neon(INT64 *store);
 static INT64 dx_store_reg[8];
 #endif
 // Find the first frame buffer whose reference count is zero, mark it as
 // referenced once, and return its index.
 //
 // If every buffer is in use the return value is NUM_YV12_BUFFERS and no
 // count is modified; that value is not a valid index and callers should
 // treat it as "no buffer available".
 static int get_free_fb (VP8_COMMON *cm)
 {
    int i;

    for (i = 0; i < NUM_YV12_BUFFERS; i++)
        if (cm->fb_idx_ref_cnt[i] == 0)
            break;

    // Only claim a slot that was actually found.  Without this guard a
    // fully occupied pool would store one element past the entries scanned
    // above (assumes fb_idx_ref_cnt holds NUM_YV12_BUFFERS counters).
    if (i < NUM_YV12_BUFFERS)
        cm->fb_idx_ref_cnt[i] = 1;

    return i;
 }
 // Move a reference from the buffer currently named by *idx to new_idx.
 //
 // buf      array of per-buffer reference counts
 // idx      in/out: index of the buffer being referenced; set to new_idx
 // new_idx  index of the buffer to reference from now on
 //
 // The old buffer's count is decremented only if it is positive; the new
 // buffer's count is always incremented.
 static void ref_cnt_fb (int *buf, int *idx, int new_idx)
 {
    int *old_count = &buf[*idx];

    if (*old_count > 0)
        --*old_count;

    *idx = new_idx;
    buf[new_idx] += 1;
 }
 // If any buffer copy / swapping is signalled it should be done here.
 //
 // Re-points the last / golden / alt-ref buffer indices according to the
 // copy_buffer_to_* and refresh_* flags in cm, keeping fb_idx_ref_cnt in
 // step through ref_cnt_fb(), selects cm->frame_to_show, and finally drops
 // one reference on the new_fb_idx buffer.
 //
 // Returns 0 on success, or -1 if a copy flag holds a value other than
 // 1 or 2.
 static int swap_frame_buffers (VP8_COMMON *cm)
 {
    int err = 0;

    // The alternate reference frame or golden frame can be updated
    //  using the new, last, or golden/alt ref frame.  If it
    //  is updated using the newly decoded frame it is a refresh.
    //  An update using the last or golden/alt ref frame is a copy.
    //
    // A copy value of 1 selects the last frame.  lst_fb_idx is only
    // re-pointed further down, so here it still names the previous last
    // frame whether or not refresh_last_frame is set; the copy must never
    // pick up new_fb_idx.
    if (cm->copy_buffer_to_arf)
    {
        int new_fb = 0;

        if (cm->copy_buffer_to_arf == 1)
            new_fb = cm->lst_fb_idx;
        else if (cm->copy_buffer_to_arf == 2)
            new_fb = cm->gld_fb_idx;
        else
            err = -1;

        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
    }

    if (cm->copy_buffer_to_gf)
    {
        int new_fb = 0;

        if (cm->copy_buffer_to_gf == 1)
            new_fb = cm->lst_fb_idx;
        else if (cm->copy_buffer_to_gf == 2)
            new_fb = cm->alt_fb_idx;
        else
            err = -1;

        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
    }

    // Refreshes point the reference at the newly decoded buffer.
    if (cm->refresh_golden_frame)
        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);

    if (cm->refresh_alt_ref_frame)
        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);

    if (cm->refresh_last_frame)
    {
        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
        cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
    }
    else
        cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];

    // Drop one reference on the new buffer (presumably the one taken when
    // it was handed out for decoding).
    cm->fb_idx_ref_cnt[cm->new_fb_idx]--;

    return err;
 }
 int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, INT64 time_stamp)
 {
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    VP8_COMMON *cm = &pbi->common;
    int retcode = 0;
    struct vpx_usec_timer timer;
 //  if(pbi->ready_for_new_data == 0)
@@ -257,6 +341,8 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
    pbi->Source = source;
    pbi->source_sz = size;
    cm->new_fb_idx = get_free_fb (cm);
    retcode = vp8_decode_frame(pbi);
    if (retcode < 0)
@@ -269,23 +355,17 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
        return retcode;
    }
    // Update the GF useage maps.
    vp8_update_gf_useage_maps(cm, &pbi->mb);
    if (pbi->b_multithreaded_lf && pbi->common.filter_level != 0)
        vp8_stop_lfthread(pbi);
-    if (cm->refresh_last_frame)
+    if (swap_frame_buffers (cm))
    {
-        vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);
+        pbi->common.error.error_code = VPX_CODEC_ERROR;
-
+        pbi->common.error.setjmp = 0;
-        cm->frame_to_show = &cm->last_frame;
+        return -1;
    }
    else
    {
        cm->frame_to_show = &cm->new_frame;
    }
 /*
    if (!pbi->b_multithreaded_lf)
    {
        struct vpx_usec_timer lpftimer;
@@ -293,16 +373,45 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
        // Apply the loop filter if appropriate.
        if (cm->filter_level > 0)
        {
            vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
            cm->last_frame_type = cm->frame_type;
            cm->last_filter_type = cm->filter_type;
            cm->last_sharpness_level = cm->sharpness_level;
        }
        vpx_usec_timer_mark(&lpftimer);
        pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
    }else{
      struct vpx_usec_timer lpftimer;
      vpx_usec_timer_start(&lpftimer);
      // Apply the loop filter if appropriate.
      if (cm->filter_level > 0)
          vp8_mt_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
      vpx_usec_timer_mark(&lpftimer);
      pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
    }
    if (cm->filter_level > 0) {
        cm->last_frame_type = cm->frame_type;
        cm->last_filter_type = cm->filter_type;
        cm->last_sharpness_level = cm->sharpness_level;
    }
 */
    if(pbi->common.filter_level)
    {
        struct vpx_usec_timer lpftimer;
        vpx_usec_timer_start(&lpftimer);
        // Apply the loop filter if appropriate.
        if (pbi->b_multithreaded_lf && cm->multi_token_partition != ONE_PARTITION)
            vp8_mt_loop_filter_frame(pbi);   //cm, &pbi->mb, cm->filter_level);
        else
            vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
        vpx_usec_timer_mark(&lpftimer);
        pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
        cm->last_frame_type = cm->frame_type;
        cm->last_filter_type = cm->filter_type;
        cm->last_sharpness_level = cm->sharpness_level;
    }
    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
@@ -314,49 +423,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
        write_dx_frame_to_file(cm->frame_to_show, cm->current_video_frame);
 #endif
    // If any buffer copy / swaping is signalled it should be done here.
    if (cm->copy_buffer_to_arf)
    {
        if (cm->copy_buffer_to_arf == 1)
        {
            if (cm->refresh_last_frame)
                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->alt_ref_frame);
            else
                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->alt_ref_frame);
        }
        else if (cm->copy_buffer_to_arf == 2)
            vp8_yv12_copy_frame_ptr(&cm->golden_frame, &cm->alt_ref_frame);
    }
    if (cm->copy_buffer_to_gf)
    {
        if (cm->copy_buffer_to_gf == 1)
        {
            if (cm->refresh_last_frame)
                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->golden_frame);
            else
                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
        }
        else if (cm->copy_buffer_to_gf == 2)
            vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, &cm->golden_frame);
    }
    // Should the golden or alternate reference frame be refreshed?
    if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
    {
        if (cm->refresh_golden_frame)
            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
        if (cm->refresh_alt_ref_frame)
            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
        //vpx_log("Decoder: recovery frame received \n");
        // Update data structures that monitors GF useage
        vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
        cm->gf_active_count = cm->mb_rows * cm->mb_cols;
    }
    vp8_clear_system_state();
    vpx_usec_timer_mark(&timer);
--- a/vp8/decoder/onyxd_if_sjl.c
+++ b/vp8/decoder/onyxd_if_sjl.c
@@ -1,399 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "onyxc_int.h"
 #include "postproc.h"
 #include "onyxd.h"
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "alloccommon.h"
 #include "vpx_scale/yv12extend.h"
 #include "loopfilter.h"
 #include "swapyv12buffer.h"
 #include "g_common.h"
 #include "threading.h"
 #include "decoderthreading.h"
 #include <stdio.h>
 #include "segmentation_common.h"
 #include "quant_common.h"
 #include "vpx_scale/vpxscale.h"
 #include "systemdependent.h"
 #include "vpx_ports/vpx_timer.h"
 #ifndef VPX_NO_GLOBALS
 static int init_ct = 0;
 #else
 # include "vpx_global_handling.h"
 # define init_ct ((int)vpxglobalm(onyxd,init_ct))
 #endif
 extern void vp8_init_loop_filter(VP8_COMMON *cm);
 extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
 extern void init_detokenizer(VP8D_COMP *dx);
 // DEBUG code
 /* Write `rows` lines of `width` bytes each, `stride` bytes apart, to f. */
 static void write_yuv_plane(FILE *f, const unsigned char *src,
                             int width, int rows, int stride)
 {
    while (rows-- > 0)
    {
        fwrite(src, width, 1, f);
        src += stride;
    }
 }

 /*
  * Debug helper: append the Y, U and V planes of frame `s`, in that order,
  * to the file called `name` (opened in binary append mode).
  *
  * Only the visible width of every line is written; the stride is used
  * solely to step between lines.  If the file cannot be opened the call
  * does nothing.
  */
 void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s)
 {
    FILE *yuv_file = fopen((char *)name, "ab");

    // fopen() returns NULL on failure; writing to or closing a null stream
    // is undefined, so bail out quietly.
    if (!yuv_file)
        return;

    write_yuv_plane(yuv_file, s->y_buffer, s->y_width,  s->y_height,  s->y_stride);
    write_yuv_plane(yuv_file, s->u_buffer, s->uv_width, s->uv_height, s->uv_stride);
    write_yuv_plane(yuv_file, s->v_buffer, s->uv_width, s->uv_height, s->uv_stride);

    fclose(yuv_file);
 }
 /*
  * Count one more user of the decoder in init_ct.  The user that finds the
  * count at zero also runs the common and scaler initialisation.
  */
 void vp8dx_initialize()
 {
    if (init_ct++ != 0)
        return;

    vp8_initialize_common();
    vp8_scale_machine_specific_config();
 }
 /*
  * Count one user fewer in init_ct.  When the count reaches zero the common
  * shutdown routine is run.
  */
 void vp8dx_shutdown()
 {
    if (--init_ct != 0)
        return;

    vp8_shutdown_common();
 }
 /*
  * Allocate and initialise a decoder instance.
  *
  * oxcf   configuration; only max_threads is read here
  *
  * Returns the new instance as an opaque VP8D_PTR, or NULL if the
  * allocation fails.
  */
 VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
 {
    VP8_COMMON *cm;
    VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));

    if (!pbi)
        return NULL;

    // Start from an all-zero instance.
    vpx_memset(pbi, 0, sizeof(VP8D_COMP));
    cm = &pbi->common;

    vp8dx_initialize();
    vp8_create_common(cm);
    vp8_dmachine_specific_config(pbi);

    cm->current_video_frame = 0;
    pbi->ready_for_new_data = 1;
    pbi->CPUFreq = 0; //vp8_get_processor_freq();

    // The thread count has to be in place before the threads are created.
    pbi->max_threads = oxcf->max_threads;
    vp8_decoder_create_threads(pbi);

    // First call of vp8cx_init_de_quantizer().  frame_init_dequantizer()
    // should check for this so the tables are not rebuilt on every frame.
    vp8cx_init_de_quantizer(pbi);

    // Loop filter set-up, then record the state it was initialised for.
    vp8_init_loop_filter(cm);
    cm->last_frame_type = KEY_FRAME;
    cm->last_filter_type = cm->filter_type;
    cm->last_sharpness_level = cm->sharpness_level;

    init_detokenizer(pbi);

    return (VP8D_PTR) pbi;
 }
 /*
  * Tear down a decoder instance: stop its threads, release the common
  * state, free the instance and give back one vp8dx_initialize() count.
  * A null handle is ignored.
  */
 void vp8dx_remove_decompressor(VP8D_PTR ptr)
 {
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;

    if (pbi)
    {
        vp8_decoder_remove_threads(pbi);
        vp8_remove_common(&pbi->common);
        vpx_free(pbi);
        vp8dx_shutdown();
    }
 }
 /*
  * Setting hook.  No setting currently has any effect: the instance and
  * the value are ignored, and the only recognised selector, VP8D_OK, does
  * nothing.
  */
 void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x)
 {
    (void) comp;
    (void) x;

    switch (oxst)
    {
    case VP8D_OK:
    default:
        break;
    }
 }
 /*
  * Setting query hook.  No setting can currently be read back: the
  * instance is ignored and the function returns -1 for every selector.
  */
 int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst)
 {
    (void) comp;

    switch (oxst)
    {
    case VP8D_OK:
    default:
        break;
    }

    return -1;
 }
 /*
  * Copy one of the decoder's reference frames into `sd`.
  *
  * ref_frame_flag   VP8_LAST_FLAG, VP8_GOLD_FLAG or VP8_ALT_FLAG
  *
  * Returns 0 after the copy, or -1 (without copying) for any other flag.
  */
 int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    VP8_COMMON *cm = &pbi->common;
    YV12_BUFFER_CONFIG *ref;

    // Pick the frame first, then do the copy in one place.
    if (ref_frame_flag == VP8_LAST_FLAG)
        ref = &cm->last_frame;
    else if (ref_frame_flag == VP8_GOLD_FLAG)
        ref = &cm->golden_frame;
    else if (ref_frame_flag == VP8_ALT_FLAG)
        ref = &cm->alt_ref_frame;
    else
        return -1;

    vp8_yv12_copy_frame_ptr(ref, sd);
    return 0;
 }
 /*
  * Overwrite one of the decoder's reference frames with the contents
  * of `sd`.
  *
  * ref_frame_flag   VP8_LAST_FLAG, VP8_GOLD_FLAG or VP8_ALT_FLAG
  *
  * Returns 0 after the copy, or -1 (without copying) for any other flag.
  */
 int vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    VP8_COMMON *cm = &pbi->common;
    YV12_BUFFER_CONFIG *ref;

    // Pick the frame first, then do the copy in one place.
    if (ref_frame_flag == VP8_LAST_FLAG)
        ref = &cm->last_frame;
    else if (ref_frame_flag == VP8_GOLD_FLAG)
        ref = &cm->golden_frame;
    else if (ref_frame_flag == VP8_ALT_FLAG)
        ref = &cm->alt_ref_frame;
    else
        return -1;

    vp8_yv12_copy_frame_ptr(sd, ref);
    return 0;
 }
 /*
  * Decode one compressed frame and update the decoder's reference buffers.
  *
  * ptr          decoder instance; a null handle is rejected with -1
  * size         unused
  * source       compressed data, stored in pbi->Source for the frame decoder
  * time_stamp   recorded as the time stamp of this frame
  *
  * Returns -1 for a null handle, a negative vp8_decode_frame() result if
  * decoding fails, and otherwise the vp8_decode_frame() result.
  */
 int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, char *source, INT64 time_stamp)
 {
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    VP8_COMMON *cm;
    int retcode = 0;
    struct vpx_usec_timer timer;
    (void) size;

    // Reject a null handle before anything is derived from it.
    if (ptr == 0)
    {
        return -1;
    }

    cm = &pbi->common;

 //  if(pbi->ready_for_new_data == 0)
 //      return -1;
    vpx_usec_timer_start(&timer);

    //cm->current_video_frame++;
    pbi->Source = source;
    retcode = vp8_decode_frame(pbi);
    if (retcode < 0)
        return retcode;

    // Update the GF useage maps.
    vp8_update_gf_useage_maps(cm, &pbi->mb);

    if (pbi->b_multithreaded)
        vp8_stop_lfthread(pbi);

    // When the last frame is refreshed the new frame is swapped into the
    // last-frame slot and shown from there; otherwise it is shown in place.
    if (cm->refresh_last_frame)
    {
        vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);
        cm->frame_to_show = &cm->last_frame;
    }
    else
    {
        cm->frame_to_show = &cm->new_frame;
    }

    if (!pbi->b_multithreaded)
    {
        struct vpx_usec_timer lpftimer;
        vpx_usec_timer_start(&lpftimer);

        // Apply the loop filter if appropriate.
        if (cm->filter_level > 0)
        {
            vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
            cm->last_frame_type = cm->frame_type;
            cm->last_filter_type = cm->filter_type;
            cm->last_sharpness_level = cm->sharpness_level;
        }

        vpx_usec_timer_mark(&lpftimer);
        pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
    }

    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);

 #if 0
    // DEBUG code
    //vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
    if (cm->current_video_frame <= 5)
        write_dx_frame_to_file(cm->frame_to_show, cm->current_video_frame);
 #endif

    // If any buffer copy / swapping is signalled it should be done here.
    // After the swap above, new_frame holds the previous last frame when
    // refresh_last_frame is set, which is why it is the copy source then.
    if (cm->copy_buffer_to_arf)
    {
        if (cm->copy_buffer_to_arf == 1)
        {
            if (cm->refresh_last_frame)
                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->alt_ref_frame);
            else
                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->alt_ref_frame);
        }
        else if (cm->copy_buffer_to_arf == 2)
            vp8_yv12_copy_frame_ptr(&cm->golden_frame, &cm->alt_ref_frame);
    }

    if (cm->copy_buffer_to_gf)
    {
        if (cm->copy_buffer_to_gf == 1)
        {
            if (cm->refresh_last_frame)
                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->golden_frame);
            else
                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
        }
        else if (cm->copy_buffer_to_gf == 2)
            vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, &cm->golden_frame);
    }

    // Should the golden or alternate reference frame be refreshed?
    if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
    {
        if (cm->refresh_golden_frame)
            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);

        if (cm->refresh_alt_ref_frame)
            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);

        //vpx_log("Decoder: recovery frame received \n");

        // Update data structures that monitors GF useage
        vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
        cm->gf_active_count = cm->mb_rows * cm->mb_cols;
    }

    vp8_clear_system_state();

    vpx_usec_timer_mark(&timer);
    pbi->decode_microseconds = vpx_usec_timer_elapsed(&timer);
    pbi->time_decoding += pbi->decode_microseconds;

 //  vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);

    cm->current_video_frame++;
    pbi->ready_for_new_data = 0;
    pbi->last_time_stamp = time_stamp;

    // Bitrate / framerate estimate over a 16-entry ring of recent frames,
    // indexed by the low four bits of the frame counter.
    {
        int i;
        INT64 earliest_time = pbi->dr[0].time_stamp;
        INT64 latest_time = pbi->dr[0].time_stamp;
        INT64 time_diff = 0;
        int bytes = 0;

        pbi->dr[pbi->common.current_video_frame&0xf].size = pbi->bc.pos + pbi->bc2.pos + 4;
        pbi->dr[pbi->common.current_video_frame&0xf].time_stamp = time_stamp;

        for (i = 0; i < 16; i++)
        {
            bytes += pbi->dr[i].size;

            if (pbi->dr[i].time_stamp < earliest_time)
                earliest_time = pbi->dr[i].time_stamp;

            if (pbi->dr[i].time_stamp > latest_time)
                latest_time = pbi->dr[i].time_stamp;
        }

        time_diff = latest_time - earliest_time;

        // The estimates are left untouched while the window spans no time.
        if (time_diff > 0)
        {
            pbi->common.bitrate = 80000.00 * bytes / time_diff  ;
            pbi->common.framerate = 160000000.00 / time_diff ;
        }
    }

    return retcode;
 }
 /*
  * Hand the most recently decoded frame to the caller through the
  * post-processor.
  *
  * sd               receives the frame; its clrtype is set from the decoder
  * time_stamp       receives the time stamp recorded for the frame
  * time_end_stamp   always set to 0
  * deblock_level, noise_level, flags
  *                  forwarded unchanged to vp8_post_proc_frame()
  *
  * Returns -1 when no new frame is pending or the frame is not meant to be
  * shown; otherwise the result of vp8_post_proc_frame().  A successful
  * fetch marks the decoder as ready for new data.
  */
 int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags)
 {
    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
    int ret;

    // Nothing to hand out: either no frame has arrived since the last
    // fetch, or the frame is flagged as not to be shown.
    if (pbi->ready_for_new_data == 1 || pbi->common.show_frame == 0)
        return -1;

    pbi->ready_for_new_data = 1;
    *time_stamp = pbi->last_time_stamp;
    *time_end_stamp = 0;
    sd->clrtype = pbi->common.clr_type;

    ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
    vp8_clear_system_state();

    return ret;
 }
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -48,21 +48,20 @@ typedef struct
 typedef struct
 {
-    int *scan;
+    int const *scan;
-    UINT8 *ptr_onyxblock2context_leftabove;
+    UINT8 const *ptr_block2leftabove;
-    vp8_tree_index *vp8_coef_tree_ptr;  //onyx_coef_tree_ptr; ???
+    vp8_tree_index const *vp8_coef_tree_ptr;
-    TOKENEXTRABITS *teb_base_ptr;
+    TOKENEXTRABITS const *teb_base_ptr;
    unsigned char *norm_ptr;
-//  UINT16 *ptr_onyx_coef_bands_x;
+    UINT8 *ptr_coef_bands_x;
    UINT8 *ptr_onyx_coef_bands_x;
-    ENTROPY_CONTEXT   **A;
+    ENTROPY_CONTEXT_PLANES *A;
-    ENTROPY_CONTEXT(*L)[4];
+    ENTROPY_CONTEXT_PLANES *L;
    INT16 *qcoeff_start_ptr;
    BOOL_DECODER *current_bc;
-    UINT8 *coef_probs[4];
+    vp8_prob const *coef_probs[4];
    UINT8 eob[25];
@@ -95,20 +94,22 @@ typedef struct VP8Decompressor
    int current_mb_col_main;
    int decoding_thread_count;
    int allocated_decoding_thread_count;
    int *current_mb_col;                  //Each row remembers its already decoded column.
    int mt_baseline_filter_level[MAX_MB_SEGMENTS];
    // variable for threading
    DECLARE_ALIGNED(16, MACROBLOCKD, lpfmb);
 #if CONFIG_MULTITHREAD
-    pthread_t           h_thread_lpf;         // thread for postprocessing
+    //pthread_t           h_thread_lpf;         // thread for postprocessing
-    sem_t               h_event_lpf;          // Event for post_proc completed
+    sem_t               h_event_end_lpf;          // Event for post_proc completed
-    sem_t               h_event_start_lpf;
+    sem_t               *h_event_start_lpf;
 #endif
    MB_ROW_DEC           *mb_row_di;
    DECODETHREAD_DATA   *de_thread_data;
 #if CONFIG_MULTITHREAD
    pthread_t           *h_decoding_thread;
-    sem_t               *h_event_mbrdecoding;
+    sem_t               *h_event_start_decoding;
-    sem_t               h_event_main;
+    sem_t               h_event_end_decoding;
    // end of threading data
 #endif
    vp8_reader *mbc;
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -12,6 +12,9 @@
 #ifndef WIN32
 # include <unistd.h>
 #endif
 #ifdef __APPLE__
 #include <mach/mach_init.h>
 #endif
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "threading.h"
@@ -20,6 +23,8 @@
 #include "extend.h"
 #include "vpx_ports/vpx_timer.h"
 #define MAX_ROWS 256
 extern void vp8_decode_mb_row(VP8D_COMP *pbi,
                              VP8_COMMON *pc,
                              int mb_row,
@@ -28,11 +33,10 @@ extern void vp8_decode_mb_row(VP8D_COMP *pbi,
 extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
 extern void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd);
 void vp8_thread_loop_filter(VP8D_COMP *pbi, MB_ROW_DEC *mbrd, int ithread);
 void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
 #if CONFIG_MULTITHREAD
    VP8_COMMON *const pc = & pbi->common;
    int i, j;
@@ -43,13 +47,10 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
 #if CONFIG_RUNTIME_CPU_DETECT
        mbd->rtcd = xd->rtcd;
 #endif
        mbd->subpixel_predict        = xd->subpixel_predict;
        mbd->subpixel_predict8x4     = xd->subpixel_predict8x4;
        mbd->subpixel_predict8x8     = xd->subpixel_predict8x8;
        mbd->subpixel_predict16x16   = xd->subpixel_predict16x16;
        mbd->gf_active_ptr            = xd->gf_active_ptr;
        mbd->mode_info        = pc->mi - 1;
        mbd->mode_info_context = pc->mi   + pc->mode_info_stride * (i + 1);
@@ -59,11 +60,8 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
        mbd->frames_since_golden      = pc->frames_since_golden;
        mbd->frames_till_alt_ref_frame  = pc->frames_till_alt_ref_frame;
-        mbd->pre = pc->last_frame;
+        mbd->pre = pc->yv12_fb[pc->lst_fb_idx];
-        mbd->dst = pc->new_frame;
+        mbd->dst = pc->yv12_fb[pc->new_fb_idx];
        vp8_setup_block_dptrs(mbd);
        vp8_build_block_doffsets(mbd);
@@ -71,9 +69,6 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
        mbd->mb_segement_abs_delta     = xd->mb_segement_abs_delta;
        vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
        mbd->mbmi.mode = DC_PRED;
        mbd->mbmi.uv_mode = DC_PRED;
        mbd->current_bc = &pbi->bc2;
        for (j = 0; j < 25; j++)
@@ -82,6 +77,8 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
        }
    }
    for (i=0; i< pc->mb_rows; i++)
        pbi->current_mb_col[i]=-1;
 #else
    (void) pbi;
    (void) xd;
@@ -90,6 +87,69 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
 #endif
 }
 /*
  * Prepare the per-thread MACROBLOCKD copies used for loop filtering.
  *
  * For each of the first `count` entries of mbrd this sets up the mode-info
  * pointers from the common state and copies the segmentation and
  * loop-filter delta settings from xd.  Afterwards every row's entry in
  * pbi->current_mb_col is reset to -1.
  *
  * Without CONFIG_MULTITHREAD the function does nothing.
  */
 void vp8_setup_loop_filter_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
 #if CONFIG_MULTITHREAD
    VP8_COMMON *const pc = &pbi->common;
    int t, row;

    for (t = 0; t < count; t++)
    {
        MACROBLOCKD *const mbd = &mbrd[t].mbd;

        // Mode info: entry t starts (t + 1) mode-info rows into the frame.
        mbd->mode_info         = pc->mi - 1;
        mbd->mode_info_context = pc->mi + pc->mode_info_stride * (t + 1);
        mbd->mode_info_stride  = pc->mode_info_stride;

        // Segmentation settings.
        mbd->segmentation_enabled  = xd->segmentation_enabled;
        mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
        vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));

        // Loop-filter deltas per reference frame and per prediction mode.
        vpx_memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
        vpx_memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
        mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
        mbd->mode_ref_lf_delta_update  = xd->mode_ref_lf_delta_update;
    }

    // -1 marks a row in which no column has been processed yet.
    for (row = 0; row < pc->mb_rows; row++)
        pbi->current_mb_col[row] = -1;
 #else
    (void) pbi;
    (void) xd;
    (void) mbrd;
    (void) count;
 #endif
 }
 THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
 {
@@ -97,48 +157,51 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
    int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
    VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
    MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
-    ENTROPY_CONTEXT mb_row_left_context[4][4];
+    ENTROPY_CONTEXT_PLANES mb_row_left_context;
    while (1)
    {
        int current_filter_level = 0;
        if (pbi->b_multithreaded_rd == 0)
            break;
-        //if(WaitForSingleObject(pbi->h_event_mbrdecoding[ithread], INFINITE) == WAIT_OBJECT_0)
+        //if(WaitForSingleObject(pbi->h_event_start_decoding[ithread], INFINITE) == WAIT_OBJECT_0)
-        if (sem_wait(&pbi->h_event_mbrdecoding[ithread]) == 0)
+        if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0)
        {
            if (pbi->b_multithreaded_rd == 0)
                break;
            else
            {
                VP8_COMMON *pc = &pbi->common;
                int mb_row       = mbrd->mb_row;
                MACROBLOCKD *xd = &mbrd->mbd;
-                //printf("ithread:%d mb_row %d\n", ithread, mb_row);
+                int mb_row;
                int num_part = 1 << pbi->common.multi_token_partition;
                volatile int *last_row_current_mb_col;
                for (mb_row = ithread+1; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
                {
                    int i;
                    int recon_yoffset, recon_uvoffset;
                    int mb_col;
-                int recon_y_stride = pc->last_frame.y_stride;
+                    int ref_fb_idx = pc->lst_fb_idx;
-                int recon_uv_stride = pc->last_frame.uv_stride;
+                    int dst_fb_idx = pc->new_fb_idx;
                    int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
                    int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
-                volatile int *last_row_current_mb_col;
+                    pbi->mb_row_di[ithread].mb_row = mb_row;
                    pbi->mb_row_di[ithread].mbd.current_bc =  &pbi->mbc[mb_row%num_part];
-                if (ithread > 0)
+                    last_row_current_mb_col = &pbi->current_mb_col[mb_row -1];
                    last_row_current_mb_col = &pbi->mb_row_di[ithread-1].current_mb_col;
                else
                    last_row_current_mb_col = &pbi->current_mb_col_main;
                    recon_yoffset = mb_row * recon_y_stride * 16;
                    recon_uvoffset = mb_row * recon_uv_stride * 8;
                    // reset above block coeffs
-                xd->above_context[Y1CONTEXT] = pc->above_context[Y1CONTEXT];
+                    xd->above_context = pc->above_context;
-                xd->above_context[UCONTEXT ] = pc->above_context[UCONTEXT];
+                    xd->left_context = &mb_row_left_context;
-                xd->above_context[VCONTEXT ] = pc->above_context[VCONTEXT];
+                    vpx_memset(&mb_row_left_context, 0, sizeof(mb_row_left_context));
                xd->above_context[Y2CONTEXT] = pc->above_context[Y2CONTEXT];
                xd->left_context = mb_row_left_context;
                vpx_memset(mb_row_left_context, 0, sizeof(mb_row_left_context));
                    xd->up_available = (mb_row != 0);
                    xd->mb_to_top_edge = -((mb_row * 16)) << 3;
@@ -146,19 +209,16 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
                    for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
                    {
-
+                        if ((mb_col & 7) == 0)
-                    while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != pc->mb_cols - 1)
+                        {
                            while (mb_col > (*last_row_current_mb_col - 8) && *last_row_current_mb_col != pc->mb_cols - 1)
                            {
                                x86_pause_hint();
                                thread_sleep(0);
                            }
                        }
-                    // Take a copy of the mode and Mv information for this macroblock into the xd->mbmi
+                        if (xd->mode_info_context->mbmi.mode == SPLITMV || xd->mode_info_context->mbmi.mode == B_PRED)
                    // the partition_bmi array is unused in the decoder, so don't copy it.
                    vpx_memcpy(&xd->mbmi, &xd->mode_info_context->mbmi,
                               sizeof(MB_MODE_INFO) - sizeof(xd->mbmi.partition_bmi));
                    if (xd->mbmi.mode == SPLITMV || xd->mbmi.mode == B_PRED)
                        {
                            for (i = 0; i < 16; i++)
                            {
@@ -172,57 +232,42 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
                        xd->mb_to_left_edge = -((mb_col * 16) << 3);
                        xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-                    xd->dst.y_buffer = pc->new_frame.y_buffer + recon_yoffset;
+                        xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-                    xd->dst.u_buffer = pc->new_frame.u_buffer + recon_uvoffset;
+                        xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-                    xd->dst.v_buffer = pc->new_frame.v_buffer + recon_uvoffset;
+                        xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
                        xd->left_available = (mb_col != 0);
                        // Select the appropriate reference frame for this MB
-                    if (xd->mbmi.ref_frame == LAST_FRAME)
+                        if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-                    {
+                            ref_fb_idx = pc->lst_fb_idx;
-                        xd->pre.y_buffer = pc->last_frame.y_buffer + recon_yoffset;
+                        else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-                        xd->pre.u_buffer = pc->last_frame.u_buffer + recon_uvoffset;
+                            ref_fb_idx = pc->gld_fb_idx;
                        xd->pre.v_buffer = pc->last_frame.v_buffer + recon_uvoffset;
                    }
                    else if (xd->mbmi.ref_frame == GOLDEN_FRAME)
                    {
                        // Golden frame reconstruction buffer
                        xd->pre.y_buffer = pc->golden_frame.y_buffer + recon_yoffset;
                        xd->pre.u_buffer = pc->golden_frame.u_buffer + recon_uvoffset;
                        xd->pre.v_buffer = pc->golden_frame.v_buffer + recon_uvoffset;
                    }
                        else
-                    {
+                            ref_fb_idx = pc->alt_fb_idx;
-                        // Alternate reference frame reconstruction buffer
+
-                        xd->pre.y_buffer = pc->alt_ref_frame.y_buffer + recon_yoffset;
+                        xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-                        xd->pre.u_buffer = pc->alt_ref_frame.u_buffer + recon_uvoffset;
+                        xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-                        xd->pre.v_buffer = pc->alt_ref_frame.v_buffer + recon_uvoffset;
+                        xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
                    }
                        vp8_build_uvmvs(xd, pc->full_pixel);
                        vp8_decode_macroblock(pbi, xd);
                        recon_yoffset += 16;
                        recon_uvoffset += 8;
                        ++xd->mode_info_context;  /* next mb */
-                    xd->gf_active_ptr++;      // GF useage flag for next MB
+                        xd->above_context++;
                    xd->above_context[Y1CONTEXT] += 4;
                    xd->above_context[UCONTEXT ] += 2;
                    xd->above_context[VCONTEXT ] += 2;
                    xd->above_context[Y2CONTEXT] ++;
                    pbi->mb_row_di[ithread].current_mb_col = mb_col;
                        //pbi->mb_row_di[ithread].current_mb_col = mb_col;
                        pbi->current_mb_col[mb_row] = mb_col;
                    }
                    // adjust to the next row of mbs
                    vp8_extend_mb_row(
-                    &pc->new_frame,
+                    &pc->yv12_fb[dst_fb_idx],
                    xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
                    );
@@ -231,23 +276,27 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
                    // since we have multithread
                    xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
                //memcpy(&pbi->lpfmb, &pbi->mb, sizeof(pbi->mb));
                if ((mb_row & 1) == 1)
                {
                    pbi->last_mb_row_decoded = mb_row;
-                    //printf("S%d", pbi->last_mb_row_decoded);
+
                }
            }
        }
-                if (ithread == (pbi->decoding_thread_count - 1) || mb_row == pc->mb_rows - 1)
+        // If |pbi->common.filter_level| is 0 the value can change in-between
        // the sem_post and the check to call vp8_thread_loop_filter.
        current_filter_level = pbi->common.filter_level;
        //  add this to each frame
        if ((mbrd->mb_row == pbi->common.mb_rows-1) || ((mbrd->mb_row == pbi->common.mb_rows-2) && (pbi->common.mb_rows % (pbi->decoding_thread_count+1))==1))
        {
-                    //SetEvent(pbi->h_event_main);
+            //SetEvent(pbi->h_event_end_decoding);
-                    sem_post(&pbi->h_event_main);
+            sem_post(&pbi->h_event_end_decoding);
                }
            }
        }
        }
        if ((pbi->b_multithreaded_lf) && (current_filter_level))
            vp8_thread_loop_filter(pbi, mbrd, ithread);
    }
 #else
    (void) p_data;
 #endif
@@ -255,93 +304,60 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
    return 0 ;
 }
-THREAD_FUNCTION vp8_thread_loop_filter(void *p_data)
+
 void vp8_thread_loop_filter(VP8D_COMP *pbi, MB_ROW_DEC *mbrd, int ithread)
 {
 #if CONFIG_MULTITHREAD
    VP8D_COMP *pbi = (VP8D_COMP *)p_data;
-    while (1)
+        if (sem_wait(&pbi->h_event_start_lpf[ithread]) == 0)
        {
-        if (pbi->b_multithreaded_lf == 0)
+           // if (pbi->b_multithreaded_lf == 0) // we're shutting down      ????
-            break;
+           //     break;
-
+           // else
        //printf("before waiting for start_lpf\n");
        //if(WaitForSingleObject(pbi->h_event_start_lpf, INFINITE) == WAIT_OBJECT_0)
        if (sem_wait(&pbi->h_event_start_lpf) == 0)
            {
            if (pbi->b_multithreaded_lf == 0) // we're shutting down
                break;
            else
            {
                VP8_COMMON *cm  = &pbi->common;
-                MACROBLOCKD *mbd = &pbi->lpfmb;
+                MACROBLOCKD *mbd = &mbrd->mbd;
                int default_filt_lvl = pbi->common.filter_level;
-                YV12_BUFFER_CONFIG *post = &cm->new_frame;
+                YV12_BUFFER_CONFIG *post = cm->frame_to_show;
                loop_filter_info *lfi = cm->lf_info;
                //int frame_type = cm->frame_type;
                int mb_row;
                int mb_col;
                int baseline_filter_level[MAX_MB_SEGMENTS];
                int filter_level;
                int alt_flt_enabled = mbd->segmentation_enabled;
                int i;
                unsigned char *y_ptr, *u_ptr, *v_ptr;
-                volatile int *last_mb_row_decoded = &pbi->last_mb_row_decoded;
+                volatile int *last_row_current_mb_col;
                //MODE_INFO * this_mb_mode_info = cm->mi;
                mbd->mode_info_context = cm->mi;          // Point at base of Mb MODE_INFO list
                // Note the baseline filter values for each segment
                if (alt_flt_enabled)
                {
                    for (i = 0; i < MAX_MB_SEGMENTS; i++)
                    {
                        if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
                            baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
                        else
                        {
                            baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
                            baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  // Clamp to valid range
                        }
                    }
                }
                else
                {
                    for (i = 0; i < MAX_MB_SEGMENTS; i++)
                        baseline_filter_level[i] = default_filt_lvl;
                }
                // Initialize the loop filter for this frame.
                vp8_init_loop_filter(cm);
                // Set up the buffer pointers
-                y_ptr = post->y_buffer;
+                y_ptr = post->y_buffer + post->y_stride  * 16 * (ithread +1);
-                u_ptr = post->u_buffer;
+                u_ptr = post->u_buffer + post->uv_stride *  8 * (ithread +1);
-                v_ptr = post->v_buffer;
+                v_ptr = post->v_buffer + post->uv_stride *  8 * (ithread +1);
                // vp8_filter each macro block
-                for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+                for (mb_row = ithread+1; mb_row < cm->mb_rows; mb_row+= (pbi->decoding_thread_count + 1))
                {
                    last_row_current_mb_col = &pbi->current_mb_col[mb_row -1];
                    while (mb_row >= *last_mb_row_decoded)
                    {
                        x86_pause_hint();
                        thread_sleep(0);
                    }
                    //printf("R%d", mb_row);
                    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
                    {
                        int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
-                        filter_level = baseline_filter_level[Segment];
+                        if ((mb_col & 7) == 0)
                        {
                            while (mb_col > (*last_row_current_mb_col-8) && *last_row_current_mb_col != cm->mb_cols - 1)
                            {
                                x86_pause_hint();
                                thread_sleep(0);
                            }
                        }
                        filter_level = pbi->mt_baseline_filter_level[Segment];
                        // Apply any context driven MB level adjustment
                        vp8_adjust_mb_lf_value(mbd, &filter_level);
@@ -367,29 +383,28 @@ THREAD_FUNCTION vp8_thread_loop_filter(void *p_data)
                        v_ptr += 8;
                        mbd->mode_info_context++;     // step to next MB
-
+                        pbi->current_mb_col[mb_row] = mb_col;
                    }
                    y_ptr += post->y_stride  * 16 - post->y_width;
                    u_ptr += post->uv_stride *  8 - post->uv_width;
                    v_ptr += post->uv_stride *  8 - post->uv_width;
                    mbd->mode_info_context++;         // Skip border mb
                }
-                //printf("R%d\n", mb_row);
+                    y_ptr += post->y_stride  * 16 * (pbi->decoding_thread_count + 1) - post->y_width;
-                // When done, signal main thread that ME is finished
+                    u_ptr += post->uv_stride *  8 * (pbi->decoding_thread_count + 1) - post->uv_width;
-                //SetEvent(pbi->h_event_lpf);
+                    v_ptr += post->uv_stride *  8 * (pbi->decoding_thread_count + 1) - post->uv_width;
                sem_post(&pbi->h_event_lpf);
            }
                    mbd->mode_info_context += pbi->decoding_thread_count * mbd->mode_info_stride;         // Skip border mb
                }
            }
        }
        //  add this to each frame
        if ((mbrd->mb_row == pbi->common.mb_rows-1) || ((mbrd->mb_row == pbi->common.mb_rows-2) && (pbi->common.mb_rows % (pbi->decoding_thread_count+1))==1))
        {
          sem_post(&pbi->h_event_end_lpf);
        }
 #else
-    (void) p_data;
+    (void) pbi;
 #endif
    return 0;
 }
 void vp8_decoder_create_threads(VP8D_COMP *pbi)
@@ -401,39 +416,38 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
    pbi->b_multithreaded_rd = 0;
    pbi->b_multithreaded_lf = 0;
    pbi->allocated_decoding_thread_count = 0;
-    core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads; //vp8_get_proc_core_count();
+    core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads;
    if (core_count > 1)
    {
        sem_init(&pbi->h_event_lpf, 0, 0);
        sem_init(&pbi->h_event_start_lpf, 0, 0);
        pbi->b_multithreaded_lf = 1;
        pthread_create(&pbi->h_thread_lpf, 0, vp8_thread_loop_filter, (pbi));
    }
    if (core_count > 1)
    {
        pbi->b_multithreaded_rd = 1;
        pbi->b_multithreaded_lf = 1;  // this can be merged with pbi->b_multithreaded_rd ?
        pbi->decoding_thread_count = core_count -1;
        CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
-        CHECK_MEM_ERROR(pbi->h_event_mbrdecoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
+        CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
        CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
        vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
        CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));
        CHECK_MEM_ERROR(pbi->current_mb_col, vpx_malloc(sizeof(int) * MAX_ROWS));  // pc->mb_rows));
        CHECK_MEM_ERROR(pbi->h_event_start_lpf, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
        for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
        {
-            sem_init(&pbi->h_event_mbrdecoding[ithread], 0, 0);
+            sem_init(&pbi->h_event_start_decoding[ithread], 0, 0);
            sem_init(&pbi->h_event_start_lpf[ithread], 0, 0);
            pbi->de_thread_data[ithread].ithread  = ithread;
            pbi->de_thread_data[ithread].ptr1     = (void *)pbi;
            pbi->de_thread_data[ithread].ptr2     = (void *) &pbi->mb_row_di[ithread];
            pthread_create(&pbi->h_decoding_thread[ithread], 0, vp8_thread_decoding_proc, (&pbi->de_thread_data[ithread]));
        }
-        sem_init(&pbi->h_event_main, 0, 0);
+        sem_init(&pbi->h_event_end_decoding, 0, 0);
        sem_init(&pbi->h_event_end_lpf, 0, 0);
        pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
    }
@@ -448,39 +462,35 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
    if (pbi->b_multithreaded_lf)
    {
        int i;
        pbi->b_multithreaded_lf = 0;
-        sem_post(&pbi->h_event_start_lpf);
+
-        pthread_join(pbi->h_thread_lpf, 0);
+        for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
-        sem_destroy(&pbi->h_event_start_lpf);
+            sem_destroy(&pbi->h_event_start_lpf[i]);
        sem_destroy(&pbi->h_event_end_lpf);
    }
    //shutdown MB Decoding thread;
    if (pbi->b_multithreaded_rd)
    {
        pbi->b_multithreaded_rd = 0;
        // allow all threads to exit
    {
        int i;
        pbi->b_multithreaded_rd = 0;
        // allow all threads to exit
        for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
        {
-
+            sem_post(&pbi->h_event_start_decoding[i]);
                sem_post(&pbi->h_event_mbrdecoding[i]);
            pthread_join(pbi->h_decoding_thread[i], NULL);
        }
        }
        {
            int i;
        for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
        {
-                sem_destroy(&pbi->h_event_mbrdecoding[i]);
+            sem_destroy(&pbi->h_event_start_decoding[i]);
        }
-
+        sem_destroy(&pbi->h_event_end_decoding);
        }
        sem_destroy(&pbi->h_event_main);
        if (pbi->h_decoding_thread)
        {
@@ -488,10 +498,16 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
            pbi->h_decoding_thread = NULL;
        }
-        if (pbi->h_event_mbrdecoding)
+        if (pbi->h_event_start_decoding)
        {
-            vpx_free(pbi->h_event_mbrdecoding);
+            vpx_free(pbi->h_event_start_decoding);
-            pbi->h_event_mbrdecoding = NULL;
+            pbi->h_event_start_decoding = NULL;
        }
        if (pbi->h_event_start_lpf)
        {
            vpx_free(pbi->h_event_start_lpf);
            pbi->h_event_start_lpf = NULL;
        }
        if (pbi->mb_row_di)
@@ -505,8 +521,13 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
            vpx_free(pbi->de_thread_data);
            pbi->de_thread_data = NULL;
        }
    }
        if (pbi->current_mb_col)
        {
            vpx_free(pbi->current_mb_col);
            pbi->current_mb_col = NULL ;
        }
    }
 #else
    (void) pbi;
 #endif
@@ -516,9 +537,12 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
 void vp8_start_lfthread(VP8D_COMP *pbi)
 {
 #if CONFIG_MULTITHREAD
  /*
    memcpy(&pbi->lpfmb, &pbi->mb, sizeof(pbi->mb));
    pbi->last_mb_row_decoded = 0;
    sem_post(&pbi->h_event_start_lpf);
    */
    (void) pbi;
 #else
    (void) pbi;
 #endif
@@ -527,14 +551,17 @@ void vp8_start_lfthread(VP8D_COMP *pbi)
 void vp8_stop_lfthread(VP8D_COMP *pbi)
 {
 #if CONFIG_MULTITHREAD
  /*
    struct vpx_usec_timer timer;
    vpx_usec_timer_start(&timer);
-    sem_wait(&pbi->h_event_lpf);
+    sem_wait(&pbi->h_event_end_lpf);
    vpx_usec_timer_mark(&timer);
    pbi->time_loop_filtering += vpx_usec_timer_elapsed(&timer);
    */
    (void) pbi;
 #else
    (void) pbi;
 #endif
@@ -550,49 +577,246 @@ void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
    int ibc = 0;
    int num_part = 1 << pbi->common.multi_token_partition;
    int i;
    volatile int *last_row_current_mb_col = NULL;
    vp8_setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
    for (i = 0; i < pbi->decoding_thread_count; i++)
        sem_post(&pbi->h_event_start_decoding[i]);
    for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
    {
        int i;
        pbi->current_mb_col_main = -1;
-        xd->current_bc = &pbi->mbc[ibc];
+        xd->current_bc = &pbi->mbc[mb_row%num_part];
        ibc++ ;
-        if (ibc == num_part)
+        //vp8_decode_mb_row(pbi, pc, mb_row, xd);
            ibc = 0;
        for (i = 0; i < pbi->decoding_thread_count; i++)
        {
-            if ((mb_row + i + 1) >= pc->mb_rows)
+            int i;
-                break;
+            int recon_yoffset, recon_uvoffset;
            int mb_col;
            int ref_fb_idx = pc->lst_fb_idx;
            int dst_fb_idx = pc->new_fb_idx;
            int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
            int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
-            pbi->mb_row_di[i].mb_row = mb_row + i + 1;
+           // volatile int *last_row_current_mb_col = NULL;
-            pbi->mb_row_di[i].mbd.current_bc =  &pbi->mbc[ibc];
+            if (mb_row > 0)
-            ibc++;
+                last_row_current_mb_col = &pbi->current_mb_col[mb_row -1];
-            if (ibc == num_part)
+            vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
-                ibc = 0;
+            recon_yoffset = mb_row * recon_y_stride * 16;
            recon_uvoffset = mb_row * recon_uv_stride * 8;
            // reset above block coeffs
-            pbi->mb_row_di[i].current_mb_col = -1;
+            xd->above_context = pc->above_context;
-            sem_post(&pbi->h_event_mbrdecoding[i]);
+            xd->up_available = (mb_row != 0);
        }
-        vp8_decode_mb_row(pbi, pc, mb_row, xd);
+            xd->mb_to_top_edge = -((mb_row * 16)) << 3;
            xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-        xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+            for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
        if (mb_row < pc->mb_rows - 1)
            {
-            sem_wait(&pbi->h_event_main);
+                if ( mb_row > 0 && (mb_col & 7) == 0){
                    while (mb_col > (*last_row_current_mb_col - 8) && *last_row_current_mb_col != pc->mb_cols - 1)
                    {
                        x86_pause_hint();
                        thread_sleep(0);
                    }
                }
                if (xd->mode_info_context->mbmi.mode == SPLITMV || xd->mode_info_context->mbmi.mode == B_PRED)
                {
                    for (i = 0; i < 16; i++)
                    {
                        BLOCKD *d = &xd->block[i];
                        vpx_memcpy(&d->bmi, &xd->mode_info_context->bmi[i], sizeof(B_MODE_INFO));
                    }
                }
                // Distance of Mb to the various image edges.
                // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
                xd->mb_to_left_edge = -((mb_col * 16) << 3);
                xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
                xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
                xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
                xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
                xd->left_available = (mb_col != 0);
                // Select the appropriate reference frame for this MB
                if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
                    ref_fb_idx = pc->lst_fb_idx;
                else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
                    ref_fb_idx = pc->gld_fb_idx;
                else
                    ref_fb_idx = pc->alt_fb_idx;
                xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
                xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
                xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
                vp8_build_uvmvs(xd, pc->full_pixel);
                vp8_decode_macroblock(pbi, xd);
                recon_yoffset += 16;
                recon_uvoffset += 8;
                ++xd->mode_info_context;  /* next mb */
                xd->above_context++;
                //pbi->current_mb_col_main = mb_col;
                pbi->current_mb_col[mb_row] = mb_col;
            }
            // adjust to the next row of mbs
            vp8_extend_mb_row(
                &pc->yv12_fb[dst_fb_idx],
                xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
            );
            ++xd->mode_info_context;      /* skip prediction column */
            pbi->last_mb_row_decoded = mb_row;
        }
        xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
    }
    sem_wait(&pbi->h_event_end_decoding);   // add back for each frame
 #else
    (void) pbi;
    (void) xd;
 #endif
 }
 // Main-thread half of the multithreaded loop filter for one frame.
 //
 // Computes the per-segment baseline filter levels into
 // pbi->mt_baseline_filter_level, (re)initializes the loop filter when needed,
 // posts h_event_start_lpf once for each of the pbi->decoding_thread_count
 // worker slots, and then filters macroblock rows 0, N+1, 2*(N+1), ... itself
 // (N = pbi->decoding_thread_count). Before returning it blocks on
 // pbi->h_event_end_lpf.
 //
 // Progress is shared between rows through pbi->current_mb_col[]: each row
 // records the last column it finished, and a row polls the entry of the row
 // directly above it before filtering further to the right.
 //
 // pbi: decoder instance; the frame filtered is pbi->common.frame_to_show.
 //
 // When CONFIG_MULTITHREAD is not set the function does nothing.
 void vp8_mt_loop_filter_frame( VP8D_COMP *pbi)
 {
 #if CONFIG_MULTITHREAD
    VP8_COMMON *cm  = &pbi->common;
    MACROBLOCKD *mbd = &pbi->mb;
    int default_filt_lvl = pbi->common.filter_level;
    YV12_BUFFER_CONFIG *post = cm->frame_to_show;
    loop_filter_info *lfi = cm->lf_info;
    int frame_type = cm->frame_type;
    int mb_row;
    int mb_col;
    int filter_level;
    int alt_flt_enabled = mbd->segmentation_enabled;
    int i;
    unsigned char *y_ptr, *u_ptr, *v_ptr;
    // Stays NULL while mb_row == 0; every dereference below is guarded by mb_row > 0.
    // NOTE(review): cross-thread progress is communicated through a volatile int
    // with no explicit barrier or atomic -- confirm this is sufficient on all targets.
    volatile int *last_row_current_mb_col=NULL;
    vp8_setup_loop_filter_thread_data(pbi, mbd, pbi->mb_row_di, pbi->decoding_thread_count);
    mbd->mode_info_context = cm->mi;          // Point at base of Mb MODE_INFO list
    // Note the baseline filter values for each segment
    if (alt_flt_enabled)
    {
        for (i = 0; i < MAX_MB_SEGMENTS; i++)
        {
            // Abs value: the segment data is used directly as the filter level
            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
                pbi->mt_baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
            // Delta Value: the segment data is added to the frame-level default
            else
            {
                pbi->mt_baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
                pbi->mt_baseline_filter_level[i] = (pbi->mt_baseline_filter_level[i] >= 0) ? ((pbi->mt_baseline_filter_level[i] <= MAX_LOOP_FILTER) ? pbi->mt_baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  // Clamp to valid range
            }
        }
    }
    else
    {
        // Segmentation disabled: every segment uses the frame-level default
        for (i = 0; i < MAX_MB_SEGMENTS; i++)
            pbi->mt_baseline_filter_level[i] = default_filt_lvl;
    }
    // Initialize the loop filter for this frame: full init when the filter type
    // or sharpness differs from the last frame, otherwise only the per-frame-type
    // init when the frame type changed.
    if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
        vp8_init_loop_filter(cm);
    else if (frame_type != cm->last_frame_type)
        vp8_frame_init_loop_filter(lfi, frame_type);
    // Release the workers; this must happen after the baseline levels and the
    // filter tables above are set up, since the workers read them.
    for (i = 0; i < pbi->decoding_thread_count; i++)
        sem_post(&pbi->h_event_start_lpf[i]);
        // sem_post(&pbi->h_event_start_lpf);
    // Set up the buffer pointers
    y_ptr = post->y_buffer;
    u_ptr = post->u_buffer;
    v_ptr = post->v_buffer;
    // vp8_filter each macro block of the rows owned by this thread
    for (mb_row = 0; mb_row < cm->mb_rows; mb_row+= (pbi->decoding_thread_count + 1))
    {
        // Progress counter of the row directly above this one
        if (mb_row > 0)
            last_row_current_mb_col = &pbi->current_mb_col[mb_row -1];
        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
        {
            int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
            // Checked only on every 8th column: spin until the row above has
            // recorded a column of at least mb_col + 8, or has recorded its
            // final column (mb_cols - 1).
            if ( mb_row > 0 && (mb_col & 7) == 0){
            // if ( mb_row > 0 ){
                while (mb_col > (*last_row_current_mb_col-8) && *last_row_current_mb_col != cm->mb_cols - 1)
                {
                    x86_pause_hint();
                    thread_sleep(0);
                }
            }
            filter_level = pbi->mt_baseline_filter_level[Segment];
            // Apply any context driven MB level adjustment
            vp8_adjust_mb_lf_value(mbd, &filter_level);
            // A resulting level of 0 leaves this macroblock unfiltered
            if (filter_level)
            {
                // lf_mbv is skipped for the first column
                if (mb_col > 0)
                    cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
                if (mbd->mode_info_context->mbmi.dc_diff > 0)
                    cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
                // don't apply across umv border
                if (mb_row > 0)
                    cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
                if (mbd->mode_info_context->mbmi.dc_diff > 0)
                    cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
            }
            // Advance one macroblock to the right in each plane
            y_ptr += 16;
            u_ptr += 8;
            v_ptr += 8;
            mbd->mode_info_context++;     // step to next MB
            // Publish progress for whoever polls this row's entry
            pbi->current_mb_col[mb_row] = mb_col;
        }
        mbd->mode_info_context++;         // Skip border mb
        //update for multi-thread: move down (decoding_thread_count + 1) MB rows
        //and back by the plane width (assumes the width equals the distance
        //stepped across the row above -- TODO confirm), then skip the
        //mode-info rows that belong to the other threads.
        y_ptr += post->y_stride  * 16 * (pbi->decoding_thread_count + 1) - post->y_width;
        u_ptr += post->uv_stride *  8 * (pbi->decoding_thread_count + 1) - post->uv_width;
        v_ptr += post->uv_stride *  8 * (pbi->decoding_thread_count + 1) - post->uv_width;
        mbd->mode_info_context += pbi->decoding_thread_count * mbd->mode_info_stride;
    }
    // Block until h_event_end_lpf is posted (presumably by the thread that
    // filters the last row -- verify against vp8_thread_loop_filter).
    sem_wait(&pbi->h_event_end_lpf);
 #else
    (void) pbi;
 #endif
 }
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/decoder/x86/dequantize_mmx.asm
@@ -50,12 +50,12 @@ sym(vp8_dequantize_b_impl_mmx):
    ret
-;void dequant_idct_mmx(short *input, short *dq, short *output, int pitch)
+;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
-global sym(vp8_dequant_idct_mmx)
+global sym(vp8_dequant_idct_add_mmx)
-sym(vp8_dequant_idct_mmx):
+sym(vp8_dequant_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
@@ -77,7 +77,8 @@ sym(vp8_dequant_idct_mmx):
        movq        mm3,    [rax+24]
        pmullw      mm3,    [rdx+24]
-        mov         rdx,    arg(2) ;output
+        mov         rdx,    arg(3) ;dest
        mov         rsi,    arg(2) ;pred
        pxor        mm7,    mm7
@@ -88,7 +89,8 @@ sym(vp8_dequant_idct_mmx):
        movq        [rax+24],mm7
-        movsxd      rax,            dword ptr arg(3) ;pitch
+        movsxd      rax,            dword ptr arg(4) ;pitch
        movsxd      rdi,            dword ptr arg(5) ;stride
        psubw       mm0,            mm2             ; b1= 0-2
        paddw       mm2,            mm2             ;
@@ -207,13 +209,34 @@ sym(vp8_dequant_idct_mmx):
        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03
-        movq        [rdx],          mm0
+        pxor        mm7,            mm7
-        movq        [rdx+rax],      mm1
+        movd        mm4,            [rsi]
-        movq        [rdx+rax*2],    mm2
+        punpcklbw   mm4,            mm7
        paddsw      mm0,            mm4
        packuswb    mm0,            mm7
        movd        [rdx],          mm0
-        add         rdx,            rax
+        movd        mm4,            [rsi+rax]
-        movq        [rdx+rax*2],    mm5
+        punpcklbw   mm4,            mm7
        paddsw      mm1,            mm4
        packuswb    mm1,            mm7
        movd        [rdx+rdi],      mm1
        movd        mm4,            [rsi+2*rax]
        punpcklbw   mm4,            mm7
        paddsw      mm2,            mm4
        packuswb    mm2,            mm7
        movd        [rdx+rdi*2],    mm2
        add         rdx,            rdi
        add         rsi,            rax
        movd        mm4,            [rsi+2*rax]
        punpcklbw   mm4,            mm7
        paddsw      mm5,            mm4
        packuswb    mm5,            mm7
        movd        [rdx+rdi*2],    mm5
    ; begin epilog
    pop rdi
@@ -224,12 +247,12 @@ sym(vp8_dequant_idct_mmx):
    ret
-;void dequant_dc_idct_mmx(short *input, short *dq, short *output, int pitch, int Dc)
+;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
-global sym(vp8_dequant_dc_idct_mmx)
+global sym(vp8_dequant_dc_idct_add_mmx)
-sym(vp8_dequant_dc_idct_mmx):
+sym(vp8_dequant_dc_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
+    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
@@ -238,8 +261,6 @@ sym(vp8_dequant_dc_idct_mmx):
        mov         rax,    arg(0) ;input
        mov         rdx,    arg(1) ;dq
        movsxd      rcx,    dword ptr arg(4) ;Dc
        movq        mm0,    [rax   ]
        pmullw      mm0,    [rdx]
@@ -252,7 +273,8 @@ sym(vp8_dequant_dc_idct_mmx):
        movq        mm3,    [rax+24]
        pmullw      mm3,    [rdx+24]
-        mov         rdx,    arg(2) ;output
+        mov         rdx,    arg(3) ;dest
        mov         rsi,    arg(2) ;pred
        pxor        mm7,    mm7
@@ -262,8 +284,15 @@ sym(vp8_dequant_dc_idct_mmx):
        movq        [rax+16],mm7
        movq        [rax+24],mm7
-        pinsrw      mm0,    rcx,  0
+        ; move lower word of Dc to lower word of mm0
-        movsxd      rax,            dword ptr arg(3) ;pitch
+        psrlq       mm0,    16
        movzx       rcx,    word ptr arg(6) ;Dc
        psllq       mm0,    16
        movd        mm7,    rcx
        por         mm0,    mm7
        movsxd      rax,            dword ptr arg(4) ;pitch
        movsxd      rdi,            dword ptr arg(5) ;stride
        psubw       mm0,            mm2             ; b1= 0-2
        paddw       mm2,            mm2             ;
@@ -382,13 +411,34 @@ sym(vp8_dequant_dc_idct_mmx):
        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03
-        movq        [rdx],          mm0
+        pxor        mm7,            mm7
-        movq        [rdx+rax],      mm1
+        movd        mm4,            [rsi]
-        movq        [rdx+rax*2],    mm2
+        punpcklbw   mm4,            mm7
        paddsw      mm0,            mm4
        packuswb    mm0,            mm7
        movd        [rdx],          mm0
-        add         rdx,            rax
+        movd        mm4,            [rsi+rax]
-        movq        [rdx+rax*2],    mm5
+        punpcklbw   mm4,            mm7
        paddsw      mm1,            mm4
        packuswb    mm1,            mm7
        movd        [rdx+rdi],      mm1
        movd        mm4,            [rsi+2*rax]
        punpcklbw   mm4,            mm7
        paddsw      mm2,            mm4
        packuswb    mm2,            mm7
        movd        [rdx+rdi*2],    mm2
        add         rdx,            rdi
        add         rsi,            rax
        movd        mm4,            [rsi+2*rax]
        punpcklbw   mm4,            mm7
        paddsw      mm5,            mm4
        packuswb    mm5,            mm7
        movd        [rdx+rdi*2],    mm5
    ; begin epilog
    pop rdi
--- a/vp8/decoder/x86/dequantize_x86.h
+++ b/vp8/decoder/x86/dequantize_x86.h
@@ -21,19 +21,48 @@
 */
 #if HAVE_MMX
 extern prototype_dequant_block(vp8_dequantize_b_mmx);
-extern prototype_dequant_idct(vp8_dequant_idct_mmx);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
-extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_mmx);
+extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
-
+extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx);
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_mmx
-#undef  vp8_dequant_idct
+#undef  vp8_dequant_idct_add
-#define vp8_dequant_idct vp8_dequant_idct_mmx
+#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
-#undef  vp8_dequant_idct_dc
+#undef  vp8_dequant_dc_idct_add
-#define vp8_dequant_idct_dc vp8_dequant_dc_idct_mmx
+#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx
 #undef vp8_dequant_dc_idct_add_y_block
 #define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx
 #undef vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx
 #undef vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_mmx
 #endif
 #endif
 #if HAVE_SSE2
 extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2);
 extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);
 extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef vp8_dequant_dc_idct_add_y_block
 #define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2
 #undef vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
 #undef vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2
 #endif
 #endif
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ b/vp8/decoder/x86/idct_blk_mmx.c
@@ -0,0 +1,151 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_ports/config.h"
 #include "idct.h"
 #include "dequantize.h"
 /* Reconstructs a 4x4 grid of 4x4 luma blocks, using the DC values supplied
  * in dc[] (one per block, in raster order).
  *
  * For each block: if its eob exceeds 1 the full dequant+IDCT+add routine is
  * used, otherwise only the DC term is added to the predictor.  The predictor
  * buffer is walked with a pitch of 16, the destination with `stride`.
  */
 void vp8_dequant_dc_idct_add_y_block_mmx
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dst, int stride, char *eobs, short *dc)
 {
    int row, col;
    for (row = 0; row < 4; row++)
    {
        for (col = 0; col < 4; col++)
        {
            /* Each block owns 16 coefficients and 4 pixel columns. */
            short         *coeffs = q + 16 * col;
            unsigned char *pred   = pre + 4 * col;
            unsigned char *out    = dst + 4 * col;
            if (eobs[col] > 1)
                vp8_dequant_dc_idct_add_mmx (coeffs, dq, pred, out, 16, stride, dc[col]);
            else
                vp8_dc_only_idct_add_mmx (dc[col], pred, out, 16, stride);
        }
        /* Step to the next row of four blocks. */
        eobs += 4;
        dc   += 4;
        q    += 64;
        pre  += 64;
        dst  += 4*stride;
    }
 }
 /* Reconstructs a 4x4 grid of 4x4 luma blocks whose DC coefficient is stored
  * in-line in q[] (no separate dc[] array).
  *
  * For each block: if its eob exceeds 1 the full dequant+IDCT+add routine is
  * used; otherwise only the dequantised DC term (q[0]*dq[0]) is added to the
  * predictor and the first two coefficients are cleared.  The predictor
  * buffer is walked with a pitch of 16, the destination with `stride`.
  */
 void vp8_dequant_idct_add_y_block_mmx
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dst, int stride, char *eobs)
 {
    int row, col;
    for (row = 0; row < 4; row++)
    {
        for (col = 0; col < 4; col++)
        {
            /* Each block owns 16 coefficients and 4 pixel columns. */
            short         *coeffs = q + 16 * col;
            unsigned char *pred   = pre + 4 * col;
            unsigned char *out    = dst + 4 * col;
            if (eobs[col] > 1)
                vp8_dequant_idct_add_mmx (coeffs, dq, pred, out, 16, stride);
            else
            {
                vp8_dc_only_idct_add_mmx (coeffs[0]*dq[0], pred, out, 16, stride);
                /* Clear the same two coefficients the former 32-bit store
                 * cleared, but through the array's own type so the write
                 * does not violate the strict-aliasing rule.
                 */
                coeffs[0] = 0;
                coeffs[1] = 0;
            }
        }
        /* Step to the next row of four blocks. */
        q    += 64;
        pre  += 64;
        dst  += 4*stride;
        eobs += 4;
    }
 }
 /* Reconstructs the two chroma planes: a 2x2 grid of 4x4 blocks written to
  * dstu, followed by a 2x2 grid written to dstv.  q, pre and eobs run
  * continuously across both planes.
  *
  * For each block: if its eob exceeds 1 the full dequant+IDCT+add routine is
  * used; otherwise only the dequantised DC term (q[0]*dq[0]) is added to the
  * predictor and the first two coefficients are cleared.  The predictor
  * buffer is walked with a pitch of 8, the destination with `stride`.
  */
 void vp8_dequant_idct_add_uv_block_mmx
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
 {
    unsigned char *planes[2];
    int plane, row, col;
    planes[0] = dstu;
    planes[1] = dstv;
    for (plane = 0; plane < 2; plane++)
    {
        unsigned char *dst = planes[plane];
        for (row = 0; row < 2; row++)
        {
            for (col = 0; col < 2; col++)
            {
                /* Each block owns 16 coefficients and 4 pixel columns. */
                short         *coeffs = q + 16 * col;
                unsigned char *pred   = pre + 4 * col;
                unsigned char *out    = dst + 4 * col;
                if (eobs[col] > 1)
                    vp8_dequant_idct_add_mmx (coeffs, dq, pred, out, 8, stride);
                else
                {
                    vp8_dc_only_idct_add_mmx (coeffs[0]*dq[0], pred, out, 8, stride);
                    /* Clear the same two coefficients the former 32-bit
                     * store cleared, but through the array's own type so
                     * the write does not violate the strict-aliasing rule.
                     */
                    coeffs[0] = 0;
                    coeffs[1] = 0;
                }
            }
            /* Step to the next row of two blocks. */
            q    += 32;
            pre  += 32;
            dst  += 4*stride;
            eobs += 2;
        }
    }
 }
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ b/vp8/decoder/x86/idct_blk_sse2.c
@@ -0,0 +1,114 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_ports/config.h"
 #include "idct.h"
 #include "dequantize.h"
 /* Prototypes for the two-blocks-at-a-time SSE2 kernels called by the
  * wrappers in this file; their implementations are not in this file.
  * The wrappers call a "_full_" variant when either block of the pair has
  * an eob with bits above bit 0 set, and the matching "_0_" variant
  * otherwise.
  */
 /* DC-from-array variants: take a pointer to the DC values of the pair. */
 void idct_dequant_dc_0_2x_sse2
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dst, int dst_stride, short *dc);
 void idct_dequant_dc_full_2x_sse2
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dst, int dst_stride, short *dc);
 /* In-line DC variants: the wrappers pass 16 (luma) or 8 (chroma) as
  * blk_stride.
  */
 void idct_dequant_0_2x_sse2
             (short *q, short *dq ,unsigned char *pre,
              unsigned char *dst, int dst_stride, int blk_stride);
 void idct_dequant_full_2x_sse2
             (short *q, short *dq ,unsigned char *pre,
              unsigned char *dst, int dst_stride, int blk_stride);
 /* Reconstructs a 4x4 grid of 4x4 luma blocks, two horizontally adjacent
  * blocks per kernel call, using the DC values supplied in dc[].
  *
  * A pair goes through the full kernel when either of its eobs has any bit
  * above bit 0 set (i.e. is neither 0 nor 1); otherwise the "_0_" kernel is
  * used.  The test is done byte-wise on the char array rather than by
  * reading it through a short pointer, which avoids the strict-aliasing and
  * alignment problems of the cast while selecting exactly the same branch.
  */
 void vp8_dequant_dc_idct_add_y_block_sse2
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dst, int stride, char *eobs, short *dc)
 {
    int i;
    for (i = 0; i < 4; i++)
    {
        /* Left pair: blocks 0 and 1 of this row. */
        if ((eobs[0] | eobs[1]) & 0xfe)
            idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc);
        else
            idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc);
        /* Right pair: blocks 2 and 3 of this row. */
        if ((eobs[2] | eobs[3]) & 0xfe)
            idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
        else
            idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
        q    += 64;
        dc   += 4;
        pre  += 64;
        dst  += stride*4;
        eobs += 4;
    }
 }
 /* Reconstructs a 4x4 grid of 4x4 luma blocks, two horizontally adjacent
  * blocks per kernel call, with the DC coefficient taken from q[].
  *
  * A pair goes through the full kernel when either of its eobs has any bit
  * above bit 0 set (i.e. is neither 0 nor 1); otherwise the "_0_" kernel is
  * used.  The test is done byte-wise on the char array rather than by
  * reading it through a short pointer, which avoids the strict-aliasing and
  * alignment problems of the cast while selecting exactly the same branch.
  */
 void vp8_dequant_idct_add_y_block_sse2
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dst, int stride, char *eobs)
 {
    int i;
    for (i = 0; i < 4; i++)
    {
        /* Left pair: blocks 0 and 1 of this row. */
        if ((eobs[0] | eobs[1]) & 0xfe)
            idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16);
        else
            idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16);
        /* Right pair: blocks 2 and 3 of this row. */
        if ((eobs[2] | eobs[3]) & 0xfe)
            idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
        else
            idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
        q    += 64;
        pre  += 64;
        dst  += stride*4;
        eobs += 4;
    }
 }
 /* Reconstructs both chroma planes, two horizontally adjacent 4x4 blocks
  * per kernel call: two pairs into dstu, then two pairs into dstv.  q and
  * pre run continuously across both planes; eobs holds one entry per block
  * (eight in total).
  *
  * A pair goes through the full kernel when either of its eobs has any bit
  * above bit 0 set (i.e. is neither 0 nor 1); otherwise the "_0_" kernel is
  * used.  The test is done byte-wise on the char array rather than by
  * reading it through a short pointer, which avoids the strict-aliasing and
  * alignment problems of the cast while selecting exactly the same branch.
  */
 void vp8_dequant_idct_add_uv_block_sse2
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
 {
    /* U plane, top pair. */
    if ((eobs[0] | eobs[1]) & 0xfe)
        idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
    else
        idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
    q    += 32;
    pre  += 32;
    dstu += stride*4;
    /* U plane, bottom pair. */
    if ((eobs[2] | eobs[3]) & 0xfe)
        idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
    else
        idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
    q    += 32;
    pre  += 32;
    /* V plane, top pair. */
    if ((eobs[4] | eobs[5]) & 0xfe)
        idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
    else
        idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
    q    += 32;
    pre  += 32;
    dstv += stride*4;
    /* V plane, bottom pair. */
    if ((eobs[6] | eobs[7]) & 0xfe)
        idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
    else
        idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
 }
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -39,14 +39,24 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
 #if CONFIG_RUNTIME_CPU_DETECT
    /* Override default functions with fastest ones for this CPU. */
 #if HAVE_MMX
    if (flags & HAS_MMX)
    {
        pbi->dequant.block               = vp8_dequantize_b_mmx;
-        pbi->dequant.idct    = vp8_dequant_idct_mmx;
+        pbi->dequant.idct_add            = vp8_dequant_idct_add_mmx;
-        pbi->dequant.idct_dc = vp8_dequant_dc_idct_mmx;
+        pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_mmx;
        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;
        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_mmx;
        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_mmx;
    }
 #endif
 #if HAVE_SSE2
    if (flags & HAS_SSE2)
    {
        pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2;
        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;
        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;
    }
 #endif
 #endif
 #endif
 }
--- a/vp8/dixie/bool_decoder.h
+++ b/vp8/dixie/bool_decoder.h
@@ -1,144 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license and
 *  patent grant that can be found in the LICENSE file in the root of
 *  the source tree. All contributing project authors may be found in
 *  the AUTHORS file in the root of the source tree.
 */
 #ifndef BOOL_DECODER_H
 #define BOOL_DECODER_H
 #include <stddef.h>
 /* State of the boolean entropy decoder.  Set up by init_bool_decoder()
  * and advanced by every call to bool_get().
  */
 struct bool_decoder
 {
    const unsigned char *input;      /* next compressed data byte to be
                                      * shifted in (NULL when the buffer
                                      * was shorter than two bytes) */
    size_t               input_len;  /* number of bytes remaining at
                                      * input; decremented as bytes are
                                      * consumed */
    unsigned int         range;      /* identical to encoder's range */
    unsigned int         value;      /* contains at least 8 significant
                                      * bits */
    int                  bit_count;  /* # of bits shifted out of value,
                                      * max 7 */
 };
 /* Prepares *d to decode the partition of sz bytes at start_partition.
  *
  * The first two bytes prime the value register.  A partition shorter than
  * two bytes yields a decoder with a zero value and no further input.
  */
 static void
 init_bool_decoder(struct bool_decoder *d,
                   const unsigned char *start_partition,
                   size_t               sz)
 {
    if (sz < 2)
    {
        /* Not enough data to prime the decoder. */
        d->input     = NULL;
        d->input_len = 0;
        d->value     = 0;
    }
    else
    {
        const unsigned int hi = start_partition[0];
        const unsigned int lo = start_partition[1];
        d->value     = (hi << 8) | lo;        /* first 2 input bytes */
        d->input     = &start_partition[2];   /* next byte to consume */
        d->input_len = sz - 2;
    }
    d->bit_count = 0;  /* nothing shifted out yet */
    d->range     = 255;  /* full initial range */
 }
 /* Decodes one boolean whose probability of being zero is probability/256.
  *
  * Returns 0 or 1 and renormalises the decoder so that range is at least
  * 128 again, pulling in a fresh input byte after every eight shifts while
  * input remains.
  */
 static int bool_get(struct bool_decoder *d, int probability)
 {
    /* split mirrors the value the encoder computed for this bool;
     * scaled_split aligns it with the 16-bit value register. */
    const unsigned int split        = 1 + (((d->range - 1) * probability) >> 8);
    const unsigned int scaled_split = split << 8;
    int                decoded      = 0;
    if (d->value < scaled_split)
    {
        /* Zero: keep the left endpoint, shrink the range. */
        d->range = split;
    }
    else
    {
        /* One: drop the lower sub-interval. */
        decoded = 1;
        d->range -= split;
        d->value -= scaled_split;
    }
    /* Renormalise: shift out bits that no longer carry information. */
    while (d->range < 128)
    {
        d->value <<= 1;
        d->range <<= 1;
        d->bit_count++;
        if (d->bit_count != 8)
            continue;
        /* A whole byte has been shifted out; refill if possible. */
        d->bit_count = 0;
        if (d->input_len == 0)
            continue;
        d->value |= *d->input;
        d->input++;
        d->input_len--;
    }
    return decoded;
 }
 /* Decodes a single bit with an even (128/256) probability. */
 static int bool_get_bit(struct bool_decoder *br)
 {
    const int even_probability = 128;
    return bool_get(br, even_probability);
 }
 /* Decodes a `bits`-wide unsigned value, most significant bit first. */
 static int bool_get_uint(struct bool_decoder *br, int bits)
 {
    int value = 0;
    int remaining;
    for (remaining = bits; remaining > 0; remaining--)
        value |= bool_get_bit(br) << (remaining - 1);
    return value;
 }
 /* Decodes a `bits`-wide magnitude (most significant bit first) followed by
  * one sign bit; a set sign bit negates the result.
  */
 static int bool_get_int(struct bool_decoder *br, int bits)
 {
    int magnitude = 0;
    int remaining;
    for (remaining = bits; remaining > 0; remaining--)
        magnitude |= bool_get_bit(br) << (remaining - 1);
    if (bool_get_bit(br))
        return -magnitude;
    return magnitude;
 }
 /* Decodes an optional signed value: one flag bit, then - only if the flag
  * is set - a value read with bool_get_int().  Returns 0 when absent.
  */
 static int bool_maybe_get_int(struct bool_decoder *br, int bits)
 {
    if (!bool_get_bit(br))
        return 0;
    return bool_get_int(br, bits);
 }
 /* Walks the tree t starting at index 0, decoding one bool per step with
  * probability p[node >> 1] and following t[node + bit].  The walk stops at
  * the first entry that is not positive; its negation is returned.
  */
 static int
 bool_read_tree(struct bool_decoder *bool,
                const int           *t,
                const unsigned char *p)
 {
    int node = 0;
    do
    {
        const int bit = bool_get(bool, p[node >> 1]);
        node = t[node + bit];
    }
    while (node > 0);
    return -node;
 }
 #endif
--- a/vp8/dixie/dequant_data.h
+++ b/vp8/dixie/dequant_data.h
@@ -1,38 +0,0 @@
 /* DC dequantisation factors, indexed by quantiser index 0..127.
  * Read through dc_q() in dixie.c, which clamps the index to this range
  * before the lookup.
  */
 static const int dc_q_lookup[128] =
 {
    4,    5,    6,    7,    8,    9,    10,   10,
    11,   12,   13,   14,   15,   16,   17,   17,
    18,   19,   20,   20,   21,   21,   22,   22,
    23,   23,   24,   25,   25,   26,   27,   28,
    29,   30,   31,   32,   33,   34,   35,   36,
    37,   37,   38,   39,   40,   41,   42,   43,
    44,   45,   46,   46,   47,   48,   49,   50,
    51,   52,   53,   54,   55,   56,   57,   58,
    59,   60,   61,   62,   63,   64,   65,   66,
    67,   68,   69,   70,   71,   72,   73,   74,
    75,   76,   76,   77,   78,   79,   80,   81,
    82,   83,   84,   85,   86,   87,   88,   89,
    91,   93,   95,   96,   98,   100,  101,  102,
    104,  106,  108,  110,  112,  114,  116,  118,
    122,  124,  126,  128,  130,  132,  134,  136,
    138,  140,  143,  145,  148,  151,  154,  157
 };
 /* AC dequantisation factors, indexed by quantiser index 0..127.
  * Read through ac_q() in dixie.c, which clamps the index to this range
  * before the lookup.
  */
 static const int ac_q_lookup[128] =
 {
    4,    5,    6,    7,    8,    9,    10,   11,
    12,   13,   14,   15,   16,   17,   18,   19,
    20,   21,   22,   23,   24,   25,   26,   27,
    28,   29,   30,   31,   32,   33,   34,   35,
    36,   37,   38,   39,   40,   41,   42,   43,
    44,   45,   46,   47,   48,   49,   50,   51,
    52,   53,   54,   55,   56,   57,   58,   60,
    62,   64,   66,   68,   70,   72,   74,   76,
    78,   80,   82,   84,   86,   88,   90,   92,
    94,   96,   98,   100,  102,  104,  106,  108,
    110,  112,  114,  116,  119,  122,  125,  128,
    131,  134,  137,  140,  143,  146,  149,  152,
    155,  158,  161,  164,  167,  170,  173,  177,
    181,  185,  189,  193,  197,  201,  205,  209,
    213,  217,  221,  225,  229,  234,  239,  245,
    249,  254,  259,  264,  269,  274,  279,  284
 };
--- a/vp8/dixie/dixie.c
+++ b/vp8/dixie/dixie.c
@@ -1,560 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx/internal/vpx_codec_internal.h"
 #include "bit_ops.h"
 #include "dixie.h"
 #include "vp8_prob_data.h"
 #include "dequant_data.h"
 #include "modemv.h"
 #include "tokens.h"
 #include "predict.h"
 #include "dixie_loopfilter.h"
 #include <string.h>
 #include <assert.h>
 /* Byte counts that decode_frame() skips before starting the bool decoder. */
 enum
 {
    FRAME_HEADER_SZ = 3,    /* skipped on every frame */
    KEYFRAME_HEADER_SZ = 7  /* additionally skipped on keyframes */
 };
 /* Copies array b over array a after asserting that both have the same
  * size.  Both arguments must be real arrays (not pointers) so that sizeof
  * yields the array size.  Wrapped in do { } while (0) so the macro behaves
  * as a single statement, e.g. in an unbraced if/else.
  */
 #define ARRAY_COPY(a,b) do {\
    assert(sizeof(a)==sizeof(b));\
    memcpy((a),(b),sizeof(a));\
 } while (0)
 /* Reads the entropy section of the frame header into *hdr.
  *
  * In bitstream order: per-node coefficient probability updates, the
  * coefficient-skip flag (and its probability when set) and - on
  * non-keyframes only - the reference-selection probabilities, optional
  * luma/chroma mode probabilities and motion-vector probability updates.
  * Entries without an update flag keep their previous value.
  */
 static void
 decode_entropy_header(struct vp8_decoder_ctx    *ctx,
                       struct bool_decoder       *bool,
                       struct vp8_entropy_hdr    *hdr)
 {
    int type, band, context, node;
    int n, axis, slot;
    /* Coefficient probability updates, each guarded by its own flag. */
    for (type = 0; type < BLOCK_TYPES; type++)
    {
        for (band = 0; band < COEF_BANDS; band++)
        {
            for (context = 0; context < PREV_COEF_CONTEXTS; context++)
            {
                for (node = 0; node < ENTROPY_NODES; node++)
                {
                    const int update_prob =
                        k_coeff_entropy_update_probs[type][band][context][node];
                    if (!bool_get(bool, update_prob))
                        continue;
                    hdr->coeff_probs[type][band][context][node] =
                        bool_get_uint(bool, 8);
                }
            }
        }
    }
    /* Coefficient skip mode and its probability. */
    hdr->coeff_skip_enabled = bool_get_bit(bool);
    if (hdr->coeff_skip_enabled)
        hdr->coeff_skip_prob = bool_get_uint(bool, 8);
    /* Everything below exists only in inter frames. */
    if (ctx->frame_hdr.is_keyframe)
        return;
    hdr->prob_inter = bool_get_uint(bool, 8);
    hdr->prob_last  = bool_get_uint(bool, 8);
    hdr->prob_gf    = bool_get_uint(bool, 8);
    if (bool_get_bit(bool))
    {
        for (n = 0; n < 4; n++)
            hdr->y_mode_probs[n] = bool_get_uint(bool, 8);
    }
    if (bool_get_bit(bool))
    {
        for (n = 0; n < 3; n++)
            hdr->uv_mode_probs[n] = bool_get_uint(bool, 8);
    }
    for (axis = 0; axis < 2; axis++)
    {
        for (slot = 0; slot < MV_PROB_CNT; slot++)
        {
            int raw;
            if (!bool_get(bool, k_mv_entropy_update_probs[axis][slot]))
                continue;
            /* 7-bit value is doubled; zero maps to a probability of 1. */
            raw = bool_get_uint(bool, 7);
            if (raw)
                hdr->mv_probs[axis][slot] = raw << 1;
            else
                hdr->mv_probs[axis][slot] = 1;
        }
    }
 }
 /* Reads the reference-buffer update flags into *hdr.
  *
  * On keyframes only refresh_entropy is read from the bitstream: golden,
  * altref and last are all refreshed, nothing is copied and both sign
  * biases are zero.  On inter frames every field is read, with the copy
  * selectors present only when the matching refresh flag is clear.
  */
 static void
 decode_reference_header(struct vp8_decoder_ctx    *ctx,
                         struct bool_decoder       *bool,
                         struct vp8_reference_hdr  *hdr)
 {
    const unsigned int is_key = ctx->frame_hdr.is_keyframe;
    if (is_key)
    {
        hdr->refresh_gf  = 1;
        hdr->refresh_arf = 1;
        hdr->copy_gf     = 0;
        hdr->copy_arf    = 0;
        hdr->sign_bias[GOLDEN_FRAME] = 0;
        hdr->sign_bias[ALTREF_FRAME] = 0;
        hdr->refresh_entropy = bool_get_bit(bool);
        hdr->refresh_last = 1;
        return;
    }
    /* Inter frame: fields appear in exactly this order. */
    hdr->refresh_gf  = bool_get_bit(bool);
    hdr->refresh_arf = bool_get_bit(bool);
    if (!hdr->refresh_gf)
        hdr->copy_gf = bool_get_uint(bool, 2);
    else
        hdr->copy_gf = 0;
    if (!hdr->refresh_arf)
        hdr->copy_arf = bool_get_uint(bool, 2);
    else
        hdr->copy_arf = 0;
    hdr->sign_bias[GOLDEN_FRAME] = bool_get_bit(bool);
    hdr->sign_bias[ALTREF_FRAME] = bool_get_bit(bool);
    hdr->refresh_entropy = bool_get_bit(bool);
    hdr->refresh_last = bool_get_bit(bool);
 }
 /* Reads the base quantiser index and the five optional delta values into
  * *hdr.  delta_update ends up non-zero when the index differs from the one
  * previously stored in *hdr or when any delta is non-zero.
  */
 static void
 decode_quantizer_header(struct vp8_decoder_ctx    *ctx,
                         struct bool_decoder       *bool,
                         struct vp8_quant_hdr      *hdr)
 {
    const int previous_index = hdr->q_index;
    int       changed;
    hdr->q_index = bool_get_uint(bool, 7);
    changed = previous_index != hdr->q_index;
    hdr->y1_dc_delta_q = bool_maybe_get_int(bool, 4);
    changed |= hdr->y1_dc_delta_q;
    hdr->y2_dc_delta_q = bool_maybe_get_int(bool, 4);
    changed |= hdr->y2_dc_delta_q;
    hdr->y2_ac_delta_q = bool_maybe_get_int(bool, 4);
    changed |= hdr->y2_ac_delta_q;
    hdr->uv_dc_delta_q = bool_maybe_get_int(bool, 4);
    changed |= hdr->uv_dc_delta_q;
    hdr->uv_ac_delta_q = bool_maybe_get_int(bool, 4);
    changed |= hdr->uv_ac_delta_q;
    hdr->delta_update = changed;
 }
 /* Reads the token partition count, records each partition's size in *hdr
  * and initialises one bool decoder per partition in ctx->tokens[].
  *
  * data/sz describe the bytes that follow the first partition: a table of
  * 3-byte little-endian sizes for all partitions but the last, followed by
  * the partition payloads.  The last partition takes whatever remains.
  *
  * Size problems are reported through vpx_internal_error().
  * NOTE(review): the code below assumes vpx_internal_error() does not
  * return (otherwise the unsigned subtractions would wrap) - confirm.
  *
  * The partition count and sizes are read back from the same `hdr` that
  * was just filled in, rather than from ctx->token_hdr, so the function
  * is self-consistent for any hdr the caller passes.
  */
 static void
 decode_and_init_token_partitions(struct vp8_decoder_ctx    *ctx,
                                  struct bool_decoder       *bool,
                                  const unsigned char       *data,
                                  unsigned int               sz,
                                  struct vp8_token_hdr      *hdr)
 {
    int i;
    hdr->partitions = 1 << bool_get_uint(bool, 2);
    /* The size table itself must fit in the remaining data. */
    if (sz < 3 *(hdr->partitions - 1))
        vpx_internal_error(&ctx->error, VPX_CODEC_CORRUPT_FRAME,
                           "Truncated packet found parsing partition"
                           " lengths.");
    sz -= 3 * (hdr->partitions - 1);
    for (i = 0; i < hdr->partitions; i++)
    {
        if (i < hdr->partitions - 1)
        {
            /* 24-bit little-endian partition size. */
            hdr->partition_sz[i] = (data[2] << 16)
                                   | (data[1] << 8) | data[0];
            data += 3;
        }
        else
            hdr->partition_sz[i] = sz;
        if (sz < hdr->partition_sz[i])
            vpx_internal_error(&ctx->error, VPX_CODEC_CORRUPT_FRAME,
                               "Truncated partition %d", i);
        sz -= hdr->partition_sz[i];
    }
    /* data now points at the first partition payload. */
    for (i = 0; i < hdr->partitions; i++)
    {
        init_bool_decoder(&ctx->tokens[i].bool, data,
                          hdr->partition_sz[i]);
        data += hdr->partition_sz[i];
    }
 }
 /* Reads the loop filter settings into *hdr.
  *
  * On keyframes *hdr is zeroed first.  The per-reference and per-mode
  * deltas are read only when deltas are enabled and the following update
  * flag is set; otherwise they keep their current values.
  */
 static void
 decode_loopfilter_header(struct vp8_decoder_ctx    *ctx,
                          struct bool_decoder       *bool,
                          struct vp8_loopfilter_hdr *hdr)
 {
    int n;
    if (ctx->frame_hdr.is_keyframe)
        memset(hdr, 0, sizeof(*hdr));
    hdr->use_simple    = bool_get_bit(bool);
    hdr->level         = bool_get_uint(bool, 6);
    hdr->sharpness     = bool_get_uint(bool, 3);
    hdr->delta_enabled = bool_get_bit(bool);
    if (!hdr->delta_enabled)
        return;
    /* The update flag is only present when deltas are enabled. */
    if (!bool_get_bit(bool))
        return;
    for (n = 0; n < BLOCK_CONTEXTS; n++)
        hdr->ref_delta[n] = bool_maybe_get_int(bool, 6);
    for (n = 0; n < BLOCK_CONTEXTS; n++)
        hdr->mode_delta[n] = bool_maybe_get_int(bool, 6);
 }
 /* Reads the segmentation settings into *hdr.
  *
  * On keyframes *hdr is zeroed first.  When segmentation is disabled both
  * update flags are cleared and nothing else is read.  Otherwise the
  * per-segment quantiser/loop-filter data and the segment-map tree
  * probabilities are read when their respective update flag is set; a tree
  * probability without an explicit value becomes 255.
  */
 static void
 decode_segmentation_header(struct vp8_decoder_ctx *ctx,
                            struct bool_decoder    *bool,
                            struct vp8_segment_hdr *hdr)
 {
    int n;
    if (ctx->frame_hdr.is_keyframe)
        memset(hdr, 0, sizeof(*hdr));
    hdr->enabled = bool_get_bit(bool);
    if (!hdr->enabled)
    {
        hdr->update_map = 0;
        hdr->update_data = 0;
        return;
    }
    hdr->update_map = bool_get_bit(bool);
    hdr->update_data = bool_get_bit(bool);
    if (hdr->update_data)
    {
        hdr->abs = bool_get_bit(bool);
        for (n = 0; n < MAX_MB_SEGMENTS; n++)
            hdr->quant_idx[n] = bool_maybe_get_int(bool, 7);
        for (n = 0; n < MAX_MB_SEGMENTS; n++)
            hdr->lf_level[n] = bool_maybe_get_int(bool, 6);
    }
    if (hdr->update_map)
    {
        for (n = 0; n < MB_FEATURE_TREE_PROBS; n++)
        {
            if (bool_get_bit(bool))
                hdr->tree_probs[n] = bool_get_uint(bool, 8);
            else
                hdr->tree_probs[n] = 255;
        }
    }
 }
 static void
 dequant_global_init(struct dequant_factors dqf[MAX_MB_SEGMENTS])
 {
    int i;
    for (i = 0; i < MAX_MB_SEGMENTS; i++)
        dqf[i].quant_idx = -1;
 }
 /* Limits a quantiser index to the range 0..127. */
 static int
 clamp_q(int q)
 {
    int bounded = q;
    if (bounded > 127)
        bounded = 127;
    if (bounded < 0)
        bounded = 0;
    return bounded;
 }
 /* Returns the DC dequantisation factor for q, clamping q to the table. */
 static int
 dc_q(int q)
 {
    const int index = clamp_q(q);
    return dc_q_lookup[index];
 }
 /* Returns the AC dequantisation factor for q, clamping q to the table. */
 static int
 ac_q(int q)
 {
    const int index = clamp_q(q);
    return ac_q_lookup[index];
 }
 /* Refreshes the per-segment dequantisation factors.
  *
  * One entry is processed when segmentation is disabled, MAX_MB_SEGMENTS
  * otherwise.  The effective index of a segment is the frame's q_index,
  * either offset by or replaced with the segment's quant_idx depending on
  * seg->abs.  An entry is recomputed only when its cached index differs or
  * the quantiser header reported a delta update.
  */
 static void
 dequant_init(struct dequant_factors        factors[MAX_MB_SEGMENTS],
              const struct vp8_segment_hdr *seg,
              const struct vp8_quant_hdr   *quant_hdr)
 {
    int s;
    for (s = 0; s < (seg->enabled ? MAX_MB_SEGMENTS : 1); s++)
    {
        struct dequant_factors *entry = &factors[s];
        int                     qi    = quant_hdr->q_index;
        if (seg->enabled)
        {
            if (seg->abs)
                qi = seg->quant_idx[s];
            else
                qi += seg->quant_idx[s];
        }
        /* Cached factors are still valid. */
        if (entry->quant_idx == qi && !quant_hdr->delta_update)
            continue;
        entry->factor[TOKEN_BLOCK_Y1][0] =
            dc_q(qi + quant_hdr->y1_dc_delta_q);
        entry->factor[TOKEN_BLOCK_Y1][1] =
            ac_q(qi);
        entry->factor[TOKEN_BLOCK_UV][0] =
            dc_q(qi + quant_hdr->uv_dc_delta_q);
        entry->factor[TOKEN_BLOCK_UV][1] =
            ac_q(qi + quant_hdr->uv_ac_delta_q);
        entry->factor[TOKEN_BLOCK_Y2][0] =
            dc_q(qi + quant_hdr->y2_dc_delta_q) * 2;
        entry->factor[TOKEN_BLOCK_Y2][1] =
            ac_q(qi + quant_hdr->y2_ac_delta_q) * 155 / 100;
        /* Y2 AC has a floor of 8, chroma DC a ceiling of 132. */
        if (entry->factor[TOKEN_BLOCK_Y2][1] < 8)
            entry->factor[TOKEN_BLOCK_Y2][1] = 8;
        if (entry->factor[TOKEN_BLOCK_UV][0] > 132)
            entry->factor[TOKEN_BLOCK_UV][0] = 132;
        entry->quant_idx = qi;
    }
 }
 /* Decodes one compressed frame of sz bytes at data.
  *
  * Steps, in order: parse the fixed frame header, start the bool decoder on
  * the first partition, read the segmentation, loop filter, token
  * partition, quantiser, reference and entropy headers, initialise the
  * mode/mv, token and prediction stages, process the frame one macroblock
  * row at a time, then update the reference frame slots.
  *
  * Errors are reported through vpx_internal_error().
  * NOTE(review): the code assumes vpx_internal_error() does not return -
  * confirm.
  */
 static void
 decode_frame(struct vp8_decoder_ctx *ctx,
              const unsigned char    *data,
              unsigned int            sz)
 {
    vpx_codec_err_t  res;
    struct bool_decoder  bool;
    int                  i, row, partition;
    ctx->saved_entropy_valid = 0;
    if ((res = vp8_parse_frame_header(data, sz, &ctx->frame_hdr)))
        vpx_internal_error(&ctx->error, res,
                           "Failed to parse frame header");
    if (ctx->frame_hdr.is_experimental)
        vpx_internal_error(&ctx->error, VPX_CODEC_UNSUP_BITSTREAM,
                           "Experimental bitstreams not supported.");
    /* Step over the fixed-size header bytes already parsed above. */
    data += FRAME_HEADER_SZ;
    sz -= FRAME_HEADER_SZ;
    if (ctx->frame_hdr.is_keyframe)
    {
        data += KEYFRAME_HEADER_SZ;
        sz -= KEYFRAME_HEADER_SZ;
        /* Frame size in macroblocks, rounding partial blocks up. */
        ctx->mb_cols = (ctx->frame_hdr.kf.w + 15) / 16;
        ctx->mb_rows = (ctx->frame_hdr.kf.h + 15) / 16;
    }
    /* Start the bitreader for the header/entropy partition */
    init_bool_decoder(&bool, data, ctx->frame_hdr.part0_sz);
    /* Skip the colorspace and clamping bits */
    if (ctx->frame_hdr.is_keyframe)
        if (bool_get_uint(&bool, 2))
            vpx_internal_error(&ctx->error, VPX_CODEC_UNSUP_BITSTREAM,
                               "Reserved bits not supported.");
    decode_segmentation_header(ctx, &bool, &ctx->segment_hdr);
    decode_loopfilter_header(ctx, &bool, &ctx->loopfilter_hdr);
    /* NOTE(review): sz is unsigned, so sz - part0_sz wraps if part0_sz
     * exceeds the remaining size.  Whether vp8_parse_frame_header()
     * rejects that case is not visible from here - confirm.
     */
    decode_and_init_token_partitions(ctx,
                                     &bool,
                                     data + ctx->frame_hdr.part0_sz,
                                     sz - ctx->frame_hdr.part0_sz,
                                     &ctx->token_hdr);
    decode_quantizer_header(ctx, &bool, &ctx->quant_hdr);
    decode_reference_header(ctx, &bool, &ctx->reference_hdr);
    /* Set keyframe entropy defaults. These get updated on keyframes
     * regardless of the refresh_entropy setting.
     */
    if (ctx->frame_hdr.is_keyframe)
    {
        ARRAY_COPY(ctx->entropy_hdr.coeff_probs,
                   k_default_coeff_probs);
        ARRAY_COPY(ctx->entropy_hdr.mv_probs,
                   k_default_mv_probs);
        ARRAY_COPY(ctx->entropy_hdr.y_mode_probs,
                   k_default_y_mode_probs);
        ARRAY_COPY(ctx->entropy_hdr.uv_mode_probs,
                   k_default_uv_mode_probs);
    }
    /* When this frame's entropy changes are not to be kept, snapshot the
     * current state so it can be restored after decoding.
     */
    if (!ctx->reference_hdr.refresh_entropy)
    {
        ctx->saved_entropy = ctx->entropy_hdr;
        ctx->saved_entropy_valid = 1;
    }
    decode_entropy_header(ctx, &bool, &ctx->entropy_hdr);
    vp8_dixie_modemv_init(ctx);
    vp8_dixie_tokens_init(ctx);
    vp8_dixie_predict_init(ctx);
    dequant_init(ctx->dequant_factors, &ctx->segment_hdr,
                 &ctx->quant_hdr);
    /* Rows are assigned to token partitions round-robin. */
    for (row = 0, partition = 0; row < ctx->mb_rows; row++)
    {
        vp8_dixie_modemv_process_row(ctx, &bool, row, 0, ctx->mb_cols);
        vp8_dixie_tokens_process_row(ctx, partition, row, 0,
                                     ctx->mb_cols);
        vp8_dixie_predict_process_row(ctx, row, 0, ctx->mb_cols);
        /* The loop filter trails prediction by one row. */
        if (ctx->loopfilter_hdr.level && row)
            vp8_dixie_loopfilter_process_row(ctx, row - 1, 0,
                                             ctx->mb_cols);
        if (++partition == ctx->token_hdr.partitions)
            partition = 0;
    }
    /* Filter the final row, which the loop above left pending. */
    if (ctx->loopfilter_hdr.level)
        vp8_dixie_loopfilter_process_row(ctx, row - 1, 0, ctx->mb_cols);
    ctx->frame_cnt++;
    /* Discard this frame's entropy changes if they were not to persist. */
    if (!ctx->reference_hdr.refresh_entropy)
    {
        ctx->entropy_hdr = ctx->saved_entropy;
        ctx->saved_entropy_valid = 0;
    }
    /* Handle reference frame updates */
    /* copy_arf: 1 copies LAST into ALTREF, 2 copies GOLDEN into ALTREF. */
    if (ctx->reference_hdr.copy_arf == 1)
    {
        vp8_dixie_release_ref_frame(ctx->ref_frames[ALTREF_FRAME]);
        ctx->ref_frames[ALTREF_FRAME] =
            vp8_dixie_ref_frame(ctx->ref_frames[LAST_FRAME]);
    }
    else if (ctx->reference_hdr.copy_arf == 2)
    {
        vp8_dixie_release_ref_frame(ctx->ref_frames[ALTREF_FRAME]);
        ctx->ref_frames[ALTREF_FRAME] =
            vp8_dixie_ref_frame(ctx->ref_frames[GOLDEN_FRAME]);
    }
    /* copy_gf: 1 copies LAST into GOLDEN, 2 copies ALTREF into GOLDEN. */
    if (ctx->reference_hdr.copy_gf == 1)
    {
        vp8_dixie_release_ref_frame(ctx->ref_frames[GOLDEN_FRAME]);
        ctx->ref_frames[GOLDEN_FRAME] =
            vp8_dixie_ref_frame(ctx->ref_frames[LAST_FRAME]);
    }
    else if (ctx->reference_hdr.copy_gf == 2)
    {
        vp8_dixie_release_ref_frame(ctx->ref_frames[GOLDEN_FRAME]);
        ctx->ref_frames[GOLDEN_FRAME] =
            vp8_dixie_ref_frame(ctx->ref_frames[ALTREF_FRAME]);
    }
    /* Refresh flags replace a slot with the frame just decoded. */
    if (ctx->reference_hdr.refresh_gf)
    {
        vp8_dixie_release_ref_frame(ctx->ref_frames[GOLDEN_FRAME]);
        ctx->ref_frames[GOLDEN_FRAME] =
            vp8_dixie_ref_frame(ctx->ref_frames[CURRENT_FRAME]);
    }
    if (ctx->reference_hdr.refresh_arf)
    {
        vp8_dixie_release_ref_frame(ctx->ref_frames[ALTREF_FRAME]);
        ctx->ref_frames[ALTREF_FRAME] =
            vp8_dixie_ref_frame(ctx->ref_frames[CURRENT_FRAME]);
    }
    if (ctx->reference_hdr.refresh_last)
    {
        vp8_dixie_release_ref_frame(ctx->ref_frames[LAST_FRAME]);
        ctx->ref_frames[LAST_FRAME] =
            vp8_dixie_ref_frame(ctx->ref_frames[CURRENT_FRAME]);
    }
 }
 /* Decoder setup hook: passes the context's dequant_factors table to
  * dequant_global_init(). Nothing else in the context is touched here.
  */
 void
 vp8_dixie_decode_init(struct vp8_decoder_ctx *ctx)
 {
    dequant_global_init(ctx->dequant_factors);
 }
 /* Store rval into lval and OR 1 into update_flag when the stored value
  * differs from the previous one. lval is evaluated twice, so it must be
  * free of side effects. The temporary carries a macro-specific name so
  * that a caller lvalue named "old" is not shadowed by it.
  */
 #define CHECK_FOR_UPDATE(lval,rval,update_flag) do {\
        unsigned int check_for_update_old_ = (lval); \
        (update_flag) |= (check_for_update_old_ != ((lval) = (rval))); \
    } while(0)
 /* Parse the uncompressed header found at the start of a frame.
  *
  * data: frame bytes; sz: number of bytes available at data;
  * hdr:  receives the parsed fields. hdr->kf is only written for
  *       keyframes, so it keeps its previous contents otherwise; that
  *       is what lets frame_size_updated report a change of size or
  *       scaling relative to the values already stored in hdr.
  *
  * Returns VPX_CODEC_OK on success, VPX_CODEC_CORRUPT_FRAME when the
  * buffer is shorter than the header plus partition 0, and
  * VPX_CODEC_UNSUP_BITSTREAM for a bad keyframe sync code or a zero
  * width/height. hdr may be partially updated when an error is
  * returned.
  */
 vpx_codec_err_t
 vp8_parse_frame_header(const unsigned char   *data,
                       unsigned int           sz,
                       struct vp8_frame_hdr  *hdr)
 {
    unsigned long raw;
    if (sz < 10)
        return VPX_CODEC_CORRUPT_FRAME;
    /* The frame header is defined as a three byte little endian
     * value
     */
    raw = (unsigned long)data[0]
          | ((unsigned long)data[1] << 8)
          | ((unsigned long)data[2] << 16);
    hdr->is_keyframe     = !BITS_GET(raw, 0, 1);
    hdr->version         = BITS_GET(raw, 1, 2);
    hdr->is_experimental = BITS_GET(raw, 3, 1);
    hdr->is_shown        = BITS_GET(raw, 4, 1);
    hdr->part0_sz        = BITS_GET(raw, 5, 19);
    if (sz <= hdr->part0_sz + (hdr->is_keyframe ? 10 : 3))
        return VPX_CODEC_CORRUPT_FRAME;
    hdr->frame_size_updated = 0;
    if (hdr->is_keyframe)
    {
        unsigned int update = 0;
        /* Keyframe header consists of a three byte sync code followed
         * by the width and height and associated scaling factors.
         */
        if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a)
            return VPX_CODEC_UNSUP_BITSTREAM;
        /* Widen each byte before shifting: an unsigned char promotes to
         * (signed) int, and shifting a byte >= 0x80 left by 24 would
         * overflow it, which is undefined behaviour.
         */
        raw = (unsigned long)data[6]
              | ((unsigned long)data[7] << 8)
              | ((unsigned long)data[8] << 16)
              | ((unsigned long)data[9] << 24);
        CHECK_FOR_UPDATE(hdr->kf.w,       BITS_GET(raw,  0, 14),
                         update);
        CHECK_FOR_UPDATE(hdr->kf.scale_w, BITS_GET(raw, 14,  2),
                         update);
        CHECK_FOR_UPDATE(hdr->kf.h,       BITS_GET(raw, 16, 14),
                         update);
        CHECK_FOR_UPDATE(hdr->kf.scale_h, BITS_GET(raw, 30,  2),
                         update);
        hdr->frame_size_updated = update;
        if (!hdr->kf.w || !hdr->kf.h)
            return VPX_CODEC_UNSUP_BITSTREAM;
    }
    return VPX_CODEC_OK;
 }
 /* Per-frame entry point: clears the error state in ctx, runs
  * decode_frame() under a setjmp() guard and returns the error code
  * found in ctx->error afterwards (VPX_CODEC_OK if nothing changed it).
  */
 vpx_codec_err_t
 vp8_dixie_decode_frame(struct vp8_decoder_ctx *ctx,
                       const unsigned char    *data,
                       unsigned int            sz)
 {
    /* Volatile-qualified view of the context, used for the final read
     * of the error code once control is back in this function.
     */
    volatile struct vp8_decoder_ctx *ctx_ = ctx;
    ctx->error.error_code = VPX_CODEC_OK;
    ctx->error.has_detail = 0;
    /* setjmp() yields 0 on the direct call, so decode_frame() runs once.
     * Presumably decode_frame() longjmp()s to ctx->error.jmp after
     * recording a failure in ctx->error -- confirm against its error
     * paths.
     */
    if (!setjmp(ctx->error.jmp))
        decode_frame(ctx, data, sz);
    return ctx_->error.error_code;
 }
 /* Tear down the decoder by calling the destroy hook of the predict,
  * tokens and modemv stages, in that order.
  */
 void
 vp8_dixie_decode_destroy(struct vp8_decoder_ctx *ctx)
 {
    vp8_dixie_predict_destroy(ctx);
    vp8_dixie_tokens_destroy(ctx);
    vp8_dixie_modemv_destroy(ctx);
 }
--- a/vp8/dixie/dixie.h
+++ b/vp8/dixie/dixie.h
@@ -1,308 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef DIXIE_H
 #define DIXIE_H
 #include "vpx/internal/vpx_codec_internal.h"
 #include "bool_decoder.h"
 /* Fields of the uncompressed frame header. */
 struct vp8_frame_hdr
 {
    unsigned int is_keyframe;      /* Frame is a keyframe */
    unsigned int is_experimental;  /* Experimental bit of the frame tag */
    unsigned int version;          /* Bitstream version */
    unsigned int is_shown;         /* Frame is to be displayed. */
    unsigned int part0_sz;         /* Partition 0 length, in bytes */
    /* Keyframe-only fields */
    struct vp8_kf_hdr
    {
        unsigned int w;        /* Width */
        unsigned int h;        /* Height */
        unsigned int scale_w;  /* Scaling factor, Width */
        unsigned int scale_h;  /* Scaling factor, Height */
    } kf;
    unsigned int frame_size_updated; /* Flag to indicate a resolution
                                      * update.
                                      */
 };
 /* Array sizes used by the segmentation header. */
 enum
 {
    MB_FEATURE_TREE_PROBS = 3,
    MAX_MB_SEGMENTS = 4
 };
 /* Segmentation state. lf_level and quant_idx hold one entry per
  * segment and are indexed by a macroblock's segment_id.
  */
 struct vp8_segment_hdr
 {
    unsigned int         enabled;
    unsigned int         update_data;
    unsigned int         update_map;
    unsigned int         abs;    /* 0=deltas, 1=absolute values */
    unsigned int         tree_probs[MB_FEATURE_TREE_PROBS];
    int                  lf_level[MAX_MB_SEGMENTS];
    int                  quant_idx[MAX_MB_SEGMENTS];
 };
 /* Number of entries in each loop filter delta table. */
 enum
 {
    BLOCK_CONTEXTS = 4
 };
 /* Loop filter settings for the frame. */
 struct vp8_loopfilter_hdr
 {
    unsigned int         use_simple;    /* non-zero: simple filter */
    unsigned int         level;         /* 0: frame is not filtered */
    unsigned int         sharpness;
    unsigned int         delta_enabled; /* apply the tables below */
    int                  ref_delta[BLOCK_CONTEXTS];  /* by ref_frame */
    int                  mode_delta[BLOCK_CONTEXTS]; /* 0: B_PRED,
                                                      * 1: ZEROMV,
                                                      * 2: other inter,
                                                      * 3: SPLITMV
                                                      */
 };
 /* Upper bound on the number of token partitions. */
 enum
 {
    MAX_PARTITIONS = 8
 };
 /* Token partition layout for the frame. */
 struct vp8_token_hdr
 {
    unsigned int        partitions;                   /* count in use */
    unsigned int        partition_sz[MAX_PARTITIONS]; /* presumably bytes
                                                       * -- confirm */
 };
 /* Quantizer header: a base index plus one signed delta per
  * plane/coefficient class (y1 dc, y2 dc/ac, uv dc/ac).
  */
 struct vp8_quant_hdr
 {
    unsigned int       q_index;
    int                delta_update;
    int                y1_dc_delta_q;
    int                y2_dc_delta_q;
    int                y2_ac_delta_q;
    int                uv_dc_delta_q;
    int                uv_ac_delta_q;
 };
 /* Reference buffer update flags applied once a frame has been decoded. */
 struct vp8_reference_hdr
 {
    unsigned int refresh_last;    /* current frame becomes LAST */
    unsigned int refresh_gf;      /* current frame becomes GOLDEN */
    unsigned int refresh_arf;     /* current frame becomes ALTREF */
    unsigned int copy_gf;         /* 1: GOLDEN = LAST, 2: GOLDEN = ALTREF */
    unsigned int copy_arf;        /* 1: ALTREF = LAST, 2: ALTREF = GOLDEN */
    unsigned int sign_bias[4];
    unsigned int refresh_entropy; /* 0: entropy state is restored after
                                   * the frame */
 };
 /* Dimensions of the coefficient probability table. */
 enum
 {
    BLOCK_TYPES        = 4,
    PREV_COEF_CONTEXTS = 3,
    COEF_BANDS         = 8,
    ENTROPY_NODES      = 11,
 };
 /* Probabilities indexed as [block type][band][context][node]. */
 typedef unsigned char coeff_probs_table_t[BLOCK_TYPES][COEF_BANDS]
 [PREV_COEF_CONTEXTS]
 [ENTROPY_NODES];
 /* Number of probabilities used for one motion vector component (19). */
 enum
 {
    MV_PROB_CNT = 2 + 8 - 1 + 10 /* from entropymv.h */
 };
 /* Probability set for a single motion vector component. */
 typedef unsigned char mv_component_probs_t[MV_PROB_CNT];
 /* Entropy coding probabilities. The decoder copies this struct
  * wholesale to save and restore it around frames that do not refresh
  * the entropy state.
  */
 struct vp8_entropy_hdr
 {
    coeff_probs_table_t   coeff_probs;
    mv_component_probs_t  mv_probs[2];   /* one set per mv component */
    unsigned int          coeff_skip_enabled;
    unsigned char         coeff_skip_prob;
    unsigned char         y_mode_probs[4];
    unsigned char         uv_mode_probs[3];
    unsigned char         prob_inter;
    unsigned char         prob_last;
    unsigned char         prob_gf;
 };
 /* Indices into vp8_decoder_ctx.ref_frames. CURRENT_FRAME (0) is the
  * frame being decoded; NUM_REF_FRAMES is the table size.
  */
 enum reference_frame
 {
    CURRENT_FRAME,
    LAST_FRAME,
    GOLDEN_FRAME,
    ALTREF_FRAME,
    NUM_REF_FRAMES
 };
 /* Macroblock (16x16) and subblock (4x4) prediction modes share one
  * enum. B_DC_PRED restarts the numbering at 0, so the 4x4 values
  * overlap the 16x16 ones and a value is only meaningful together with
  * knowledge of which group it belongs to.
  */
 enum prediction_mode
 {
    /* 16x16 intra modes */
    DC_PRED, V_PRED, H_PRED, TM_PRED, B_PRED,
    /* 16x16 inter modes */
    NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV,
    MB_MODE_COUNT,
    /* 4x4 intra modes */
    B_DC_PRED = 0, B_TM_PRED, B_VE_PRED, B_HE_PRED, B_LD_PRED,
    B_RD_PRED, B_VR_PRED, B_VL_PRED, B_HD_PRED, B_HU_PRED,
    /* 4x4 inter modes */
    LEFT4X4, ABOVE4X4, ZERO4X4, NEW4X4,
    B_MODE_COUNT
 };
 /* Partition shapes for SPLITMV macroblocks; the values 0..3 fit the
  * 2-bit partitioning field of struct mb_base_info.
  */
 enum splitmv_partitioning
 {
    SPLITMV_16X8,
    SPLITMV_8X16,
    SPLITMV_8X8,
    SPLITMV_4X4
 };
 /* One filter kernel: six short coefficients. */
 typedef short filter_t[6];
 /* Motion vector: two 16-bit components, overlaid with a 32-bit word
  * (raw) that covers both components at once.
  */
 typedef union mv
 {
    struct
    {
        int16_t x, y;
    }  d;
    uint32_t               raw;
 } mv_t;
 /* Per-macroblock state shared by all prediction types, packed into
  * bit-fields.
  */
 struct mb_base_info
 {
    unsigned char y_mode     : 4;
    unsigned char uv_mode    : 4;
    unsigned char segment_id : 2;
    unsigned char ref_frame  : 2;
    unsigned char skip_coeff : 1;
    unsigned char need_mc_border : 1;
    /* NOTE(review): bit-fields of enum (and unsigned char) type are
     * implementation-defined. If a compiler treats this field as signed,
     * SPLITMV_8X8 (2) and SPLITMV_4X4 (3) will not read back unchanged
     * -- confirm for every supported toolchain.
     */
    enum splitmv_partitioning  partitioning : 2;
    union mv      mv;
    unsigned int  eob_mask; /* non-zero makes the loop filter process the
                             * macroblock's interior subblock edges */
 };
 /* Full per-macroblock record: the common fields plus sixteen
  * per-subblock entries that hold either motion vectors or prediction
  * modes (the two arrays share storage).
  */
 struct mb_info
 {
    struct mb_base_info base;
    union
    {
        union mv              mvs[16];
        enum prediction_mode  modes[16];
    } split;
 };
 /* A "token entropy context" has 4 Y values, 2 U, 2 V, and 1 Y2 */
 typedef int token_entropy_ctx_t[4 + 2 + 2 + 1];
 /* State of one token decoder: its own boolean decoder, the
  * left-neighbour entropy context and a coefficient buffer pointer.
  */
 struct token_decoder
 {
    struct bool_decoder  bool;
    token_entropy_ctx_t  left_token_entropy_ctx;
    short               *coeffs;
 };
 /* Block classes that carry separate dequantization factors. */
 enum token_block_type
 {
    TOKEN_BLOCK_Y1,
    TOKEN_BLOCK_UV,
    TOKEN_BLOCK_Y2,
    TOKEN_BLOCK_TYPES,
 };
 /* Dequantization factors, together with the quantizer index stored
  * alongside them.
  */
 struct dequant_factors
 {
    int   quant_idx;
    short factor[TOKEN_BLOCK_TYPES][2]; /* [ Y1, UV, Y2 ] [ DC, AC ] */
 };
 /* An image buffer paired with a reference count. */
 struct ref_cnt_img
 {
    vpx_image_t  img;
    unsigned int ref_cnt;
 };
 /* Complete decoder state: error reporting, the parsed headers of the
  * current frame, per-macroblock storage and the reference buffers.
  */
 struct vp8_decoder_ctx
 {
    struct vpx_internal_error_info  error;
    unsigned int                    frame_cnt; /* frames decoded so far */
    struct vp8_frame_hdr            frame_hdr;
    struct vp8_segment_hdr          segment_hdr;
    struct vp8_loopfilter_hdr       loopfilter_hdr;
    struct vp8_token_hdr            token_hdr;
    struct vp8_quant_hdr            quant_hdr;
    struct vp8_reference_hdr        reference_hdr;
    struct vp8_entropy_hdr          entropy_hdr;
    /* Copy of entropy_hdr kept while a frame that does not refresh the
     * entropy state is decoded; saved_entropy_valid marks it as live.
     */
    struct vp8_entropy_hdr          saved_entropy;
    unsigned int                    saved_entropy_valid;
    unsigned int                    mb_rows;
    unsigned int                    mb_cols;
    struct mb_info                 *mb_info_storage;
    struct mb_info                **mb_info_rows_storage;
    struct mb_info                **mb_info_rows; /* [row] -> row's MBs */
    token_entropy_ctx_t            *above_token_entropy_ctx;
    struct token_decoder            tokens[MAX_PARTITIONS];
    struct dequant_factors          dequant_factors[MAX_MB_SEGMENTS];
    struct ref_cnt_img              frame_strg[NUM_REF_FRAMES];
    /* Indexed by enum reference_frame */
    struct ref_cnt_img             *ref_frames[NUM_REF_FRAMES];
    ptrdiff_t                       ref_frame_offsets[4];
    const filter_t                 *subpixel_filters;
 };
 /* Decoder setup; currently initializes the dequantization tables. */
 void
 vp8_dixie_decode_init(struct vp8_decoder_ctx *ctx);
 /* Release resources held by the predict, tokens and modemv stages. */
 void
 vp8_dixie_decode_destroy(struct vp8_decoder_ctx *ctx);
 /* Parse the uncompressed frame header in data[0..sz) into *hdr.
  * Returns VPX_CODEC_OK, VPX_CODEC_CORRUPT_FRAME or
  * VPX_CODEC_UNSUP_BITSTREAM.
  */
 vpx_codec_err_t
 vp8_parse_frame_header(const unsigned char   *data,
                       unsigned int           sz,
                       struct vp8_frame_hdr  *hdr);
 /* Decode one frame from data[0..sz); returns the error code left in
  * ctx->error (VPX_CODEC_OK on success).
  */
 vpx_codec_err_t
 vp8_dixie_decode_frame(struct vp8_decoder_ctx *ctx,
                       const unsigned char    *data,
                       unsigned int            sz);
 /* Clamp x to [0, 255]. x is evaluated more than once, so it must not
  * have side effects.
  */
 #define CLAMP_255(x) ((x)<0?0:((x)>255?255:(x)))
 #endif
--- a/vp8/dixie/dixie_loopfilter.c
+++ b/vp8/dixie/dixie_loopfilter.c
@@ -1,530 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "dixie.h"
 #include "dixie_loopfilter.h"
 /* Absolute value; x is evaluated twice. */
 #define ABS(x) ((x) >= 0 ? (x) : -(x))
 /* Pixel accessors relative to the edge being filtered. They expand to
  * expressions over the enclosing function's `pixels` and `stride`:
  * p0..p3 are the pixels before the edge (p0 nearest), q0..q3 the pixels
  * at and after it. With stride == 1 they walk along a row, with the
  * image stride they walk down a column.
  *
  * The former `#define static` (which stripped internal linkage from
  * every helper in this file and redefined a keyword) is intentionally
  * gone.
  */
 #define p3 pixels[-4*stride]
 #define p2 pixels[-3*stride]
 #define p1 pixels[-2*stride]
 #define p0 pixels[-1*stride]
 #define q0 pixels[ 0*stride]
 #define q1 pixels[ 1*stride]
 #define q2 pixels[ 2*stride]
 #define q3 pixels[ 3*stride]
 /* Clamp x to the signed 8-bit range [-128, 127]. */
 static int
 saturate_int8(int x)
 {
    const int lowest = -128, highest = 127;

    return (x < lowest) ? lowest : ((x > highest) ? highest : x);
 }
 /* Clamp x to the unsigned 8-bit range [0, 255]. */
 static int
 saturate_uint8(int x)
 {
    const int highest = 255;

    return (x < 0) ? 0 : ((x > highest) ? highest : x);
 }
 /* Returns 1 when either side of the edge has a step between its two
  * nearest pixels (p1/p0 or q1/q0) larger than hev_threshold, else 0.
  */
 static int
 high_edge_variance(unsigned char *pixels,
                   int            stride,
                   int            hev_threshold)
 {
    if (ABS(p1 - p0) > hev_threshold)
        return 1;

    return ABS(q1 - q0) > hev_threshold;
 }
 /* Edge activity test: twice the step across the edge plus half the
  * step between the outer neighbours must not exceed filter_limit.
  * Returns 1 when the edge qualifies for filtering.
  */
 static int
 simple_threshold(unsigned char *pixels,
                 int            stride,
                 int            filter_limit)
 {
    const int across_edge = ABS(p0 - q0) * 2;
    const int outer_pair  = ABS(p1 - q1) >> 1;

    return (across_edge + outer_pair) <= filter_limit;
 }
 /* Full activity test for the normal filter: the edge must pass
  * simple_threshold() with limit 2 * edge_limit + interior_limit, and
  * every step between adjacent pixels on each side must stay within
  * interior_limit. Returns 1 when the edge should be filtered.
  */
 static int
 normal_threshold(unsigned char *pixels,
                 int            stride,
                 int            edge_limit,
                 int            interior_limit)
 {
    if (!simple_threshold(pixels, stride,
                          2 * edge_limit + interior_limit))
        return 0;

    /* p side, outermost pair first */
    if (ABS(p3 - p2) > interior_limit || ABS(p2 - p1) > interior_limit
        || ABS(p1 - p0) > interior_limit)
        return 0;

    /* q side */
    if (ABS(q3 - q2) > interior_limit || ABS(q2 - q1) > interior_limit
        || ABS(q1 - q0) > interior_limit)
        return 0;

    return 1;
 }
 static void
 filter_common(unsigned char *pixels,
              int            stride,
              int            use_outer_taps)
 {
    int a, f1, f2;
    a = 3 * (q0 - p0);
    if (use_outer_taps)
        a += saturate_int8(p1 - q1);
    a = saturate_int8(a);
    f1 = ((a + 4 > 127) ? 127 : a + 4) >> 3;
    f2 = ((a + 3 > 127) ? 127 : a + 3) >> 3;
    p0 = saturate_uint8(p0 + f2);
    q0 = saturate_uint8(q0 - f1);
    if (!use_outer_taps)
    {
        /* This handles the case of subblock_filter()
         * (from the bitstream guide.
         */
        a = (f1 + 1) >> 1;
        p1 = saturate_uint8(p1 + a);
        q1 = saturate_uint8(q1 - a);
    }
 }
 /* Macroblock edge filter: derives one filter value from the four
  * pixels around the edge, then adjusts three pixels on each side in
  * place with weights 27, 18 and 9 (out of 128), nearest pixel first.
  */
 static void
 filter_mb_edge(unsigned char *pixels,
               int            stride)
 {
    const int weights[3] = { 27, 18, 9 };
    const int value =
        saturate_int8(saturate_int8(p1 - q1) + 3 * (q0 - p0));
    int       tap;

    for (tap = 0; tap < 3; tap++)
    {
        const int      adjust = (weights[tap] * value + 63) >> 7;
        unsigned char *before = pixels - (tap + 1) * stride;
        unsigned char *after  = pixels + tap * stride;

        *before = saturate_uint8(*before + adjust);
        *after  = saturate_uint8(*after - adjust);
    }
 }
 /* Filter a vertical macroblock edge: visits 8 * size rows starting at
  * src, looking across the edge along each row (pixel step 1).
  */
 static void
 filter_mb_v_edge(unsigned char *src,
                 int            stride,
                 int            edge_limit,
                 int            interior_limit,
                 int            hev_threshold,
                 int            size)
 {
    int remaining;

    for (remaining = 8 * size; remaining > 0; remaining--, src += stride)
    {
        if (!normal_threshold(src, 1, edge_limit, interior_limit))
            continue;

        if (high_edge_variance(src, 1, hev_threshold))
            filter_common(src, 1, 1);
        else
            filter_mb_edge(src, 1);
    }
 }
 /* Filter a vertical subblock edge: visits 8 * size rows starting at
  * src, looking across the edge along each row (pixel step 1).
  */
 static void
 filter_subblock_v_edge(unsigned char *src,
                       int            stride,
                       int            edge_limit,
                       int            interior_limit,
                       int            hev_threshold,
                       int            size)
 {
    int remaining;

    for (remaining = 8 * size; remaining > 0; remaining--, src += stride)
    {
        if (normal_threshold(src, 1, edge_limit, interior_limit))
        {
            const int hev = high_edge_variance(src, 1, hev_threshold);

            filter_common(src, 1, hev);
        }
    }
 }
 /* Filter a horizontal macroblock edge: visits 8 * size columns
  * starting at src, looking across the edge down each column (pixel
  * step stride).
  */
 static void
 filter_mb_h_edge(unsigned char *src,
                 int            stride,
                 int            edge_limit,
                 int            interior_limit,
                 int            hev_threshold,
                 int            size)
 {
    int remaining;

    for (remaining = 8 * size; remaining > 0; remaining--, src++)
    {
        if (!normal_threshold(src, stride, edge_limit, interior_limit))
            continue;

        if (high_edge_variance(src, stride, hev_threshold))
            filter_common(src, stride, 1);
        else
            filter_mb_edge(src, stride);
    }
 }
 /* Filter a horizontal subblock edge: visits 8 * size columns starting
  * at src, looking across the edge down each column (pixel step
  * stride).
  */
 static void
 filter_subblock_h_edge(unsigned char *src,
                       int            stride,
                       int            edge_limit,
                       int            interior_limit,
                       int            hev_threshold,
                       int            size)
 {
    int remaining;

    for (remaining = 8 * size; remaining > 0; remaining--, src++)
    {
        if (normal_threshold(src, stride, edge_limit, interior_limit))
        {
            const int hev = high_edge_variance(src, stride,
                                               hev_threshold);

            filter_common(src, stride, hev);
        }
    }
 }
 /* Simple-filter pass over a vertical edge: 16 rows starting at src,
  * each tested with simple_threshold() and filtered with the outer taps
  * enabled.
  */
 static void
 filter_v_edge_simple(unsigned char *src,
                     int            stride,
                     int            filter_limit)
 {
    int remaining;

    for (remaining = 16; remaining > 0; remaining--, src += stride)
    {
        if (!simple_threshold(src, 1, filter_limit))
            continue;

        filter_common(src, 1, 1);
    }
 }
 /* Simple-filter pass over a horizontal edge: 16 columns starting at
  * src, each tested with simple_threshold() and filtered with the outer
  * taps enabled.
  */
 static void
 filter_h_edge_simple(unsigned char *src,
                     int            stride,
                     int            filter_limit)
 {
    int remaining;

    for (remaining = 16; remaining > 0; remaining--, src++)
    {
        if (!simple_threshold(src, stride, filter_limit))
            continue;

        filter_common(src, stride, 1);
    }
 }
 /* Derive the loop filter parameters for one macroblock.
  *
  * Starts from the frame level, applies the macroblock's segment value
  * (added or substituted) and, when deltas are enabled, the reference
  * frame and mode deltas. The result is clamped to [0, 63] and returned
  * through edge_limit_; interior_limit_ and hev_threshold_ are derived
  * from it.
  */
 static void
 calculate_filter_parameters(struct vp8_decoder_ctx *ctx,
                            struct mb_info         *mbi,
                            int                    *edge_limit_,
                            int                    *interior_limit_,
                            int                    *hev_threshold_)
 {
    int filter_level, interior_limit, hev_threshold;
    /* Reference code/spec seems to conflate filter_level and
     * edge_limit
     */
    filter_level = ctx->loopfilter_hdr.level;
    if (ctx->segment_hdr.enabled)
    {
        /* Segment value is either a delta on the frame level or a
         * replacement for it.
         */
        if (!ctx->segment_hdr.abs)
            filter_level +=
                ctx->segment_hdr.lf_level[mbi->base.segment_id];
        else
            filter_level =
                ctx->segment_hdr.lf_level[mbi->base.segment_id];
    }
    /* NOTE(review): the level is only clamped once, after the deltas
     * below; confirm against the bitstream guide whether a clamp is
     * also expected right after the segment adjustment.
     */
    if (ctx->loopfilter_hdr.delta_enabled)
    {
        filter_level +=
            ctx->loopfilter_hdr.ref_delta[mbi->base.ref_frame];
        /* Mode delta: [0] for B_PRED in macroblocks predicted from the
         * current frame, [1] ZEROMV, [3] SPLITMV, [2] every other mode
         * that uses a reference frame.
         */
        if (mbi->base.ref_frame == CURRENT_FRAME)
        {
            if (mbi->base.y_mode == B_PRED)
                filter_level += ctx->loopfilter_hdr.mode_delta[0];
        }
        else if (mbi->base.y_mode == ZEROMV)
            filter_level += ctx->loopfilter_hdr.mode_delta[1];
        else if (mbi->base.y_mode == SPLITMV)
            filter_level += ctx->loopfilter_hdr.mode_delta[3];
        else
            filter_level += ctx->loopfilter_hdr.mode_delta[2];
    }
    if (filter_level > 63)
        filter_level = 63;
    else if (filter_level < 0)
        filter_level = 0;
    interior_limit = filter_level;
    if (ctx->loopfilter_hdr.sharpness)
    {
        interior_limit >>= ctx->loopfilter_hdr.sharpness > 4 ? 2 : 1;
        /* sharpness is unsigned, so this comparison and subtraction are
         * carried out in unsigned arithmetic.
         */
        if (interior_limit > 9 - ctx->loopfilter_hdr.sharpness)
            interior_limit = 9 - ctx->loopfilter_hdr.sharpness;
    }
    if (interior_limit < 1)
        interior_limit = 1;
    /* One step at level 15, one at 40, and one at 20 that only applies
     * to non-keyframes.
     */
    hev_threshold = (filter_level >= 15);
    if (filter_level >= 40)
        hev_threshold++;
    if (filter_level >= 20 && !ctx->frame_hdr.is_keyframe)
        hev_threshold++;
    *edge_limit_ = filter_level;
    *interior_limit_ = interior_limit;
    *hev_threshold_ = hev_threshold;
 }
 /* Apply the normal loop filter, in place, to num_cols macroblocks of
  * macroblock row `row` in the current frame, starting at start_col.
  *
  * Per macroblock the order is: left macroblock edge, interior vertical
  * edges, top macroblock edge, interior horizontal edges. The filters
  * modify the image as they go, so this order is part of the result.
  */
 static void
 filter_row_normal(struct vp8_decoder_ctx *ctx,
                   unsigned int            row,
                   unsigned int            start_col,
                   unsigned int            num_cols)
 {
    unsigned char  *y, *u, *v;
    int             stride, uv_stride;
    struct mb_info *mbi;
    unsigned int    col;
    /* Adjust pointers based on row, start_col */
    stride    = ctx->ref_frames[CURRENT_FRAME]->img.stride[PLANE_Y];
    uv_stride = ctx->ref_frames[CURRENT_FRAME]->img.stride[PLANE_U];
    y = ctx->ref_frames[CURRENT_FRAME]->img.planes[PLANE_Y];
    u = ctx->ref_frames[CURRENT_FRAME]->img.planes[PLANE_U];
    v = ctx->ref_frames[CURRENT_FRAME]->img.planes[PLANE_V];
    /* A macroblock covers 16x16 luma and 8x8 chroma pixels; the U
     * stride is used for both chroma planes.
     */
    y += (stride * row + start_col) * 16;
    u += (uv_stride * row + start_col) * 8;
    v += (uv_stride * row + start_col) * 8;
    mbi = ctx->mb_info_rows[row] + start_col;
    for (col = start_col; col < start_col + num_cols; col++)
    {
        int edge_limit, interior_limit, hev_threshold;
        /* TODO: only need to recalculate every MB if segmentation is
         * enabled.
         */
        calculate_filter_parameters(ctx, mbi, &edge_limit,
                                    &interior_limit, &hev_threshold);
        /* A level of zero leaves the macroblock unfiltered */
        if (edge_limit)
        {
            /* Left macroblock edge; column 0 has no left neighbour.
             * Macroblock edges use edge_limit + 2. The last argument is
             * the edge length in units of 8 pixels.
             */
            if (col)
            {
                filter_mb_v_edge(y, stride, edge_limit + 2,
                                 interior_limit, hev_threshold, 2);
                filter_mb_v_edge(u, uv_stride, edge_limit + 2,
                                 interior_limit, hev_threshold, 1);
                filter_mb_v_edge(v, uv_stride, edge_limit + 2,
                                 interior_limit, hev_threshold, 1);
            }
            /* NOTE: This conditional is actually dependent on the
             * number of coefficients decoded, not the skip flag as
             * coded in the bitstream. The tokens task is expected to
             * set 31 if there is *any* non-zero data.
             */
            if (mbi->base.eob_mask
                || mbi->base.y_mode == SPLITMV
                || mbi->base.y_mode == B_PRED)
            {
                /* Interior vertical edges: every 4 pixels in luma, the
                 * single middle edge in each chroma plane.
                 */
                filter_subblock_v_edge(y + 4, stride, edge_limit,
                                       interior_limit, hev_threshold,
                                       2);
                filter_subblock_v_edge(y + 8, stride, edge_limit,
                                       interior_limit, hev_threshold,
                                       2);
                filter_subblock_v_edge(y + 12, stride, edge_limit,
                                       interior_limit, hev_threshold,
                                       2);
                filter_subblock_v_edge(u + 4, uv_stride, edge_limit,
                                       interior_limit, hev_threshold,
                                       1);
                filter_subblock_v_edge(v + 4, uv_stride, edge_limit,
                                       interior_limit, hev_threshold,
                                       1);
            }
            /* Top macroblock edge; row 0 has no neighbour above */
            if (row)
            {
                filter_mb_h_edge(y, stride, edge_limit + 2,
                                 interior_limit, hev_threshold, 2);
                filter_mb_h_edge(u, uv_stride, edge_limit + 2,
                                 interior_limit, hev_threshold, 1);
                filter_mb_h_edge(v, uv_stride, edge_limit + 2,
                                 interior_limit, hev_threshold, 1);
            }
            /* Interior horizontal edges, same condition as above */
            if (mbi->base.eob_mask
                || mbi->base.y_mode == SPLITMV
                || mbi->base.y_mode == B_PRED)
            {
                filter_subblock_h_edge(y + 4 * stride, stride,
                                       edge_limit, interior_limit,
                                       hev_threshold, 2);
                filter_subblock_h_edge(y + 8 * stride, stride,
                                       edge_limit, interior_limit,
                                       hev_threshold, 2);
                filter_subblock_h_edge(y + 12 * stride, stride,
                                       edge_limit, interior_limit,
                                       hev_threshold, 2);
                filter_subblock_h_edge(u + 4 * uv_stride, uv_stride,
                                       edge_limit, interior_limit,
                                       hev_threshold, 1);
                filter_subblock_h_edge(v + 4 * uv_stride, uv_stride,
                                       edge_limit, interior_limit,
                                       hev_threshold, 1);
            }
        }
        /* Step to the next macroblock in the row */
        y += 16;
        u += 8;
        v += 8;
        mbi++;
    }
 }
 /* Apply the simple loop filter, in place, to num_cols macroblocks of
  * macroblock row `row` in the current frame, starting at start_col.
  * Only the luma plane is touched. Per macroblock the order is: left
  * edge, interior vertical edges, top edge, interior horizontal edges.
  */
 static void
 filter_row_simple(struct vp8_decoder_ctx *ctx,
                   unsigned int            row,
                   unsigned int            start_col,
                   unsigned int            num_cols)
 {
    unsigned char  *y;
    int             stride;
    struct mb_info *mbi;
    unsigned int    col;
    /* Adjust pointers based on row, start_col */
    stride    = ctx->ref_frames[CURRENT_FRAME]->img.stride[PLANE_Y];
    y = ctx->ref_frames[CURRENT_FRAME]->img.planes[PLANE_Y];
    y += (stride * row + start_col) * 16;
    mbi = ctx->mb_info_rows[row] + start_col;
    for (col = start_col; col < start_col + num_cols; col++)
    {
        int edge_limit, interior_limit, hev_threshold;
        /* TODO: only need to recalculate every MB if segmentation is
         * enabled.
         */
        calculate_filter_parameters(ctx, mbi, &edge_limit,
                                    &interior_limit, &hev_threshold);
        /* A level of zero leaves the macroblock unfiltered */
        if (edge_limit)
        {
            /* NOTE: This conditional is actually dependent on the
             * number of coefficients decoded, not the skip flag as
             * coded in the bitstream. The tokens task is expected to
             * set 31 if there is *any* non-zero data.
             */
            int filter_subblocks = (mbi->base.eob_mask
                                    || mbi->base.y_mode == SPLITMV
                                    || mbi->base.y_mode == B_PRED);
            /* Combined limits of the form 2 * E + I; macroblock edges
             * use E = edge_limit + 2, interior edges E = edge_limit.
             */
            int mb_limit = (edge_limit + 2) * 2 + interior_limit;
            int b_limit = edge_limit * 2 + interior_limit;
            /* Column 0 has no left neighbour */
            if (col)
                filter_v_edge_simple(y, stride, mb_limit);
            if (filter_subblocks)
            {
                filter_v_edge_simple(y + 4, stride, b_limit);
                filter_v_edge_simple(y + 8, stride, b_limit);
                filter_v_edge_simple(y + 12, stride, b_limit);
            }
            /* Row 0 has no neighbour above */
            if (row)
                filter_h_edge_simple(y, stride, mb_limit);
            if (filter_subblocks)
            {
                filter_h_edge_simple(y + 4 * stride, stride, b_limit);
                filter_h_edge_simple(y + 8 * stride, stride, b_limit);
                filter_h_edge_simple(y + 12 * stride, stride, b_limit);
            }
        }
        /* Step to the next macroblock in the row */
        y += 16;
        mbi++;
    }
 }
 void
 vp8_dixie_loopfilter_process_row(struct vp8_decoder_ctx *ctx,
                                 unsigned int            row,
                                 unsigned int            start_col,
                                 unsigned int            num_cols)
 {
    if (ctx->loopfilter_hdr.use_simple)
        filter_row_simple(ctx, row, start_col, num_cols);
    else
        filter_row_normal(ctx, row, start_col, num_cols);
 }
--- a/vp8/dixie/dixie_loopfilter.h
+++ b/vp8/dixie/dixie_loopfilter.h
@@ -1,19 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef DIXIE_LOOPFILTER_H
 #define DIXIE_LOOPFILTER_H
 /* Forward declaration so that this header can be included on its own;
  * the full definition lives in dixie.h.
  */
 struct vp8_decoder_ctx;
 /* Loop filter num_cols macroblocks of macroblock row `row` of the
  * current frame in place, starting at column start_col.
  */
 void
 vp8_dixie_loopfilter_process_row(struct vp8_decoder_ctx *ctx,
                                 unsigned int            row,
                                 unsigned int            start_col,
                                 unsigned int            num_cols);
 #endif
--- a/vp8/dixie/idct_add.c
+++ b/vp8/dixie/idct_add.c
@@ -1,142 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "dixie.h"
 #include "idct_add.h"
 #include <assert.h>
 /* 4x4 inverse Walsh-Hadamard style transform. A butterfly pass down
  * the columns of input is written to output, then a second pass runs
  * along each row of output in place; results of the second pass are
  * rounded with (x + 3) >> 3. input and output each hold 16 values in
  * row-major order.
  */
 void
 vp8_dixie_walsh(const short *input, short *output)
 {
    int col, row;

    /* Pass 1: columns */
    for (col = 0; col < 4; col++)
    {
        const int outer_sum  = input[col] + input[col + 12];
        const int inner_sum  = input[col + 4] + input[col + 8];
        const int inner_diff = input[col + 4] - input[col + 8];
        const int outer_diff = input[col] - input[col + 12];

        output[col]      = outer_sum + inner_sum;
        output[col + 4]  = inner_diff + outer_diff;
        output[col + 8]  = outer_sum - inner_sum;
        output[col + 12] = outer_diff - inner_diff;
    }

    /* Pass 2: rows, in place, with rounding */
    for (row = 0; row < 4; row++)
    {
        short    *line = output + 4 * row;
        const int outer_sum  = line[0] + line[3];
        const int inner_sum  = line[1] + line[2];
        const int inner_diff = line[1] - line[2];
        const int outer_diff = line[0] - line[3];

        line[0] = (outer_sum + inner_sum + 3) >> 3;
        line[1] = (inner_diff + outer_diff + 3) >> 3;
        line[2] = (outer_sum - inner_sum + 3) >> 3;
        line[3] = (outer_diff - inner_diff + 3) >> 3;
    }
 }
 /* 16.16 fixed-point multipliers (value / 65536):
  *   20091 ~ sqrt(2) * cos(pi / 8) - 1
  *   35468 ~ sqrt(2) * sin(pi / 8)
  * rounding is the bias added before the >> 16 (none).
  */
 #define cospi8sqrt2minus1 20091
 #define sinpi8sqrt2       35468
 #define rounding          0
 /* Column pass of the 4x4 inverse DCT: transforms each of the four
  * columns of input (16 values, row-major) into the same column of
  * output. No rounding or scaling is applied here.
  */
 static void
 idct_columns(const short *input, short *output)
 {
    int col;

    for (col = 0; col < 4; col++)
    {
        const int row0 = input[col];
        const int row1 = input[col + 4];
        const int row2 = input[col + 8];
        const int row3 = input[col + 12];
        const int even_sum  = row0 + row2;
        const int even_diff = row0 - row2;
        const int odd_c =
            ((row1 * sinpi8sqrt2 + rounding) >> 16)
            - (row3 + ((row3 * cospi8sqrt2minus1 + rounding) >> 16));
        const int odd_d =
            (row1 + ((row1 * cospi8sqrt2minus1 + rounding) >> 16))
            + ((row3 * sinpi8sqrt2 + rounding) >> 16);

        output[col]      = even_sum + odd_d;
        output[col + 12] = even_sum - odd_d;
        output[col + 4]  = even_diff + odd_c;
        output[col + 8]  = even_diff - odd_c;
    }
 }
 /* Inverse DCT of one 4x4 block followed by reconstruction: the
  * residual is rounded with (x + 4) >> 3, added to the prediction and
  * clamped to [0, 255]. recon and predict are advanced by stride for
  * each of the four rows; coeffs holds 16 values in row-major order.
  */
 void
 vp8_dixie_idct_add(unsigned char        *recon,
                   const unsigned char  *predict,
                   int                   stride,
                   const short          *coeffs)
 {
    short        column_pass[16];
    const short *line = column_pass;
    int          row;

    idct_columns(coeffs, column_pass);

    /* Row pass over the column-transformed data */
    for (row = 0; row < 4; row++)
    {
        const int even_sum  = line[0] + line[2];
        const int even_diff = line[0] - line[2];
        const int odd_c =
            ((line[1] * sinpi8sqrt2 + rounding) >> 16)
            - (line[3] + ((line[3] * cospi8sqrt2minus1 + rounding) >> 16));
        const int odd_d =
            (line[1] + ((line[1] * cospi8sqrt2minus1 + rounding) >> 16))
            + ((line[3] * sinpi8sqrt2 + rounding) >> 16);

        recon[0] = CLAMP_255(predict[0] + ((even_sum + odd_d + 4) >> 3));
        recon[3] = CLAMP_255(predict[3] + ((even_sum - odd_d + 4) >> 3));
        recon[1] = CLAMP_255(predict[1] + ((even_diff + odd_c + 4) >> 3));
        recon[2] = CLAMP_255(predict[2] + ((even_diff - odd_c + 4) >> 3));

        line    += 4;
        recon   += stride;
        predict += stride;
    }
 }
--- a/vp8/dixie/idct_add.h
+++ b/vp8/dixie/idct_add.h
@@ -1,35 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef IDCT_ADD_H
 #define IDCT_ADD_H
 /* Forward declaration so that this header can be included on its own;
  * the full definition lives in dixie.h.
  */
 struct vp8_decoder_ctx;
 /* NOTE(review): no definition of this function is visible in
  * idct_add.c -- confirm it is still provided somewhere.
  */
 void
 vp8_dixie_idct_add_init(struct vp8_decoder_ctx *ctx);
 /* Inverse DCT of a 4x4 block of coeffs, added to predict and written,
  * clamped to [0, 255], to recon; both pixel pointers use stride.
  */
 void
 vp8_dixie_idct_add(unsigned char        *recon,
                   const unsigned char  *predict,
                   int                   stride,
                   const short          *coeffs);
 /* 4x4 inverse Walsh-Hadamard style transform of 16 values. */
 void
 vp8_dixie_walsh(const short *in, short *out);
 /* NOTE(review): no definition of this function is visible in
  * idct_add.c -- confirm it is still provided somewhere.
  */
 void
 vp8_dixie_idct_add_process_row(struct vp8_decoder_ctx *ctx,
                               short                  *coeffs,
                               unsigned int            row,
                               unsigned int            start_col,
                               unsigned int            num_cols);
 #endif
--- a/vp8/dixie/modemv.c
+++ b/vp8/dixie/modemv.c
@@ -1,686 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "dixie.h"
 #include "modemv_data.h"
 #include <stdlib.h>
 #include <assert.h>
 /* Limits applied to a motion vector by clamp_mv(): the x component is
  * held within [to_left, to_right] and the y component within
  * [to_top, to_bottom].
  */
 struct mv_clamp_rect
 {
    int to_left;
    int to_right;
    int to_top;
    int to_bottom;
 };
 static union mv
        clamp_mv(union mv raw, const struct mv_clamp_rect *bounds)
 {
    union mv newmv;
    newmv.d.x = (raw.d.x < bounds->to_left)
                ? bounds->to_left : raw.d.x;
    newmv.d.x = (raw.d.x > bounds->to_right)
                ? bounds->to_right : newmv.d.x;
    newmv.d.y = (raw.d.y < bounds->to_top)
                ? bounds->to_top : raw.d.y;
    newmv.d.y = (raw.d.y > bounds->to_bottom)
                ? bounds->to_bottom : newmv.d.y;
    return newmv;
 }
 /* Read a segment id in the range 0..3 using a two-level tree:
  * tree_probs[0] selects the upper (2,3) or lower (0,1) pair, then
  * tree_probs[2] or tree_probs[1] selects within that pair.
  */
 static int
 read_segment_id(struct bool_decoder *bool, struct vp8_segment_hdr *seg)
 {
    if (bool_get(bool, seg->tree_probs[0]))
        return 2 + bool_get(bool, seg->tree_probs[2]);
    return bool_get(bool, seg->tree_probs[1]);
 }
 /* Return the prediction mode of the 4x4 block directly above block b.
  * For b >= 4 that block lies in the current macroblock.  For the top
  * row it lies in the macroblock above: its sub-block mode when that
  * macroblock is B_PRED, otherwise the 4x4 equivalent of its 16x16 mode.
  */
 static enum prediction_mode
 above_block_mode(const struct mb_info *this,
                  const struct mb_info *above,
                  unsigned int b)
 {
    if (b >= 4)
        return this->split.modes[b-4];
    switch (above->base.y_mode)
    {
    case B_PRED:
        return above->split.modes[b+12];
    case DC_PRED:
        return B_DC_PRED;
    case V_PRED:
        return B_VE_PRED;
    case H_PRED:
        return B_HE_PRED;
    case TM_PRED:
        return B_TM_PRED;
    default:
        assert(0);
        break;
    }
    /* Only reached when the assertion above is compiled out. */
    return this->split.modes[b-4];
 }
 /* Return the prediction mode of the 4x4 block directly left of block b.
  * When b is not in the first column that block lies in the current
  * macroblock.  Otherwise it lies in the macroblock to the left: its
  * sub-block mode when that macroblock is B_PRED, otherwise the 4x4
  * equivalent of its 16x16 mode.
  */
 static enum prediction_mode
 left_block_mode(const struct mb_info *this,
                 const struct mb_info *left,
                 unsigned int b)
 {
    if (b & 3)
        return this->split.modes[b-1];
    switch (left->base.y_mode)
    {
    case B_PRED:
        return left->split.modes[b+3];
    case DC_PRED:
        return B_DC_PRED;
    case V_PRED:
        return B_VE_PRED;
    case H_PRED:
        return B_HE_PRED;
    case TM_PRED:
        return B_TM_PRED;
    default:
        assert(0);
        break;
    }
    /* Only reached when the assertion above is compiled out. */
    return this->split.modes[b-1];
 }
 static void
 decode_kf_mb_mode(struct mb_info      *this,
                  struct mb_info      *left,
                  struct mb_info      *above,
                  struct bool_decoder *bool)
 {
    int y_mode, uv_mode;
    y_mode = bool_read_tree(bool, kf_y_mode_tree, kf_y_mode_probs);
    if (y_mode == B_PRED)
    {
        unsigned int i;
        for (i = 0; i < 16; i++)
        {
            enum prediction_mode a = above_block_mode(this, above, i);
            enum prediction_mode l = left_block_mode(this, left, i);
            enum prediction_mode b;
            b = bool_read_tree(bool, b_mode_tree,
                               kf_b_mode_probs[a][l]);
            this->split.modes[i] = b;
        }
    }
    uv_mode = bool_read_tree(bool, uv_mode_tree, kf_uv_mode_probs);
    this->base.y_mode = y_mode;
    this->base.uv_mode = uv_mode;
    this->base.mv.raw = 0;
    this->base.ref_frame = 0;
 }
 static void
 decode_intra_mb_mode(struct mb_info         *this,
                     struct vp8_entropy_hdr *hdr,
                     struct bool_decoder    *bool)
 {
    /* Like decode_kf_mb_mode, but with probabilities transmitted in the
     * bitstream and no context on the above/left block mode.
     */
    int y_mode, uv_mode;
    y_mode = bool_read_tree(bool, y_mode_tree, hdr->y_mode_probs);
    if (y_mode == B_PRED)
    {
        unsigned int i;
        for (i = 0; i < 16; i++)
        {
            enum prediction_mode b;
            b = bool_read_tree(bool, b_mode_tree, default_b_mode_probs);
            this->split.modes[i] = b;
        }
    }
    uv_mode = bool_read_tree(bool, uv_mode_tree, hdr->uv_mode_probs);
    this->base.y_mode = y_mode;
    this->base.uv_mode = uv_mode;
    this->base.mv.raw = 0;
    this->base.ref_frame = CURRENT_FRAME;
 }
 /* Read one signed motion vector component.
  * mvc holds the probabilities for this component: mvc[IS_SHORT] codes
  * the short/long flag, mvc[SIGN] the sign, mvc[SHORT...] the tree for
  * short magnitudes and mvc[BITS...] the individual bits of long ones.
  * Returns the decoded value multiplied by two.
  */
 static int
 read_mv_component(struct bool_decoder *bool,
                   const unsigned char  mvc[MV_PROB_CNT])
 {
    enum {IS_SHORT, SIGN, SHORT, BITS = SHORT + 8 - 1, LONG_WIDTH = 10};
    int x = 0;
    if (bool_get(bool, mvc[IS_SHORT])) /* Large */
    {
        int i = 0;
        /* Bits 0..2 come first, least significant first */
        for (i = 0; i < 3; i++)
            x += bool_get(bool, mvc[BITS + i]) << i;
        /* Skip bit 3, which is sometimes implicit */
        for (i = LONG_WIDTH - 1; i > 3; i--)
            x += bool_get(bool, mvc[BITS + i]) << i;
        /* With no bit above bit 3 set, bit 3 is taken as set without
         * being read; otherwise it is read from the stream.
         */
        if (!(x & 0xFFF0)  ||  bool_get(bool, mvc[BITS + 3]))
            x += 8;
    }
    else   /* small */
        x = bool_read_tree(bool, small_mv_tree, mvc + SHORT);
    /* A sign is only coded for non-zero magnitudes */
    if (x && bool_get(bool, mvc[SIGN]))
        x = -x;
    /* Multiply instead of shifting: x may be negative here, and
     * left-shifting a negative signed value is undefined behaviour.
     */
    return x * 2;
 }
 /* Return the motion vector of the 4x4 block directly above block b.
  * For the top row this comes from the macroblock above: its bottom-row
  * sub-block vector when that macroblock is SPLITMV, otherwise its
  * single macroblock vector.
  */
 static mv_t
 above_block_mv(const struct mb_info *this,
                const struct mb_info *above,
                unsigned int          b)
 {
    if (b >= 4)
        return this->split.mvs[b-4];
    if (above->base.y_mode != SPLITMV)
        return above->base.mv;
    return above->split.mvs[b+12];
 }
 /* Return the motion vector of the 4x4 block directly left of block b.
  * For the first column this comes from the macroblock to the left: its
  * right-column sub-block vector when that macroblock is SPLITMV,
  * otherwise its single macroblock vector.
  */
 static mv_t
 left_block_mv(const struct mb_info *this,
               const struct mb_info *left,
               unsigned int          b)
 {
    if (b & 3)
        return this->split.mvs[b-1];
    if (left->base.y_mode != SPLITMV)
        return left->base.mv;
    return left->split.mvs[b+3];
 }
 /* Read a sub-block motion vector mode.  The probability row is chosen
  * from how the left (l) and above (a) vectors relate: equal and zero,
  * equal and non-zero, above zero, left zero, or none of these.
  */
 static enum prediction_mode
 submv_ref(struct bool_decoder *bool, union mv l, union mv a)
 {
    /* Declaration order matters: the values index submv_ref_probs2. */
    enum subblock_mv_ref
    {
        SUBMVREF_NORMAL,
        SUBMVREF_LEFT_ZED,
        SUBMVREF_ABOVE_ZED,
        SUBMVREF_LEFT_ABOVE_SAME,
        SUBMVREF_LEFT_ABOVE_ZED
    };
    enum subblock_mv_ref ctx;
    if (l.raw == a.raw)
    {
        if (l.raw)
            ctx = SUBMVREF_LEFT_ABOVE_SAME;
        else
            ctx = SUBMVREF_LEFT_ABOVE_ZED;
    }
    else if (!a.raw)
        ctx = SUBMVREF_ABOVE_ZED;
    else if (!l.raw)
        ctx = SUBMVREF_LEFT_ZED;
    else
        ctx = SUBMVREF_NORMAL;
    return bool_read_tree(bool, submv_ref_tree, submv_ref_probs2[ctx]);
 }
 /* Read a complete motion vector into *mv.  The y component is read
  * first, with mvc[0], followed by the x component, with mvc[1].
  */
 static void
 read_mv(struct bool_decoder  *bool,
         union mv             *mv,
         mv_component_probs_t  mvc[2])
 {
    int vert, horiz;
    vert = read_mv_component(bool, mvc[0]);
    horiz = read_mv_component(bool, mvc[1]);
    mv->d.y = vert;
    mv->d.x = horiz;
 }
 /* Negate *mv when the sign bias of the frame referenced by mb differs
  * from the sign bias of ref_frame; otherwise leave it untouched.
  */
 static void
 mv_bias(const struct mb_info *mb,
         const unsigned int   sign_bias[3],
         enum reference_frame ref_frame,
         union mv             *mv)
 {
    if (sign_bias[mb->base.ref_frame] == sign_bias[ref_frame])
        return;
    mv->d.x = -mv->d.x;
    mv->d.y = -mv->d.y;
 }
 /* Slot indices into the near_mvs[] / cnt[] arrays filled in by
  * find_near_mvs.  CNT_BEST deliberately aliases CNT_ZEROZERO: slot 0
  * first counts zero vectors and is later reused for the "best" vector.
  */
 enum near_mv_v
 {
    CNT_ZEROZERO = 0,
    CNT_BEST     = CNT_ZEROZERO,
    CNT_NEAREST  = 1,
    CNT_NEAR     = 2,
    CNT_SPLITMV  = 3
 };
 /* Survey the above, left and above-left neighbours of a macroblock and
  * collect candidate motion vectors.
  *
  * this      - macroblock being decoded; only base.ref_frame is read
  * left      - macroblock to the left
  * above     - macroblock above; the above-left one is taken as above - 1
  * sign_bias - per reference frame sign bias, passed on to mv_bias()
  * near_mvs  - out: slot 0 ends up holding the "best" vector, slots
  *             CNT_NEAREST and CNT_NEAR the two ranked candidates
  * cnt       - out: weights for slots 0..2; cnt[CNT_SPLITMV] ends up
  *             holding a weighted count of SPLITMV neighbours
  *
  * Neighbours whose ref_frame is CURRENT_FRAME are ignored.  Above and
  * left each contribute weight 2, above-left contributes weight 1.
  */
 static void
 find_near_mvs(const struct mb_info   *this,
               const struct mb_info   *left,
               const struct mb_info   *above,
               const unsigned int      sign_bias[3],
               union  mv               near_mvs[4],
               int                     cnt[4])
 {
    const struct mb_info *aboveleft = above - 1;
    /* mv and cntx advance in lock-step: each new distinct non-zero
     * vector takes the next slot in both arrays.
     */
    union  mv             *mv = near_mvs;
    int                   *cntx = cnt;
    /* Zero accumulators */
    mv[0].raw = mv[1].raw = mv[2].raw = 0;
    cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
    /* Process above */
    if (above->base.ref_frame != CURRENT_FRAME)
    {
        if (above->base.mv.raw)
        {
            (++mv)->raw = above->base.mv.raw;
            mv_bias(above, sign_bias, this->base.ref_frame, mv);
            ++cntx;
        }
        /* Lands on slot 0 when the above vector is zero */
        *cntx += 2;
    }
    /* Process left */
    if (left->base.ref_frame != CURRENT_FRAME)
    {
        if (left->base.mv.raw)
        {
            union mv this_mv;
            this_mv.raw = left->base.mv.raw;
            mv_bias(left, sign_bias, this->base.ref_frame, &this_mv);
            /* Only open a new slot if it differs from the latest one */
            if (this_mv.raw != mv->raw)
            {
                (++mv)->raw = this_mv.raw;
                ++cntx;
            }
            *cntx += 2;
        }
        else
            cnt[CNT_ZEROZERO] += 2;
    }
    /* Process above left */
    if (aboveleft->base.ref_frame != CURRENT_FRAME)
    {
        if (aboveleft->base.mv.raw)
        {
            union mv this_mv;
            this_mv.raw = aboveleft->base.mv.raw;
            mv_bias(aboveleft, sign_bias, this->base.ref_frame,
                    &this_mv);
            if (this_mv.raw != mv->raw)
            {
                (++mv)->raw = this_mv.raw;
                ++cntx;
            }
            *cntx += 1;
        }
        else
            cnt[CNT_ZEROZERO] += 1;
    }
    /* If we have three distinct MV's ... */
    /* (cnt[3] is only non-zero here if the walk above reached slot 3) */
    if (cnt[CNT_SPLITMV])
    {
        /* See if above-left MV can be merged with NEAREST */
        if (mv->raw == near_mvs[CNT_NEAREST].raw)
            cnt[CNT_NEAREST] += 1;
    }
    /* From here on cnt[CNT_SPLITMV] is repurposed: weight 2 for each of
     * above/left in SPLITMV mode, weight 1 for above-left.
     */
    cnt[CNT_SPLITMV] = ((above->base.y_mode == SPLITMV)
                        + (left->base.y_mode == SPLITMV)) * 2
                       + (aboveleft->base.y_mode == SPLITMV);
    /* Swap near and nearest if necessary */
    if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
    {
        int tmp;
        tmp = cnt[CNT_NEAREST];
        cnt[CNT_NEAREST] = cnt[CNT_NEAR];
        cnt[CNT_NEAR] = tmp;
        tmp = near_mvs[CNT_NEAREST].raw;
        near_mvs[CNT_NEAREST].raw = near_mvs[CNT_NEAR].raw;
        near_mvs[CNT_NEAR].raw = tmp;
    }
    /* Use near_mvs[CNT_BEST] to store the "best" MV. Note that this
     * storage shares the same address as near_mvs[CNT_ZEROZERO].
     */
    if (cnt[CNT_NEAREST] >= cnt[CNT_BEST])
        near_mvs[CNT_BEST] = near_mvs[CNT_NEAREST];
 }
 /* Decode the per-partition motion vectors of a SPLITMV macroblock.
  *
  * this    - macroblock being decoded; base.partitioning and all 16
  *           entries of split.mvs are written
  * left    - macroblock to the left, source of left-neighbour vectors
  * above   - macroblock above, source of above-neighbour vectors
  * hdr     - entropy header supplying mv_probs for NEW4X4 vectors
  * best_mv - base vector added to every NEW4X4 vector read
  * bool    - bitstream reader
  */
 static void
 decode_split_mv(struct mb_info         *this,
                 const struct mb_info   *left,
                 const struct mb_info   *above,
                 struct vp8_entropy_hdr *hdr,
                 union  mv              *best_mv,
                 struct bool_decoder    *bool)
 {
    const int *partition;
    int        j, k, mask, partition_id;
    partition_id = bool_read_tree(bool, split_mv_tree, split_mv_probs);
    /* partition[k] is the partition number that sub-block k belongs to */
    partition = mv_partitions[partition_id];
    this->base.partitioning = partition_id;
    /* mask gets one bit per sub-block that has received a vector; the
     * loop ends once all 16 bits are set.  j is the partition number.
     */
    for (j = 0, mask = 0; mask < 65535; j++)
    {
        union mv mv, left_mv, above_mv;
        enum prediction_mode subblock_mode;
        /* Find the first subblock in this partition. */
        for (k = 0; j != partition[k]; k++);
        /* Decode the next MV */
        left_mv = left_block_mv(this, left, k);
        above_mv = above_block_mv(this, above, k);
        subblock_mode = submv_ref(bool, left_mv,  above_mv);
        switch (subblock_mode)
        {
        case LEFT4X4:
            mv = left_mv;
            break;
        case ABOVE4X4:
            mv = above_mv;
            break;
        case ZERO4X4:
            mv.raw = 0;
            break;
        case NEW4X4:
            read_mv(bool, &mv, hdr->mv_probs);
            mv.d.x += best_mv->d.x;
            mv.d.y += best_mv->d.y;
            break;
        default:
            /* NOTE(review): with assertions compiled out, mv is used
             * uninitialized below.
             */
            assert(0);
        }
        /* Fill the MV's for this partition */
        for (; k < 16; k++)
            if (j == partition[k])
            {
                this->split.mvs[k] = mv;
                mask |= 1 << k;
            }
    }
 }
 /* Report whether a b_w x b_w block at (l, t), displaced by mv, comes
  * too close to the edge of a w x h frame.  The vector is reduced with
  * >> 3 before use.  Returns 1 if the halved distance is below 2 on the
  * left/top or below 3 on the right/bottom, else 0.
  */
 static int
 need_mc_border(union mv mv, int l, int t, int b_w, int w, int h)
 {
    /* Distance to edge for the displaced top-left pixel */
    const int from_left = l + (mv.d.x >> 3);
    const int from_top = t + (mv.d.y >> 3);
    /* Distance to edge for the displaced bottom-right pixel */
    const int from_right = w - (from_left + b_w);
    const int from_bottom = h - (from_top + b_w);
    if ((from_left >> 1) < 2)
        return 1;
    if ((from_right >> 1) < 3)
        return 1;
    if ((from_top >> 1) < 2)
        return 1;
    if ((from_bottom >> 1) < 3)
        return 1;
    return 0;
 }
 /* Decode the reference frame, inter mode and motion vector(s) of one
  * inter macroblock and set its need_mc_border flag.
  *
  * ctx    - decoder context (entropy header, sign bias, frame size)
  * this   - macroblock being decoded
  * left   - macroblock to the left (used for SPLITMV decoding)
  * above  - macroblock above
  * bounds - clamp limits for this macroblock position
  * bool   - bitstream reader
  */
 static void
 decode_mvs(struct vp8_decoder_ctx       *ctx,
            struct mb_info               *this,
            const struct mb_info         *left,
            const struct mb_info         *above,
            const struct mv_clamp_rect   *bounds,
            struct bool_decoder          *bool)
 {
    struct vp8_entropy_hdr *hdr = &ctx->entropy_hdr;
    union mv          near_mvs[4];
    union mv          clamped_best_mv;
    int               mv_cnts[4];
    unsigned char     probs[4];
    enum {BEST, NEAREST, NEAR};
    int x, y, w, h, b;
    /* Reference frame is 1, or 2/3 depending on a second flag */
    this->base.ref_frame = bool_get(bool, hdr->prob_last)
                           ? 2 + bool_get(bool, hdr->prob_gf)
                           : 1;
    /* NOTE(review): this - 1 is passed instead of left; the caller in
     * this file passes this - 1 as left, so the two coincide there.
     */
    find_near_mvs(this, this - 1, above, ctx->reference_hdr.sign_bias,
                  near_mvs, mv_cnts);
    /* Each mode-tree probability is selected by its neighbour count */
    probs[0] = mv_counts_to_probs[mv_cnts[0]][0];
    probs[1] = mv_counts_to_probs[mv_cnts[1]][1];
    probs[2] = mv_counts_to_probs[mv_cnts[2]][2];
    probs[3] = mv_counts_to_probs[mv_cnts[3]][3];
    this->base.y_mode = bool_read_tree(bool, mv_ref_tree, probs);
    this->base.uv_mode = this->base.y_mode;
    this->base.need_mc_border = 0;
    /* Recover the macroblock position from the clamp limits (which
     * include a 128 unit border) and scale it with >> 3; w and h are
     * the frame size at 16 units per macroblock.
     */
    x = (-bounds->to_left - 128) >> 3;
    y = (-bounds->to_top - 128) >> 3;
    w = ctx->mb_cols * 16;
    h = ctx->mb_rows * 16;
    switch (this->base.y_mode)
    {
    case NEARESTMV:
        this->base.mv = clamp_mv(near_mvs[NEAREST], bounds);
        break;
    case NEARMV:
        this->base.mv = clamp_mv(near_mvs[NEAR], bounds);
        break;
    case ZEROMV:
        this->base.mv.raw = 0;
        return; //skip need_mc_border check
    case NEWMV:
        /* The vector read from the stream is relative to the clamped
         * best vector.
         */
        clamped_best_mv = clamp_mv(near_mvs[BEST], bounds);
        read_mv(bool, &this->base.mv, hdr->mv_probs);
        this->base.mv.d.x += clamped_best_mv.d.x;
        this->base.mv.d.y += clamped_best_mv.d.y;
        break;
    case SPLITMV:
    {
        union mv          chroma_mv[4] = {{{0}}};
        clamped_best_mv = clamp_mv(near_mvs[BEST], bounds);
        decode_split_mv(this, left, above, hdr, &clamped_best_mv, bool);
        /* The macroblock-level vector mirrors the last sub-block */
        this->base.mv = this->split.mvs[15];
        /* Sum the luma vectors of each 2x2 group of sub-blocks into one
         * chroma vector and check every luma sub-block against the
         * frame edge.  The loop stops at the first sub-block needing a
         * border, leaving the sums incomplete; the flag is already set
         * by then.
         */
        for (b = 0; b < 16; b++)
        {
            chroma_mv[(b>>1&1) + (b>>2&2)].d.x +=
                this->split.mvs[b].d.x;
            chroma_mv[(b>>1&1) + (b>>2&2)].d.y +=
                this->split.mvs[b].d.y;
            if (need_mc_border(this->split.mvs[b],
            x + (b & 3) * 4, y + (b & ~3), 4, w, h))
            {
                this->base.need_mc_border = 1;
                break;
            }
        }
        for (b = 0; b < 4; b++)
        {
            /* Offset by +4, or by -4 when the >> 31 term yields -1,
             * then divide by 4 -- presumably sign-aware rounding;
             * TODO confirm.
             */
            chroma_mv[b].d.x += 4 + 8 * (chroma_mv[b].d.x >> 31);
            chroma_mv[b].d.y += 4 + 8 * (chroma_mv[b].d.y >> 31);
            chroma_mv[b].d.x /= 4;
            chroma_mv[b].d.y /= 4;
            //note we're passing in non-subsampled coordinates
            if (need_mc_border(chroma_mv[b],
            x + (b & 1) * 8, y + (b >> 1) * 8, 16, w, h))
            {
                this->base.need_mc_border = 1;
                break;
            }
        }
        return; //skip need_mc_border check
    }
    default:
        /* NOTE(review): with assertions compiled out, control reaches
         * the border check below with base.mv not set by this call.
         */
        assert(0);
    }
    /* Whole-macroblock check for NEARESTMV, NEARMV and NEWMV */
    if (need_mc_border(this->base.mv, x, y, 16, w, h))
        this->base.need_mc_border = 1;
 }
 /* Decode segment id, skip flag and prediction mode / motion vectors
  * for num_cols macroblocks of row, starting at start_col.
  * Key frames use decode_kf_mb_mode.  Other frames read an inter/intra
  * flag and use decode_mvs or decode_intra_mb_mode accordingly.
  */
 void
 vp8_dixie_modemv_process_row(struct vp8_decoder_ctx *ctx,
                              struct bool_decoder    *bool,
                              int                     row,
                              int                     start_col,
                              int                     num_cols)
 {
    struct mv_clamp_rect  bounds;
    struct mb_info       *cur = ctx->mb_info_rows[row] + start_col;
    struct mb_info       *up = ctx->mb_info_rows[row - 1] + start_col;
    unsigned int          col = start_col;
    /* Eighth-pel MV bounds with a 1 MB border (one MB is 1 << 7). */
    bounds.to_top    = -((row + 1) << 7);
    bounds.to_bottom = (ctx->mb_rows - row) << 7;
    bounds.to_left   = -((start_col + 1) << 7);
    bounds.to_right  = (ctx->mb_cols - start_col) << 7;
    while (col < start_col + num_cols)
    {
        if (ctx->segment_hdr.update_map)
            cur->base.segment_id = read_segment_id(bool,
                                                   &ctx->segment_hdr);
        if (ctx->entropy_hdr.coeff_skip_enabled)
            cur->base.skip_coeff = bool_get(bool,
                                            ctx->entropy_hdr.coeff_skip_prob);
        if (!ctx->frame_hdr.is_keyframe)
        {
            if (bool_get(bool, ctx->entropy_hdr.prob_inter))
                decode_mvs(ctx, cur, cur - 1, up, &bounds, bool);
            else
                decode_intra_mb_mode(cur, &ctx->entropy_hdr, bool);
            /* Slide the horizontal limits one macroblock to the right */
            bounds.to_left -= 16 << 3;
            bounds.to_right -= 16 << 3;
        }
        else
        {
            /* Key frame without a transmitted map: segment 0 */
            if (!ctx->segment_hdr.update_map)
                cur->base.segment_id = 0;
            decode_kf_mb_mode(cur, cur - 1, up, bool);
        }
        /* Advance to next mb */
        cur++;
        up++;
        col++;
    }
 }
 /* Allocate (or reuse) the macroblock info array and its table of row
  * pointers.  One extra column and one extra row are reserved so that
  * the left and above neighbours of the first macroblocks exist.
  * Storage is released and reallocated whenever the frame size changed.
  * NOTE(review): the calloc results are used without a NULL check.
  */
 void
 vp8_dixie_modemv_init(struct vp8_decoder_ctx *ctx)
 {
    const unsigned int row_len = ctx->mb_cols + 1; /* left border col */
    const unsigned int row_cnt = ctx->mb_rows + 1; /* above border row */
    unsigned int       r;
    if (ctx->frame_hdr.frame_size_updated)
    {
        free(ctx->mb_info_rows_storage);
        ctx->mb_info_rows_storage = NULL;
        free(ctx->mb_info_storage);
        ctx->mb_info_storage = NULL;
    }
    if (ctx->mb_info_storage == NULL)
        ctx->mb_info_storage = calloc(row_len * row_cnt,
                                      sizeof(*ctx->mb_info_storage));
    if (ctx->mb_info_rows_storage == NULL)
        ctx->mb_info_rows_storage = calloc(row_cnt,
                                           sizeof(*ctx->mb_info_rows_storage));
    /* Each row pointer skips the border column of its row */
    for (r = 0; r < row_cnt; r++)
        ctx->mb_info_rows_storage[r] =
            ctx->mb_info_storage + 1 + r * row_len;
    /* Skip the border row, so that index -1 addresses it */
    ctx->mb_info_rows = ctx->mb_info_rows_storage + 1;
 }
 /* Release the macroblock info array and the row pointer table and
  * clear both storage pointers.  ctx->mb_info_rows is left as it was.
  */
 void
 vp8_dixie_modemv_destroy(struct vp8_decoder_ctx *ctx)
 {
    free(ctx->mb_info_rows_storage);
    free(ctx->mb_info_storage);
    ctx->mb_info_rows_storage = NULL;
    ctx->mb_info_storage = NULL;
 }
--- a/vp8/dixie/modemv.h
+++ b/vp8/dixie/modemv.h
@@ -1,28 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef MODEMV_H
 #define MODEMV_H
 /* Allocate (or reuse) the per-macroblock mode/motion vector storage
  * held in ctx; storage is reallocated when the frame size changed.
  */
 void
 vp8_dixie_modemv_init(struct vp8_decoder_ctx *ctx);
 /* Free the storage set up by vp8_dixie_modemv_init. */
 void
 vp8_dixie_modemv_destroy(struct vp8_decoder_ctx *ctx);
 /* Decode prediction modes and motion vectors from bool for num_cols
  * macroblocks of row, starting at column start_col.
  */
 void
 vp8_dixie_modemv_process_row(struct vp8_decoder_ctx *ctx,
                             struct bool_decoder    *bool,
                             int                     row,
                             int                     start_col,
                             int                     num_cols);
 #endif
--- a/vp8/dixie/modemv_data.h
+++ b/vp8/dixie/modemv_data.h
@@ -1,216 +0,0 @@
 /* Fixed probabilities used with kf_y_mode_tree on key frames. */
 static const unsigned char kf_y_mode_probs[4] =
 {
  145, 156, 163, 128
 };
 /* Fixed probabilities used with uv_mode_tree on key frames. */
 static const unsigned char kf_uv_mode_probs[3] =
 {
  142, 114, 183
 };
 /* Key frame sub-block mode probabilities for b_mode_tree.  The first
  * index is the mode of the block above, the second the mode of the
  * block to the left; each row holds the nine tree node probabilities.
  */
 static const unsigned char kf_b_mode_probs[10][10][9] =
 {
  { /* above mode 0 */
    { /* left mode 0 */ 231, 120,  48,  89, 115, 113, 120, 152, 112},
    { /* left mode 1 */ 152, 179,  64, 126, 170, 118,  46,  70,  95},
    { /* left mode 2 */ 175,  69, 143,  80,  85,  82,  72, 155, 103},
    { /* left mode 3 */  56,  58,  10, 171, 218, 189,  17,  13, 152},
    { /* left mode 4 */ 144,  71,  10,  38, 171, 213, 144,  34,  26},
    { /* left mode 5 */ 114,  26,  17, 163,  44, 195,  21,  10, 173},
    { /* left mode 6 */ 121,  24,  80, 195,  26,  62,  44,  64,  85},
    { /* left mode 7 */ 170,  46,  55,  19, 136, 160,  33, 206,  71},
    { /* left mode 8 */  63,  20, 8, 114, 114, 208,  12,   9, 226},
    { /* left mode 9 */  81,  40,  11,  96, 182,  84,  29,  16,  36}
  },
  { /* above mode 1 */
    { /* left mode 0 */ 134, 183,  89, 137,  98, 101, 106, 165, 148},
    { /* left mode 1 */  72, 187, 100, 130, 157, 111,  32,  75,  80},
    { /* left mode 2 */  66, 102, 167,  99,  74,  62,  40, 234, 128},
    { /* left mode 3 */  41,  53, 9, 178, 241, 141,  26,   8, 107},
    { /* left mode 4 */ 104,  79,  12,  27, 217, 255,  87,  17,   7},
    { /* left mode 5 */  74,  43,  26, 146,  73, 166,  49,  23, 157},
    { /* left mode 6 */  65,  38, 105, 160,  51,  52,  31, 115, 128},
    { /* left mode 7 */  87,  68,  71,  44, 114,  51,  15, 186,  23},
    { /* left mode 8 */  47,  41,  14, 110, 182, 183,  21,  17, 194},
    { /* left mode 9 */  66,  45,  25, 102, 197, 189,  23,  18,  22}
  },
  { /* above mode 2 */
    { /* left mode 0 */  88,  88, 147, 150,  42,  46,  45, 196, 205},
    { /* left mode 1 */  43,  97, 183, 117,  85,  38,  35, 179,  61},
    { /* left mode 2 */  39,  53, 200,  87,  26,  21,  43, 232, 171},
    { /* left mode 3 */  56,  34,  51, 104, 114, 102,  29,  93,  77},
    { /* left mode 4 */ 107,  54,  32,  26,  51,   1,  81,  43,  31},
    { /* left mode 5 */  39,  28,  85, 171,  58, 165,  90,  98,  64},
    { /* left mode 6 */  34,  22, 116, 206,  23,  34,  43, 166,  73},
    { /* left mode 7 */  68,  25, 106,  22,  64, 171,  36, 225, 114},
    { /* left mode 8 */  34,  19,  21, 102, 132, 188,  16,  76, 124},
    { /* left mode 9 */  62,  18,  78,  95,  85,  57,  50,  48,  51}
  },
  { /* above mode 3 */
    { /* left mode 0 */ 193, 101,  35, 159, 215, 111,  89,  46, 111},
    { /* left mode 1 */  60, 148,  31, 172, 219, 228,  21,  18, 111},
    { /* left mode 2 */ 112, 113,  77,  85, 179, 255,  38, 120, 114},
    { /* left mode 3 */  40,  42, 1, 196, 245, 209,  10,  25, 109},
    { /* left mode 4 */ 100,  80, 8,  43, 154,   1,  51,  26,  71},
    { /* left mode 5 */  88,  43,  29, 140, 166, 213,  37,  43, 154},
    { /* left mode 6 */  61,  63,  30, 155,  67,  45,  68,   1, 209},
    { /* left mode 7 */ 142,  78,  78,  16, 255, 128,  34, 197, 171},
    { /* left mode 8 */  41,  40, 5, 102, 211, 183, 4,   1, 221},
    { /* left mode 9 */  51,  50,  17, 168, 209, 192,  23,  25,  82}
  },
  { /* above mode 4 */
    { /* left mode 0 */ 125,  98,  42,  88, 104,  85, 117, 175,  82},
    { /* left mode 1 */  95,  84,  53,  89, 128, 100, 113, 101,  45},
    { /* left mode 2 */  75,  79, 123,  47,  51, 128,  81, 171,   1},
    { /* left mode 3 */  57,  17, 5,  71, 102,  57,  53,  41,  49},
    { /* left mode 4 */ 115,  21, 2,  10, 102, 255, 166,  23,   6},
    { /* left mode 5 */  38,  33,  13, 121,  57,  73,  26,   1,  85},
    { /* left mode 6 */  41,  10,  67, 138,  77, 110,  90,  47, 114},
    { /* left mode 7 */ 101,  29,  16,  10,  85, 128, 101, 196,  26},
    { /* left mode 8 */  57,  18,  10, 102, 102, 213,  34,  20,  43},
    { /* left mode 9 */ 117,  20,  15,  36, 163, 128,  68,   1,  26}
  },
  { /* above mode 5 */
    { /* left mode 0 */ 138,  31,  36, 171,  27, 166,  38,  44, 229},
    { /* left mode 1 */  67,  87,  58, 169,  82, 115,  26,  59, 179},
    { /* left mode 2 */  63,  59,  90, 180,  59, 166,  93,  73, 154},
    { /* left mode 3 */  40,  40,  21, 116, 143, 209,  34,  39, 175},
    { /* left mode 4 */  57,  46,  22,  24, 128,   1,  54,  17,  37},
    { /* left mode 5 */  47,  15,  16, 183,  34, 223,  49,  45, 183},
    { /* left mode 6 */  46,  17,  33, 183,   6,  98,  15,  32, 183},
    { /* left mode 7 */  65,  32,  73, 115,  28, 128,  23, 128, 205},
    { /* left mode 8 */  40,   3, 9, 115,  51, 192,  18,   6, 223},
    { /* left mode 9 */  87,  37, 9, 115,  59,  77,  64,  21,  47}
  },
  { /* above mode 6 */
    { /* left mode 0 */ 104,  55,  44, 218,   9,  54,  53, 130, 226},
    { /* left mode 1 */  64,  90,  70, 205,  40,  41,  23,  26,  57},
    { /* left mode 2 */  54,  57, 112, 184,   5,  41,  38, 166, 213},
    { /* left mode 3 */  30,  34,  26, 133, 152, 116,  10,  32, 134},
    { /* left mode 4 */  75,  32,  12,  51, 192, 255, 160,  43,  51},
    { /* left mode 5 */  39,  19,  53, 221,  26, 114,  32,  73, 255},
    { /* left mode 6 */  31,   9,  65, 234,   2,  15, 1, 118,  73},
    { /* left mode 7 */  88,  31,  35,  67, 102,  85,  55, 186,  85},
    { /* left mode 8 */  56,  21,  23, 111,  59, 205,  45,  37, 192},
    { /* left mode 9 */  55,  38,  70, 124,  73, 102, 1,  34,  98}
  },
  { /* above mode 7 */
    { /* left mode 0 */ 102,  61,  71,  37,  34,  53,  31, 243, 192},
    { /* left mode 1 */  69,  60,  71,  38,  73, 119,  28, 222,  37},
    { /* left mode 2 */  68,  45, 128,  34,   1,  47,  11, 245, 171},
    { /* left mode 3 */  62,  17,  19,  70, 146,  85,  55,  62,  70},
    { /* left mode 4 */  75,  15, 9,   9,  64, 255, 184, 119,  16},
    { /* left mode 5 */  37,  43,  37, 154, 100, 163,  85, 160,   1},
    { /* left mode 6 */  63,   9,  92, 136,  28,  64,  32, 201,  85},
    { /* left mode 7 */  86,   6,  28,   5,  64, 255,  25, 248,   1},
    { /* left mode 8 */  56,   8,  17, 132, 137, 255,  55, 116, 128},
    { /* left mode 9 */  58,  15,  20,  82, 135,  57,  26, 121,  40}
  },
  { /* above mode 8 */
    { /* left mode 0 */ 164,  50,  31, 137, 154, 133,  25,  35, 218},
    { /* left mode 1 */  51, 103,  44, 131, 131, 123,  31,   6, 158},
    { /* left mode 2 */  86,  40,  64, 135, 148, 224,  45, 183, 128},
    { /* left mode 3 */  22,  26,  17, 131, 240, 154,  14,   1, 209},
    { /* left mode 4 */  83,  12,  13,  54, 192, 255,  68,  47,  28},
    { /* left mode 5 */  45,  16,  21,  91,  64, 222, 7,   1, 197},
    { /* left mode 6 */  56,  21,  39, 155,  60, 138,  23, 102, 213},
    { /* left mode 7 */  85,  26,  85,  85, 128, 128,  32, 146, 171},
    { /* left mode 8 */  18,  11, 7,  63, 144, 171, 4,   4, 246},
    { /* left mode 9 */  35,  27,  10, 146, 174, 171,  12,  26, 128}
  },
  { /* above mode 9 */
    { /* left mode 0 */ 190,  80,  35,  99, 180,  80, 126,  54,  45},
    { /* left mode 1 */  85, 126,  47,  87, 176,  51,  41,  20,  32},
    { /* left mode 2 */ 101,  75, 128, 139, 118, 146, 116, 128,  85},
    { /* left mode 3 */  56,  41,  15, 176, 236,  85,  37,   9,  62},
    { /* left mode 4 */ 146,  36,  19,  30, 171, 255,  97,  27,  20},
    { /* left mode 5 */  71,  30,  17, 119, 118, 255,  17,  18, 138},
    { /* left mode 6 */ 101,  38,  60, 138,  55,  70,  43,  26, 142},
    { /* left mode 7 */ 138,  45,  61,  62, 219,   1,  81, 188,  64},
    { /* left mode 8 */  32,  41,  20, 117, 151, 142,  20,  21, 163},
    { /* left mode 9 */ 112,  19,  12,  61, 195, 128,  48,   4,  24}
  }
 };
 /* Tree for the key frame luma mode, read with kf_y_mode_probs.
  * NOTE(review): presumably positive entries index the next pair and
  * negated entries are leaf values -- confirm against bool_read_tree.
  */
 static const int kf_y_mode_tree[8] =
 {
  /* 0 */ -B_PRED,  2,
  /* 2 */ 4,        6,
  /* 4 */ -DC_PRED, -V_PRED,
  /* 6 */ -H_PRED,  -TM_PRED
 };
 /* Tree for the luma mode of intra macroblocks in non-key frames, read
  * with the y_mode_probs of the entropy header.
  */
 static const int y_mode_tree[8] =
 {
  /* 0 */ -DC_PRED, 2,
  /* 2 */ 4,        6,
  /* 4 */ -V_PRED,  -H_PRED,
  /* 6 */ -TM_PRED, -B_PRED
 };
 /* Tree for the chroma mode, shared by key frames and other frames. */
 static const int uv_mode_tree[6] =
 {
  /* 0 */ -DC_PRED, 2,
  /* 2 */ -V_PRED,  4,
  /* 4 */ -H_PRED,  -TM_PRED
 };
 /* Tree for the 4x4 sub-block intra modes.  Each row is one node; the
  * trailing comment gives the node number and name.
  */
 static const int b_mode_tree[18] =
 {
  -B_DC_PRED,  2,           /* 0 = DC_NODE */
  -B_TM_PRED,  4,           /* 1 = TM_NODE */
  -B_VE_PRED,  6,           /* 2 = VE_NODE */
  8,          12,           /* 3 = COM_NODE */
  -B_HE_PRED, 10,           /* 4 = HE_NODE */
  -B_RD_PRED, -B_VR_PRED,   /* 5 = RD_NODE */
  -B_LD_PRED, 14,           /* 6 = LD_NODE */
  -B_VL_PRED, 16,           /* 7 = VL_NODE */
  -B_HD_PRED, -B_HU_PRED    /* 8 = HD_NODE */
 };
 /* Tree for short motion vector magnitudes 0..7, used by
  * read_mv_component.  Leaves are stored negated.
  */
 static const int small_mv_tree[14] =
 {
  /*  0 */  2,  8,
  /*  2 */  4,  6,
  /*  4 */ -0, -1,
  /*  6 */ -2, -3,
  /*  8 */ 10, 12,
  /* 10 */ -4, -5,
  /* 12 */ -6, -7
 };
 /* Tree for the inter macroblock mode, read in decode_mvs with
  * probabilities derived from the neighbour counts.
  */
 static const int mv_ref_tree[8] =
 {
  /* 0 */ -ZEROMV,    2,
  /* 2 */ -NEARESTMV, 4,
  /* 4 */ -NEARMV,    6,
  /* 6 */ -NEWMV,     -SPLITMV
 };
 /* Tree for the sub-block motion vector mode, read in submv_ref. */
 static const int submv_ref_tree[6] =
 {
  /* 0 */ -LEFT4X4,  2,
  /* 2 */ -ABOVE4X4, 4,
  /* 4 */ -ZERO4X4,  -NEW4X4
 };
 /* Tree for the SPLITMV partitioning; the decoded value selects a row
  * of mv_partitions.  Leaves are stored negated.
  */
 static const int split_mv_tree[6] =
 {
  /* 0 */ -3,  2,
  /* 2 */ -2,  4,
  /* 4 */ -0, -1
 };
 /* Context-free probabilities for b_mode_tree, used for intra
  * macroblocks in non-key frames.
  */
 static const unsigned char default_b_mode_probs[9] =
 {
  120,  90,  79,
  133,  87,  85,
   80, 111, 151
 };
 /* Probabilities for mv_ref_tree.  The row is a neighbour count from
  * find_near_mvs (0..5), the column the tree node it applies to.
  */
 static const unsigned char mv_counts_to_probs[6][4] =
 {
  /* count 0 */ {   7,   1,   1, 143 },
  /* count 1 */ {  14,  18,  14, 107 },
  /* count 2 */ { 135,  64,  57,  68 },
  /* count 3 */ {  60,  56, 128,  65 },
  /* count 4 */ { 159, 134, 128,  34 },
  /* count 5 */ { 234, 188, 128,  28 }
 };
 /* Fixed probabilities used with split_mv_tree. */
 static const unsigned char split_mv_probs[3] =
 {
  110, 111, 150
 };
 /* Probabilities for submv_ref_tree, one row per left/above context
  * computed in submv_ref (rows follow the order of its context enum).
  */
 static const unsigned char submv_ref_probs2[5][3] =
 {
  /* 0 */ { 147, 136,  18 },
  /* 1 */ { 106, 145,   1 },
  /* 2 */ { 179, 121,   1 },
  /* 3 */ { 223,   1,  34 },
  /* 4 */ { 208,   1,   1 }
 };
 /* SPLITMV partition layouts, indexed by the value decoded with
  * split_mv_tree.  Entry k of a row is the partition number that 4x4
  * sub-block k belongs to.  Row 0 has two halves split top/bottom, row 1
  * two halves split left/right, row 2 four quarters and row 3 sixteen
  * individual sub-blocks.
  * The storage class comes first, as for every other table in this
  * file; placing it after a qualifier is an obsolescent form.
  */
 static const int mv_partitions[4][16] =
 {
  {0, 0, 0, 0, 0, 0, 0, 0, 1, 1,  1,  1,  1,  1,  1,  1 },
  {0, 0, 1, 1, 0, 0, 1, 1, 0, 0,  1,  1,  0,  0,  1,  1 },
  {0, 0, 1, 1, 0, 0, 1, 1, 2, 2,  3,  3,  2,  2,  3,  3 },
  {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }
 };
--- a/vp8/dixie/predict.c
+++ b/vp8/dixie/predict.c
--- a/vp8/dixie/predict.h
+++ b/vp8/dixie/predict.h
@@ -1,36 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef PREDICT_H
 #define PREDICT_H
 /* Entry points of the prediction stage and reference frame helpers.
  * NOTE(review): the implementations are not visible from this header;
  * the notes below are inferred from names only -- confirm in predict.c.
  */
 /* Initialization hook taking the decoder context. */
 void
 vp8_dixie_predict_init(struct vp8_decoder_ctx *ctx);
 /* Counterpart of vp8_dixie_predict_init -- presumably releases what it
  * set up; TODO confirm.
  */
 void
 vp8_dixie_predict_destroy(struct vp8_decoder_ctx *ctx);
 /* Row-level driver: handles num_cols macroblocks of row starting at
  * start_col -- presumably; verify against caller.
  */
 void
 vp8_dixie_predict_process_row(struct vp8_decoder_ctx *ctx,
                              unsigned int            row,
                              unsigned int            start_col,
                              unsigned int            num_cols);
 /* Reference counted image helpers -- presumably drop a reference, take
  * a reference, and pick an unused entry from frames; TODO confirm.
  */
 void
 vp8_dixie_release_ref_frame(struct ref_cnt_img *rcimg);
 struct ref_cnt_img *
 vp8_dixie_ref_frame(struct ref_cnt_img *rcimg);
 struct ref_cnt_img *
 vp8_dixie_find_free_ref_frame(struct ref_cnt_img *frames);
 #endif
--- a/vp8/dixie/tokens.c
+++ b/vp8/dixie/tokens.c
@@ -1,443 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx/internal/vpx_codec_internal.h"
 #include "dixie.h"
 #include "tokens.h"
 #include <stdlib.h>
 #include <string.h>
 #include <malloc.h>
 /* Positions of the decision nodes inside one probability set; used to
  * index prob[] while walking the token tree in decode_mb_tokens(). */
 enum
 {
    EOB_CONTEXT_NODE           = 0,
    ZERO_CONTEXT_NODE          = 1,
    ONE_CONTEXT_NODE           = 2,
    LOW_VAL_CONTEXT_NODE       = 3,
    TWO_CONTEXT_NODE           = 4,
    THREE_CONTEXT_NODE         = 5,
    HIGH_LOW_CONTEXT_NODE      = 6,
    CAT_ONE_CONTEXT_NODE       = 7,
    CAT_THREEFOUR_CONTEXT_NODE = 8,
    CAT_THREE_CONTEXT_NODE     = 9,
    CAT_FIVE_CONTEXT_NODE      = 10
 };
 /* Coefficient token identifiers.  They index the extrabits[] table;
  * MAX_ENTROPY_TOKENS is the number of tokens and sizes that table. */
 enum
 {
    ZERO_TOKEN         = 0,
    ONE_TOKEN          = 1,
    TWO_TOKEN          = 2,
    THREE_TOKEN        = 3,
    FOUR_TOKEN         = 4,
    DCT_VAL_CATEGORY1  = 5,
    DCT_VAL_CATEGORY2  = 6,
    DCT_VAL_CATEGORY3  = 7,
    DCT_VAL_CATEGORY4  = 8,
    DCT_VAL_CATEGORY5  = 9,
    DCT_VAL_CATEGORY6  = 10,
    DCT_EOB_TOKEN      = 11,
    MAX_ENTROPY_TOKENS = 12
 };
 /* Per-token description of the extra bits that follow the token. */
 struct extrabits
 {
    /* Smallest magnitude the token can represent; decoding starts from
     * this value and adds the extra bits on top. */
    short         min_val;
    /* Index of the highest extra bit (decode_mb_tokens() starts its
     * category 6 loop from it); -1 in the entries without a value. */
    short         length;
    /* Probability used to read each extra bit, indexed by bit position. */
    unsigned char probs[12];
 };
 /* Slot of the left entropy context used by each of the 25 blocks of a
  * macroblock (indexed by block number in decode_mb_tokens()). */
 static const unsigned int left_context_index[25] =
 {
    /* blocks  0..15 */
    0, 0, 0, 0,
    1, 1, 1, 1,
    2, 2, 2, 2,
    3, 3, 3, 3,
    /* blocks 16..23 */
    4, 4, 5, 5,
    6, 6, 7, 7,
    /* block 24 */
    8
 };
 /* Slot of the above entropy context used by each of the 25 blocks of a
  * macroblock (indexed by block number in decode_mb_tokens()). */
 static const unsigned int above_context_index[25] =
 {
    /* blocks  0..15 */
    0, 1, 2, 3,
    0, 1, 2, 3,
    0, 1, 2, 3,
    0, 1, 2, 3,
    /* blocks 16..23 */
    4, 5, 4, 5,
    6, 7, 6, 7,
    /* block 24 */
    8
 };
 /* Offset, in probability entries, from the start of a block type's
  * table to the band used at each coefficient position: the band number
  * scaled by the size of one band (PREV_COEF_CONTEXTS * ENTROPY_NODES). */
 #define BAND_OFFSET(band) ((band) * PREV_COEF_CONTEXTS * ENTROPY_NODES)
 static const unsigned int bands_x[16] =
 {
    BAND_OFFSET(0), BAND_OFFSET(1), BAND_OFFSET(2), BAND_OFFSET(3),
    BAND_OFFSET(6), BAND_OFFSET(4), BAND_OFFSET(5), BAND_OFFSET(6),
    BAND_OFFSET(6), BAND_OFFSET(6), BAND_OFFSET(6), BAND_OFFSET(6),
    BAND_OFFSET(6), BAND_OFFSET(6), BAND_OFFSET(6), BAND_OFFSET(7)
 };
 #undef BAND_OFFSET
 static const struct extrabits extrabits[MAX_ENTROPY_TOKENS] =
 {
    {  0, -1, {   0,   0,   0,   0,   0,   0,
                  0,   0,   0,   0,   0,   0   } }, //ZERO_TOKEN
    {  1, 0,  {   0,   0,   0,   0,   0,   0,
                  0,   0,   0,   0,   0,   0   } }, //ONE_TOKEN
    {  2, 0,  {   0,   0,   0,   0,   0,   0,
                  0,   0,   0,   0,   0,   0   } }, //TWO_TOKEN
    {  3, 0,  {   0,   0,   0,   0,   0,   0,
                  0,   0,   0,   0,   0,   0   } }, //THREE_TOKEN
    {  4, 0,  {   0,   0,   0,   0,   0,   0,
                  0,   0,   0,   0,   0,   0   } }, //FOUR_TOKEN
    {  5, 0,  { 159,   0,   0,   0,   0,   0,
                  0,   0,   0,   0,   0,   0   } }, //DCT_VAL_CATEGORY1
    {  7, 1,  { 145, 165,   0,   0,   0,   0,
                  0,   0,   0,   0,   0,   0   } }, //DCT_VAL_CATEGORY2
    { 11, 2,  { 140, 148, 173,   0,   0,   0,
                  0,   0,   0,   0,   0,   0   } }, //DCT_VAL_CATEGORY3
    { 19, 3,  { 135, 140, 155, 176,   0,   0,
                  0,   0,   0,   0,   0,   0   } }, //DCT_VAL_CATEGORY4
    { 35, 4,  { 130, 134, 141, 157, 180,   0,
                  0,   0,   0,   0,   0,   0   } }, //DCT_VAL_CATEGORY5
    { 67, 10, { 129, 130, 133, 140, 153, 177,
                196, 230, 243, 254, 254,   0   } }, //DCT_VAL_CATEGORY6
    {  0, -1, {   0,   0,   0,   0,   0,   0,
                  0,   0,   0,   0,   0,   0   } }, // EOB TOKEN
 };
 /* Position inside the 16-entry coefficient block at which the c-th
  * decoded coefficient is stored. */
 static const unsigned int zigzag[16] =
 {
     0,  1,  4,  8,
     5,  2,  3,  6,
     9, 12, 13, 10,
     7, 11, 14, 15
 };
 /* The macros below are fragments of decode_mb_tokens(): they read and
  * write its locals (bool, prob, type_probs, c, v, val, dqf, b_tokens)
  * and jump to its labels, so they cannot be used anywhere else. */
 /* Read one sign bit, negate value_to_sign if it is set, scale the
  * result by dqf[0] for the first coefficient (c == 0) or dqf[1] for
  * any other, and leave it in v. */
 #define DECODE_AND_APPLYSIGN(value_to_sign) \
    v = (bool_get_bit(bool) ? -value_to_sign \
                            : value_to_sign) * dqf[!!c];
 /* Read one bool with the given probability and jump to `branch` when
  * it decodes as zero. */
 #define DECODE_AND_BRANCH_IF_ZERO(probability,branch) \
    if (!bool_get(bool, probability)) goto branch;
 /* Read one bool with the given probability.  When it decodes as zero,
  * reset prob to the start of the block type's table, then either step
  * to the next coefficient, add its band offset and jump to `branch`,
  * or, if c has already reached 15, finish the block. */
 #define DECODE_AND_LOOP_IF_ZERO(probability,branch) \
    if (!bool_get(bool, probability)) \
    { \
        prob = type_probs; \
        if(c<15) {\
            ++c; \
            prob += bands_x[c]; \
            goto branch; \
        }\
        else \
            goto BLOCK_FINISHED; /*for malformed input */\
    }
 /* Apply sign and dequantization to val, point prob at the third
  * ENTROPY_NODES-sized group of the table, store the coefficient at its
  * zigzag position and either continue with the next coefficient or,
  * for the 16th one, finish the block. */
 #define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) \
    DECODE_AND_APPLYSIGN(val) \
    prob = type_probs + (ENTROPY_NODES*2); \
    if(c < 15){\
        b_tokens[zigzag[c]] = v; \
        ++c; \
        goto DO_WHILE; }\
    b_tokens[zigzag[15]] = v; \
    goto BLOCK_FINISHED;
 /* Read extra bit number bits_count of token t, using that bit's
  * probability from extrabits[], and add it into val at that position. */
 #define DECODE_EXTRABIT_AND_ADJUST_VAL(t,bits_count)\
    val += bool_get(bool, extrabits[t].probs[bits_count]) << bits_count;
 /* Decode the coefficient tokens of one macroblock.
  *
  * bool    boolean decoder the tokens are read from
  * left    left entropy context; the slot of every decoded block is
  *         read and then overwritten with 0/1 ("block had data")
  * above   above entropy context, handled like `left`
  * tokens  output, 25 blocks of 16 coefficients.  Only coefficients
  *         that are decoded are written (already multiplied by the
  *         dequantization factor), so the caller must clear the buffer
  * mode    macroblock prediction mode.  For anything but B_PRED and
  *         SPLITMV block 24 is decoded first (type 1), followed by
  *         blocks 0..15 as type 0, which start at coefficient 1;
  *         otherwise blocks 0..15 are decoded as type 3.  Blocks
  *         16..23 always follow as type 2.
  * probs   coefficient probability tables, selected by block type
  * factor  dequantization factors per block type; entry 0 applies to
  *         coefficient 0, entry 1 to every other coefficient
  *
  * Returns a mask: bit i (0..24) is set when block i finished with its
  * coefficient position beyond 1, and bit 31 is set when any block
  * contained data.
  */
 static int
 decode_mb_tokens(struct bool_decoder  *bool,
                  token_entropy_ctx_t   left,
                  token_entropy_ctx_t   above,
                  short                *tokens,
                  enum prediction_mode  mode,
                  coeff_probs_table_t   probs,
                  short                 factor[TOKEN_BLOCK_TYPES][2])
 {
    int            i, stop, type;
    int            c, t, v;
    int            val, bits_count;
    /* Built as unsigned so that setting bit 31 below is well defined. */
    unsigned int   eob_mask;
    short         *b_tokens;   /* tokens for this block */
    unsigned char *type_probs; /* probabilities for this block type */
    unsigned char *prob;
    short         *dqf;
    eob_mask = 0;
    if (mode != B_PRED && mode != SPLITMV)
    {
        i = 24;
        stop = 24;
        type = 1;
        b_tokens = tokens + 24 * 16;
        dqf = factor[TOKEN_BLOCK_Y2];
    }
    else
    {
        i = 0;
        stop = 16;
        type = 3;
        b_tokens = tokens;
        dqf = factor[TOKEN_BLOCK_Y1];
    }
    /* Save a pointer to the coefficient probs for the current type.
     * Need to repeat this whenever type changes.
     */
    type_probs = probs[type][0][0];
 BLOCK_LOOP:
    t = left[left_context_index[i]] + above[above_context_index[i]];
    c = !type; /* all blocks start at 0 except type 0, which starts
                * at 1. */
    prob = type_probs;
    prob += t * ENTROPY_NODES;
 DO_WHILE:
    prob += bands_x[c];
    DECODE_AND_BRANCH_IF_ZERO(prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
 CHECK_0_:
    DECODE_AND_LOOP_IF_ZERO(prob[ZERO_CONTEXT_NODE], CHECK_0_);
    DECODE_AND_BRANCH_IF_ZERO(prob[ONE_CONTEXT_NODE],
                              ONE_CONTEXT_NODE_0_);
    DECODE_AND_BRANCH_IF_ZERO(prob[LOW_VAL_CONTEXT_NODE],
                              LOW_VAL_CONTEXT_NODE_0_);
    DECODE_AND_BRANCH_IF_ZERO(prob[HIGH_LOW_CONTEXT_NODE],
                              HIGH_LOW_CONTEXT_NODE_0_);
    DECODE_AND_BRANCH_IF_ZERO(prob[CAT_THREEFOUR_CONTEXT_NODE],
                              CAT_THREEFOUR_CONTEXT_NODE_0_);
    DECODE_AND_BRANCH_IF_ZERO(prob[CAT_FIVE_CONTEXT_NODE],
                              CAT_FIVE_CONTEXT_NODE_0_);
    /* Category 6: read the extra bits from the highest (length) down
     * to bit 0. */
    val = extrabits[DCT_VAL_CATEGORY6].min_val;
    bits_count = extrabits[DCT_VAL_CATEGORY6].length;
    do
    {
        DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY6, bits_count);
        bits_count -- ;
    }
    while (bits_count >= 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 CAT_FIVE_CONTEXT_NODE_0_:
    val = extrabits[DCT_VAL_CATEGORY5].min_val;
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 4);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 3);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 2);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 1);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 CAT_THREEFOUR_CONTEXT_NODE_0_:
    DECODE_AND_BRANCH_IF_ZERO(prob[CAT_THREE_CONTEXT_NODE],
                              CAT_THREE_CONTEXT_NODE_0_);
    val = extrabits[DCT_VAL_CATEGORY4].min_val;
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 3);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 2);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 1);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 CAT_THREE_CONTEXT_NODE_0_:
    val = extrabits[DCT_VAL_CATEGORY3].min_val;
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 2);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 1);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 HIGH_LOW_CONTEXT_NODE_0_:
    DECODE_AND_BRANCH_IF_ZERO(prob[CAT_ONE_CONTEXT_NODE],
                              CAT_ONE_CONTEXT_NODE_0_);
    val = extrabits[DCT_VAL_CATEGORY2].min_val;
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 1);
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 CAT_ONE_CONTEXT_NODE_0_:
    val = extrabits[DCT_VAL_CATEGORY1].min_val;
    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY1, 0);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
 LOW_VAL_CONTEXT_NODE_0_:
    DECODE_AND_BRANCH_IF_ZERO(prob[TWO_CONTEXT_NODE],
                              TWO_CONTEXT_NODE_0_);
    DECODE_AND_BRANCH_IF_ZERO(prob[THREE_CONTEXT_NODE],
                              THREE_CONTEXT_NODE_0_);
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(4);
 THREE_CONTEXT_NODE_0_:
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(3);
 TWO_CONTEXT_NODE_0_:
    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(2);
 ONE_CONTEXT_NODE_0_:
    /* Same steps as DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT, except that
     * prob moves to the second ENTROPY_NODES-sized group rather than
     * the third. */
    DECODE_AND_APPLYSIGN(1);
    prob = type_probs + ENTROPY_NODES;
    if (c < 15)
    {
        b_tokens[zigzag[c]] = v;
        ++c;
        goto DO_WHILE;
    }
    b_tokens[zigzag[15]] = v;
 BLOCK_FINISHED:
    eob_mask |= (unsigned int)(c > 1) << i;
    t = (c != !type);   // any nonzero data?
    /* Shifting a signed 1 into bit 31 is undefined behaviour for a
     * 32-bit int, so the shift is done on an unsigned value. */
    eob_mask |= (unsigned int)t << 31;
    left[left_context_index[i]] = above[above_context_index[i]] = t;
    b_tokens += 16;
    i++;
    if (i < stop)
        goto BLOCK_LOOP;
    if (i == 25)
    {
        /* Block 24 is done: continue with blocks 0..15 as type 0. */
        type = 0;
        i = 0;
        stop = 16;
        type_probs = probs[type][0][0];
        b_tokens = tokens;
        dqf = factor[TOKEN_BLOCK_Y1];
        goto BLOCK_LOOP;
    }
    if (i == 16)
    {
        /* Blocks 0..15 are done: continue with blocks 16..23. */
        type = 2;
        type_probs = probs[type][0][0];
        stop = 24;
        dqf = factor[TOKEN_BLOCK_UV];
        goto BLOCK_LOOP;
    }
    /* Callers receive the same bit pattern as before, as an int. */
    return (int)eob_mask;
 }
 /* Clear one complete left entropy context; done at the start of a
  * macroblock row. */
 static void
 reset_row_context(token_entropy_ctx_t *left)
 {
    const size_t ctx_bytes = sizeof(left[0]);
    memset(left, 0, ctx_bytes);
 }
 /* Clear `cols` consecutive above entropy contexts starting at `above`. */
 static void
 reset_above_context(token_entropy_ctx_t *above, unsigned int cols)
 {
    const size_t total_bytes = cols * sizeof(above[0]);
    memset(above, 0, total_bytes);
 }
 /* Clear the entropy context of a macroblock that carries no
  * coefficients, both to the left and above.  Slots 0..7 are always
  * cleared.  Slot 8 is the one decode_mb_tokens() only touches for modes
  * other than B_PRED and SPLITMV, so it is cleared for those modes only
  * and otherwise keeps its previous value. */
 static void
 reset_mb_context(token_entropy_ctx_t  *left,
                  token_entropy_ctx_t  *above,
                  enum prediction_mode  mode)
 {
    const size_t left_bytes  = 8 * sizeof((*left)[0]);
    const size_t above_bytes = 8 * sizeof((*above)[0]);
    memset(left, 0, left_bytes);
    memset(above, 0, above_bytes);
    if (mode == B_PRED || mode == SPLITMV)
        return;
    (*left)[8] = 0;
    (*above)[8] = 0;
 }
 /* Produce the coefficients for num_cols macroblocks of row `row`,
  * starting at column start_col and reading from token partition
  * `partition`.
  *
  * Each macroblock's 25 * 16 coefficient slots in the partition's row
  * buffer are cleared first.  Macroblocks flagged skip_coeff only have
  * their entropy context reset and get an eob_mask of 0; all others are
  * decoded with the dequantization factors of their segment.  The left
  * context is cleared when the call starts at column 0, and the above
  * contexts of the processed columns are cleared on row 0. */
 void
 vp8_dixie_tokens_process_row(struct vp8_decoder_ctx *ctx,
                              unsigned int            partition,
                              unsigned int            row,
                              unsigned int            start_col,
                              unsigned int            num_cols)
 {
    struct token_decoder *td = &ctx->tokens[partition];
    token_entropy_ctx_t  *left_ctx = &td->left_token_entropy_ctx;
    token_entropy_ctx_t  *above_ctx = ctx->above_token_entropy_ctx
                                      + start_col;
    struct mb_info       *mb = ctx->mb_info_rows[row] + start_col;
    short                *mb_coeffs = td->coeffs + 25 * 16 * start_col;
    const unsigned int    end_col = start_col + num_cols;
    unsigned int          col = start_col;
    if (start_col == 0)
        reset_row_context(left_ctx);
    if (row == 0)
        reset_above_context(above_ctx, num_cols);
    while (col < end_col)
    {
        memset(mb_coeffs, 0, 25 * 16 * sizeof(short));
        if (!mb->base.skip_coeff)
        {
            struct dequant_factors *dqf =
                ctx->dequant_factors + mb->base.segment_id;
            mb->base.eob_mask =
                decode_mb_tokens(&td->bool,
                                 *left_ctx, *above_ctx,
                                 mb_coeffs,
                                 mb->base.y_mode,
                                 ctx->entropy_hdr.coeff_probs,
                                 dqf->factor);
        }
        else
        {
            reset_mb_context(left_ctx, above_ctx, mb->base.y_mode);
            mb->base.eob_mask = 0;
        }
        /* Step every cursor on to the next macroblock. */
        col++;
        mb++;
        above_ctx++;
        mb_coeffs += 25 * 16;
    }
 }
 /* Make sure the token decoder's buffers match the current frame.
  *
  * When the frame header reports a size change, the row buffer of every
  * partition slot (not just the active ones) is dropped, and the above
  * entropy context array is reallocated with one zeroed entry per
  * macroblock column.  Afterwards every partition active in this frame
  * that has no row buffer gets one, sized for a full macroblock row
  * (25 * 16 shorts per column, 16-byte aligned).
  *
  * Dropping all slots keeps a buffer sized for the old dimensions from
  * being reused, and the second step covers frames that use more
  * partitions than the frame that last changed the size.
  *
  * Allocation failures are reported through vpx_internal_error() with
  * VPX_CODEC_MEM_ERROR.
  * NOTE(review): presumably that call does not return -- confirm. */
 void
 vp8_dixie_tokens_init(struct vp8_decoder_ctx *ctx)
 {
    unsigned int  partitions = ctx->token_hdr.partitions;
    unsigned int  coeff_row_sz =
        ctx->mb_cols * 25 * 16 * sizeof(short);
    unsigned int  i;
    if (ctx->frame_hdr.frame_size_updated)
    {
        for (i = 0; i < MAX_PARTITIONS; i++)
        {
            free(ctx->tokens[i].coeffs);
            ctx->tokens[i].coeffs = NULL;
        }
        free(ctx->above_token_entropy_ctx);
        ctx->above_token_entropy_ctx =
            calloc(ctx->mb_cols, sizeof(*ctx->above_token_entropy_ctx));
        if (!ctx->above_token_entropy_ctx)
            vpx_internal_error(&ctx->error, VPX_CODEC_MEM_ERROR, NULL);
    }
    for (i = 0; i < partitions; i++)
    {
        /* Still valid for the current frame size. */
        if (ctx->tokens[i].coeffs)
            continue;
        ctx->tokens[i].coeffs = memalign(16, coeff_row_sz);
        if (!ctx->tokens[i].coeffs)
            vpx_internal_error(&ctx->error, VPX_CODEC_MEM_ERROR,
                               NULL);
    }
 }
 /* Free every buffer owned by the token decoder.  The pointers are
  * cleared afterwards, so a repeated destroy or a later
  * vp8_dixie_tokens_init() never sees a dangling pointer. */
 void
 vp8_dixie_tokens_destroy(struct vp8_decoder_ctx *ctx)
 {
    int i;
    for (i = 0; i < MAX_PARTITIONS; i++)
    {
        free(ctx->tokens[i].coeffs);
        ctx->tokens[i].coeffs = NULL;
    }
    free(ctx->above_token_entropy_ctx);
    ctx->above_token_entropy_ctx = NULL;
 }
--- a/vp8/dixie/tokens.h
+++ b/vp8/dixie/tokens.h
@@ -1,28 +0,0 @@
 /*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef TOKENS_H
 #define TOKENS_H
 /* Forward declaration: without it the struct tag below would first
  * appear inside a parameter list and get prototype scope, making these
  * prototypes incompatible with the real type whenever this header is
  * included before the definition. */
 struct vp8_decoder_ctx;
 /* (Re)allocate, as needed, the per-partition coefficient row buffers
  * and the above entropy context array for the current frame.  Reports
  * VPX_CODEC_MEM_ERROR through ctx->error on allocation failure. */
 void
 vp8_dixie_tokens_init(struct vp8_decoder_ctx *ctx);
 /* Free the buffers owned by the token decoder. */
 void
 vp8_dixie_tokens_destroy(struct vp8_decoder_ctx *ctx);
 /* Decode the coefficient tokens for num_cols macroblocks of row `row`,
  * starting at column start_col, reading from token partition
  * `partition`.  Results go to that partition's coefficient row buffer
  * and to each macroblock's eob_mask. */
 void
 vp8_dixie_tokens_process_row(struct vp8_decoder_ctx *ctx,
                              unsigned int            partition,
                              unsigned int            row,
                              unsigned int            start_col,
                              unsigned int            num_cols);
 #endif
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,2 @@`
							`Adrian Grange <agrange@google.com>`
							`Johann Koenig <johannkoenig@google.com>`